From 65f6884211a28e46503451721a4bcefb77899fcb Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Wed, 4 Mar 2026 16:43:07 +0530 Subject: [PATCH 1/4] add error counts / dimm channel for edac collector Signed-off-by: manogna_grandhi --- collector/edac_linux.go | 206 +++++++++++++++------ collector/fixtures/e2e-64k-page-output.txt | 12 ++ collector/fixtures/e2e-output.txt | 24 ++- collector/fixtures/sys.ttar | 40 ++-- 4 files changed, 204 insertions(+), 78 deletions(-) diff --git a/collector/edac_linux.go b/collector/edac_linux.go index d3a2a07a83..25616c1933 100644 --- a/collector/edac_linux.go +++ b/collector/edac_linux.go @@ -18,8 +18,10 @@ package collector import ( "fmt" "log/slog" + "os" "path/filepath" "regexp" + "strings" "github.com/prometheus/client_golang/prometheus" ) @@ -30,115 +32,213 @@ const ( var ( edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`) - edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`) + edacMemDimmRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/dimm([0-9]*)`) ) type edacCollector struct { - ceCount *prometheus.Desc - ueCount *prometheus.Desc - csRowCECount *prometheus.Desc - csRowUECount *prometheus.Desc - logger *slog.Logger + ceCount *prometheus.Desc + ueCount *prometheus.Desc + channelCECount *prometheus.Desc + channelUECount *prometheus.Desc + dimmCECount *prometheus.Desc + dimmUECount *prometheus.Desc + logger *slog.Logger } func init() { registerCollector("edac", defaultEnabled, NewEdacCollector) } -// NewEdacCollector returns a new Collector exposing edac stats. func NewEdacCollector(logger *slog.Logger) (Collector, error) { + return &edacCollector{ + ceCount: prometheus.NewDesc( prometheus.BuildFQName(namespace, edacSubsystem, "correctable_errors_total"), "Total correctable memory errors.", - []string{"controller"}, nil, + []string{"controller"}, + nil, ), + ueCount: prometheus.NewDesc( prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"), "Total uncorrectable memory errors.", - []string{"controller"}, nil, + []string{"controller"}, + nil, + ), + + channelCECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"), + "Total correctable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, + nil, ), - csRowCECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"), - "Total correctable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + channelUECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"), + "Total uncorrectable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, + nil, ), - csRowUECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"), - "Total uncorrectable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + dimmCECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "dimm_correctable_errors_total"), + "Total correctable memory errors for this dimm.", + []string{"controller", "dimm"}, + nil, ), + + dimmUECount: prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "dimm_uncorrectable_errors_total"), + "Total uncorrectable memory errors for this dimm.", + []string{"controller", "dimm"}, + nil, + ), + logger: logger, }, nil } func (c *edacCollector) Update(ch chan<- prometheus.Metric) error { + memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*")) if err != nil { return err } + for _, controller := range memControllers { + controllerMatch := edacMemControllerRE.FindStringSubmatch(controller) if controllerMatch == nil { return fmt.Errorf("controller string didn't match regexp: %s", controller) } + controllerNumber := controllerMatch[1] value, err := readUintFromFile(filepath.Join(controller, "ce_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.ceCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.ceCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err) + value, err = readUintFromFile(filepath.Join(controller, "ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.ueCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") - value, err = readUintFromFile(filepath.Join(controller, "ue_count")) + csrows, err := filepath.Glob(controller + "/csrow[0-9]*") + if err != nil { - return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err) + return err } - ch <- prometheus.MustNewConstMetric( - c.ueCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err) + for _, csrow := range csrows { + base := filepath.Base(csrow) + + match := regexp.MustCompile(`csrow([0-9]+)`).FindStringSubmatch(base) + if match == nil { + continue + } + csrowNumber := match[1] + + channelFiles, err := filepath.Glob(csrow + "/ch*_ce_count") + if err != nil { + return err + } + + for _, chFile := range channelFiles { + + base := filepath.Base(chFile) + + match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(base) + if match == nil { + continue + } + + channelNumber := match[1] + label := "unknown" + labelBytes, err := os.ReadFile(filepath.Join(csrow, "ch"+channelNumber+"_dimm_label")) + if err == nil { + label = strings.TrimSpace(string(labelBytes)) + // format label + label = strings.ReplaceAll(label, "#", "") + label = strings.ReplaceAll(label, "csrow", "_csrow") + label = strings.ReplaceAll(label, "channel", "_channel") + } + value, err := readUintFromFile(chFile) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.channelCECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } + + value, err = readUintFromFile(filepath.Join(csrow, "ch"+channelNumber+"_ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.channelUECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } + } } - ch <- prometheus.MustNewConstMetric( - c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") - // For each controller, walk the csrow directories. - csrows, err := filepath.Glob(controller + "/csrow[0-9]*") + dimms, err := filepath.Glob(controller + "/dimm[0-9]*") if err != nil { return err } - for _, csrow := range csrows { - csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow) - if csrowMatch == nil { - return fmt.Errorf("csrow string didn't match regexp: %s", csrow) + + for _, dimm := range dimms { + + dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm) + if dimmMatch == nil || len(dimmMatch) < 2 { + continue } - csrowNumber := csrowMatch[1] - value, err = readUintFromFile(filepath.Join(csrow, "ce_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + dimmNumber := dimmMatch[1] + + value, err := readUintFromFile(filepath.Join(dimm, "dimm_ce_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.dimmCECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + dimmNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) - value, err = readUintFromFile(filepath.Join(csrow, "ue_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + value, err = readUintFromFile(filepath.Join(dimm, "dimm_ue_count")) + if err == nil { + ch <- prometheus.MustNewConstMetric( + c.dimmUECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + dimmNumber, + ) } - ch <- prometheus.MustNewConstMetric( - c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) } } - return err + return nil } diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index b860becad2..3f4f509670 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -740,6 +740,18 @@ node_drbd_remote_pending{device="drbd1"} 12346 # HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered. # TYPE node_drbd_remote_unacknowledged gauge node_drbd_remote_unacknowledged{device="drbd1"} 12347 +# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel. +# TYPE node_edac_channel_correctable_errors_total counter +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0 +# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel. +# TYPE node_edac_channel_uncorrectable_errors_total counter +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2 # HELP node_edac_correctable_errors_total Total correctable memory errors. # TYPE node_edac_correctable_errors_total counter node_edac_correctable_errors_total{controller="0"} 1 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 7ac06c0f87..fd1d520b4f 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -774,18 +774,22 @@ node_drbd_remote_pending{device="drbd1"} 12346 node_drbd_remote_unacknowledged{device="drbd1"} 12347 # HELP node_edac_correctable_errors_total Total correctable memory errors. # TYPE node_edac_correctable_errors_total counter -node_edac_correctable_errors_total{controller="0"} 1 -# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow. -# TYPE node_edac_csrow_correctable_errors_total counter -node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3 -node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2 -# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow. -# TYPE node_edac_csrow_uncorrectable_errors_total counter -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4 -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6 +node_edac_correctable_errors_total{controller="0"} 128 # HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. # TYPE node_edac_uncorrectable_errors_total counter -node_edac_uncorrectable_errors_total{controller="0"} 5 +node_edac_uncorrectable_errors_total{controller="0"} 0 +# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel. +# TYPE node_edac_channel_correctable_errors_total counter +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0 +# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel. +# TYPE node_edac_channel_uncorrectable_errors_total counter +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2 # HELP node_entropy_available_bits Bits of available entropy. # TYPE node_entropy_available_bits gauge node_entropy_available_bits 1337 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index f8a453fe37..8ff006bdae 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -3409,8 +3409,8 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/0000:01:00.0/config Lines: 2 - -TNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE!PNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEEOF +�� +TNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE��NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE��!PNULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/0000:01:00.0/consistent_dma_mask_bits @@ -5213,7 +5213,7 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/config Lines: 1 -"4NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEPNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEEOF +"4NULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTE������NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEPNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/consistent_dma_mask_bits @@ -8004,8 +8004,8 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:40/0000:40:01.3/0000:45:00.0/vpd Lines: 2 -:NULLBYTEIntel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0dNULLBYTEV1:Intel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0PN -K53978-004SN 6805CAF0CB12V24521RVxEOF +�:NULLBYTEIntel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0�dNULLBYTEV1:Intel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0PN +K53978-004SN 6805CAF0CB12V24521RV�xEOF Mode: 600 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/pci0000:40/0000:40:01.3/0000:45:00.0/wakeup @@ -9163,35 +9163,45 @@ Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/ce_count Lines: 1 -1 +128 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/ce_noinfo_count +Path: sys/devices/system/edac/mc/mc0/ue_count Lines: 1 -2 +0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/system/edac/mc/mc0/csrow0 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/csrow0/ce_count +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ce_count Lines: 1 3 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/csrow0/ue_count +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ue_count Lines: 1 -4 +0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/ue_count +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ue_count Lines: 1 -5 +0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/ue_noinfo_count +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_dimm_label Lines: 1 -6 +mc0_csrow0_channel0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_dimm_label +Lines: 1 +mc0_csrow0_channel1 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/system/node From 4061c6b6e42591223e1f61b0d02bc0b45eafc59d Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Tue, 24 Mar 2026 13:30:29 +0530 Subject: [PATCH 2/4] updated sys.ttar Signed-off-by: manogna_grandhi --- collector/edac_linux.go | 2 +- collector/fixtures/sys.ttar | 77 +++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/collector/edac_linux.go b/collector/edac_linux.go index 25616c1933..0597e7a1fe 100644 --- a/collector/edac_linux.go +++ b/collector/edac_linux.go @@ -210,7 +210,7 @@ func (c *edacCollector) Update(ch chan<- prometheus.Metric) error { for _, dimm := range dimms { dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm) - if dimmMatch == nil || len(dimmMatch) < 2 { + if len(dimmMatch) < 2 { continue } diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 8ff006bdae..11bde0abed 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -3409,8 +3409,8 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/0000:01:00.0/config Lines: 2 -�� -TNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE��NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE��!PNULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTEEOF + +TNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE!PNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/0000:01:00.0/consistent_dma_mask_bits @@ -5213,7 +5213,7 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/config Lines: 1 -"4NULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTE������NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEPNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTE�NULLBYTENULLBYTEEOF +"4NULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEPNULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTENULLBYTEEOF Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:00/0000:00:02.1/consistent_dma_mask_bits @@ -8004,8 +8004,8 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/pci0000:40/0000:40:01.3/0000:45:00.0/vpd Lines: 2 -�:NULLBYTEIntel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0�dNULLBYTEV1:Intel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0PN -K53978-004SN 6805CAF0CB12V24521RV�xEOF +:NULLBYTEIntel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0dNULLBYTEV1:Intel (r) Ethernet Network Adapter I350-T4 for OCP NIC 3.0PN +K53978-004SN 6805CAF0CB12V24521RVxEOF Mode: 600 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/pci0000:40/0000:40:01.3/0000:45:00.0/wakeup @@ -9163,12 +9163,12 @@ Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/ce_count Lines: 1 -128 +1 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Path: sys/devices/system/edac/mc/mc0/ue_count +Path: sys/devices/system/edac/mc/mc0/ce_noinfo_count Lines: 1 -0 +2 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/system/edac/mc/mc0/csrow0 @@ -9179,29 +9179,82 @@ Lines: 1 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/devices/system/edac/mc/mc0/csrow1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ce_count Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ce_count +Lines: 1 3 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ue_count Lines: 1 -0 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ue_count +Lines: 1 +2 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ue_count Lines: 1 -0 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ue_count +Lines: 1 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ue_count +Lines: 1 +4 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/ue_count +Lines: 1 +5 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/ue_noinfo_count +Lines: 1 +6 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_dimm_label Lines: 1 -mc0_csrow0_channel0 +mc0csrow0channel0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_dimm_label Lines: 1 -mc0_csrow0_channel1 +mc0csrow0channel1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_dimm_label +Lines: 1 +mc0csrow1channel0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_dimm_label +Lines: 1 +mc0csrow1channel1 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/system/node From 4dfa7f532280edb2e1079fe0a25512aa2c044a0b Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Wed, 25 Mar 2026 12:10:53 +0530 Subject: [PATCH 3/4] reordered e2e-output.txt file Signed-off-by: manogna_grandhi --- collector/fixtures/e2e-output.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 505e29fb36..57cbc6809e 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -1363,12 +1363,6 @@ node_drbd_remote_pending{device="drbd1"} 12346 # HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered. # TYPE node_drbd_remote_unacknowledged gauge node_drbd_remote_unacknowledged{device="drbd1"} 12347 -# HELP node_edac_correctable_errors_total Total correctable memory errors. -# TYPE node_edac_correctable_errors_total counter -node_edac_correctable_errors_total{controller="0"} 128 -# HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. -# TYPE node_edac_uncorrectable_errors_total counter -node_edac_uncorrectable_errors_total{controller="0"} 0 # HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel. # TYPE node_edac_channel_correctable_errors_total counter node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0 @@ -1381,6 +1375,12 @@ node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0 node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2 node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2 node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2 +# HELP node_edac_correctable_errors_total Total correctable memory errors. +# TYPE node_edac_correctable_errors_total counter +node_edac_correctable_errors_total{controller="0"} 1 +# HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. +# TYPE node_edac_uncorrectable_errors_total counter +node_edac_uncorrectable_errors_total{controller="0"} 5 # HELP node_entropy_available_bits Bits of available entropy. # TYPE node_entropy_available_bits gauge node_entropy_available_bits 1337 From fe61dc5c4418f82308186617e7a957cbaf2442b0 Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Wed, 25 Mar 2026 12:18:27 +0530 Subject: [PATCH 4/4] reordered e2e-64k-output.txt file Signed-off-by: manogna_grandhi --- collector/edac_linux.go | 53 ---------------------- collector/fixtures/e2e-64k-page-output.txt | 8 ---- 2 files changed, 61 deletions(-) diff --git a/collector/edac_linux.go b/collector/edac_linux.go index 0597e7a1fe..0fd8776632 100644 --- a/collector/edac_linux.go +++ b/collector/edac_linux.go @@ -40,8 +40,6 @@ type edacCollector struct { ueCount *prometheus.Desc channelCECount *prometheus.Desc channelUECount *prometheus.Desc - dimmCECount *prometheus.Desc - dimmUECount *prometheus.Desc logger *slog.Logger } @@ -81,20 +79,6 @@ func NewEdacCollector(logger *slog.Logger) (Collector, error) { nil, ), - dimmCECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "dimm_correctable_errors_total"), - "Total correctable memory errors for this dimm.", - []string{"controller", "dimm"}, - nil, - ), - - dimmUECount: prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "dimm_uncorrectable_errors_total"), - "Total uncorrectable memory errors for this dimm.", - []string{"controller", "dimm"}, - nil, - ), - logger: logger, }, nil } @@ -201,43 +185,6 @@ func (c *edacCollector) Update(ch chan<- prometheus.Metric) error { } } } - - dimms, err := filepath.Glob(controller + "/dimm[0-9]*") - if err != nil { - return err - } - - for _, dimm := range dimms { - - dimmMatch := edacMemDimmRE.FindStringSubmatch(dimm) - if len(dimmMatch) < 2 { - continue - } - - dimmNumber := dimmMatch[1] - - value, err := readUintFromFile(filepath.Join(dimm, "dimm_ce_count")) - if err == nil { - ch <- prometheus.MustNewConstMetric( - c.dimmCECount, - prometheus.CounterValue, - float64(value), - controllerNumber, - dimmNumber, - ) - } - - value, err = readUintFromFile(filepath.Join(dimm, "dimm_ue_count")) - if err == nil { - ch <- prometheus.MustNewConstMetric( - c.dimmUECount, - prometheus.CounterValue, - float64(value), - controllerNumber, - dimmNumber, - ) - } - } } return nil diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 277c4fe255..d7edcb71b8 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -1346,14 +1346,6 @@ node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1 # HELP node_edac_correctable_errors_total Total correctable memory errors. # TYPE node_edac_correctable_errors_total counter node_edac_correctable_errors_total{controller="0"} 1 -# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow. -# TYPE node_edac_csrow_correctable_errors_total counter -node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3 -node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2 -# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow. -# TYPE node_edac_csrow_uncorrectable_errors_total counter -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4 -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6 # HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. # TYPE node_edac_uncorrectable_errors_total counter node_edac_uncorrectable_errors_total{controller="0"} 5