From d90594ac23ada7f22e35a64e5875cd20e660bc35 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Sun, 15 Oct 2023 22:59:57 -0700 Subject: [PATCH 1/2] fix: Remove confused metrics The exporter presently has metrics that are nonsense for a given type of drive, and remain at zero due to their defaults. Change the behavior to NOT emit a metric if the underlying JSON field is not present. Future related work may include parsing the corresponding metrics for SATA/SAS SSDs (e.g. `smartctl_device_percentage_used` could be derived from `SSD_Life_Left` on some drives). Metrics no longer exported for the wrong type of drive: - `smartctl_device_nvme_capacity_bytes` (NVME-specific) - `smartctl_device_available_spare` (NVME-specific, ATA possible) - `smartctl_device_available_spare_threshold` (NVME-specific, ATA possible) - `smartctl_device_critical_warning` (NVME-specific, ATA possible) - `smartctl_device_interface_speed` (ATA-specific) - `smartctl_device_media_errors` (NVME-specific, ATA possible) - `smartctl_device_num_err_log_entries` (NVME-specific, SCSI uses distinct metrics, ATA possible) - `smartctl_device_nvme_capacity_bytes` (NVME-specific) - `smartctl_device_percentage_used` (NVME-specific, ATA possible) Signed-off-by: Robin H. Johnson --- smartctl.go | 142 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 49 deletions(-) diff --git a/smartctl.go b/smartctl.go index 23a374b..7114985 100644 --- a/smartctl.go +++ b/smartctl.go @@ -29,6 +29,9 @@ type SMARTDevice struct { serial string family string model string + // These are used to select types of metrics. 
+ interface_ string + protocol string } // SMARTctl object @@ -41,15 +44,26 @@ type SMARTctl struct { // NewSMARTctl is smartctl constructor func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl { + var model_name string + if obj := json.Get("model_name"); obj.Exists() { + model_name = obj.String() + } + // If the drive returns an empty model name, replace that with unknown. + if model_name == "" { + model_name = "unknown" + } + return SMARTctl{ ch: ch, json: json, logger: logger, device: SMARTDevice{ - device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"), - serial: strings.TrimSpace(json.Get("serial_number").String()), - family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), - model: strings.TrimSpace(json.Get("model_name").String()), + device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"), + serial: strings.TrimSpace(json.Get("serial_number").String()), + family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), + model: strings.TrimSpace(model_name), + interface_: strings.TrimSpace(json.Get("device.type").String()), + protocol: strings.TrimSpace(json.Get("device.protocol").String()), }, } } @@ -66,23 +80,29 @@ func (smart *SMARTctl) Collect() { smart.minePowerOnSeconds() smart.mineRotationRate() smart.mineTemperatures() - smart.minePowerCycleCount() + smart.minePowerCycleCount() // ATA/SATA, NVME, SCSI, SAS smart.mineDeviceSCTStatus() smart.mineDeviceStatistics() smart.mineDeviceErrorLog() smart.mineDeviceSelfTestLog() smart.mineDeviceERC() - smart.mineNvmePercentageUsed() - smart.mineNvmeAvailableSpare() - smart.mineNvmeAvailableSpareThreshold() - smart.mineNvmeCriticalWarning() - smart.mineNvmeMediaErrors() - smart.mineNvmeNumErrLogEntries() - smart.mineNvmeBytesRead() - smart.mineNvmeBytesWritten() smart.mineSmartStatus() - smart.mineSCSIGrownDefectList() - smart.mineSCSIErrorCounterLog() + + if 
smart.device.interface_ == "nvme" { + smart.mineNvmePercentageUsed() + smart.mineNvmeAvailableSpare() + smart.mineNvmeAvailableSpareThreshold() + smart.mineNvmeCriticalWarning() + smart.mineNvmeMediaErrors() + smart.mineNvmeNumErrLogEntries() + smart.mineNvmeBytesRead() + smart.mineNvmeBytesWritten() + } + // SCSI, SAS + if smart.device.interface_ == "scsi" { + smart.mineSCSIGrownDefectList() + smart.mineSCSIErrorCounterLog() + } } func (smart *SMARTctl) mineExitStatus() { @@ -95,14 +115,13 @@ func (smart *SMARTctl) mineExitStatus() { } func (smart *SMARTctl) mineDevice() { - device := smart.json.Get("device") smart.ch <- prometheus.MustNewConstMetric( metricDeviceModel, prometheus.GaugeValue, 1, smart.device.device, - device.Get("type").String(), - device.Get("protocol").String(), + smart.device.interface_, + smart.device.protocol, smart.device.family, smart.device.model, smart.device.serial, @@ -130,12 +149,15 @@ func (smart *SMARTctl) mineCapacity() { smart.json.Get("user_capacity.bytes").Float(), smart.device.device, ) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceTotalCapacityBytes, - prometheus.GaugeValue, - smart.json.Get("nvme_total_capacity").Float(), - smart.device.device, - ) + nvme_total_capacity := smart.json.Get("nvme_total_capacity") + if nvme_total_capacity.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceTotalCapacityBytes, + prometheus.GaugeValue, + nvme_total_capacity.Float(), + smart.device.device, + ) + } } func (smart *SMARTctl) mineBlockSize() { @@ -152,15 +174,19 @@ func (smart *SMARTctl) mineBlockSize() { func (smart *SMARTctl) mineInterfaceSpeed() { iSpeed := smart.json.Get("interface_speed") - for _, speedType := range []string{"max", "current"} { - tSpeed := iSpeed.Get(speedType) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceInterfaceSpeed, - prometheus.GaugeValue, - tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(), - smart.device.device, - speedType, - ) + if 
iSpeed.Exists() { + for _, speedType := range []string{"max", "current"} { + tSpeed := iSpeed.Get(speedType) + if tSpeed.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceInterfaceSpeed, + prometheus.GaugeValue, + tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(), + smart.device.device, + speedType, + ) + } + } } } @@ -200,16 +226,21 @@ func (smart *SMARTctl) mineDeviceAttribute() { func (smart *SMARTctl) minePowerOnSeconds() { pot := smart.json.Get("power_on_time") - smart.ch <- prometheus.MustNewConstMetric( - metricDevicePowerOnSeconds, - prometheus.CounterValue, - GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60, - smart.device.device, - ) + // If the power_on_time is NOT present, do not report as 0. + if pot.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerOnSeconds, + prometheus.CounterValue, + GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60, + smart.device.device, + ) + } } func (smart *SMARTctl) mineRotationRate() { rRate := GetFloatIfExists(smart.json, "rotation_rate", 0) + // TODO: what should be done if this is absent vs really zero (for + // solid-state drives)? 
if rRate > 0 { smart.ch <- prometheus.MustNewConstMetric( metricDeviceRotationRate, @@ -237,12 +268,17 @@ func (smart *SMARTctl) mineTemperatures() { } func (smart *SMARTctl) minePowerCycleCount() { - smart.ch <- prometheus.MustNewConstMetric( - metricDevicePowerCycleCount, - prometheus.CounterValue, - smart.json.Get("power_cycle_count").Float(), - smart.device.device, - ) + // ATA & NVME + powerCycleCount := smart.json.Get("power_cycle_count") + if powerCycleCount.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerCycleCount, + prometheus.CounterValue, + powerCycleCount.Float(), + smart.device.device, + ) + return + } } func (smart *SMARTctl) mineDeviceSCTStatus() { @@ -312,25 +348,33 @@ func (smart *SMARTctl) mineNvmeNumErrLogEntries() { } func (smart *SMARTctl) mineNvmeBytesRead() { - blockSize := smart.json.Get("logical_block_size").Float() + blockSize := smart.json.Get("logical_block_size") + data_units_read := smart.json.Get("nvme_smart_health_information_log.data_units_read") + if !blockSize.Exists() || !data_units_read.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesRead, prometheus.CounterValue, // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. 
- smart.json.Get("nvme_smart_health_information_log.data_units_read").Float()*1000.0*blockSize, + data_units_read.Float()*1000.0*blockSize.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeBytesWritten() { - blockSize := smart.json.Get("logical_block_size").Float() + blockSize := smart.json.Get("logical_block_size") + data_units_written := smart.json.Get("nvme_smart_health_information_log.data_units_written") + if !blockSize.Exists() || !data_units_written.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBytesWritten, prometheus.CounterValue, // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. - smart.json.Get("nvme_smart_health_information_log.data_units_written").Float()*1000.0*blockSize, + data_units_written.Float()*1000.0*blockSize.Float(), smart.device.device, ) } From 9113c6cf0fbd78d8549a393f5352556c40b10ce4 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Sun, 15 Oct 2023 22:59:57 -0700 Subject: [PATCH 2/2] feat: Better SCSI/SAS support Fix the following metrics that were exported as zero because the exporter did not know how to read them for SCSI devices: - smartctl_device_bytes_read - smartctl_device_bytes_written - smartctl_device_power_cycle_count New metrics: - smartctl_read_errors_corrected_by_eccdelayed - smartctl_read_errors_corrected_by_eccfast - smartctl_write_errors_corrected_by_eccdelayed - smartctl_write_errors_corrected_by_eccfast Fix labels: - smartctl_device{model_name} is now populated for SCSI/SAS, using scsi_model_name. New labels: - smartctl_device{} gains: scsi_product,scsi_revision,scsi_vendor,scsi_version Signed-off-by: Robin H. 
Johnson --- metrics.go | 37 +++++++++++++++++++++++++ smartctl.go | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/metrics.go b/metrics.go index c675fd0..0ac083f 100644 --- a/metrics.go +++ b/metrics.go @@ -44,6 +44,11 @@ var ( "ata_version", "sata_version", "form_factor", + // scsi_model_name is mapped into model_name + "scsi_vendor", + "scsi_product", + "scsi_revision", + "scsi_version", }, nil, ) @@ -293,6 +298,22 @@ var ( }, nil, ) + metricReadErrorsCorrectedByEccFast = prometheus.NewDesc( + "smartctl_read_errors_corrected_by_eccfast", + "Read Errors Corrected by ECC Fast", + []string{ + "device", + }, + nil, + ) + metricReadErrorsCorrectedByEccDelayed = prometheus.NewDesc( + "smartctl_read_errors_corrected_by_eccdelayed", + "Read Errors Corrected by ECC Delayed", + []string{ + "device", + }, + nil, + ) metricReadTotalUncorrectedErrors = prometheus.NewDesc( "smartctl_read_total_uncorrected_errors", "Read Total Uncorrected Errors", @@ -309,6 +330,22 @@ var ( }, nil, ) + metricWriteErrorsCorrectedByEccFast = prometheus.NewDesc( + "smartctl_write_errors_corrected_by_eccfast", + "Write Errors Corrected by ECC Fast", + []string{ + "device", + }, + nil, + ) + metricWriteErrorsCorrectedByEccDelayed = prometheus.NewDesc( + "smartctl_write_errors_corrected_by_eccdelayed", + "Write Errors Corrected by ECC Delayed", + []string{ + "device", + }, + nil, + ) metricWriteTotalUncorrectedErrors = prometheus.NewDesc( "smartctl_write_total_uncorrected_errors", "Write Total Uncorrected Errors", diff --git a/smartctl.go b/smartctl.go index 7114985..d308d63 100644 --- a/smartctl.go +++ b/smartctl.go @@ -47,6 +47,8 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr var model_name string if obj := json.Get("model_name"); obj.Exists() { model_name = obj.String() + } else if obj := json.Get("scsi_model_name"); obj.Exists() { + model_name = obj.String() } // If the drive returns an empty model 
name, replace that with unknown. if model_name == "" { @@ -102,6 +104,8 @@ func (smart *SMARTctl) Collect() { if smart.device.interface_ == "scsi" { smart.mineSCSIGrownDefectList() smart.mineSCSIErrorCounterLog() + smart.mineSCSIBytesRead() + smart.mineSCSIBytesWritten() } } @@ -130,6 +134,11 @@ func (smart *SMARTctl) mineDevice() { smart.json.Get("ata_version.string").String(), smart.json.Get("sata_version.string").String(), smart.json.Get("form_factor.name").String(), + // scsi_model_name is mapped into model_name + smart.json.Get("scsi_vendor").String(), + smart.json.Get("scsi_product").String(), + smart.json.Get("scsi_revision").String(), + smart.json.Get("scsi_version").String(), ) } @@ -173,6 +182,7 @@ func (smart *SMARTctl) mineBlockSize() { } func (smart *SMARTctl) mineInterfaceSpeed() { + // TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate iSpeed := smart.json.Get("interface_speed") if iSpeed.Exists() { for _, speedType := range []string{"max", "current"} { @@ -253,6 +263,7 @@ func (smart *SMARTctl) mineRotationRate() { func (smart *SMARTctl) mineTemperatures() { temperatures := smart.json.Get("temperature") + // TODO: Implement scsi_environmental_reports if temperatures.Exists() { temperatures.ForEach(func(key, value gjson.Result) bool { smart.ch <- prometheus.MustNewConstMetric( @@ -279,6 +290,18 @@ func (smart *SMARTctl) minePowerCycleCount() { ) return } + + // SCSI + powerCycleCount = smart.json.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles") + if powerCycleCount.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerCycleCount, + prometheus.CounterValue, + powerCycleCount.Float(), + smart.device.device, + ) + return + } } func (smart *SMARTctl) mineDeviceSCTStatus() { @@ -379,6 +402,36 @@ func (smart *SMARTctl) mineNvmeBytesWritten() { ) } +func (smart *SMARTctl) mineSCSIBytesRead() { + SCSIHealth := smart.json.Get("scsi_error_counter_log") + if SCSIHealth.Exists() { + smart.ch <- 
prometheus.MustNewConstMetric( + metricDeviceBytesRead, + prometheus.CounterValue, + // This value is reported by SMARTctl in GB [10^9]. + // It is possible that some drives mis-report the value, but + // that is not the responsibility of the exporter or smartctl + SCSIHealth.Get("read.gigabytes_processed").Float()*1e9, + smart.device.device, + ) + } +} + +func (smart *SMARTctl) mineSCSIBytesWritten() { + SCSIHealth := smart.json.Get("scsi_error_counter_log") + if SCSIHealth.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceBytesWritten, + prometheus.CounterValue, + // This value is reported by SMARTctl in GB [10^9]. + // It is possible that some drives mis-report the value, but + // that is not the responsibility of the exporter or smartctl + SCSIHealth.Get("write.gigabytes_processed").Float()*1e9, + smart.device.device, + ) + } +} + func (smart *SMARTctl) mineSmartStatus() { smart.ch <- prometheus.MustNewConstMetric( metricDeviceSmartStatus, @@ -504,6 +557,18 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() { SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(), smart.device.device, ) + smart.ch <- prometheus.MustNewConstMetric( + metricReadErrorsCorrectedByEccFast, + prometheus.GaugeValue, + SCSIHealth.Get("read.errors_corrected_by_eccfast").Float(), + smart.device.device, + ) + smart.ch <- prometheus.MustNewConstMetric( + metricReadErrorsCorrectedByEccDelayed, + prometheus.GaugeValue, + SCSIHealth.Get("read.errors_corrected_by_eccdelayed").Float(), + smart.device.device, + ) smart.ch <- prometheus.MustNewConstMetric( metricReadTotalUncorrectedErrors, prometheus.GaugeValue, @@ -516,11 +581,24 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() { SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(), smart.device.device, ) + smart.ch <- prometheus.MustNewConstMetric( + metricWriteErrorsCorrectedByEccFast, + prometheus.GaugeValue, + SCSIHealth.Get("write.errors_corrected_by_eccfast").Float(), + 
smart.device.device, + ) + smart.ch <- prometheus.MustNewConstMetric( + metricWriteErrorsCorrectedByEccDelayed, + prometheus.GaugeValue, + SCSIHealth.Get("write.errors_corrected_by_eccdelayed").Float(), + smart.device.device, + ) smart.ch <- prometheus.MustNewConstMetric( metricWriteTotalUncorrectedErrors, prometheus.GaugeValue, SCSIHealth.Get("write.total_uncorrected_errors").Float(), smart.device.device, ) + // TODO: Should we also export the verify category? } }