feat: Better SCSI/SAS support

Fix the following metrics that were exported as zero because the
exporter did not know how to read them for SCSI devices:
- smartctl_device_bytes_read
- smartctl_device_bytes_written
- smartctl_device_power_cycle_count

New metrics:
- smartctl_read_errors_corrected_by_eccdelayed
- smartctl_read_errors_corrected_by_eccfast
- smartctl_write_errors_corrected_by_eccdelayed
- smartctl_write_errors_corrected_by_eccfast

Fix labels:
- smartctl_device{model_name} is now populated for SCSI/SAS, using
  scsi_model_name.

New labels:
- smartctl_device{} gains the labels:
  scsi_product, scsi_revision, scsi_vendor, scsi_version

Signed-off-by: Robin H. Johnson <rjohnson@coreweave.com>
This commit is contained in:
Robin H. Johnson 2023-10-15 22:59:57 -07:00
parent d90594ac23
commit 9113c6cf0f
2 changed files with 115 additions and 0 deletions

View file

@ -44,6 +44,11 @@ var (
"ata_version", "ata_version",
"sata_version", "sata_version",
"form_factor", "form_factor",
// scsi_model_name is mapped into model_name
"scsi_vendor",
"scsi_product",
"scsi_revision",
"scsi_version",
}, },
nil, nil,
) )
@ -293,6 +298,22 @@ var (
}, },
nil, nil,
) )
// metricReadErrorsCorrectedByEccFast describes the
// smartctl_read_errors_corrected_by_eccfast metric; its only variable label
// is "device".
metricReadErrorsCorrectedByEccFast = prometheus.NewDesc(
	"smartctl_read_errors_corrected_by_eccfast",
	"Read Errors Corrected by ECC Fast",
	[]string{"device"},
	nil,
)
// metricReadErrorsCorrectedByEccDelayed describes the
// smartctl_read_errors_corrected_by_eccdelayed metric; its only variable
// label is "device".
metricReadErrorsCorrectedByEccDelayed = prometheus.NewDesc(
	"smartctl_read_errors_corrected_by_eccdelayed",
	"Read Errors Corrected by ECC Delayed",
	[]string{"device"},
	nil,
)
metricReadTotalUncorrectedErrors = prometheus.NewDesc( metricReadTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_read_total_uncorrected_errors", "smartctl_read_total_uncorrected_errors",
"Read Total Uncorrected Errors", "Read Total Uncorrected Errors",
@ -309,6 +330,22 @@ var (
}, },
nil, nil,
) )
// metricWriteErrorsCorrectedByEccFast describes the
// smartctl_write_errors_corrected_by_eccfast metric; its only variable label
// is "device".
metricWriteErrorsCorrectedByEccFast = prometheus.NewDesc(
	"smartctl_write_errors_corrected_by_eccfast",
	"Write Errors Corrected by ECC Fast",
	[]string{"device"},
	nil,
)
// metricWriteErrorsCorrectedByEccDelayed describes the
// smartctl_write_errors_corrected_by_eccdelayed metric; its only variable
// label is "device".
metricWriteErrorsCorrectedByEccDelayed = prometheus.NewDesc(
	"smartctl_write_errors_corrected_by_eccdelayed",
	"Write Errors Corrected by ECC Delayed",
	[]string{"device"},
	nil,
)
metricWriteTotalUncorrectedErrors = prometheus.NewDesc( metricWriteTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_write_total_uncorrected_errors", "smartctl_write_total_uncorrected_errors",
"Write Total Uncorrected Errors", "Write Total Uncorrected Errors",

View file

@ -47,6 +47,8 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
var model_name string var model_name string
if obj := json.Get("model_name"); obj.Exists() { if obj := json.Get("model_name"); obj.Exists() {
model_name = obj.String() model_name = obj.String()
} else if obj := json.Get("scsi_model_name"); obj.Exists() {
model_name = obj.String()
} }
// If the drive returns an empty model name, replace that with unknown. // If the drive returns an empty model name, replace that with unknown.
if model_name == "" { if model_name == "" {
@ -102,6 +104,8 @@ func (smart *SMARTctl) Collect() {
if smart.device.interface_ == "scsi" { if smart.device.interface_ == "scsi" {
smart.mineSCSIGrownDefectList() smart.mineSCSIGrownDefectList()
smart.mineSCSIErrorCounterLog() smart.mineSCSIErrorCounterLog()
smart.mineSCSIBytesRead()
smart.mineSCSIBytesWritten()
} }
} }
@ -130,6 +134,11 @@ func (smart *SMARTctl) mineDevice() {
smart.json.Get("ata_version.string").String(), smart.json.Get("ata_version.string").String(),
smart.json.Get("sata_version.string").String(), smart.json.Get("sata_version.string").String(),
smart.json.Get("form_factor.name").String(), smart.json.Get("form_factor.name").String(),
// scsi_model_name is mapped into model_name
smart.json.Get("scsi_vendor").String(),
smart.json.Get("scsi_product").String(),
smart.json.Get("scsi_revision").String(),
smart.json.Get("scsi_version").String(),
) )
} }
@ -173,6 +182,7 @@ func (smart *SMARTctl) mineBlockSize() {
} }
func (smart *SMARTctl) mineInterfaceSpeed() { func (smart *SMARTctl) mineInterfaceSpeed() {
// TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate
iSpeed := smart.json.Get("interface_speed") iSpeed := smart.json.Get("interface_speed")
if iSpeed.Exists() { if iSpeed.Exists() {
for _, speedType := range []string{"max", "current"} { for _, speedType := range []string{"max", "current"} {
@ -253,6 +263,7 @@ func (smart *SMARTctl) mineRotationRate() {
func (smart *SMARTctl) mineTemperatures() { func (smart *SMARTctl) mineTemperatures() {
temperatures := smart.json.Get("temperature") temperatures := smart.json.Get("temperature")
// TODO: Implement scsi_environmental_reports
if temperatures.Exists() { if temperatures.Exists() {
temperatures.ForEach(func(key, value gjson.Result) bool { temperatures.ForEach(func(key, value gjson.Result) bool {
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
@ -279,6 +290,18 @@ func (smart *SMARTctl) minePowerCycleCount() {
) )
return return
} }
// SCSI
powerCycleCount = smart.json.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles")
if powerCycleCount.Exists() {
smart.ch <- prometheus.MustNewConstMetric(
metricDevicePowerCycleCount,
prometheus.CounterValue,
powerCycleCount.Float(),
smart.device.device,
)
return
}
} }
func (smart *SMARTctl) mineDeviceSCTStatus() { func (smart *SMARTctl) mineDeviceSCTStatus() {
@ -379,6 +402,36 @@ func (smart *SMARTctl) mineNvmeBytesWritten() {
) )
} }
// mineSCSIBytesRead exports metricDeviceBytesRead from the
// scsi_error_counter_log section of the smartctl JSON.
//
// The metric is emitted only when read.gigabytes_processed is actually
// present, so a drive that omits the field is not reported as having read
// zero bytes.
func (smart *SMARTctl) mineSCSIBytesRead() {
	gigabytesRead := smart.json.Get("scsi_error_counter_log.read.gigabytes_processed")
	if !gigabytesRead.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesRead,
		prometheus.CounterValue,
		// This value is reported by SMARTctl in GB [10^9].
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl.
		gigabytesRead.Float()*1e9,
		smart.device.device,
	)
}
// mineSCSIBytesWritten exports metricDeviceBytesWritten from the
// scsi_error_counter_log section of the smartctl JSON.
//
// The metric is emitted only when write.gigabytes_processed is actually
// present, so a drive that omits the field is not reported as having written
// zero bytes.
func (smart *SMARTctl) mineSCSIBytesWritten() {
	gigabytesWritten := smart.json.Get("scsi_error_counter_log.write.gigabytes_processed")
	if !gigabytesWritten.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesWritten,
		prometheus.CounterValue,
		// This value is reported by SMARTctl in GB [10^9].
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl.
		gigabytesWritten.Float()*1e9,
		smart.device.device,
	)
}
func (smart *SMARTctl) mineSmartStatus() { func (smart *SMARTctl) mineSmartStatus() {
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceSmartStatus, metricDeviceSmartStatus,
@ -504,6 +557,18 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(), SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device, smart.device.device,
) )
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricReadTotalUncorrectedErrors, metricReadTotalUncorrectedErrors,
prometheus.GaugeValue, prometheus.GaugeValue,
@ -516,11 +581,24 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(), SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device, smart.device.device,
) )
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricWriteTotalUncorrectedErrors, metricWriteTotalUncorrectedErrors,
prometheus.GaugeValue, prometheus.GaugeValue,
SCSIHealth.Get("write.total_uncorrected_errors").Float(), SCSIHealth.Get("write.total_uncorrected_errors").Float(),
smart.device.device, smart.device.device,
) )
// TODO: Should we also export the verify category?
} }
} }