Merge pull request #168 from robbat2/rjohnson/scsi-support

feat: Better SCSI/SAS support, and removing confused metrics
This commit is contained in:
David Randall 2023-11-20 18:33:13 -05:00 committed by GitHub
commit 1f56220657
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 208 additions and 49 deletions

View File

@ -44,6 +44,11 @@ var (
"ata_version", "ata_version",
"sata_version", "sata_version",
"form_factor", "form_factor",
// scsi_model_name is mapped into model_name
"scsi_vendor",
"scsi_product",
"scsi_revision",
"scsi_version",
}, },
nil, nil,
) )
@ -293,6 +298,22 @@ var (
}, },
nil, nil,
) )
// metricReadErrorsCorrectedByEccFast describes the
// smartctl_read_errors_corrected_by_eccfast metric, which carries a single
// "device" label. mineSCSIErrorCounterLog emits it as a gauge taken from the
// read.errors_corrected_by_eccfast field of the SCSI error counter log.
metricReadErrorsCorrectedByEccFast = prometheus.NewDesc(
"smartctl_read_errors_corrected_by_eccfast",
"Read Errors Corrected by ECC Fast",
[]string{
"device",
},
// No constant labels.
nil,
)
// metricReadErrorsCorrectedByEccDelayed describes the
// smartctl_read_errors_corrected_by_eccdelayed metric, which carries a single
// "device" label. mineSCSIErrorCounterLog emits it as a gauge taken from the
// read.errors_corrected_by_eccdelayed field of the SCSI error counter log.
metricReadErrorsCorrectedByEccDelayed = prometheus.NewDesc(
"smartctl_read_errors_corrected_by_eccdelayed",
"Read Errors Corrected by ECC Delayed",
[]string{
"device",
},
// No constant labels.
nil,
)
metricReadTotalUncorrectedErrors = prometheus.NewDesc( metricReadTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_read_total_uncorrected_errors", "smartctl_read_total_uncorrected_errors",
"Read Total Uncorrected Errors", "Read Total Uncorrected Errors",
@ -309,6 +330,22 @@ var (
}, },
nil, nil,
) )
// metricWriteErrorsCorrectedByEccFast describes the
// smartctl_write_errors_corrected_by_eccfast metric, which carries a single
// "device" label. mineSCSIErrorCounterLog emits it as a gauge taken from the
// write.errors_corrected_by_eccfast field of the SCSI error counter log.
metricWriteErrorsCorrectedByEccFast = prometheus.NewDesc(
"smartctl_write_errors_corrected_by_eccfast",
"Write Errors Corrected by ECC Fast",
[]string{
"device",
},
// No constant labels.
nil,
)
// metricWriteErrorsCorrectedByEccDelayed describes the
// smartctl_write_errors_corrected_by_eccdelayed metric, which carries a single
// "device" label. mineSCSIErrorCounterLog emits it as a gauge taken from the
// write.errors_corrected_by_eccdelayed field of the SCSI error counter log.
metricWriteErrorsCorrectedByEccDelayed = prometheus.NewDesc(
"smartctl_write_errors_corrected_by_eccdelayed",
"Write Errors Corrected by ECC Delayed",
[]string{
"device",
},
// No constant labels.
nil,
)
metricWriteTotalUncorrectedErrors = prometheus.NewDesc( metricWriteTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_write_total_uncorrected_errors", "smartctl_write_total_uncorrected_errors",
"Write Total Uncorrected Errors", "Write Total Uncorrected Errors",

View File

@ -29,6 +29,9 @@ type SMARTDevice struct {
serial string serial string
family string family string
model string model string
// These are used to select types of metrics.
interface_ string
protocol string
} }
// SMARTctl object // SMARTctl object
@ -41,15 +44,28 @@ type SMARTctl struct {
// NewSMARTctl is smartctl constructor // NewSMARTctl is smartctl constructor
func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl { func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl {
var model_name string
if obj := json.Get("model_name"); obj.Exists() {
model_name = obj.String()
} else if obj := json.Get("scsi_model_name"); obj.Exists() {
model_name = obj.String()
}
// If the drive returns an empty model name, replace that with unknown.
if model_name == "" {
model_name = "unknown"
}
return SMARTctl{ return SMARTctl{
ch: ch, ch: ch,
json: json, json: json,
logger: logger, logger: logger,
device: SMARTDevice{ device: SMARTDevice{
device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"), device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"),
serial: strings.TrimSpace(json.Get("serial_number").String()), serial: strings.TrimSpace(json.Get("serial_number").String()),
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
model: strings.TrimSpace(json.Get("model_name").String()), model: strings.TrimSpace(model_name),
interface_: strings.TrimSpace(json.Get("device.type").String()),
protocol: strings.TrimSpace(json.Get("device.protocol").String()),
}, },
} }
} }
@ -66,23 +82,31 @@ func (smart *SMARTctl) Collect() {
smart.minePowerOnSeconds() smart.minePowerOnSeconds()
smart.mineRotationRate() smart.mineRotationRate()
smart.mineTemperatures() smart.mineTemperatures()
smart.minePowerCycleCount() smart.minePowerCycleCount() // ATA/SATA, NVME, SCSI, SAS
smart.mineDeviceSCTStatus() smart.mineDeviceSCTStatus()
smart.mineDeviceStatistics() smart.mineDeviceStatistics()
smart.mineDeviceErrorLog() smart.mineDeviceErrorLog()
smart.mineDeviceSelfTestLog() smart.mineDeviceSelfTestLog()
smart.mineDeviceERC() smart.mineDeviceERC()
smart.mineNvmePercentageUsed()
smart.mineNvmeAvailableSpare()
smart.mineNvmeAvailableSpareThreshold()
smart.mineNvmeCriticalWarning()
smart.mineNvmeMediaErrors()
smart.mineNvmeNumErrLogEntries()
smart.mineNvmeBytesRead()
smart.mineNvmeBytesWritten()
smart.mineSmartStatus() smart.mineSmartStatus()
smart.mineSCSIGrownDefectList()
smart.mineSCSIErrorCounterLog() if smart.device.interface_ == "nvme" {
smart.mineNvmePercentageUsed()
smart.mineNvmeAvailableSpare()
smart.mineNvmeAvailableSpareThreshold()
smart.mineNvmeCriticalWarning()
smart.mineNvmeMediaErrors()
smart.mineNvmeNumErrLogEntries()
smart.mineNvmeBytesRead()
smart.mineNvmeBytesWritten()
}
// SCSI, SAS
if smart.device.interface_ == "scsi" {
smart.mineSCSIGrownDefectList()
smart.mineSCSIErrorCounterLog()
smart.mineSCSIBytesRead()
smart.mineSCSIBytesWritten()
}
} }
func (smart *SMARTctl) mineExitStatus() { func (smart *SMARTctl) mineExitStatus() {
@ -95,14 +119,13 @@ func (smart *SMARTctl) mineExitStatus() {
} }
func (smart *SMARTctl) mineDevice() { func (smart *SMARTctl) mineDevice() {
device := smart.json.Get("device")
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceModel, metricDeviceModel,
prometheus.GaugeValue, prometheus.GaugeValue,
1, 1,
smart.device.device, smart.device.device,
device.Get("type").String(), smart.device.interface_,
device.Get("protocol").String(), smart.device.protocol,
smart.device.family, smart.device.family,
smart.device.model, smart.device.model,
smart.device.serial, smart.device.serial,
@ -111,6 +134,11 @@ func (smart *SMARTctl) mineDevice() {
smart.json.Get("ata_version.string").String(), smart.json.Get("ata_version.string").String(),
smart.json.Get("sata_version.string").String(), smart.json.Get("sata_version.string").String(),
smart.json.Get("form_factor.name").String(), smart.json.Get("form_factor.name").String(),
// scsi_model_name is mapped into model_name
smart.json.Get("scsi_vendor").String(),
smart.json.Get("scsi_product").String(),
smart.json.Get("scsi_revision").String(),
smart.json.Get("scsi_version").String(),
) )
} }
@ -130,12 +158,15 @@ func (smart *SMARTctl) mineCapacity() {
smart.json.Get("user_capacity.bytes").Float(), smart.json.Get("user_capacity.bytes").Float(),
smart.device.device, smart.device.device,
) )
smart.ch <- prometheus.MustNewConstMetric( nvme_total_capacity := smart.json.Get("nvme_total_capacity")
metricDeviceTotalCapacityBytes, if nvme_total_capacity.Exists() {
prometheus.GaugeValue, smart.ch <- prometheus.MustNewConstMetric(
smart.json.Get("nvme_total_capacity").Float(), metricDeviceTotalCapacityBytes,
smart.device.device, prometheus.GaugeValue,
) nvme_total_capacity.Float(),
smart.device.device,
)
}
} }
func (smart *SMARTctl) mineBlockSize() { func (smart *SMARTctl) mineBlockSize() {
@ -151,16 +182,21 @@ func (smart *SMARTctl) mineBlockSize() {
} }
func (smart *SMARTctl) mineInterfaceSpeed() { func (smart *SMARTctl) mineInterfaceSpeed() {
// TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate
iSpeed := smart.json.Get("interface_speed") iSpeed := smart.json.Get("interface_speed")
for _, speedType := range []string{"max", "current"} { if iSpeed.Exists() {
tSpeed := iSpeed.Get(speedType) for _, speedType := range []string{"max", "current"} {
smart.ch <- prometheus.MustNewConstMetric( tSpeed := iSpeed.Get(speedType)
metricDeviceInterfaceSpeed, if tSpeed.Exists() {
prometheus.GaugeValue, smart.ch <- prometheus.MustNewConstMetric(
tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(), metricDeviceInterfaceSpeed,
smart.device.device, prometheus.GaugeValue,
speedType, tSpeed.Get("units_per_second").Float()*tSpeed.Get("bits_per_unit").Float(),
) smart.device.device,
speedType,
)
}
}
} }
} }
@ -200,16 +236,21 @@ func (smart *SMARTctl) mineDeviceAttribute() {
func (smart *SMARTctl) minePowerOnSeconds() { func (smart *SMARTctl) minePowerOnSeconds() {
pot := smart.json.Get("power_on_time") pot := smart.json.Get("power_on_time")
smart.ch <- prometheus.MustNewConstMetric( // If the power_on_time is NOT present, do not report as 0.
metricDevicePowerOnSeconds, if pot.Exists() {
prometheus.CounterValue, smart.ch <- prometheus.MustNewConstMetric(
GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60, metricDevicePowerOnSeconds,
smart.device.device, prometheus.CounterValue,
) GetFloatIfExists(pot, "hours", 0)*60*60+GetFloatIfExists(pot, "minutes", 0)*60,
smart.device.device,
)
}
} }
func (smart *SMARTctl) mineRotationRate() { func (smart *SMARTctl) mineRotationRate() {
rRate := GetFloatIfExists(smart.json, "rotation_rate", 0) rRate := GetFloatIfExists(smart.json, "rotation_rate", 0)
// TODO: what should be done if this is absent vs really zero (for
// solid-state drives)?
if rRate > 0 { if rRate > 0 {
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceRotationRate, metricDeviceRotationRate,
@ -222,6 +263,7 @@ func (smart *SMARTctl) mineRotationRate() {
func (smart *SMARTctl) mineTemperatures() { func (smart *SMARTctl) mineTemperatures() {
temperatures := smart.json.Get("temperature") temperatures := smart.json.Get("temperature")
// TODO: Implement scsi_environmental_reports
if temperatures.Exists() { if temperatures.Exists() {
temperatures.ForEach(func(key, value gjson.Result) bool { temperatures.ForEach(func(key, value gjson.Result) bool {
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
@ -237,12 +279,29 @@ func (smart *SMARTctl) mineTemperatures() {
} }
func (smart *SMARTctl) minePowerCycleCount() { func (smart *SMARTctl) minePowerCycleCount() {
smart.ch <- prometheus.MustNewConstMetric( // ATA & NVME
metricDevicePowerCycleCount, powerCycleCount := smart.json.Get("power_cycle_count")
prometheus.CounterValue, if powerCycleCount.Exists() {
smart.json.Get("power_cycle_count").Float(), smart.ch <- prometheus.MustNewConstMetric(
smart.device.device, metricDevicePowerCycleCount,
) prometheus.CounterValue,
powerCycleCount.Float(),
smart.device.device,
)
return
}
// SCSI
powerCycleCount = smart.json.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles")
if powerCycleCount.Exists() {
smart.ch <- prometheus.MustNewConstMetric(
metricDevicePowerCycleCount,
prometheus.CounterValue,
powerCycleCount.Float(),
smart.device.device,
)
return
}
} }
func (smart *SMARTctl) mineDeviceSCTStatus() { func (smart *SMARTctl) mineDeviceSCTStatus() {
@ -312,29 +371,67 @@ func (smart *SMARTctl) mineNvmeNumErrLogEntries() {
} }
func (smart *SMARTctl) mineNvmeBytesRead() { func (smart *SMARTctl) mineNvmeBytesRead() {
blockSize := smart.json.Get("logical_block_size").Float() blockSize := smart.json.Get("logical_block_size")
data_units_read := smart.json.Get("nvme_smart_health_information_log.data_units_read")
if !blockSize.Exists() || !data_units_read.Exists() {
return
}
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceBytesRead, metricDeviceBytesRead,
prometheus.CounterValue, prometheus.CounterValue,
// This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up.
// When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units.
smart.json.Get("nvme_smart_health_information_log.data_units_read").Float()*1000.0*blockSize, data_units_read.Float()*1000.0*blockSize.Float(),
smart.device.device, smart.device.device,
) )
} }
func (smart *SMARTctl) mineNvmeBytesWritten() { func (smart *SMARTctl) mineNvmeBytesWritten() {
blockSize := smart.json.Get("logical_block_size").Float() blockSize := smart.json.Get("logical_block_size")
data_units_written := smart.json.Get("nvme_smart_health_information_log.data_units_written")
if !blockSize.Exists() || !data_units_written.Exists() {
return
}
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceBytesWritten, metricDeviceBytesWritten,
prometheus.CounterValue, prometheus.CounterValue,
// This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up. // This value is reported in thousands (i.e., a value of 1 corresponds to 1000 units of 512 bytes written) and is rounded up.
// When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units. // When the LBA size is a value other than 512 bytes, the controller shall convert the amount of data written to 512 byte units.
smart.json.Get("nvme_smart_health_information_log.data_units_written").Float()*1000.0*blockSize, data_units_written.Float()*1000.0*blockSize.Float(),
smart.device.device, smart.device.device,
) )
} }
// mineSCSIBytesRead emits metricDeviceBytesRead for the device from the
// scsi_error_counter_log.read.gigabytes_processed field of the smartctl JSON.
//
// Nothing is emitted when that field is absent (including when the whole
// scsi_error_counter_log object is missing), so a drive that does not report
// the value is not exported as having read 0 bytes.
func (smart *SMARTctl) mineSCSIBytesRead() {
	gigabytesRead := smart.json.Get("scsi_error_counter_log.read.gigabytes_processed")
	if !gigabytesRead.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesRead,
		prometheus.CounterValue,
		// This value is reported by SMARTctl in GB [10^9].
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl
		gigabytesRead.Float()*1e9,
		smart.device.device,
	)
}
// mineSCSIBytesWritten emits metricDeviceBytesWritten for the device from the
// scsi_error_counter_log.write.gigabytes_processed field of the smartctl JSON.
//
// Nothing is emitted when that field is absent (including when the whole
// scsi_error_counter_log object is missing), so a drive that does not report
// the value is not exported as having written 0 bytes.
func (smart *SMARTctl) mineSCSIBytesWritten() {
	gigabytesWritten := smart.json.Get("scsi_error_counter_log.write.gigabytes_processed")
	if !gigabytesWritten.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesWritten,
		prometheus.CounterValue,
		// This value is reported by SMARTctl in GB [10^9].
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl
		gigabytesWritten.Float()*1e9,
		smart.device.device,
	)
}
func (smart *SMARTctl) mineSmartStatus() { func (smart *SMARTctl) mineSmartStatus() {
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricDeviceSmartStatus, metricDeviceSmartStatus,
@ -460,6 +557,18 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(), SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device, smart.device.device,
) )
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricReadTotalUncorrectedErrors, metricReadTotalUncorrectedErrors,
prometheus.GaugeValue, prometheus.GaugeValue,
@ -472,11 +581,24 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(), SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device, smart.device.device,
) )
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric( smart.ch <- prometheus.MustNewConstMetric(
metricWriteTotalUncorrectedErrors, metricWriteTotalUncorrectedErrors,
prometheus.GaugeValue, prometheus.GaugeValue,
SCSIHealth.Get("write.total_uncorrected_errors").Float(), SCSIHealth.Get("write.total_uncorrected_errors").Float(),
smart.device.device, smart.device.device,
) )
// TODO: Should we also export the verify category?
} }
} }