smartctl_exporter/smartctl.go
Robin H. Johnson 9113c6cf0f feat: Better SCSI/SAS support
Fix the following metrics that were exported as zero because the
exporter did not know how to read them for SCSI devices:
- smartctl_device_bytes_read
- smartctl_device_bytes_written
- smartctl_device_power_cycle_count

New metrics:
- smartctl_read_errors_corrected_by_eccdelayed
- smartctl_read_errors_corrected_by_eccfast
- smartctl_write_errors_corrected_by_eccdelayed
- smartctl_write_errors_corrected_by_eccfast

Fix labels:
- smartctl_device{model_name} is now populated for SCSI/SAS, using
  scsi_model_name.

New labels:
- smartctl_device{} gains:
  scsi_product,scsi_revision,scsi_vendor,scsi_version

Signed-off-by: Robin H. Johnson <rjohnson@coreweave.com>
2023-10-16 10:15:57 -07:00

604 lines
18 KiB
Go

// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"fmt"
"strings"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/tidwall/gjson"
)
// SMARTDevice holds the short identifying information about one device. The
// fields are filled in by NewSMARTctl from the smartctl JSON output and are
// used as label values on the exported metrics.
type SMARTDevice struct {
	// device is device.name with surrounding whitespace and a leading
	// "/dev/" removed.
	device string
	// serial is the trimmed serial_number.
	serial string
	// family is the trimmed model_family, with "unknown" as the fallback
	// passed to GetStringIfExists.
	family string
	// model is model_name, falling back to scsi_model_name and then to
	// "unknown".
	model string
	// These are used to select types of metrics.
	// interface_ is device.type; Collect compares it against "nvme" and
	// "scsi" to decide which extra metrics to mine.
	interface_ string
	// protocol is device.protocol.
	protocol string
}
// SMARTctl bundles the smartctl JSON output for a single device with the
// channel that the metrics mined from it are sent to.
type SMARTctl struct {
	// ch receives every metric produced by Collect and the mine* helpers.
	ch chan<- prometheus.Metric
	// json is the smartctl JSON document that all metrics are read from.
	json gjson.Result
	// logger is used for debug output while collecting.
	logger log.Logger
	// device is the identifying information extracted from json by
	// NewSMARTctl.
	device SMARTDevice
}
// NewSMARTctl is the SMARTctl constructor.
//
// It stores logger, json and ch unchanged and derives the SMARTDevice
// description from json:
//   - device:     device.name, trimmed, without a leading "/dev/"
//   - serial:     serial_number, trimmed
//   - family:     model_family (default "unknown"), trimmed
//   - model:      model_name, else scsi_model_name, else "unknown"
//   - interface_: device.type, trimmed
//   - protocol:   device.protocol, trimmed
func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl {
	// model_name is preferred; scsi_model_name is the fallback so that
	// SCSI/SAS devices also get a populated model label.
	var modelName string
	if obj := json.Get("model_name"); obj.Exists() {
		modelName = obj.String()
	} else if obj := json.Get("scsi_model_name"); obj.Exists() {
		modelName = obj.String()
	}
	// Trim before the emptiness check: a drive that returns only whitespace
	// must end up as "unknown" too, not as an empty label value.
	modelName = strings.TrimSpace(modelName)
	if modelName == "" {
		modelName = "unknown"
	}

	return SMARTctl{
		ch:     ch,
		json:   json,
		logger: logger,
		device: SMARTDevice{
			device:     strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"),
			serial:     strings.TrimSpace(json.Get("serial_number").String()),
			family:     strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
			model:      modelName,
			interface_: strings.TrimSpace(json.Get("device.type").String()),
			protocol:   strings.TrimSpace(json.Get("device.protocol").String()),
		},
	}
}
// Collect mines every supported metric from the device's smartctl JSON and
// sends it to the metrics channel. The miners that apply to all devices run
// first, in a fixed order, followed by the NVMe- or SCSI-specific ones chosen
// by the device interface type.
func (smart *SMARTctl) Collect() {
	level.Debug(smart.logger).Log("msg", "Collecting metrics from", "device", smart.device.device, "family", smart.device.family, "model", smart.device.model)

	// Miners that are run regardless of the interface type.
	always := []func(){
		smart.mineExitStatus,
		smart.mineDevice,
		smart.mineCapacity,
		smart.mineBlockSize,
		smart.mineInterfaceSpeed,
		smart.mineDeviceAttribute,
		smart.minePowerOnSeconds,
		smart.mineRotationRate,
		smart.mineTemperatures,
		smart.minePowerCycleCount, // ATA/SATA, NVME, SCSI, SAS
		smart.mineDeviceSCTStatus,
		smart.mineDeviceStatistics,
		smart.mineDeviceErrorLog,
		smart.mineDeviceSelfTestLog,
		smart.mineDeviceERC,
		smart.mineSmartStatus,
	}
	for _, mine := range always {
		mine()
	}

	switch smart.device.interface_ {
	case "nvme":
		smart.mineNvmePercentageUsed()
		smart.mineNvmeAvailableSpare()
		smart.mineNvmeAvailableSpareThreshold()
		smart.mineNvmeCriticalWarning()
		smart.mineNvmeMediaErrors()
		smart.mineNvmeNumErrLogEntries()
		smart.mineNvmeBytesRead()
		smart.mineNvmeBytesWritten()
	case "scsi":
		// SCSI, SAS
		smart.mineSCSIGrownDefectList()
		smart.mineSCSIErrorCounterLog()
		smart.mineSCSIBytesRead()
		smart.mineSCSIBytesWritten()
	}
}
// mineExitStatus sends smartctl.exit_status as a gauge labelled with the
// device name.
func (smart *SMARTctl) mineExitStatus() {
	exitStatus := smart.json.Get("smartctl.exit_status").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceExitStatus, prometheus.GaugeValue, exitStatus, smart.device.device)
}
// mineDevice sends the device info metric: a gauge that is always 1 and whose
// labels carry the identifying strings of the device.
func (smart *SMARTctl) mineDevice() {
	dev := smart.device
	js := smart.json

	// The order of the label values must match the order of the label names
	// in metricDeviceModel.
	labelValues := []string{
		dev.device,
		dev.interface_,
		dev.protocol,
		dev.family,
		dev.model,
		dev.serial,
		GetStringIfExists(js, "ata_additional_product_id", "unknown"),
		js.Get("firmware_version").String(),
		js.Get("ata_version.string").String(),
		js.Get("sata_version.string").String(),
		js.Get("form_factor.name").String(),
		// scsi_model_name is mapped into model_name
		js.Get("scsi_vendor").String(),
		js.Get("scsi_product").String(),
		js.Get("scsi_revision").String(),
		js.Get("scsi_version").String(),
	}

	smart.ch <- prometheus.MustNewConstMetric(metricDeviceModel, prometheus.GaugeValue, 1, labelValues...)
}
// mineCapacity sends the user capacity in blocks and in bytes and, when
// nvme_total_capacity is present, the total NVMe capacity in bytes.
func (smart *SMARTctl) mineCapacity() {
	name := smart.device.device

	// The user_capacity exists only when NVMe have single namespace. Otherwise,
	// for NVMe devices with multiple namespaces, when device name used without
	// namespace number (exporter case) user_capacity will be absent
	blocks := smart.json.Get("user_capacity.blocks").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceCapacityBlocks, prometheus.GaugeValue, blocks, name)

	bytes := smart.json.Get("user_capacity.bytes").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceCapacityBytes, prometheus.GaugeValue, bytes, name)

	if total := smart.json.Get("nvme_total_capacity"); total.Exists() {
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceTotalCapacityBytes, prometheus.GaugeValue, total.Float(), name)
	}
}
// mineBlockSize sends one block-size gauge for each of logical_block_size and
// physical_block_size, labelled with the device name and the block type.
func (smart *SMARTctl) mineBlockSize() {
	blockTypes := [...]string{"logical", "physical"}
	for i := range blockTypes {
		kind := blockTypes[i]
		path := fmt.Sprintf("%s_block_size", kind)
		size := smart.json.Get(path).Float()
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceBlockSize, prometheus.GaugeValue, size, smart.device.device, kind)
	}
}
// mineInterfaceSpeed sends the "max" and "current" interface speeds found
// under interface_speed. Each speed is units_per_second multiplied by
// bits_per_unit; speed types that are absent are skipped.
func (smart *SMARTctl) mineInterfaceSpeed() {
	// TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate
	speeds := smart.json.Get("interface_speed")
	if !speeds.Exists() {
		return
	}

	for _, kind := range []string{"max", "current"} {
		entry := speeds.Get(kind)
		if !entry.Exists() {
			continue
		}
		bitsPerSecond := entry.Get("units_per_second").Float() * entry.Get("bits_per_unit").Float()
		smart.ch <- prometheus.MustNewConstMetric(
			metricDeviceInterfaceSpeed,
			prometheus.GaugeValue,
			bitsPerSecond,
			smart.device.device,
			kind,
		)
	}
}
// mineDeviceAttribute sends four gauges (value, worst, thresh, raw) for every
// entry of ata_smart_attributes.table. Each gauge is labelled with the device
// name, the attribute name, its short and long flag strings, the value type
// and the attribute id.
func (smart *SMARTctl) mineDeviceAttribute() {
	// Value-type label and the JSON path, relative to the attribute, that it
	// is read from. This is a fixed table built once rather than a map
	// rebuilt per attribute, so the emission order is also deterministic.
	valuePaths := [...]struct{ key, path string }{
		{"value", "value"},
		{"worst", "worst"},
		{"thresh", "thresh"},
		{"raw", "raw.value"},
	}
	// Flag names that are folded into the long flags label.
	longFlagNames := []string{
		"prefailure",
		"updated_online",
		"performance",
		"error_rate",
		"event_count",
		"auto_keep",
	}

	for _, attribute := range smart.json.Get("ata_smart_attributes.table").Array() {
		name := strings.TrimSpace(attribute.Get("name").String())
		flagsShort := strings.TrimSpace(attribute.Get("flags.string").String())
		flagsLong := smart.mineLongFlags(attribute.Get("flags"), longFlagNames)
		id := attribute.Get("id").String()

		for _, vp := range valuePaths {
			smart.ch <- prometheus.MustNewConstMetric(
				metricDeviceAttribute,
				prometheus.GaugeValue,
				attribute.Get(vp.path).Float(),
				smart.device.device,
				name,
				flagsShort,
				flagsLong,
				vp.key,
				id,
			)
		}
	}
}
// minePowerOnSeconds converts power_on_time (hours and minutes) to seconds
// and sends it as a counter.
func (smart *SMARTctl) minePowerOnSeconds() {
	powerOnTime := smart.json.Get("power_on_time")
	// If the power_on_time is NOT present, do not report as 0.
	if !powerOnTime.Exists() {
		return
	}

	hours := GetFloatIfExists(powerOnTime, "hours", 0)
	minutes := GetFloatIfExists(powerOnTime, "minutes", 0)
	smart.ch <- prometheus.MustNewConstMetric(
		metricDevicePowerOnSeconds,
		prometheus.CounterValue,
		hours*60*60+minutes*60,
		smart.device.device,
	)
}
// mineRotationRate sends rotation_rate as a gauge, but only when the value is
// greater than zero.
func (smart *SMARTctl) mineRotationRate() {
	// TODO: what should be done if this is absent vs really zero (for
	// solid-state drives)?
	if rate := GetFloatIfExists(smart.json, "rotation_rate", 0); rate > 0 {
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceRotationRate, prometheus.GaugeValue, rate, smart.device.device)
	}
}
// mineTemperatures sends one gauge per entry of the temperature object,
// labelled with the device name and the entry's key.
func (smart *SMARTctl) mineTemperatures() {
	// TODO: Implement scsi_environmental_reports
	readings := smart.json.Get("temperature")
	if !readings.Exists() {
		return
	}

	// Returning true keeps the iteration going over every entry.
	emit := func(kind, reading gjson.Result) bool {
		smart.ch <- prometheus.MustNewConstMetric(
			metricDeviceTemperature,
			prometheus.GaugeValue,
			reading.Float(),
			smart.device.device,
			kind.String(),
		)
		return true
	}
	readings.ForEach(emit)
}
// minePowerCycleCount sends the power cycle count as a counter. The first
// JSON path that exists wins; nothing is sent when neither is present.
func (smart *SMARTctl) minePowerCycleCount() {
	candidates := [...]string{
		// ATA & NVME
		"power_cycle_count",
		// SCSI
		"scsi_start_stop_cycle_counter.accumulated_start_stop_cycles",
	}

	for _, path := range candidates {
		count := smart.json.Get(path)
		if !count.Exists() {
			continue
		}
		smart.ch <- prometheus.MustNewConstMetric(
			metricDevicePowerCycleCount,
			prometheus.CounterValue,
			count.Float(),
			smart.device.device,
		)
		return
	}
}
// mineDeviceSCTStatus sends ata_sct_status.device_state as a gauge when the
// ata_sct_status object is present.
func (smart *SMARTctl) mineDeviceSCTStatus() {
	sct := smart.json.Get("ata_sct_status")
	if !sct.Exists() {
		return
	}
	state := sct.Get("device_state").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceState, prometheus.GaugeValue, state, smart.device.device)
}
// mineNvmePercentageUsed sends percentage_used from the NVMe SMART/health
// information log as a counter.
func (smart *SMARTctl) mineNvmePercentageUsed() {
	used := smart.json.Get("nvme_smart_health_information_log.percentage_used").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDevicePercentageUsed, prometheus.CounterValue, used, smart.device.device)
}
// mineNvmeAvailableSpare sends available_spare from the NVMe SMART/health
// information log.
//
// The value is exported as a gauge: it is the normalized percentage of
// remaining spare capacity, which falls as spare blocks are consumed, so it
// does not meet the never-decreasing contract of a Prometheus counter.
func (smart *SMARTctl) mineNvmeAvailableSpare() {
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceAvailableSpare,
		prometheus.GaugeValue,
		smart.json.Get("nvme_smart_health_information_log.available_spare").Float(),
		smart.device.device,
	)
}
// mineNvmeAvailableSpareThreshold sends available_spare_threshold from the
// NVMe SMART/health information log.
//
// The value is exported as a gauge: it is a threshold percentage that the
// available spare is compared against, not an accumulating count, so counter
// semantics (rate, increase) are meaningless for it.
func (smart *SMARTctl) mineNvmeAvailableSpareThreshold() {
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceAvailableSpareThreshold,
		prometheus.GaugeValue,
		smart.json.Get("nvme_smart_health_information_log.available_spare_threshold").Float(),
		smart.device.device,
	)
}
// mineNvmeCriticalWarning sends critical_warning from the NVMe SMART/health
// information log.
//
// The value is exported as a gauge: critical_warning is a bit field of
// warning conditions whose bits can be set and cleared again, so the number
// can go down and must not be typed as a Prometheus counter.
func (smart *SMARTctl) mineNvmeCriticalWarning() {
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceCriticalWarning,
		prometheus.GaugeValue,
		smart.json.Get("nvme_smart_health_information_log.critical_warning").Float(),
		smart.device.device,
	)
}
// mineNvmeMediaErrors sends media_errors from the NVMe SMART/health
// information log as a counter.
func (smart *SMARTctl) mineNvmeMediaErrors() {
	mediaErrors := smart.json.Get("nvme_smart_health_information_log.media_errors").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceMediaErrors, prometheus.CounterValue, mediaErrors, smart.device.device)
}
// mineNvmeNumErrLogEntries sends num_err_log_entries from the NVMe
// SMART/health information log as a counter.
func (smart *SMARTctl) mineNvmeNumErrLogEntries() {
	entries := smart.json.Get("nvme_smart_health_information_log.num_err_log_entries").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceNumErrLogEntries, prometheus.CounterValue, entries, smart.device.device)
}
// mineNvmeBytesRead sends the number of bytes read, derived from
// data_units_read in the NVMe SMART/health information log, as a counter.
// Nothing is sent when data_units_read is absent.
func (smart *SMARTctl) mineNvmeBytesRead() {
	// This value is reported in thousands (i.e., a value of 1 corresponds to
	// 1000 units of 512 bytes read) and is rounded up.
	// When the LBA size is a value other than 512 bytes, the controller shall
	// convert the amount of data read to 512 byte units.
	// The unit is therefore always 512 bytes and must NOT be replaced by the
	// logical block size, which would over-report on 4096-byte LBA formats.
	const bytesPerDataUnit = 1000.0 * 512.0

	dataUnitsRead := smart.json.Get("nvme_smart_health_information_log.data_units_read")
	if !dataUnitsRead.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesRead,
		prometheus.CounterValue,
		dataUnitsRead.Float()*bytesPerDataUnit,
		smart.device.device,
	)
}
// mineNvmeBytesWritten sends the number of bytes written, derived from
// data_units_written in the NVMe SMART/health information log, as a counter.
// Nothing is sent when data_units_written is absent.
func (smart *SMARTctl) mineNvmeBytesWritten() {
	// This value is reported in thousands (i.e., a value of 1 corresponds to
	// 1000 units of 512 bytes written) and is rounded up.
	// When the LBA size is a value other than 512 bytes, the controller shall
	// convert the amount of data written to 512 byte units.
	// The unit is therefore always 512 bytes and must NOT be replaced by the
	// logical block size, which would over-report on 4096-byte LBA formats.
	const bytesPerDataUnit = 1000.0 * 512.0

	dataUnitsWritten := smart.json.Get("nvme_smart_health_information_log.data_units_written")
	if !dataUnitsWritten.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesWritten,
		prometheus.CounterValue,
		dataUnitsWritten.Float()*bytesPerDataUnit,
		smart.device.device,
	)
}
// mineSCSIBytesRead sends the bytes-read counter for SCSI devices, computed
// from read.gigabytes_processed in scsi_error_counter_log. Nothing is sent
// when the log is absent.
func (smart *SMARTctl) mineSCSIBytesRead() {
	errorLog := smart.json.Get("scsi_error_counter_log")
	if !errorLog.Exists() {
		return
	}

	// This value is reported by SMARTctl in GB [10^9].
	// It is possible that some drives mis-report the value, but
	// that is not the responsibility of the exporter or smartctl
	gigabytes := errorLog.Get("read.gigabytes_processed").Float()
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesRead,
		prometheus.CounterValue,
		gigabytes*1e9,
		smart.device.device,
	)
}
// mineSCSIBytesWritten sends the bytes-written counter for SCSI devices,
// computed from write.gigabytes_processed in scsi_error_counter_log. Nothing
// is sent when the log is absent.
func (smart *SMARTctl) mineSCSIBytesWritten() {
	errorLog := smart.json.Get("scsi_error_counter_log")
	if !errorLog.Exists() {
		return
	}

	// This value is reported by SMARTctl in GB [10^9].
	// It is possible that some drives mis-report the value, but
	// that is not the responsibility of the exporter or smartctl
	gigabytes := errorLog.Get("write.gigabytes_processed").Float()
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesWritten,
		prometheus.CounterValue,
		gigabytes*1e9,
		smart.device.device,
	)
}
// mineSmartStatus sends smart_status.passed, converted to a float, as a gauge
// labelled with the device name.
func (smart *SMARTctl) mineSmartStatus() {
	passed := smart.json.Get("smart_status.passed").Float()
	smart.ch <- prometheus.MustNewConstMetric(metricDeviceSmartStatus, prometheus.GaugeValue, passed, smart.device.device)
}
// mineDeviceStatistics sends one gauge per statistic found in the pages of
// ata_device_statistics (except the vendor-specific page) and one gauge per
// entry of sata_phy_event_counters.table. The latter are reported under the
// fixed table name "SATA PHY Event Counters" with fixed flag labels.
func (smart *SMARTctl) mineDeviceStatistics() {
	device := smart.device.device
	// Flag names that are folded into the long flags label of a statistic.
	statFlagNames := []string{
		"valid",
		"normalized",
		"supports_dsn",
		"monitored_condition_met",
	}

	pages := smart.json.Get("ata_device_statistics.pages").Array()
	for _, page := range pages {
		pageName := strings.TrimSpace(page.Get("name").String())
		// skip vendor-specific statistics (they lead to duplicate metric labels on Seagate Exos drives,
		// see https://github.com/Sheridan/smartctl_exporter/issues/3 for details)
		if pageName == "Vendor Specific Statistics" {
			continue
		}
		for _, stat := range page.Get("table").Array() {
			statName := strings.TrimSpace(stat.Get("name").String())
			flagsShort := strings.TrimSpace(stat.Get("flags.string").String())
			flagsLong := smart.mineLongFlags(stat.Get("flags"), statFlagNames)
			smart.ch <- prometheus.MustNewConstMetric(
				metricDeviceStatistics,
				prometheus.GaugeValue,
				stat.Get("value").Float(),
				device,
				pageName,
				statName,
				flagsShort,
				flagsLong,
			)
		}
	}

	phyCounters := smart.json.Get("sata_phy_event_counters.table").Array()
	for _, counter := range phyCounters {
		counterName := strings.TrimSpace(counter.Get("name").String())
		smart.ch <- prometheus.MustNewConstMetric(
			metricDeviceStatistics,
			prometheus.GaugeValue,
			counter.Get("value").Float(),
			device,
			"SATA PHY Event Counters",
			counterName,
			"V---",
			"valid",
		)
	}
}
// mineLongFlags returns the comma-separated names of those entries of flags
// that exist in json and are true, in the order given by flags. The result is
// the empty string when no flag is set.
func (smart *SMARTctl) mineLongFlags(json gjson.Result, flags []string) string {
	var set []string
	for i := range flags {
		name := flags[i]
		if entry := json.Get(name); !entry.Exists() || !entry.Bool() {
			continue
		}
		set = append(set, name)
	}
	return strings.Join(set, ",")
}
// mineDeviceErrorLog sends the count of every log found under
// ata_smart_error_log as a gauge labelled with the device name and the log
// type (the key of the log in the JSON object).
func (smart *SMARTctl) mineDeviceErrorLog() {
	errorLogs := smart.json.Get("ata_smart_error_log").Map()
	for logType, entry := range errorLogs {
		count := entry.Get("count").Float()
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceErrorLogCount, prometheus.GaugeValue, count, smart.device.device, logType)
	}
}
// mineDeviceSelfTestLog sends, for every log found under
// ata_smart_self_test_log, its count and its error_count_total as gauges
// labelled with the device name and the log type.
func (smart *SMARTctl) mineDeviceSelfTestLog() {
	device := smart.device.device
	selfTestLogs := smart.json.Get("ata_smart_self_test_log").Map()
	for logType, entry := range selfTestLogs {
		count := entry.Get("count").Float()
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceSelfTestLogCount, prometheus.GaugeValue, count, device, logType)

		errorCount := entry.Get("error_count_total").Float()
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceSelfTestLogErrorCount, prometheus.GaugeValue, errorCount, device, logType)
	}
}
// mineDeviceERC sends, for every entry found under ata_sct_erc, its
// deciseconds value converted to seconds as a gauge labelled with the device
// name and the entry's key.
func (smart *SMARTctl) mineDeviceERC() {
	ercEntries := smart.json.Get("ata_sct_erc").Map()
	for ercType, entry := range ercEntries {
		// The JSON carries deciseconds; the metric is in seconds.
		seconds := entry.Get("deciseconds").Float() / 10.0
		smart.ch <- prometheus.MustNewConstMetric(metricDeviceERCSeconds, prometheus.GaugeValue, seconds, smart.device.device, ercType)
	}
}
// mineSCSIGrownDefectList sends scsi_grown_defect_list as a gauge when the
// field is present.
func (smart *SMARTctl) mineSCSIGrownDefectList() {
	grownDefects := smart.json.Get("scsi_grown_defect_list")
	if !grownDefects.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(metricSCSIGrownDefectList, prometheus.GaugeValue, grownDefects.Float(), smart.device.device)
}
// mineSCSIErrorCounterLog sends the read and write error counters found in
// scsi_error_counter_log as gauges labelled with the device name. Nothing is
// sent when the log is absent.
func (smart *SMARTctl) mineSCSIErrorCounterLog() {
	errorLog := smart.json.Get("scsi_error_counter_log")
	if !errorLog.Exists() {
		return
	}

	// Metric descriptor and the JSON path, relative to the log, that its
	// value is read from. The metrics are sent in this order.
	counters := [...]struct {
		desc *prometheus.Desc
		path string
	}{
		{metricReadErrorsCorrectedByRereadsRewrites, "read.errors_corrected_by_rereads_rewrites"},
		{metricReadErrorsCorrectedByEccFast, "read.errors_corrected_by_eccfast"},
		{metricReadErrorsCorrectedByEccDelayed, "read.errors_corrected_by_eccdelayed"},
		{metricReadTotalUncorrectedErrors, "read.total_uncorrected_errors"},
		{metricWriteErrorsCorrectedByRereadsRewrites, "write.errors_corrected_by_rereads_rewrites"},
		{metricWriteErrorsCorrectedByEccFast, "write.errors_corrected_by_eccfast"},
		{metricWriteErrorsCorrectedByEccDelayed, "write.errors_corrected_by_eccdelayed"},
		{metricWriteTotalUncorrectedErrors, "write.total_uncorrected_errors"},
		// TODO: Should we also export the verify category?
	}

	for _, counter := range counters {
		smart.ch <- prometheus.MustNewConstMetric(
			counter.desc,
			prometheus.GaugeValue,
			errorLog.Get(counter.path).Float(),
			smart.device.device,
		)
	}
}