diff --git a/.gitignore b/.gitignore
index 4e0aca0..d854cb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@
 bin
 *.json
 Manifest
+.idea
+smartctl_exporter
diff --git a/main.go b/main.go
index 4c769b4..9cea81c 100644
--- a/main.go
+++ b/main.go
@@ -36,6 +36,18 @@ func (i SMARTctlManagerCollector) Collect(ch chan<- prometheus.Metric) {
 
 func init() {
 	options = loadOptions()
+
+	// With no devices configured, fall back to whatever smartctl's own
+	// scan reports so the exporter works without a hand-written list.
+	if len(options.SMARTctl.Devices) == 0 {
+		logger.Debug("No devices specified, trying to load them automatically")
+		scan := readSMARTctlDevices()
+		for _, d := range scan.Get("devices").Array() {
+			device := d.Get("name").String()
+			logger.Debug("Found device: %s", device)
+			options.SMARTctl.Devices = append(options.SMARTctl.Devices, device)
+		}
+	}
 }
 
 func main() {
diff --git a/metrics.go b/metrics.go
index 98ef3ee..a638ba5 100644
--- a/metrics.go
+++ b/metrics.go
@@ -140,6 +140,105 @@ var (
 		},
 		nil,
 	)
+	metricDevicePercentageUsed = prometheus.NewDesc(
+		"smartctl_device_percentage_used",
+		"Device write percentage used",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceAvailableSpare = prometheus.NewDesc(
+		"smartctl_device_available_spare",
+		"Normalized percentage (0 to 100%) of the remaining spare capacity available",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceAvailableSpareThreshold = prometheus.NewDesc(
+		"smartctl_device_available_spare_threshold",
+		"When the Available Spare falls below the threshold indicated in this field, an asynchronous event completion may occur. The value is indicated as a normalized percentage (0 to 100%)",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceCriticalWarning = prometheus.NewDesc(
+		"smartctl_device_critical_warning",
+		"This field indicates critical warnings for the state of the controller",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceMediaErrors = prometheus.NewDesc(
+		"smartctl_device_media_errors",
+		"Contains the number of occurrences where the controller detected an unrecovered data integrity error. Errors such as uncorrectable ECC, CRC checksum failure, or LBA tag mismatch are included in this field",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceNumErrLogEntries = prometheus.NewDesc(
+		"smartctl_device_num_err_log_entries",
+		"Contains the number of Error Information log entries over the life of the controller",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceBytesRead = prometheus.NewDesc(
+		"smartctl_device_bytes_read",
+		"Total bytes the host has read from the device, converted from the NVMe data_units_read counter",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceBytesWritten = prometheus.NewDesc(
+		"smartctl_device_bytes_written",
+		"Total bytes the host has written to the device, converted from the NVMe data_units_written counter",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
+	metricDeviceSmartStatus = prometheus.NewDesc(
+		"smartctl_device_smart_status",
+		"General smart status",
+		[]string{
+			"device",
+			"model_family",
+			"model_name",
+			"serial_number",
+		},
+		nil,
+	)
 	metricDeviceExitStatus = prometheus.NewDesc(
 		"smartctl_device_smartctl_exit_status",
 		"Exit status of smartctl on device",
diff --git a/readjson.go b/readjson.go
index 39141e1..50bfdc1 100644
--- a/readjson.go
+++ b/readjson.go
@@ -55,6 +55,18 @@ func readSMARTctl(device string) gjson.Result {
 	return parseJSON(string(out))
 }
 
+// readSMARTctlDevices runs smartctl with --json --scan-open and returns the
+// parsed output. A command error is only logged; whatever reached stdout is
+// still passed to parseJSON.
+func readSMARTctlDevices() gjson.Result {
+	logger.Debug("Collecting devices")
+	out, err := exec.Command(options.SMARTctl.SMARTctlLocation, "--json", "--scan-open").Output()
+	if err != nil {
+		logger.Warning("S.M.A.R.T. output reading error: %s", err)
+	}
+	return parseJSON(string(out))
+}
+
 // Select json source and parse
 func readData(device string) gjson.Result {
 	if options.SMARTctl.FakeJSON {
diff --git a/smartctl.go b/smartctl.go
index 331ea96..56faf53 100644
--- a/smartctl.go
+++ b/smartctl.go
@@ -57,6 +57,16 @@ func (smart *SMARTctl) Collect() {
 	smart.mineDeviceErrorLog()
 	smart.mineDeviceSelfTestLog()
 	smart.mineDeviceERC()
+	// NVMe health-log metrics, followed by the overall SMART status.
+	smart.minePercentageUsed()
+	smart.mineAvailableSpare()
+	smart.mineAvailableSpareThreshold()
+	smart.mineCriticalWarning()
+	smart.mineMediaErrors()
+	smart.mineNumErrLogEntries()
+	smart.mineBytesRead()
+	smart.mineBytesWritten()
+	smart.mineSmartStatus()
 }
 
 func (smart *SMARTctl) mineExitStatus() {
@@ -252,6 +262,131 @@ func (smart *SMARTctl) mineDeviceSCTStatus() {
 	}
 }
 
+// minePercentageUsed exports percentage_used from the NVMe health log.
+func (smart *SMARTctl) minePercentageUsed() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDevicePercentageUsed,
+		prometheus.CounterValue,
+		smart.json.Get("nvme_smart_health_information_log.percentage_used").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineAvailableSpare exports available_spare from the NVMe health log. The
+// remaining spare capacity can go down, so it is reported as a gauge.
+func (smart *SMARTctl) mineAvailableSpare() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceAvailableSpare,
+		prometheus.GaugeValue,
+		smart.json.Get("nvme_smart_health_information_log.available_spare").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineAvailableSpareThreshold exports available_spare_threshold from the NVMe
+// health log as a gauge; it is a level, not an accumulating count.
+func (smart *SMARTctl) mineAvailableSpareThreshold() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceAvailableSpareThreshold,
+		prometheus.GaugeValue,
+		smart.json.Get("nvme_smart_health_information_log.available_spare_threshold").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineCriticalWarning exports critical_warning from the NVMe health log. The
+// field reflects current controller state, so it is reported as a gauge.
+func (smart *SMARTctl) mineCriticalWarning() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceCriticalWarning,
+		prometheus.GaugeValue,
+		smart.json.Get("nvme_smart_health_information_log.critical_warning").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineMediaErrors exports the media_errors count from the NVMe health log.
+func (smart *SMARTctl) mineMediaErrors() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceMediaErrors,
+		prometheus.CounterValue,
+		smart.json.Get("nvme_smart_health_information_log.media_errors").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineNumErrLogEntries exports num_err_log_entries from the NVMe health log.
+func (smart *SMARTctl) mineNumErrLogEntries() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceNumErrLogEntries,
+		prometheus.CounterValue,
+		smart.json.Get("nvme_smart_health_information_log.num_err_log_entries").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// nvmeDataUnitBytes is the number of bytes in one NVMe data unit. The health
+// log reports data_units_read/written in thousands of 512-byte units, and
+// that unit does not depend on the namespace's logical block size.
+const nvmeDataUnitBytes = 512 * 1000
+
+// mineBytesRead exports host reads in bytes, converted from data_units_read.
+func (smart *SMARTctl) mineBytesRead() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceBytesRead,
+		prometheus.CounterValue,
+		smart.json.Get("nvme_smart_health_information_log.data_units_read").Float()*nvmeDataUnitBytes,
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineBytesWritten exports host writes in bytes, converted from data_units_written.
+func (smart *SMARTctl) mineBytesWritten() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceBytesWritten,
+		prometheus.CounterValue,
+		smart.json.Get("nvme_smart_health_information_log.data_units_written").Float()*nvmeDataUnitBytes,
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
+// mineSmartStatus exports smart_status.passed as a gauge (1 when true).
+func (smart *SMARTctl) mineSmartStatus() {
+	smart.ch <- prometheus.MustNewConstMetric(
+		metricDeviceSmartStatus,
+		prometheus.GaugeValue,
+		smart.json.Get("smart_status.passed").Float(),
+		smart.device.device,
+		smart.device.family,
+		smart.device.model,
+		smart.device.serial,
+	)
+}
+
 func (smart *SMARTctl) mineDeviceStatistics() {
 	for _, page := range smart.json.Get("ata_device_statistics.pages").Array() {
 		table := strings.TrimSpace(page.Get("name").String())
diff --git a/smartctl_exporter.yaml b/smartctl_exporter.yaml
old mode 100644
new mode 100755
index 7740083..f0cbf2f
--- a/smartctl_exporter.yaml
+++ b/smartctl_exporter.yaml
@@ -1,13 +1,11 @@
 smartctl_exporter:
-  bind_to: "[::1]:9633"
+  bind_to: "0.0.0.0:9633"
   url_path: "/metrics"
   fake_json: no
   smartctl_location: /usr/sbin/smartctl
   collect_not_more_than_period: 20s
-  devices:
-  - /dev/sda
-  - /dev/sdb
-  - /dev/sdc
-  - /dev/sdd
-  - /dev/sde
-  - /dev/sdf
+#  devices:
+#  - /dev/nvme0
+#  - /dev/nvme1
+#  - /dev/nvme2
+#  - /dev/nvme3