From 7489f4f7aa027c165dc84106100efa84a37c9f36 Mon Sep 17 00:00:00 2001 From: Konstantin Shalygin Date: Thu, 19 Dec 2024 13:02:09 +0300 Subject: [PATCH] Add support for device types and predictable device paths (rebased) (#257) * Add better error logging on smartctl exec failure We will now log a warning if smartctl path passed via command line is invalid. Signed-off-by: Piotr Dobrowolski (cherry picked from commit 1c9c6943e8becf8b7365120c26d5d6db0101c7ed) * Add support for autoscan device types and predictable device paths This adds a new command line option allowing for customization of autodetected device types and enables use of special "by-id" device type that forces use of predictable device paths (/dev/disk/by-id/...) Relevant change to device name parsing regular expression is included now, so predictable device paths are now also usable when directly specified. Signed-off-by: Piotr Dobrowolski (cherry picked from commit 4c5f721e111e66742fd1243524ab4a07964074d0) Conflicts: - file: 'readjson.go' comment: 'manually resolve new logger issues' * Rework device label, fix SATA discovery, per-device type specification Signed-off-by: Piotr Dobrowolski (cherry picked from commit 319184ce66d88b706d4ce4939b68e0c68a712b0e) Conflicts: - file: 'main.go' comment: 'manually resolve new logger issues' - file: 'readjson.go' comment: 'manually resolve new logger issues' --------- Co-authored-by: Piotr Dobrowolski --- main.go | 79 +++++++++++++++++++++++++++++++++--------------- readjson.go | 36 ++++++++++++++-------- smartctl.go | 30 ++++++------------ smartctl_test.go | 44 +++++++++++++++++++++++++++ 4 files changed, 130 insertions(+), 59 deletions(-) create mode 100644 smartctl_test.go diff --git a/main.go b/main.go index 35528b9..3d8ff86 100644 --- a/main.go +++ b/main.go @@ -34,9 +34,13 @@ import ( // Device type Device struct { - Name string `json:"name"` - Info_Name string `json:"info_name"` - Type string `json:"type"` + Name string + Type string + Label string +} + +func (d Device) String() string { + return d.Name + ";" + d.Type + " (" + d.Label + ")" } // SMARTctlManagerCollector implements the Collector interface. @@ -80,6 +84,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() { time.Sleep(*smartctlRescanInterval) i.logger.Info("Rescanning for devices") devices := scanDevices(i.logger) + devices = buildDevicesFromFlag(devices) i.mutex.Lock() i.Devices = devices i.mutex.Unlock() @@ -96,8 +101,9 @@ var ( smartctlRescanInterval = kingpin.Flag("smartctl.rescan", "The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.", ).Default("10m").Duration() + smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool() smartctlDevices = kingpin.Flag("smartctl.device", - "The device to monitor (repeatable)", + "The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)", ).Strings() smartctlDeviceExclude = kingpin.Flag( "smartctl.device-exclude", @@ -107,6 +113,10 @@ var ( "smartctl.device-include", "Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)", ).Default("").String() + smartctlScanDeviceTypes = kingpin.Flag( + "smartctl.scan-device-type", + "Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)", + ).Strings() smartctlFakeData = kingpin.Flag("smartctl.fake-data", "The device to monitor (repeatable)", ).Default("false").Hidden().Bool() @@ -120,15 +130,23 @@ func scanDevices(logger *slog.Logger) []Device { scanDevices := json.Get("devices").Array() var scanDeviceResult []Device for _, d := range scanDevices { - deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String())) - if filter.ignored(deviceName) { - logger.Info("Ignoring device", "name", deviceName) + deviceName := d.Get("name").String() + deviceType := d.Get("type").String() + + // SATA devices are reported as SCSI during scan - fallback to auto scraping + if deviceType == "scsi" { + deviceType = "auto" + } + + deviceLabel := buildDeviceLabel(deviceName, deviceType) + if filter.ignored(deviceLabel) { + logger.Info("Ignoring device", "name", deviceLabel) } else { - logger.Info("Found device", "name", deviceName) + logger.Info("Found device", "name", deviceLabel) device := Device{ - Name: d.Get("name").String(), - Info_Name: deviceName, - Type: d.Get("type").String(), + Name: deviceName, + Type: deviceType, + Label: deviceLabel, } scanDeviceResult = append(scanDeviceResult, device) } @@ -136,18 +154,21 @@ func scanDevices(logger *slog.Logger) []Device { return scanDeviceResult } -func filterDevices(logger *slog.Logger, devices []Device, filters []string) []Device { - var filtered []Device - for _, d := range devices { - for _, filter := range filters { - logger.Debug("filterDevices", "device", d.Info_Name, "filter", filter) - if strings.Contains(d.Info_Name, filter) { - filtered = append(filtered, d) - break - } +func buildDevicesFromFlag(devices []Device) []Device { + // TODO: deduplication? + for _, device := range *smartctlDevices { + deviceName, deviceType, _ := strings.Cut(device, ";") + if deviceType == "" { + deviceType = "auto" } + + devices = append(devices, Device{ + Name: deviceName, + Type: deviceType, + Label: buildDeviceLabel(deviceName, deviceType), + }) } - return filtered + return devices } func main() { @@ -167,11 +188,19 @@ func main() { logger.Info("Build context", "build_context", version.BuildContext()) var devices []Device - devices = scanDevices(logger) - logger.Info("Number of devices found", "count", len(devices)) + + if len(*smartctlDevices) == 0 { + *smartctlScan = true + } + + if *smartctlScan { + devices = scanDevices(logger) + logger.Info("Number of devices found", "count", len(devices)) + } + if len(*smartctlDevices) > 0 { logger.Info("Devices specified", "devices", strings.Join(*smartctlDevices, ", ")) - devices = filterDevices(logger, devices, *smartctlDevices) + devices = buildDevicesFromFlag(devices) logger.Info("Devices filtered", "count", len(devices)) } @@ -180,7 +209,7 @@ func main() { logger: logger, } - if *smartctlRescanInterval >= 1*time.Second { + if *smartctlScan && *smartctlRescanInterval >= 1*time.Second { logger.Info("Start background scan process") logger.Info("Rescanning for devices every", "rescanInterval", *smartctlRescanInterval) go collector.RescanForDevices() diff --git a/readjson.go b/readjson.go index e163dcb..81f7b12 100644 --- a/readjson.go +++ b/readjson.go @@ -63,22 +63,29 @@ func readFakeSMARTctl(logger *slog.Logger, device Device) gjson.Result { // Get json from smartctl and parse it func readSMARTctl(logger *slog.Logger, device Device) (gjson.Result, bool) { start := time.Now() - out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output() + var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name} + + logger.Debug("Calling smartctl with args", "args", strings.Join(smartctlArgs, " ")) + out, err := exec.Command(*smartctlPath, smartctlArgs...).Output() if err != nil { - logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device.Info_Name) + logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device) } // Accommodate a smartmontools pre-7.3 bug cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:") json := parseJSON(cleaned_out) rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) jsonOk := jsonIsOk(logger, json) - logger.Debug("Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start)) + logger.Debug("Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start)) return json, rcOk && jsonOk } func readSMARTctlDevices(logger *slog.Logger) gjson.Result { logger.Debug("Scanning for devices") - out, err := exec.Command(*smartctlPath, "--json", "--scan").Output() + var scanArgs []string = []string{"--json", "--scan"} + for _, d := range *smartctlScanDeviceTypes { + scanArgs = append(scanArgs, "--device", d) + } + out, err := exec.Command(*smartctlPath, scanArgs...).Output() if exiterr, ok := err.(*exec.ExitError); ok { logger.Debug("Exit Status", "exit_code", exiterr.ExitCode()) // The smartctl command returns 2 if devices are sleeping, ignore this error. @@ -86,6 +93,9 @@ func readSMARTctlDevices(logger *slog.Logger) gjson.Result { logger.Warn("S.M.A.R.T. output reading error", "err", err) return gjson.Result{} } + } else if err != nil { + logger.Warn("S.M.A.R.T. output reading error", "err", err) + return gjson.Result{} } return parseJSON(string(out)) } @@ -103,7 +113,7 @@ func readData(logger *slog.Logger, device Device) gjson.Result { jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()}) j, found := jsonCache.Load(device) if !found { - logger.Warn("device not found", "device", device.Info_Name) + logger.Warn("device not found", "device", device) } return j.(JSONCache).JSON } @@ -118,30 +128,30 @@ func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bo if SMARTCtlResult > 0 { b := SMARTCtlResult if (b & 1) != 0 { - logger.Error("Command line did not parse", "device", device.Info_Name) + logger.Error("Command line did not parse", "device", device) result = false } if (b & (1 << 1)) != 0 { - logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name) + logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) result = false } if (b & (1 << 2)) != 0 { - logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name) + logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device) } if (b & (1 << 3)) != 0 { - logger.Warn("SMART status check returned 'DISK FAILING'", "device", device.Info_Name) + logger.Warn("SMART status check returned 'DISK FAILING'", "device", device) } if (b & (1 << 4)) != 0 { - logger.Warn("We found prefail Attributes <= threshold", "device", device.Info_Name) + logger.Warn("We found prefail Attributes <= threshold", "device", device) } if (b & (1 << 5)) != 0 { - logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name) + logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device) } if (b & (1 << 6)) != 0 { - logger.Warn("The device error log contains records of errors", "device", device.Info_Name) + logger.Warn("The device error log contains records of errors", "device", device) } if (b & (1 << 7)) != 0 { - logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name) + logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device) } } return result diff --git a/smartctl.go b/smartctl.go index 472cd01..eb94130 100644 --- a/smartctl.go +++ b/smartctl.go @@ -42,28 +42,16 @@ type SMARTctl struct { device SMARTDevice } -func extractDiskName(input string) string { - re := regexp.MustCompile(`^(?:/dev/(?P\S+)/(?P\S+)\s\[|/dev/|\[)(?:\s\[|)(?P[a-z0-9_]+)(?:\].*|)$`) - match := re.FindStringSubmatch(input) +func buildDeviceLabel(inputName string, inputType string) string { + // Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd) + devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`) + deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_") - if len(match) > 0 { - busNameIndex := re.SubexpIndex("bus_name") - busNumIndex := re.SubexpIndex("bus_num") - diskIndex := re.SubexpIndex("disk") - var name []string - if busNameIndex != -1 && match[busNameIndex] != "" { - name = append(name, match[busNameIndex]) - } - if busNumIndex != -1 && match[busNumIndex] != "" { - name = append(name, match[busNumIndex]) - } - if diskIndex != -1 && match[diskIndex] != "" { - name = append(name, match[diskIndex]) - } - - return strings.Join(name, "_") + if strings.Contains(inputType, ",") { + return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_") } - return "" + + return deviceName } // NewSMARTctl is smartctl constructor @@ -84,7 +72,7 @@ func NewSMARTctl(logger *slog.Logger, json gjson.Result, ch chan<- prometheus.Me json: json, logger: logger, device: SMARTDevice{ - device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())), + device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()), serial: strings.TrimSpace(json.Get("serial_number").String()), family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), model: strings.TrimSpace(model_name), diff --git a/smartctl_test.go b/smartctl_test.go new file mode 100644 index 0000000..8c9836c --- /dev/null +++ b/smartctl_test.go @@ -0,0 +1,44 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "testing" +) + +func TestBuildDeviceLabel(t *testing.T) { + tests := []struct { + deviceName string + deviceType string + expectedLabel string + }{ + {"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"}, + {"/dev/sda", "auto", "sda"}, + {"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"}, + // Some cases extracted from smartctl docs. Are these the prettiest? + // Probably not. Are they unique enough. Definitely. + {"/dev/sg1", "cciss,1", "sg1_cciss_1"}, + {"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"}, + {"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"}, + {"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"}, + {"/dev/twl0", "3ware,1", "twl0_3ware_1"}, + } + + for _, test := range tests { + result := buildDeviceLabel(test.deviceName, test.deviceType) + if result != test.expectedLabel { + t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result) + } + } +}