Add support for device types and predictable device paths (rebased) (#257)

* Add better error logging on smartctl exec failure

We will now log a warning if the smartctl path passed via the command line is invalid.

Signed-off-by: Piotr Dobrowolski <admin@tastycode.pl>
(cherry picked from commit 1c9c6943e8)

* Add support for autoscan device types and predictable device paths

This adds a new command-line option for customizing the autodetected
device types and enables a special "by-id" device type that forces the
use of predictable device paths (/dev/disk/by-id/...).

The relevant change to the device name parsing regular expression is
included, so predictable device paths are also usable when specified
directly.

Signed-off-by: Piotr Dobrowolski <admin@tastycode.pl>
(cherry picked from commit 4c5f721e11)

Conflicts:
  - file: 'readjson.go'
    comment: 'manually resolve new logger issues'

* Rework device label, fix SATA discovery, per-device type specification

Signed-off-by: Piotr Dobrowolski <admin@tastycode.pl>
(cherry picked from commit 319184ce66)

Conflicts:
  - file: 'main.go'
    comment: 'manually resolve new logger issues'
  - file: 'readjson.go'
    comment: 'manually resolve new logger issues'

---------

Co-authored-by: Piotr Dobrowolski <admin@tastycode.pl>
This commit is contained in:
Konstantin Shalygin 2024-12-19 13:02:09 +03:00 committed by GitHub
parent 703f9c8826
commit 7489f4f7aa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 130 additions and 59 deletions

79
main.go
View file

@ -34,9 +34,13 @@ import (
// Device // Device
type Device struct { type Device struct {
Name string `json:"name"` Name string
Info_Name string `json:"info_name"` Type string
Type string `json:"type"` Label string
}
func (d Device) String() string {
return d.Name + ";" + d.Type + " (" + d.Label + ")"
} }
// SMARTctlManagerCollector implements the Collector interface. // SMARTctlManagerCollector implements the Collector interface.
@ -80,6 +84,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() {
time.Sleep(*smartctlRescanInterval) time.Sleep(*smartctlRescanInterval)
i.logger.Info("Rescanning for devices") i.logger.Info("Rescanning for devices")
devices := scanDevices(i.logger) devices := scanDevices(i.logger)
devices = buildDevicesFromFlag(devices)
i.mutex.Lock() i.mutex.Lock()
i.Devices = devices i.Devices = devices
i.mutex.Unlock() i.mutex.Unlock()
@ -96,8 +101,9 @@ var (
smartctlRescanInterval = kingpin.Flag("smartctl.rescan", smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.", "The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
).Default("10m").Duration() ).Default("10m").Duration()
smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool()
smartctlDevices = kingpin.Flag("smartctl.device", smartctlDevices = kingpin.Flag("smartctl.device",
"The device to monitor (repeatable)", "The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)",
).Strings() ).Strings()
smartctlDeviceExclude = kingpin.Flag( smartctlDeviceExclude = kingpin.Flag(
"smartctl.device-exclude", "smartctl.device-exclude",
@ -107,6 +113,10 @@ var (
"smartctl.device-include", "smartctl.device-include",
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)", "Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
).Default("").String() ).Default("").String()
smartctlScanDeviceTypes = kingpin.Flag(
"smartctl.scan-device-type",
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
).Strings()
smartctlFakeData = kingpin.Flag("smartctl.fake-data", smartctlFakeData = kingpin.Flag("smartctl.fake-data",
"The device to monitor (repeatable)", "The device to monitor (repeatable)",
).Default("false").Hidden().Bool() ).Default("false").Hidden().Bool()
@ -120,15 +130,23 @@ func scanDevices(logger *slog.Logger) []Device {
scanDevices := json.Get("devices").Array() scanDevices := json.Get("devices").Array()
var scanDeviceResult []Device var scanDeviceResult []Device
for _, d := range scanDevices { for _, d := range scanDevices {
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String())) deviceName := d.Get("name").String()
if filter.ignored(deviceName) { deviceType := d.Get("type").String()
logger.Info("Ignoring device", "name", deviceName)
// SATA devices are reported as SCSI during scan - fallback to auto scraping
if deviceType == "scsi" {
deviceType = "auto"
}
deviceLabel := buildDeviceLabel(deviceName, deviceType)
if filter.ignored(deviceLabel) {
logger.Info("Ignoring device", "name", deviceLabel)
} else { } else {
logger.Info("Found device", "name", deviceName) logger.Info("Found device", "name", deviceLabel)
device := Device{ device := Device{
Name: d.Get("name").String(), Name: deviceName,
Info_Name: deviceName, Type: deviceType,
Type: d.Get("type").String(), Label: deviceLabel,
} }
scanDeviceResult = append(scanDeviceResult, device) scanDeviceResult = append(scanDeviceResult, device)
} }
@ -136,18 +154,21 @@ func scanDevices(logger *slog.Logger) []Device {
return scanDeviceResult return scanDeviceResult
} }
func filterDevices(logger *slog.Logger, devices []Device, filters []string) []Device { func buildDevicesFromFlag(devices []Device) []Device {
var filtered []Device // TODO: deduplication?
for _, d := range devices { for _, device := range *smartctlDevices {
for _, filter := range filters { deviceName, deviceType, _ := strings.Cut(device, ";")
logger.Debug("filterDevices", "device", d.Info_Name, "filter", filter) if deviceType == "" {
if strings.Contains(d.Info_Name, filter) { deviceType = "auto"
filtered = append(filtered, d)
break
}
} }
devices = append(devices, Device{
Name: deviceName,
Type: deviceType,
Label: buildDeviceLabel(deviceName, deviceType),
})
} }
return filtered return devices
} }
func main() { func main() {
@ -167,11 +188,19 @@ func main() {
logger.Info("Build context", "build_context", version.BuildContext()) logger.Info("Build context", "build_context", version.BuildContext())
var devices []Device var devices []Device
devices = scanDevices(logger)
logger.Info("Number of devices found", "count", len(devices)) if len(*smartctlDevices) == 0 {
*smartctlScan = true
}
if *smartctlScan {
devices = scanDevices(logger)
logger.Info("Number of devices found", "count", len(devices))
}
if len(*smartctlDevices) > 0 { if len(*smartctlDevices) > 0 {
logger.Info("Devices specified", "devices", strings.Join(*smartctlDevices, ", ")) logger.Info("Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
devices = filterDevices(logger, devices, *smartctlDevices) devices = buildDevicesFromFlag(devices)
logger.Info("Devices filtered", "count", len(devices)) logger.Info("Devices filtered", "count", len(devices))
} }
@ -180,7 +209,7 @@ func main() {
logger: logger, logger: logger,
} }
if *smartctlRescanInterval >= 1*time.Second { if *smartctlScan && *smartctlRescanInterval >= 1*time.Second {
logger.Info("Start background scan process") logger.Info("Start background scan process")
logger.Info("Rescanning for devices every", "rescanInterval", *smartctlRescanInterval) logger.Info("Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
go collector.RescanForDevices() go collector.RescanForDevices()

View file

@ -63,22 +63,29 @@ func readFakeSMARTctl(logger *slog.Logger, device Device) gjson.Result {
// Get json from smartctl and parse it // Get json from smartctl and parse it
func readSMARTctl(logger *slog.Logger, device Device) (gjson.Result, bool) { func readSMARTctl(logger *slog.Logger, device Device) (gjson.Result, bool) {
start := time.Now() start := time.Now()
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output() var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name}
logger.Debug("Calling smartctl with args", "args", strings.Join(smartctlArgs, " "))
out, err := exec.Command(*smartctlPath, smartctlArgs...).Output()
if err != nil { if err != nil {
logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device.Info_Name) logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device)
} }
// Accommodate a smartmontools pre-7.3 bug // Accommodate a smartmontools pre-7.3 bug
cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:") cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:")
json := parseJSON(cleaned_out) json := parseJSON(cleaned_out)
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
jsonOk := jsonIsOk(logger, json) jsonOk := jsonIsOk(logger, json)
logger.Debug("Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start)) logger.Debug("Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
return json, rcOk && jsonOk return json, rcOk && jsonOk
} }
func readSMARTctlDevices(logger *slog.Logger) gjson.Result { func readSMARTctlDevices(logger *slog.Logger) gjson.Result {
logger.Debug("Scanning for devices") logger.Debug("Scanning for devices")
out, err := exec.Command(*smartctlPath, "--json", "--scan").Output() var scanArgs []string = []string{"--json", "--scan"}
for _, d := range *smartctlScanDeviceTypes {
scanArgs = append(scanArgs, "--device", d)
}
out, err := exec.Command(*smartctlPath, scanArgs...).Output()
if exiterr, ok := err.(*exec.ExitError); ok { if exiterr, ok := err.(*exec.ExitError); ok {
logger.Debug("Exit Status", "exit_code", exiterr.ExitCode()) logger.Debug("Exit Status", "exit_code", exiterr.ExitCode())
// The smartctl command returns 2 if devices are sleeping, ignore this error. // The smartctl command returns 2 if devices are sleeping, ignore this error.
@ -86,6 +93,9 @@ func readSMARTctlDevices(logger *slog.Logger) gjson.Result {
logger.Warn("S.M.A.R.T. output reading error", "err", err) logger.Warn("S.M.A.R.T. output reading error", "err", err)
return gjson.Result{} return gjson.Result{}
} }
} else if err != nil {
logger.Warn("S.M.A.R.T. output reading error", "err", err)
return gjson.Result{}
} }
return parseJSON(string(out)) return parseJSON(string(out))
} }
@ -103,7 +113,7 @@ func readData(logger *slog.Logger, device Device) gjson.Result {
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()}) jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
j, found := jsonCache.Load(device) j, found := jsonCache.Load(device)
if !found { if !found {
logger.Warn("device not found", "device", device.Info_Name) logger.Warn("device not found", "device", device)
} }
return j.(JSONCache).JSON return j.(JSONCache).JSON
} }
@ -118,30 +128,30 @@ func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bo
if SMARTCtlResult > 0 { if SMARTCtlResult > 0 {
b := SMARTCtlResult b := SMARTCtlResult
if (b & 1) != 0 { if (b & 1) != 0 {
logger.Error("Command line did not parse", "device", device.Info_Name) logger.Error("Command line did not parse", "device", device)
result = false result = false
} }
if (b & (1 << 1)) != 0 { if (b & (1 << 1)) != 0 {
logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name) logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
result = false result = false
} }
if (b & (1 << 2)) != 0 { if (b & (1 << 2)) != 0 {
logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name) logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
} }
if (b & (1 << 3)) != 0 { if (b & (1 << 3)) != 0 {
logger.Warn("SMART status check returned 'DISK FAILING'", "device", device.Info_Name) logger.Warn("SMART status check returned 'DISK FAILING'", "device", device)
} }
if (b & (1 << 4)) != 0 { if (b & (1 << 4)) != 0 {
logger.Warn("We found prefail Attributes <= threshold", "device", device.Info_Name) logger.Warn("We found prefail Attributes <= threshold", "device", device)
} }
if (b & (1 << 5)) != 0 { if (b & (1 << 5)) != 0 {
logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name) logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
} }
if (b & (1 << 6)) != 0 { if (b & (1 << 6)) != 0 {
logger.Warn("The device error log contains records of errors", "device", device.Info_Name) logger.Warn("The device error log contains records of errors", "device", device)
} }
if (b & (1 << 7)) != 0 { if (b & (1 << 7)) != 0 {
logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name) logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
} }
} }
return result return result

View file

@ -42,28 +42,16 @@ type SMARTctl struct {
device SMARTDevice device SMARTDevice
} }
func extractDiskName(input string) string { func buildDeviceLabel(inputName string, inputType string) string {
re := regexp.MustCompile(`^(?:/dev/(?P<bus_name>\S+)/(?P<bus_num>\S+)\s\[|/dev/|\[)(?:\s\[|)(?P<disk>[a-z0-9_]+)(?:\].*|)$`) // Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd)
match := re.FindStringSubmatch(input) devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`)
deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_")
if len(match) > 0 { if strings.Contains(inputType, ",") {
busNameIndex := re.SubexpIndex("bus_name") return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_")
busNumIndex := re.SubexpIndex("bus_num")
diskIndex := re.SubexpIndex("disk")
var name []string
if busNameIndex != -1 && match[busNameIndex] != "" {
name = append(name, match[busNameIndex])
}
if busNumIndex != -1 && match[busNumIndex] != "" {
name = append(name, match[busNumIndex])
}
if diskIndex != -1 && match[diskIndex] != "" {
name = append(name, match[diskIndex])
}
return strings.Join(name, "_")
} }
return ""
return deviceName
} }
// NewSMARTctl is smartctl constructor // NewSMARTctl is smartctl constructor
@ -84,7 +72,7 @@ func NewSMARTctl(logger *slog.Logger, json gjson.Result, ch chan<- prometheus.Me
json: json, json: json,
logger: logger, logger: logger,
device: SMARTDevice{ device: SMARTDevice{
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())), device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()),
serial: strings.TrimSpace(json.Get("serial_number").String()), serial: strings.TrimSpace(json.Get("serial_number").String()),
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
model: strings.TrimSpace(model_name), model: strings.TrimSpace(model_name),

44
smartctl_test.go Normal file
View file

@ -0,0 +1,44 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"testing"
)
// TestBuildDeviceLabel feeds buildDeviceLabel a table of device name and
// device type pairs and reports every case whose returned label differs from
// the expected one.
func TestBuildDeviceLabel(t *testing.T) {
	type labelCase struct {
		name string // device path handed to buildDeviceLabel
		kind string // device type handed to buildDeviceLabel
		want string // label the call is expected to return
	}

	cases := []labelCase{
		{name: "/dev/bus/0", kind: "megaraid,1", want: "bus_0_megaraid_1"},
		{name: "/dev/sda", kind: "auto", want: "sda"},
		{name: "/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", kind: "auto", want: "ata-CT500MX500SSD1_ABCDEFGHIJ"},
		// Some cases extracted from smartctl docs. Are these the prettiest?
		// Probably not. Are they unique enough? Definitely.
		{name: "/dev/sg1", kind: "cciss,1", want: "sg1_cciss_1"},
		{name: "/dev/bsg/sssraid0", kind: "sssraid,0,1", want: "bsg_sssraid0_sssraid_0_1"},
		{name: "/dev/cciss/c0d0", kind: "cciss,0", want: "cciss_c0d0_cciss_0"},
		{name: "/dev/sdb", kind: "aacraid,1,0,4", want: "sdb_aacraid_1_0_4"},
		{name: "/dev/twl0", kind: "3ware,1", want: "twl0_3ware_1"},
	}

	for i := range cases {
		c := cases[i]
		// Errorf (not Fatalf) so that every mismatching case is reported in one run.
		if got := buildDeviceLabel(c.name, c.kind); got != c.want {
			t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", c.name, c.kind, c.want, got)
		}
	}
}