Rework device label, fix SATA discovery, per-device type specification

Signed-off-by: Piotr Dobrowolski <admin@tastycode.pl>
This commit is contained in:
Piotr Dobrowolski 2024-07-14 23:29:05 +02:00
parent 4c5f721e11
commit 319184ce66
4 changed files with 121 additions and 61 deletions

75
main.go
View File

@ -35,9 +35,13 @@ import (
// Device // Device
type Device struct { type Device struct {
Name string `json:"name"` Name string
Info_Name string `json:"info_name"` Type string
Type string `json:"type"` Label string
}
func (d Device) String() string {
return d.Name + ";" + d.Type + " (" + d.Label + ")"
} }
// SMARTctlManagerCollector implements the Collector interface. // SMARTctlManagerCollector implements the Collector interface.
@ -81,6 +85,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() {
time.Sleep(*smartctlRescanInterval) time.Sleep(*smartctlRescanInterval)
level.Info(i.logger).Log("msg", "Rescanning for devices") level.Info(i.logger).Log("msg", "Rescanning for devices")
devices := scanDevices(i.logger) devices := scanDevices(i.logger)
devices = buildDevicesFromFlag(devices)
i.mutex.Lock() i.mutex.Lock()
i.Devices = devices i.Devices = devices
i.mutex.Unlock() i.mutex.Unlock()
@ -97,8 +102,9 @@ var (
smartctlRescanInterval = kingpin.Flag("smartctl.rescan", smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.", "The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
).Default("10m").Duration() ).Default("10m").Duration()
smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool()
smartctlDevices = kingpin.Flag("smartctl.device", smartctlDevices = kingpin.Flag("smartctl.device",
"The device to monitor (repeatable)", "The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)",
).Strings() ).Strings()
smartctlDeviceExclude = kingpin.Flag( smartctlDeviceExclude = kingpin.Flag(
"smartctl.device-exclude", "smartctl.device-exclude",
@ -108,8 +114,8 @@ var (
"smartctl.device-include", "smartctl.device-include",
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)", "Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
).Default("").String() ).Default("").String()
smartctlDeviceTypes = kingpin.Flag( smartctlScanDeviceTypes = kingpin.Flag(
"smartctl.device-type", "smartctl.scan-device-type",
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)", "Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
).Strings() ).Strings()
smartctlFakeData = kingpin.Flag("smartctl.fake-data", smartctlFakeData = kingpin.Flag("smartctl.fake-data",
@ -125,15 +131,23 @@ func scanDevices(logger log.Logger) []Device {
scanDevices := json.Get("devices").Array() scanDevices := json.Get("devices").Array()
var scanDeviceResult []Device var scanDeviceResult []Device
for _, d := range scanDevices { for _, d := range scanDevices {
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String())) deviceName := d.Get("name").String()
if filter.ignored(deviceName) { deviceType := d.Get("type").String()
level.Info(logger).Log("msg", "Ignoring device", "name", deviceName)
// SATA devices are reported as SCSI during scan - fallback to auto scraping
if deviceType == "scsi" {
deviceType = "auto"
}
deviceLabel := buildDeviceLabel(deviceName, deviceType)
if filter.ignored(deviceLabel) {
level.Info(logger).Log("msg", "Ignoring device", "name", deviceLabel)
} else { } else {
level.Info(logger).Log("msg", "Found device", "name", deviceName) level.Info(logger).Log("msg", "Found device", "name", deviceLabel)
device := Device{ device := Device{
Name: d.Get("name").String(), Name: deviceName,
Info_Name: deviceName, Type: deviceType,
Type: d.Get("type").String(), Label: deviceLabel,
} }
scanDeviceResult = append(scanDeviceResult, device) scanDeviceResult = append(scanDeviceResult, device)
} }
@ -141,18 +155,21 @@ func scanDevices(logger log.Logger) []Device {
return scanDeviceResult return scanDeviceResult
} }
func filterDevices(logger log.Logger, devices []Device, filters []string) []Device { func buildDevicesFromFlag(devices []Device) []Device {
var filtered []Device // TODO: deduplication?
for _, d := range devices { for _, device := range *smartctlDevices {
for _, filter := range filters { deviceName, deviceType, _ := strings.Cut(device, ";")
level.Debug(logger).Log("msg", "filterDevices", "device", d.Info_Name, "filter", filter) if deviceType == "" {
if strings.Contains(d.Info_Name, filter) { deviceType = "auto"
filtered = append(filtered, d)
break
} }
devices = append(devices, Device{
Name: deviceName,
Type: deviceType,
Label: buildDeviceLabel(deviceName, deviceType),
})
} }
} return devices
return filtered
} }
func main() { func main() {
@ -172,11 +189,19 @@ func main() {
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext()) level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())
var devices []Device var devices []Device
if len(*smartctlDevices) == 0 {
*smartctlScan = true
}
if *smartctlScan {
devices = scanDevices(logger) devices = scanDevices(logger)
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices)) level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))
}
if len(*smartctlDevices) > 0 { if len(*smartctlDevices) > 0 {
level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", ")) level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
devices = filterDevices(logger, devices, *smartctlDevices) devices = buildDevicesFromFlag(devices)
level.Info(logger).Log("msg", "Devices filtered", "count", len(devices)) level.Info(logger).Log("msg", "Devices filtered", "count", len(devices))
} }
@ -185,7 +210,7 @@ func main() {
logger: logger, logger: logger,
} }
if *smartctlRescanInterval >= 1*time.Second { if *smartctlScan && *smartctlRescanInterval >= 1*time.Second {
level.Info(logger).Log("msg", "Start background scan process") level.Info(logger).Log("msg", "Start background scan process")
level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval) level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
go collector.RescanForDevices() go collector.RescanForDevices()

View File

@ -64,21 +64,24 @@ func readFakeSMARTctl(logger log.Logger, device Device) gjson.Result {
// Get json from smartctl and parse it // Get json from smartctl and parse it
func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) { func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) {
start := time.Now() start := time.Now()
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output() var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name}
level.Debug(logger).Log("msg", "Calling smartctl with args", "args", strings.Join(smartctlArgs, " "))
out, err := exec.Command(*smartctlPath, smartctlArgs...).Output()
if err != nil { if err != nil {
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device.Info_Name) level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device)
} }
json := parseJSON(string(out)) json := parseJSON(string(out))
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
jsonOk := jsonIsOk(logger, json) jsonOk := jsonIsOk(logger, json)
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start)) level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
return json, rcOk && jsonOk return json, rcOk && jsonOk
} }
func readSMARTctlDevices(logger log.Logger) gjson.Result { func readSMARTctlDevices(logger log.Logger) gjson.Result {
level.Debug(logger).Log("msg", "Scanning for devices") level.Debug(logger).Log("msg", "Scanning for devices")
var scanArgs []string = []string{"--json", "--scan"} var scanArgs []string = []string{"--json", "--scan"}
for _, d := range *smartctlDeviceTypes { for _, d := range *smartctlScanDeviceTypes {
scanArgs = append(scanArgs, "--device", d) scanArgs = append(scanArgs, "--device", d)
} }
out, err := exec.Command(*smartctlPath, scanArgs...).Output() out, err := exec.Command(*smartctlPath, scanArgs...).Output()
@ -109,7 +112,7 @@ func readData(logger log.Logger, device Device) gjson.Result {
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()}) jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
j, found := jsonCache.Load(device) j, found := jsonCache.Load(device)
if !found { if !found {
level.Warn(logger).Log("msg", "device not found", "device", device.Info_Name) level.Warn(logger).Log("msg", "device not found", "device", device)
} }
return j.(JSONCache).JSON return j.(JSONCache).JSON
} }
@ -124,30 +127,30 @@ func resultCodeIsOk(logger log.Logger, device Device, SMARTCtlResult int64) bool
if SMARTCtlResult > 0 { if SMARTCtlResult > 0 {
b := SMARTCtlResult b := SMARTCtlResult
if (b & 1) != 0 { if (b & 1) != 0 {
level.Error(logger).Log("msg", "Command line did not parse", "device", device.Info_Name) level.Error(logger).Log("msg", "Command line did not parse", "device", device)
result = false result = false
} }
if (b & (1 << 1)) != 0 { if (b & (1 << 1)) != 0 {
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name) level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
result = false result = false
} }
if (b & (1 << 2)) != 0 { if (b & (1 << 2)) != 0 {
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name) level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
} }
if (b & (1 << 3)) != 0 { if (b & (1 << 3)) != 0 {
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device.Info_Name) level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device)
} }
if (b & (1 << 4)) != 0 { if (b & (1 << 4)) != 0 {
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device.Info_Name) level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device)
} }
if (b & (1 << 5)) != 0 { if (b & (1 << 5)) != 0 {
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name) level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
} }
if (b & (1 << 6)) != 0 { if (b & (1 << 6)) != 0 {
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device.Info_Name) level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device)
} }
if (b & (1 << 7)) != 0 { if (b & (1 << 7)) != 0 {
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name) level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
} }
} }
return result return result

View File

@ -43,28 +43,16 @@ type SMARTctl struct {
device SMARTDevice device SMARTDevice
} }
func extractDiskName(input string) string { func buildDeviceLabel(inputName string, inputType string) string {
re := regexp.MustCompile(`^(?:/dev/(?P<bus_name>\S+)/(?P<bus_num>\S+)\s\[|/dev/(?:disk\/by-id\/|disk\/by-path\/|)|\[)(?:\s\[|)(?P<disk>[A-Za-z0-9_\-]+)(?:\].*|)$`) // Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd)
match := re.FindStringSubmatch(input) devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`)
deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_")
if len(match) > 0 { if strings.Contains(inputType, ",") {
busNameIndex := re.SubexpIndex("bus_name") return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_")
busNumIndex := re.SubexpIndex("bus_num")
diskIndex := re.SubexpIndex("disk")
var name []string
if busNameIndex != -1 && match[busNameIndex] != "" {
name = append(name, match[busNameIndex])
}
if busNumIndex != -1 && match[busNumIndex] != "" {
name = append(name, match[busNumIndex])
}
if diskIndex != -1 && match[diskIndex] != "" {
name = append(name, match[diskIndex])
} }
return strings.Join(name, "_") return deviceName
}
return ""
} }
// NewSMARTctl is smartctl constructor // NewSMARTctl is smartctl constructor
@ -85,7 +73,7 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
json: json, json: json,
logger: logger, logger: logger,
device: SMARTDevice{ device: SMARTDevice{
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())), device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()),
serial: strings.TrimSpace(json.Get("serial_number").String()), serial: strings.TrimSpace(json.Get("serial_number").String()),
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
model: strings.TrimSpace(model_name), model: strings.TrimSpace(model_name),

44
smartctl_test.go Normal file
View File

@ -0,0 +1,44 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"testing"
)
func TestBuildDeviceLabel(t *testing.T) {
tests := []struct {
deviceName string
deviceType string
expectedLabel string
}{
{"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"},
{"/dev/sda", "auto", "sda"},
{"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"},
// Some cases extracted from smartctl docs. Are these the prettiest?
// Probably not. Are they unique enough. Definitely.
{"/dev/sg1", "cciss,1", "sg1_cciss_1"},
{"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"},
{"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"},
{"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"},
{"/dev/twl0", "3ware,1", "twl0_3ware_1"},
}
for _, test := range tests {
result := buildDeviceLabel(test.deviceName, test.deviceType)
if result != test.expectedLabel {
t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result)
}
}
}