mirror of
https://github.com/prometheus-community/smartctl_exporter.git
synced 2024-11-16 01:33:07 +01:00
Rework device label, fix SATA discovery, per-device type specification
Signed-off-by: Piotr Dobrowolski <admin@tastycode.pl>
This commit is contained in:
parent
4c5f721e11
commit
319184ce66
4 changed files with 121 additions and 61 deletions
79
main.go
79
main.go
|
@ -35,9 +35,13 @@ import (
|
||||||
|
|
||||||
// Device identifies a single smartctl target.
//
// NOTE(review): Device values appear to be used directly as cache keys
// (jsonCache.Store(device, ...)), so the struct should stay comparable —
// confirm before adding slice or map fields.
type Device struct {
	// Name is the device path handed to smartctl as its positional
	// argument, e.g. "/dev/sda" or "/dev/bus/0".
	Name string
	// Type is the value passed to smartctl via --device=, e.g. "auto" or
	// "megaraid,1".
	Type string
	// Label is the short identifier produced by buildDeviceLabel. It is
	// what the include/exclude filter is matched against and what is
	// shown in log output.
	Label string
}

// String renders the device as "<name>;<type> (<label>)". The "<name>;<type>"
// part mirrors the syntax accepted by the smartctl.device flag.
func (d Device) String() string {
	return d.Name + ";" + d.Type + " (" + d.Label + ")"
}
|
||||||
|
|
||||||
// SMARTctlManagerCollector implements the Collector interface.
|
// SMARTctlManagerCollector implements the Collector interface.
|
||||||
|
@ -81,6 +85,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() {
|
||||||
time.Sleep(*smartctlRescanInterval)
|
time.Sleep(*smartctlRescanInterval)
|
||||||
level.Info(i.logger).Log("msg", "Rescanning for devices")
|
level.Info(i.logger).Log("msg", "Rescanning for devices")
|
||||||
devices := scanDevices(i.logger)
|
devices := scanDevices(i.logger)
|
||||||
|
devices = buildDevicesFromFlag(devices)
|
||||||
i.mutex.Lock()
|
i.mutex.Lock()
|
||||||
i.Devices = devices
|
i.Devices = devices
|
||||||
i.mutex.Unlock()
|
i.mutex.Unlock()
|
||||||
|
@ -97,8 +102,9 @@ var (
|
||||||
smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
|
smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
|
||||||
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
|
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
|
||||||
).Default("10m").Duration()
|
).Default("10m").Duration()
|
||||||
|
smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool()
|
||||||
smartctlDevices = kingpin.Flag("smartctl.device",
|
smartctlDevices = kingpin.Flag("smartctl.device",
|
||||||
"The device to monitor (repeatable)",
|
"The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)",
|
||||||
).Strings()
|
).Strings()
|
||||||
smartctlDeviceExclude = kingpin.Flag(
|
smartctlDeviceExclude = kingpin.Flag(
|
||||||
"smartctl.device-exclude",
|
"smartctl.device-exclude",
|
||||||
|
@ -108,8 +114,8 @@ var (
|
||||||
"smartctl.device-include",
|
"smartctl.device-include",
|
||||||
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
|
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
|
||||||
).Default("").String()
|
).Default("").String()
|
||||||
smartctlDeviceTypes = kingpin.Flag(
|
smartctlScanDeviceTypes = kingpin.Flag(
|
||||||
"smartctl.device-type",
|
"smartctl.scan-device-type",
|
||||||
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
|
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
|
||||||
).Strings()
|
).Strings()
|
||||||
smartctlFakeData = kingpin.Flag("smartctl.fake-data",
|
smartctlFakeData = kingpin.Flag("smartctl.fake-data",
|
||||||
|
@ -125,15 +131,23 @@ func scanDevices(logger log.Logger) []Device {
|
||||||
scanDevices := json.Get("devices").Array()
|
scanDevices := json.Get("devices").Array()
|
||||||
var scanDeviceResult []Device
|
var scanDeviceResult []Device
|
||||||
for _, d := range scanDevices {
|
for _, d := range scanDevices {
|
||||||
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String()))
|
deviceName := d.Get("name").String()
|
||||||
if filter.ignored(deviceName) {
|
deviceType := d.Get("type").String()
|
||||||
level.Info(logger).Log("msg", "Ignoring device", "name", deviceName)
|
|
||||||
|
// SATA devices are reported as SCSI during scan - fallback to auto scraping
|
||||||
|
if deviceType == "scsi" {
|
||||||
|
deviceType = "auto"
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceLabel := buildDeviceLabel(deviceName, deviceType)
|
||||||
|
if filter.ignored(deviceLabel) {
|
||||||
|
level.Info(logger).Log("msg", "Ignoring device", "name", deviceLabel)
|
||||||
} else {
|
} else {
|
||||||
level.Info(logger).Log("msg", "Found device", "name", deviceName)
|
level.Info(logger).Log("msg", "Found device", "name", deviceLabel)
|
||||||
device := Device{
|
device := Device{
|
||||||
Name: d.Get("name").String(),
|
Name: deviceName,
|
||||||
Info_Name: deviceName,
|
Type: deviceType,
|
||||||
Type: d.Get("type").String(),
|
Label: deviceLabel,
|
||||||
}
|
}
|
||||||
scanDeviceResult = append(scanDeviceResult, device)
|
scanDeviceResult = append(scanDeviceResult, device)
|
||||||
}
|
}
|
||||||
|
@ -141,18 +155,21 @@ func scanDevices(logger log.Logger) []Device {
|
||||||
return scanDeviceResult
|
return scanDeviceResult
|
||||||
}
|
}
|
||||||
|
|
||||||
func filterDevices(logger log.Logger, devices []Device, filters []string) []Device {
|
func buildDevicesFromFlag(devices []Device) []Device {
|
||||||
var filtered []Device
|
// TODO: deduplication?
|
||||||
for _, d := range devices {
|
for _, device := range *smartctlDevices {
|
||||||
for _, filter := range filters {
|
deviceName, deviceType, _ := strings.Cut(device, ";")
|
||||||
level.Debug(logger).Log("msg", "filterDevices", "device", d.Info_Name, "filter", filter)
|
if deviceType == "" {
|
||||||
if strings.Contains(d.Info_Name, filter) {
|
deviceType = "auto"
|
||||||
filtered = append(filtered, d)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
devices = append(devices, Device{
|
||||||
|
Name: deviceName,
|
||||||
|
Type: deviceType,
|
||||||
|
Label: buildDeviceLabel(deviceName, deviceType),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
return filtered
|
return devices
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -172,11 +189,19 @@ func main() {
|
||||||
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())
|
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())
|
||||||
|
|
||||||
var devices []Device
|
var devices []Device
|
||||||
devices = scanDevices(logger)
|
|
||||||
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))
|
if len(*smartctlDevices) == 0 {
|
||||||
|
*smartctlScan = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if *smartctlScan {
|
||||||
|
devices = scanDevices(logger)
|
||||||
|
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))
|
||||||
|
}
|
||||||
|
|
||||||
if len(*smartctlDevices) > 0 {
|
if len(*smartctlDevices) > 0 {
|
||||||
level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
|
level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
|
||||||
devices = filterDevices(logger, devices, *smartctlDevices)
|
devices = buildDevicesFromFlag(devices)
|
||||||
level.Info(logger).Log("msg", "Devices filtered", "count", len(devices))
|
level.Info(logger).Log("msg", "Devices filtered", "count", len(devices))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -185,7 +210,7 @@ func main() {
|
||||||
logger: logger,
|
logger: logger,
|
||||||
}
|
}
|
||||||
|
|
||||||
if *smartctlRescanInterval >= 1*time.Second {
|
if *smartctlScan && *smartctlRescanInterval >= 1*time.Second {
|
||||||
level.Info(logger).Log("msg", "Start background scan process")
|
level.Info(logger).Log("msg", "Start background scan process")
|
||||||
level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
|
level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
|
||||||
go collector.RescanForDevices()
|
go collector.RescanForDevices()
|
||||||
|
|
29
readjson.go
29
readjson.go
|
@ -64,21 +64,24 @@ func readFakeSMARTctl(logger log.Logger, device Device) gjson.Result {
|
||||||
// Get json from smartctl and parse it
|
// Get json from smartctl and parse it
|
||||||
func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) {
|
func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output()
|
var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name}
|
||||||
|
|
||||||
|
level.Debug(logger).Log("msg", "Calling smartctl with args", "args", strings.Join(smartctlArgs, " "))
|
||||||
|
out, err := exec.Command(*smartctlPath, smartctlArgs...).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device)
|
||||||
}
|
}
|
||||||
json := parseJSON(string(out))
|
json := parseJSON(string(out))
|
||||||
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
|
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
|
||||||
jsonOk := jsonIsOk(logger, json)
|
jsonOk := jsonIsOk(logger, json)
|
||||||
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start))
|
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
|
||||||
return json, rcOk && jsonOk
|
return json, rcOk && jsonOk
|
||||||
}
|
}
|
||||||
|
|
||||||
func readSMARTctlDevices(logger log.Logger) gjson.Result {
|
func readSMARTctlDevices(logger log.Logger) gjson.Result {
|
||||||
level.Debug(logger).Log("msg", "Scanning for devices")
|
level.Debug(logger).Log("msg", "Scanning for devices")
|
||||||
var scanArgs []string = []string{"--json", "--scan"}
|
var scanArgs []string = []string{"--json", "--scan"}
|
||||||
for _, d := range *smartctlDeviceTypes {
|
for _, d := range *smartctlScanDeviceTypes {
|
||||||
scanArgs = append(scanArgs, "--device", d)
|
scanArgs = append(scanArgs, "--device", d)
|
||||||
}
|
}
|
||||||
out, err := exec.Command(*smartctlPath, scanArgs...).Output()
|
out, err := exec.Command(*smartctlPath, scanArgs...).Output()
|
||||||
|
@ -109,7 +112,7 @@ func readData(logger log.Logger, device Device) gjson.Result {
|
||||||
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
|
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
|
||||||
j, found := jsonCache.Load(device)
|
j, found := jsonCache.Load(device)
|
||||||
if !found {
|
if !found {
|
||||||
level.Warn(logger).Log("msg", "device not found", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "device not found", "device", device)
|
||||||
}
|
}
|
||||||
return j.(JSONCache).JSON
|
return j.(JSONCache).JSON
|
||||||
}
|
}
|
||||||
|
@ -124,30 +127,30 @@ func resultCodeIsOk(logger log.Logger, device Device, SMARTCtlResult int64) bool
|
||||||
if SMARTCtlResult > 0 {
|
if SMARTCtlResult > 0 {
|
||||||
b := SMARTCtlResult
|
b := SMARTCtlResult
|
||||||
if (b & 1) != 0 {
|
if (b & 1) != 0 {
|
||||||
level.Error(logger).Log("msg", "Command line did not parse", "device", device.Info_Name)
|
level.Error(logger).Log("msg", "Command line did not parse", "device", device)
|
||||||
result = false
|
result = false
|
||||||
}
|
}
|
||||||
if (b & (1 << 1)) != 0 {
|
if (b & (1 << 1)) != 0 {
|
||||||
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name)
|
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
|
||||||
result = false
|
result = false
|
||||||
}
|
}
|
||||||
if (b & (1 << 2)) != 0 {
|
if (b & (1 << 2)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
|
||||||
}
|
}
|
||||||
if (b & (1 << 3)) != 0 {
|
if (b & (1 << 3)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device)
|
||||||
}
|
}
|
||||||
if (b & (1 << 4)) != 0 {
|
if (b & (1 << 4)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device)
|
||||||
}
|
}
|
||||||
if (b & (1 << 5)) != 0 {
|
if (b & (1 << 5)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
|
||||||
}
|
}
|
||||||
if (b & (1 << 6)) != 0 {
|
if (b & (1 << 6)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device)
|
||||||
}
|
}
|
||||||
if (b & (1 << 7)) != 0 {
|
if (b & (1 << 7)) != 0 {
|
||||||
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name)
|
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
30
smartctl.go
30
smartctl.go
|
@ -43,28 +43,16 @@ type SMARTctl struct {
|
||||||
device SMARTDevice
|
device SMARTDevice
|
||||||
}
|
}
|
||||||
|
|
||||||
// devicePathPrefixRe matches the leading "/dev/" of a device path together
// with an optional "disk/by-id/" or "disk/by-path/" component. It is compiled
// once at package scope because buildDeviceLabel is called for every scanned
// device and for every NewSMARTctl construction.
var devicePathPrefixRe = regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`)

// buildDeviceLabel derives a short label from a smartctl device name and
// device type.
//
// The "/dev/" prefix (plus "disk/by-id/" or "disk/by-path/", if present) is
// stripped from inputName and any remaining "/" is replaced with "_", so
// "/dev/bus/0" becomes "bus_0" and "/dev/disk/by-id/abcd" becomes "abcd".
//
// When inputType contains a comma (e.g. "megaraid,1"), several drives may
// share the same device path, so the type is appended with commas turned into
// underscores: ("/dev/bus/0", "megaraid,1") yields "bus_0_megaraid_1". Types
// without a comma (e.g. "auto") do not affect the label.
func buildDeviceLabel(inputName string, inputType string) string {
	deviceName := strings.ReplaceAll(devicePathPrefixRe.ReplaceAllString(inputName, ""), "/", "_")

	if strings.Contains(inputType, ",") {
		return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_")
	}

	return deviceName
}
|
||||||
|
|
||||||
// NewSMARTctl is smartctl constructor
|
// NewSMARTctl is smartctl constructor
|
||||||
|
@ -85,7 +73,7 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
|
||||||
json: json,
|
json: json,
|
||||||
logger: logger,
|
logger: logger,
|
||||||
device: SMARTDevice{
|
device: SMARTDevice{
|
||||||
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())),
|
device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()),
|
||||||
serial: strings.TrimSpace(json.Get("serial_number").String()),
|
serial: strings.TrimSpace(json.Get("serial_number").String()),
|
||||||
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
|
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
|
||||||
model: strings.TrimSpace(model_name),
|
model: strings.TrimSpace(model_name),
|
||||||
|
|
44
smartctl_test.go
Normal file
44
smartctl_test.go
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
// Copyright 2024 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBuildDeviceLabel(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
deviceName string
|
||||||
|
deviceType string
|
||||||
|
expectedLabel string
|
||||||
|
}{
|
||||||
|
{"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"},
|
||||||
|
{"/dev/sda", "auto", "sda"},
|
||||||
|
{"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"},
|
||||||
|
// Some cases extracted from smartctl docs. Are these the prettiest?
|
||||||
|
// Probably not. Are they unique enough. Definitely.
|
||||||
|
{"/dev/sg1", "cciss,1", "sg1_cciss_1"},
|
||||||
|
{"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"},
|
||||||
|
{"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"},
|
||||||
|
{"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"},
|
||||||
|
{"/dev/twl0", "3ware,1", "twl0_3ware_1"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
result := buildDeviceLabel(test.deviceName, test.deviceType)
|
||||||
|
if result != test.expectedLabel {
|
||||||
|
t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue