mirror of
https://github.com/prometheus-community/smartctl_exporter.git
synced 2024-11-23 01:43:07 +01:00
Implemented a new feature: extracting the RAID member disk name.
Modified the smartctl.device parameter: it can now be set to short names such as sda, megaraid_disk_01, etc. Signed-off-by: Denys <zxzharmlesszxz@gmail.com>
This commit is contained in:
parent
84d8cc3d4d
commit
3a012b5bb1
4 changed files with 69 additions and 30 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -3,6 +3,7 @@
|
|||
/.release
|
||||
/.tarballs
|
||||
debug/
|
||||
.idea/
|
||||
|
||||
Manifest
|
||||
smartctl_exporter
|
||||
|
|
49
main.go
49
main.go
|
@ -16,6 +16,7 @@ package main
|
|||
import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
@ -32,11 +33,18 @@ import (
|
|||
webflag "github.com/prometheus/exporter-toolkit/web/kingpinflag"
|
||||
)
|
||||
|
||||
// Device describes one device entry as assembled by scanDevices from the
// "devices" array of the smartctl scan JSON.
type Device struct {
	// Name is the raw "name" value from the scan output; it is the device
	// operand handed to smartctl when the device is queried.
	Name string `json:"name"`
	// Info_Name is the short disk name that extractDiskName derives from the
	// scan output's "info_name" value; it is used for filtering and logging.
	Info_Name string `json:"info_name"`
	// Type is the raw "type" value from the scan output; it is passed to
	// smartctl as the device type when the device is queried.
	Type string `json:"type"`
}
|
||||
|
||||
// SMARTctlManagerCollector implements the Collector interface.
|
||||
type SMARTctlManagerCollector struct {
|
||||
CollectPeriod string
|
||||
CollectPeriodDuration time.Duration
|
||||
Devices []string
|
||||
Devices []Device
|
||||
|
||||
logger log.Logger
|
||||
mutex sync.Mutex
|
||||
|
@ -106,24 +114,43 @@ var (
|
|||
)
|
||||
|
||||
// scanDevices uses smartctl to gather the list of available devices.
|
||||
func scanDevices(logger log.Logger) []string {
|
||||
func scanDevices(logger log.Logger) []Device {
|
||||
filter := newDeviceFilter(*smartctlDeviceExclude, *smartctlDeviceInclude)
|
||||
|
||||
json := readSMARTctlDevices(logger)
|
||||
scanDevices := json.Get("devices").Array()
|
||||
var scanDeviceResult []string
|
||||
var scanDeviceResult []Device
|
||||
for _, d := range scanDevices {
|
||||
deviceName := d.Get("name").String()
|
||||
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String()))
|
||||
if filter.ignored(deviceName) {
|
||||
level.Info(logger).Log("msg", "Ignoring device", "name", deviceName)
|
||||
} else {
|
||||
level.Info(logger).Log("msg", "Found device", "name", deviceName)
|
||||
scanDeviceResult = append(scanDeviceResult, deviceName)
|
||||
device := Device{
|
||||
Name: d.Get("name").String(),
|
||||
Info_Name: deviceName,
|
||||
Type: d.Get("type").String(),
|
||||
}
|
||||
scanDeviceResult = append(scanDeviceResult, device)
|
||||
}
|
||||
}
|
||||
return scanDeviceResult
|
||||
}
|
||||
|
||||
func filterDevices(logger log.Logger, devices []Device, filters []string) []Device {
|
||||
var filtered []Device
|
||||
for _, d := range devices {
|
||||
for _, filter := range filters {
|
||||
level.Debug(logger).Log("msg", "filterDevices", "device", d.Info_Name, "filter", filter)
|
||||
if strings.Contains(d.Info_Name, filter) {
|
||||
filtered = append(filtered, d)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func main() {
|
||||
metricsPath := kingpin.Flag(
|
||||
"web.telemetry-path", "Path under which to expose metrics",
|
||||
|
@ -140,13 +167,13 @@ func main() {
|
|||
level.Info(logger).Log("msg", "Starting smartctl_exporter", "version", version.Info())
|
||||
level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext())
|
||||
|
||||
var devices []string
|
||||
if len(*smartctlDevices) > 0 {
|
||||
devices = *smartctlDevices
|
||||
} else {
|
||||
level.Info(logger).Log("msg", "No devices specified, trying to load them automatically")
|
||||
var devices []Device
|
||||
devices = scanDevices(logger)
|
||||
level.Info(logger).Log("msg", "Number of devices found", "count", len(devices))
|
||||
if len(*smartctlDevices) > 0 {
|
||||
level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
|
||||
devices = filterDevices(logger, devices, *smartctlDevices)
|
||||
level.Info(logger).Log("msg", "Devices filtered", "count", len(devices))
|
||||
}
|
||||
|
||||
collector := SMARTctlManagerCollector{
|
||||
|
@ -154,7 +181,7 @@ func main() {
|
|||
logger: logger,
|
||||
}
|
||||
|
||||
if *smartctlRescanInterval >= 1*time.Second && len(*smartctlDevices) == 0 {
|
||||
if *smartctlRescanInterval >= 1*time.Second {
|
||||
level.Info(logger).Log("msg", "Start background scan process")
|
||||
level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
|
||||
go collector.RescanForDevices()
|
||||
|
|
34
readjson.go
34
readjson.go
|
@ -49,8 +49,8 @@ func parseJSON(data string) gjson.Result {
|
|||
}
|
||||
|
||||
// Reading fake smartctl json
|
||||
func readFakeSMARTctl(logger log.Logger, device string) gjson.Result {
|
||||
s := strings.Split(device, "/")
|
||||
func readFakeSMARTctl(logger log.Logger, device Device) gjson.Result {
|
||||
s := strings.Split(device.Name, "/")
|
||||
filename := fmt.Sprintf("debug/%s.json", s[len(s)-1])
|
||||
level.Debug(logger).Log("msg", "Read fake S.M.A.R.T. data from json", "filename", filename)
|
||||
jsonFile, err := os.ReadFile(filename)
|
||||
|
@ -62,16 +62,16 @@ func readFakeSMARTctl(logger log.Logger, device string) gjson.Result {
|
|||
}
|
||||
|
||||
// Get json from smartctl and parse it
|
||||
func readSMARTctl(logger log.Logger, device string) (gjson.Result, bool) {
|
||||
func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) {
|
||||
start := time.Now()
|
||||
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", device).Output()
|
||||
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", device.Name, "-d", device.Type).Output()
|
||||
if err != nil {
|
||||
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device)
|
||||
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device.Info_Name)
|
||||
}
|
||||
json := parseJSON(string(out))
|
||||
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
|
||||
jsonOk := jsonIsOk(logger, json)
|
||||
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
|
||||
level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start))
|
||||
return json, rcOk && jsonOk
|
||||
}
|
||||
|
||||
|
@ -90,7 +90,7 @@ func readSMARTctlDevices(logger log.Logger) gjson.Result {
|
|||
}
|
||||
|
||||
// Select json source and parse
|
||||
func readData(logger log.Logger, device string) gjson.Result {
|
||||
func readData(logger log.Logger, device Device) gjson.Result {
|
||||
if *smartctlFakeData {
|
||||
return readFakeSMARTctl(logger, device)
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ func readData(logger log.Logger, device string) gjson.Result {
|
|||
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
|
||||
j, found := jsonCache.Load(device)
|
||||
if !found {
|
||||
level.Warn(logger).Log("msg", "device not found", "device", device)
|
||||
level.Warn(logger).Log("msg", "device not found", "device", device.Info_Name)
|
||||
}
|
||||
return j.(JSONCache).JSON
|
||||
}
|
||||
|
@ -112,35 +112,35 @@ func readData(logger log.Logger, device string) gjson.Result {
|
|||
}
|
||||
|
||||
// Parse smartctl return code
|
||||
func resultCodeIsOk(logger log.Logger, device string, SMARTCtlResult int64) bool {
|
||||
func resultCodeIsOk(logger log.Logger, device Device, SMARTCtlResult int64) bool {
|
||||
result := true
|
||||
if SMARTCtlResult > 0 {
|
||||
b := SMARTCtlResult
|
||||
if (b & 1) != 0 {
|
||||
level.Error(logger).Log("msg", "Command line did not parse", "device", device)
|
||||
level.Error(logger).Log("msg", "Command line did not parse", "device", device.Info_Name)
|
||||
result = false
|
||||
}
|
||||
if (b & (1 << 1)) != 0 {
|
||||
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
|
||||
level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name)
|
||||
result = false
|
||||
}
|
||||
if (b & (1 << 2)) != 0 {
|
||||
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
|
||||
level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name)
|
||||
}
|
||||
if (b & (1 << 3)) != 0 {
|
||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device)
|
||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device.Info_Name)
|
||||
}
|
||||
if (b & (1 << 4)) != 0 {
|
||||
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device)
|
||||
level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device.Info_Name)
|
||||
}
|
||||
if (b & (1 << 5)) != 0 {
|
||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
|
||||
level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name)
|
||||
}
|
||||
if (b & (1 << 6)) != 0 {
|
||||
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device)
|
||||
level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device.Info_Name)
|
||||
}
|
||||
if (b & (1 << 7)) != 0 {
|
||||
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
|
||||
level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name)
|
||||
}
|
||||
}
|
||||
return result
|
||||
|
|
13
smartctl.go
13
smartctl.go
|
@ -15,6 +15,7 @@ package main
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/go-kit/log"
|
||||
|
@ -42,6 +43,16 @@ type SMARTctl struct {
|
|||
device SMARTDevice
|
||||
}
|
||||
|
||||
// diskNameRe recognises the device descriptions handled by extractDiskName.
// It is compiled once at package initialisation instead of on every call.
var diskNameRe = regexp.MustCompile(`^(?:/dev/\S+/\S+\s\[|/dev/|\[)(?:\s\[|)(?P<disk>[a-z0-9_]+)(?:\].*|)$`)

// diskNameGroup is the index of the named "disk" capture group in diskNameRe.
var diskNameGroup = diskNameRe.SubexpIndex("disk")

// extractDiskName returns the short disk name contained in a smartctl device
// description, or "" when the input does not match any supported form.
//
// Supported forms, with the name restricted to lowercase letters, digits and
// underscores:
//
//	/dev/sda                          -> "sda"
//	[sda]                             -> "sda"
//	/dev/bus/0 [megaraid_disk_01]     -> "megaraid_disk_01"
//
// Anything following the closing bracket is ignored.
//
// NOTE(review): a plain device path followed by a bracketed suffix, such as
// "/dev/sda [SAT]", does not match the pattern and yields "". Confirm whether
// callers can receive that form.
func extractDiskName(input string) string {
	match := diskNameRe.FindStringSubmatch(input)
	if match == nil {
		return ""
	}
	return match[diskNameGroup]
}
|
||||
|
||||
// NewSMARTctl is smartctl constructor
|
||||
func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metric) SMARTctl {
|
||||
var model_name string
|
||||
|
@ -60,7 +71,7 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
|
|||
json: json,
|
||||
logger: logger,
|
||||
device: SMARTDevice{
|
||||
device: strings.TrimPrefix(strings.TrimSpace(json.Get("device.name").String()), "/dev/"),
|
||||
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())),
|
||||
serial: strings.TrimSpace(json.Get("serial_number").String()),
|
||||
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
|
||||
model: strings.TrimSpace(model_name),
|
||||
|
|
Loading…
Reference in a new issue