2022-08-05 03:37:13 +02:00
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2019-08-14 22:34:49 +02:00
package main
import (
"fmt"
2024-10-18 09:18:48 +02:00
"log/slog"
2023-04-12 09:35:03 +02:00
"os"
2019-08-14 22:34:49 +02:00
"os/exec"
"strings"
2022-11-05 13:45:48 +01:00
"sync"
2019-08-15 23:01:16 +02:00
"time"
2019-08-14 22:34:49 +02:00
"github.com/tidwall/gjson"
)
2019-08-15 23:01:16 +02:00
// JSONCache is a single cache entry: a parsed smartctl JSON document together
// with the time at which it was stored.
type JSONCache struct {
	// JSON is the parsed smartctl output kept for reuse.
	JSON gjson.Result
	// LastCollect is the time the entry was stored; readData compares it
	// against the configured interval to decide whether to re-read.
	LastCollect time.Time
}
// jsonCache holds JSONCache values. readData stores entries keyed by Device,
// and init stores an empty entry under the "" key.
var jsonCache sync.Map
func init ( ) {
2022-11-05 13:45:48 +01:00
jsonCache . Store ( "" , JSONCache { } )
2019-08-15 23:01:16 +02:00
}
2019-08-14 22:34:49 +02:00
// Parse json to gjson object
2019-08-15 23:01:16 +02:00
func parseJSON ( data string ) gjson . Result {
2019-08-14 22:34:49 +02:00
if ! gjson . Valid ( data ) {
2019-08-15 23:01:16 +02:00
return gjson . Parse ( "{}" )
2019-08-14 22:34:49 +02:00
}
2019-08-15 23:01:16 +02:00
return gjson . Parse ( data )
2019-08-14 22:34:49 +02:00
}
// Reading fake smartctl json
2024-10-18 09:18:48 +02:00
func readFakeSMARTctl ( logger * slog . Logger , device Device ) gjson . Result {
2024-03-08 15:39:33 +01:00
s := strings . Split ( device . Name , "/" )
2022-06-16 10:08:20 +02:00
filename := fmt . Sprintf ( "debug/%s.json" , s [ len ( s ) - 1 ] )
2024-10-18 09:18:48 +02:00
logger . Debug ( "Read fake S.M.A.R.T. data from json" , "filename" , filename )
2023-04-12 09:35:03 +02:00
jsonFile , err := os . ReadFile ( filename )
2019-08-14 22:34:49 +02:00
if err != nil {
2024-10-18 09:18:48 +02:00
logger . Error ( "Fake S.M.A.R.T. data reading error" , "err" , err )
2019-08-14 22:34:49 +02:00
return parseJSON ( "{}" )
}
return parseJSON ( string ( jsonFile ) )
}
// Get json from smartctl and parse it
2024-10-18 09:18:48 +02:00
func readSMARTctl ( logger * slog . Logger , device Device ) ( gjson . Result , bool ) {
2024-01-25 17:52:05 +01:00
start := time . Now ( )
2024-05-05 16:29:02 +02:00
out , err := exec . Command ( * smartctlPath , "--json" , "--info" , "--health" , "--attributes" , "--tolerance=verypermissive" , "--nocheck=standby" , "--format=brief" , "--log=error" , "--device=" + device . Type , device . Name ) . Output ( )
2019-08-14 22:34:49 +02:00
if err != nil {
2024-10-18 09:18:48 +02:00
logger . Warn ( "S.M.A.R.T. output reading" , "err" , err , "device" , device . Info_Name )
2019-08-14 22:34:49 +02:00
}
2024-11-08 09:02:29 +01:00
// Accommodate a smartmontools pre-7.3 bug
cleaned_out := strings . TrimPrefix ( string ( out ) , " Pending defect count:" )
json := parseJSON ( cleaned_out )
2023-08-08 11:38:12 +02:00
rcOk := resultCodeIsOk ( logger , device , json . Get ( "smartctl.exit_status" ) . Int ( ) )
2022-10-03 11:16:00 +02:00
jsonOk := jsonIsOk ( logger , json )
2024-10-18 09:18:48 +02:00
logger . Debug ( "Collected S.M.A.R.T. json data" , "device" , device . Info_Name , "duration" , time . Since ( start ) )
2020-10-29 22:35:49 +01:00
return json , rcOk && jsonOk
2019-08-14 22:34:49 +02:00
}
2024-10-18 09:18:48 +02:00
func readSMARTctlDevices ( logger * slog . Logger ) gjson . Result {
logger . Debug ( "Scanning for devices" )
2022-10-03 14:49:32 +02:00
out , err := exec . Command ( * smartctlPath , "--json" , "--scan" ) . Output ( )
if exiterr , ok := err . ( * exec . ExitError ) ; ok {
2024-10-18 09:18:48 +02:00
logger . Debug ( "Exit Status" , "exit_code" , exiterr . ExitCode ( ) )
2022-10-03 14:49:32 +02:00
// The smartctl command returns 2 if devices are sleeping, ignore this error.
if exiterr . ExitCode ( ) != 2 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "S.M.A.R.T. output reading error" , "err" , err )
2022-10-03 14:49:32 +02:00
return gjson . Result { }
}
2020-10-06 13:05:00 +02:00
}
return parseJSON ( string ( out ) )
}
2019-08-14 22:34:49 +02:00
// Select json source and parse
2024-10-18 09:18:48 +02:00
func readData ( logger * slog . Logger , device Device ) gjson . Result {
2022-10-03 11:16:00 +02:00
if * smartctlFakeData {
2022-11-04 19:42:36 +01:00
return readFakeSMARTctl ( logger , device )
2019-08-14 22:34:49 +02:00
}
2019-08-15 23:01:16 +02:00
2022-11-05 13:45:48 +01:00
cacheValue , cacheOk := jsonCache . Load ( device )
if ! cacheOk || time . Now ( ) . After ( cacheValue . ( JSONCache ) . LastCollect . Add ( * smartctlInterval ) ) {
2022-10-15 05:39:36 +02:00
json , ok := readSMARTctl ( logger , device )
if ok {
2022-11-05 13:45:48 +01:00
jsonCache . Store ( device , JSONCache { JSON : json , LastCollect : time . Now ( ) } )
j , found := jsonCache . Load ( device )
if ! found {
2024-10-18 09:18:48 +02:00
logger . Warn ( "device not found" , "device" , device . Info_Name )
2022-11-05 13:45:48 +01:00
}
2022-11-04 19:42:36 +01:00
return j . ( JSONCache ) . JSON
2020-10-29 22:35:49 +01:00
}
2022-11-04 19:42:36 +01:00
return gjson . Result { }
2020-10-29 22:35:49 +01:00
}
2022-11-04 19:42:36 +01:00
return cacheValue . ( JSONCache ) . JSON
2020-10-29 22:35:49 +01:00
}
// Parse smartctl return code
2024-10-18 09:18:48 +02:00
func resultCodeIsOk ( logger * slog . Logger , device Device , SMARTCtlResult int64 ) bool {
2020-10-29 22:35:49 +01:00
result := true
if SMARTCtlResult > 0 {
2022-03-17 23:52:15 +01:00
b := SMARTCtlResult
if ( b & 1 ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Error ( "Command line did not parse" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
result = false
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 1 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Error ( "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
result = false
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 2 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 3 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "SMART status check returned 'DISK FAILING'" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 4 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "We found prefail Attributes <= threshold" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 5 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 6 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "The device error log contains records of errors" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 7 ) ) != 0 {
2024-10-18 09:18:48 +02:00
logger . Warn ( "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored" , "device" , device . Info_Name )
2020-10-29 22:35:49 +01:00
}
}
return result
}
// Check json
2024-10-18 09:18:48 +02:00
func jsonIsOk ( logger * slog . Logger , json gjson . Result ) bool {
2020-10-29 22:35:49 +01:00
messages := json . Get ( "smartctl.messages" )
// logger.Debug(messages.String())
if messages . Exists ( ) {
for _ , message := range messages . Array ( ) {
if message . Get ( "severity" ) . String ( ) == "error" {
2024-11-28 08:42:49 +01:00
// if the string contains "GetLogPage failed", then ignore it
// this is a known issue with Apple internal SSDs
if strings . Contains ( message . Get ( "string" ) . String ( ) , "GetLogPage failed" ) {
logger . Warn ( "Ignoring GetLogPage failed error" , "device" , json . Get ( "device.name" ) . String ( ) , "message" , message . Get ( "string" ) . String ( ) )
continue
}
2024-10-18 09:18:48 +02:00
logger . Error ( message . Get ( "string" ) . String ( ) )
2020-10-29 22:35:49 +01:00
return false
}
2019-08-15 23:01:16 +02:00
}
}
2020-10-29 22:35:49 +01:00
return true
2019-08-14 22:34:49 +02:00
}