2022-08-05 03:37:13 +02:00
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2019-08-14 22:34:49 +02:00
package main
import (
"fmt"
"io/ioutil"
2020-10-29 22:35:49 +01:00
"os"
2019-08-14 22:34:49 +02:00
"os/exec"
"strings"
2019-08-15 23:01:16 +02:00
"time"
2019-08-14 22:34:49 +02:00
2022-10-03 11:16:00 +02:00
"github.com/go-kit/log"
"github.com/go-kit/log/level"
2019-08-14 22:34:49 +02:00
"github.com/tidwall/gjson"
)
2019-08-15 23:01:16 +02:00
// JSONCache is a single cache entry: a parsed smartctl JSON document together
// with the time it was stored. readData compares LastCollect against the
// configured collection interval to decide whether the entry may be reused.
type JSONCache struct {
JSON gjson . Result // parsed smartctl JSON document
LastCollect time . Time // time at which JSON was stored in the cache
}
// jsonCache holds the most recently stored smartctl result per device, keyed
// by the device string passed to readData. It is initialised at its
// declaration, so the map exists before any init function runs and no
// separate init() is required.
//
// NOTE(review): access to this map is not synchronised; confirm that readData
// is never called concurrently.
var jsonCache = make(map[string]JSONCache)
2019-08-14 22:34:49 +02:00
// Parse json to gjson object
2019-08-15 23:01:16 +02:00
func parseJSON ( data string ) gjson . Result {
2019-08-14 22:34:49 +02:00
if ! gjson . Valid ( data ) {
2019-08-15 23:01:16 +02:00
return gjson . Parse ( "{}" )
2019-08-14 22:34:49 +02:00
}
2019-08-15 23:01:16 +02:00
return gjson . Parse ( data )
2019-08-14 22:34:49 +02:00
}
// Reading fake smartctl json
2022-10-03 11:16:00 +02:00
func readFakeSMARTctl ( logger log . Logger , device string ) gjson . Result {
2022-06-16 10:08:20 +02:00
s := strings . Split ( device , "/" )
filename := fmt . Sprintf ( "debug/%s.json" , s [ len ( s ) - 1 ] )
2022-10-03 11:16:00 +02:00
level . Debug ( logger ) . Log ( "msg" , "Read fake S.M.A.R.T. data from json" , "filename" , filename )
2019-08-14 22:34:49 +02:00
jsonFile , err := ioutil . ReadFile ( filename )
if err != nil {
2022-10-03 11:16:00 +02:00
level . Error ( logger ) . Log ( "msg" , "Fake S.M.A.R.T. data reading error" , "err" , err )
2019-08-14 22:34:49 +02:00
return parseJSON ( "{}" )
}
return parseJSON ( string ( jsonFile ) )
}
// Get json from smartctl and parse it
2022-10-03 11:16:00 +02:00
func readSMARTctl ( logger log . Logger , device string ) ( gjson . Result , bool ) {
level . Debug ( logger ) . Log ( "msg" , "Collecting S.M.A.R.T. counters" , "device" , device )
2022-10-03 14:49:32 +02:00
out , err := exec . Command ( * smartctlPath , "--json" , "--info" , "--health" , "--attributes" , "--tolerance=verypermissive" , "--nocheck=standby" , "--format=brief" , device ) . Output ( )
2019-08-14 22:34:49 +02:00
if err != nil {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "S.M.A.R.T. output reading" , "err" , err )
2019-08-14 22:34:49 +02:00
}
2020-10-29 22:35:49 +01:00
json := parseJSON ( string ( out ) )
2022-10-03 11:16:00 +02:00
rcOk := resultCodeIsOk ( logger , json . Get ( "smartctl.exit_status" ) . Int ( ) )
jsonOk := jsonIsOk ( logger , json )
2020-10-29 22:35:49 +01:00
return json , rcOk && jsonOk
2019-08-14 22:34:49 +02:00
}
2022-10-03 11:16:00 +02:00
func readSMARTctlDevices ( logger log . Logger ) gjson . Result {
2022-10-03 14:49:32 +02:00
level . Debug ( logger ) . Log ( "msg" , "Scanning for devices" )
out , err := exec . Command ( * smartctlPath , "--json" , "--scan" ) . Output ( )
if exiterr , ok := err . ( * exec . ExitError ) ; ok {
level . Debug ( logger ) . Log ( "msg" , "Exit Status" , "exit_code" , exiterr . ExitCode ( ) )
// The smartctl command returns 2 if devices are sleeping, ignore this error.
if exiterr . ExitCode ( ) != 2 {
level . Warn ( logger ) . Log ( "msg" , "S.M.A.R.T. output reading error" , "err" , err )
return gjson . Result { }
}
2020-10-06 13:05:00 +02:00
}
return parseJSON ( string ( out ) )
}
2019-08-14 22:34:49 +02:00
// Select json source and parse
2022-10-03 11:16:00 +02:00
func readData ( logger log . Logger , device string ) ( gjson . Result , error ) {
if * smartctlFakeData {
return readFakeSMARTctl ( logger , device ) , nil
2019-08-14 22:34:49 +02:00
}
2019-08-15 23:01:16 +02:00
2020-10-29 22:35:49 +01:00
if _ , err := os . Stat ( device ) ; err == nil {
cacheValue , cacheOk := jsonCache [ device ]
2022-10-03 11:16:00 +02:00
if ! cacheOk || time . Now ( ) . After ( cacheValue . LastCollect . Add ( * smartctlInterval ) ) {
json , ok := readSMARTctl ( logger , device )
2020-10-29 22:35:49 +01:00
if ok {
jsonCache [ device ] = JSONCache { JSON : json , LastCollect : time . Now ( ) }
return jsonCache [ device ] . JSON , nil
}
return gjson . Parse ( "{}" ) , fmt . Errorf ( "smartctl returned bad data for device %s" , device )
}
2022-08-05 03:09:55 +02:00
return cacheValue . JSON , nil
2020-10-29 22:35:49 +01:00
}
return gjson . Parse ( "{}" ) , fmt . Errorf ( "Device %s unavialable" , device )
}
// Parse smartctl return code
2022-10-03 11:16:00 +02:00
func resultCodeIsOk ( logger log . Logger , SMARTCtlResult int64 ) bool {
2020-10-29 22:35:49 +01:00
result := true
if SMARTCtlResult > 0 {
2022-03-17 23:52:15 +01:00
b := SMARTCtlResult
if ( b & 1 ) != 0 {
2022-10-03 11:16:00 +02:00
level . Error ( logger ) . Log ( "msg" , "Command line did not parse." )
2020-10-29 22:35:49 +01:00
result = false
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 1 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Error ( logger ) . Log ( "msg" , "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode" )
2020-10-29 22:35:49 +01:00
result = false
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 2 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure" )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 3 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "SMART status check returned 'DISK FAILING'." )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 4 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "We found prefail Attributes <= threshold." )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 5 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past." )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 6 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "The device error log contains records of errors." )
2020-10-29 22:35:49 +01:00
}
2022-03-17 23:52:15 +01:00
if ( b & ( 1 << 7 ) ) != 0 {
2022-10-03 11:16:00 +02:00
level . Warn ( logger ) . Log ( "msg" , "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored." )
2020-10-29 22:35:49 +01:00
}
}
return result
}
// Check json
2022-10-03 11:16:00 +02:00
func jsonIsOk ( logger log . Logger , json gjson . Result ) bool {
2020-10-29 22:35:49 +01:00
messages := json . Get ( "smartctl.messages" )
// logger.Debug(messages.String())
if messages . Exists ( ) {
for _ , message := range messages . Array ( ) {
if message . Get ( "severity" ) . String ( ) == "error" {
2022-10-03 11:16:00 +02:00
level . Error ( logger ) . Log ( "msg" , message . Get ( "string" ) . String ( ) )
2020-10-29 22:35:49 +01:00
return false
}
2019-08-15 23:01:16 +02:00
}
}
2020-10-29 22:35:49 +01:00
return true
2019-08-14 22:34:49 +02:00
}