Skip to content

Commit

Permalink
fix(dmesg): "journalctl" as fallback, when older dmesg does not suppo…
Browse files Browse the repository at this point in the history
…rt "--since" flag (<2.37) (#282)

* fix problem two in #272
  • Loading branch information
sunhailin-Leo authored Jan 3, 2025
1 parent 02995c7 commit 8a85e09
Show file tree
Hide file tree
Showing 7 changed files with 223 additions and 27 deletions.
3 changes: 1 addition & 2 deletions components/diagnose/diagnose.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"github.com/leptonai/gpud/components/dmesg"
query_log_common "github.com/leptonai/gpud/components/query/log/common"
query_log_tail "github.com/leptonai/gpud/components/query/log/tail"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/host"
"github.com/leptonai/gpud/pkg/process"
pkd_systemd "github.com/leptonai/gpud/pkg/systemd"
Expand Down Expand Up @@ -150,7 +149,7 @@ func run(ctx context.Context, dir string, opts ...OpOption) error {
query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands),
query_log_tail.WithLinesToTail(5000),
query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...),
query_log_tail.WithExtractTime(pkg_dmesg.ParseISOtimeWithError),
query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
o.CheckSummary = append(o.CheckSummary, fmt.Sprintf("dmesg match: %s", string(line)))
}),
Expand Down
3 changes: 1 addition & 2 deletions components/diagnose/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
query_log_tail "github.com/leptonai/gpud/components/query/log/tail"
"github.com/leptonai/gpud/log"
"github.com/leptonai/gpud/pkg/disk"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/file"
"github.com/leptonai/gpud/pkg/fuse"
"github.com/leptonai/gpud/pkg/host"
Expand Down Expand Up @@ -226,7 +225,7 @@ func Scan(ctx context.Context, opts ...OpOption) error {
query_log_tail.WithCommands(defaultDmesgCfg.Log.Scan.Commands),
query_log_tail.WithLinesToTail(op.lines),
query_log_tail.WithSelectFilter(defaultDmesgCfg.Log.SelectFilters...),
query_log_tail.WithExtractTime(pkg_dmesg.ParseISOtimeWithError),
query_log_tail.WithExtractTime(defaultDmesgCfg.Log.TimeParseFunc),
query_log_tail.WithProcessMatched(func(time time.Time, line []byte, matched *query_log_common.Filter) {
log.Logger.Debugw("matched", "line", string(line))
fmt.Println("line", string(line))
Expand Down
123 changes: 110 additions & 13 deletions components/dmesg/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@ import (
"encoding/json"
"os"
"os/exec"
"regexp"
"strconv"
"strings"

query_config "github.com/leptonai/gpud/components/query/config"
query_log_config "github.com/leptonai/gpud/components/query/log/config"
"github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/process"
)

type Config struct {
Expand Down Expand Up @@ -45,25 +50,87 @@ func DmesgExists() bool {
return p != ""
}

const DefaultDmesgFile = "/var/log/dmesg"
const (
// DefaultDmesgFile default path with dmesg file
DefaultDmesgFile = "/var/log/dmesg"
// DefaultDmesgCmd DefaultDmesgCmdWithSince default scan dmesg command (in newer util-linux it works, but older is not)
// some old dmesg versions don't support --since, thus fall back to the one without --since and tail the last 200 lines
// ref. https://github.com/leptonai/gpud/issues/32
DefaultDmesgCmd = "dmesg --time-format=iso --nopager --buffer-size 163920"
DefaultDmesgCmdWithSince = "dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago'"
DefaultScanDmesgCmd = DefaultDmesgCmdWithSince + " || " + DefaultDmesgCmd + " | tail -n 200"

// dmesg --version that supports "--since" flag
// ref. https://github.com/util-linux/util-linux/blob/master/Documentation/releases/v2.37-ReleaseNotes
dmesgSinceFlagSupportVersion = 2.37

// DefaultJournalCtlCmd default scan journalctl command
DefaultJournalCtlCmd = "journalctl -qk -o short-iso --no-pager --since '1 hour ago' | tail -n 200"
DefaultJournalCtlScanCmd = "journalctl -qk -o short-iso --no-pager --since '1 hour ago' -f || true"
)

var dmesgVersionRegPattern = regexp.MustCompile(`\d+\.\d+`)

func checkDmesgVersionOutputForSinceFlag(verOutput string) bool {
matches := dmesgVersionRegPattern.FindString(verOutput)
if matches != "" {
if versionF, parseErr := strconv.ParseFloat(matches, 64); parseErr == nil {
if versionF >= dmesgSinceFlagSupportVersion {
return true
}
}
}

return false
}

func checkDmesgSupportsSinceFlag(ctx context.Context) bool {
p, err := process.New(
process.WithCommand("dmesg --version"),
process.WithRunAsBashScript(),
)
if err != nil {
return false
}

if err := p.Start(ctx); err != nil {
return false
}

lines := make([]string, 0)
if err := process.Read(
ctx,
p,
process.WithReadStdout(),
process.WithReadStderr(),
process.WithProcessLine(func(line string) {
lines = append(lines, line)
}),
process.WithWaitForCmd(),
); err != nil {
return false
}

line := strings.Join(lines, "\n")
line = strings.TrimSpace(line)

return checkDmesgVersionOutputForSinceFlag(line)
}

func DefaultConfig(ctx context.Context) (Config, error) {
// isUse is false ==> journalctl, true ==> dmesg
if supported := checkDmesgSupportsSinceFlag(ctx); !supported {
return journalCtlDefaultConfig(ctx)
}

defaultFilters, err := DefaultLogFilters(ctx)
if err != nil {
return Config{}, err
}

scanCommands := [][]string{
// some old dmesg versions don't support --since, thus fall back to the one without --since and tail the last 200 lines
// ref. https://github.com/leptonai/gpud/issues/32
{"dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago' || dmesg --time-format=iso --nopager --buffer-size 163920 | tail -n 200"},
}
scanCommands := [][]string{{DefaultScanDmesgCmd}}
if _, err := os.Stat(DefaultDmesgFile); os.IsNotExist(err) {
scanCommands = [][]string{
// some old dmesg versions don't support --since, thus fall back to the one without --since and tail the last 200 lines
// ref. https://github.com/leptonai/gpud/issues/32
{"dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago' || dmesg --time-format=iso --nopager --buffer-size 163920 | tail -n 200"},
}
scanCommands = [][]string{{DefaultScanDmesgCmd}}
}

cfg := Config{
Expand All @@ -73,8 +140,8 @@ func DefaultConfig(ctx context.Context) (Config, error) {

Commands: [][]string{
// run last commands as fallback, in case dmesg flag only works in some machines
{"dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago' -w || dmesg --time-format=iso --nopager --buffer-size 163920 -w || true"},
{"dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago' -W || dmesg --time-format=iso --nopager --buffer-size 163920 -W"},
{DefaultDmesgCmdWithSince + " -w || " + DefaultDmesgCmd + " -w || true"},
{DefaultDmesgCmdWithSince + " -W || " + DefaultDmesgCmd + " -W"},
},

Scan: &query_log_config.Scan{
Expand All @@ -83,6 +150,36 @@ func DefaultConfig(ctx context.Context) (Config, error) {
},

SelectFilters: defaultFilters,

TimeParseFunc: dmesg.ParseISOtimeWithError,
},
}
return cfg, nil
}

// journalCtlDefaultConfig In older util-linux version it can`t compatible dmesg command args
// like --time-format or --since. So, we will use journalctl -k to instead it.
func journalCtlDefaultConfig(ctx context.Context) (Config, error) {
defaultFilters, err := DefaultLogFilters(ctx)
if err != nil {
return Config{}, err
}

cfg := Config{
Log: query_log_config.Config{
Query: query_config.DefaultConfig(),
BufferSize: query_log_config.DefaultBufferSize,

Commands: [][]string{{DefaultJournalCtlScanCmd}},

Scan: &query_log_config.Scan{
Commands: [][]string{{DefaultJournalCtlCmd}},
LinesToTail: 10000,
},

SelectFilters: defaultFilters,

TimeParseFunc: dmesg.ParseShortISOtimeWithError,
},
}
return cfg, nil
Expand Down
3 changes: 3 additions & 0 deletions components/query/log/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ type Config struct {

// Used to commit the last seek info to disk.
SeekInfoSyncer func(ctx context.Context, file string, seekInfo tail.SeekInfo) `json:"-"`

// Parse time format
TimeParseFunc query_log_common.ExtractTimeFunc `json:"-"`
}

// For each interval, execute the scanning operation
Expand Down
11 changes: 11 additions & 0 deletions pkg/dmesg/dmesg-time-format-short-iso.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(vetha17b2cc) entered disabled state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(vetha17b2cc) entered disabled state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: device vetha17b2cc left promiscuous mode
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(vetha17b2cc) entered disabled state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(veth9d4400a) entered blocking state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(veth9d4400a) entered disabled state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: device veth9d4400a entered promiscuous mode
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: IPv6: ADDRCONF(NETDEV_UP): veth9d4400a: link is not ready
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(veth9d4400a) entered blocking state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: docker0: port 1(veth9d4400a) entered forwarding state
2025-01-02T15:20:12+0800 jmsph-pro-a800-77-123 kernel: IPv6: ADDRCONF(NETDEV_CHANGE): veth9d4400a: link becomes ready
40 changes: 31 additions & 9 deletions pkg/dmesg/dmesg.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@ import (
"time"
)

const isoTimeFormat = "2006-01-02T15:04:05,999999-07:00"
const (
isoTimeFormat = "2006-01-02T15:04:05,999999-07:00"
shortIsoTimeFormat = "2006-01-02T15:04:05-0700"
)

var isoTimeFormatN = len(isoTimeFormat)
var (
isoTimeFormatN = len(isoTimeFormat)
shortIsoTimeFormatN = len(shortIsoTimeFormat)
)

// Parses the timestamp from "dmesg --time-format=iso" output lines.
// ParseISOtimeWithError parses the timestamp from "dmesg --time-format=iso" output lines.
// ref.
// "The definition of the iso timestamp is: YYYY-MM-DD<T>HH:MM:SS,<microseconds>←+><timezone offset from UTC>."
func ParseISOtimeWithError(line []byte) (time.Time, []byte, error) {
Expand All @@ -30,21 +36,37 @@ func ParseISOtimeWithError(line []byte) (time.Time, []byte, error) {
return parsedTime, extractedLine, nil
}

// ParseShortISOtimeWithError parses the timestamp from "journalctl -o short-iso" output lines.
func ParseShortISOtimeWithError(line []byte) (time.Time, []byte, error) {
if len(line) < shortIsoTimeFormatN {
return time.Time{}, nil, errors.New("line is too short")
}

// Example input: 2025-01-02T15:20:12+0800
parsedTime, err := time.Parse("2006-01-02T15:04:05-0700", string(line[:shortIsoTimeFormatN]))
if err != nil {
return time.Time{}, nil, err
}

extractedLine := bytes.TrimSpace(line[shortIsoTimeFormatN:])
return parsedTime, extractedLine, nil
}

var regexForDmesgTime = regexp.MustCompile(`^\[([^\]]+)\]`)

// Parses the timestamp from "dmesg --ctime" output lines.
// ParseCtimeWithError Parses the timestamp from "dmesg --ctime" output lines.
// Returns a zero time if the timestamp is not found or the timestamp is invalid.
// Returns an error if the timestamp is not found or the timestamp is invalid.
func ParseCtimeWithError(line []byte) (time.Time, error) {
func ParseCtimeWithError(line []byte) (time.Time, []byte, error) {
matches := regexForDmesgTime.FindStringSubmatch(string(line))
if len(matches) == 0 {
return time.Time{}, errors.New("no timestamp matches found")
return time.Time{}, nil, errors.New("no timestamp matches found")
}

s := matches[1]
timestamp, err := time.Parse("Mon Jan 2 15:04:05 2006", s)
timestamp, err := time.Parse(time.ANSIC, s)
if err != nil {
return time.Time{}, err
return time.Time{}, nil, err
}
return timestamp, nil
return timestamp, line, nil
}
67 changes: 66 additions & 1 deletion pkg/dmesg/dmesg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,24 @@ func TestParseISOtimeWithFile(t *testing.T) {
}
}

func TestParseShortISOtimeWithFile(t *testing.T) {
b, err := os.ReadFile("dmesg-time-format-short-iso.log")
if err != nil {
t.Fatalf("failed to read dmesg-time-format-short-iso.log: %v", err)
}

for _, line := range strings.Split(string(b), "\n") {
if len(line) == 0 {
continue
}
time, extractedLine, err := ParseShortISOtimeWithError([]byte(line))
if err != nil {
t.Errorf("failed to parse line: %v", err)
}
t.Logf("time: %s, line: %s (original: %s)", time, extractedLine, line)
}
}

func TestParseISOtimeWithError(t *testing.T) {
tests := []struct {
name string
Expand Down Expand Up @@ -73,6 +91,53 @@ func TestParseISOtimeWithError(t *testing.T) {
}
}

func TestParseShortISOtimeWithError(t *testing.T) {
tests := []struct {
name string
line []byte
want time.Time
wantLine []byte
wantErr bool
}{
{
name: "ValidISO8601",
line: []byte("2024-11-15T12:02:03+0000 abc"),
want: time.Date(2024, 11, 15, 12, 2, 3, 0, time.UTC),
wantLine: []byte("abc"),
wantErr: false,
},
{
name: "ShortLine",
line: []byte("2024-11-15"),
want: time.Time{},
wantLine: nil,
wantErr: true,
},
{
name: "InvalidFormat",
line: []byte("2024-11-15 12:02:03,561522+00:00 abc"),
want: time.Time{},
wantLine: nil,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, line, err := ParseShortISOtimeWithError(tt.line)
if (err != nil) != tt.wantErr {
t.Errorf("ParseShortISOtimeWithError() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !got.Equal(tt.want) {
t.Errorf("ParseShortISOtimeWithError() got = %v, want %v", got, tt.want)
}
if !bytes.Equal(line, tt.wantLine) {
t.Errorf("ParseShortISOtimeWithError() line = %v, want %v", string(line), string(tt.wantLine))
}
})
}
}

func TestParseCtime(t *testing.T) {
tests := []struct {
name string
Expand Down Expand Up @@ -198,7 +263,7 @@ func TestParseCtime(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := ParseCtimeWithError([]byte(tt.input))
got, _, err := ParseCtimeWithError([]byte(tt.input))
if (err != nil) != tt.wantErr {
t.Errorf("ParseCtimeWithError() error = %v, wantErr %v", err, tt.wantErr)
return
Expand Down

0 comments on commit 8a85e09

Please sign in to comment.