diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go index 03a53b92b93c4..e6a93ff744cba 100644 --- a/pkg/logql/log/filter.go +++ b/pkg/logql/log/filter.go @@ -421,6 +421,9 @@ func (l equalFilter) String() string { } func newEqualFilter(match []byte, caseInsensitive bool) MatcherFilterer { + if caseInsensitive { + match = bytes.ToLower(match) + } return equalFilter{match, caseInsensitive} } @@ -441,7 +444,7 @@ func contains(line, substr []byte, caseInsensitive bool) bool { } // containsLower verifies if substr is a substring of line, with case insensitive comparison. -// substr is expected to be in lowercase. +// substr MUST be in lowercase before calling this function. func containsLower(line, substr []byte) bool { if len(substr) == 0 { return true @@ -458,7 +461,11 @@ func containsLower(line, substr []byte) bool { for i <= maxIndex { // Find potential first byte match c := line[i] - if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' { + // Fast path for ASCII - if c is uppercase letter, convert to lowercase + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' + } + if c != firstByte { i++ continue } @@ -472,9 +479,13 @@ func containsLower(line, substr []byte) bool { c := line[linePos] s := substr[substrPos] - // Fast ASCII comparison + // Fast path for ASCII if c < utf8.RuneSelf && s < utf8.RuneSelf { - if c != s && c+'a'-'A' != s && c != s+'a'-'A' { + // Convert line char to lowercase if needed + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' + } + if c != s { matched = false break } @@ -485,13 +496,28 @@ func containsLower(line, substr []byte) bool { // Slower Unicode path only when needed lr, lineSize := utf8.DecodeRune(line[linePos:]) - mr, substrSize := utf8.DecodeRune(substr[substrPos:]) + if lr == utf8.RuneError && lineSize == 1 { + // Invalid UTF-8, treat as raw bytes + if c >= 'A' && c <= 'Z' { + c += 'a' - 'A' + } + if c != s { + matched = false + break + } + linePos++ + substrPos++ + continue + } - if lr == utf8.RuneError || mr == utf8.RuneError { + mr, substrSize := utf8.DecodeRune(substr[substrPos:]) + if mr == utf8.RuneError && substrSize == 1 { + // Invalid UTF-8 in pattern (shouldn't happen as substr should be valid) matched = false break } + // Compare line rune converted to lowercase with pattern (which is already lowercase) if unicode.ToLower(lr) != mr { matched = false break diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go index 3568e92557cb0..b364ee2888e7f 100644 --- a/pkg/logql/log/filter_test.go +++ b/pkg/logql/log/filter_test.go @@ -220,87 +220,158 @@ func Test_rune(t *testing.T) { require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo"))) } -func BenchmarkContainsLower(b *testing.B) { - cases := []struct { - name string - line string - substr string - expected bool - }{ - { - name: "short_line_no_match", - line: "this is a short log line", - substr: "missing", - expected: false, - }, - { - name: "short_line_with_match", - line: "this is a short log line", - substr: "SHORT", - expected: true, - }, - { - name: "long_line_no_match", - line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", - substr: "nonexistent", - expected: false, - }, - { - name: "long_line_match_start", - line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", - substr: "2023", - expected: true, - }, - { - name: "long_line_match_middle", - line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", - substr: "LEVELS", - expected: true, - }, - { - name: "long_line_match_end", - line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", - substr: "status", - expected: true, - }, - { - name: "short_unicode_line_no_match", - line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", - substr: "missing", - expected: false, - }, - { - name: "short_unicode_line_with_match", - line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", - substr: "EMOJIS", - expected: true, - }, - { - name: "long_unicode_line_no_match", - line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", - substr: "nonexistent", - expected: false, - }, - { - name: "long_unicode_line_match_start", - line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", - substr: "микросервис", - expected: true, - }, - { - name: "long_unicode_line_match_middle", - line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", - substr: "UNICODE", - expected: true, - }, - { - name: "long_unicode_line_match_end", - line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", - substr: "τέλος", - expected: true, - }, +var cases = []struct { + name string + line string + substr string + expected bool +}{ + { + name: "short_line_no_match", + line: "this is a short log line", + substr: "missing", + expected: false, + }, + { + name: "short_line_no_match_special_chars", + line: "this contains a \\ character", + substr: "|", + expected: false, + }, + { + name: "short_line_no_match_special_chars_match", + line: "this contains a | character", + substr: "|", + expected: true, + }, + { + name: "short_line_with_match", + line: "this is a shorT log line", + substr: "short", + expected: true, + }, + { + name: "long_line_no_match", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "nonexistent", + expected: false, + }, + { + name: "long_line_match_start", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "2023", + expected: true, + }, + { + name: "long_line_match_middle", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, leVelS and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "levels", + expected: true, + }, + { + name: "long_line_match_end", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "status", + expected: true, + }, + { + name: "short_unicode_line_no_match", + line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", + substr: "missing", + expected: false, + }, + { + name: "short_unicode_line_with_match", + line: "🌟 Unicode line with eMojiS 🎉 and special chars ñ é ß", + substr: "emojis", + expected: true, + }, + { + name: "long_unicode_line_no_match", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "nonexistent", + expected: false, + }, + { + name: "long_unicode_line_match_start", + line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "микросервис", + expected: true, + }, + { + name: "long_unicode_line_match_middle", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "unicode", + expected: true, + }, + { + name: "long_unicode_line_match_end", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "τέλος", + expected: true, + }, + { + name: "utf8_case_insensitive_match_middle", + line: "ΣΑΣ ΓΕΙΑ ΚΟΣΜΕ", // "WORLD HELLO WORLD" in Greek uppercase + substr: "γεια", // "hello" in Greek lowercase + expected: true, + }, + { + name: "utf8_case_insensitive_no_match", + line: "ΣΑΣ ΚΟΣΜΕ", // "WORLD WORLD" in Greek uppercase + substr: "γεια", // "hello" in Greek lowercase + expected: false, + }, + { + name: "empty_substr", + line: "any line", + substr: "", + expected: true, + }, + { + name: "empty_line", + line: "", + substr: "something", + expected: false, + }, + { + name: "both_empty", + line: "", + substr: "", + expected: true, + }, + { + name: "substr_longer_than_line", + line: "short", + substr: "longer than line", + expected: false, + }, + { + name: "invalid_utf8_in_line", + line: string([]byte{0xFF, 0xFE, 0xFD}), + substr: "test", + expected: false, + }, + { + name: "partial_utf8_match", + line: "Hello 世界", // "Hello World" with CJK characters + substr: "世", // Just "World" + expected: true, + }, +} + +func Test_containsLower(t *testing.T) { + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + line := []byte(c.line) + substr := []byte(c.substr) + m := containsLower(line, substr) + require.Equal(t, c.expected, m, "line: %s substr: %s", c.line, c.substr) + }) } +} +func BenchmarkContainsLower(b *testing.B) { var m bool for _, c := range cases { b.Run(c.name, func(b *testing.B) { diff --git a/pkg/logql/syntax/ast_test.go b/pkg/logql/syntax/ast_test.go index 88fc0021eb33f..ea155520edad5 100644 --- a/pkg/logql/syntax/ast_test.go +++ b/pkg/logql/syntax/ast_test.go @@ -586,6 +586,20 @@ func Test_FilterMatcher(t *testing.T) { }, []linecheck{{"counter=1", false}, {"counter=0", false}, {"counter=-1", true}, {"counter=-2", true}}, }, + { + `{app="foo"} |~ "\\|"`, + []*labels.Matcher{ + mustNewMatcher(labels.MatchEqual, "app", "foo"), + }, + []linecheck{{"\\", false}, {"|", true}}, + }, + { + `{app="foo"} |~ "(?i)\\|"`, + []*labels.Matcher{ + mustNewMatcher(labels.MatchEqual, "app", "foo"), + }, + []linecheck{{"\\", false}, {"|", true}}, + }, } { t.Run(tt.q, func(t *testing.T) { t.Parallel()