Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: prevent fast ascii comparison if char is not letter #15774

Merged
merged 3 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions pkg/logql/log/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,9 @@ func (l equalFilter) String() string {
}

func newEqualFilter(match []byte, caseInsensitive bool) MatcherFilterer {
if caseInsensitive {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @MasslessParticle: `contains` expects the match value to already be lowercased.

Not sure why we use `contains` for an equality match, by the way.

match = bytes.ToLower(match)
}
return equalFilter{match, caseInsensitive}
}

Expand All @@ -441,7 +444,7 @@ func contains(line, substr []byte, caseInsensitive bool) bool {
}

// containsLower verifies if substr is a substring of line, with case insensitive comparison.
// substr is expected to be in lowercase.
// substr MUST be in lowercase before calling this function.
func containsLower(line, substr []byte) bool {
if len(substr) == 0 {
return true
Expand All @@ -458,7 +461,11 @@ func containsLower(line, substr []byte) bool {
for i <= maxIndex {
// Find potential first byte match
c := line[i]
if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' {
// Fast path for ASCII - if c is uppercase letter, convert to lowercase
if c >= 'A' && c <= 'Z' {
c += 'a' - 'A'
}
if c != firstByte {
i++
continue
}
Expand All @@ -472,9 +479,13 @@ func containsLower(line, substr []byte) bool {
c := line[linePos]
s := substr[substrPos]

// Fast ASCII comparison
// Fast path for ASCII
if c < utf8.RuneSelf && s < utf8.RuneSelf {
if c != s && c+'a'-'A' != s && c != s+'a'-'A' {
// Convert line char to lowercase if needed
if c >= 'A' && c <= 'Z' {
c += 'a' - 'A'
}
if c != s {
matched = false
break
}
Expand All @@ -485,13 +496,28 @@ func containsLower(line, substr []byte) bool {

// Slower Unicode path only when needed
lr, lineSize := utf8.DecodeRune(line[linePos:])
mr, substrSize := utf8.DecodeRune(substr[substrPos:])
if lr == utf8.RuneError && lineSize == 1 {
// Invalid UTF-8, treat as raw bytes
if c >= 'A' && c <= 'Z' {
c += 'a' - 'A'
}
if c != s {
matched = false
break
}
linePos++
substrPos++
continue
}

if lr == utf8.RuneError || mr == utf8.RuneError {
mr, substrSize := utf8.DecodeRune(substr[substrPos:])
if mr == utf8.RuneError && substrSize == 1 {
// Invalid UTF-8 in pattern (shouldn't happen as substr should be valid)
matched = false
break
}

// Compare line rune converted to lowercase with pattern (which is already lowercase)
if unicode.ToLower(lr) != mr {
matched = false
break
Expand Down
229 changes: 150 additions & 79 deletions pkg/logql/log/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,87 +220,158 @@ func Test_rune(t *testing.T) {
require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo")))
}

func BenchmarkContainsLower(b *testing.B) {
cases := []struct {
name string
line string
substr string
expected bool
}{
{
name: "short_line_no_match",
line: "this is a short log line",
substr: "missing",
expected: false,
},
{
name: "short_line_with_match",
line: "this is a short log line",
substr: "SHORT",
expected: true,
},
{
name: "long_line_no_match",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "nonexistent",
expected: false,
},
{
name: "long_line_match_start",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "2023",
expected: true,
},
{
name: "long_line_match_middle",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "LEVELS",
expected: true,
},
{
name: "long_line_match_end",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "status",
expected: true,
},
{
name: "short_unicode_line_no_match",
line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
substr: "missing",
expected: false,
},
{
name: "short_unicode_line_with_match",
line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
substr: "EMOJIS",
expected: true,
},
{
name: "long_unicode_line_no_match",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "nonexistent",
expected: false,
},
{
name: "long_unicode_line_match_start",
line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "микросервис",
expected: true,
},
{
name: "long_unicode_line_match_middle",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "UNICODE",
expected: true,
},
{
name: "long_unicode_line_match_end",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "τέλος",
expected: true,
},
// cases is the table of inputs shared by Test_containsLower and
// BenchmarkContainsLower. Each entry supplies a line to search, the substr to
// look for, and the result containsLower is expected to return.
// Every substr in the table is already lowercase, as containsLower requires.
var cases = []struct {
name string
line string
substr string
expected bool
}{
{
name: "short_line_no_match",
line: "this is a short log line",
substr: "missing",
expected: false,
},
{
// '\' (0x5C) and '|' (0x7C) differ by exactly 'a'-'A' (0x20), so a
// case-fold that is not restricted to letters would treat them as equal.
name: "short_line_no_match_special_chars",
line: "this contains a \\ character",
substr: "|",
expected: false,
},
{
// Counterpart of the entry above: a literal '|' in the line must still match.
name: "short_line_no_match_special_chars_match",
line: "this contains a | character",
substr: "|",
expected: true,
},
{
name: "short_line_with_match",
line: "this is a shorT log line",
substr: "short",
expected: true,
},
{
name: "long_line_no_match",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "nonexistent",
expected: false,
},
{
name: "long_line_match_start",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "2023",
expected: true,
},
{
name: "long_line_match_middle",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, leVelS and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "levels",
expected: true,
},
{
name: "long_line_match_end",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "status",
expected: true,
},
{
name: "short_unicode_line_no_match",
line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
substr: "missing",
expected: false,
},
{
name: "short_unicode_line_with_match",
line: "🌟 Unicode line with eMojiS 🎉 and special chars ñ é ß",
substr: "emojis",
expected: true,
},
{
name: "long_unicode_line_no_match",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "nonexistent",
expected: false,
},
{
name: "long_unicode_line_match_start",
line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "микросервис",
expected: true,
},
{
name: "long_unicode_line_match_middle",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "unicode",
expected: true,
},
{
name: "long_unicode_line_match_end",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέлος",
substr: "τέλος",
expected: true,
},
{
name: "utf8_case_insensitive_match_middle",
line: "ΣΑΣ ΓΕΙΑ ΚΟΣΜΕ", // Greek uppercase; the middle word "ΓΕΙΑ" is "hello"
substr: "γεια", // "hello" in Greek lowercase
expected: true,
},
{
name: "utf8_case_insensitive_no_match",
line: "ΣΑΣ ΚΟΣΜΕ", // Greek uppercase, without the word "ΓΕΙΑ"
substr: "γεια", // "hello" in Greek lowercase
expected: false,
},
{
name: "empty_substr",
line: "any line",
substr: "",
expected: true,
},
{
name: "empty_line",
line: "",
substr: "something",
expected: false,
},
{
name: "both_empty",
line: "",
substr: "",
expected: true,
},
{
name: "substr_longer_than_line",
line: "short",
substr: "longer than line",
expected: false,
},
{
// The line consists solely of bytes that never form valid UTF-8.
name: "invalid_utf8_in_line",
line: string([]byte{0xFF, 0xFE, 0xFD}),
substr: "test",
expected: false,
},
{
name: "partial_utf8_match",
line: "Hello 世界", // "Hello World" with CJK characters
substr: "世", // only the first character of "世界" ("world")
expected: true,
},
}

// Test_containsLower feeds every entry of the shared cases table to
// containsLower as a named subtest and verifies the returned match flag
// against the entry's expected value.
func Test_containsLower(t *testing.T) {
	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := containsLower([]byte(tc.line), []byte(tc.substr))
			require.Equal(t, tc.expected, got, "line: %s substr: %s", tc.line, tc.substr)
		})
	}
}

func BenchmarkContainsLower(b *testing.B) {
var m bool
for _, c := range cases {
b.Run(c.name, func(b *testing.B) {
Expand Down
14 changes: 14 additions & 0 deletions pkg/logql/syntax/ast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,20 @@ func Test_FilterMatcher(t *testing.T) {
},
[]linecheck{{"counter=1", false}, {"counter=0", false}, {"counter=-1", true}, {"counter=-2", true}},
},
{
`{app="foo"} |~ "\\|"`,
[]*labels.Matcher{
mustNewMatcher(labels.MatchEqual, "app", "foo"),
},
[]linecheck{{"\\", false}, {"|", true}},
},
{
`{app="foo"} |~ "(?i)\\|"`,
[]*labels.Matcher{
mustNewMatcher(labels.MatchEqual, "app", "foo"),
},
[]linecheck{{"\\", false}, {"|", true}},
},
} {
t.Run(tt.q, func(t *testing.T) {
t.Parallel()
Expand Down
Loading