Skip to content

Commit

Permalink
[fix] - Improve UTF8 decoder's handling of non-printable characters (#…
Browse files Browse the repository at this point in the history
…3588)

* Avoid removing non-printable characters when decoding

* use byte slice

* remove new line

---------

Co-authored-by: Miccah <[email protected]>
  • Loading branch information
ahrav and mcastorina authored Nov 15, 2024
1 parent cca7e6b commit c6abe85
Show file tree
Hide file tree
Showing 3 changed files with 350 additions and 35 deletions.
4 changes: 2 additions & 2 deletions pkg/decoders/utf16.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ func utf16ToUTF8(b []byte) ([]byte, error) {
var bufBE, bufLE bytes.Buffer
for i := 0; i < len(b)-1; i += 2 {
if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
if isPrintableByte(byte(r)) {
bufBE.WriteRune(r)
}
}
if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
if isPrintableByte(byte(r)) {
bufLE.WriteRune(r)
}
}
Expand Down
70 changes: 46 additions & 24 deletions pkg/decoders/utf8.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package decoders

import (
"bytes"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
Expand Down Expand Up @@ -29,35 +28,58 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *DecodableChunk {
return decodableChunk
}

// extractSubstrings performs similarly to the strings binutil,
// extacting contigous portions of printable characters that we care
// about from some bytes
func extractSubstrings(b []byte) []byte {
// utf8ReplacementBytes holds the UTF-8 encoded form of the Unicode replacement character (U+FFFD).
// This is pre-computed since it's used frequently when replacing invalid UTF-8 sequences
// and control characters.
var utf8ReplacementBytes = []byte(string(utf8.RuneError))

field := make([]byte, len(b))
fieldLen := 0
buf := &bytes.Buffer{}
for i, c := range b {
if isValidByte(c) {
field[fieldLen] = c
fieldLen++
} else {
if fieldLen > 5 {
buf.Write(field[:fieldLen])
// extractSubstrings sanitizes byte sequences to ensure consistent handling of malformed input
// while maintaining readable content. It handles ASCII and UTF-8 data as follows:
//
// For ASCII range (0-127): preserves printable characters (32-126) while replacing
// control characters with the UTF-8 replacement character.
// https://cs.opensource.google/go/go/+/refs/tags/go1.23.3:src/unicode/utf8/utf8.go;l=16
//
// For multi-byte sequences: preserves valid UTF-8 as-is, while invalid sequences
// are replaced with a single UTF-8 replacement character.
func extractSubstrings(b []byte) []byte {
dataLen := len(b)
buf := make([]byte, 0, dataLen)
for idx := 0; idx < dataLen; {
// If it's ASCII, handle separately.
// This is faster than decoding for common cases.
if b[idx] < utf8.RuneSelf {
if isPrintableByte(b[idx]) {
buf = append(buf, b[idx])
} else {
buf = append(buf, utf8ReplacementBytes...)
}
fieldLen = 0
idx++
continue
}

if i == len(b)-1 && fieldLen > 5 {
buf.Write(field[:fieldLen])
r, size := utf8.DecodeRune(b[idx:])
if r == utf8.RuneError {
// Collapse any malformed sequence into a single replacement character
// rather than replacing each byte individually.
buf = append(buf, utf8ReplacementBytes...)
idx++
} else {
// Keep valid multi-byte UTF-8 sequences intact to preserve unicode characters.
buf = append(buf, b[idx:idx+size]...)
idx += size
}
}

return buf.Bytes()
return buf
}

// isValidByte reports whether c lies in the ASCII range from space through
// tilde; callers split on anything outside that range.
//
// https://www.rapidtables.com/code/text/ascii-table.html
func isValidByte(c byte) bool {
	const (
		firstPrintable = ' ' // 32
		lastPrintable  = '~' // 126
	)
	return firstPrintable <= c && c <= lastPrintable
}
// isPrintableByte reports whether c is a printable ASCII character: a letter,
// digit, punctuation mark, symbol, or the space character. Control characters
// are excluded, which is why the accepted range stops at 0x7E ('~') and does
// not include 0x7F (DEL). Bytes of 0x80 and above are never printable here.
//
// The check is a plain byte-range comparison, so callers can classify ASCII
// input without paying for UTF-8 decoding.
//
// https://www.rapidtables.com/code/text/ascii-table.html
func isPrintableByte(c byte) bool {
	return c >= 0x20 && c <= 0x7e
}
Loading

0 comments on commit c6abe85

Please sign in to comment.