Skip to content

Commit

Permalink
rpm: extract filename information
Browse files Browse the repository at this point in the history
This should allow us to extract a list of known file patterns for use
later in the Indexer pipeline.

Signed-off-by: Hank Donnay <[email protected]>
  • Loading branch information
hdonnay committed Apr 15, 2024
1 parent e8f9aff commit a24db2a
Show file tree
Hide file tree
Showing 5 changed files with 1,613 additions and 1 deletion.
77 changes: 76 additions & 1 deletion rpm/native_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"context"
"fmt"
"io"
"path"
"regexp"
"runtime/trace"
"strings"

Expand Down Expand Up @@ -121,14 +123,17 @@ type Info struct {
Module string
Arch string
Digest string
Signature []byte // This is a PGP signature packet.
Signature []byte // This is a PGP signature packet.
Filenames []string // Filtered by the [filePatterns] regexp.
DigestAlgo int
Epoch int
}

// Load populates the receiver with information extracted from the provided
// [rpm.Header].
func (i *Info) Load(ctx context.Context, h *rpm.Header) error {
var dirname, basename []string
var dirindex []int32
for idx := range h.Infos {
e := &h.Infos[idx]
if _, ok := wantTags[e.Tag]; !ok {
Expand Down Expand Up @@ -159,14 +164,84 @@ func (i *Info) Load(ctx context.Context, h *rpm.Header) error {
i.Digest = v.([]string)[0]
case rpm.TagSigPGP:
i.Signature = v.([]byte)
case rpm.TagDirnames:
dirname = v.([]string)
case rpm.TagDirindexes:
dirindex = v.([]int32)
case rpm.TagBasenames:
basename = v.([]string)
case rpm.TagFilenames:

Check warning on line 173 in rpm/native_db.go

View check run for this annotation

Codecov / codecov/patch

rpm/native_db.go#L173

Added line #L173 was not covered by tests
// Filenames is the tag used in rpm4 -- this is a best-effort for
// supporting it.
for _, name := range v.([]string) {
if !filePatterns.MatchString(name) {

Check warning on line 177 in rpm/native_db.go

View check run for this annotation

Codecov / codecov/patch

rpm/native_db.go#L176-L177

Added lines #L176 - L177 were not covered by tests
// Record the name as a relative path, as that's what we use
// everywhere else.
i.Filenames = append(i.Filenames, name[1:])

Check warning on line 180 in rpm/native_db.go

View check run for this annotation

Codecov / codecov/patch

rpm/native_db.go#L180

Added line #L180 was not covered by tests
}
}
}
}

// Catch panics from malformed headers. Can't think of a better way to
// handle this.
defer func() {
if r := recover(); r == nil {
return
}
zlog.Warn(ctx).
Str("name", i.Name).
Strs("basename", basename).
Strs("dirname", dirname).
Ints32("dirindex", dirindex).
Msg("caught panic in filename construction")
i.Filenames = nil

Check warning on line 198 in rpm/native_db.go

View check run for this annotation

Codecov / codecov/patch

rpm/native_db.go#L192-L198

Added lines #L192 - L198 were not covered by tests
}()
for j := range basename {
// We only want '/'-separated paths, even if running on some other,
// weird OS. It seems that RPM assumes '/' throughout.
name := path.Join(dirname[dirindex[j]], basename[j])
if filePatterns.MatchString(name) {
// Record the name as a relative path, as that's what we use
// everywhere else.
i.Filenames = append(i.Filenames, name[1:])
}
}
return nil
}

// filePatterns is a regular expression for *any* file that may need to be
// recorded alongside a package.
//
// The tested strings are absolute, '/'-separated paths. Every alternative is
// anchored at both ends, so a path has to match one of the patterns in its
// entirety.
//
// TODO(hank) The blanket binary pattern is too broad and can miss things.
// Long-term, we should add pattern matching akin to [yara] or file(1) as a
// plugin mechanism that all indexers can use. That way, the Go indexer
// could register a pattern and use a shared filter over the
// [fs.WalkDirFunc] while this package (and dpkg, etc) can tell that another
// indexer will find those files relevant.
//
// [yara]: https://github.com/VirusTotal/yara
var filePatterns = regexp.MustCompile(strings.Join([]string{
	`^.*/[^/]+\.jar$`,                             // Jar files
	`^.*/site-packages/[^/]+\.egg-info/PKG-INFO$`, // Python packages
	`^.*/package\.json$`,                          // npm packages; the dot is escaped so only a literal "package.json" matches
	`^.*/[^/]+\.gemspec$`,                         // ruby gems
	`^/usr/bin/[^/]+$`,                            // any executable
}, `|`))

var wantTags = map[rpm.Tag]struct{}{
rpm.TagArch: {},
rpm.TagBasenames: {},
rpm.TagDirindexes: {},
rpm.TagDirnames: {},
rpm.TagEpoch: {},
rpm.TagFilenames: {},
rpm.TagModularityLabel: {},
rpm.TagName: {},
rpm.TagPayloadDigest: {},
Expand Down
125 changes: 125 additions & 0 deletions rpm/native_db_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package rpm

import (
"context"
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/quay/zlog"
"golang.org/x/tools/txtar"

"github.com/quay/claircore/rpm/bdb"
"github.com/quay/claircore/rpm/internal/rpm"
"github.com/quay/claircore/rpm/ndb"
"github.com/quay/claircore/rpm/sqlite"
)

// TestInfo checks the data that [Info.Load] extracts from rpm headers.
func TestInfo(t *testing.T) {
	// The "Files" group is fixture-driven: every txtar archive matching the
	// glob below names a database file in its comment section and carries a
	// "want.json" member mapping package names to the expected
	// Info.Filenames for that package.
	t.Run("Files", func(t *testing.T) {
		ms, err := filepath.Glob("testdata/Info.Files.*.txtar")
		if err != nil {
			t.Fatal(err)
		}
		for _, m := range ms {
			ar, err := txtar.ParseFile(m)
			if err != nil {
				t.Fatal(err)
			}
			// Subtest name is the fixture's file name without the
			// "Info.Files." prefix and ".txtar" suffix.
			name := strings.TrimPrefix(strings.TrimSuffix(filepath.Base(m), ".txtar"), "Info.Files.")
			t.Run(name, func(t *testing.T) {
				t.Parallel()
				ctx := zlog.Test(context.Background(), t)
				filename := strings.TrimSpace(string(ar.Comment))
				t.Logf("opening %q", filename)

				var want map[string][]string
				for _, f := range ar.Files {
					if f.Name == "want.json" {
						want = make(map[string][]string)
						if err := json.Unmarshal(f.Data, &want); err != nil {
							t.Fatal(err)
						}
						break
					}
				}
				if want == nil {
					t.Fatal(`"want.json" not found`)
				}

				// The path element before "/testdata/" selects which
				// database implementation is used to open the file.
				pre, _, ok := strings.Cut(filename, `/testdata/`)
				if !ok {
					t.Fatal("input file not in a testdata directory")
				}

				var nat nativeDB
				switch pre {
				case `bdb`:
					f, err := os.Open(filename)
					if err != nil {
						t.Fatal(err)
					}
					t.Cleanup(func() { f.Close() })
					var db bdb.PackageDB
					if err := db.Parse(f); err != nil {
						t.Fatal(err)
					}
					nat = &db
				case `ndb`:
					f, err := os.Open(filename)
					if err != nil {
						t.Fatal(err)
					}
					t.Cleanup(func() { f.Close() })
					var db ndb.PackageDB
					if err := db.Parse(f); err != nil {
						t.Fatal(err)
					}
					nat = &db
				case `sqlite`:
					db, err := sqlite.Open(filename)
					if err != nil {
						t.Fatal(err)
					}
					t.Cleanup(func() { db.Close() })
					nat = db
				default:
					// Without this, nat stays nil and the AllHeaders call
					// below panics instead of reporting a usable failure.
					t.Fatalf("unrecognized database kind %q for input %q", pre, filename)
				}

				rds, err := nat.AllHeaders(ctx)
				if err != nil {
					t.Fatal(err)
				}

				got := make(map[string][]string, len(want))
				for _, rd := range rds {
					var h rpm.Header
					if err := h.Parse(ctx, rd); err != nil {
						t.Error(err)
						continue
					}
					var info Info
					if err := info.Load(ctx, &h); err != nil {
						t.Error(err)
						continue
					}
					if info.Name == "gpg-pubkey" {
						// This is *not* an rpm package. It is just a public key stored in the rpm database.
						// Ignore this "package".
						continue
					}
					got[info.Name] = info.Filenames
				}

				if !cmp.Equal(got, want) {
					t.Error(cmp.Diff(got, want))
				}
			})
		}
	})
}
Loading

0 comments on commit a24db2a

Please sign in to comment.