From 05789ac825fcb011640a01e1bddc72f2c0bf4f10 Mon Sep 17 00:00:00 2001
From: Hank Donnay <hdonnay@redhat.com>
Date: Fri, 29 Mar 2024 16:20:36 -0500
Subject: [PATCH] rpm: extract filename information

This should allow us to extract a list of known file patterns for use
later in the Indexer pipeline.

Signed-off-by: Hank Donnay <hdonnay@redhat.com>
---
 rpm/native_db.go | 77 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/rpm/native_db.go b/rpm/native_db.go
index 1cc43f72e..d4ca70540 100644
--- a/rpm/native_db.go
+++ b/rpm/native_db.go
@@ -5,6 +5,8 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"path"
+	"regexp"
 	"runtime/trace"
 	"strings"
 
@@ -121,7 +123,8 @@ type Info struct {
 	Module     string
 	Arch       string
 	Digest     string
-	Signature  []byte // This is a PGP signature packet.
+	Signature  []byte   // This is a PGP signature packet.
+	Filenames  []string // Filtered by the [filePatterns] regexp.
 	DigestAlgo int
 	Epoch      int
 }
@@ -129,6 +132,8 @@ type Info struct {
 // Load populates the receiver with information extracted from the provided
 // [rpm.Header].
 func (i *Info) Load(ctx context.Context, h *rpm.Header) error {
+	var dirname, basename []string
+	var dirindex []int32
 	for idx := range h.Infos {
 		e := &h.Infos[idx]
 		if _, ok := wantTags[e.Tag]; !ok {
@@ -159,14 +164,84 @@ func (i *Info) Load(ctx context.Context, h *rpm.Header) error {
 			i.Digest = v.([]string)[0]
 		case rpm.TagSigPGP:
 			i.Signature = v.([]byte)
+		case rpm.TagDirnames:
+			dirname = v.([]string)
+		case rpm.TagDirindexes:
+			dirindex = v.([]int32)
+		case rpm.TagBasenames:
+			basename = v.([]string)
+		case rpm.TagFilenames:
+			// Filenames is the tag used in rpm4 -- this is a best-effort for
+			// supporting it.
+			for _, name := range v.([]string) {
+				if filePatterns.MatchString(name) {
+					// Keep matches only, recorded as relative paths, as that's
+					// what we use everywhere else.
+					i.Filenames = append(i.Filenames, name[1:])
+				}
+			}
+		}
+	}
+
+	// Catch panics from malformed headers. Can't think of a better way to
+	// handle this.
+	defer func() {
+		if r := recover(); r == nil {
+			return
+		}
+		zlog.Warn(ctx).
+			Str("name", i.Name).
+			Strs("basename", basename).
+			Strs("dirname", dirname).
+			Ints32("dirindex", dirindex).
+			Msg("caught panic in filename construction")
+		i.Filenames = nil
+	}()
+	for j := range basename {
+		// We only want '/'-separated paths, even if running on some other,
+		// weird OS. It seems that RPM assumes '/' throughout.
+		name := path.Join(dirname[dirindex[j]], basename[j])
+		if filePatterns.MatchString(name) {
+			// Keep matches only, recorded as relative paths, as that's what
+			// we use everywhere else.
+			i.Filenames = append(i.Filenames, name[1:])
 		}
 	}
 	return nil
 }
 
+// FilePatterns is a regular expression for *any* file that may need to be
+// recorded alongside a package. It is compiled once, in init.
+//
+// The tested strings are absolute paths.
+var filePatterns *regexp.Regexp
+
+func init() {
+	// TODO(hank) The blanket binary pattern is too broad and can miss things.
+	// Long-term, we should add pattern matching akin to [yara] or file(1) as a
+	// plugin mechanism that all indexers can use. That way, the Go indexer
+	// could register a pattern and use a shared filter over the
+	// [fs.WalkDirFunc] while this package (and dpkg, etc) can tell that another
+	// indexer will find those files relevant.
+	//
+	// [yara]: https://github.com/VirusTotal/yara
+	pat := []string{
+		`^.*/[^/]+\.jar$`, // Jar files
+		`^.*/site-packages/[^/]+\.egg-info/PKG-INFO$`, // Python packages
+		`^.*/package\.json$`,                          // npm packages
+		`^.*/[^/]+\.gemspec$`,                         // ruby gems
+		`^/usr/bin/[^/]+$`,                            // any executable
+	}
+	filePatterns = regexp.MustCompile(strings.Join(pat, `|`)) // must be stored: Load dereferences it
+}
+
 var wantTags = map[rpm.Tag]struct{}{
 	rpm.TagArch:              {},
+	rpm.TagBasenames:         {},
+	rpm.TagDirindexes:        {},
+	rpm.TagDirnames:          {},
 	rpm.TagEpoch:             {},
+	rpm.TagFilenames:         {},
 	rpm.TagModularityLabel:   {},
 	rpm.TagName:              {},
 	rpm.TagPayloadDigest:     {},