From 25c8f40a09ad1b35eab2b702ed87a48c2a9cb8fe Mon Sep 17 00:00:00 2001 From: yzqzss Date: Sat, 24 Aug 2024 03:38:32 +0800 Subject: [PATCH] chore: compile regexes only once --- internal/pkg/crawl/assets.go | 10 +++++----- .../pkg/crawl/sitespecific/truthsocial/truthsocial.go | 11 +++-------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index f2b7e0ad..fed41119 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -12,6 +12,9 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/utils" ) +var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) +var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) + func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string var URL = utils.URLToString(item.URL) @@ -45,8 +48,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu doc.Find("*").Each(func(index int, item *goquery.Selection) { style, exists := item.Attr("style") if exists { - re := regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) - matches := re.FindAllStringSubmatch(style, -1) + matches := backgroundImageRegex.FindAllStringSubmatch(style, -1) for match := range matches { if len(matches[match]) > 0 { @@ -110,9 +112,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu if !utils.StringInSlice("style", c.DisabledHTMLTags) { doc.Find("style").Each(func(index int, item *goquery.Selection) { - re := regexp.MustCompile(`(?m)url\((.*?)\)`) - matches := re.FindAllStringSubmatch(item.Text(), -1) - + matches := urlRegex.FindAllStringSubmatch(item.Text(), -1) for match := range matches { matchReplacement := matches[match][1] matchReplacement = strings.Replace(matchReplacement, "'", "", -1) diff --git a/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go 
b/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go index f17e5be7..7cc04bfa 100644 --- a/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go +++ b/internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go @@ -7,15 +7,10 @@ import ( "strings" ) -func IsTruthSocialURL(URL string) bool { - regexPattern := `https?://truthsocial\.com/@[A-Za-z0-9_]+/posts/\d+` +var truthSocialPostURLRegex = regexp.MustCompile(`https?://truthsocial\.com/@[A-Za-z0-9_]+/posts/\d+`) - match, err := regexp.MatchString(regexPattern, URL) - if err != nil { - return false - } - - return match +func IsTruthSocialURL(URL string) bool { + return truthSocialPostURLRegex.MatchString(URL) } func extractPostID(URL string) (string, error) {