Skip to content

Commit

Permalink
chore: compile regexs only once
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Aug 23, 2024
1 parent ed0f33b commit 25c8f40
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 13 deletions.
10 changes: 5 additions & 5 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/utils"
)

// backgroundImageRegex captures the text between a "(" and the first ")" that
// follows it on the same line, leaving out one optional single or double quote
// on either side. The pattern is not tied to any CSS property or to "url": any
// parenthesised text matches. extractAssets applies it to the value of each
// element's inline style attribute.
var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
// urlRegex captures the text between "url(" and the first ")" that follows it
// on the same line; surrounding quotes, if any, stay inside the capture.
// extractAssets applies it to the text of <style> elements.
// NOTE(review): the (?m) flag only changes how ^ and $ match, and the pattern
// uses neither, so it appears to have no effect here.
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)
Expand Down Expand Up @@ -45,8 +48,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("*").Each(func(index int, item *goquery.Selection) {
style, exists := item.Attr("style")
if exists {
re := regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
matches := re.FindAllStringSubmatch(style, -1)
matches := backgroundImageRegex.FindAllStringSubmatch(style, -1)

for match := range matches {
if len(matches[match]) > 0 {
Expand Down Expand Up @@ -110,9 +112,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu

if !utils.StringInSlice("style", c.DisabledHTMLTags) {
doc.Find("style").Each(func(index int, item *goquery.Selection) {
re := regexp.MustCompile(`(?m)url\((.*?)\)`)
matches := re.FindAllStringSubmatch(item.Text(), -1)

matches := urlRegex.FindAllStringSubmatch(item.Text(), -1)
for match := range matches {
matchReplacement := matches[match][1]
matchReplacement = strings.Replace(matchReplacement, "'", "", -1)
Expand Down
11 changes: 3 additions & 8 deletions internal/pkg/crawl/sitespecific/truthsocial/truthsocial.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,10 @@ import (
"strings"
)

func IsTruthSocialURL(URL string) bool {
regexPattern := `https?://truthsocial\.com/@[A-Za-z0-9_]+/posts/\d+`
// truthSocialPostURLRegex matches an http or https truthsocial.com post URL of
// the form /@<handle>/posts/<digits>, where the handle is one or more ASCII
// letters, digits or underscores. The pattern is unanchored, so it matches
// anywhere inside a longer string.
var truthSocialPostURLRegex = regexp.MustCompile(`https?://truthsocial\.com/@[A-Za-z0-9_]+/posts/\d+`)

match, err := regexp.MatchString(regexPattern, URL)
if err != nil {
return false
}

return match
// IsTruthSocialURL reports whether URL contains a match for
// truthSocialPostURLRegex. It is a substring match, not a full-string match:
// text before or after the matching portion does not prevent a true result.
func IsTruthSocialURL(URL string) bool {
return truthSocialPostURLRegex.MatchString(URL)
}

func extractPostID(URL string) (string, error) {
Expand Down

0 comments on commit 25c8f40

Please sign in to comment.