feat: Printing the url list to txt file with output-file (#11)
* feat: Printing the URL list to a txt file with `output-file`
upd: The URL list of all requests is saved to the `./output/output.txt` file.
feat: Print query results as an array with `output-json`
upg: Dependencies upgraded.
upd: Readme updated with the new flags.

* fix: Fixes for PR.

* fix: Output file fixes.

* fix: Disagreements were resolved.

* upd: Readme & help info updated.

* fix: The program no longer stops when error messages are encountered.
upd: JSON MarshalIndent output updated to use `\t` indentation.

* fix: Changes completed.

* upd: Readme markdown lint issues fixed.
upd: go.sum regenerated.

* fix print lock related bug, remove redundant comments

---------

Co-authored-by: Ramazan Sancar <[email protected]>
atomicptr and ramazansancar authored May 26, 2024
1 parent a7ff556 commit 5c6523f
Showing 8 changed files with 182 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@
/main*
/dist
/snap.login
/output/*
*.json

# Created by https://www.gitignore.io/api/go,linux,macos,windows,intellij+all,visualstudiocode
# Edit at https://www.gitignore.io/?templates=go,linux,macos,windows,intellij+all,visualstudiocode
32 changes: 31 additions & 1 deletion README.md
@@ -1,11 +1,13 @@
# crab

[![.github/workflows/build.yml](https://github.com/atomicptr/crab/actions/workflows/build.yml/badge.svg)](https://github.com/atomicptr/crab/actions/workflows/build.yml)
[![Go Report Card](https://goreportcard.com/badge/github.com/atomicptr/crab)](https://goreportcard.com/report/github.com/atomicptr/crab)
[![Coverage Status](https://coveralls.io/repos/github/atomicptr/crab/badge.svg?branch=master)](https://coveralls.io/github/atomicptr/crab?branch=master)

A versatile tool to crawl dozens of URLs from a given source, such as a sitemap or a URL list.

Useful for:

* Warming site caches
* Checking response times
* Identifying dead or broken pages
@@ -16,6 +18,16 @@ Useful for:

[You can download the newest release from here for Linux (including .deb and .rpm), macOS and Windows.](https://github.com/atomicptr/crab/releases/)

### Build Command (Personal & Development Usage)

```bash
# Linux (Debian/Ubuntu) & macOS
$ go build -o crab cmd/crab/main.go

# Windows
$ go build -o crab.exe cmd/crab/main.go
```
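
If the build succeeds, a quick way to sanity-check the binary is to print its help text (Cobra generates `--help` automatically). This is just a smoke test, not a documented step:

```bash
# The help output should list the crawl commands and the new output flags
$ ./crab --help
```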

### Docker

[Docker Hub](https://hub.docker.com/r/atomicptr/crab)
@@ -85,6 +97,24 @@ $ crab crawl:sitemap https://domain.com/sitemap.xml --filter-status=200,404
$ crab crawl:sitemap https://domain.com/sitemap.xml --filter-status=>500
```

### Save URL List to a File

You can save the URL list to a file:

```bash
# This will save the output to a file called output.txt
$ crab crawl:sitemap https://domain.com/sitemap.xml --output-file ./output/output.txt
```
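
Judging from `writeLineToFile` in this commit (see `pkg/crawler/log.go` further down), each line in the text file is tab-separated: status code (or error message), URL, timestamp, and duration in milliseconds. The line below is purely illustrative; the values are made up:

```text
200	https://domain.com/some-page	1716712345	132ms
```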

### Save Output to JSON

You can save the output to a JSON file:

```bash
# This will save the output to a file called output.json
$ crab crawl:sitemap https://domain.com/sitemap.xml --output-json ./output/output.json
```
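
Judging from `writeLineToJsonFile` in this commit, the JSON file holds a single array of result objects that is re-marshaled with tab indentation on every append. The field names and casing below are assumptions based on the crawler's log struct (Status, Url, Time, Duration, Err), so treat this entry as an illustration only:

```json
[
	{
		"status": 200,
		"url": "https://domain.com/some-page",
		"time": 1716712345,
		"duration": 132
	}
]
```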

## License

MIT
[MIT](./LICENSE)
6 changes: 3 additions & 3 deletions go.mod
@@ -3,15 +3,15 @@ module github.com/atomicptr/crab
go 1.19

require (
github.com/beevik/etree v1.1.0
github.com/beevik/etree v1.3.0
github.com/pkg/errors v0.9.1
github.com/spf13/cobra v1.6.1
github.com/spf13/cobra v1.8.0
github.com/stretchr/testify v1.8.1
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/inconshreveable/mousetrap v1.0.1 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
14 changes: 7 additions & 7 deletions go.sum
@@ -1,18 +1,18 @@
github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs=
github.com/beevik/etree v1.1.0/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/beevik/etree v1.3.0 h1:hQTc+pylzIKDb23yYprodCWWTt+ojFfUZyzU09a/hmU=
github.com/beevik/etree v1.3.0/go.mod h1:aiPf89g/1k3AShMVAzriilpcE4R/Vuor90y83zVZWFc=
github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc=
github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA=
github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY=
github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0=
github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
16 changes: 16 additions & 0 deletions pkg/cli/crawl/commons.go
@@ -68,6 +68,20 @@ func registerStandardCrawlCommandFlags(cmd *cobra.Command, flagOptions *crawlerF
"",
"filter logs by status",
)
cmd.PersistentFlags().StringVarP(
&flagOptions.OutputFile,
"output-file",
"",
"",
"set output file for results in text format (example: \"./path/to/output.txt\")",
)
cmd.PersistentFlags().StringVarP(
&flagOptions.OutputJson,
"output-json",
"",
"",
"set output file for results in json format (example: \"./path/to/output.json\")",
)
}

func registerStandardCrawlCommandFlagModifiers(modifier *crawler.RequestModifier, flagOptions crawlerFlagOptions) {
@@ -110,6 +124,8 @@ func crawlUrls(urls []string, modifier crawler.RequestModifier, flagOptions craw
},
FilterStatusQuery: flagOptions.FilterStatusQuery,
OutWriter: outWriter,
OutputFile: flagOptions.OutputFile,
OutputJson: flagOptions.OutputJson,
}
crawl.Crawl(requests)

2 changes: 2 additions & 0 deletions pkg/cli/crawl/flags.go
@@ -18,6 +18,8 @@ type crawlerFlagOptions struct {
FilterStatusQuery string
cookieMap map[string]string
headerMap map[string]string
OutputFile string
OutputJson string
}

const (
6 changes: 4 additions & 2 deletions pkg/crawler/crawler.go
@@ -10,17 +10,19 @@ import (
"github.com/pkg/errors"
)

//Crawler crawls urls in parallel
// Crawler crawls urls in parallel
type Crawler struct {
HttpClient http.Client
NumberOfWorkers int
FilterStatusQuery string
OutWriter io.Writer
statusFilter *filter.Filter
printMutex sync.Mutex
OutputFile string
OutputJson string
}

//Crawl crawls a list of HTTP requests with a set number of workers
// Crawl crawls a list of HTTP requests with a set number of workers
func (c *Crawler) Crawl(requests []*http.Request) {
requestNum := len(requests)

118 changes: 117 additions & 1 deletion pkg/crawler/log.go
@@ -4,6 +4,9 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"time"

"github.com/atomicptr/crab/pkg/filter"
@@ -17,8 +20,26 @@ func (c *Crawler) safePrintln(statusCode int, message string) {

if c.statusFilter.IsValid(c.FilterStatusQuery, int64(statusCode)) {
c.printMutex.Lock()
defer c.printMutex.Unlock()

logsToFile := false

if c.OutputJson != "" {
c.writeLineToJsonFile(message, c.OutputJson)
logsToFile = true
}

if c.OutputFile != "" {
c.writeLineToFile(message, c.OutputFile)
logsToFile = true
}

// don't log to stdout if we log to a file
if logsToFile {
return
}

_, _ = fmt.Fprintln(c.OutWriter, message)
c.printMutex.Unlock()
}
}

@@ -56,3 +77,98 @@ func escapeString(str string) string {
}
return string(b)
}

// assureFileExists checks if the file path exists and creates the file otherwise
func (c *Crawler) assureFileExists(filePath string) error {
err := os.MkdirAll(filepath.Dir(filePath), 0755)
if err != nil {
return err
}

if _, err := os.Stat(filePath); os.IsNotExist(err) {
_, err := os.Create(filePath)
if err != nil {
return err
}
}

return nil
}

// writeLineToFile writes the message to a text file
func (c *Crawler) writeLineToFile(message, filePath string) {
err := c.assureFileExists(filePath)
if err != nil {
log.Fatal(err)
}

file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatal(err)
}
defer file.Close()

data := struct {
Err string
Status int
Url string
Time int
Duration int
}{}
if err := json.Unmarshal([]byte(message), &data); err != nil {
log.Fatal(err)
}

if data.Err != "" {
_, err = file.WriteString(fmt.Sprintf("%s\t%s\t%d\t%dms", data.Err, data.Url, data.Time, data.Duration) + "\n")
if err != nil {
log.Fatal(err)
}

return
}

_, err = file.WriteString(fmt.Sprintf("%d\t%s\t%d\t%dms", data.Status, data.Url, data.Time, data.Duration) + "\n")
if err != nil {
log.Fatal(err)
}
}

// writeLineToJsonFile writes the message to a JSON file
func (c *Crawler) writeLineToJsonFile(message, filePath string) {
err := c.assureFileExists(filePath)
if err != nil {
log.Fatal(err)
}

file, err := os.ReadFile(filePath)
if err != nil {
log.Fatal(err)
}

if len(file) == 0 {
file = []byte("[]")
}

var temp []map[string]interface{}
if err := json.Unmarshal(file, &temp); err != nil {
log.Fatal(err)
}

var msg map[string]interface{}
if err := json.Unmarshal([]byte(message), &msg); err != nil {
log.Fatal(err)
}

temp = append(temp, msg)

jsonData, err := json.MarshalIndent(temp, "", "\t")
if err != nil {
log.Fatal(err)
}

err = os.WriteFile(filePath, jsonData, 0644)
if err != nil {
log.Fatal(err)
}
}
