-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: spider.sh
executable file
·233 lines (219 loc) · 6.36 KB
/
spider.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/bin/bash
# spider.sh - tiny web spider: starts from a given address (or the queued /
# a randomly generated one), downloads pages and collects the https urls
# they reference under $data_path.
crash_include logger.sh # presumably provides log/dbg/wrn/err/suc - TODO confirm
data_path=data # root of the on-disk state; made absolute further down
delete_downloads=0 # set to 1 to delete every downloaded file after parsing
if [ "$1" == "--help" ] || [ "$1" == "-h" ]
then
	echo "usage: $0 [ip]"
	echo "description: collects urls and stores them"
	echo " either starting from provided address or randomly generated one"
	exit 0
fi
function rand_byte() {
	# Print a random value in 0..255 without a trailing newline.
	# Was 'RANDOM % 255', which can never produce 255 (off-by-one for a
	# byte); printf also avoids the non-portable 'echo -n'.
	printf '%d' "$((RANDOM % 256))"
}
function get_random_ip() {
	# Print a random dotted-quad address built from four random bytes.
	local oct1 oct2 oct3 oct4
	oct1=$(rand_byte)
	oct2=$(rand_byte)
	oct3=$(rand_byte)
	oct4=$(rand_byte)
	echo "${oct1}.${oct2}.${oct3}.${oct4}"
}
function clean_and_sort_data() {
	# Deduplicate both url lists, then remove every already-known address
	# from ips.txt so it only holds addresses still to be visited.
	# Globals: data_path (read).
	# Tolerate a fresh data dir: without the files, comm would otherwise
	# error out and truncate the queue to nothing.
	touch "$data_path/ips.txt" "$data_path/known_ips.txt"
	# sort -u replaces the old 'sort | uniq' pipelines; -o may safely
	# write back to the input file.
	sort -u "$data_path/ips.txt" -o "$data_path/ips.txt"
	sort -u "$data_path/known_ips.txt" -o "$data_path/known_ips.txt"
	# comm -23 keeps lines unique to the first file; both inputs are
	# sorted, as comm(1) requires.
	comm -23 "$data_path/ips.txt" "$data_path/known_ips.txt" > "$data_path/tmp/ips.txt"
	mv "$data_path/tmp/ips.txt" "$data_path/ips.txt"
}
function get_next_ip() {
	# Pop the last not-yet-visited address off ips.txt and print it.
	# Callers invoke this inside $(...), so the 'exit 1' below only leaves
	# that subshell; they must treat an empty result as "queue empty".
	if [ ! -f "$data_path/ips.txt" ]
	then
		exit 1
	fi
	clean_and_sort_data
	tail -n1 "$data_path/ips.txt"
	# drop the line we just printed (head -n -1 is GNU coreutils only)
	head -n -1 "$data_path/ips.txt" > "$data_path/tmp/ips.txt"
	sort -u "$data_path/tmp/ips.txt" > "$data_path/ips.txt"
}
function parse_line() {
	# Extract the first quoted https:// url from $1, queue it in ips.txt if
	# it is not already known, then recurse on the rest of the line.
	# Globals: data_path (read), ips.txt / known_ips.txt (written/read).
	# TODO: support http not only https
	# TODO: prefer quoted url to get the whole url not only base
	local line=$1
	# log "line: $line"
	if [[ "$line" =~ (\"https://[a-zA-Z0-9\./%-]{3,256}\")(.*)+ ]]
	then
		local m="${BASH_REMATCH[1]}"
		# save the remainder before recursion clobbers BASH_REMATCH
		local rest="${BASH_REMATCH[2]}"
		m=${m:1:-1} # chop off quotes
		# dbg "match: $m"
		# bug fix: the old code tested $addr (a global leaked from
		# download_site) instead of the matched url $m; -F -- makes the
		# lookup a literal match, not a regex.
		if [ -f "$data_path/known_ips.txt" ] && grep -qF -- "$m" "$data_path/known_ips.txt"
		then
			dbg "ignoring known address '$m'"
		else
			if [ ! -d "$data_path" ]
			then
				err "data path does not exist '$data_path'"
				exit 1
			fi
			echo "$m" >> "$data_path/ips.txt"
		fi
		if [ "$rest" != "" ]
		then
			parse_line "$rest"
		fi
	fi
}
function parse_file() {
	# Run every line of $1 that mentions https:// through parse_line.
	local src=$1
	local hit
	while read -r hit; do
		parse_line "$hit"
	done < <(grep "https://" "$src")
}
function download_site() {
	# Download one page with wget into a mirror directory below $data_web,
	# then feed the saved file to parse_file to harvest further urls.
	# Globals: data_web (read), delete_downloads (read); dbg/wrn/err come
	# from the included logger.
	local addr=$1
	local addr_path=""
	local addr_file=""
	local dir
	local wget_out wget_code filename
	dir=$(pwd)
	if [ "$#" != "1" ]
	then
		err "download_site() failed:"
		err "invalid number of arguments"
		exit 1
	fi
	dbg "downloading site addr='$addr'"
	addr="${addr#https://}"
	addr="${addr#http://}"
	# strip trailing slashes; the old ${addr%%+(/)} is an extglob pattern
	# and was a silent no-op without 'shopt -s extglob'
	while [[ "$addr" == */ ]]
	do
		addr="${addr%/}"
	done
	addr="$(echo "$addr" | sed 's/^\/*//g')" # strip leading slashes (https://///foo.bar)
	if [[ $addr =~ .*/$ ]] # defensive; should be unreachable now
	then
		wrn "stripping trailing slash failed '$addr'"
		addr="${addr::-1}"
		if [[ $addr =~ .*/$ ]]
		then
			err "download_site() failed:"
			err "could not manually fix the trailing slash."
			err "addr='$addr'"
			exit 1
		fi
	fi
	dbg "trailing slash '$addr'"
	addr_file="${addr##*/}" # last component after the final slash
	addr_path="$addr"
	if [[ ! "$addr" =~ / ]]
	then
		wrn "address does not include a slash"
	else
		if [[ $addr_file =~ \. ]]
		then
			dbg "detected a file link '$addr_file'"
			addr_path="${addr%/*}" # mirror only the directory part on disk
			dbg "strip file path until last slash '$addr_path'"
		fi
	fi
	dbg "addr='$addr' path='$addr_path' file='$addr_file'"
	dbg "dir=$dir"
	mkdir -p "$data_web/$addr_path" || exit 1
	cd "$data_web/$addr_path" || exit 1
	dbg "pwd=$(pwd)"
	# capture wget's log; it is parsed below for the saved filename
	wget_out="$(wget -o - --tries=1 --timeout=10 "$addr" 2>&1)"
	wget_code="$?"
	if [ "$wget_code" != "0" ]
	then
		# only code 1, 2 and 3 are problematic
		# all the other error codes can be caused by invalid urls
		if [ "$wget_code" -gt 3 ]
		then
			wrn "wget exited with the error code $wget_code"
			cd "$dir" || exit 1
			return
		else
			err "download_site() failed:"
			err "wget failed with error code $wget_code"
			err "wget output:"
			echo "$wget_out"
			exit 1
		fi
	fi
	if [ "$wget_out" == "" ]
	then
		err "download_site() failed:"
		err "wget had no output"
		exit 1
	fi
	# last log line is expected to look like:
	#   2020-01-01 12:00:00 (1 MB/s) - 'file' saved [123]
	filename="$(echo "$wget_out" | tail -n1)"
	local pattern='^[0-9]{4}-[0-9]{2}-[0-9]{2}[[:space:]][0-9]{2}:[0-9]{2}:[0-9]{2}[[:space:]]\(.*\)[[:space:]]-[[:space:]](.*)[[:space:]]saved[[:space:]]\[.*\]$'
	if [[ $filename =~ $pattern ]]
	then
		filename="${BASH_REMATCH[1]}"
	else
		err "download_site() failed:"
		err "pattern did not match filename='$filename'"
		exit 1
	fi
	if [ "$filename" == "" ] || [ "${#filename}" -lt 3 ]
	then
		err "download_site() failed:"
		err "invalid filename='$filename' wget output:"
		echo "$wget_out"
		exit 1
	fi
	filename="${filename:1:-1}" # chop the quotes around the reported name
	dbg "downloaded file='$filename'"
	parse_file "$filename"
	if [ "$delete_downloads" == "1" ]
	then
		wrn "deleting file '$filename' ..."
		rm "$filename"
	fi
	cd "$dir" || exit 1
}
function scrape_ip() {
	# Crawl starting at $1: download each unknown address, record it as
	# known, then continue with the next queued address until the queue is
	# empty. Iterative instead of the old self-recursion, so a long crawl
	# cannot exhaust the call stack (bash does not optimize tail calls).
	# Globals: data_path (read), known_ips.txt (written). Never returns:
	# exits 1 when the queue runs dry (or via download_site on hard errors).
	local addr=$1
	while [ "$addr" != "" ]
	do
		log "scraping address '$addr'"
		if [ -f "$data_path/known_ips.txt" ] && grep -qF -- "$addr" "$data_path/known_ips.txt"
		then
			log "skipping known address '$addr'"
		else
			download_site "$addr"
		fi
		# TODO: use a bash hash for known ips and increase number
		# bug fix: the old code appended the unset global $ip here
		# instead of the address that was actually handled
		echo "$addr" >> "$data_path/known_ips.txt"
		sort -u "$data_path/known_ips.txt" -o "$data_path/known_ips.txt"
		addr=$(get_next_ip)
	done
	err "ip list is empty"
	exit 1
}
# create data path and make it absolute (quoted so a path with spaces works)
mkdir -p "$data_path/tmp" || exit 1
mkdir -p "$data_path/web" || exit 1
if [[ $data_path =~ ^/.* ]]
then
	suc "using absolute data path '$data_path'"
else
	wrn "data path is relative '$data_path'"
	data_path="$(pwd)/$data_path"
	log "using absolute data path '$data_path'"
fi
data_tmp="$data_path/tmp" # NOTE(review): unused here - possibly read by logger.sh, confirm
data_web="$data_path/web"
# start from the given address, else the queued one, else a random address
if [ $# -gt 0 ]
then
	scrape_ip "$1"
else
	ip=$(get_next_ip)
	if [ "$ip" == "" ]
	then
		scrape_ip "$(get_random_ip)"
	else
		scrape_ip "$ip"
	fi
fi