# my-web-crawler.py

from argparse import ArgumentParser as ap
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser as rbp
import matplotlib.pyplot as plt

# Shared crawl state: visited URLs, per-file-type counts and link lists, and pie-chart data
visited_links = set()
file_counts = {}
files = {}
pie_value = []
pie_label = []


def extract_last_word(url):
    # For getting the URL of the sitemap from robots.txt
    # Retrieve the raw content of the page
    response = requests.get(url)
    content = response.text
    # Parse the content
    soup = BeautifulSoup(content, 'html.parser')
    # Extract the text content
    text = soup.get_text()
    # Split the text into words
    words = text.split()
    # Retrieve the last word
    last_word = words[-1] if words else None
    return last_word


def extract_locs_from_sitemap(url):
    # For getting URLs from the sitemap: collect every <loc> entry
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'xml')
    locs = soup.find_all('loc')
    loc_urls = [loc.text for loc in locs]
    return loc_urls


def check_robots(domain):
    # Download robots.txt for the domain and parse it into a RobotFileParser
    robots_txt_url = "http://" + domain + "/robots.txt"
    response = requests.get(robots_txt_url)
    rp = rbp()
    rp.parse(response.text.splitlines())
    return rp


def crawl(url, threshold, output_file, robots):
    # Main crawler
    # Get the domain of the starting URL
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    robots_txt = check_robots(domain) if robots else None

    def extract_file_type(link):
        # Extract the file extension from the link
        parsed_url = urlparse(link)
        path = parsed_url.path
        file_type = path.split("/")[-1].split(".")[-1].lower() if "." in path else None
        return file_type

    def process_link(link, depth):
        # Don't revisit links or go deeper than the threshold
        if link in visited_links or depth > threshold:
            return
        visited_links.add(link)
        response = requests.get(link, allow_redirects=True)
        # Dead ends
        if response.status_code != 200:
            return
        # Find all link-bearing tags
        soup = BeautifulSoup(response.text, "html.parser")
        tags = soup.find_all(["a", "link", "script", "img"])
        # Follow both href and src attributes
        for tag in tags:
            href = tag.get("href")
            src = tag.get("src")
            if href:
                href = urljoin(url, href)
                parsed_href = urlparse(href)
                newdom = parsed_href.netloc
                # Increment the count for the corresponding file type
                file_type = extract_file_type(href)
                if file_type:
                    file_counts[file_type] = file_counts.get(file_type, 0) + 1
                    files.setdefault(file_type, []).append(href)
                # If the robots flag is on, only follow links allowed by robots.txt
                if newdom == domain and (not robots_txt or robots_txt.can_fetch("*", href)):
                    # Go deeper
                    process_link(href, depth + 1)
            if src:
                # Similar to the href handling, but for src attributes
                src = urljoin(url, src)
                parsed_src = urlparse(src)
                newdom = parsed_src.netloc
                file_type = extract_file_type(src)
                if file_type:
                    file_counts[file_type] = file_counts.get(file_type, 0) + 1
                    files.setdefault(file_type, []).append(src)
                if newdom == domain and (not robots_txt or robots_txt.can_fetch("*", src)):
                    process_link(src, depth + 1)

    process_link(url, 1)

    if not output_file:
        # Print the report to the terminal
        if robots:
            print("Checking robots.txt")
            response = requests.get("https://" + domain + "/robots.txt")
            if response.status_code == 200:
                print("robots.txt file found on", url)
                print("Checking sitemap")
                smap = extract_last_word("https://" + domain + "/robots.txt")
                # Guard: the last word of robots.txt may not be a sitemap URL
                response = requests.get(smap) if smap and smap.startswith("http") else None
                if response is not None and response.status_code == 200:
                    maplist = extract_locs_from_sitemap(smap)
                    print("Sitemap :")
                    for word in maplist:
                        print(word)
                else:
                    print("No sitemap found\n")
            else:
                print("No robots.txt file found on", url)
        else:
            print("Not checking for robots.txt")
        print(f"At recursion level {threshold}")
        print("Total files found:", sum(file_counts.values()))
        for file_type, links in files.items():
            pie_label.append(file_type)
            pie_value.append(len(links))
            print(file_type.capitalize() + ":" + str(len(links)))
            for link in links:
                print(link)
        print("Summary :")
        for i in range(len(pie_label)):
            print(pie_label[i], " : ", pie_value[i])
    else:
        # Write the same report to the output file
        with open(output_file, "w") as f:
            if robots:
                f.write("Checking robots.txt \n")
                response = requests.get("https://" + domain + "/robots.txt")
                if response.status_code == 200:
                    f.write("robots.txt file found on " + url + "\n")
                    f.write("Checking sitemap\n")
                    smap = extract_last_word("https://" + domain + "/robots.txt")
                    # Same guard as in the terminal branch
                    response = requests.get(smap) if smap and smap.startswith("http") else None
                    if response is not None and response.status_code == 200:
                        maplist = extract_locs_from_sitemap(smap)
                        f.write("Sitemap : " + str(len(maplist)) + "\n")
                        for word in maplist:
                            f.write(word + "\n")
                    else:
                        f.write("No sitemap found\n")
                else:
                    f.write("No robots.txt file found on " + url + "\n")
            else:
                f.write("Not checking for robots.txt \n")
            f.write(f"At recursion level {threshold}\n")
            f.write("Total files found:" + str(sum(file_counts.values())) + "\n")
            for file_type, links in files.items():
                pie_label.append(file_type)
                pie_value.append(len(links))
                f.write(file_type.capitalize() + ":" + str(len(links)) + "\n")
                for link in links:
                    f.write(link + "\n")
            f.write("Summary :\n")
            for i in range(len(pie_label)):
                f.write(pie_label[i] + " : " + str(pie_value[i]) + "\n")

    # Save a pie chart of the file-type distribution (populated by either output branch)
    if pie_value:
        plt.pie(pie_value, labels=pie_label)
        plt.savefig('pie.png')
        plt.close()


if __name__ == "__main__":
    # Get arguments from the CLI
    parser = ap()
    parser.add_argument("-u", "--url", type=str, required=True)
    parser.add_argument("-t", "--threshold", type=int, default=float('inf'))
    parser.add_argument("-o", "--output", type=str)
    parser.add_argument("-r", "--robot", action="store_true")
    args = parser.parse_args()
    url = args.url
    threshold = args.threshold
    output = args.output
    robots = args.robot
    if not output:
        # Print the ASCII-art banner when writing to the terminal
        print(r'''
____ ____ __
\_______/ |_ _| |_ _| [ |
`.,-'\_____/`-.,' \ \ /\ / /.---. | |.--.
/`..'\ _ /`.,'\ \ \/ \/ // /__\\ | '/'`\ \
/ /`.,' `.,'\ \ \ /\ / | \__., | \__/ |
/__/__/ \__\__\__ ______ \/ \/ '.__.'[__;.__.' __
\ \ \ / / / .' ___ | [ |
\ \,'`._,'`./ / / .' \_| _ .--. ,--. _ _ __ | | .---. _ .--.
\,'`./___\,'`./ | | [ `/'`\]`'_\ :[ \ [ \ [ ]| |/ /__\\[ `/'`\]
,'`-./_____\,-'`. \ `.___.'\ | | // | |,\ \/\ \/ / | || \__., | |
/ \ `.____ .'[___] \'-;__/ \__/\__/ [___]'.__.'[___]
''')
        crawl(url, threshold, None, robots)
    else:
        crawl(url, threshold, output, robots)
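
# Example invocation (illustrative only; "example.com" is a placeholder domain, not from the original):
#   python my-web-crawler.py -u https://example.com -t 2 -r -o report.txt
# This crawls example.com two levels deep, honours robots.txt (-r), writes the
# report to report.txt, and saves the file-type pie chart to pie.png.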