#!/usr/bin/env python
# -*- coding: utf-8 -*-
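"""Multi-threaded Baidu search spider.

Submits queries to Baidu web search through rotating proxies, collects the
result URLs, and fetches full article text with newspaper.

Relies on a local companion module ``proxy_ip_pool`` that is expected to
provide ``get_new_headers()`` (fresh request headers) and ``ProxyIpPool``
(a proxy pool with ``load``/``pop``/``push``/``dump``).
"""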
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
from newspaper import Article, Config

from proxy_ip_pool import get_new_headers, ProxyIpPool

DEBUG = 0
BAIDU_HOST_URL = "http://www.baidu.com"
BAIDU_SEARCH_URL = "http://www.baidu.com/s?ie=utf-8&tn=baidu&wd="
PROXY_POOL = ProxyIpPool()


def multi_thread_search(querys, num_results=10):
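    """Run single_thread_search for each query concurrently and collect the results."""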
    if DEBUG:
        print('multi search start: debug output may interleave across threads')
    res = [None] * len(querys)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i, query in enumerate(querys):
            res[i] = executor.submit(single_thread_search, query, num_results)
    # The with-block waits for all futures, so every submitted task is done here.
    return [item.result() for item in res if item.done()]


def single_thread_search(query, num_results=10):
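    """Search one query and return a dict with the query and its result URLs."""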
    if DEBUG:
        print('single search starts: query is', query)
    s = search(query, num_results)
    ret = {'query': query, 'urls': s}
    if DEBUG:
        print('----single search ends:', query)
    return ret


def search(query, num_results=10, retry=2):
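    """Search Baidu through a proxy, retrying with a fresh proxy on failure.

    Returns a list of result URLs, or None if every attempt failed.
    """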
    urls = None
    for _ in range(retry):
        session = requests.Session()
        # headers must not be None; get_new_headers() supplies fresh ones
        session.headers = get_new_headers()
        proxy_ip = PROXY_POOL.pop()
        session.proxies = {'http': proxy_ip}
        with session:
            try:
                urls = baidu_search(session, query, num_results)
            except Exception:
                # This proxy failed; try again with a new one.
                continue
        if urls:
            # The proxy worked, so return it to the pool for reuse.
            PROXY_POOL.push(proxy_ip)
            break
    return urls if urls else None


def baidu_search(session, query, num_results):
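    """Page through Baidu search results until num_results URLs are collected.

    URLs are taken from the href of each result's <h3><a> element; the
    "next page" link (an <a class="n"> element) drives pagination.
    """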
    total_urls = []
    next_url = BAIDU_SEARCH_URL + query
    while len(total_urls) < num_results and next_url:
        html = session.get(next_url, timeout=1)
        html.encoding = 'utf-8'
        tree = etree.HTML(html.text)
        urls = []
        for sel in tree.xpath('//h3/a'):
            # title = sel.xpath('text()')[0]
            href = sel.xpath('@href')[0]
            # href = session.get(href).url  # would resolve Baidu's redirect link
            urls.append(href)
        if urls:
            total_urls += urls
        try:
            # Follow the "next page" link; stop when there is none.
            next_url = BAIDU_HOST_URL + tree.xpath('//a[@class="n"]/@href')[0]
        except Exception:
            next_url = None
    return total_urls


def get_article(url):
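    """Download and parse a page with newspaper; return its text, or None on failure."""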
    config = Config()
    config.request_timeout = 1.3
    article = Article(url, language='zh', config=config)
    try:
        article.download()
        article.parse()
        # article.nlp()
    except Exception:
        return None
    return article.text


def single_thread_get_article(url):
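    """Fetch one URL and pair it with its extracted content in a dict."""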
    a = get_article(url)
    news = {"url": url, "content": a}
    return news


def get_articles(urls):
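    """Fetch the given URLs concurrently; return a list of {"url", "content"} dicts."""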
    res = [None] * len(urls)
    with ThreadPoolExecutor(max_workers=100) as executor:
        for i, url in enumerate(urls):
            res[i] = executor.submit(single_thread_get_article, url)
    # All futures have completed once the executor context exits.
    return [item.result() for item in res if item.done()]


if __name__ == '__main__':
    DEBUG = 1
    PROXY_POOL = ProxyIpPool(DEBUG)
    PROXY_POOL.load()
    _querys = ['1096', '114514', '9876543210.33', '1000-7']
    _ret = multi_thread_search(_querys)
    # _ret = single_thread_search('zdasdq', num_results=10)
    PROXY_POOL.dump()
    # for _ in _ret:
    #     print(_)
    # print(_ret)
    aaa = get_articles(_ret[1]['urls'])
    print(aaa)