crawl_v2.py
#coding:utf-8
import socket
socket.setdefaulttimeout(60)
import requests
# import cchardet
import os,time
from lxml import etree
import threading
import re
import random

# Count the existing gid_log_* entries in the working directory and use the
# next free index as this run's gid log directory.
filenames=os.listdir('.')
count=0
for fname in filenames:
    if fname.startswith('gid_log'):
        count+=1
gid_path='gid_log_%d' %(count)
# When running step 1 and step 2 separately, make sure gid_path points at the
# intended directory.
# gid_path='gid_log_12'
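
# The crawl runs in two steps (both driven from main() below):
#   step 1: for every result page, save "title<TAB>gid" pairs into
#           gid_path/<pageIndex>.txt (get_one_page_all_href / save_gids);
#   step 2: re-read those files and download the full text of every gid into
#           the date-range output directory (load_one_page_from_gid_log).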

def get_html(url): # fetch the raw HTML of a page
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Host": "www.pkulaw.cn",
        "Cookie": "bdyh_record=1970324860086081%2C1970324860087844%2C1970324860087837%2C1970324860087907%2C1970324860085114%2C1970324860087657%2C1970324860087697%2C1970324860087631%2C1970324860087701%2C1970324860087851%2C1970324860086614%2C1970324860000764%2C1970324845231811%2C1970324860004991%2C1970324860002384%2C1970324845231794%2C1970324845231624%2C1970324860002207%2C1970324860046814%2C1970324860046704%2C; CheckIPAuto=0; CheckIPDate=2016-10-15 10:03:46; gm3jc5afyl35gm2yt55kc4m1isIPlogin=1; ASP.NET_SessionId=davttbjhikxhqyn1lj5alhsb; Hm_lvt_58c470ff9657d300e66c7f33590e53a8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_58c470ff9657d300e66c7f33590e53a8=1476499578; Hm_lvt_8266968662c086f34b2a3e2ae9014bf8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_8266968662c086f34b2a3e2ae9014bf8=1476499578; CookieId=gm3jc5afyl35gm2yt55kc4m1; FWinCookie=1",
        "Upgrade-Insecure-Requests": "1",
        "Proxy-Connection": "keep-alive"
    }
    html=requests.get(url,headers=headers).text
    return html

def write2file(content,filename): # write a crawled document to a file
    try:
        f=open(filename,'w')
    except Exception,e:
        # Fall back to a safe name when the title cannot be used as a filename.
        filename=filename.split(u'、')[0]+'_error_filename.txt'
        f=open(filename,'w')
    f.write(content.encode('utf-8'))
    f.close()

# Download the full text of the document identified by gid.
def load_one_wenshu(gid,title):
    ex_href='http://www.pkulaw.cn/case/FullText/_getFulltext?library=pfnl&gid=#gid#&loginSucc=0'
    href=ex_href.replace('#gid#',gid)
    html=get_html(href)
    page=etree.HTML(html)
    content=page.xpath('body')[0].xpath('string(.)').strip()
    write2file(content,filepath+os.sep+title+'.txt')

def load_one_page_wenshu(gid_list,titles): # fetch the documents for a batch of gids
    # threads=[] # tried multithreading for speed; failed: frequent requests triggered captchas and an IP ban
    # for i in range(len(gid_list)):
    #     gid,title=gid_list[i],titles[i]
    #     threads.append(threading.Thread(target=load_one_wenshu,args=(gid,title,)))
    # for t in threads:
    #     t.start()
    #     t.join() # block until the thread finishes
    for i in range(len(gid_list)): # sequential crawl; slow, roughly 20~30h for one month of cases
        load_one_wenshu(gid_list[i],titles[i])
        # time.sleep(0.1)
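
# Note: load_one_page_wenshu is only referenced from a commented-out call in
# get_one_page_all_href(); the live step-2 path uses load_one_page_from_gid_log().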

# Save the case titles and gids of one result page to a file.
def save_gids(pageIndex,gid_list,titles):
    fpath=gid_path
    if not os.path.exists(fpath):
        os.mkdir(fpath)
    f=open(fpath+os.sep+str(pageIndex)+'.txt','w')
    for i in range(len(gid_list)):
        f.write('%s\t%s\n' %(titles[i].encode('utf-8'),gid_list[i]))
    f.close()
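
# Each line written above has the form "<title>\t<gid>" (tab separated);
# get_titles_gids() below parses exactly this format in step 2.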

# Collect all document titles and case gids from one result page and save them.
def get_one_page_all_href(href,pageIndex):
    html=get_html(href.replace('#pageIndex#',str(pageIndex)))
    # time.sleep(random.random())
    page=etree.HTML(html)
    items=page.xpath('//dl[@class="contentList"]/dd/a')
    print len(items)
    gid_list=[]
    titles=[]
    for item in items:
        ihref=item.attrib['href']
        title=item.text.strip()
        # if u'、' in title:
        #     title=title.split(u'、')[1]
        gid=re.findall(r'_(.*?)\.html',ihref)[0]
        if gid not in gid_list:
            gid_list.append(gid)
            titles.append(title)
    # print len(set(titles))
    print 'page:%d has %d different cases.' %(pageIndex,len(gid_list))
    # load_one_page_wenshu(gid_list,titles)
    save_gids(pageIndex,gid_list,titles)

# Read all titles and gids from one gid log file.
def get_titles_gids(filename):
    gid_list=[]
    titles=[]
    f=open(filename,'r')
    for line in f:
        pieces=line.strip().split('\t')
        title,gid=pieces[0],pieces[1]
        title=title.replace('?','')
        # print cchardet.detect(title)
        gid_list.append(gid)
        titles.append(title.decode('utf-8'))
    f.close()
    return gid_list,titles

def load_one_page_from_gid_log(filename): # download the documents listed in a saved gid log file
    gid_list,titles=get_titles_gids(filename) # read gid_list and titles
    f=open('href_error_log.txt','a')
    for i in range(len(gid_list)):
        try:
            load_one_wenshu(gid_list[i],titles[i])
            print '%s-%d load success..' %(filename,i+1)
        except Exception,e: # on failure, record the item in href_error_log.txt
            print '%s-%d load failed...' %(filename,i+1),e
            f.write('%s-%d:\t%s\t%s\n' %(filename,i+1,titles[i].encode('utf-8'),gid_list[i]))
            f.flush()
        time.sleep(1)
    f.close()

# Get the number of result pages within the target date range.
def getPageNum(href):
    html=get_html(href.replace('#pageIndex#','0'))
    page=etree.HTML(html)
    pageNum=page.xpath('//*[@id="toppager"]/span/span[2]')
    if pageNum: # xpath returns a list; fall back to a default when the pager element is missing
        pageNum=int(pageNum[0].xpath('string(.)').strip())
    else:
        pageNum=50
    print 'pageNum:',pageNum
    return pageNum

def main():
    # PageSize=1000&Pager.PageIndex=0
    # Start"%3A"2016.09.24"%2C"End"%3A"2016.10.13"%7D
    # href is the search URL for cases and judgment documents; page size,
    # page index and the start/end dates are filled in via the placeholders.
    href='http://www.pkulaw.cn/case/Search/Record?Menu=CASE\
&IsFullTextSearch=False&MatchType=Exact&Keywords=\
&OrderByIndex=0&GroupByIndex=0&ShowType=1\
&ClassCodeKey=#classcode#%2C%2C&OrderByIndex=0&GroupByIndex=0\
&ShowType=0&ClassCodeKey=#classcode#%2C%2C&Library=PFNL\
&FilterItems.CourtGrade=&FilterItems.TrialStep=\
&FilterItems.DocumentAttr=&FilterItems.TrialStepCount=\
&FilterItems.LastInstanceDate=%7B"Start"%3A"#start_date#"%2C"End"%3A"#end_date#"%7D\
&FilterItems.CriminalPunish=&FilterItems.SutraCase=\
&FilterItems.CaseGistMark=&FilterItems.ForeignCase=&GroupIndex=\
&GroupValue=&TitleKeywords=&FullTextKeywords=\
&Pager.PageSize=1000&Pager.PageIndex=#pageIndex#&X-Requested-With=XMLHttpRequest'
    global filepath
    # filepath='2017-12_15-1_19' # when the date changes, the href has to change accordingly
    # filepath='2017_01_01-2017_09_01'
    print 'input date info (eg:2017_01_01-2017_09_01):'
    filepath=raw_input(">").strip()
    start_date,end_date=filepath.split('-')
    start_date=start_date.replace('_','.')
    end_date=end_date.replace('_','.')
    print 'start_date:',start_date
    print 'end_date:',end_date
    # classcode='007'
    print 'input classcode:(eg:007)'
    classcode=raw_input(">").strip()
    href=href.replace('#start_date#',start_date).replace('#end_date#',end_date).replace('#classcode#',classcode)
    # filepath='tmp'
    if not os.path.exists(filepath):
        os.mkdir(filepath)
    pageNum=getPageNum(href) # number of result pages for the whole query
    print pageNum
    # Step 1: download the titles and gids, page by page.
    for i in range(pageNum): # adjust the page count here if needed
        get_one_page_all_href(href,i)
    # Multithreaded variant of step 1 (disabled):
    # t0=time.time()
    # threads=[]
    # for i in range(459):
    #     threads.append(threading.Thread(target=get_one_page_all_href,args=(href,i,)))
    # for t in threads:
    #     t.start()
    #     t.join()
    # print 'load %s cost:%.2f' %(filepath,time.time()-t0)
    # Step 2: download the documents listed in the saved gid files.
    for i in range(pageNum):
        f=open('page_error_log.txt','a')
        try:
            fname=gid_path+os.sep+str(i)+'.txt'
            load_one_page_from_gid_log(fname) # read titles and gids from the gid log and download the documents
            print '%s load success...' %(fname)
        except Exception,e:
            print '%s load failed...' %(fname),e
            f.write('%s\n' %(fname))
            f.flush()
        time.sleep(10) # sleep 10s between pages
        f.close()

if __name__ == '__main__':
    main()
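
# Example run (values are illustrative; the script targets a Python 2
# interpreter, since it uses print statements and raw_input):
#   $ python crawl_v2.py
#   input date info (eg:2017_01_01-2017_09_01):
#   > 2017_01_01-2017_09_01
#   input classcode:(eg:007)
#   > 007
# Step 1 then fills gid_log_<N>/0.txt, 1.txt, ... with title/gid pairs, and
# step 2 writes one <title>.txt per case into the 2017_01_01-2017_09_01 directory.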