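"""Interactive image scraper for www.mzitu.com.

Prompts for a gallery category, walks every listing page of that category,
and downloads each image group into a directory named after the group title.
Uses `requests` for HTTP and scrapy's `Selector` for CSS extraction.
"""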
import os
import re
from time import perf_counter
from functools import wraps
import requests
from scrapy import Selector
## Decorator that prints how long the wrapped call took
def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        fmt = '{name} {args} took: {time:.5f}s'
        print(fmt.format(name=func.__name__, args=args, time=end_time - start_time))
        return result
    return wrapper
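# Illustrative output of a @timer-wrapped call (names and timing are example values):
#   save_image ('SomeGroupTitle', 'http://example.com/01.jpg') took: 0.51234s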
## Fetch a page and wrap it in a scrapy Selector for CSS queries
def get_content_css(url):
    req = requests.get(url, timeout=10)  # timeout keeps a dead connection from hanging the run
    content = req.content.decode('utf-8')
    return Selector(text=content)
## Get the title and link of every image group on a listing page
def get_one_page_dict(url):
    selector = get_content_css(url)
    for num in selector.css('ul#pins li'):
        yield {
            'url': num.css('a::attr(href)').extract_first(),
            'title': num.css('a>img::attr(alt)').extract_first(),
        }
## Collect the image groups from every listing page of the current category
def get_all_page_dict(url, type_page_num):
    dicts = []
    for page_num in range(1, type_page_num + 1):
        dicts.extend(get_one_page_dict(url % page_num))
    return dicts
## Get the maximum page number of an image group from its pagination bar
def get_group_max_page_num(group_url):
    selector = get_content_css(group_url)
    return selector.css('.pagenavi a span::text').extract()[-2]
## Get the download URL of the image on a single-image page
def get_img_download_url(image_url):
    selector = get_content_css(image_url)
    return selector.css('.main-image img::attr(src)').extract()[0]
## Return True if the directory already exists; otherwise create it and return False
def is_dir(dir_name):
    if os.path.exists(dir_name):
        return True
    os.mkdir(dir_name)
    return False
## Strip <b> tags, characters that are illegal in file names, and spaces
def filter_char(text):
    text = re.sub(r'</?b>', '', text)          # remove whole <b>/</b> tags, not the letter b
    text = re.sub(r'[\\"*/:;?|<>]', '', text)  # remove characters forbidden in file names
    return text.replace(' ', '')
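# For example (illustrative input): filter_char('<b>Pretty</b> Girl: Vol 1') == 'PrettyGirlVol1'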
## Get the maximum listing-page number of the current category
def get_mz_type_max_page_num(url):
    selector = get_content_css(url)
    return selector.css('div.nav-links a::text').extract()[-2]
## Save a single image; the Referer header is needed to get past hotlink protection
@timer
def save_image(dir_name, image_download_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Connection': 'Keep-Alive',
        'Referer': 'http://www.mzitu.com/',
    }
    req = requests.get(image_download_url, headers=headers, timeout=10)
    filename = image_download_url.rsplit('/', 1)[-1]  # use the URL's basename as the file name
    save_path = os.path.join(dir_name, filename)
    with open(save_path, 'wb') as f:
        f.write(req.content)
## Loop over every page of an image group and save each image
def loop_and_save_img(group_url, group_title, group_max_page_num):
    for page_num in range(1, group_max_page_num + 1):
        image_url = get_img_download_url(group_url + '/' + str(page_num))
        save_image(group_title, image_url)
## Ask for a category and build the listing-page URL template
def get_url():
    global mz_type_default, mz_type_in
    mz_type_default = {
        '1': ['Sexy girls', 'xinggan/'],
        '2': ['Japanese girls', 'japan/'],
        '3': ['Taiwanese girls', 'taiwan/'],
        '4': ['Innocent girls', 'mm/'],
        '5': ['Girl selfies', 'zipai/'],
        '6': ['I want them all', ''],
    }
    for mz_type_len in range(1, len(mz_type_default) + 1):
        print(mz_type_len, ' : ', mz_type_default[str(mz_type_len)][0])
    while True:
        mz_type_in = str(input('Choose a category (a number 1-6) (5 is not implemented yet): '))
        if re.compile('^[1-6]$').match(mz_type_in):
            mz_type = mz_type_default[mz_type_in][1]
            break
    if mz_type == 'zipai/':
        # the selfie board is paginated through comment pages rather than /page/N/
        url = 'http://www.mzitu.com/' + mz_type + 'comment-page-%s/#comments'
    else:
        url = 'http://www.mzitu.com/' + mz_type + 'page/%s/'
    return url
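# Illustrative interaction (input and return value are example values):
#   1  :  Sexy girls
#   ...
#   6  :  I want them all
#   Choose a category (a number 1-6) (5 is not implemented yet): 2
# get_url() then returns 'http://www.mzitu.com/japan/page/%s/'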
## Where the magic starts
def get_something_amazing():
    url = get_url()
    type_page_num = int(get_mz_type_max_page_num(url % 1))
    group_lists = get_all_page_dict(url, type_page_num)
    print(mz_type_default[mz_type_in][0], 'has', type_page_num, 'pages and', len(group_lists), 'image groups')
    for group_list in group_lists:
        group_url = group_list['url']
        group_title = filter_char(group_list['title'])
        print(group_url, group_title)
        if is_dir(group_title):
            print(group_title, 'already exists, skipping')
        else:
            group_max_page_num = int(get_group_max_page_num(group_url))
            loop_and_save_img(group_url, group_title, group_max_page_num)
if __name__ == "__main__":
    start = perf_counter()
    get_something_amazing()
    end = perf_counter()
    print(format('end', '*^100'))
    print('Downloading all images took: {:.3f}s'.format(end - start))