forked from taizilongxu/stackoverflow_python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_so_100.py
158 lines (112 loc) · 4.2 KB
/
get_so_100.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
from lxml import etree
import datetime
import time
# Listing of python-tagged Stack Overflow questions sorted by votes; the `{}`
# placeholder takes the page number. Each page is requested with pagesize=15.
URL = "https://stackoverflow.com/questions/tagged/python?page={}&sort=votes&pagesize=15"
# Root directory against which data/, part/, SUMMARY.md and README.md are
# resolved. NOTE(review): hard-coded absolute path — adjust per machine.
PATH = "/Users/limbo/github/stackoverflow_python"
# Table header prepended to every generated part/<rank>.md file; filled with the
# keyword fields index, vote, views, answers and href.
# ("TEMPLETE" is a misspelling of "TEMPLATE"; the name is kept because the rest
# of the file refers to it.)
TEMPLETE = """
| rank | vote | view | answer | url |
|:-:|:-:|:-:|:-:|:-:|
|{index}|{vote}|{views}|{answers}| [url](http://stackoverflow.com{href}) |
"""
# Skeleton of SUMMARY.md; the single `{}` receives the newline-joined table of
# contents entries.
SUMMARY_TEMPLETE = """
# Summary
## 简介
* [Introduction](README.md)
## TOP 100
{}
"""
# Skeleton of README.md; the first `{}` receives the update timestamp and the
# second `{}` receives the newline-joined table rows.
README_TEMPLETE = """
# Stackoverflow 上关于 Python 的问题
[![](https://img.shields.io/github/stars/taizilongxu/stackoverflow_python.svg?style=for-the-badge&label=Stars)](https://github.com/taizilongxu/stackoverflow_python)
排名根据 vote 数量选取, 许多 SO 上的回答质量确实高, 有能力建议查看原文, 一般引用的文章也非常好,
翻译是根据 question id 写在 data 文件夹, 运行脚本 `get_so_100.py` 自动生成 `part` 部分文档
## 目录
> 图表数据更新时间 {}
| rank | vote | view | answer | url |中文|
|:-:|:-:|:-:|:-:|:-:|:-|
{}
"""
def get_page(num, timeout=30):
    """Download one page of the vote-sorted question listing and parse it.

    Args:
        num: Page number substituted into ``URL``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts in page order, each with the keys ``vote``, ``views``,
        ``title``, ``href``, ``answers`` and ``pid``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(URL.format(num), timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list,
    # which would otherwise propagate as an empty result to the callers.
    response.raise_for_status()
    selector = etree.HTML(response.text)

    votes = selector.xpath('//*[@id="questions"]/div/div[1]/div[1]/div[1]/div/span/strong/text()')
    views = selector.xpath('//*[@id="questions"]/div/div[1]/div[2]/@title')
    titles = selector.xpath('//*[@id="questions"]/div/div[2]/h3/a/text()')
    hrefs = selector.xpath('//*[@id="questions"]/div/div[2]/h3/a/@href')
    answers = selector.xpath('//*[@id="questions"]/div/div[1]/div[1]/div[2]/strong/text()')

    # zip() stops at the shortest list, so a question is only emitted when all
    # five XPath queries produced a value at its position.
    # The loop name is ``answer`` (singular) so it no longer rebinds the
    # ``answers`` list that is being iterated.
    return [
        {
            'vote': vote,
            'views': get_view(view),
            'title': title,
            'href': href,
            'answers': answer,
            'pid': get_id(href),
        }
        for vote, view, title, href, answer in zip(votes, views, titles, hrefs, answers)
    ]
def get_view(view):
    """Return *view* with its commas removed and its last six characters cut off.

    The input is the ``title`` attribute scraped from the listing page;
    presumably it looks like ``"1,234,567 views"`` so the six trailing
    characters are the " views" label — verify against the live markup.
    """
    without_commas = view.replace(',', '')
    return without_commas[:-6]
def get_id(href):
    """Return the third ``/``-separated segment of *href*.

    For a link such as ``/questions/231767/some-slug`` the segments are
    ``''``, ``'questions'``, ``'231767'``, ... so the result is ``'231767'``.

    Raises:
        IndexError: If *href* has fewer than three segments.
    """
    segments = href.split('/')
    return segments[2]
def get_100():
    """Return the first 100 question records of the vote-sorted listing.

    Pages are fetched one after another with ``get_page`` until 100 records
    have been collected. Fetching a fixed two pages only yields 100 records
    when a page holds 50 questions; with a smaller page size the result would
    fall short of what the function name promises.

    Returns:
        A list of at most 100 question dicts, in listing order. It is shorter
        only when the listing runs out before 100 questions were seen.
    """
    wanted = 100
    collected = []
    page = 1
    while len(collected) < wanted:
        batch = get_page(page)
        if not batch:
            # An empty page means there is nothing more to read; stop rather
            # than loop forever.
            break
        collected.extend(batch)
        page += 1
    # The last page may overshoot the target, so trim to exactly ``wanted``.
    return collected[:wanted]
def get_content_by_pid(pid):
    """Return the text of ``<PATH>/data/<pid>.md``, or ``''`` on any failure.

    A question without a readable file in the data folder is treated as
    having no content rather than as an error.
    """
    try:
        with open('{}/data/{}.md'.format(PATH, pid), 'r') as source:
            return source.read()
    except Exception:
        # Best effort: any problem locating or reading the file yields ''.
        return ''
def get_tran_title(content):
    """Return the text following the first ``## `` in *content*, up to the newline.

    Returns ``''`` when no such heading followed by a newline exists, or when
    *content* cannot be searched at all (for example, it is not a string).
    """
    try:
        heading = re.search(r"## (.*?)\n", content)
    except Exception:
        return ''
    if heading is None:
        return ''
    return heading.group(1)
def make_toc(trans_title, file_name):
    """Build one Markdown bullet linking *trans_title* to ``part/<file_name>``."""
    entry_format = "* [{}](part/{})"
    return entry_format.format(trans_title, file_name)
def make_summary(data):
    """Render the SUMMARY.md text for the given question records.

    Each record contributes one table-of-contents bullet whose label is the
    title extracted from the record's data file and whose target is
    ``<rank>.md``, where rank is the record's 1-based position in *data*.

    Args:
        data: Sequence of question dicts; only the ``pid`` key is read.

    Returns:
        ``SUMMARY_TEMPLETE`` filled with the newline-joined bullets.
    """
    entries = [
        make_toc(
            get_tran_title(get_content_by_pid(question['pid'])),
            '{}.md'.format(rank),
        )
        for rank, question in enumerate(data, 1)
    ]
    return SUMMARY_TEMPLETE.format('\n'.join(entries))
def make_readme(data):
    """Render the README.md text for the given question records.

    Produces one Markdown table row per record (rank, vote, views, answers,
    Stack Overflow link, and a link to ``part/<rank>.md`` labelled with the
    title extracted from the record's data file), then fills
    ``README_TEMPLETE`` with the current local time and the rows.

    Args:
        data: Sequence of question dicts with the keys ``pid``, ``vote``,
            ``views``, ``answers`` and ``href``.

    Returns:
        The complete README text.
    """
    row_format = "|{}|{}|{}|{}|[url](http://stackoverflow.com{})|[{}](part/{})|"
    rows = []
    for rank, question in enumerate(data, 1):
        cn_name = get_tran_title(get_content_by_pid(question['pid']))
        rows.append(row_format.format(
            rank,
            question['vote'],
            question['views'],
            question['answers'],
            question['href'],
            cn_name,
            '{}.md'.format(rank),
        ))
    # Timestamp is taken after the rows are built, formatted to whole seconds.
    update_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return README_TEMPLETE.format(update_time, '\n'.join(rows))
def make_summary_file(content):
    """Write *content* to ``<PATH>/SUMMARY.md``, replacing any existing file."""
    target = '{}/SUMMARY.md'.format(PATH)
    with open(target, 'w') as handle:
        handle.write(content)
def make_question_file(content, file_name):
    """Write *content* to ``<PATH>/part/<file_name>``, replacing any existing file."""
    target = '{}/part/{}'.format(PATH, file_name)
    with open(target, 'w') as handle:
        handle.write(content)
def make_question_files(data):
    """Write one ``part/<rank>.md`` file per question record.

    Each file consists of the ``TEMPLETE`` header filled from the record,
    followed by the text of the record's data file (empty when unavailable).

    Args:
        data: Sequence of question dicts. Each dict is modified in place: an
            ``index`` key holding the 1-based rank is added so that the
            header template can reference it.
    """
    for rank, question in enumerate(data, 1):
        question['index'] = rank
        translation = get_content_by_pid(question['pid'])
        header = TEMPLETE.format(**question)
        make_question_file(header + translation, '{}.md'.format(rank))
def make_readme_file(data):
    """Write *data* (the rendered README text) to ``<PATH>/README.md``."""
    target = '{}/README.md'.format(PATH)
    with open(target, 'w') as handle:
        handle.write(data)
# Script body: these statements run whenever the module is executed or imported.
# Scrape the question list once and reuse it for every generated file.
data = get_100()
# Render and write SUMMARY.md.
content = make_summary(data)
make_summary_file(content)
# Render and write README.md.
readme = make_readme(data)
make_readme_file(readme)
# Write one part/<rank>.md per question; this also adds an 'index' key to each
# dict in `data`.
make_question_files(data)