#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Aziz Alto
"""
Dependency:
- requests
- BeautifulSoup4
- httrack (to download html documentation. Well, website mirroring)
"""
import sqlite3
import os
import plistlib
import string

import requests
from bs4 import BeautifulSoup

class Docset(object):

    def __init__(self, docset_name, index_page, pages, icon_url, html_url, download_html=False):
        self.name = docset_name
        self.index_page = index_page
        self.pages = pages
        self.docs_output = None
        self.docset_name = '{}.docset'.format(self.name)
        self.setup_docset(html_url, download_html)
        self.add_infoplist()
        self.cur, self.db = self.connect_db()
        self.scrape_urls()
        self.get_icon(icon_url)
        self.report()

    def setup_docset(self, url, download_html):
        self.docs_output = self.docset_name + '/Contents/Resources/Documents/'
        if not os.path.exists(self.docs_output):
            os.makedirs(self.docs_output)
        cmd = """
        cd {0} &&
        httrack -%v2 -T60 -R99 --sockets=7 -%c1000 -c10 -A999999999 -%N0 --disable-security-limits --keep-links=K4 -F 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.168' --mirror --keep-alive --robots=0 "{1}" -n -* +*.css +*css.php +*.ico +*/fonts/* +*.svg +*.ttf +fonts.googleapis.com* +*.woff +*.eot +*.png +*.jpg +*.gif +*.jpeg +*.js +{1}* -github.com* +raw.github.com* &&
        rm -rf hts-* &&
        mkdir -p Contents/Resources/Documents &&
        mv -f *.* Contents/Resources/Documents/
        """.format(self.docset_name, url)
        if download_html:
            os.system(cmd)

    def connect_db(self):
        db = sqlite3.connect(self.docset_name + '/Contents/Resources/docSet.dsidx')
        cursor = db.cursor()
        # drop any index left over from a previous run, then rebuild it
        cursor.execute('DROP TABLE IF EXISTS searchIndex;')
        cursor.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
        cursor.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')
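        # Dash reads docSet.dsidx expecting exactly this searchIndex(name,
        # type, path) schema; `type` should ideally be one of Dash's
        # recognized entry types (e.g. 'Guide', 'Function').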
        return cursor, db

    def scrape_urls(self):
        # two-letter prefixes ('aa', 'ab', ...) keep Guide entries in page
        # order when Dash sorts them alphabetically
        idx = (i + j for i in string.ascii_lowercase for j in string.ascii_lowercase)
        pages = self.pages
        for entry_type in pages:
            # base path of the current page (URL without the scheme)
            base_path = pages[entry_type].split("//")[1]
            # soup each page
            html = requests.get(pages[entry_type]).text
            soup = BeautifulSoup(html, 'html.parser')
            # find hrefs and add the entries to the db
            for a in soup.find_all('a', class_='reference internal'):
                entry_name = a.text.strip().encode('ascii', 'ignore').decode('ascii')
                path = a.get('href')
                if entry_type == 'Guide':
                    entry_name = '{}: {}'.format(next(idx), entry_name)
                    path = base_path + path
                self.update_db(entry_name, entry_type, path)

    def update_db(self, entry_name, typ, path):
        # skip entries whose name or path is already indexed
        self.cur.execute("SELECT rowid FROM searchIndex WHERE path = ?", (path,))
        dbpath = self.cur.fetchone()
        self.cur.execute("SELECT rowid FROM searchIndex WHERE name = ?", (entry_name,))
        dbname = self.cur.fetchone()
        if dbpath is None and dbname is None:
            self.cur.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)',
                             (entry_name, typ, path))
            print('DB add >> name: {0} | type: {1} | path: {2}'.format(entry_name, typ, path))
        else:
            print("record exists")

    def add_infoplist(self):
        index_file = self.index_page.split("//")[1]
        plist_path = os.path.join(self.docset_name, "Contents", "Info.plist")
        plist_cfg = {
            'CFBundleIdentifier': self.name,
            'CFBundleName': self.name,
            'DocSetPlatformFamily': self.name.lower(),
            'DashDocSetFamily': 'python',
            'isDashDocset': True,
            'isJavaScriptEnabled': True,
            'dashIndexFilePath': index_file
        }
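        # 'isDashDocset' marks the bundle as a Dash docset and
        # 'dashIndexFilePath' is the page Dash opens by default.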
        # plistlib.writePlist() was removed in Python 3.9; plistlib.dump()
        # writes the same XML plist
        with open(plist_path, 'wb') as f:
            plistlib.dump(plist_cfg, f)

    def report(self):
        self.cur.execute('SELECT count(*) FROM searchIndex;')
        entries = self.cur.fetchone()[0]
        # commit and close the db before packaging
        self.db.commit()
        self.db.close()
        self.compress_docset()
        print("{} entries.".format(entries))

    def get_icon(self, png_url):
        """Grab the icon and resize it to 32x32 and 16x16 pixels."""
        # note: sips is macOS-only
        cmd = """
        wget -O icon.png {} &&
        cp icon.png {}/icon.png &&
        cp icon.png [email protected] &&
        sips -z 32 32 [email protected] &&
        sips -z 16 16 icon.png
        """.format(png_url, self.docset_name)
        os.system(cmd)

    def compress_docset(self):
        """Compress the docset into a .tgz file."""
        cmd = """
        tar --exclude='.DS_Store' -cvzf {0}.tgz {0}.docset
        """.format(self.docset_name.replace('.docset', ''))
        os.system(cmd)

if __name__ == '__main__':
    name = 'PyTorch'
    download_url = 'http://pytorch.org/'
    index_page = 'http://pytorch.org/docs/index.html'
    entry_pages = {
        'func': 'http://pytorch.org/docs/master/',
        'Guide': 'http://pytorch.org/tutorials/'
    }
    icon = 'https://avatars2.githubusercontent.com/u/21003710'
    Docset(name, index_page, entry_pages, icon, download_url, download_html=True)
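
# A quick sanity check (a sketch, not part of the build itself): inspect the
# generated index after a run. The 'PyTorch' docset name assumes the
# configuration above.
#
#   import sqlite3
#   db = sqlite3.connect('PyTorch.docset/Contents/Resources/docSet.dsidx')
#   for name, typ, path in db.execute('SELECT name, type, path FROM searchIndex LIMIT 5'):
#       print(name, typ, path)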