forked from tfmorris/googlecode2github
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikiconvert.py
147 lines (119 loc) · 4.68 KB
/
wikiconvert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
"""
Usage:
python googlecode2github/wikiconvert.py PROJID SRCDIR DSTDIR
where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
"SRCDIR" is a Google Code project wiki Subversion working copy dir and
"DSTDIR" is the git clone dir of the git project's wiki.
"""
__version__ = "1.0.0"
import re
import sys
from os.path import *
from glob import glob
from pprint import pprint
import codecs
from hashlib import md5
def log(s):
    """Write the message *s* to standard error, terminated by a newline."""
    stream = sys.stderr
    stream.write("".join((s, "\n")))
def convert_dir(proj_id, src_dir, dst_dir):
    """Convert a wiki source into `dst_dir`.

    `src_dir` may name a single file, which is converted on its own, or a
    directory, in which case every "*.wiki" file directly inside it is
    converted. `proj_id` and `dst_dir` are passed through to `convert_file`
    unchanged.
    """
    # Work out the list of inputs first, then convert them in one loop.
    if isfile(src_dir):
        sources = [src_dir]
    else:
        sources = glob(join(src_dir, "*.wiki"))
    for wiki_path in sources:
        convert_file(proj_id, wiki_path, dst_dir)
def convert_file(proj_id, src_path, dst_dir):
    """Convert one Google Code wiki file into a Markdown page in `dst_dir`.

    `proj_id` is interpolated into the URL generated for "issue N"
    auto-links. `src_path` is read as UTF-8. The output file is named after
    the source file's base name (mapped through
    `_gh_page_name_from_gc_page_name`) with a ".md" extension, and is only
    rewritten when its content would change; a "wrote ..." message is logged
    when it is.

    Leading lines that start with "#" are treated as "#key value" pragmas.
    If a "summary" pragma with a value is present it becomes a top-level
    heading at the start of the output.
    """
    # Use context managers so file handles are closed (and flushed)
    # deterministically instead of relying on garbage collection.
    with codecs.open(src_path, 'r', 'utf-8') as src_file:
        src = src_file.read()

    # Split the leading "#..." pragma lines from the body. Blank lines before
    # the body are skipped; the body starts at the first non-blank line that
    # is not a pragma.
    meta_lines = []
    body_lines = []
    lines = src.splitlines(False)
    for i, line in enumerate(lines):
        if line.startswith("#"):
            meta_lines.append(line)
        elif line.strip():
            body_lines = lines[i:]
            break

    meta = {}
    for line in meta_lines:
        parts = line[1:].split(None, 1)
        if not parts:
            # A bare "#" carries no key; ignore it rather than crash.
            continue
        # A pragma without a value (e.g. "#sidebar") maps to "".
        meta[parts[0]] = parts[1] if len(parts) > 1 else ""

    text = '\n'.join(body_lines)

    # Generated links are swapped for an md5 placeholder so that the later
    # bold/italic/issue substitutions cannot mangle them; the placeholders
    # are replaced by the real markup at the end.
    s_from_hash = {}

    def stash(s):
        digest = md5(s.encode('utf8')).hexdigest()
        s_from_hash[digest] = s
        return digest

    text = re.sub(r'^<wiki:toc.*\n', r'<!-- No auto-Table of Contents support! -->\n',
                  text)

    # Code blocks
    text = re.compile(r'^{{{+ *\n', re.M).sub(r"```\n", text)
    text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text)

    # TODO: Add support for `backtick` code quotes
    text = re.sub(r'{{{(.*?)}}}', r'`\1`', text)

    # Headings. Longest marker first so "===" is not consumed by the "=" rule.
    text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
    text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
    text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)

    # Tables: each run of "||cell||cell||" lines becomes an HTML <table>.
    def sub_table(m):
        rows = []
        for line in m.group(0).splitlines(False):
            if not line.strip():
                continue
            # Drop the empty pieces before the first and after the last "||".
            rows.append([c.strip() for c in line.split("||")[1:-1]])
        html = ['<table>']
        for row in rows:
            html.append(' <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
        html.append('</table>')
        return '\n\n' + '\n'.join(html)
    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)

    # Lists. Try to handle nested lists by keeping leading whitespace.
    text = re.compile(r'^([ \t]+)\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'\1- \2', text)
    text = re.compile(r'^([ \t]+)#[ \t]+(.*?)[ \t]*$', re.M).sub(r'\g<1>1. \2', text)

    # wiki links.
    def sub_wikilink(m):
        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
        if m.group(2):
            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
        else:
            s = "[[%s]]" % gh_page_name
        return stash(s)
    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)

    # Links
    def sub_link(m):
        return stash("[%s](%s)" % (m.group(2), m.group(1)))
    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)

    # Italics, bold.
    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)

    # Auto-linking "issue \d+"
    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)

    # Restore hashed-out blocks.
    for digest, s in s_from_hash.items():
        text = text.replace(digest, s)

    # Add summary.
    if meta.get("summary"):
        text = ("# %s\n\n" % meta["summary"]) + text

    base = splitext(basename(src_path))[0]
    gh_page_name = _gh_page_name_from_gc_page_name(base)
    dst_path = join(dst_dir, gh_page_name+".md")

    # Only rewrite the destination when its content would actually change.
    existing = None
    if exists(dst_path):
        with codecs.open(dst_path, 'r', 'utf-8') as dst_file:
            existing = dst_file.read()
    if existing != text:
        with codecs.open(dst_path, 'w', 'utf-8') as dst_file:
            dst_file.write(text)
        log("wrote '%s'" % dst_path)
#---- internal support stuff
def _indent(text):
return '\n ' + '\n '.join(text.splitlines(False))
def _gh_page_name_from_gc_page_name(gc):
"""Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
gh = re.sub(r'([A-Za-z]+)_?', r'-\1', gc)[1:]
return gh
#---- mainline
if __name__ == '__main__':
    # Exactly three positional arguments are required: PROJID SRCDIR DSTDIR.
    if len(sys.argv) != 4:
        # Call form of print with one argument works on both Python 2 and
        # Python 3; the statement form is a SyntaxError on Python 3.
        print(__doc__)
        sys.exit(1)
    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])