-
Notifications
You must be signed in to change notification settings - Fork 19
/
counttemplates.py
executable file
·160 lines (121 loc) · 4.67 KB
/
counttemplates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
##########################################################################
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
##########################################################################
from bz2 import BZ2File
import sys
## etree
from lxml import etree
## multiprocessing
from multiprocessing import Queue, Process
from sonet import mediawiki as mwlib
from sonet.graph import load as sg_load
count = 0
lang = None
old_user = None
g = None
lang_user, lang_user_talk = None, None
tag = {}
en_user, en_user_talk = u"User", u"User talk"
templates = {}
queue, done_queue = Queue(), Queue()
### CHILD PROCESS
def merge_templates(big, small):
for k,v in small.iteritems():
big.setdefault(k, 0) #set big[k] if not already defined
big[k] += v
def get_freq_dist(q, done_q, templates=None):
'''
Accepts as input a queue q containing data to be analyzed
done_q is a queue with returned results
'''
if not templates:
templates = {}
while 1:
s = q.get()
try:
page_templates = mwlib.getTemplates(s)
##TODO: replace this with collections.Counter
merge_templates(templates, page_templates)
except TypeError: ## end
done_q.put(templates)
return
### MAIN PROCESS
def process_page(elem, queue=None):
q = queue
user = None
global count
for child in elem:
if child.tag == tag['title'] and child.text:
a_title = child.text.split('/')[0].split(':')
if len(a_title) > 1 and a_title[0] in (en_user, lang_user):
#if len(a_title) > 1 and a_title[0] == en_user:
user = a_title[1]
else:
return
elif child.tag == tag['revision']:
for rc in child:
if rc.tag != tag['text']:
continue
#assert user, "User still not defined"
if not (rc.text and user):
continue
user = user.encode('utf-8')
try:
q.put((user_classes[user], rc.text))
except:
## fix for anonymous users not in the rich file
if mwlib.isip(user):
send.send(('anonymous', rc.text))
else:
logging.warn("Exception with user %s", user)
count_missing += 1
count += 1
if not count % 500:
print >>sys.stderr, count
def main():
from functools import partial
import optparse
from operator import itemgetter
p = optparse.OptionParser(
usage="usage: %prog [options] current_dump rich_graph"
)
_, files = p.parse_args()
if len(files) != 2:
p.error("Give me a file, please ;-)")
xml_filename = files[0]
rich_fn = files[1]
global lang_user_talk, lang_user, tag, templates
src = BZ2File(xml_filename)
tag = mwlib.get_tags(src)
translations = mwlib.get_translations(src)
lang_user, lang_user_talk = translations['User'], translations['User talk']
assert lang_user, "User namespace not found"
assert lang_user_talk, "User Talk namespace not found"
user_classes = dict(sg_load(rich_fn).get_user_class('username',
('anonymous', 'bot', 'bureaucrat','sysop')))
p = Process(target=get_freq_dist, args=(queue, done_queue))
p.start()
## XML Reader Process
partial_process_page = partial(process_page, queue=queue)
mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
partial_process_page)
print >>sys.stderr, "end of XML processing"
queue.put(None) ## this STOPS the process
templates = done_queue.get()
p.join()
for k, v in sorted(templates.items(), key=itemgetter(1), reverse=True):
print v, k.encode('utf-8')
if __name__ == "__main__":
#import cProfile as profile
#profile.run('main()', 'mainprof')
main()