-
Notifications
You must be signed in to change notification settings - Fork 4
/
pythonTest.py
117 lines (89 loc) · 3.95 KB
/
pythonTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
import re
#from pytagcloud import create_tag_image, make_tags
#from pytagcloud.lang.counter import get_tag_counts
#import jieba
#import jieba.analyse
#import Pyweibo
import webbrowser
import mongoDBUtil
# Sample ``action-data`` attribute from a Weibo "forward" link. It is a
# query-string-like list of fields describing the root post and the repost.
a = 'action-data="allowForward=1&rootmid=3522439063677390&rootname=8090情侣手册&rootuid=1807128011&rooturl=http://weibo.com/1807128011/z9pK2fqEK&url=http://weibo.com/3024618893/z9Bbg7nxB&mid=3522878781758791&name=獨善其身TT&uid=3024618893&domain=1807128011&pid=6bb695cbjw1dzqzne13umj" action-type="feed_list_forward" href="javascript:void(0);" onclick="return false;'

# Separator that precedes a field name. The sample above uses a bare "&"; the
# same attribute copied from raw HTML carries the escaped form ("&" followed
# by "amp;"), so both are accepted. Anchoring on the separator keeps the
# repost fields (url/mid/name/uid) from matching inside the root fields
# (rooturl/rootmid/rootname/rootuid).
_FIELD_SEP = r'&(?:amp;)?'

weibo = {}
bb = []

# Each lookup takes the first match and raises IndexError when the field is
# absent from the input string.

# Fields describing the original (root) post.
rootmid = re.findall(r'rootmid=(\d+)&', a)[0]  # original note: may equal mid and not be needed
rootname = re.findall(r'rootname=(.+?)&', a)[0]
rootuid = re.findall(r'rootuid=(\d+)&', a)[0]
rooturl = re.findall(r'rooturl=(.+?)&', a)[0]  # original note: may equal url and not be needed

# Fields describing the repost itself.
reposturl = re.findall(_FIELD_SEP + r'url=(.+?)&', a)[0]
repostmid = re.findall(_FIELD_SEP + r'mid=(\d+)&', a)[0]
repostname = re.findall(_FIELD_SEP + r'name=(.+?)&', a)[0]
repostuid = re.findall(_FIELD_SEP + r'uid=(\d+)&', a)[0]

# One tab-separated record per post: mid, name, uid, url.
weibo[1] = "%s\t%s\t%s\t%s\t\n" % (rootmid, rootname, rootuid, rooturl)
weibo[2] = "%s\t%s\t%s\t%s\t\n" % (repostmid, repostname, repostuid, reposturl)
#bb.append(weibo[1])
#print bb
#cc = '"http://tp3.sinaimg.cn/2789672350/50/5637245856/1" usercard="id=2789672350" width="30" height="30" alt="qpzmwoxn6312" /></a></dt><dd><a href="/2789672350" title="qpzmwoxn6312" nick-name="qpzmwoxn6312" '
# Sample markup fragments; they appear to be inputs for ad-hoc regex
# experiments (only ``ee`` is mentioned, in a commented-out findall nearby).
# ``dd``: a single user-mention anchor, split here by attribute for readability.
dd = (
    '/<a href="/n/%E4%BA%92%E8%81%94%E7%BD%91%E4%BA%BA%E5%A3%AB-%E6%9D%8E%E6%BE%8D%E6%99%9F"'
    ' usercard="name=互联网人士-李澍晟"'
    ' >@互联网人士-李澍晟</a>'
)
# ``ee``: an opening <em> followed by three unterminated anchor openings.
ee = '<em>' + ' //<a href="' * 3 + ' '
#print cc
#nick-name="qpzmwoxn6312"
#rootname = re.findall(r'nick-name=\"+(.+?)\"', cc)[0]
#rootname = len(re.findall(r'href=', ee))
#print rootname
#content1 = open('chi.txt','rb').read()
#pyweibo = Pyweibo.Pyweibo()
#pyweibo.login('xxxxxxxxxxxxx', '***********')
#pyweibo.getPersonalFeeds(2218904682)
#content1 = open('feedsContent','rb').read()
#tags = pyweibo.getKeyword(content1, 5)
#pyweibo.generateTagCloudFile(content1, 10)
#print ",".join(tags)
#print tags
#min, max = tags[0][0], tags[5-1][0]
#print min
#print max
#outputs = run("a <- 3; print(a + 5)")
#webbrowser.open("file://" + 'D:/GitHub/Mining-the-Soc ial-Web/web_code/wp_cumulus/tagcloud_template.html')
#a = [{'1':'a'}, {'2':'q'}, {'3':'aw'}, {'4':'e'}, {'5':'ar'}, {'6':'at'}, {'7':'ay'}]
#b = [{'interestTag':['appale', 'orange']}, {'interestTag':['banaba', 'orange']}]
#mongo = mongoDBUtil.mongoDBUtil()
#con = mongo.saveData(b, 'weibo', 'test')
#mongo.analyseCollection2(con, topN=1)
from pymongo import Connection
from bson.code import Code
#'''
# Connect to MongoDB with the driver's default host and port (localhost, per
# the original note) and select the database named "test".
connection = Connection()
db = connection['test']
# Map step of a word count, written in JavaScript and run by the server once
# per document: split the document's ``text`` into words and emit
# (word, {count: 1}) for each one.
#
# The adjacent Python literals are concatenated into ONE line of JavaScript,
# so every statement needs its own ";" and the function needs its closing "}".
# The key emitted is the word itself: the documents this script inserts
# contain only a ``text`` field, so ``this.freq`` would always be undefined.
# NOTE: the name ``map`` shadows the Python builtin; kept because it is the
# script's existing module-level name.
map = Code(
    "function () {"
    r"  var words = this.text.match(/\w+/g);"  # raw literal: \w belongs to the JS regex
    "  if (words == null) {"
    "    return;"  # no words in this document, nothing to emit
    "  }"
    "  for (var i = 0; i < words.length; i++) {"
    "    emit(words[i], {count: 1});"
    "  }"
    "}"
)
# Reduce step, written in JavaScript: sum the ``count`` field of every value
# emitted for a key and return the total in the same {count: ...} shape.
# The pieces are joined without separators into a single line of JavaScript.
_REDUCE_JS_PARTS = (
    "function (key, values) {",
    " var total = 0;",
    " for (var i = 0; i < values.length; i++) {",
    " total += values[i].count;",
    " }",
    " return {count:total};",
    "}",
)
reduce = Code("".join(_REDUCE_JS_PARTS))
# Empty the ``texts`` collection so that only the lines loaded below are
# present when the map-reduce job runs.
db.texts.remove()

# Read the input file and store each line as its own document. The ``with``
# block closes the file handle as soon as the lines have been read.
with open('2329.txt') as text_file:
    lines = text_file.readlines()
for line in lines:
    db.texts.insert({'text': line})

#Load map and reduce functions
#map = Code(open('wordMap.js','r').read())
#reduce = Code(open('wordReduce.js','r').read())

# Run the map-reduce job; the third argument names the output collection,
# and the call returns that collection.
results = db.texts.map_reduce(map, reduce, "collection_name")

# Print one "<key> <count>" line per result document.
for result in results.find():
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("%s %s" % (result['_id'], result['value']['count']))