# TweetsClassifier.py
import json
import re
import csv
from TweetsProcessor import TweetProcessor
def main():
    """Script entry point; prints a single empty line and returns None."""
    # NOTE(review): nothing here constructs Classifier, so running the script
    # does not build the frequency tables — confirm whether that is intended.
    # The parenthesised form prints the same empty line on Python 2 and 3
    # (the bare `print""` statement is a SyntaxError on Python 3).
    print("")
class Classifier:
    """Build per-category token-frequency tables from training tweets.

    Constructing an instance reads four seed lists and two JSON training
    sets from the current working directory, then immediately runs
    ``categorisetweets``, which writes six JSON frequency files back to the
    current working directory.
    """

    # Starting counts for tokens that appear in the curated seed lists;
    # tokens seen only in the training tweets start from zero.
    _SEED_WORD_WEIGHT = 10
    _SEED_HASHTAG_WEIGHT = 5

    def __init__(self):
        """Load all input files, then build and save the frequency tables.

        Any error raised while opening or parsing an input file propagates
        to the caller unchanged.
        """
        self.tweetprocessor = TweetProcessor()
        self.topsportswords = self._read_lines('topsportswords')
        self.toppoliticswords = self._read_lines('toppoliticswords')
        self.topsportshashtags = self._read_lines('topsportshashtags')
        self.toppoliticshashtags = self._read_lines('toppoliticshashtags')
        self.sportstweets = self._load_json("sportstrainingset")
        self.politicstweets = self._load_json("politicstrainingset")
        self.categorisetweets()

    @staticmethod
    def _read_lines(path):
        """Return the lines of the text file at *path*, line endings removed."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    @staticmethod
    def _load_json(path):
        """Return the decoded JSON content of the file at *path*."""
        with open(path) as handle:
            return json.load(handle)

    @staticmethod
    def _dump_json(table, path):
        """Serialise *table* as JSON into the file at *path*."""
        # Text mode: json.dump emits str, which a binary-mode file object
        # rejects with TypeError on Python 3.
        with open(path, 'w') as handle:
            json.dump(table, handle)

    def _count_tokens(self, tweets, seed_words, seed_hashtags):
        """Count the tokens of *tweets* into word/hashtag/mention tables.

        Each tweet is passed through ``self.tweetprocessor.processTweet`` and
        then ``self.tweetprocessor.getwords`` (both defined outside this
        class; presumably cleaning and tokenising — verify against
        TweetsProcessor). A token starting with '#' is counted as a hashtag,
        one starting with '@' as a mention, anything else as a word.

        Returns:
            A ``(words, hashtags, mentions)`` tuple of dicts mapping each
            token to its count; seed tokens start at their seed weight.
        """
        words = dict.fromkeys(seed_words, self._SEED_WORD_WEIGHT)
        hashtags = dict.fromkeys(seed_hashtags, self._SEED_HASHTAG_WEIGHT)
        mentions = {}
        for tweet in tweets:
            processed = self.tweetprocessor.processTweet(tweet)
            for token in self.tweetprocessor.getwords(processed):
                if not token:
                    # An empty token has no first character to classify;
                    # skip it instead of failing with IndexError.
                    continue
                first = token[0]
                if first == '#':
                    table = hashtags
                elif first == '@':
                    table = mentions
                else:
                    table = words
                table[token] = table.get(token, 0) + 1
        return words, hashtags, mentions

    def categorisetweets(self):
        """Build the sports and politics frequency tables and save them.

        Writes six JSON files to the current working directory:
        ``sportswords``, ``sportshashtags``, ``politicswords``,
        ``politicshashtags``, ``politicsmentions`` and ``sportsmentions``.
        """
        sportswords, sportshashtags, sportsmentions = self._count_tokens(
            self.sportstweets, self.topsportswords, self.topsportshashtags)
        politicswords, politicshashtags, politicsmentions = self._count_tokens(
            self.politicstweets, self.toppoliticswords,
            self.toppoliticshashtags)

        # Saving the categorised tweets
        self._dump_json(sportswords, "sportswords")
        self._dump_json(sportshashtags, "sportshashtags")
        self._dump_json(politicswords, "politicswords")
        self._dump_json(politicshashtags, "politicshashtags")
        self._dump_json(politicsmentions, "politicsmentions")
        self._dump_json(sportsmentions, "sportsmentions")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()