forked from WorldsOfBabylon/cambridgeAnalytica
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtwitterSentiment.py
317 lines (240 loc) · 12.2 KB
/
twitterSentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 08:45:50 2016
@author: Michael
For starters, we will just get sentiment from textBlob for tweets containing keywords like "Trump", "Carson", "Cruz", "Bern", "Bernie", "guns", "immigration", "immigrants", etc.
Then plot the results with matplotlib.
Stuff to do:
Get user IDs
retrieve all the user's recent tweets and favorites.
separate tweets into groups containing each keyword
get sentiment graph of the whole group with textBlob and matplotlib
"""
#Import the necessary methods from tweepy library
import sys
import time
# Libraries to pull twitter data
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from tweepy import Stream
# Data and Text Analysis
import textblob
import numpy as np
import pandas
def readFromFileA(filename, splitter=',', lineStart=0, lineEnd=1000):
    """Read a delimited text file and return the selected lines as field lists.

    Args:
        filename: path of the text file to read.
        splitter: delimiter handed to ``str.split`` for every line.
        lineStart: index of the first line to include (0-based).
        lineEnd: index one past the last line to include.

    Returns:
        A list holding one list of string fields per selected line. Lines
        that consist solely of a newline are dropped. Fields are not
        stripped, so the last field of a line keeps its trailing newline.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as f:
        lines_list = f.readlines()
    rows = [line.split(splitter) for line in lines_list[lineStart:lineEnd]]
    # Build a real list (not a lazy filter object) so callers can index it
    # or hand it to numpy on both Python 2 and Python 3.
    return [row for row in rows if row != ['\n']]
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that records the user id attached to each received status."""

    def on_status(self, status):
        """Append ``status.user.id`` to the module-level ``randIds`` list."""
        # NOTE(review): randIds is not defined in this class; the only
        # assignment visible in this file is commented out, so it has to be
        # enabled before this listener is used.
        userId = status.user.id
        randIds.append(userId)
# input is a tweet as a single line str.
# This function will convert it all to lower case,
# remove useless words, and put in the format for the neural network.
# def parseTweet(tweet):
# will make this when I'm working on the nn part of the project. not needed if using the textblob tool
def printJoined(*parts):
    """Print ``parts`` separated by single spaces.

    Produces the same text as a Python 2 ``print a, b`` statement while
    remaining valid syntax on Python 3.
    """
    print(' '.join(str(part) for part in parts))


def findMatchingGroups(text, sentimentGroups):
    """Return the groups that have at least one keyword occurring in ``text``.

    Args:
        text: the (already lower-cased) text to search.
        sentimentGroups: iterable of ``(name, sentiments, keywords)`` tuples.

    Returns:
        The matching tuples, in their original order. Matching is a plain
        substring test, so each group appears at most once.
    """
    return [group for group in sentimentGroups
            if any(keyword in text for keyword in group[2])]


def summarizeSentiments(sentimentGroups):
    """Return ``[name, mean, median, count]`` rows for every group with data.

    Args:
        sentimentGroups: iterable of ``(name, sentiments, keywords)`` tuples,
            where ``sentiments`` is a list of ``(polarity, tweetId)`` pairs.

    Returns:
        One row per group whose sentiment list is non-empty; groups without
        any recorded sentiment are skipped.
    """
    rows = []
    for group in sentimentGroups:
        name, sentiments = group[0], group[1]
        if not sentiments:
            continue
        polarities = np.asarray([entry[0] for entry in sentiments], dtype=float)
        rows.append([name,
                     float(np.mean(polarities)),
                     float(np.percentile(polarities, 50)),
                     len(sentiments)])
    return rows


def fetchTimelineRateLimit(api):
    """Return the rate-limit entry for ``/statuses/user_timeline`` or None.

    If the lookup fails (for example because the connection is down), wait
    30 seconds and try once more. None is returned when the second attempt
    fails too, so the caller can carry on without rate-limit information
    instead of crashing.
    """
    for _attempt in range(2):
        try:
            return api.rate_limit_status()['resources']['statuses']['/statuses/user_timeline']
        except Exception:
            print('no internet, sleeping for 30 seconds')
            sys.stdout.flush()
            time.sleep(30)
    return None


if __name__ == '__main__':
    # Variables that contain the user credentials to access the Twitter API.
    # These are placeholders, not real keys; do not commit real credentials.
    consumerKey = 'BSfiAQWf44tc7'
    consumerSecret = 'Nk0x66OaUrHNn4WjC'
    accessToken = '324214621-bkOFZdKv1X9Rd9pTI6TC'
    accessTokenSecret = 'grLo38PqPDQy'

    idsFileName = "./data/twitter_ids.csv"
    dataFileName2 = "./data/Video Transcript.txt"
    randIdsFileName = "./data/randomIds.txt"

    auth = OAuthHandler(consumerKey, consumerSecret)
    auth.set_access_token(accessToken, accessTokenSecret)
    api = tweepy.API(auth)

    # list() keeps the later numpy conversion working even if the reader
    # hands back a lazy iterable.
    ids = list(readFromFileA(idsFileName, splitter=',', lineStart=1, lineEnd=3000))

    # If we are using a randomized control group, this code creates the ids
    # list for them.
    # if getting randoms
    # ids = []
    # with open(randIdsFileName, 'r') as f:
    #     for line in f:
    #         ids.append(int(line))
    #
    # ids = ids[:720]

    # This is only for non-tweet files, such as focus group transcripts.
    # lines = []
    #
    # with open(dataFileName2, 'r') as f:
    #     for line in f:
    #         lines.append(line.lower())

    # This is the tweet id of roughly the time that the news broke that
    # Carson dropped out. Only tweets since that time are pulled.
    carsonDropsOutTweetId = 705885709861715968

    # Hard coded lists of the keywords being searched for. They are used to
    # compile sentiment data for tweets containing each keyword.
    # Each sentiments list will hold tuples: (sentiment, tweetID).
    # note: could include many more keywords like "feelthebern" for example,
    # but neutral keywords are needed to get true sentiments. feelthebern
    # would be a biased term.
    hillarySentiments = []
    hillaryKeywords = ['hillary', 'clinton', 'hillaryclinton']
    trumpSentiments = []
    trumpKeywords = ['trump', 'realdonaldtrump']
    cruzSentiments = []
    cruzKeywords = ['cruz', 'tedcruz']
    bernieSentiments = []
    bernieKeywords = ['bern', 'bernie', 'sanders', 'sensanders']
    obamaSentiments = []
    obamaKeywords = ['obama', 'barack', 'barackobama']
    republicanSentiments = []
    republicanKeywords = ['republican', 'conservative']
    democratSentiments = []
    democratKeywords = ['democrat', 'dems', 'liberal']
    gunsSentiments = []
    gunsKeywords = ['guns', 'gun', 'nra', 'pistol', 'firearm', 'shooting']
    immigrationSentiments = []
    immigrationKeywords = ['immigration', 'immigrants', 'citizenship', 'naturalization', 'visas']
    employmentSentiments = []
    employmentKeywords = ['jobs', 'employment', 'unemployment', 'job']
    inflationSentiments = []
    inflationKeywords = ['inflate', 'inflation', 'price hike', 'price increase', 'prices rais']
    minimumwageupSentiments = []
    minimumwageupKeywords = ['raise minimum wage', 'wage increase', 'raise wage', 'wage hike']
    abortionSentiments = []
    abortionKeywords = ['abortion', 'pro-choice', 'planned parenthood']
    governmentspendingSentiments = []
    governmentspendingKeywords = ['gov spending', 'government spending', 'gov. spending', 'expenditure']
    taxesupSentiments = []
    taxesupKeywords = ['raise tax', 'tax hike', 'taxes up', 'tax up', 'increase taxes', 'taxes increase', 'tax increase']
    taxesdownSentiments = []
    taxesdownKeywords = ['lower tax', 'tax cut', 'tax slash', 'taxes down', 'tax down', 'decrease taxes', 'taxes decrease', 'tax decrease']

    # (nameOfTuple, sentimentList, keywordList)
    personSentimentList = [
        ('hillary', hillarySentiments, hillaryKeywords),
        ('trump', trumpSentiments, trumpKeywords),
        ('cruz', cruzSentiments, cruzKeywords),
        ('bernie', bernieSentiments, bernieKeywords),
        ('obama', obamaSentiments, obamaKeywords),
    ]
    issueSentimentList = [
        ('guns', gunsSentiments, gunsKeywords),
        ('immigration', immigrationSentiments, immigrationKeywords),
        ('employment', employmentSentiments, employmentKeywords),
        ('inflation', inflationSentiments, inflationKeywords),
        ('minimum wage up', minimumwageupSentiments, minimumwageupKeywords),
        ('abortion', abortionSentiments, abortionKeywords),
        ('government spending', governmentspendingSentiments, governmentspendingKeywords),
        ('taxes up', taxesupSentiments, taxesupKeywords),
        ('taxes down', taxesdownSentiments, taxesdownKeywords),
    ]
    # Persons first, then issues: the order in which each tweet is checked.
    allSentimentGroups = personSentimentList + issueSentimentList

    # This bit is for taking random twitter IDs for the control group. It
    # simply skims the most recent tweets that have mentioned one of our
    # keywords. It turned out that skimming all of the tweets found very,
    # very few occurrences of keywords since twitter is such a
    # global/multilingual platform.
    # randIds = []
    # allKeys = []
    # for person in personSentimentList:
    #     for keyWord in person[2]:
    #         allKeys.append(keyWord)
    # for issue in issueSentimentList:
    #     for keyWord in issue[2]:
    #         allKeys.append(keyWord)
    #
    # myStreamListener = MyStreamListener()
    # myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)
    # sys.exit()
    # myStream.filter(track = allKeys)

    # Here is the format for the basic loop for finding text that has the
    # keywords we're searching for. It then finds the sentiment and adds that
    # to the respective keywords' data list. This particular segment uses
    # 'lines', which is for looking through non-tweets (like transcripts of
    # some sort).
    # for line in lines:
    #     for person in personSentimentList:
    #         for keyword in person[2]:
    #             if keyword in line:
    #                 try:
    #                     tb=textblob.TextBlob(line)
    #                     person[1].append((tb.sentiment.polarity, 5))
    #                     break
    #                 except:
    #                     continue
    #
    # for line in lines:
    #     for person in issueSentimentList:
    #         for keyword in person[2]:
    #             if keyword in line:
    #                 try:
    #                     tb=textblob.TextBlob(line)
    #                     person[1].append((tb.sentiment.polarity, 5))
    #                     break
    #                 except:
    #                     continue

    # This big block goes through the tweets of each user, looks for
    # keywords, and if a keyword is there, finds the sentiment for that tweet
    # and adds it to the sentiment data list.
    start = time.time()

    # Use the second CSV column when there is one, otherwise the data as read.
    idArray = np.asarray(ids)
    try:
        ids = idArray[:, 1]
    except IndexError:
        ids = idArray
    try:
        # Builtin int instead of the np.int alias, which newer NumPy removed.
        ids = ids.astype(int)
    except (TypeError, ValueError, OverflowError):
        print("whoops")

    counter = 0  # ids whose timeline was read without an error
    totalIdsWithMentions = 0
    for i, idno in enumerate(ids):
        try:
            idno = int(idno)
        except (TypeError, ValueError):
            # Best effort: carry on with the unconverted value.
            print('idno too long to convert to int')

        # The rate limit is handled here. If the limit cannot be queried
        # (like internet failure) we don't want to crash, so the helper waits
        # 30 seconds and tries again before giving up on this round.
        if i % 2 == 0:
            apiInfo = fetchTimelineRateLimit(api)
            if apiInfo is not None and apiInfo['remaining'] < 2:
                timeToSleep = apiInfo['reset'] - time.time()
                if timeToSleep > 0:
                    printJoined('sleeping for: ', timeToSleep, ' seconds')
                    sys.stdout.flush()
                    time.sleep(timeToSleep + 1)
                else:
                    # NOTE(review): assumed to be the "reset time already
                    # passed" branch; confirm against the original layout.
                    time.sleep(1)

        if i % 100 == 0:
            printJoined("on id number: ", i)
            sys.stdout.flush()

        mentionFlag = False
        try:
            timeline = tweepy.Cursor(api.user_timeline, user_id=idno,
                                     since_id=carsonDropsOutTweetId)
            for status in timeline.items(20):
                statusText = status.text.lower()
                matchedGroups = findMatchingGroups(statusText, allSentimentGroups)
                if not matchedGroups:
                    continue
                # Score the tweet once and share the result between all the
                # groups it mentions.
                polarity = textblob.TextBlob(statusText).sentiment.polarity
                for group in matchedGroups:
                    group[1].append((polarity, status.id))
                mentionFlag = True
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still
            # stops the run; anything else is reported and the id skipped.
            print(sys.exc_info()[0])
            sys.stdout.flush()
        else:
            counter += 1

        # Tally at the end of the iteration so the last id is counted too.
        if mentionFlag:
            totalIdsWithMentions += 1

    # Here we're just compiling the sentiment data for each keyword group
    # into an easier to work with format (dataframe). df will contain the
    # mean, median and mention count data. Note that it is only meaningful if
    # compared with a control group, since keyword selection is impossible to
    # employ neutrally.
    arrayList = summarizeSentiments(allSentimentGroups)

    # Building the frame from the row list keeps native column types and also
    # works when no keyword was found at all.
    df = pandas.DataFrame(arrayList, columns=['name', 'mean', 'median', 'count'])
    # sort_values replaces DataFrame.sort, which newer pandas removed.
    df.sort_values('count', ascending=False, inplace=True)

    print(df)
    printJoined('time taken: ', time.time() - start)
    printJoined('number of ids read: ', counter)
    printJoined('Number of ids with keyword mentions: ', totalIdsWithMentions)