-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
100 lines (74 loc) · 3.08 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import csv
import re
import time

from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def _wait_clickable(driver, xpath, timeout=10):
    """Return the element matching *xpath* once it is clickable.

    Waits up to *timeout* seconds; Selenium's ``WebDriverWait.until`` raises
    ``TimeoutException`` if the element does not become clickable in time.
    """
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )


# Create the driver instance (Firefox).
driver = webdriver.Firefox()
# Navigate to the Twitter search page.
driver.get("https://twitter.com/search?q=%20f&src=typed_query")
# Make sure we actually landed on Twitter. An explicit raise is used instead
# of `assert`, because assertions are stripped when Python runs with -O.
if "Twitter" not in driver.title:
    page_title = driver.title
    # Do not leave an orphaned browser behind when aborting.
    driver.quit()
    raise RuntimeError(f"Expected a Twitter page, got title {page_title!r}")
# Wait for the pop-up to appear, then close it.
not_now_button = _wait_clickable(driver, '//span[text()="Not now"]')
not_now_button.click()
# Go to advanced search.
advanced_search_button = _wait_clickable(
    driver, "//span[text()='Advanced search']/parent::div")
advanced_search_button.click()
# Input the hashtag. Every element below is waited for explicitly: each one
# only exists after the preceding click has been rendered, so an immediate
# find_element() would race the page.
these_hashtags = _wait_clickable(driver, "//input[@name='theseHashtags']")
these_hashtags.click()
these_hashtags.send_keys('hello')
# Click search.
# NOTE(review): absolute XPath, tied to the page layout at the time of
# writing -- expect it to need updating when the markup changes.
search_button = _wait_clickable(
    driver, "/html/body/div[1]/div/div/div[1]/div[2]/div/div/div/div/div/div[2]/div[2]/div/div/div/div[1]/div/div/div/div/div/div[3]/div/div/span/span")
search_button.click()
# Click latest.
# NOTE(review): absolute XPath, same fragility as the search button above.
latest_button = _wait_clickable(
    driver, "/html/body/div[1]/div/div/div[2]/main/div/div/div/div[1]/div/div[1]/div[1]/div[2]/nav/div/div[2]/div/div[2]/a")
latest_button.click()
# Regular expressions for the parts stripped from every tweet: links and
# hashtags.
link_pattern = re.compile(r"http\S+")
hashtag_pattern = re.compile(r"#\w+")
# Maximum number of tweets to scrape.
max_tweets = 100
# Cleaned text of each accepted tweet, in the order it was encountered.
tweet_content = []
# Seconds to wait after each scroll so the page can load more tweets.
scroll_pause_time = 1


def _collect_loaded_tweets(driver, seen_ids, collected, limit):
    """Append cleaned text of new English tweets in the DOM to *collected*.

    *seen_ids* holds the Selenium element ids already examined; it is updated
    in place so an element that stays in the DOM across several scrolls is
    only processed once. Collection stops as soon as *collected* holds
    *limit* entries. If no tweet element shows up within the wait timeout,
    the call returns without collecting anything.
    """
    try:
        tweets = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//div[@data-testid="tweetText"]')
            )
        )
    except TimeoutException:
        # No tweet element appeared in time; nothing to collect this pass.
        return
    for tweet in tweets:
        if len(collected) >= limit:
            break
        if tweet.id in seen_ids:
            continue
        seen_ids.add(tweet.id)
        try:
            content = tweet.text
            lang = tweet.get_attribute("lang")
        except StaleElementReferenceException:
            # The element was removed from the DOM while we were reading it.
            continue
        if lang != "en":
            continue
        content = link_pattern.sub("", content)
        collected.append(hashtag_pattern.sub("", content))


seen_tweet_ids = set()
try:
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Harvest before scrolling so tweets that are already loaded are kept
        # even when the page never grows.
        _collect_loaded_tweets(driver, seen_tweet_ids, tweet_content, max_tweets)
        if len(tweet_content) >= max_tweets:
            break
        # Scroll down the page to load more tweets.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page stopped growing: take one last pass over whatever the
            # final scroll brought into the DOM, then stop.
            _collect_loaded_tweets(
                driver, seen_tweet_ids, tweet_content, max_tweets)
            break
        last_height = new_height
finally:
    # The browser is not needed once scraping is over (or has failed).
    driver.quit()
# Write one tweet per row to tweets.csv.
# newline='' is what the csv module requires of the file object; without it,
# rows get extra blank lines on Windows and newlines embedded in a tweet are
# mangled. UTF-8 is set explicitly so emoji and non-ASCII text do not depend
# on the platform's default encoding.
with open('tweets.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([tweet] for tweet in tweet_content)