-
Notifications
You must be signed in to change notification settings - Fork 0
/
conference_talks.py
119 lines (103 loc) · 5.21 KB
/
conference_talks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pdb
import time
from random import randint
years = range(1971,2022)
months = ['04', '10']
home_page = 'https://www.churchofjesuschrist.org'
var_names = ['year', 'month', 'speaker', 'title', 'text']
conf_data = []
for year in years:
for month in months:
start_time_month = time.time()
# Download conference html file for given date
conf_page = requests.get(f'{home_page}/general-conference/{year}/{month}?lang=eng')
conf_soup = BeautifulSoup(conf_page.content, 'html.parser')
# Information for each talk in given conference
links = conf_soup.find_all(class_="item-3cCP7")
for link_contents in links:
try:
speaker_name = link_contents.find(class_='subtitle-MuO4X')
if speaker_name is None:
continue
speaker_name = speaker_name.string
subdir = link_contents.get('href')
# Skip talk if there is no html site with talk text (usually general women's meeting)
if re.search('media', subdir) != None:
print(f'Missing talk text: {speaker_name}, {month} {year}')
conf_data.append((year, month, speaker_name, None, None))
continue
except:
print('Exception: Problem getting speaker name')
print(f'{month} {year} Conference')
print(link_contents)
raise
title_obj = link_contents.find('div', class_="itemTitle-23vMm")
title = title_obj.find('p').getText()
# print(f"\nCurrent Talk: {title}")
# print(f"Date: {month}, {year}")
# print(f"Speaker: {speaker_name}")
# Refreshing page up to 3 times if an error occurs
talk_loaded = False
iterations = 0
MAX_ITER = 3
while talk_loaded == False and iterations < MAX_ITER:
# Get talk text
talk_page = requests.get(f'{home_page}{subdir}')
talk_soup = BeautifulSoup(talk_page.content, 'html.parser')
talk_title = talk_soup.title.string
# Check that GET request was successful
if iterations == MAX_ITER:
print(f'Talk failed loading {MAX_ITER} times\nAborting')
raise
elif talk_title == 'Service Not Available':
print(f'WARNING: {month} {year} conference talk by {speaker_name} did not load--{talk_page}')
iterations += 1
time.sleep(randint(2,5))
else:
talk_loaded = True
# Replace lettered references with reference content
references = talk_soup.find_all(id=re.compile('note[0-9]+'))
# Combine references that are from same note
new_references = []
note = ''
for ref in references:
# If this is the beginning of a section of a new footnote, append as new footnote
new_string = ' '.join(ref.stripped_strings)
next_note = re.search('note[0-9]+', ref.get('id')).group(0)
# If the next section is part of the same footnote, add to same footnote
if note == next_note:
new_references[-1] = ' '.join([new_references[-1], new_string])
else:
new_references.append(new_string)
note = next_note
references = new_references
# Replace number in talk body text with formatted citation, be it scripture or other reference
talk_refs = talk_soup.find_all(href=re.compile('note[0-9]+'))
for i, new_string in enumerate(references):
try:
talk_refs[i].string.replace_with(f' ({new_string}) ')
except:
print('Exception: Problem replacing reference string')
print(f'{month} {year} Conference')
print(f'Speaker: {speaker_name}, Talk: {talk_title}')
print(f'Talk total references: {len(references)}, Currently on: {i}')
raise
# Get talk body text after adding references
try:
talk_text = talk_soup.find(class_='body-block').get_text(separator=' ', strip=True)
except:
print('Exception: Problem obtaining body text')
print(f'{month} {year} Conference')
print(f'Speaker: {speaker_name}, Talk: {talk_title}')
raise
conf_data.append((year, month, speaker_name, talk_title, talk_text))
# Compute and report data scraping time for each conference session
time_min_month = round( (time.time()-start_time_month)/60, 2)
print(f'Extracted {month} {year} conference talks in {time_min_month} minutes\n')
conf_df = pd.DataFrame(conf_data, columns=var_names)
conf_df.to_csv('conference.csv', index=False)