-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathea-records.py
100 lines (83 loc) · 3.89 KB
/
ea-records.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import re
import argparse
from bs4 import BeautifulSoup
def parse_venue(venue):
    """Split a venue string into country, city and optional stadium.

    Surrounding whitespace is stripped first. The text before the first
    ``', '`` is taken as the stadium, and everything after it is passed to
    ``parse_country_and_city``. A venue without a ``', '`` separator has no
    stadium and is passed to ``parse_country_and_city`` as a whole.

    Returns:
        A ``pd.Series`` with the keys ``country``, ``city`` and ``stadium``
        (``stadium`` is ``None`` when the venue has no stadium part).

    Raises:
        ValueError: propagated from ``parse_country_and_city`` when the
            city/country part cannot be parsed.
    """
    head, separator, tail = venue.strip().partition(', ')
    if separator:
        stadium, location = head, tail
    else:
        # No ', ' present: the whole string is the "City (Country)" part.
        stadium, location = None, head
    city, country = parse_country_and_city(location)
    return pd.Series({'country': country, 'city': city, 'stadium': stadium})
def parse_country_and_city(country_and_city):
    """Extract ``(city, country)`` from text shaped like ``City (Country)``.

    The pattern is anchored at the start of the string only, so anything
    following the closing parenthesis is ignored. The country must consist of
    one or more whitespace-separated words; whitespace between the city and
    the opening parenthesis is not part of the city.

    Returns:
        A ``(city, country)`` tuple of strings.

    Raises:
        ValueError: if the text does not start with a ``City (Country)`` form.
    """
    found = re.match(r'(.+?)\s*\((\w+(?:\s+\w+)*)\)', country_and_city)
    if found is None:
        raise ValueError(
            f"Cannot parse country and city from {country_and_city}")
    city, country = found.groups()
    return city, country
# Maps the field titles found in the HTML table to the column names used in
# the intermediate DataFrame. Fields with any other title are ignored.
_FIELD_COLUMNS = {
    "Discipline": "Discipline",
    "Perf.": "Performance",
    "Wind": "Wind",
    "Competitor": "Competitor",
    "DOB": "DOB",
    "Nat.": "Nation",
    "Venue": "Venue",
    "Date": "Date",
}


def _extract_rows(soup):
    """Return one dict per ``ea-table-row`` div, keyed by column name.

    Each ``ea-table-field`` inside a row contributes its stripped
    ``mobile-field-content`` text under the column that ``_FIELD_COLUMNS``
    assigns to its stripped ``mobile-field-title`` text.
    """
    rows = []
    for row in soup.find_all('div', class_='ea-table-row'):
        record = {}
        for field in row.find_all('div', class_='ea-table-field'):
            title = field.find(
                'div', class_='mobile-field-title').text.strip()
            value = field.find(
                'div', class_='mobile-field-content').text.strip()
            column = _FIELD_COLUMNS.get(title)
            if column is not None:
                record[column] = value
        rows.append(record)
    return rows


def _build_frame(rows, record_type, sex):
    """Turn the scraped row dicts into the cleaned output DataFrame.

    Args:
        rows: list of dicts as produced by ``_extract_rows``.
        record_type: value written to every row of the ``Record Type`` column.
        sex: value written to every row of the ``Sex`` column.

    Returns:
        The DataFrame that is written to the output CSV.

    Raises:
        ValueError: if a ``Date`` value does not match ``%d %b %Y`` or a
            venue cannot be parsed by ``parse_venue``.
    """
    df = pd.DataFrame(rows)
    df["Record Type"] = record_type
    df["Sex"] = sex
    df['Date'] = pd.to_datetime(df['Date'], format='%d %b %Y')
    df = df.rename(columns={"Performance": "Result", "Competitor": "Name"})

    # Row filters. The markers are matched as literal text (regex=False), so
    # no regex escaping is needed.
    keep = (
        # "✱" in the discipline: tagged "ratification" by the original author
        # (presumably records still awaiting ratification - confirm).
        ~df["Discipline"].str.contains("✱", regex=False)
        # Results carrying an "ℹ" marker are dropped as well.
        & ~df["Result"].str.contains("ℹ", regex=False)
        # Rows without a venue cannot be split into city/country below.
        & (df["Venue"] != "")
    )
    # Copy so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one.
    df = df[keep].copy()

    df["Environment"] = df["Venue"].apply(
        lambda venue: "Indoor" if "(i)" in venue else "Outdoor")
    # remove Short Track from discipline
    df['Discipline'] = df['Discipline'].str.replace(
        r' Short Track$', '', regex=True)

    # parse_venue yields (country, city, stadium) in that order; the stadium
    # is not part of the output.
    df[['Venue Country', 'Venue', 'Stadium']] = df['Venue'].apply(parse_venue)
    df = df.drop(columns=['Stadium'])

    if 'DOB' in df.columns:
        # Rows with a blank DOB get an empty name (tagged "Relay" by the
        # original author). Must run while DOB is still text.
        df.loc[df['DOB'].str.strip() == "", 'Name'] = ""
        df['DOB'] = pd.to_datetime(
            df['DOB'], format='%d %b %Y', errors='coerce')
        # Unparseable or missing birth dates are encoded as year -1.
        df['YOB'] = df['DOB'].dt.year.fillna(-1).astype(int)
    return df


def main():
    """Convert a saved records HTML page into a CSV file.

    Command-line arguments: ``--input`` (HTML file), ``--output`` (CSV file),
    ``--recordType`` and ``--sex`` (``Male``, ``Female`` or ``Mixed``); the
    latter two are written as constant columns.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--input', help='HTML file', required=True)
    argparser.add_argument('--output', help='Output CSV file', required=True)
    argparser.add_argument('--recordType', help='Record Type', required=True)
    argparser.add_argument(
        '--sex', choices=["Male", "Female", "Mixed"], required=True)
    args = argparser.parse_args()

    # errors='ignore' drops bytes that are not valid UTF-8 instead of failing.
    with open(args.input, 'r', encoding='utf-8', errors='ignore') as f_in:
        soup = BeautifulSoup(f_in.read(), 'html.parser')

    df = _build_frame(_extract_rows(soup), args.recordType, args.sex)
    df.to_csv(args.output, index=False)


if __name__ == '__main__':
    main()