Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
nimarion committed Feb 25, 2024
0 parents commit b731ff2
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 0 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/update.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: Update Records

on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *'

jobs:
update:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setting up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Install requirements
run: pip install -r requirements.txt
- name: Run
run: |
python main.py
git config user.name Github Action
git config user.email [email protected]
git add data/*
timestamp=$(date -u)
git commit -m "update records data: ${timestamp}" || exit 0
git push
118 changes: 118 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import requests
import pandas as pd
import os

def getEvents(sex):
url = f'https://www.tilastopaja.eu/api/inputs/events/nationalrecords/senior/{sex}'
response = requests.get(url)
json = response.json()
data = json["data"]
return [x["value"] for x in data]

def determine_environment(row):
if 'rowTitle' in row and row['rowTitle'] == "indoor":
return "Indoor"
if row['result'].endswith('i'):
return "Indoor"
return "Outdoor"

def isTechnical(discipline):
technical_disciplines = ["HJ", "PV", "LJ", "TJ", "SP", "DT", "HT", "JT"]
for prefix in technical_disciplines:
if discipline.startswith(prefix):
return True
return False

def performance_to_float(performance):
performance = performance.strip().replace(",", ".")
if ":" in performance:
# Running disciplines with format "1:23.45" or "1:23" or "2:29:08"
parts = performance.split(":")
if len(parts) != 2:
return 0
if "." in parts[1]:
sub_parts = parts[1].split(".")
minutes = int(parts[0])
seconds = int(sub_parts[0])
milliseconds = int(sub_parts[1])
return (minutes * 60 + seconds) * 1000 + milliseconds
return int(parts[0]) * 60 + int(parts[1])
else:
# Technical disciplines and sprint disciplines with format "10.23", "1.70"
try:
converted_performance = float(performance)
except ValueError:
return 0
return int(converted_performance * 1000)

def getRecords(sex, event):
url = f'https://www.tilastopaja.eu/api/nationalrecords/world/{sex}?indoor=all&event={event}'
response = requests.get(url)
json = response.json()
if (json['templates'] is None):
return None
records = json['templates'][0]["divs"][0]["tables"][0]["body"]
df = pd.DataFrame(records)

df = df.rename(columns={'country': 'nation'})
# add event column
df['event'] = event
df['sex'] = "Male" if sex == "men" else "Female"
df['type'] = "NR"
df['environment'] = df.apply(determine_environment, axis=1)
# remove all non numeric characters from result epext , :
df['result'] = df['result'].str.replace(r'[^0-9:.]', '', regex=True)

# only keep rows result, nation, venue, venueCountry, date, name, yearOfBirth, sex, event, type, environment
desired_columns = ['result', 'nation', 'venue', 'venueCountry', 'date', 'name', 'yearOfBirth', 'sex', 'event', 'type', 'environment']

valid_columns = []

for col in desired_columns:
if col in df.columns:
valid_columns.append(col)

return df[valid_columns]

def saveRecords(df, output):
output_dir = os.path.dirname(output)
discipline = os.path.basename(output).split(".")[0]
if not os.path.exists(output_dir):
os.makedirs(output_dir, exist_ok=True)

df = df[df['environment'] != 'Indoor']
df = df.drop(columns=['environment'])

df = df.sort_values('date', ascending=True).drop_duplicates(subset=['result', 'name', 'nation'], keep='first')
df = df.sort_values('nation')

df['result'] = df['result'].astype(str)
df['Performance'] = df['result'].apply(performance_to_float)

if isTechnical(discipline):
df = df.sort_values(['nation', 'Performance'], ascending=[True, False])
df = df.groupby('nation').apply(lambda x: x[x['Performance'] == x['Performance'].max()], include_groups=False).reset_index(drop=True)
else:
df = df.sort_values(['nation', 'Performance'], ascending=[True, True])
df = df.groupby('nation').apply(lambda x: x[x['Performance'] == x['Performance'].min()], include_groups=False).reset_index(drop=True)

df = df.drop(columns=['Performance'])

if df.empty:
if os.path.exists(output):
os.remove(output)
return

df.to_csv(output, index=False)

if __name__ == '__main__':
for sex in ["men", "women"]:
events = getEvents(sex)

for event in events:
print(f'Getting records for {event}')
records = getRecords(sex, event)
if records is not None:
saveRecords(records, f'./data/{sex}/{event}.csv')
else:
print(f'No records found for {event}')
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
requests==2.31.0
pandas==2.2.0
pyarrow==15.0.0
lxml==5.1.0
openpyxl==3.1.0

0 comments on commit b731ff2

Please sign in to comment.