This repository has been archived by the owner on Mar 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
96 lines (78 loc) · 3.29 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
# Directory containing the box-score HTML files that this script parses.
SCORE_DIR = "data/scores"

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of the output) the same on every run.
box_scores = sorted(
    os.path.join(SCORE_DIR, f) for f in os.listdir(SCORE_DIR) if f.endswith(".html")
)
def parse_html(box_score):
    """Parses and cleans the html for further processing.

    Args:
        box_score: Path of the box-score HTML file to load.

    Returns:
        The parsed BeautifulSoup tree with every ``tr.over_header`` and
        ``tr.thead`` row removed.

    Raises:
        OSError, ValueError: If the file cannot be read. The path is printed
            first so the offending file can be identified.
    """
    with open(box_score, encoding="utf-8", errors="ignore") as f:
        try:
            html = f.read()
        except (OSError, ValueError):
            # Report which file failed, then propagate the real error instead
            # of falling through with ``html`` unbound.
            print(box_score)
            raise
    soup = BeautifulSoup(html, "html.parser")
    # Decomposing unwanted table assets, the dataframe gets messy otherwise
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup
def read_line_score(soup):
    """Read the ``line_score`` table and keep only team and final total.

    Args:
        soup: Parsed page (anything whose ``str()`` is the HTML markup).

    Returns:
        A DataFrame with two columns: ``team`` (the table's first column) and
        ``total`` (the table's last column).
    """
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated since pandas 2.1.
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols
    # Removed quarterly score totals because pandas gets weird with OT
    # TODO: find efficient way to store OT data without messing with the dataframe
    line_score = line_score[["team", "total"]]
    return line_score
def read_stats(soup, team, stat):
    """Read one team's box-score table as an all-numeric DataFrame.

    Args:
        soup: Parsed page (anything whose ``str()`` is the HTML markup).
        team: Team identifier used in the table id.
        stat: Table variant used in the table id (the script passes
            ``"basic"`` and ``"advanced"``).

    Returns:
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column, with every cell converted to a number; cells that cannot be
        converted become NaN.
    """
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated since pandas 2.1.
    df = pd.read_html(
        StringIO(str(soup)),
        attrs={'id': f'box-{team}-game-{stat}'},
        index_col=0,
    )[0]
    df = df.apply(pd.to_numeric, errors="coerce")
    return df
def read_season_info(soup):
    """Return the season label found in the page's bottom navigation.

    The label is the file name of the second link inside
    ``#bottom_nav_container``, cut off at its first underscore.
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]
    link_targets = [anchor["href"] for anchor in bottom_nav.find_all("a")]
    file_name = os.path.basename(link_targets[1])
    season, _, _ = file_name.partition("_")
    return season
# Main aggregation: turn every box-score file into a two-row frame (one row per
# team, each row also carrying the opponent's values with an "_opp" suffix),
# then stack all games and write them to nba_games.csv.

# Column labels shared by every team summary; fixed by the first team processed.
base_cols = None
games = []
for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        # Last row of each table (presumably the team-totals row), joined into
        # a single series with lower-cased labels.
        totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
        totals.index = totals.index.str.lower()

        # Column-wise maximum over every row except the last, labelled "<stat>_max".
        maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pd.concat([totals, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            # BPM exists in some box scores and not others, causes issues, safer to remove
            base_cols = [b for b in base_cols if "bpm" not in b]

        # Restrict every summary to the same labels so the rows line up across games.
        summary = summary[base_cols]
        summaries.append(summary)

    # One row per team, stat labels as columns, with team name and total appended.
    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)
    # NOTE(review): assumes exactly two teams, listed away first then home — confirm.
    game["home"] = [0, 1]

    # Reverse the row order so each team lines up with its opponent's values.
    # reset_index() without drop=True also inserts the old index as an "index"
    # column, which ends up in the output as "index_opp".
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)

    # The first eight characters of the file name are parsed as a YYYYMMDD date.
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    # Progress report every 5 games.
    if len(games) % 5 == 0:
        print(f"{len(games)} / {len(box_scores)}")

games_df = pd.concat(games, ignore_index=True)
print(games_df)
games_df.to_csv("nba_games.csv")