Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
jaebradley committed Jun 6, 2024
1 parent d8a38be commit 1330186
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 54 deletions.
4 changes: 2 additions & 2 deletions basketball_reference_web_scraper/contracts/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ class SeasonSalaryData:

@dataclasses.dataclass(frozen=True)
class RowData:
player: PlayerData
team_id: str
player: Optional[PlayerData]
team_id: Optional[str]
salary_by_season: Dict[str, Optional[SeasonSalaryData]]
68 changes: 38 additions & 30 deletions basketball_reference_web_scraper/contracts/readers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import enum
from typing import Dict, Any, Optional
from typing import Optional, Callable, Dict, Any

from lxml.etree import Element

from .data import PlayerData


def read_player_cell_data(cell: Element) -> PlayerData:
    """Build a ``PlayerData`` record from a player table cell.

    The player's name is taken from the cell's text content and the
    identifier from the cell's ``data-append-csv`` attribute.

    :param cell: the table cell element holding the player information
    :return: a ``PlayerData`` carrying the extracted name and identifier
    """
    player_name = cell.text_content()
    player_identifier = cell.get("data-append-csv")
    return PlayerData(name=player_name, identifier=player_identifier)


def read_team_identifier_cell_data(cell: Element) -> str:
    """Return the text content of a team cell, used as the team identifier.

    :param cell: the table cell element holding the team identifier
    :return: the cell's text content
    """
    team_identifier = cell.text_content()
    return team_identifier


class Column(enum.Enum):
PLAYER = "player"
TEAM = "team_id"
Expand All @@ -17,43 +25,43 @@ class Column(enum.Enum):
SIXTH_SEASON_SALARY = "y6"


class SingleCellFinder:
class CellIdentifier:
def __init__(self, column: Column):
self.column = column

def find(self, row: Element) -> Optional[Element]:
def identify_cell(self, row: Element) -> Optional[Element]:
matching_cells = row.xpath(f'./td[@data-stat="{self.column.value}"]')
if 1 == len(matching_cells):
return matching_cells[0]


class SingleCellValueReader:
def __init__(self, cell_finder: SingleCellFinder, cell_reader):
self.cell_finder = cell_finder
self.cell_reader = cell_reader

def read(self, row: Element):
cell = self.cell_finder.find(row=row)
if cell:
return self.cell_reader.read(cell=cell)


class RowDataReader:
def __init__(self, cell_readers_by_column: Dict[Column, SingleCellValueReader]):
self.cell_readers_by_column = cell_readers_by_column

def read(self, row: Element) -> Dict[Column, Optional[Any]]:
return dict(map(lambda e: [e[0], e[1].read(row=row)], self.cell_readers_by_column.items()))


class PlayerDataCellReader:
def __init__(self, player_identifier_attribute_name):
self.player_identifier_attribute_name = player_identifier_attribute_name
def __init__(self,
cell_identifiers_by_column: Dict[Column, CellIdentifier],
cell_value_readers_by_column: Dict[Column, Callable[[Element], Any]]):
self.cell_identifiers_by_column = cell_identifiers_by_column
self.cell_value_readers_by_column = cell_value_readers_by_column

def read(self, cell: Element) -> PlayerData:
return PlayerData(name=cell.text_content(), identifier=cell.get(self.player_identifier_attribute_name))


class TeamDataCellReader:
def read(self, cell: Element) -> str:
return cell.text_content()
def read(self, row: Element) -> Dict[Column, Optional[Any]]:
return dict(map(lambda column_and_cell: [
column_and_cell[0],
None if column_and_cell[1] is None \
else self.cell_value_readers_by_column.get(column_and_cell[0])(column_and_cell[1])
], map(
lambda e: [e[0], e[1].identify_cell(row=row)],
self.cell_identifiers_by_column.items())))

@staticmethod
def instance():
    """Create a ``RowDataReader`` configured for the player and team columns.

    Each of ``Column.PLAYER`` and ``Column.TEAM`` gets a ``CellIdentifier``
    for locating its cell and a reader function for extracting its value.
    A new reader is constructed on every call.

    :return: a ``RowDataReader`` covering the player and team columns
    """
    # TODO: don't recreate the object on each invocation
    identifiers = {
        column: CellIdentifier(column=column)
        for column in (Column.PLAYER, Column.TEAM)
    }
    value_readers = {
        Column.PLAYER: read_player_cell_data,
        Column.TEAM: read_team_identifier_cell_data,
    }
    return RowDataReader(
        cell_identifiers_by_column=identifiers,
        cell_value_readers_by_column=value_readers,
    )
11 changes: 5 additions & 6 deletions basketball_reference_web_scraper/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from lxml import html
from lxml.html import HtmlComment

from basketball_reference_web_scraper.contracts.readers import RowDataReader, Column, PlayerDataCellReader, \
SingleCellValueReader, SingleCellFinder
from basketball_reference_web_scraper.contracts.readers import RowDataReader, Column


class BasicBoxScoreRow:
Expand Down Expand Up @@ -1236,8 +1235,8 @@ def losses(self):


class PlayerContractsTableReader:
def __init__(self, row_reader):
self.row_reader = row_reader
def __init__(self, row_data_reader: RowDataReader):
self.row_data_reader = row_data_reader

@property
def column_names_by_identifier(self) -> Dict[str, str]:
Expand All @@ -1249,8 +1248,8 @@ def column_names_by_identifier(self) -> Dict[str, str]:
Column)))

def rows(self, table):
for row_html in table.xpath('./tbody/tr[not[@class]]'):
yield PlayerContractsTableReader.row_reader.read(row=row_html)
for row_html in table.xpath('./tbody/tr[@data-row and not(@class)]'):
yield self.row_data_reader.read(row=row_html)


class PlayerContractsRow:
Expand Down
24 changes: 8 additions & 16 deletions tests/integration/html/test_player_contracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

from lxml import html

from basketball_reference_web_scraper.html import PlayerContractsTableReader, Column, RowDataReader, \
SingleCellValueReader, SingleCellFinder, PlayerDataCellReader
from basketball_reference_web_scraper.contracts.readers import TeamDataCellReader
from basketball_reference_web_scraper.contracts.readers import RowDataReader
from basketball_reference_web_scraper.html import PlayerContractsTableReader, Column


class TestPlayerContractsTable(unittest.TestCase):
Expand Down Expand Up @@ -1072,19 +1071,12 @@ def test(self):
</tbody></table>
"""
)
for row in PlayerContractsTableReader(row_reader=RowDataReader(
{
Column.PLAYER:
SingleCellValueReader(
cell_finder=SingleCellFinder(column=Column.PLAYER),
cell_reader=PlayerDataCellReader(player_identifier_attribute_name="data-append-csv")),
Column.TEAM:
SingleCellValueReader(
cell_finder=SingleCellFinder(column=Column.TEAM),
cell_reader=TeamDataCellReader()
)
}
)).rows(table_html):
count = 0
table_reader = PlayerContractsTableReader(row_data_reader=RowDataReader.instance())
for row in table_reader.rows(table_html):
self.assertIsNotNone(row)
self.assertIsNotNone(row.get(Column.PLAYER))
self.assertIsNotNone(row.get(Column.TEAM))
count += 1

self.assertGreater(count, 0)

0 comments on commit 1330186

Please sign in to comment.