Merge pull request #142 from SpokaneTech/joeriddles/138-scrape-images
joeriddles authored Sep 16, 2024
2 parents e2cfdb7 + da0f6a4 commit ba0f2db
Showing 10 changed files with 385 additions and 75 deletions.
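At a glance, the change threads an optional downloaded image through every scraper result. A minimal sketch of the new result shapes, taken from the type aliases added in src/web/scrapers.py below:

from typing import TypeAlias

from web import models

# (image file name, raw image bytes), as produced by ScraperMixin._get_image
ImageResult: TypeAlias = tuple[str, bytes]

# Scrapers now return the event, its tags, and an optional image (or None).
EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag], ImageResult | None]

Both the Meetup and Eventbrite scrapers produce this three-element tuple, and the new EventService.save_event_from_result in src/web/services.py consumes it.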
95 changes: 70 additions & 25 deletions src/web/scrapers.py
@@ -3,13 +3,14 @@
import pathlib
import re
import urllib.parse
import zoneinfo
from datetime import datetime, timedelta
from typing import Any, Protocol, TypeAlias, TypeVar

import eventbrite.access_methods
import requests
import zoneinfo
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
from bs4.element import Tag
from django.conf import settings
from django.utils import timezone
from eventbrite import Eventbrite
@@ -26,7 +27,7 @@ def get_venue(self, id, **data):


def get_event_description(self, id, **data):
return self.get("/events/{0}/description//".format(id), data=data)
return self.get("/events/{0}/description/".format(id), data=data)


setattr(eventbrite.access_methods.AccessMethodsMixin, "get_venue", get_venue)
@@ -39,10 +40,23 @@ def scrape(self, url: str) -> ST:
...


EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag]]
ImageResult: TypeAlias = tuple[str, bytes]
EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag], ImageResult | None]


class ScraperMixin:
def _get_image(self, image_url: str) -> ImageResult:
image_name = self._parse_image_name(image_url)
response = requests.get(image_url, timeout=10)
response.raise_for_status()
image = response.content
return image_name, image

def _parse_image_name(self, image_url: str) -> str:
return image_url.rsplit("/", maxsplit=1)[-1].split("?", maxsplit=1)[0]

class MeetupScraperMixin:

class MeetupScraperMixin(ScraperMixin):
"""Common Meetup scraping functionality."""

def _parse_apollo_state(self, soup: BeautifulSoup) -> dict:
@@ -84,8 +98,8 @@ def scrape(self, url: str) -> list[str]:
else:
upcoming_section = soup.find_all(id="upcoming-section")[0]
events = upcoming_section.find_all_next(id=re.compile(r"event-card-"))
filtered_event_containers = [event for event in events if self._filter_event_tag(event)]
event_urls = [event_container["href"] for event_container in filtered_event_containers]
filtered_event_containers: list[Tag] = [event for event in events if self._filter_event_tag(event)] # type: ignore
event_urls: list[str] = [event_container["href"] for event_container in filtered_event_containers] # type: ignore

return [url for url in event_urls if self._filter_repeating_events(url)]

@@ -136,27 +150,31 @@ def scrape(self, url: str) -> EventScraperResult:
location_data = apollo_state[event_json["venue"]["__ref"]]
location = f"{location_data['address']}, {location_data['city']}, {location_data['state']}"
external_id = event_json["id"]
event_photo = event_json["featuredEventPhoto"]["__ref"]
image_url = apollo_state[event_photo].get("highResUrl", apollo_state[event_photo]["baseUrl"])
except KeyError:
name = self._parse_name(soup)
description = self._parse_description(soup)
date_time = self._parse_date_time(soup)
duration = self._parse_duration(soup)
location = self._parse_location(soup)
external_id = self._parse_external_id(url)
image_url = self._parse_image(soup)

if image_url:
image_result = self._get_image(image_url)

tags = self._parse_tags(soup)
return (
models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
),
tags,
event = models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
)
return (event, tags, image_result)

def _parse_name(self, soup: BeautifulSoup) -> str:
name: str = soup.find_all("h1")[0].text
@@ -171,10 +189,16 @@ def _parse_description(self, soup: BeautifulSoup) -> str:
return description

def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
return datetime.fromisoformat(soup.find_all("time")[0]["datetime"])
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
dt: str = time["datetime"] # type: ignore
return datetime.fromisoformat(dt)

def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
time: Tag = soup.find_all("time")[0]
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
matches = self.DURATION_PATTERN.findall(time.text)
if not matches:
raise ValueError("Could not find duration from:", time.text)
@@ -199,8 +223,18 @@ def _parse_tags(self, soup: BeautifulSoup) -> list[models.Tag]:
tags = [re.sub(r"\s+", " ", t.text) for t in tags] # Some tags have newlines & extra spaces
return [models.Tag(value=t) for t in tags]

def _parse_image(self, soup: BeautifulSoup) -> str | None:
picture = soup.find(attrs={"data-testid": "event-description-image"})
if not picture:
return None
img: Tag | None = picture.find("img") # type: ignore
if not img:
return None
src: str = img["src"] # type: ignore
return src


class EventbriteScraper(Scraper[list[EventScraperResult]]):
class EventbriteScraper(ScraperMixin, Scraper[list[EventScraperResult]]):
def __init__(self, api_token: str | None = None):
self.client = Eventbrite(api_token or settings.EVENTBRITE_API_TOKEN)
self._location_by_venue_id: dict[str, str] = {}
@@ -209,11 +243,12 @@ def scrape(self, organization_id: str) -> list[EventScraperResult]:
response = self.client.get_organizer_events(
organization_id,
status="live",
expand="logo",
)
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return events_and_tags
results = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return results

def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[models.Tag]]:
def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
name = eventbrite_event["name"]["text"]
start = datetime.fromisoformat(eventbrite_event["start"]["utc"])
end = datetime.fromisoformat(eventbrite_event["end"]["utc"])
@@ -230,6 +265,16 @@ def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[model
# short description
description = eventbrite_event["description"]["html"]

try:
image_url = eventbrite_event["logo"]["original"]["url"]
image_result = self._get_image(image_url)
except (KeyError, requests.HTTPError):
try:
image_url = eventbrite_event["logo"]["url"]
image_result = self._get_image(image_url)
except KeyError:
image_result = None

event = models.Event(
name=name,
description=description,
@@ -249,7 +294,7 @@ def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[model
# if subcategory_name:
# tags.append(models.Tag(value=subcategory_name))

return event, []
return event, [], image_result

@functools.lru_cache
def _get_venue_location(self, venue_id: str) -> str:
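For context on the shared helpers introduced above: _parse_image_name keeps only the final path segment of the image URL and drops any query string, and _get_image downloads the bytes with a 10-second timeout. A rough equivalent, using a hypothetical URL:

import requests

image_url = "https://img.example.com/events/banner.jpg?w=512"  # hypothetical URL

# Keep the last path segment and strip the query string, as _parse_image_name does.
image_name = image_url.rsplit("/", maxsplit=1)[-1].split("?", maxsplit=1)[0]
# image_name == "banner.jpg"

# Fetch the bytes and raise on HTTP errors, as _get_image does.
response = requests.get(image_url, timeout=10)
response.raise_for_status()
image_result = (image_name, response.content)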
95 changes: 68 additions & 27 deletions src/web/services.py
@@ -1,43 +1,91 @@
from datetime import timedelta
from typing import Protocol

from django.core.files.base import ContentFile
from django.forms.models import model_to_dict
from django.utils import timezone

from web import models, scrapers


class EventService:
def save_event_from_result(
self,
result: scrapers.EventScraperResult,
tech_group: models.TechGroup,
) -> None:
event, tags, image_result = result
event = self._save_event(event, tech_group)
self._save_tags(event, tags)
if image_result is not None:
self._save_image(event, image_result)

def _save_event(
self,
event: models.Event,
tech_group: models.TechGroup,
) -> models.Event:
event.group = tech_group
event.approved_at = timezone.localtime()
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group

        del defaults["tags"]  # Can't apply Many-to-Many relationship until after the event has been saved.
del defaults["image"]

updated_event, _ = models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
return updated_event

def _save_tags(
self,
event: models.Event,
tags: list[models.Tag],
) -> None:
for tag in tags:
tag, _ = models.Tag.objects.get_or_create(value=tag)
event.tags.add(tag)

def _save_image(
self,
event: models.Event,
image_result: scrapers.ImageResult,
) -> None:
image_name, image = image_result

# If images are the same, don't re-upload
has_existing_image = bool(event.image)
if has_existing_image:
existing_image = event.image.read()
if existing_image == image:
return

file = ContentFile(image, name=image_name)
event.image.save(image_name, file)


class MeetupService:
def __init__(
self,
homepage_scraper: scrapers.Scraper[list[str]] | None = None,
event_scraper: scrapers.Scraper[scrapers.EventScraperResult] | None = None,
event_service: EventService | None = None,
) -> None:
self.homepage_scraper: scrapers.Scraper[list[str]] = homepage_scraper or scrapers.MeetupHomepageScraper()
self.event_scraper: scrapers.Scraper[scrapers.EventScraperResult] = (
event_scraper or scrapers.MeetupEventScraper()
)
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Scrape upcoming events from Meetup and save them to the database."""
now = timezone.localtime()
for tech_group in models.TechGroup.objects.filter(homepage__icontains="meetup.com"):
event_urls = self.homepage_scraper.scrape(tech_group.homepage) # type: ignore
for event_url in event_urls: # TODO: parallelize (with async?)
event, tags = self.event_scraper.scrape(event_url)
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group

del defaults["tags"] # Can't apply Many-to-Many relationship untill after the event has been saved.
new_event, _ = models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
for tag in tags:
tag, _ = models.Tag.objects.get_or_create(value=tag)
new_event.tags.add(tag)
result = self.event_scraper.scrape(event_url)
self.event_service.save_event_from_result(result, tech_group)


class EventbriteService:
@@ -46,28 +94,21 @@ class EventbriteService:
def __init__(
self,
events_scraper: scrapers.Scraper[list[scrapers.EventScraperResult]] | None = None,
event_service: EventService | None = None,
) -> None:
self.events_scraper = events_scraper or scrapers.EventbriteScraper()
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Fetch upcoming events from Eventbrite and save them.
Note: this uses an API and doesn't actually web scrape.
"""
now = timezone.localtime()
for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
tech_group = eventbrite_organization.tech_group
events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for event, _ in events_and_tags:
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group
del defaults["tags"] # Can't apply Many-to-Many relationship untill after the event has been saved.
models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
results = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for result in results:
self.event_service.save_event_from_result(result, tech_group)


class Sender(Protocol):
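To see how the two services now share persistence logic, here is a rough sketch of driving a single scrape by hand through the new EventService (the event URL is hypothetical; MeetupService.save_events does this in a loop over each group's upcoming events):

from web import models, scrapers, services

event_service = services.EventService()
scraper = scrapers.MeetupEventScraper()

tech_group = models.TechGroup.objects.filter(homepage__icontains="meetup.com").first()
result = scraper.scrape("https://www.meetup.com/example-group/events/123456789/")  # hypothetical URL

# Saves or updates the event by external_id, attaches tags, and uploads the
# scraped image only when its bytes differ from the image already stored.
event_service.save_event_from_result(result, tech_group)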
3 changes: 3 additions & 0 deletions src/web/tests/data/eventbrite/event_description.json
@@ -0,0 +1,3 @@
{
"description": "<div>Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024"
}
Binary file added src/web/tests/data/eventbrite/event_image.jpg
25 changes: 25 additions & 0 deletions src/web/tests/data/eventbrite/event_venue.json
@@ -0,0 +1,25 @@
{
"address": {
"address_1": "702 East Desmet Avenue",
"address_2": "",
"city": "Spokane",
"region": "WA",
"postal_code": "99202",
"country": "US",
"latitude": "47.6672448",
"longitude": "-117.3999126",
"localized_address_display": "702 East Desmet Avenue, Spokane, WA 99202",
"localized_area_display": "Spokane, WA",
"localized_multi_line_address_display": [
"702 East Desmet Avenue",
"Spokane, WA 99202"
]
},
"resource_uri": "https://www.eventbriteapi.com/v3/venues/214450569/",
"id": "214450569",
"age_restriction": null,
"capacity": null,
"name": "John J. Hemmingson Center",
"latitude": "47.6672448",
"longitude": "-117.3999126"
}