Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape and save event images for Events #142

Merged
merged 2 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 70 additions & 25 deletions src/web/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import pathlib
import re
import urllib.parse
import zoneinfo
from datetime import datetime, timedelta
from typing import Any, Protocol, TypeAlias, TypeVar

import eventbrite.access_methods
import requests
import zoneinfo
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
from bs4.element import Tag
from django.conf import settings
from django.utils import timezone
from eventbrite import Eventbrite
Expand All @@ -26,7 +27,7 @@ def get_venue(self, id, **data):


def get_event_description(self, id, **data):
return self.get("/events/{0}/description//".format(id), data=data)
return self.get("/events/{0}/description/".format(id), data=data)


setattr(eventbrite.access_methods.AccessMethodsMixin, "get_venue", get_venue)
Expand All @@ -39,10 +40,23 @@ def scrape(self, url: str) -> ST:
...


EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag]]
ImageResult: TypeAlias = tuple[str, bytes]
EventScraperResult: TypeAlias = tuple[models.Event, list[models.Tag], ImageResult | None]


class ScraperMixin:
    """Image-download helpers shared by the event scrapers."""

    def _get_image(self, image_url: str) -> ImageResult:
        """Download the image at ``image_url``.

        Returns:
            A ``(file name, raw bytes)`` pair. The file name is derived from
            the URL by ``_parse_image_name``.

        Raises:
            requests.HTTPError: If the response carries an error status
                (raised by ``raise_for_status``).
        """
        image_name = self._parse_image_name(image_url)
        # Explicit timeout so a stalled image host cannot block the scrape forever.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        return image_name, response.content

    def _parse_image_name(self, image_url: str) -> str:
        """Return the last path segment of ``image_url`` as the file name.

        Only the URL's path component is inspected, so a "/" inside the query
        string or a trailing "#fragment" cannot leak into the file name.
        Percent-encoding in the path is left untouched.
        """
        path = urllib.parse.urlsplit(image_url).path
        return path.rsplit("/", maxsplit=1)[-1]

class MeetupScraperMixin:

class MeetupScraperMixin(ScraperMixin):
"""Common Meetup scraping functionality."""

def _parse_apollo_state(self, soup: BeautifulSoup) -> dict:
Expand Down Expand Up @@ -84,8 +98,8 @@ def scrape(self, url: str) -> list[str]:
else:
upcoming_section = soup.find_all(id="upcoming-section")[0]
events = upcoming_section.find_all_next(id=re.compile(r"event-card-"))
filtered_event_containers = [event for event in events if self._filter_event_tag(event)]
event_urls = [event_container["href"] for event_container in filtered_event_containers]
filtered_event_containers: list[Tag] = [event for event in events if self._filter_event_tag(event)] # type: ignore
event_urls: list[str] = [event_container["href"] for event_container in filtered_event_containers] # type: ignore

return [url for url in event_urls if self._filter_repeating_events(url)]

Expand Down Expand Up @@ -136,27 +150,31 @@ def scrape(self, url: str) -> EventScraperResult:
location_data = apollo_state[event_json["venue"]["__ref"]]
location = f"{location_data['address']}, {location_data['city']}, {location_data['state']}"
external_id = event_json["id"]
event_photo = event_json["featuredEventPhoto"]["__ref"]
image_url = apollo_state[event_photo].get("highResUrl", apollo_state[event_photo]["baseUrl"])
except KeyError:
name = self._parse_name(soup)
description = self._parse_description(soup)
date_time = self._parse_date_time(soup)
duration = self._parse_duration(soup)
location = self._parse_location(soup)
external_id = self._parse_external_id(url)
image_url = self._parse_image(soup)

if image_url:
image_result = self._get_image(image_url)

tags = self._parse_tags(soup)
return (
models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
),
tags,
event = models.Event(
name=name,
description=description,
date_time=date_time,
duration=duration,
location=location,
external_id=external_id,
url=url,
)
return (event, tags, image_result)

def _parse_name(self, soup: BeautifulSoup) -> str:
name: str = soup.find_all("h1")[0].text
Expand All @@ -171,10 +189,16 @@ def _parse_description(self, soup: BeautifulSoup) -> str:
return description

def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
return datetime.fromisoformat(soup.find_all("time")[0]["datetime"])
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
dt: str = time["datetime"] # type: ignore
return datetime.fromisoformat(dt)

def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
time: Tag = soup.find_all("time")[0]
time: Tag | None = soup.find("time") # type: ignore
if not time:
raise ValueError("could not find time")
matches = self.DURATION_PATTERN.findall(time.text)
if not matches:
raise ValueError("Could not find duration from:", time.text)
Expand All @@ -199,8 +223,18 @@ def _parse_tags(self, soup: BeautifulSoup) -> list[models.Tag]:
tags = [re.sub(r"\s+", " ", t.text) for t in tags] # Some tags have newlines & extra spaces
return [models.Tag(value=t) for t in tags]

def _parse_image(self, soup: BeautifulSoup) -> str | None:
    """Return the URL of the event description image, or ``None`` if absent.

    Finds the element marked ``data-testid="event-description-image"`` and
    reads the ``src`` attribute of the first ``<img>`` inside it. A missing
    container, a missing ``<img>``, or an ``<img>`` without ``src`` all
    yield ``None``.
    """
    picture = soup.find(attrs={"data-testid": "event-description-image"})
    if not picture:
        return None
    img: Tag | None = picture.find("img")  # type: ignore
    if not img:
        return None
    # ``.get`` rather than indexing: the image is optional, so an <img> with
    # no ``src`` should read as "no image" instead of raising KeyError.
    src: str | None = img.get("src")  # type: ignore
    return src


class EventbriteScraper(Scraper[list[EventScraperResult]]):
class EventbriteScraper(ScraperMixin, Scraper[list[EventScraperResult]]):
def __init__(self, api_token: str | None = None):
self.client = Eventbrite(api_token or settings.EVENTBRITE_API_TOKEN)
self._location_by_venue_id: dict[str, str] = {}
Expand All @@ -209,11 +243,12 @@ def scrape(self, organization_id: str) -> list[EventScraperResult]:
response = self.client.get_organizer_events(
organization_id,
status="live",
expand="logo",
)
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return events_and_tags
results = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
return results

def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[models.Tag]]:
def map_to_event(self, eventbrite_event: dict) -> EventScraperResult:
name = eventbrite_event["name"]["text"]
start = datetime.fromisoformat(eventbrite_event["start"]["utc"])
end = datetime.fromisoformat(eventbrite_event["end"]["utc"])
Expand All @@ -230,6 +265,16 @@ def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[model
# short description
description = eventbrite_event["description"]["html"]

try:
image_url = eventbrite_event["logo"]["original"]["url"]
image_result = self._get_image(image_url)
except (KeyError, requests.HTTPError):
try:
image_url = eventbrite_event["logo"]["url"]
image_result = self._get_image(image_url)
except KeyError:
image_result = None

event = models.Event(
name=name,
description=description,
Expand All @@ -249,7 +294,7 @@ def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[model
# if subcategory_name:
# tags.append(models.Tag(value=subcategory_name))

return event, []
return event, [], image_result

@functools.lru_cache
def _get_venue_location(self, venue_id: str) -> str:
Expand Down
95 changes: 68 additions & 27 deletions src/web/services.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,91 @@
from datetime import timedelta
from typing import Protocol

from django.core.files.base import ContentFile
from django.forms.models import model_to_dict
from django.utils import timezone

from web import models, scrapers


class EventService:
    """Persist scraped events together with their tags and image."""

    def save_event_from_result(
        self,
        result: scrapers.EventScraperResult,
        tech_group: models.TechGroup,
    ) -> None:
        """Save one scraper result on behalf of ``tech_group``.

        ``result`` is unpacked as ``(event, tags, image_result)``. The image
        step is skipped when ``image_result`` is ``None``.
        """
        event, tags, image_result = result
        saved_event = self._save_event(event, tech_group)
        self._save_tags(saved_event, tags)
        if image_result is not None:
            self._save_image(saved_event, image_result)

    def _save_event(
        self,
        event: models.Event,
        tech_group: models.TechGroup,
    ) -> models.Event:
        """Create or update the stored event matching ``event.external_id``.

        Returns:
            The persisted ``models.Event`` returned by ``update_or_create``.
        """
        event.group = tech_group
        event.approved_at = timezone.localtime()

        # "tags" is excluded because a Many-to-Many relationship can't be
        # applied until after the event has been saved; "image" is excluded
        # because it is written separately by ``_save_image``.
        defaults = model_to_dict(event, exclude=["id", "tags", "image"])
        # Overwrite whatever model_to_dict produced for "group" with the
        # TechGroup instance itself.
        defaults["group"] = tech_group

        updated_event, _ = models.Event.objects.update_or_create(
            external_id=event.external_id,
            defaults=defaults,
        )
        return updated_event

    def _save_tags(
        self,
        event: models.Event,
        tags: list[models.Tag],
    ) -> None:
        """Attach each tag to ``event``, creating missing ``Tag`` rows."""
        for tag in tags:
            # Look the tag up by its text value instead of passing the model
            # instance itself as the lookup value.
            saved_tag, _ = models.Tag.objects.get_or_create(value=tag.value)
            event.tags.add(saved_tag)

    def _save_image(
        self,
        event: models.Event,
        image_result: scrapers.ImageResult,
    ) -> None:
        """Store the scraped image on ``event`` unless it is unchanged.

        ``image_result`` is a ``(file name, raw bytes)`` pair.
        """
        image_name, image = image_result

        # If images are the same, don't re-upload
        if event.image:
            # Open via a context manager so the stored file is closed again
            # once its bytes have been compared.
            with event.image.open("rb") as existing_file:
                if existing_file.read() == image:
                    return

        event.image.save(image_name, ContentFile(image, name=image_name))


class MeetupService:
def __init__(
self,
homepage_scraper: scrapers.Scraper[list[str]] | None = None,
event_scraper: scrapers.Scraper[scrapers.EventScraperResult] | None = None,
event_service: EventService | None = None,
) -> None:
self.homepage_scraper: scrapers.Scraper[list[str]] = homepage_scraper or scrapers.MeetupHomepageScraper()
self.event_scraper: scrapers.Scraper[scrapers.EventScraperResult] = (
event_scraper or scrapers.MeetupEventScraper()
)
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Scrape upcoming events from Meetup and save them to the database."""
now = timezone.localtime()
for tech_group in models.TechGroup.objects.filter(homepage__icontains="meetup.com"):
event_urls = self.homepage_scraper.scrape(tech_group.homepage) # type: ignore
for event_url in event_urls: # TODO: parallelize (with async?)
event, tags = self.event_scraper.scrape(event_url)
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group

del defaults["tags"] # Can't apply Many-to-Many relationship until after the event has been saved.
new_event, _ = models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
for tag in tags:
tag, _ = models.Tag.objects.get_or_create(value=tag)
new_event.tags.add(tag)
result = self.event_scraper.scrape(event_url)
self.event_service.save_event_from_result(result, tech_group)


class EventbriteService:
Expand All @@ -46,28 +94,21 @@ class EventbriteService:
def __init__(
self,
events_scraper: scrapers.Scraper[list[scrapers.EventScraperResult]] | None = None,
event_service: EventService | None = None,
) -> None:
self.events_scraper = events_scraper or scrapers.EventbriteScraper()
self.event_service = event_service or EventService()

def save_events(self) -> None:
"""Fetch upcoming events from Eventbrite and save them.

Note: this uses an API and doesn't actually web scrape.
"""
now = timezone.localtime()
for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
tech_group = eventbrite_organization.tech_group
events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for event, _ in events_and_tags:
event.group = tech_group
event.approved_at = now
defaults = model_to_dict(event, exclude=["id"])
defaults["group"] = tech_group
del defaults["tags"] # Can't apply Many-to-Many relationship until after the event has been saved.
models.Event.objects.update_or_create(
external_id=event.external_id,
defaults=defaults,
)
results = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
for result in results:
self.event_service.save_event_from_result(result, tech_group)


class Sender(Protocol):
Expand Down
3 changes: 3 additions & 0 deletions src/web/tests/data/eventbrite/event_description.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"description": "<div>Full Day of Panels, Speakers and Vendors on Cybersecurity, AI and Compliance. FREE with pre-registration - space limited - Oct 2, 2024"
}
Binary file added src/web/tests/data/eventbrite/event_image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions src/web/tests/data/eventbrite/event_venue.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"address": {
"address_1": "702 East Desmet Avenue",
"address_2": "",
"city": "Spokane",
"region": "WA",
"postal_code": "99202",
"country": "US",
"latitude": "47.6672448",
"longitude": "-117.3999126",
"localized_address_display": "702 East Desmet Avenue, Spokane, WA 99202",
"localized_area_display": "Spokane, WA",
"localized_multi_line_address_display": [
"702 East Desmet Avenue",
"Spokane, WA 99202"
]
},
"resource_uri": "https://www.eventbriteapi.com/v3/venues/214450569/",
"id": "214450569",
"age_restriction": null,
"capacity": null,
"name": "John J. Hemmingson Center",
"latitude": "47.6672448",
"longitude": "-117.3999126"
}
Loading