Add support for Eventbrite event series to scraper #141

Draft · wants to merge 4 commits into main
17 changes: 17 additions & 0 deletions docker-compose.yml
@@ -13,11 +13,28 @@ services:
image: spokanetech:latest
build:
context: .
command: [
"python",
"-m",
"celery",
"--workdir",
"./src",
"-A",
"spokanetech.celery",
"worker",
"-B",
"-l",
"INFO",
"--events"
]
container_name: worker
ports:
- "5555:5555"
env_file:
- .env
environment:
SPOKANE_TECH_DEV: false
CELERY_BROKER_URL: "redis://redis:6379/0"

redis:
image: redis:7.2
1 change: 1 addition & 0 deletions src/spokanetech/settings.py
@@ -248,6 +248,7 @@
CELERY_TASK_ACKS_LATE = True
CELERY_TIMEZONE = TIME_ZONE
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
CELERY_LATE_ACK = True


# Discord
@@ -0,0 +1,17 @@
# Generated by Django 5.0.8 on 2024-09-11 05:09

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('web', '0014_event_image_techgroup_image'),
]

operations = [
migrations.RemoveField(
model_name='eventbriteorganization',
name='eventbrite_id',
),
]
1 change: 0 additions & 1 deletion src/web/models.py
@@ -115,4 +115,3 @@ def get_absolute_url(self) -> str:
class EventbriteOrganization(models.Model):
tech_group = models.ForeignKey(TechGroup, on_delete=models.CASCADE)
url = models.URLField()
eventbrite_id = models.CharField(max_length=256)
64 changes: 53 additions & 11 deletions src/web/scrapers.py
@@ -3,13 +3,14 @@
import pathlib
import re
import urllib.parse
import zoneinfo
from datetime import datetime, timedelta
from typing import Any, Protocol, TypeAlias, TypeVar
from typing import Any, Callable, Protocol, TypeAlias, TypeVar

import eventbrite.access_methods
import requests
import zoneinfo
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
from bs4.element import Tag
from django.conf import settings
from django.utils import timezone
from eventbrite import Eventbrite
@@ -26,11 +27,16 @@ def get_venue(self, id, **data):


def get_event_description(self, id, **data):
return self.get("/events/{0}/description//".format(id), data=data)
return self.get("/events/{0}/description/".format(id), data=data)


def get_events_for_series(self, id, **data):
return self.get("/series/{0}/events/".format(id), data=data)


setattr(eventbrite.access_methods.AccessMethodsMixin, "get_venue", get_venue)
setattr(eventbrite.access_methods.AccessMethodsMixin, "get_event_description", get_event_description)
setattr(eventbrite.access_methods.AccessMethodsMixin, "get_events_for_series", get_events_for_series)


class Scraper(Protocol[ST]):
@@ -174,7 +180,7 @@ def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
return datetime.fromisoformat(soup.find_all("time")[0]["datetime"])

def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
time: Tag = soup.find_all("time")[0]
time = soup.find_all("time")[0]
matches = self.DURATION_PATTERN.findall(time.text)
if not matches:
raise ValueError("Could not find duration from:", time.text)
@@ -201,18 +207,54 @@ def _parse_tags(self, soup: BeautifulSoup) -> list[models.Tag]:


class EventbriteScraper(Scraper[list[EventScraperResult]]):
ORGANIZATION_ID_PATTERN = re.compile(r"/o/[\w-]+-(\d+)")
EVENT_SERIES_ID_PATTERN = re.compile(r"/e/[\w-]+-(\d+)")

def __init__(self, api_token: str | None = None):
self.client = Eventbrite(api_token or settings.EVENTBRITE_API_TOKEN)
self._location_by_venue_id: dict[str, str] = {}

def scrape(self, organization_id: str) -> list[EventScraperResult]:
response = self.client.get_organizer_events(
organization_id,
status="live",
)
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
def scrape(self, url: str) -> list[EventScraperResult]:
request_func, id = self.get_request_func(url)
request_func = functools.partial(request_func, id)
events = self.paginate_all(request_func, "events")
events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in events]
return events_and_tags

def get_request_func(self, url: str) -> tuple[Callable[..., Any], int]:
"""Parse the API request function and ID from the URL."""
if matches := self.ORGANIZATION_ID_PATTERN.findall(url):
organization_id = matches[0]
return (functools.partial(self.client.get_organizer_events, status="live"), organization_id)
elif matches := self.EVENT_SERIES_ID_PATTERN.findall(url):
event_series_id = matches[0]
return (self.client.get_events_for_series, event_series_id) # type: ignore
else:
raise ValueError(f"invalid Eventbrite url: {url}")

def paginate_all(self, request_func: Callable[..., Any], key: str) -> list:
"""Iterate through all the pages of the request."""
response = request_func()
self.check_response(response)
result = response[key]
if getattr(response, "is_paginated", False):
while response["pagination"]["has_more_items"]:
continuation = response["pagination"]["continuation"]
response = request_func(continuation=continuation)
self.check_response(response)
result = result + response[key]
return result

def check_response(self, response: Any) -> None:
status_code: int = getattr(response, "status_code", 0)
if not status_code:
status_code = response["status_code"]

if status_code >= 400:
raise ValueError(
f"Evenbrite scrape error: [{status_code}] {response["error"]}: {response["error_description"]}"
)

def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[models.Tag]]:
name = eventbrite_event["name"]["text"]
start = datetime.fromisoformat(eventbrite_event["start"]["utc"])
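With these changes the scraper takes a full Eventbrite URL instead of a bare organization ID: get_request_func picks the organizer or series endpoint from the URL, and paginate_all follows the API's continuation tokens. A rough usage sketch (the URLs are the ones from the tests and a valid EVENTBRITE_API_TOKEN is assumed):

# Rough usage sketch; URLs are examples and a valid EVENTBRITE_API_TOKEN is assumed.
from web import scrapers

scraper = scrapers.EventbriteScraper()

# Organizer URL: returns all live events for that organizer.
org_results = scraper.scrape("https://www.eventbrite.com/o/inch360-72020528223")

# Event-series URL: returns every occurrence in the series.
series_results = scraper.scrape(
    "https://www.eventbrite.com/e/cda-machine-learners-ai-ml-club-tickets-640757311367"
)

for event, tags in org_results + series_results:
    print(event.name, event.url)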
2 changes: 1 addition & 1 deletion src/web/services.py
@@ -57,7 +57,7 @@ def save_events(self) -> None:
now = timezone.localtime()
for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
tech_group = eventbrite_organization.tech_group
events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
events_and_tags = self.events_scraper.scrape(eventbrite_organization.url)
for event, _ in events_and_tags:
event.group = tech_group
event.approved_at = now
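Because the service now hands the stored URL straight to the scraper, an EventbriteOrganization row only needs its tech group and public Eventbrite URL. A hypothetical setup in a Django shell, assuming at least one TechGroup already exists:

# Hypothetical data setup (e.g. in `python manage.py shell`); the URL is illustrative.
from web import models

group = models.TechGroup.objects.first()  # assumes a TechGroup row already exists
models.EventbriteOrganization.objects.create(
    tech_group=group,
    url="https://www.eventbrite.com/o/inch360-72020528223",
)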
4 changes: 2 additions & 2 deletions src/web/tasks.py
@@ -18,8 +18,8 @@ def scrape_events_from_meetup():
def scrape_events_from_eventbrite():
"""Scrape upcoming events from Eventbrite."""
events_scraper = scrapers.EventbriteScraper()
meetup_service = services.EventbriteService(events_scraper)
meetup_service.save_events()
eventbrite_service = services.EventbriteService(events_scraper)
eventbrite_service.save_events()


@shared_task()
14 changes: 10 additions & 4 deletions src/web/tests/test_scrapers.py
@@ -1,12 +1,13 @@
import pathlib
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import freezegun
import responses
import pytest
import responses
from django.test import TestCase

from web import models, scrapers
from zoneinfo import ZoneInfo


class TestMeetupHomepageScraper(TestCase):
@@ -126,9 +127,9 @@ class TestEventbriteScraper(TestCase):
To run them, set the `EVENTBRITE_API_TOKEN` environment variable.
"""

def test_scraper(self):
def test_scraper_organization_id(self):
Contributor Author:
Note: this test is failing as the event seems to have disappeared.

scraper = scrapers.EventbriteScraper()
result = scraper.scrape("72020528223")
result = scraper.scrape("https://www.eventbrite.com/o/inch360-72020528223")
actual: models.Event = result[0][0]
assert actual.name == "Spring Cyber - Training Series"
assert actual.description and actual.description.startswith(
@@ -139,3 +140,8 @@ def test_scraper(self):
assert actual.location == "2818 North Sullivan Road #Suite 100, Spokane Valley, WA 99216"
assert actual.url == "https://www.eventbrite.com/e/spring-cyber-training-series-tickets-860181354587"
assert actual.external_id == "860181354587"

def test_scraper_event_series_id(self):
scraper = scrapers.EventbriteScraper()
result = scraper.scrape("https://www.eventbrite.com/e/cda-machine-learners-ai-ml-club-tickets-640757311367")
assert len(result) > 1
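Because paginate_all only needs a callable and a result key, the continuation handling could also be exercised without the live API. A sketch of such a test (not part of this PR), placed in the same test module and assuming the Eventbrite client does no network I/O at construction time:

# Not part of this PR: fakes paginated responses to exercise paginate_all offline.
class FakeResponse(dict):
    is_paginated = True  # paginate_all checks this as an attribute, not a dict key


def test_paginate_all_follows_continuations():
    scraper = scrapers.EventbriteScraper(api_token="dummy-token")
    first = FakeResponse(
        status_code=200,
        events=["a"],
        pagination={"has_more_items": True, "continuation": "abc"},
    )
    second = FakeResponse(
        status_code=200,
        events=["b"],
        pagination={"has_more_items": False},
    )

    def fake_request(continuation=None):
        # First call has no continuation; the follow-up call carries the token.
        return second if continuation else first

    assert scraper.paginate_all(fake_request, "events") == ["a", "b"]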