Improve Unit Test Coverage and Cache Functionality for QuoteScraper #475

Closed · wants to merge 3 commits
DIRECTORY.md (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
/home/runner/work/_temp/406417ad-64a9-4d03-9ecc-60105918e50f.sh: line 1: scripts/build_directory_md.py: Permission denied
pysnippets/webscrape/scraper.py (68 changes: 36 additions & 32 deletions)
@@ -1,22 +1,20 @@
import asyncio
import aiohttp
import logging
import random
import time
import pickle
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
from typing import List, Dict, Any, Optional
from datetime import datetime
from pathlib import Path
from retry import retry
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import argparse

logging.basicConfig(
    filename='scraper.log',
@@ -35,16 +33,17 @@ class Quote:
class QuoteScraper:
    """A class to handle quote scraping with enhanced readability and performance."""

    def __init__(self, base_url: str, output_dir: str = 'output', cache_file: str = 'cache.pkl') -> None:
    def __init__(self, base_url: str, output_dir: str = 'output', cache_file: str = 'cache.pkl', max_concurrent_requests: int = 5) -> None:
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.cache_file = Path(cache_file)
        self.session = requests.Session()
        self.ua = UserAgent()
        self.quotes_cache: Dict[str, List[Quote]] = self.load_cache()
        self.max_concurrent_requests = max_concurrent_requests
        logging.debug("Initialized QuoteScraper with base_url: %s and output_dir: %s", base_url, output_dir)

    def load_cache(self) -> Dict[str, List[Quote]]:
        """Load cached data from a file if available."""
        if self.cache_file.exists():
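The rest of load_cache and the whole of save_cache are collapsed in this view. As a reference sketch only (the PR's actual bodies may differ), a minimal pickle-based pair consistent with the cache_file='cache.pkl' default and the Dict[str, List[Quote]] annotation could look like:

    # Sketch, not the PR's code: pickle-backed cache keyed by page URL.
    def load_cache(self) -> Dict[str, List[Quote]]:
        """Load cached data from a file if available."""
        if self.cache_file.exists():
            try:
                with open(self.cache_file, 'rb') as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as e:
                logging.error("Failed to load cache %s: %s", self.cache_file, e)
        return {}

    def save_cache(self) -> None:
        """Persist the in-memory quotes cache to disk with pickle."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump(self.quotes_cache, f)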
@@ -108,35 +107,37 @@ def _parse_quote_element(self, quote_element: Any) -> Optional[Quote]:
logging.error("Error parsing quote: %s", e)
return None

    async def scrape_page_async(self, page: int) -> List[Quote]:
    async def scrape_page_async(self, page: int, semaphore: asyncio.Semaphore) -> List[Quote]:
        """Asynchronously scrape a single page and return a list of Quote objects."""
        url = f"{self.base_url}/page/{page}/"
        logging.info("Asynchronously scraping page: %d", page)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=self.get_headers()) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        quotes = [self._parse_quote_element(quote) for quote in soup.find_all('div', class_='quote') if self._parse_quote_element(quote)]
                        logging.info("Scraped %d quotes from page %d", len(quotes), page)
                        return quotes
                    else:
                        logging.error("Error %d on page %d", response.status, page)
                        return []
        except aiohttp.ClientError as e:
            logging.error("Async scraping error on page %d: %s", page, e)
            return []
        async with semaphore:
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=self.get_headers()) as response:
                        if response.status == 200:
                            html = await response.text()
                            soup = BeautifulSoup(html, 'html.parser')
                            quotes = [self._parse_quote_element(quote) for quote in soup.find_all('div', class_='quote') if self._parse_quote_element(quote)]
                            logging.info("Scraped %d quotes from page %d", len(quotes), page)
                            return quotes
                        else:
                            logging.error("Error %d on page %d", response.status, page)
                            return []
            except aiohttp.ClientError as e:
                logging.error("Async scraping error on page %d: %s", page, e)
                return []

    async def scrape_multiple_pages_async(self, num_pages: int) -> List[Quote]:
        """Scrape multiple pages asynchronously."""
        tasks = [self.scrape_page_async(page) for page in range(1, num_pages + 1)]
        semaphore = asyncio.Semaphore(self.max_concurrent_requests)
        tasks = [self.scrape_page_async(page, semaphore) for page in range(1, num_pages + 1)]
        results = await asyncio.gather(*tasks)
        return [quote for page_quotes in results for quote in page_quotes]

    def scrape_with_threading(self, num_pages: int, max_workers: int = 4) -> List[Quote]:
        """Scrape multiple pages using thread pool."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(self.scrape_static_quotes,
                                f"{self.base_url}/page/{page}/"): page
@@ -162,15 +163,18 @@ def save_to_multiple_formats(self, quotes: List[Quote], base_filename: str) -> None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        base_filename = f"{base_filename}_{timestamp}"

        # Save CSV
        csv_path = self.output_dir / f"{base_filename}.csv"
        df.to_csv(csv_path, index=False, encoding='utf-8')
        logging.debug("Saved quotes to CSV: %s", csv_path)

        # Save JSON
        json_path = self.output_dir / f"{base_filename}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump([quote.__dict__ for quote in quotes], f, ensure_ascii=False, indent=2)
        logging.debug("Saved quotes to JSON: %s", json_path)

        # Save Excel
        excel_path = self.output_dir / f"{base_filename}.xlsx"
        df.to_excel(excel_path, index=False)
        logging.debug("Saved quotes to Excel: %s", excel_path)
pysnippets/webscrape/test_scraper.py (61 changes: 48 additions & 13 deletions)
@@ -3,10 +3,13 @@
from bs4 import BeautifulSoup
from datetime import datetime
from pysnippets.webscrape.scraper import QuoteScraper, Quote
import time


class TestQuoteScraper(unittest.TestCase):

    def setUp(self):
        """Set up the test environment and sample HTML."""
        self.scraper = QuoteScraper('https://quotes.toscrape.com')
        self.sample_html = '''
        <div class="quote">
@@ -17,37 +20,42 @@ def setUp(self):
            </span>
        </div>
        '''
        self.incomplete_html = '''
        <div class="quote">
            <span class="text">“Incomplete quote without author.”</span>
            <span>
                <a class="tag" href="/tag/incomplete/page/1/">incomplete</a>
            </span>
        </div>
        '''
        self.sample_quote = Quote(
            text='“A witty saying proves nothing.”',
            author='Voltaire',
            tags=['wit'],
            scraped_at=datetime.now()  # Dynamic field
        )

    def test_parse_quote_element_valid(self):
        """Test parsing a valid quote element."""
        soup = BeautifulSoup(self.sample_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)

        # Expected quote
        # Ensure the scraped_at field is excluded from comparison
        expected_quote = Quote(
            text='“A witty saying proves nothing.”',
            author='Voltaire',
            tags=['wit'],
            scraped_at=quote.scraped_at  # Dynamic field
            scraped_at=quote.scraped_at  # Allow dynamic scraped_at
        )

        # Check if parsed data matches expected

        self.assertEqual(quote.text, expected_quote.text)
        self.assertEqual(quote.author, expected_quote.author)
        self.assertEqual(quote.tags, expected_quote.tags)

    def test_parse_quote_element_missing_author(self):
        """Test parsing when the author is missing."""
        incomplete_html = '''
        <div class="quote">
            <span class="text">“Incomplete quote without author.”</span>
            <span>
                <a class="tag" href="/tag/incomplete/page/1/">incomplete</a>
            </span>
        </div>
        '''
        soup = BeautifulSoup(incomplete_html, 'html.parser')
        soup = BeautifulSoup(self.incomplete_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)
        self.assertIsNone(quote)  # Expect None due to missing author
@@ -84,6 +92,33 @@ def test_scrape_static_quotes(self, mock_fetch_page):
        self.assertEqual(quotes[0]['author'], "Voltaire")
        self.assertIn("wit", quotes[0]['tags'])

    @patch('pysnippets.webscrape.scraper.QuoteScraper.save_cache')
    @patch('pysnippets.webscrape.scraper.QuoteScraper.load_cache')
    def test_cache_functionality(self, mock_load_cache, mock_save_cache):
        """Test cache loading and saving functionality."""
        mock_load_cache.return_value = {
            'https://quotes.toscrape.com/page/1/': [self.sample_quote]
        }

        # Build the scraper while load_cache is patched so the mocked cache is used
        # (self.scraper from setUp was created before the patch took effect).
        scraper = QuoteScraper('https://quotes.toscrape.com')
        cached_quotes = scraper.quotes_cache
        self.assertEqual(len(cached_quotes), 1)
        self.assertEqual(cached_quotes['https://quotes.toscrape.com/page/1/'][0].text, "“A witty saying proves nothing.”")

        # Trigger saving cache after scrape
        scraper.save_cache()
        mock_save_cache.assert_called_once()

    @patch('pysnippets.webscrape.scraper.datetime')
    def test_parse_quote_element_with_datetime_mock(self, mock_datetime):
        """Test parsing with mocked datetime."""
        mock_datetime.now.return_value = datetime(2025, 1, 5, 10, 0, 0)
        soup = BeautifulSoup(self.sample_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)

        self.assertEqual(quote.scraped_at, datetime(2025, 1, 5, 10, 0, 0))

    def tearDown(self):
        """Clean up after tests if needed."""
        pass
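For completeness, the new tests can be exercised with unittest's standard loader; the runner below is an illustrative sketch (not part of the PR) and assumes the repository root is on the import path:

# Illustrative runner for the updated test module.
import unittest

if __name__ == '__main__':
    suite = unittest.defaultTestLoader.loadTestsFromName('pysnippets.webscrape.test_scraper')
    unittest.TextTestRunner(verbosity=2).run(suite)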