Improve Unit Test Coverage and Cache Functionality for QuoteScraper #475

Closed · wants to merge 3 commits
DIRECTORY.md (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
/home/runner/work/_temp/406417ad-64a9-4d03-9ecc-60105918e50f.sh: line 1: scripts/build_directory_md.py: Permission denied
pysnippets/webscrape/scraper.py (68 changes: 36 additions & 32 deletions)
@@ -1,22 +1,20 @@
import asyncio
import aiohttp
import logging
import random
import time
import pickle
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
from typing import List, Dict, Any, Optional
from datetime import datetime
from pathlib import Path
from retry import retry
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import argparse

logging.basicConfig(
    filename='scraper.log',
@@ -35,16 +33,17 @@ class Quote:
class QuoteScraper:
    """A class to handle quote scraping with enhanced readability and performance."""

    def __init__(self, base_url: str, output_dir: str = 'output', cache_file: str = 'cache.pkl') -> None:
    def __init__(self, base_url: str, output_dir: str = 'output', cache_file: str = 'cache.pkl', max_concurrent_requests: int = 5) -> None:
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.cache_file = Path(cache_file)
        self.session = requests.Session()
        self.ua = UserAgent()
        self.quotes_cache: Dict[str, List[Quote]] = self.load_cache()
        self.max_concurrent_requests = max_concurrent_requests
        logging.debug("Initialized QuoteScraper with base_url: %s and output_dir: %s", base_url, output_dir)

    def load_cache(self) -> Dict[str, List[Quote]]:
        """Load cached data from a file if available."""
        if self.cache_file.exists():
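The rest of load_cache and the whole of save_cache are collapsed in this view. As a reference sketch only (the PR's actual bodies may differ), a minimal pickle-based pair consistent with the cache_file='cache.pkl' default and the Dict[str, List[Quote]] annotation could look like:

    # Sketch, not the PR's code: pickle-backed cache keyed by page URL.
    def load_cache(self) -> Dict[str, List[Quote]]:
        """Load cached data from a file if available."""
        if self.cache_file.exists():
            try:
                with open(self.cache_file, 'rb') as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError) as e:
                logging.error("Failed to load cache %s: %s", self.cache_file, e)
        return {}

    def save_cache(self) -> None:
        """Persist the in-memory quotes cache to disk with pickle."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump(self.quotes_cache, f)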
@@ -108,35 +107,37 @@ def _parse_quote_element(self, quote_element: Any) -> Optional[Quote]:
logging.error("Error parsing quote: %s", e)
return None

    async def scrape_page_async(self, page: int) -> List[Quote]:
    async def scrape_page_async(self, page: int, semaphore: asyncio.Semaphore) -> List[Quote]:
        """Asynchronously scrape a single page and return a list of Quote objects."""
        url = f"{self.base_url}/page/{page}/"
        logging.info("Asynchronously scraping page: %d", page)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=self.get_headers()) as response:
                    if response.status == 200:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')
                        quotes = [self._parse_quote_element(quote) for quote in soup.find_all('div', class_='quote') if self._parse_quote_element(quote)]
                        logging.info("Scraped %d quotes from page %d", len(quotes), page)
                        return quotes
                    else:
                        logging.error("Error %d on page %d", response.status, page)
                        return []
        except aiohttp.ClientError as e:
            logging.error("Async scraping error on page %d: %s", page, e)
            return []
        async with semaphore:
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=self.get_headers()) as response:
                        if response.status == 200:
                            html = await response.text()
                            soup = BeautifulSoup(html, 'html.parser')
                            quotes = [self._parse_quote_element(quote) for quote in soup.find_all('div', class_='quote') if self._parse_quote_element(quote)]
                            logging.info("Scraped %d quotes from page %d", len(quotes), page)
                            return quotes
                        else:
                            logging.error("Error %d on page %d", response.status, page)
                            return []
            except aiohttp.ClientError as e:
                logging.error("Async scraping error on page %d: %s", page, e)
                return []

    async def scrape_multiple_pages_async(self, num_pages: int) -> List[Quote]:
        """Scrape multiple pages asynchronously."""
        tasks = [self.scrape_page_async(page) for page in range(1, num_pages + 1)]
        semaphore = asyncio.Semaphore(self.max_concurrent_requests)
        tasks = [self.scrape_page_async(page, semaphore) for page in range(1, num_pages + 1)]
        results = await asyncio.gather(*tasks)
        return [quote for page_quotes in results for quote in page_quotes]

    def scrape_with_threading(self, num_pages: int, max_workers: int = 4) -> List[Quote]:
        """Scrape multiple pages using thread pool."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(self.scrape_static_quotes,
                                f"{self.base_url}/page/{page}/"): page
@@ -162,15 +163,18 @@ def save_to_multiple_formats(self, quotes: List[Quote], base_filename: str) -> None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        base_filename = f"{base_filename}_{timestamp}"

        # Save CSV
        csv_path = self.output_dir / f"{base_filename}.csv"
        df.to_csv(csv_path, index=False, encoding='utf-8')
        logging.debug("Saved quotes to CSV: %s", csv_path)

        # Save JSON
        json_path = self.output_dir / f"{base_filename}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump([quote.__dict__ for quote in quotes], f, ensure_ascii=False, indent=2)
        logging.debug("Saved quotes to JSON: %s", json_path)

        # Save Excel
        excel_path = self.output_dir / f"{base_filename}.xlsx"
        df.to_excel(excel_path, index=False)
        logging.debug("Saved quotes to Excel: %s", excel_path)
pysnippets/webscrape/test_scraper.py (61 changes: 48 additions & 13 deletions)
@@ -3,10 +3,13 @@
from bs4 import BeautifulSoup
from datetime import datetime
from pysnippets.webscrape.scraper import QuoteScraper, Quote
import time


class TestQuoteScraper(unittest.TestCase):

    def setUp(self):
        """Set up the test environment and sample HTML."""
        self.scraper = QuoteScraper('https://quotes.toscrape.com')
        self.sample_html = '''
        <div class="quote">
@@ -17,37 +20,42 @@ def setUp(self):
            </span>
        </div>
        '''
        self.incomplete_html = '''
        <div class="quote">
            <span class="text">“Incomplete quote without author.”</span>
            <span>
                <a class="tag" href="/tag/incomplete/page/1/">incomplete</a>
            </span>
        </div>
        '''
        self.sample_quote = Quote(
            text='“A witty saying proves nothing.”',
            author='Voltaire',
            tags=['wit'],
            scraped_at=datetime.now()  # Dynamic field
        )

    def test_parse_quote_element_valid(self):
        """Test parsing a valid quote element."""
        soup = BeautifulSoup(self.sample_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)

        # Expected quote
        # Ensure the scraped_at field is excluded from comparison
        expected_quote = Quote(
            text='“A witty saying proves nothing.”',
            author='Voltaire',
            tags=['wit'],
            scraped_at=quote.scraped_at  # Dynamic field
            scraped_at=quote.scraped_at  # Allow dynamic scraped_at
        )

        # Check if parsed data matches expected

        self.assertEqual(quote.text, expected_quote.text)
        self.assertEqual(quote.author, expected_quote.author)
        self.assertEqual(quote.tags, expected_quote.tags)

    def test_parse_quote_element_missing_author(self):
        """Test parsing when the author is missing."""
        incomplete_html = '''
        <div class="quote">
            <span class="text">“Incomplete quote without author.”</span>
            <span>
                <a class="tag" href="/tag/incomplete/page/1/">incomplete</a>
            </span>
        </div>
        '''
        soup = BeautifulSoup(incomplete_html, 'html.parser')
        soup = BeautifulSoup(self.incomplete_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)
        self.assertIsNone(quote)  # Expect None due to missing author
@@ -84,6 +92,33 @@ def test_scrape_static_quotes(self, mock_fetch_page):
        self.assertEqual(quotes[0]['author'], "Voltaire")
        self.assertIn("wit", quotes[0]['tags'])

    @patch('pysnippets.webscrape.scraper.QuoteScraper.save_cache')
    @patch('pysnippets.webscrape.scraper.QuoteScraper.load_cache')
    def test_cache_functionality(self, mock_load_cache, mock_save_cache):
        """Test cache loading and saving functionality."""
        mock_load_cache.return_value = {
            'https://quotes.toscrape.com/page/1/': [self.sample_quote]
        }

        # Build the scraper while load_cache is patched so the mocked cache is used
        # (self.scraper from setUp was created before the patch took effect).
        scraper = QuoteScraper('https://quotes.toscrape.com')
        cached_quotes = scraper.quotes_cache
        self.assertEqual(len(cached_quotes), 1)
        self.assertEqual(cached_quotes['https://quotes.toscrape.com/page/1/'][0].text, "“A witty saying proves nothing.”")

        # Trigger saving cache after scrape
        scraper.save_cache()
        mock_save_cache.assert_called_once()

    @patch('pysnippets.webscrape.scraper.datetime')
    def test_parse_quote_element_with_datetime_mock(self, mock_datetime):
        """Test parsing with mocked datetime."""
        mock_datetime.now.return_value = datetime(2025, 1, 5, 10, 0, 0)
        soup = BeautifulSoup(self.sample_html, 'html.parser')
        quote_element = soup.find('div', class_='quote')
        quote = self.scraper._parse_quote_element(quote_element)

        self.assertEqual(quote.scraped_at, datetime(2025, 1, 5, 10, 0, 0))

    def tearDown(self):
        """Clean up after tests if needed."""
        pass
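For completeness, the new tests can be exercised with unittest's standard loader; the runner below is an illustrative sketch (not part of the PR) and assumes the repository root is on the import path:

# Illustrative runner for the updated test module.
import unittest

if __name__ == '__main__':
    suite = unittest.defaultTestLoader.loadTestsFromName('pysnippets.webscrape.test_scraper')
    unittest.TextTestRunner(verbosity=2).run(suite)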