python
Utilities
Web Scraper with BeautifulSoup
Production-ready web scraper with retry logic, rate limiting, user-agent rotation, and structured data extraction using BeautifulSoup.
Apex Logic
0 copies
python
import requests
from bs4 import BeautifulSoup
import time
import random
from dataclasses import dataclass
from typing import List, Optional
# Pool of User-Agent header values; scrape_page() picks one at random for
# every request attempt so consecutive requests do not all look identical.
USER_AGENTS = [
    # Windows desktop
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    # macOS desktop
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    # Linux desktop
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]
@dataclass
class ScrapedItem:
    """One record extracted from a scraped listing element.

    Attributes:
        title: Text of the element's heading/title node.
        url: Value of the ``href`` taken from the element's link, or an
            empty string when no link was found.
        price: Price text, or ``None`` when absent.
        description: Description text, or ``None`` when absent.
    """

    # Required fields come first; they may be passed positionally.
    title: str
    url: str

    # Optional fields default to None.
    price: Optional[str] = None
    description: Optional[str] = None
def scrape_page(url: str, retries: int = 3) -> Optional[BeautifulSoup]:
    """Download *url* and parse it with BeautifulSoup, retrying on failure.

    Every attempt sends a User-Agent picked at random from ``USER_AGENTS``
    and uses a 10-second timeout. A failed attempt is reported with
    ``print`` and, unless it was the last one, followed by an exponential
    backoff of ``2 ** attempt`` seconds plus up to one second of jitter.

    HTTP 400, 401, 404, 405 and 410 are treated as permanent failures:
    repeating the same request would not change the outcome, so the function
    gives up immediately instead of sleeping and retrying. Other errors
    (including 403, where a different User-Agent may help) are retried.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts. A value <= 0 makes no request.

    Returns:
        The parsed document, or ``None`` if no attempt succeeded.
    """
    permanent_statuses = (400, 401, 404, 405, 410)

    for attempt in range(retries):
        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Only HTTP errors carry a response; connection errors and
            # timeouts leave it as None and are always worth retrying.
            failed = getattr(e, "response", None)
            status = getattr(failed, "status_code", None)
            if status in permanent_statuses:
                break
            if attempt < retries - 1:
                time.sleep(2 ** attempt + random.random())
        else:
            # When the server declares no charset, requests decodes text/*
            # bodies as ISO-8859-1, which garbles UTF-8 pages. Hand the raw
            # bytes to BeautifulSoup in that case so it can detect the
            # encoding from the document itself (BOM / <meta charset>).
            content_type = response.headers.get("Content-Type", "")
            if "charset" in content_type.lower():
                markup = response.text
            else:
                markup = response.content
            return BeautifulSoup(markup, "html.parser")
    return None
def extract_items(
    soup: BeautifulSoup,
    selector: str,
    base_url: Optional[str] = None,
) -> List[ScrapedItem]:
    """Build a ``ScrapedItem`` for every element matching *selector*.

    Within each matched element the first node matching each of these CSS
    selectors is used:

    * title: ``h2, h3, .title`` (elements without one are skipped)
    * link: ``a[href]`` (``url`` is ``""`` when missing)
    * price: ``.price, .cost`` (``None`` when missing)
    * description: ``.description, .desc`` (``None`` when missing)

    Args:
        soup: Parsed document to search.
        selector: CSS selector identifying one container per item.
        base_url: Optional URL of the page the document came from. When
            given, relative ``href`` values are resolved against it;
            when omitted, ``href`` values are returned unchanged.

    Returns:
        The extracted items, in document order.
    """
    # Imported locally so this function is usable without touching the
    # module's top-level import list.
    from urllib.parse import urljoin

    items = []
    for el in soup.select(selector):
        title = el.select_one("h2, h3, .title")
        if not title:
            # An entry without a title is not a usable item.
            continue

        link = el.select_one("a[href]")
        price = el.select_one(".price, .cost")
        description = el.select_one(".description, .desc")

        href = link["href"] if link else ""
        if href and base_url:
            href = urljoin(base_url, href)

        items.append(ScrapedItem(
            title=title.get_text(strip=True),
            url=href,
            price=price.get_text(strip=True) if price else None,
            description=(
                description.get_text(strip=True) if description else None
            ),
        ))
    return items
if __name__ == "__main__":
    # Demo run: fetch the sample listing page and print "title - price" for
    # every product card found. Nothing is printed here when the fetch fails
    # (scrape_page returned None).
    page = scrape_page("https://example.com/products")
    products = extract_items(page, ".product-card") if page else []
    for product in products:
        print(f"{product.title} - {product.price}")