python Utilities

Web Scraper with BeautifulSoup

A web scraper with retry logic (exponential backoff with random jitter between attempts), user-agent rotation, and structured data extraction using BeautifulSoup.

Apex Logic 0 copies
python
import random
import time
from dataclasses import dataclass
from typing import List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Pool of browser User-Agent strings; scrape_page() picks one at random for
# each request attempt so successive requests do not all present the same
# client identity.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

@dataclass
class ScrapedItem:
    """One record extracted from a listing page by ``extract_items``."""

    # Whitespace-stripped text of the element's heading / ``.title`` node.
    title: str
    # ``href`` of the first link inside the element, or "" when it has none.
    url: str
    # Whitespace-stripped text of the ``.price`` / ``.cost`` node, if present.
    price: Optional[str] = None
    # Not populated by extract_items in this file; stays None unless a caller
    # sets it.
    description: Optional[str] = None

def scrape_page(url: str, retries: int = 3) -> Optional[BeautifulSoup]:
    """Download ``url`` and parse the response body as HTML.

    Each attempt sends a GET request with a User-Agent chosen at random from
    ``USER_AGENTS`` and a 10-second timeout. A failed attempt (any
    ``requests.RequestException``, including the HTTP error raised by
    ``raise_for_status`` for 4xx/5xx responses) is reported on stdout and,
    unless it was the last one, followed by a sleep of ``2 ** attempt``
    seconds plus up to one second of random jitter.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts. Values below 1 make no request.

    Returns:
        The parsed document, or ``None`` when every attempt failed.
    """
    for attempt in range(retries):
        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                # Exponential backoff; the jitter keeps concurrent scrapers
                # from retrying in lockstep.
                time.sleep(2 ** attempt + random.random())
        else:
            # Parse the raw bytes rather than response.text: when a text/*
            # response carries no charset in its Content-Type header,
            # requests falls back to ISO-8859-1, which garbles UTF-8 pages
            # that declare their encoding only in a <meta> tag. Handing
            # BeautifulSoup the bytes lets it honour that declaration.
            content_type = response.headers.get("Content-Type", "")
            # Trust the header charset only when the server actually sent one.
            declared = (
                response.encoding if "charset" in content_type.lower() else None
            )
            return BeautifulSoup(
                response.content, "html.parser", from_encoding=declared
            )
    return None

def extract_items(
    soup: BeautifulSoup, selector: str, base_url: Optional[str] = None
) -> List[ScrapedItem]:
    """Build a ``ScrapedItem`` for every element of ``soup`` matching ``selector``.

    Within each matched element the first ``h2``/``h3``/``.title`` node
    supplies the title, the first ``a[href]`` the URL, and the first
    ``.price``/``.cost`` node the price. Elements without a title node are
    skipped.

    Args:
        soup: Parsed document to search.
        selector: CSS selector identifying one container element per item.
        base_url: Optional URL of the page the document came from. When
            given, each extracted ``href`` is resolved against it so relative
            links become absolute. When omitted, hrefs are returned exactly
            as written in the markup.

    Returns:
        The extracted items in document order; empty if nothing matched.
    """
    items = []
    for el in soup.select(selector):
        title = el.select_one("h2, h3, .title")
        if title is None:
            continue

        link = el.select_one("a[href]")
        price = el.select_one(".price, .cost")

        href = link["href"] if link is not None else ""
        # Leave the "no link" marker ("") untouched; urljoin(base, "") would
        # otherwise turn it into the page's own URL.
        if base_url and href:
            href = urljoin(base_url, href)

        items.append(ScrapedItem(
            title=title.get_text(strip=True),
            url=href,
            price=price.get_text(strip=True) if price is not None else None,
        ))
    return items

if __name__ == "__main__":
    # Demo run: fetch a sample listing page and print "title - price" for
    # every product card found on it. Prints nothing if the fetch failed.
    page = scrape_page("https://example.com/products")
    if page:
        for product in extract_items(page, ".product-card"):
            print(f"{product.title} - {product.price}")

Tags

scraping beautifulsoup requests automation

Related Snippets

javascript

File Upload Handler with Validation

python

CSV Data Processor

python

REST API Client with Retry

python

Email Sender with Templates