Web Scraping

Learn to extract data from websites with requests and BeautifulSoup while following ethical scraping practices

Introduction to Web Scraping

Web scraping is the process of automatically extracting data from websites. Python is one of the most popular languages for the job, thanks to libraries like requests for fetching pages and BeautifulSoup for parsing HTML. Before scraping any website, always check its robots.txt file and terms of service.
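
The typical workflow pairs the two libraries: requests fetches the raw HTML, and BeautifulSoup turns it into a searchable tree. Here is a minimal sketch of that round trip (httpbin.org serves a static sample page, so it is safe to hit while practicing):

import requests
from bs4 import BeautifulSoup

# Fetch a page, then hand the HTML to BeautifulSoup for parsing
response = requests.get("https://httpbin.org/html", timeout=10)
soup = BeautifulSoup(response.text, "lxml")
print(soup.find("h1").text)  # first <h1> on the page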

Ethical Scraping Guidelines

  • Check robots.txt: Respect the site's crawling directives (see the robotparser sketch after this list)
  • Rate limiting: Add delays between requests to avoid overwhelming servers
  • Terms of service: Read and comply with the website's ToS
  • Use APIs first: If an API is available, prefer it over scraping
  • Identify yourself: Set a proper User-Agent header
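
A quick way to honor the first guideline is the standard library's urllib.robotparser. A minimal sketch (the bot name and URLs are placeholders):

from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt
rp = RobotFileParser("https://example.com/robots.txt")
rp.read()

# Ask whether our user agent may fetch a given URL
if rp.can_fetch("MyBot/1.0", "https://example.com/catalogue/page-1.html"):
    print("Allowed to fetch")
else:
    print("Disallowed by robots.txt")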

Fetching Pages with requests

import requests

# Basic GET request
response = requests.get("https://httpbin.org/html")
print(response.status_code)  # 200
print(response.text[:200])   # HTML content

# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0; +http://example.com/bot)",
    "Accept-Language": "en-US,en;q=0.9",
}
response = requests.get("https://example.com", headers=headers)

# Handling errors
response = requests.get("https://httpbin.org/status/404")
try:
    response.raise_for_status()  # Raises HTTPError for 4xx/5xx
except requests.HTTPError as e:
    print(f"Request failed: {e}")

# Session for multiple requests (reuses connections)
session = requests.Session()
session.headers.update({"User-Agent": "MyBot/1.0"})

response1 = session.get("https://httpbin.org/get")
response2 = session.get("https://httpbin.org/headers")

# Timeout and retries
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_strategy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

response = session.get("https://example.com", timeout=10)
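
A detail worth knowing: Retry treats connection and read errors as retryable on its own, but it only retries HTTP status codes (such as 503) when they appear in status_forcelist; backoff_factor then inserts an exponentially growing pause between attempts.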

Parsing HTML with BeautifulSoup

# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup

html = """
<html>
<head><title>Products</title></head>
<body>
<div id="catalog">
    <div class="product" data-id="1">
        <h2 class="name">Laptop</h2>
        <span class="price">$999.99</span>
        <p class="description">A powerful laptop for developers</p>
    </div>
    <div class="product" data-id="2">
        <h2 class="name">Keyboard</h2>
        <span class="price">$149.99</span>
        <p class="description">Mechanical keyboard with RGB</p>
    </div>
    <div class="product" data-id="3">
        <h2 class="name">Mouse</h2>
        <span class="price">$79.99</span>
        <p class="description">Ergonomic wireless mouse</p>
    </div>
</div>
</body>
</html>
"""

soup = BeautifulSoup(html, "lxml")

# Find elements
title = soup.find("title").text
print(title)  # "Products"

# Find all products
products = soup.find_all("div", class_="product")
for product in products:
    name = product.find("h2", class_="name").text
    price = product.find("span", class_="price").text
    desc = product.find("p", class_="description").text
    product_id = product["data-id"]
    print(f"[{product_id}] {name}: {price} - {desc}")

# CSS selectors (more flexible)
names = soup.select(".product .name")
prices = soup.select(".product .price")
for name, price in zip(names, prices):
    print(f"{name.text}: {price.text}")

# Navigate the tree
first_product = soup.select_one(".product")
print(first_product.parent.name)  # "div"
print(first_product.find_next_sibling("div")["data-id"])  # "2"

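One pitfall worth noting: find() and select_one() return None when nothing matches, so chaining .text onto a miss raises AttributeError. A small defensive pattern, reusing the soup from above (the .sale-price selector is hypothetical, chosen to miss):

# select_one returns None when no element matches
node = soup.select_one(".product .sale-price")
price = node.text if node else "N/A"
print(price)  # "N/A" — no .sale-price in the sample HTML
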
Complete Scraping Example

import requests
from bs4 import BeautifulSoup
import csv
import time

def scrape_books(base_url: str, max_pages: int = 5) -> list[dict]:
    """Scrape book data from a website."""
    books = []
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Educational Scraper/1.0"
    })
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}/catalogue/page-{page}.html"
        
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            continue
        
        soup = BeautifulSoup(response.text, "lxml")
        articles = soup.select("article.product_pod")
        
        for article in articles:
            title = article.select_one("h3 a")["title"]
            price = article.select_one(".price_color").text
            rating_class = article.select_one(".star-rating")["class"][1]
            availability = article.select_one(".availability").text.strip()
            
            books.append({
                "title": title,
                "price": price,
                "rating": rating_class,
                "available": "In stock" in availability,
            })
        
        print(f"Scraped page {page}: {len(articles)} books")
        time.sleep(1)  # Be polite!
    
    return books

def save_to_csv(data: list[dict], filename: str):
    """Save scraped data to CSV."""
    if not data:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

# Usage:
# books = scrape_books("https://books.toscrape.com")
# save_to_csv(books, "books.csv")
# print(f"Scraped {len(books)} books")

Key Takeaways

  • Use sessions: Reuse connections and headers across requests
  • CSS selectors: Prefer .select() for complex queries
  • Error handling: Always handle network errors and missing elements
  • Be respectful: Rate limit requests and check robots.txt
