Web Scraping

Learn to extract data from websites with requests and BeautifulSoup while following ethical scraping practices

Introduction to Web Scraping

Web scraping is the process of automatically extracting data from websites. Python is one of the most popular languages for the job, thanks to libraries like requests for fetching pages and BeautifulSoup for parsing HTML. Before scraping any website, always check its robots.txt file and terms of service.
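
The typical workflow pairs the two libraries: requests fetches the raw HTML, and BeautifulSoup turns it into a searchable tree. Here is a minimal sketch of that round trip (httpbin.org serves a static sample page, so it is safe to hit while practicing):

import requests
from bs4 import BeautifulSoup

# Fetch a page, then hand the HTML to BeautifulSoup for parsing
response = requests.get("https://httpbin.org/html", timeout=10)
soup = BeautifulSoup(response.text, "lxml")
print(soup.find("h1").text)  # first <h1> on the page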

Ethical Scraping Guidelines

  • Check robots.txt: Respect the site's crawling directives (see the robotparser sketch after this list)
  • Rate limiting: Add delays between requests to avoid overwhelming servers
  • Terms of service: Read and comply with the website's ToS
  • Use APIs first: If an API is available, prefer it over scraping
  • Identify yourself: Set a proper User-Agent header
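
A quick way to honor the first guideline is the standard library's urllib.robotparser. A minimal sketch (the bot name and URLs are placeholders):

from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt
rp = RobotFileParser("https://example.com/robots.txt")
rp.read()

# Ask whether our user agent may fetch a given URL
if rp.can_fetch("MyBot/1.0", "https://example.com/catalogue/page-1.html"):
    print("Allowed to fetch")
else:
    print("Disallowed by robots.txt")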

Fetching Pages with requests

import requests

# Basic GET request
response = requests.get("https://httpbin.org/html")
print(response.status_code)  # 200
print(response.text[:200])   # HTML content

# Custom headers
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0; +http://example.com/bot)",
    "Accept-Language": "en-US,en;q=0.9",
}
response = requests.get("https://example.com", headers=headers)

# Handling errors
response = requests.get("https://httpbin.org/status/404")
try:
    response.raise_for_status()  # Raises HTTPError for 4xx/5xx
except requests.HTTPError as e:
    print(f"Request failed: {e}")

# Session for multiple requests (reuses connections)
session = requests.Session()
session.headers.update({"User-Agent": "MyBot/1.0"})

response1 = session.get("https://httpbin.org/get")
response2 = session.get("https://httpbin.org/headers")

# Timeout and retries
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_strategy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

response = session.get("https://example.com", timeout=10)
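
A detail worth knowing: Retry treats connection and read errors as retryable on its own, but it only retries HTTP status codes (such as 503) when they appear in status_forcelist; backoff_factor then inserts an exponentially growing pause between attempts.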

Parsing HTML with BeautifulSoup

# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup

html = """
<html>
<head><title>Products</title></head>
<body>
<div id="catalog">
    <div class="product" data-id="1">
        <h2 class="name">Laptop</h2>
        <span class="price">$999.99</span>
        <p class="description">A powerful laptop for developers</p>
    </div>
    <div class="product" data-id="2">
        <h2 class="name">Keyboard</h2>
        <span class="price">$149.99</span>
        <p class="description">Mechanical keyboard with RGB</p>
    </div>
    <div class="product" data-id="3">
        <h2 class="name">Mouse</h2>
        <span class="price">$79.99</span>
        <p class="description">Ergonomic wireless mouse</p>
    </div>
</div>
</body>
</html>
"""

soup = BeautifulSoup(html, "lxml")

# Find elements
title = soup.find("title").text
print(title)  # "Products"

# Find all products
products = soup.find_all("div", class_="product")
for product in products:
    name = product.find("h2", class_="name").text
    price = product.find("span", class_="price").text
    desc = product.find("p", class_="description").text
    product_id = product["data-id"]
    print(f"[{product_id}] {name}: {price} - {desc}")

# CSS selectors (more flexible)
names = soup.select(".product .name")
prices = soup.select(".product .price")
for name, price in zip(names, prices):
    print(f"{name.text}: {price.text}")

# Navigate the tree
first_product = soup.select_one(".product")
print(first_product.parent.name)  # "div"
print(first_product.find_next_sibling("div")["data-id"])  # "2"

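One pitfall worth noting: find() and select_one() return None when nothing matches, so chaining .text onto a miss raises AttributeError. A small defensive pattern, reusing the soup from above (the .sale-price selector is hypothetical, chosen to miss):

# select_one returns None when no element matches
node = soup.select_one(".product .sale-price")
price = node.text if node else "N/A"
print(price)  # "N/A" — no .sale-price in the sample HTML
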
Complete Scraping Example

import requests
from bs4 import BeautifulSoup
import csv
import time

def scrape_books(base_url: str, max_pages: int = 5) -> list[dict]:
    """Scrape book data from a website."""
    books = []
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Educational Scraper/1.0"
    })
    
    for page in range(1, max_pages + 1):
        url = f"{base_url}/catalogue/page-{page}.html"
        
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            continue
        
        soup = BeautifulSoup(response.text, "lxml")
        articles = soup.select("article.product_pod")
        
        for article in articles:
            title = article.select_one("h3 a")["title"]
            price = article.select_one(".price_color").text
            rating_class = article.select_one(".star-rating")["class"][1]
            availability = article.select_one(".availability").text.strip()
            
            books.append({
                "title": title,
                "price": price,
                "rating": rating_class,
                "available": "In stock" in availability,
            })
        
        print(f"Scraped page {page}: {len(articles)} books")
        time.sleep(1)  # Be polite!
    
    return books

def save_to_csv(data: list[dict], filename: str):
    """Save scraped data to CSV."""
    if not data:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

# Usage:
# books = scrape_books("https://books.toscrape.com")
# save_to_csv(books, "books.csv")
# print(f"Scraped {len(books)} books")

Key Takeaways

  • Use sessions: Reuse connections and headers across requests
  • CSS selectors: Prefer .select() for complex queries
  • Error handling: Always handle network errors and missing elements
  • Be respectful: Rate limit requests and check robots.txt
