Introducción al web scraping
El web scraping es el proceso de extraer datos automáticamente de sitios web. Python es el lenguaje más popular para web scraping gracias a bibliotecas como requests para obtener páginas y BeautifulSoup para analizar HTML. Antes de hacer scraping en cualquier sitio web, siempre verifica su robots.txt y términos de servicio.
Directrices de scraping ético
- Verifica robots.txt: Respeta las directivas de rastreo del sitio
- Limitación de velocidad: Agrega retrasos entre solicitudes para no sobrecargar servidores
- Términos de servicio: Lee y cumple con los ToS del sitio web
- Usa APIs primero: Si hay una API disponible, prefiérela sobre el scraping
- Identifícate: Establece un header User-Agent apropiado
Obtención de páginas con requests
# Demo: fetching pages with the requests library.
import requests

# Basic GET request
response = requests.get("https://httpbin.org/html")
print(response.status_code)  # 200
print(response.text[:200])  # HTML content

# Custom headers: identify your bot and give site owners a contact URL
headers = {
    "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0; +http://example.com/bot)",
    "Accept-Language": "en-US,en;q=0.9",
}
response = requests.get("https://example.com", headers=headers)

# Handling errors: raise_for_status() raises HTTPError for 4xx/5xx.
# Catch it here, otherwise this demo would crash on the deliberate 404
# and never reach the session/retry examples below.
response = requests.get("https://httpbin.org/status/404")
try:
    response.raise_for_status()
except requests.HTTPError as e:
    print(f"HTTP error: {e}")

# Session for multiple requests (reuses TCP connections via keep-alive)
session = requests.Session()
session.headers.update({"User-Agent": "MyBot/1.0"})
response1 = session.get("https://httpbin.org/get")
response2 = session.get("https://httpbin.org/headers")

# Timeout and retries: retry up to 3 times with exponential backoff
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_strategy = Retry(total=3, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)
# Always set a timeout so a hung server can't block the script forever
response = session.get("https://example.com", timeout=10)
Análisis de HTML con BeautifulSoup
# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup

# Sample HTML document. NOTE(review): the original literal had its tags
# stripped (plain text only), which would break every selector below;
# reconstructed here to match the selectors and the expected outputs.
html = """
<html>
<head><title>Products</title></head>
<body>
<div id="products">
  <div class="product" data-id="1">
    <h2 class="name">Laptop</h2>
    <span class="price">$999.99</span>
    <p class="description">A powerful laptop for developers</p>
  </div>
  <div class="product" data-id="2">
    <h2 class="name">Keyboard</h2>
    <span class="price">$149.99</span>
    <p class="description">Mechanical keyboard with RGB</p>
  </div>
  <div class="product" data-id="3">
    <h2 class="name">Mouse</h2>
    <span class="price">$79.99</span>
    <p class="description">Ergonomic wireless mouse</p>
  </div>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, "lxml")

# Find a single element by tag name
title = soup.find("title").text
print(title)  # "Products"

# Find all products by tag + class
products = soup.find_all("div", class_="product")
for product in products:
    name = product.find("h2", class_="name").text
    price = product.find("span", class_="price").text
    desc = product.find("p", class_="description").text
    product_id = product["data-id"]  # attribute access via item lookup
    print(f"[{product_id}] {name}: {price} - {desc}")

# CSS selectors (more flexible)
names = soup.select(".product .name")
prices = soup.select(".product .price")
for name, price in zip(names, prices):
    print(f"{name.text}: {price.text}")

# Navigate the tree
first_product = soup.select_one(".product")
print(first_product.parent.name)  # "div" (the #products container)
print(first_product.find_next_sibling("div")["data-id"])  # "2"
Ejemplo completo de scraping
import requests
from bs4 import BeautifulSoup
import csv
import time
def scrape_books(base_url: str, max_pages: int = 5) -> list[dict]:
    """Scrape book data from a paginated catalogue.

    Args:
        base_url: Root URL of the site (no trailing slash required).
        max_pages: Number of catalogue pages to fetch.

    Returns:
        A list of dicts with keys: title, price, rating, available.
    """
    books = []
    session = requests.Session()
    # Identify ourselves; polite scrapers always set a User-Agent.
    session.headers.update({
        "User-Agent": "Educational Scraper/1.0"
    })
    for page in range(1, max_pages + 1):
        url = f"{base_url}/catalogue/page-{page}.html"
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Skip this page on network/HTTP errors instead of aborting the run.
            print(f"Error fetching page {page}: {e}")
            continue
        soup = BeautifulSoup(response.text, "lxml")
        articles = soup.select("article.product_pod")
        for article in articles:
            # Guard each lookup: select_one() returns None for a missing
            # element, and indexing/.text on None would raise and kill
            # the whole scrape over one malformed entry.
            link = article.select_one("h3 a")
            price_el = article.select_one(".price_color")
            rating_el = article.select_one(".star-rating")
            avail_el = article.select_one(".availability")
            if link is None or price_el is None:
                continue  # malformed entry; skip it
            # Rating is encoded as the second CSS class, e.g. "star-rating Three".
            rating_classes = rating_el["class"] if rating_el else []
            rating = rating_classes[1] if len(rating_classes) > 1 else ""
            availability = avail_el.text.strip() if avail_el else ""
            books.append({
                "title": link.get("title", link.text),
                "price": price_el.text,
                "rating": rating,
                "available": "In stock" in availability,
            })
        print(f"Scraped page {page}: {len(articles)} books")
        time.sleep(1)  # Be polite: rate-limit between page requests
    return books
def save_to_csv(data: list[dict], filename: str):
    """Save scraped data to a CSV file.

    Uses the first row's keys as the header row. Does nothing when *data*
    is empty (no file is created).

    Args:
        data: List of dicts all sharing the same keys.
        filename: Destination path for the CSV file.
    """
    if not data:
        return
    # newline="" prevents blank lines between rows on Windows; an explicit
    # UTF-8 encoding keeps scraped text portable regardless of the
    # platform's default locale encoding.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
# Usage:
# books = scrape_books("https://books.toscrape.com")
# save_to_csv(books, "books.csv")
# print(f"Scraped {len(books)} books")
Puntos clave
- Usa sesiones: Reutiliza conexiones y headers entre solicitudes
- Selectores CSS: Prefiere .select() para consultas complejas
- Manejo de errores: Siempre maneja errores de red y elementos faltantes
- Sé respetuoso: Limita la velocidad de las solicitudes y verifica robots.txt