Introducción a pandas
pandas es la biblioteca fundamental para manipulación de datos en Python. Proporciona dos estructuras de datos principales: Series (array etiquetado 1D) y DataFrame (tabla etiquetada 2D). pandas facilita cargar, limpiar, transformar, analizar y exportar datos de prácticamente cualquier fuente.
Creación de DataFrames
import pandas as pd
import numpy as np

# Construct a DataFrame from a column-oriented dictionary:
# each key becomes a column, each list the column's values.
records = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "age": [30, 25, 35, 28, 32],
    "city": ["NYC", "London", "Tokyo", "Paris", "Berlin"],
    "salary": [95000, 72000, 110000, 88000, 105000],
}
df = pd.DataFrame(records)
print(df)

# Other common sources (left disabled — files/DBs not present here):
# df = pd.read_csv("data.csv")              # CSV file
# df = pd.read_json("data.json")            # JSON file
# import sqlite3                            # SQL query result
# conn = sqlite3.connect("database.db")
# df = pd.read_sql("SELECT * FROM users", conn)

# Quick inspection of shape, types, and contents.
print(df.shape)       # (5, 4)
print(df.dtypes)      # Column data types
print(df.describe())  # Statistical summary
print(df.info())      # Memory usage and non-null counts
print(df.head(3))     # First 3 rows
print(df.tail(2))     # Last 2 rows
Selección y filtrado de datos
import pandas as pd

# Sample employee table used by every example below.
df = pd.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
        "age": [30, 25, 35, 28, 32],
        "department": ["Engineering", "Marketing", "Engineering", "Marketing", "Engineering"],
        "salary": [95000, 72000, 110000, 88000, 105000],
    }
)

# Column selection: a single label yields a Series, a list yields a DataFrame.
print(df["name"])
print(df[["name", "age"]])

# Positional selection with iloc.
print(df.iloc[0])      # First row
print(df.iloc[0:3])    # First three rows
print(df.iloc[0, 1])   # First row, second column

# Label-based selection with loc (note: label slices are inclusive of the end).
df.index = ["a", "b", "c", "d", "e"]
print(df.loc["a"])
print(df.loc["a":"c"])

# Boolean filtering: index with a boolean mask.
young = df[df["age"] < 30]
engineers = df[df["department"] == "Engineering"]
high_earners = df[df["salary"] > 100000]

# Combine masks with & / | (each condition parenthesized).
mask = (df["age"] > 30) & (df["department"] == "Engineering")
senior_engineers = df[mask]

# Membership test against several values at once.
selected = df[df["name"].isin(["Alice", "Bob", "Eve"])]

# query() expresses the same filters as a readable string.
result = df.query("department == 'Engineering' and salary > 100000")

# Vectorized string predicates via the .str accessor.
df[df["name"].str.startswith("A")]
df[df["name"].str.contains("li")]
Transformación de datos
# Derive new columns from existing ones (vectorized — no explicit loop).
df["bonus"] = df["salary"] * 0.1
df["senior"] = df["age"] >= 30
# Apply a function element-wise to a column.
df["name_upper"] = df["name"].apply(str.upper)
df["tax"] = df["salary"].apply(lambda s: s * 0.3 if s > 90000 else s * 0.2)
# Renaming columns: from here on the columns are "full_name" and "years".
df = df.rename(columns={"name": "full_name", "age": "years"})
# Sorting
df_sorted = df.sort_values("salary", ascending=False)
df_sorted = df.sort_values(["department", "salary"], ascending=[True, False])
# Group by and aggregate with named aggregations.
# BUGFIX: the aggregation must reference the renamed columns
# ("full_name"/"years"); the old names "name"/"age" no longer exist
# after the rename above and would raise KeyError.
dept_stats = df.groupby("department").agg(
    avg_salary=("salary", "mean"),
    max_salary=("salary", "max"),
    count=("full_name", "count"),
    avg_age=("years", "mean"),
)
print(dept_stats)
# Pivot tables: mean salary broken out by department x seniority.
pivot = df.pivot_table(
    values="salary",
    index="department",
    columns="senior",
    aggfunc="mean"
)
# Frequency count of each distinct value in a column.
print(df["department"].value_counts())
Manejo de datos faltantes
import pandas as pd
import numpy as np

# Small frame with one NaN per column to demonstrate missing-data handling.
df = pd.DataFrame({
    "A": [1, 2, np.nan, 4],
    "B": [np.nan, 2, 3, 4],
    "C": [1, 2, 3, np.nan],
})
# Detect missing values
print(df.isna())        # Boolean mask
print(df.isna().sum())  # Count per column
# Drop missing values (each call returns a new DataFrame; df is unchanged).
df.dropna()                 # Drop rows with any NaN
df.dropna(subset=["A"])     # Drop only if A is NaN
df.dropna(how="all")        # Drop only if all values are NaN
# Fill missing values
df.fillna(0)                    # Fill with constant
df["A"].fillna(df["A"].mean())  # Fill with mean
# FIX: fillna(method=...) is deprecated (removed in pandas 3.0);
# use the dedicated ffill()/bfill() methods instead.
df.ffill()  # Forward fill
df.bfill()  # Backward fill
# Interpolation
df["A"].interpolate()  # Linear interpolation
Fusión y unión
import pandas as pd

# Two related tables: one row per order, one row per customer.
orders = pd.DataFrame(
    {
        "order_id": [1, 2, 3, 4],
        "customer_id": [101, 102, 101, 103],
        "amount": [250, 180, 320, 150],
    }
)
customers = pd.DataFrame(
    {
        "customer_id": [101, 102, 103, 104],
        "name": ["Alice", "Bob", "Charlie", "Diana"],
        "city": ["NYC", "London", "Tokyo", "Paris"],
    }
)

# SQL-style inner join on the shared key column.
merged = pd.merge(orders, customers, on="customer_id", how="inner")
print(merged)

# Left join: every order is kept even without a matching customer.
left = pd.merge(orders, customers, on="customer_id", how="left")

# Stack frames vertically; ignore_index renumbers the rows 0..n-1.
df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [5, 6], "B": [7, 8]})
combined = pd.concat([df1, df2], ignore_index=True)

# Export (disabled — no output files needed here):
# df.to_csv("output.csv", index=False)
# df.to_json("output.json", orient="records")
# df.to_excel("output.xlsx", index=False)
Puntos clave
- DataFrames son centrales: Representan datos tabulares con filas y columnas etiquetadas
- Operaciones vectorizadas: Opera sobre columnas completas, no fila por fila
- groupby para análisis: Patrón split-apply-combine para agregación
- Maneja NaN correctamente: Usa isna(), fillna() y dropna() para datos faltantes