Introduction to pandas
pandas is the cornerstone library for data manipulation in Python. It provides two primary data structures: Series (a 1D labeled array) and DataFrame (a 2D labeled table). pandas makes it easy to load, clean, transform, analyze, and export data from a wide range of sources, including CSV, JSON, SQL databases, and Excel.
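As a quick sketch of the Series side (the values and labels here are illustrative), a Series pairs a 1D array of values with an index of labels, and every DataFrame column behaves like one:
import pandas as pd
s = pd.Series([10, 20, 30], index=["a", "b", "c"])
print(s["b"])    # 20 -- label-based lookup
print(s.mean())  # 20.0 -- vectorized aggregation over the whole Series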
Creating DataFrames
import pandas as pd
import numpy as np
# From a dictionary
data = {
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
"age": [30, 25, 35, 28, 32],
"city": ["NYC", "London", "Tokyo", "Paris", "Berlin"],
"salary": [95000, 72000, 110000, 88000, 105000],
}
df = pd.DataFrame(data)
print(df)
#       name  age    city  salary
# 0    Alice   30     NYC   95000
# 1      Bob   25  London   72000
# 2  Charlie   35   Tokyo  110000
# 3    Diana   28   Paris   88000
# 4      Eve   32  Berlin  105000
# From a CSV file
# df = pd.read_csv("data.csv")
# From JSON
# df = pd.read_json("data.json")
# From SQL
# import sqlite3
# conn = sqlite3.connect("database.db")
# df = pd.read_sql("SELECT * FROM users", conn)
# Basic info
print(df.shape) # (5, 4)
print(df.dtypes) # Column data types
print(df.describe()) # Statistical summary
df.info() # Memory usage and non-null counts (prints directly; returns None)
print(df.head(3)) # First 3 rows
print(df.tail(2)) # Last 2 rows
Selecting and Filtering Data
import pandas as pd
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
"age": [30, 25, 35, 28, 32],
"department": ["Engineering", "Marketing", "Engineering", "Marketing", "Engineering"],
"salary": [95000, 72000, 110000, 88000, 105000],
})
# Select columns
print(df["name"]) # Series
print(df[["name", "age"]]) # DataFrame with two columns
# Select rows by index
print(df.iloc[0]) # First row
print(df.iloc[0:3]) # First three rows
print(df.iloc[0, 1]) # First row, second column
# Select rows by label
df.index = ["a", "b", "c", "d", "e"]
print(df.loc["a"]) # Row with label 'a'
print(df.loc["a":"c"]) # Rows a through c (inclusive!)
# Boolean filtering
young = df[df["age"] < 30]
engineers = df[df["department"] == "Engineering"]
high_earners = df[df["salary"] > 100000]
# Multiple conditions
senior_engineers = df[
(df["department"] == "Engineering") & (df["age"] > 30)
]
# isin for multiple values
selected = df[df["name"].isin(["Alice", "Bob", "Eve"])]
# query method (more readable)
result = df.query("department == 'Engineering' and salary > 100000")
# String methods
df[df["name"].str.startswith("A")]
df[df["name"].str.contains("li")]
Data Transformation
# Adding columns
df["bonus"] = df["salary"] * 0.1
df["senior"] = df["age"] >= 30
# Apply function to a column
df["name_upper"] = df["name"].apply(str.upper)
df["tax"] = df["salary"].apply(lambda s: s * 0.3 if s > 90000 else s * 0.2)
# Renaming columns (rename returns a new DataFrame; keep it in a separate variable
# so the later examples can keep using the original column names)
df_renamed = df.rename(columns={"name": "full_name", "age": "years"})
# Sorting
df_sorted = df.sort_values("salary", ascending=False)
df_sorted = df.sort_values(["department", "salary"], ascending=[True, False])
# Group by and aggregate
dept_stats = df.groupby("department").agg(
    avg_salary=("salary", "mean"),
    max_salary=("salary", "max"),
    count=("name", "count"),
    avg_age=("age", "mean"),
)
print(dept_stats)
# Pivot tables
pivot = df.pivot_table(
values="salary",
index="department",
columns="senior",
aggfunc="mean"
)
# Value counts
print(df["department"].value_counts())
Handling Missing Data
import pandas as pd
import numpy as np
df = pd.DataFrame({
"A": [1, 2, np.nan, 4],
"B": [np.nan, 2, 3, 4],
"C": [1, 2, 3, np.nan],
})
# Detect missing values
print(df.isna()) # Boolean mask
print(df.isna().sum()) # Count per column
# Drop missing values (each call returns a new object; assign the result to keep it)
df.dropna() # Drop rows with any NaN
df.dropna(subset=["A"]) # Drop only if A is NaN
df.dropna(how="all") # Drop only if all values are NaN
# Fill missing values
df.fillna(0) # Fill with constant
df["A"].fillna(df["A"].mean()) # Fill with mean
df.ffill() # Forward fill (fillna(method="ffill") is deprecated)
df.bfill() # Backward fill
# Interpolation
df["A"].interpolate() # Linear interpolation
Merging and Joining
import pandas as pd
orders = pd.DataFrame({
"order_id": [1, 2, 3, 4],
"customer_id": [101, 102, 101, 103],
"amount": [250, 180, 320, 150],
})
customers = pd.DataFrame({
"customer_id": [101, 102, 103, 104],
"name": ["Alice", "Bob", "Charlie", "Diana"],
"city": ["NYC", "London", "Tokyo", "Paris"],
})
# Merge (SQL-style join)
merged = pd.merge(orders, customers, on="customer_id", how="inner")
print(merged)
# Left join (keep all orders)
left = pd.merge(orders, customers, on="customer_id", how="left")
# Concatenate DataFrames
df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [5, 6], "B": [7, 8]})
combined = pd.concat([df1, df2], ignore_index=True)
# Export
# df.to_csv("output.csv", index=False)
# df.to_json("output.json", orient="records")
# df.to_excel("output.xlsx", index=False)  # requires an Excel engine such as openpyxl
Key Takeaways
- DataFrames are central: They represent tabular data with labeled rows and columns
- Vectorized operations: Operate on entire columns, not row by row (see the sketch after this list)
- groupby for analysis: Split-apply-combine pattern for aggregation
- Handle NaN properly: Use isna(), fillna(), and dropna() for missing data
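As a closing sketch of the vectorized style (the column names and values here are illustrative), whole-column arithmetic replaces explicit Python loops:
import pandas as pd
df = pd.DataFrame({"salary": [95000, 72000, 110000], "rate": [0.10, 0.08, 0.12]})
# Vectorized: one expression over entire columns, no Python-level loop
df["bonus"] = df["salary"] * df["rate"]
# Row-by-row equivalent, shown only for contrast (much slower on large data)
df["bonus_slow"] = df.apply(lambda row: row["salary"] * row["rate"], axis=1)
print(df)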