Pandas Basics

Learn pandas for data manipulation, including DataFrames, filtering, grouping, and data cleaning

Introduction to pandas

pandas is the cornerstone library for data manipulation in Python. It provides two primary data structures: Series (1D labeled array) and DataFrame (2D labeled table). pandas makes it easy to load, clean, transform, analyze, and export data from virtually any source.
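
To make the difference concrete, here is a minimal Series sketch (the values and labels are illustrative):

import pandas as pd

s = pd.Series([95000, 72000, 110000], index=["Alice", "Bob", "Charlie"])
print(s["Bob"])   # 72000 (look up a value by its label)
print(s.mean())   # 92333.33... (aggregate over all values)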

Creating DataFrames

import pandas as pd
import numpy as np

# From a dictionary
data = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "age": [30, 25, 35, 28, 32],
    "city": ["NYC", "London", "Tokyo", "Paris", "Berlin"],
    "salary": [95000, 72000, 110000, 88000, 105000],
}
df = pd.DataFrame(data)
print(df)
#       name  age    city  salary
# 0    Alice   30     NYC   95000
# 1      Bob   25  London   72000
# 2  Charlie   35   Tokyo  110000
# 3    Diana   28   Paris   88000
# 4      Eve   32  Berlin  105000

# From a CSV file
# df = pd.read_csv("data.csv")

# From JSON
# df = pd.read_json("data.json")

# From SQL
# import sqlite3
# conn = sqlite3.connect("database.db")
# df = pd.read_sql("SELECT * FROM users", conn)

# Basic info
print(df.shape)      # (5, 4)
print(df.dtypes)     # Column data types
print(df.describe()) # Statistical summary
df.info()            # Memory usage and non-null counts (prints directly; returns None)
print(df.head(3))    # First 3 rows
print(df.tail(2))    # Last 2 rows

Selecting and Filtering Data

import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "age": [30, 25, 35, 28, 32],
    "department": ["Engineering", "Marketing", "Engineering", "Marketing", "Engineering"],
    "salary": [95000, 72000, 110000, 88000, 105000],
})

# Select columns
print(df["name"])            # Series
print(df[["name", "age"]])   # DataFrame with two columns

# Select rows by index
print(df.iloc[0])       # First row
print(df.iloc[0:3])     # First three rows
print(df.iloc[0, 1])    # First row, second column

# Select rows by label
df.index = ["a", "b", "c", "d", "e"]
print(df.loc["a"])           # Row with label 'a'
print(df.loc["a":"c"])       # Rows a through c (inclusive!)

# Boolean filtering
young = df[df["age"] < 30]
engineers = df[df["department"] == "Engineering"]
high_earners = df[df["salary"] > 100000]

# Multiple conditions
senior_engineers = df[
    (df["department"] == "Engineering") & (df["age"] > 30)
]

# isin for multiple values
selected = df[df["name"].isin(["Alice", "Bob", "Eve"])]

# query method (more readable)
result = df.query("department == 'Engineering' and salary > 100000")

# String methods
df[df["name"].str.startswith("A")]
df[df["name"].str.contains("li")]

Data Transformation

# Adding columns
df["bonus"] = df["salary"] * 0.1
df["senior"] = df["age"] >= 30

# Apply a function to a column
df["name_upper"] = df["name"].apply(str.upper)  # or the vectorized df["name"].str.upper()
df["tax"] = df["salary"].apply(lambda s: s * 0.3 if s > 90000 else s * 0.2)

# Renaming columns (rename returns a new DataFrame; keeping df unchanged
# so the groupby below can still use the original column names)
df_renamed = df.rename(columns={"name": "full_name", "age": "years"})

# Sorting
df_sorted = df.sort_values("salary", ascending=False)
df_sorted = df.sort_values(["department", "salary"], ascending=[True, False])

# Group by and aggregate
dept_stats = df.groupby("department").agg(
    avg_salary=("salary", "mean"),
    max_salary=("salary", "max"),
    count=("name", "count"),
    avg_age=("age", "mean"),
)
print(dept_stats)

# Pivot tables
pivot = df.pivot_table(
    values="salary",
    index="department",
    columns="senior",
    aggfunc="mean"
)

# Value counts
print(df["department"].value_counts())

Handling Missing Data

import pandas as pd
import numpy as np

df = pd.DataFrame({
    "A": [1, 2, np.nan, 4],
    "B": [np.nan, 2, 3, 4],
    "C": [1, 2, 3, np.nan],
})

# Detect missing values
print(df.isna())         # Boolean mask
print(df.isna().sum())   # Count per column

# Drop missing values (each call returns a new DataFrame)
df.dropna()              # Drop rows with any NaN
df.dropna(subset=["A"])  # Drop rows only if A is NaN
df.dropna(how="all")     # Drop rows only if all values are NaN

# Fill missing values (each call returns a new object)
df.fillna(0)                    # Fill with constant
df["A"].fillna(df["A"].mean())  # Fill with column mean
df.ffill()                      # Forward fill (fillna(method=...) is deprecated)
df.bfill()                      # Backward fill

# Interpolation
df["A"].interpolate()           # Linear interpolation

Merging and Joining

import pandas as pd

orders = pd.DataFrame({
    "order_id": [1, 2, 3, 4],
    "customer_id": [101, 102, 101, 103],
    "amount": [250, 180, 320, 150],
})

customers = pd.DataFrame({
    "customer_id": [101, 102, 103, 104],
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "city": ["NYC", "London", "Tokyo", "Paris"],
})

# Merge (SQL-style join)
merged = pd.merge(orders, customers, on="customer_id", how="inner")
print(merged)

# Left join (keep all orders)
left = pd.merge(orders, customers, on="customer_id", how="left")

# Concatenate DataFrames
df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [5, 6], "B": [7, 8]})
combined = pd.concat([df1, df2], ignore_index=True)

# Export
# df.to_csv("output.csv", index=False)
# df.to_json("output.json", orient="records")
# df.to_excel("output.xlsx", index=False)

Key Takeaways

  • DataFrames are central: They represent tabular data with labeled rows and columns
  • Vectorized operations: Operate on entire columns, not row by row (see the sketch below)
  • groupby for analysis: Split-apply-combine pattern for aggregation
  • Handle NaN properly: Use isna(), fillna(), and dropna() for missing data
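
As a quick illustration of the vectorized point above, both columns below hold the same result, but the first is computed in a single column-wide operation (the data is illustrative):

import pandas as pd

df = pd.DataFrame({"salary": [95000, 72000, 110000]})

# Vectorized: one operation over the entire column
df["bonus"] = df["salary"] * 0.1

# Row by row: same result, but slower and more verbose
df["bonus_slow"] = [row.salary * 0.1 for row in df.itertuples()]

print(df["bonus"].equals(df["bonus_slow"]))  # True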
