Data Preparation for Machine Learning
Master feature engineering, data cleaning, handling missing values, encoding, train/test splits, cross-validation, and data balancing
Why Data Preparation Matters
Data preparation typically consumes 60-80% of a data scientist's time, and the quality of your data directly determines model performance. A mediocre model trained on clean, well-prepared data will usually outperform a sophisticated model trained on poor data.
Garbage In, Garbage Out:
No amount of model tuning can compensate for bad data. Always invest in data quality before model complexity.
Data Cleaning
import pandas as pd
import numpy as np
# Load dataset
df = pd.DataFrame({
'age': [25, 30, None, 45, 200, 35, -5, 28, None, 40],
'salary': [50000, 60000, 55000, None, 80000, 70000, 65000, None, 45000, 90000],
'city': ['NYC', 'LA', 'NYC', 'Chicago', 'NYC', None, 'LA', 'Chicago', 'LA', 'NYC'],
'experience': [2, 5, 3, 15, 10, 7, 4, 3, 1, 12],
'hired': [1, 1, 0, 1, 1, 0, 1, 0, 0, 1]
})
print("=== STEP 1: Inspect Data ===")
print(f"Shape: {df.shape}")
print(f"Missing values:\n{df.isnull().sum()}")
print(f"Data types:\n{df.dtypes}")
print("\n=== STEP 2: Handle Missing Values ===")
# Strategy 1: Drop rows with missing values (loses data!)
df_dropped = df.dropna()
# Strategy 2: Fill with statistics (mean, median, mode)
df['age'] = df['age'].fillna(df['age'].median())
df['salary'] = df['salary'].fillna(df['salary'].mean())
df['city'] = df['city'].fillna(df['city'].mode()[0])
print("\n=== STEP 3: Handle Outliers ===")
# IQR method for outlier detection
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df['age'] = df['age'].clip(lower=max(0, lower), upper=upper)
# Remove impossible values
df.loc[df['age'] < 0, 'age'] = df['age'].median()
df.loc[df['age'] > 120, 'age'] = df['age'].median()
print(f"Cleaned age range: {df['age'].min():.0f} - {df['age'].max():.0f}")
print("\n=== STEP 4: Remove Duplicates ===")
df = df.drop_duplicates()
print(f"Shape after dedup: {df.shape}")
Feature Engineering
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Sample dataset
df = pd.DataFrame({
'age': [25, 30, 35, 45, 28],
'salary': [50000, 60000, 70000, 90000, 55000],
'city': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
'education': ['BS', 'MS', 'PhD', 'MS', 'BS'],
'join_date': pd.to_datetime(['2020-01-15', '2019-06-01', '2018-03-10',
'2021-09-20', '2020-11-05']),
})
# 1. Encoding categorical variables
# One-hot encoding (for nominal categories like city)
city_dummies = pd.get_dummies(df['city'], prefix='city', drop_first=True)
df = pd.concat([df, city_dummies], axis=1)
# Ordinal encoding (explicit mapping for ordered categories like education)
edu_order = {'BS': 0, 'MS': 1, 'PhD': 2}
df['education_encoded'] = df['education'].map(edu_order)
# 2. Feature scaling (in real pipelines, fit scalers on the training split only; see the sketch below)
scaler_standard = StandardScaler()  # mean=0, std=1 (good default for most models)
scaler_minmax = MinMaxScaler()      # scale to [0, 1] (common for neural networks)
df['age_scaled'] = scaler_standard.fit_transform(df[['age']])
df['salary_normalized'] = scaler_minmax.fit_transform(df[['salary']])
# 3. Creating new features from existing ones
df['salary_per_year_age'] = df['salary'] / df['age']
df['years_at_company'] = (pd.Timestamp.now() - df['join_date']).dt.days / 365.25
df['is_senior'] = (df['age'] > 35).astype(int)
# 4. Date features
df['join_month'] = df['join_date'].dt.month
df['join_year'] = df['join_date'].dt.year
df['join_quarter'] = df['join_date'].dt.quarter
print(df[['age', 'age_scaled', 'salary', 'salary_normalized']].round(2))
print(f"\nNew features: {[c for c in df.columns if c not in ['age','salary','city','education','join_date']]}")
Train/Test/Validation Splits
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
# Generate sample data
np.random.seed(42)
X = np.random.randn(1000, 10)
y = np.random.randint(0, 2, 1000)
# ===== Simple Train/Test Split =====
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
# ===== Train/Validation/Test Split =====
# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: 75% train, 25% val (of the remaining 80%)
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
# 60% train, 20% val, 20% test
# ===== K-Fold Cross-Validation =====
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]
    y_fold_train, y_fold_val = y[train_idx], y[val_idx]
    # Train and evaluate model on this fold
    # accuracy = train_and_evaluate(X_fold_train, y_fold_train, X_fold_val, y_fold_val)
    accuracy = 0.85 + np.random.random() * 0.1  # simulated
    fold_scores.append(accuracy)
    print(f" Fold {fold+1}: Accuracy = {accuracy:.4f}")
print(f"\nMean CV Accuracy: {np.mean(fold_scores):.4f} +/- {np.std(fold_scores):.4f}")
# Best practices:
# - Always use stratify=y for classification tasks
# - Use K-Fold CV for small datasets (< 10K samples)
# - Never touch the test set during model development
# - Use time-based splits for time series data (see the sketch below)
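For time-ordered data, a random split leaks future information into the training set. Here is a minimal sketch of the time-based alternative using scikit-learn's TimeSeriesSplit (the data is synthetic and purely illustrative):
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
X_ts = np.random.randn(100, 5)        # 100 time-ordered observations
y_ts = np.random.randint(0, 2, 100)
tscv = TimeSeriesSplit(n_splits=4)
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_ts)):
    # Each fold trains on the past and validates on the block that immediately follows it
    print(f"Fold {fold+1}: train rows 0-{train_idx[-1]}, validate rows {val_idx[0]}-{val_idx[-1]}")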
Handling Imbalanced Data
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Class imbalance is common in real-world data: fraud (0.1%), disease (2%), churn (5%)
# Problem: a model that always predicts the majority class reaches 99% accuracy yet catches 0% of the minority class
# 1. SMOTE: Synthetic Minority Oversampling
from imblearn.over_sampling import SMOTE
from collections import Counter
X = np.random.randn(1000, 10)
y = np.array([0]*950 + [1]*50) # 95% vs 5%
print(f"Before SMOTE: {Counter(y)}")
smote = SMOTE(random_state=42)
# In practice, apply SMOTE to the training split only, so synthetic samples never leak into validation/test data
X_resampled, y_resampled = smote.fit_resample(X, y)
print(f"After SMOTE: {Counter(y_resampled)}")
# 2. Class weights (no data modification needed)
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(zip(np.unique(y), weights))
print(f"Class weights: {class_weights}")
# {0: 0.526, 1: 10.0} -> minority-class errors are weighted roughly 19x more heavily than majority-class errors in the loss
# In PyTorch:
import torch
import torch.nn as nn
weight_tensor = torch.tensor([weights[0], weights[1]], dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
# 3. Undersampling (remove majority class samples)
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X, y)
print(f"After undersampling: {Counter(y_under)}")
# 4. Evaluation metrics for imbalanced data
# DON'T use accuracy! Use:
# - Precision: of predicted positives, how many are actually positive
# - Recall: of actual positives, how many did we catch
# - F1 Score: harmonic mean of precision and recall
# - AUC-ROC: area under ROC curve (threshold-independent)
# - PR-AUC: precision-recall AUC (better for very imbalanced data)
print("\nFor imbalanced data, use F1-score or PR-AUC, NOT accuracy!")
Key Takeaways
- Data preparation consumes 60-80% of ML project time and determines model quality
- Handle missing values with median/mean imputation or model-based methods
- Use one-hot encoding for nominal categories, ordinal encoding for ordered ones
- Always stratify splits for classification; use K-Fold CV for small datasets
- For imbalanced data, use SMOTE or class weights and evaluate with F1/PR-AUC, not accuracy