Feature Selection
Feature selection is the process of choosing a subset of relevant features for model training and discarding redundant or irrelevant ones to improve performance and interpretability.
Why Feature Selection?
Original: [f1, f2, f3, f4, f5, ..., f100]
↓
Feature Selection
↓
Selected: [f2, f7, f23, f45] ← Most relevant
Benefits
- Reduced overfitting: Fewer features = less noise
- Improved accuracy: Remove misleading features
- Faster training: Less computation
- Better interpretability: Understand what matters
- Lower storage: Smaller datasets
Three Main Approaches
┌─────────────────────────────────────────────────────┐
│ Feature Selection Methods │
├─────────────────┬─────────────────┬─────────────────┤
│ Filter │ Wrapper │ Embedded │
│ │ │ │
│ Statistical │ Model-based │ Built into │
│ tests only │ evaluation │ training │
│ │ │ │
│ Fast │ Slow │ Medium │
│ Model-agnostic │ Model-specific │ Model-specific │
└─────────────────┴─────────────────┴─────────────────┘
Filter Methods
Evaluate features independently of any model.
Correlation-Based
import pandas as pd
import numpy as np
# Remove highly correlated features
def remove_correlated_features(df, threshold=0.9):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
    return df.drop(columns=to_drop)
# Correlation with target
correlations = df.corrwith(target).abs().sort_values(ascending=False)
top_features = correlations.head(10).index.tolist()
Variance Threshold
from sklearn.feature_selection import VarianceThreshold
# Remove low-variance features (near-constant)
selector = VarianceThreshold(threshold=0.01)
X_filtered = selector.fit_transform(X)
selected_features = X.columns[selector.get_support()]
Statistical Tests
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
# ANOVA F-test (for classification)
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
scores = pd.DataFrame({
    'feature': X.columns,
    'score': selector.scores_
}).sort_values('score', ascending=False)
# Mutual Information (captures non-linear relationships)
selector_mi = SelectKBest(score_func=mutual_info_classif, k=10)
X_selected_mi = selector_mi.fit_transform(X, y)
Chi-Square (Categorical)
from sklearn.feature_selection import chi2
# For non-negative features only
selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X, y)
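Chi-square assumes count-like, non-negative inputs, so continuous features need rescaling first. A minimal sketch, assuming X is a numeric DataFrame; note that applying chi-square to scaled continuous values is only a rough workaround, since the test is designed for frequencies:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
# Scale every feature into [0, 1] so all values are non-negative
X_nonneg = MinMaxScaler().fit_transform(X)
selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X_nonneg, y)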
Wrapper Methods
Use a model to evaluate feature subsets.
Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier
# Basic RFE
model = RandomForestClassifier(n_estimators=100)
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X, y)
selected = X.columns[rfe.support_]
ranking = pd.DataFrame({
    'feature': X.columns,
    'ranking': rfe.ranking_
}).sort_values('ranking')
# RFE with cross-validation (finds optimal number)
rfecv = RFECV(estimator=model, step=1, cv=5, scoring='accuracy')
rfecv.fit(X, y)
print(f"Optimal features: {rfecv.n_features_}")
Forward/Backward Selection
from mlxtend.feature_selection import SequentialFeatureSelector
# Forward selection (add features one by one)
sfs = SequentialFeatureSelector(
    model,
    k_features=10,
    forward=True,
    scoring='accuracy',
    cv=5
)
sfs.fit(X, y)
selected_features = list(sfs.k_feature_names_)
# Backward selection (remove features one by one)
sbs = SequentialFeatureSelector(
    model,
    k_features=10,
    forward=False,  # Backward
    scoring='accuracy',
    cv=5
)
sbs.fit(X, y)
Embedded Methods
Feature selection during model training.
L1 Regularization (Lasso)
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
# Lasso automatically zeros out irrelevant features
lasso = LassoCV(cv=5)
lasso.fit(X, y)
# Get non-zero features
selected = X.columns[lasso.coef_ != 0]
print(f"Selected {len(selected)} features")
# Or use SelectFromModel
selector = SelectFromModel(lasso, prefit=True)
X_selected = selector.transform(X)
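LassoCV fits a regression model, while most of this section assumes a classification target. For classification, a comparable embedded selector is L1-penalized logistic regression; a minimal sketch, where the C value is an assumption you would tune:
from sklearn.linear_model import LogisticRegression
# The L1 penalty drives weak coefficients to exactly zero, as Lasso does for regression
logreg = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
logreg.fit(X, y)
selected = X.columns[(logreg.coef_ != 0).any(axis=0)]  # works for binary and multiclass
print(f"Selected {len(selected)} features")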
Tree-Based Importance
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Random Forest feature importance
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X, y)
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
# Select top features
top_n = 15
selected_features = importances.head(top_n)['feature'].tolist()
# Plot
import matplotlib.pyplot as plt
plt.barh(importances['feature'][:20], importances['importance'][:20])
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.gca().invert_yaxis()
plt.show()
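Rather than hand-picking a top-N cutoff, SelectFromModel can threshold on importance directly; a minimal sketch reusing the fitted rf above, where threshold='median' (keeping roughly half the features) is an assumption, not a recommendation:
from sklearn.feature_selection import SelectFromModel
# Keep features whose importance is at least the median importance
sfm = SelectFromModel(rf, threshold='median', prefit=True)
X_selected = sfm.transform(X)
selected_features = X.columns[sfm.get_support()]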
Permutation Importance
from sklearn.inspection import permutation_importance
# Less biased than impurity-based importance, which favors high-cardinality features;
# evaluate on held-out data to measure real predictive value
result = permutation_importance(
    rf, X_test, y_test,
    n_repeats=10,
    random_state=42
)
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': result.importances_mean,
    'std': result.importances_std
}).sort_values('importance', ascending=False)
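As a hedged follow-up, one simple selection rule is to keep features whose mean importance exceeds its run-to-run standard deviation; this signal-over-noise cut is an illustrative heuristic, not a standard threshold:
# Keep features whose mean score drop clearly exceeds its variability across repeats
selected = importances.loc[
    importances['importance'] > importances['std'], 'feature'
].tolist()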
Comparison of Methods
| Method | Speed | Considers Feature Interactions | Requires Model |
|---|---|---|---|
| Variance Threshold | Fast | No | No |
| Correlation | Fast | Pairwise only | No |
| Statistical Tests | Fast | No | No |
| RFE | Slow | Yes | Yes |
| Forward/Backward | Very Slow | Yes | Yes |
| L1 Regularization | Medium | Some | Yes |
| Tree Importance | Medium | Yes | Yes |
Practical Workflow
def feature_selection_pipeline(X, y, n_features=20):
    # 1. Remove near-constant features
    var_selector = VarianceThreshold(threshold=0.01)
    X_var = var_selector.fit_transform(X)
    print(f"After variance filter: {X_var.shape[1]} features")
    # 2. Remove highly correlated features
    X_df = pd.DataFrame(X_var)
    corr_matrix = X_df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]
    X_uncorr = X_df.drop(columns=to_drop)
    print(f"After correlation filter: {X_uncorr.shape[1]} features")
    # 3. Statistical filter
    selector = SelectKBest(score_func=mutual_info_classif, k=min(50, X_uncorr.shape[1]))
    X_stat = selector.fit_transform(X_uncorr, y)
    print(f"After statistical filter: {X_stat.shape[1]} features")
    # 4. Model-based selection
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(estimator=rf, n_features_to_select=n_features)
    X_final = rfe.fit_transform(X_stat, y)
    print(f"Final: {X_final.shape[1]} features")
    # Note: the returned mask indexes the post-filter columns, not the original X
    return X_final, rfe.support_
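A hypothetical call, assuming X is a numeric pandas DataFrame and y a label Series:
X_selected, final_mask = feature_selection_pipeline(X, y, n_features=20)
# final_mask indexes the columns that survived the earlier filters, not X itself
print(f"Selected feature matrix shape: {X_selected.shape}")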
Common Pitfalls
1. Data Leakage
# WRONG: Feature selection on entire dataset
selector.fit(X, y)
X_train_selected = selector.transform(X_train) # Leakage!
# CORRECT: Only fit on training data
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test) # Same features
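During cross-validation, wrapping the selector and the model in a single Pipeline gives the same guarantee automatically: the selector is refit on each fold's training split. A minimal sketch, reusing estimators introduced above:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Selection happens inside each fold, so validation rows never influence it
pipe = Pipeline([
    ('select', SelectKBest(score_func=f_classif, k=10)),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42)),
])
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')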
2. Ignoring Feature Interactions
# Features may be useless alone but powerful together
# Solution: Use wrapper or embedded methods
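A hedged illustration on synthetic data: with an XOR-style target, each feature is statistically independent of the label on its own, so univariate filter scores come out near zero even though the two features jointly determine the label:
import numpy as np
from sklearn.feature_selection import f_classif
rng = np.random.default_rng(0)
x1 = rng.integers(0, 2, size=1000)
x2 = rng.integers(0, 2, size=1000)
y_xor = x1 ^ x2  # fully determined by (x1, x2), independent of each alone
F, p = f_classif(np.column_stack([x1, x2]), y_xor)
print(F)  # both F-scores near zero; a univariate filter would discard both features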
3. Over-selecting
# Too few features can hurt performance
# Use cross-validation to find optimal number
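A minimal sketch of doing that with a grid search over k, reusing the SelectKBest + RandomForest pipeline from the leakage example; the candidate values for k are assumptions:
from sklearn.model_selection import GridSearchCV
# Cross-validated search over how many features to keep
grid = GridSearchCV(
    pipe,
    param_grid={'select__k': [5, 10, 20, 40]},
    cv=5,
    scoring='accuracy',
)
grid.fit(X_train, y_train)
print(grid.best_params_)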
Key Takeaways
- Filter methods are fast but don't consider feature interactions
- Wrapper methods are thorough but computationally expensive
- Embedded methods balance speed and effectiveness
- Always perform feature selection on training data only
- Use cross-validation to determine optimal number of features
- Combine multiple methods for robust selection (one way is sketched below)
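One hedged way to combine methods, sketched with illustrative variable names from the examples above: count how many methods selected each feature and keep those chosen by a majority:
from collections import Counter
# Hypothetical selections from a filter, a wrapper, and an embedded method
filter_set = set(top_features)                       # correlation with target
wrapper_set = set(X.columns[rfe.support_])           # RFE
embedded_set = set(importances.head(15)['feature'])  # importance ranking
votes = Counter()
for feature_set in (filter_set, wrapper_set, embedded_set):
    votes.update(feature_set)
# Keep features selected by at least two of the three methods
robust_features = [f for f, v in votes.items() if v >= 2]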