Le problème sans pipelines
# WRONG: the scaler is fitted on the full dataset before cross-validation (data leakage!)
scaler.fit(X)  # should only be fitted on the training split
X_scaled = scaler.transform(X)
# LEAKS: every validation fold was already seen by the scaler when it was fitted above
cross_val_score(model, X_scaled, y)
La bonne approche
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Bundle the preprocessing steps and the estimator into one object so that
# cross-validation fits all of them together on each training fold.
pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', LGBMClassifier()),
    ]
)
# CORRECT: the raw X is passed in, so the preprocessing is only ever fitted
# on the training part of each of the 5 folds.
cross_val_score(pipe, X, y, cv=5)
class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log1p(|x|)`` element-wise.

    Subclasses ``BaseEstimator`` and ``TransformerMixin`` and implements the
    ``fit`` / ``transform`` pair. Nothing is learned during ``fit``.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return ``np.log1p(np.abs(X))``.

        The absolute value is taken before the logarithm, so negative inputs
        do not produce NaN, but their sign is discarded (``-3`` and ``3`` map
        to the same output).
        NOTE(review): confirm that dropping the sign is intended.
        """
        return np.log1p(np.abs(X))