# scikit-learn 机器学习指南 ## 数据预处理 ```python from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder from sklearn.impute import SimpleImputer # 标准化 scaler = StandardScaler() X_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # 归一化 scaler = MinMaxScaler() X_normalized = scaler.fit_transform(X) # 缺失值填充 imputer = SimpleImputer(strategy='median') # mean, most_frequent X_imputed = imputer.fit_transform(X) # 标签编码 le = LabelEncoder() y_encoded = le.fit_transform(y) # One-Hot 编码 from sklearn.preprocessing import OneHotEncoder encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') X_encoded = encoder.fit_transform(X[['category']]) ``` ## 特征工程 ```python from sklearn.feature_selection import SelectKBest, f_classif, RFE from sklearn.decomposition import PCA # 单变量特征选择 selector = SelectKBest(f_classif, k=10) X_selected = selector.fit_transform(X, y) selected_features = X.columns[selector.get_support()] # 递归特征消除 from sklearn.ensemble import RandomForestClassifier rfe = RFE(RandomForestClassifier(), n_features_to_select=10) X_rfe = rfe.fit_transform(X, y) # PCA 降维 pca = PCA(n_components=0.95) # 保留95%方差 X_pca = pca.fit_transform(X) print(f"降维后维度: {X_pca.shape[1]}") ``` ## 模型训练 ### 分类 ```python from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier # 逻辑回归 lr = LogisticRegression(max_iter=1000, C=1.0) lr.fit(X_train, y_train) # 随机森林 rf = RandomForestClassifier( n_estimators=100, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1 ) rf.fit(X_train, y_train) # 梯度提升 gb = GradientBoostingClassifier( n_estimators=100, learning_rate=0.1, max_depth=5 ) gb.fit(X_train, y_train) ``` ### 回归 ```python from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet from sklearn.ensemble import RandomForestRegressor # 线性回归 lr = LinearRegression() lr.fit(X_train, y_train) # Ridge 回归 ridge = Ridge(alpha=1.0) ridge.fit(X_train, y_train) # Lasso 回归 lasso = Lasso(alpha=0.1) lasso.fit(X_train, y_train) # 随机森林回归 rf = RandomForestRegressor(n_estimators=100, random_state=42) rf.fit(X_train, y_train) ``` ### 聚类 ```python from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering # K-Means kmeans = KMeans(n_clusters=5, random_state=42, n_init=10) labels = kmeans.fit_predict(X) # 肘部法则确定K inertias = [] for k in range(1, 11): kmeans = KMeans(n_clusters=k, random_state=42) kmeans.fit(X) inertias.append(kmeans.inertia_) # DBSCAN dbscan = DBSCAN(eps=0.5, min_samples=5) labels = dbscan.fit_predict(X) ``` ## 模型评估 ```python from sklearn.model_selection import cross_val_score, GridSearchCV from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score) # 分类评估 y_pred = model.predict(X_test) y_proba = model.predict_proba(X_test)[:, 1] print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}") print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.4f}") print(f"Recall: {recall_score(y_test, y_pred, average='macro'):.4f}") print(f"F1: {f1_score(y_test, y_pred, average='macro'):.4f}") print(f"AUC: {roc_auc_score(y_test, y_proba):.4f}") # 混淆矩阵 cm = confusion_matrix(y_test, y_pred) # 回归评估 print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}") print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.4f}") print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}") print(f"R²: {r2_score(y_test, y_pred):.4f}") # 交叉验证 scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro') print(f"CV F1: {scores.mean():.4f} (+/- {scores.std()*2:.4f})") ``` ## 超参数调优 ```python from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # 网格搜索 param_grid = { 'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10] } grid_search = GridSearchCV( RandomForestClassifier(), param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1 ) grid_search.fit(X_train, y_train) print(f"Best params: {grid_search.best_params_}") print(f"Best score: {grid_search.best_score_:.4f}") # 随机搜索(大参数空间) from scipy.stats import randint, uniform param_dist = { 'n_estimators': randint(100, 500), 'max_depth': randint(5, 20), 'min_samples_split': randint(2, 20) } random_search = RandomizedSearchCV( RandomForestClassifier(), param_dist, n_iter=50, cv=5, scoring='f1_macro', n_jobs=-1 ) random_search.fit(X_train, y_train) ``` ## XGBoost / LightGBM ```python import xgboost as xgb import lightgbm as lgb # XGBoost xgb_model = xgb.XGBClassifier( n_estimators=100, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='logloss' ) xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10) # LightGBM lgb_model = lgb.LGBMClassifier( n_estimators=100, max_depth=6, learning_rate=0.1, num_leaves=31, subsample=0.8, colsample_bytree=0.8 ) lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(10)]) # 特征重要性 importance = pd.DataFrame({ 'feature': X.columns, 'importance': model.feature_importances_ }).sort_values('importance', ascending=False) ``` ## Pipeline ```python from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer # 数值和类别特征分别处理 numeric_features = ['age', 'income'] categorical_features = ['gender', 'city'] preprocessor = ColumnTransformer([ ('num', StandardScaler(), numeric_features), ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) ]) # 完整 Pipeline pipeline = Pipeline([ ('preprocessor', preprocessor), ('classifier', RandomForestClassifier()) ]) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) ```