6.2 KiB
6.2 KiB
scikit-learn 机器学习指南
数据预处理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
# 标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 归一化
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
# 缺失值填充
imputer = SimpleImputer(strategy='median') # mean, most_frequent
X_imputed = imputer.fit_transform(X)
# 标签编码
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# One-Hot 编码
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[['category']])
特征工程
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA
# 单变量特征选择
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
# 递归特征消除
from sklearn.ensemble import RandomForestClassifier
rfe = RFE(RandomForestClassifier(), n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)
# PCA 降维
pca = PCA(n_components=0.95) # 保留95%方差
X_pca = pca.fit_transform(X)
print(f"降维后维度: {X_pca.shape[1]}")
模型训练
分类
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# 逻辑回归
lr = LogisticRegression(max_iter=1000, C=1.0)
lr.fit(X_train, y_train)
# 随机森林
rf = RandomForestClassifier(
n_estimators=100,
max_depth=10,
min_samples_split=5,
random_state=42,
n_jobs=-1
)
rf.fit(X_train, y_train)
# 梯度提升
gb = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=5
)
gb.fit(X_train, y_train)
回归
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
# 线性回归
lr = LinearRegression()
lr.fit(X_train, y_train)
# Ridge 回归
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
# Lasso 回归
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# 随机森林回归
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
聚类
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
# K-Means
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
# 肘部法则确定K
inertias = []
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)
inertias.append(kmeans.inertia_)
# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)
模型评估
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix,
mean_squared_error, mean_absolute_error, r2_score)
# 分类评估
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro'):.4f}")
print(f"F1: {f1_score(y_test, y_pred, average='macro'):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_proba):.4f}")
# 混淆矩阵
cm = confusion_matrix(y_test, y_pred)
# 回归评估
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")
# 交叉验证
scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
print(f"CV F1: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
超参数调优
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# 网格搜索
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [5, 10, 15, None],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(
RandomForestClassifier(),
param_grid,
cv=5,
scoring='f1_macro',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# 随机搜索(大参数空间)
from scipy.stats import randint, uniform
param_dist = {
'n_estimators': randint(100, 500),
'max_depth': randint(5, 20),
'min_samples_split': randint(2, 20)
}
random_search = RandomizedSearchCV(
RandomForestClassifier(),
param_dist,
n_iter=50,
cv=5,
scoring='f1_macro',
n_jobs=-1
)
random_search.fit(X_train, y_train)
XGBoost / LightGBM
import xgboost as xgb
import lightgbm as lgb
# XGBoost
xgb_model = xgb.XGBClassifier(
n_estimators=100,
max_depth=6,
learning_rate=0.1,
subsample=0.8,
colsample_bytree=0.8,
use_label_encoder=False,
eval_metric='logloss'
)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10)
# LightGBM
lgb_model = lgb.LGBMClassifier(
n_estimators=100,
max_depth=6,
learning_rate=0.1,
num_leaves=31,
subsample=0.8,
colsample_bytree=0.8
)
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(10)])
# 特征重要性
importance = pd.DataFrame({
'feature': X.columns,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# 数值和类别特征分别处理
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']
preprocessor = ColumnTransformer([
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
# 完整 Pipeline
pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)