268 lines
6.2 KiB
Markdown
268 lines
6.2 KiB
Markdown
|
|
# scikit-learn 机器学习指南
|
||
|
|
|
||
|
|
## 数据预处理
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
|
||
|
|
from sklearn.impute import SimpleImputer
|
||
|
|
|
||
|
|
# 标准化
|
||
|
|
scaler = StandardScaler()
|
||
|
|
X_scaled = scaler.fit_transform(X_train)
|
||
|
|
X_test_scaled = scaler.transform(X_test)
|
||
|
|
|
||
|
|
# 归一化
|
||
|
|
scaler = MinMaxScaler()
|
||
|
|
X_normalized = scaler.fit_transform(X)
|
||
|
|
|
||
|
|
# 缺失值填充
|
||
|
|
imputer = SimpleImputer(strategy='median') # mean, most_frequent
|
||
|
|
X_imputed = imputer.fit_transform(X)
|
||
|
|
|
||
|
|
# 标签编码
|
||
|
|
le = LabelEncoder()
|
||
|
|
y_encoded = le.fit_transform(y)
|
||
|
|
|
||
|
|
# One-Hot 编码
|
||
|
|
from sklearn.preprocessing import OneHotEncoder
|
||
|
|
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
|
||
|
|
X_encoded = encoder.fit_transform(X[['category']])
|
||
|
|
```
|
||
|
|
|
||
|
|
## 特征工程
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.feature_selection import SelectKBest, f_classif, RFE
|
||
|
|
from sklearn.decomposition import PCA
|
||
|
|
|
||
|
|
# 单变量特征选择
|
||
|
|
selector = SelectKBest(f_classif, k=10)
|
||
|
|
X_selected = selector.fit_transform(X, y)
|
||
|
|
selected_features = X.columns[selector.get_support()]
|
||
|
|
|
||
|
|
# 递归特征消除
|
||
|
|
from sklearn.ensemble import RandomForestClassifier
|
||
|
|
rfe = RFE(RandomForestClassifier(), n_features_to_select=10)
|
||
|
|
X_rfe = rfe.fit_transform(X, y)
|
||
|
|
|
||
|
|
# PCA 降维
|
||
|
|
pca = PCA(n_components=0.95) # 保留95%方差
|
||
|
|
X_pca = pca.fit_transform(X)
|
||
|
|
print(f"降维后维度: {X_pca.shape[1]}")
|
||
|
|
```
|
||
|
|
|
||
|
|
## 模型训练
|
||
|
|
|
||
|
|
### 分类
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.linear_model import LogisticRegression
|
||
|
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||
|
|
from sklearn.svm import SVC
|
||
|
|
from sklearn.neighbors import KNeighborsClassifier
|
||
|
|
|
||
|
|
# 逻辑回归
|
||
|
|
lr = LogisticRegression(max_iter=1000, C=1.0)
|
||
|
|
lr.fit(X_train, y_train)
|
||
|
|
|
||
|
|
# 随机森林
|
||
|
|
rf = RandomForestClassifier(
|
||
|
|
n_estimators=100,
|
||
|
|
max_depth=10,
|
||
|
|
min_samples_split=5,
|
||
|
|
random_state=42,
|
||
|
|
n_jobs=-1
|
||
|
|
)
|
||
|
|
rf.fit(X_train, y_train)
|
||
|
|
|
||
|
|
# 梯度提升
|
||
|
|
gb = GradientBoostingClassifier(
|
||
|
|
n_estimators=100,
|
||
|
|
learning_rate=0.1,
|
||
|
|
max_depth=5
|
||
|
|
)
|
||
|
|
gb.fit(X_train, y_train)
|
||
|
|
```
|
||
|
|
|
||
|
|
### 回归
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
|
||
|
|
from sklearn.ensemble import RandomForestRegressor
|
||
|
|
|
||
|
|
# 线性回归
|
||
|
|
lr = LinearRegression()
|
||
|
|
lr.fit(X_train, y_train)
|
||
|
|
|
||
|
|
# Ridge 回归
|
||
|
|
ridge = Ridge(alpha=1.0)
|
||
|
|
ridge.fit(X_train, y_train)
|
||
|
|
|
||
|
|
# Lasso 回归
|
||
|
|
lasso = Lasso(alpha=0.1)
|
||
|
|
lasso.fit(X_train, y_train)
|
||
|
|
|
||
|
|
# 随机森林回归
|
||
|
|
rf = RandomForestRegressor(n_estimators=100, random_state=42)
|
||
|
|
rf.fit(X_train, y_train)
|
||
|
|
```
|
||
|
|
|
||
|
|
### 聚类
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||
|
|
|
||
|
|
# K-Means
|
||
|
|
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
|
||
|
|
labels = kmeans.fit_predict(X)
|
||
|
|
|
||
|
|
# 肘部法则确定K
|
||
|
|
inertias = []
|
||
|
|
for k in range(1, 11):
|
||
|
|
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
|
||
|
|
    kmeans.fit(X)
|
||
|
|
    inertias.append(kmeans.inertia_)
|
||
|
|
|
||
|
|
# DBSCAN
|
||
|
|
dbscan = DBSCAN(eps=0.5, min_samples=5)
|
||
|
|
labels = dbscan.fit_predict(X)
|
||
|
|
```
|
||
|
|
|
||
|
|
## 模型评估
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.model_selection import cross_val_score, GridSearchCV
|
||
|
|
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
|
||
|
|
f1_score, roc_auc_score, confusion_matrix,
|
||
|
|
mean_squared_error, mean_absolute_error, r2_score)
|
||
|
|
|
||
|
|
# 分类评估
|
||
|
|
y_pred = model.predict(X_test)
|
||
|
|
y_proba = model.predict_proba(X_test)[:, 1]
|
||
|
|
|
||
|
|
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
|
||
|
|
print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.4f}")
|
||
|
|
print(f"Recall: {recall_score(y_test, y_pred, average='macro'):.4f}")
|
||
|
|
print(f"F1: {f1_score(y_test, y_pred, average='macro'):.4f}")
|
||
|
|
print(f"AUC: {roc_auc_score(y_test, y_proba):.4f}")
|
||
|
|
|
||
|
|
# 混淆矩阵
|
||
|
|
cm = confusion_matrix(y_test, y_pred)
|
||
|
|
|
||
|
|
# 回归评估
|
||
|
|
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
|
||
|
|
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
|
||
|
|
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
|
||
|
|
print(f"R²: {r2_score(y_test, y_pred):.4f}")
|
||
|
|
|
||
|
|
# 交叉验证
|
||
|
|
scores = cross_val_score(model, X, y, cv=5, scoring='f1_macro')
|
||
|
|
print(f"CV F1: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
|
||
|
|
```
|
||
|
|
|
||
|
|
## 超参数调优
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
|
||
|
|
|
||
|
|
# 网格搜索
|
||
|
|
param_grid = {
|
||
|
|
'n_estimators': [100, 200, 300],
|
||
|
|
'max_depth': [5, 10, 15, None],
|
||
|
|
'min_samples_split': [2, 5, 10]
|
||
|
|
}
|
||
|
|
|
||
|
|
grid_search = GridSearchCV(
|
||
|
|
RandomForestClassifier(),
|
||
|
|
param_grid,
|
||
|
|
cv=5,
|
||
|
|
scoring='f1_macro',
|
||
|
|
n_jobs=-1,
|
||
|
|
verbose=1
|
||
|
|
)
|
||
|
|
grid_search.fit(X_train, y_train)
|
||
|
|
|
||
|
|
print(f"Best params: {grid_search.best_params_}")
|
||
|
|
print(f"Best score: {grid_search.best_score_:.4f}")
|
||
|
|
|
||
|
|
# 随机搜索(大参数空间)
|
||
|
|
from scipy.stats import randint, uniform
|
||
|
|
|
||
|
|
param_dist = {
|
||
|
|
'n_estimators': randint(100, 500),
|
||
|
|
'max_depth': randint(5, 20),
|
||
|
|
'min_samples_split': randint(2, 20)
|
||
|
|
}
|
||
|
|
|
||
|
|
random_search = RandomizedSearchCV(
|
||
|
|
RandomForestClassifier(),
|
||
|
|
param_dist,
|
||
|
|
n_iter=50,
|
||
|
|
cv=5,
|
||
|
|
scoring='f1_macro',
|
||
|
|
n_jobs=-1
|
||
|
|
)
|
||
|
|
random_search.fit(X_train, y_train)
|
||
|
|
```
|
||
|
|
|
||
|
|
## XGBoost / LightGBM
|
||
|
|
|
||
|
|
```python
|
||
|
|
import xgboost as xgb
|
||
|
|
import lightgbm as lgb
import pandas as pd
|
||
|
|
|
||
|
|
# XGBoost
|
||
|
|
xgb_model = xgb.XGBClassifier(
|
||
|
|
n_estimators=100,
|
||
|
|
max_depth=6,
|
||
|
|
learning_rate=0.1,
|
||
|
|
subsample=0.8,
|
||
|
|
colsample_bytree=0.8,
|
||
|
|
early_stopping_rounds=10,
|
||
|
|
eval_metric='logloss'
|
||
|
|
)
|
||
|
|
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
|
||
|
|
|
||
|
|
# LightGBM
|
||
|
|
lgb_model = lgb.LGBMClassifier(
|
||
|
|
n_estimators=100,
|
||
|
|
max_depth=6,
|
||
|
|
learning_rate=0.1,
|
||
|
|
num_leaves=31,
|
||
|
|
subsample=0.8,
subsample_freq=1,  # subsample 仅在 subsample_freq > 0 时生效
|
||
|
|
colsample_bytree=0.8
|
||
|
|
)
|
||
|
|
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(10)])
|
||
|
|
|
||
|
|
# 特征重要性
|
||
|
|
importance = pd.DataFrame({
|
||
|
|
'feature': X.columns,
|
||
|
|
'importance': xgb_model.feature_importances_
|
||
|
|
}).sort_values('importance', ascending=False)
|
||
|
|
```
|
||
|
|
|
||
|
|
## Pipeline
|
||
|
|
|
||
|
|
```python
|
||
|
|
from sklearn.pipeline import Pipeline
|
||
|
|
from sklearn.compose import ColumnTransformer
|
||
|
|
|
||
|
|
# 数值和类别特征分别处理
|
||
|
|
numeric_features = ['age', 'income']
|
||
|
|
categorical_features = ['gender', 'city']
|
||
|
|
|
||
|
|
preprocessor = ColumnTransformer([
|
||
|
|
('num', StandardScaler(), numeric_features),
|
||
|
|
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
|
||
|
|
])
|
||
|
|
|
||
|
|
# 完整 Pipeline
|
||
|
|
pipeline = Pipeline([
|
||
|
|
('preprocessor', preprocessor),
|
||
|
|
('classifier', RandomForestClassifier())
|
||
|
|
])
|
||
|
|
|
||
|
|
pipeline.fit(X_train, y_train)
|
||
|
|
y_pred = pipeline.predict(X_test)
|
||
|
|
```
|