bookworm-smart-assistant/skills/ai-ml-expert/scripts/evaluate.py

#!/usr/bin/env python3
"""
AI/ML 评估工具函数
Evaluation Utility Functions
"""

import numpy as np
import torch
from typing import Dict, List, Optional, Union
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    mean_squared_error, mean_absolute_error, r2_score
)
import matplotlib.pyplot as plt
import seaborn as sns


def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray,
                          y_proba: Optional[np.ndarray] = None,
                          average: str = 'macro') -> Dict[str, float]:
    """Compute accuracy, precision, recall, F1 and (optionally) ROC AUC.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Optional predicted scores. A 1-D array is used as-is; a 2-D
            array with exactly two columns contributes its column 1 as the
            score; any other 2-D array is scored as multi-class
            one-vs-rest.
        average: Averaging mode forwarded to the precision/recall/F1 scorers
            and to the multi-class AUC.

    Returns:
        A dict with keys 'accuracy', 'precision', 'recall' and 'f1'. The key
        'auc' is added only when ``y_proba`` is given and the AUC could be
        computed.

    Warns:
        RuntimeWarning: If ``y_proba`` is given but the AUC computation
            fails; the remaining metrics are still returned.
    """
    import warnings

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average=average, zero_division=0),
        'recall': recall_score(y_true, y_pred, average=average, zero_division=0),
        'f1': f1_score(y_true, y_pred, average=average, zero_division=0)
    }

    if y_proba is None:
        return metrics

    # Accept list-like scores as well; the shape checks below need an ndarray.
    y_proba = np.asarray(y_proba)

    # AUC is best-effort: report why it is missing instead of hiding the
    # failure, but never let it discard the label-based metrics above.
    try:
        if y_proba.ndim == 1 or y_proba.shape[1] == 2:
            # Binary case
            proba = y_proba[:, 1] if y_proba.ndim == 2 else y_proba
            metrics['auc'] = roc_auc_score(y_true, proba)
        else:
            # Multi-class case
            metrics['auc'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average=average)
    except (ValueError, TypeError, IndexError) as exc:
        warnings.warn(
            f"AUC could not be computed and is omitted: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )

    return metrics


def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """Compute common regression metrics.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted target values (array-like).

    Returns:
        A dict with keys 'mse', 'rmse', 'mae', 'r2' and 'mape'. 'mape' is
        expressed in percent.
    """
    # Coerce once so list inputs support the element-wise arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Compute MSE a single time and derive RMSE from it.
    mse = mean_squared_error(y_true, y_pred)

    # Guard the MAPE denominator by magnitude. Adding an epsilon to the
    # signed value does not protect negative targets near zero (it is
    # exactly zero at y_true == -1e-8).
    denominator = np.maximum(np.abs(y_true), 1e-8)
    mape = np.mean(np.abs(y_true - y_pred) / denominator) * 100

    return {
        'mse': mse,
        'rmse': float(np.sqrt(mse)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': float(mape)
    }


def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray,
                         class_names: Optional[List[str]] = None,
                         normalize: bool = True,
                         figsize: tuple = (10, 8)) -> plt.Figure:
    """Draw a confusion-matrix heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        class_names: Optional tick labels used on both axes. When omitted,
            seaborn chooses the tick labels itself.
        normalize: If True, each row is divided by its row sum so cells
            show per-true-class fractions; otherwise raw counts are shown.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure containing the heatmap.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that occurs only in y_pred has an all-zero row; leave that
        # row at 0 instead of dividing by zero and filling it with NaN.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    # seaborn documents "auto", bool, int or list-like for tick labels; None
    # is not among them, so fall back to "auto" when no names are supplied.
    tick_labels = class_names if class_names is not None else 'auto'

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                cmap='Blues', ax=ax,
                xticklabels=tick_labels, yticklabels=tick_labels)
    ax.set_xlabel('预测标签')
    ax.set_ylabel('真实标签')
    ax.set_title('混淆矩阵' + (' (归一化)' if normalize else ''))
    # Lay out this figure explicitly rather than whichever one is current.
    fig.tight_layout()
    return fig


def plot_roc_curve(y_true: np.ndarray, y_proba: np.ndarray,
                  class_names: Optional[List[str]] = None,
                  figsize: tuple = (10, 8)) -> plt.Figure:
    """Plot ROC curve(s) together with the chance diagonal.

    A 1-D ``y_proba`` or a 2-D one with exactly two columns yields a single
    curve (column 1 is the score in the two-column case). Any other 2-D
    ``y_proba`` yields one one-vs-rest curve per column; labels are
    binarized against ``range(n_columns)``.

    Args:
        y_true: Ground-truth labels.
        y_proba: Predicted scores.
        class_names: Optional per-class legend names for the multi-class
            case; ``Class <i>`` is used when it is empty or None.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure containing the curves.
    """
    from sklearn.metrics import roc_curve, auc
    from sklearn.preprocessing import label_binarize

    fig, ax = plt.subplots(figsize=figsize)

    single_curve = y_proba.ndim == 1 or y_proba.shape[1] == 2
    if single_curve:
        # Binary case: one curve from the positive-class scores
        scores = y_proba if y_proba.ndim != 2 else y_proba[:, 1]
        fpr, tpr, _ = roc_curve(y_true, scores)
        area = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f'ROC (AUC = {area:.3f})')
    else:
        # Multi-class case: one-vs-rest curve per score column
        n_classes = y_proba.shape[1]
        one_hot = label_binarize(y_true, classes=range(n_classes))

        for cls_idx in range(n_classes):
            fpr, tpr, _ = roc_curve(one_hot[:, cls_idx], y_proba[:, cls_idx])
            area = auc(fpr, tpr)
            if class_names:
                curve_name = class_names[cls_idx]
            else:
                curve_name = f'Class {cls_idx}'
            ax.plot(fpr, tpr, label=f'{curve_name} (AUC = {area:.3f})')

    # Chance-level reference line
    ax.plot([0, 1], [0, 1], 'k--', label='随机')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC 曲线')
    ax.legend(loc='lower right')
    plt.tight_layout()
    return fig


def plot_precision_recall_curve(y_true: np.ndarray, y_proba: np.ndarray,
                               figsize: tuple = (10, 8)) -> plt.Figure:
    """Plot a precision-recall curve annotated with average precision.

    Args:
        y_true: Ground-truth labels.
        y_proba: Predicted scores. For a 2-D array, column 1 is used as the
            score; otherwise the array is used as-is.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure containing the curve.
    """
    from sklearn.metrics import precision_recall_curve, average_precision_score

    fig, ax = plt.subplots(figsize=figsize)

    if y_proba.ndim == 2:
        scores = y_proba[:, 1]
    else:
        scores = y_proba

    precision_vals, recall_vals, _ = precision_recall_curve(y_true, scores)
    avg_precision = average_precision_score(y_true, scores)

    ax.plot(recall_vals, precision_vals, label=f'PR (AP = {avg_precision:.3f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall 曲线')
    ax.legend()
    plt.tight_layout()
    return fig


def plot_learning_curves(history: Dict[str, List[float]],
                        figsize: tuple = (12, 4)) -> plt.Figure:
    """Plot training curves: loss on the left panel, accuracy on the right.

    Args:
        history: Per-epoch series keyed by name. 'train_loss' is required;
            'val_loss', 'train_acc' and 'val_acc' are optional. The accuracy
            panel is drawn only when 'train_acc' is present.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure holding both panels.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=figsize)

    def _draw_panel(ax, train_key, val_key, y_label, title):
        # Training series is mandatory for a panel; validation is optional.
        ax.plot(history[train_key], label='Train')
        if val_key in history:
            ax.plot(history[val_key], label='Validation')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()

    # Loss panel
    _draw_panel(loss_ax, 'train_loss', 'val_loss', 'Loss', 'Loss 曲线')

    # Accuracy panel (left empty when no training accuracy was recorded)
    if 'train_acc' in history:
        _draw_panel(acc_ax, 'train_acc', 'val_acc', 'Accuracy', 'Accuracy 曲线')

    plt.tight_layout()
    return fig


def plot_feature_importance(importance: np.ndarray, feature_names: List[str],
                           top_k: int = 20, figsize: tuple = (10, 8)) -> plt.Figure:
    """Draw a horizontal bar chart of the ``top_k`` most important features.

    Bars are ordered by ascending importance, so the largest value is the
    last bar drawn.

    Args:
        importance: Importance score per feature (array-like).
        feature_names: Feature names, indexed in parallel with
            ``importance``.
        top_k: Maximum number of features to show. Values larger than the
            number of features show all of them; 0 shows none.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure containing the bar chart.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Coerce so list inputs support the fancy indexing below.
    importance = np.asarray(importance)

    order = np.argsort(importance)
    # Slice from an explicit start index: order[-top_k:] would return every
    # feature when top_k == 0, because -0 == 0.
    shown = min(int(top_k), len(order))
    idx = order[len(order) - shown:]

    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(range(len(idx)), importance[idx])
    ax.set_yticks(range(len(idx)))
    ax.set_yticklabels([feature_names[i] for i in idx])
    ax.set_xlabel('重要性')
    # Report the number of bars actually drawn, not the requested maximum.
    ax.set_title(f'特征重要性 Top {shown}')
    fig.tight_layout()
    return fig


def print_classification_report(y_true: np.ndarray, y_pred: np.ndarray,
                               class_names: Optional[List[str]] = None):
    """Print a banner followed by the classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        class_names: Optional display names forwarded as ``target_names``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("分类报告")
    print(rule)
    # Report is formatted with four decimal digits.
    report_text = classification_report(
        y_true,
        y_pred,
        target_names=class_names,
        digits=4,
    )
    print(report_text)


# NER evaluation
def ner_metrics(y_true: List[List[str]], y_pred: List[List[str]]) -> Dict[str, float]:
    """Compute entity-level NER precision, recall and F1 via seqeval.

    Args:
        y_true: Gold tag sequences, one list of tags per sentence.
        y_pred: Predicted tag sequences, parallel to ``y_true``.

    Returns:
        A dict with keys 'precision', 'recall' and 'f1'.
    """
    import seqeval.metrics as seq_metrics

    # Evaluated in this order: precision, recall, f1.
    scorers = (
        ('precision', seq_metrics.precision_score),
        ('recall', seq_metrics.recall_score),
        ('f1', seq_metrics.f1_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}


# Object detection evaluation
def compute_iou(box1: np.ndarray, box2: np.ndarray) -> float:
    """Compute the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: Box indexed as (x1, y1, x2, y2), where indices 0/1 are the
            minimum corner and 2/3 the maximum corner.
        box2: Second box in the same layout.

    Returns:
        The IoU as a Python float; 0.0 when the union area is not positive.
    """
    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    intersection = overlap_w * overlap_h

    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection

    if union <= 0:
        # Degenerate (zero-area) input: keep the return type a float.
        return 0.0
    # Normalize numpy scalars to a built-in float to match the annotation.
    return float(intersection / union)


if __name__ == '__main__':
    # Smoke-test the classification metrics on a small binary example
    demo_labels = np.array([0, 1, 1, 0, 1, 0, 1, 1])
    demo_preds = np.array([0, 1, 0, 0, 1, 1, 1, 1])
    demo_scores = np.array([
        [0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1],
        [0.2, 0.8], [0.4, 0.6], [0.3, 0.7], [0.1, 0.9],
    ])

    cls_result = classification_metrics(demo_labels, demo_preds, demo_scores)
    print("分类指标:", cls_result)

    # Smoke-test the regression metrics
    reg_targets = np.array([3.0, 5.0, 2.5, 7.0])
    reg_outputs = np.array([2.8, 5.2, 2.3, 6.8])

    reg_result = regression_metrics(reg_targets, reg_outputs)
    print("回归指标:", reg_result)
Initial: Bookworm Smart Assistant v6.5.1 (byte-preserved, 809 files, fp 26b83e1b38cdf64a) 2026-04-21 17:57:05 +08:00			`#!/usr/bin/env python3`
			`"""`
			`AI/ML 评估工具函数`
			`Evaluation Utility Functions`
			`"""`

			`import numpy as np`
			`import torch`
			`from typing import Dict, List, Optional, Union`
			`from sklearn.metrics import (`
			`accuracy_score, precision_score, recall_score, f1_score,`
			`roc_auc_score, confusion_matrix, classification_report,`
			`mean_squared_error, mean_absolute_error, r2_score`
			`)`
			`import matplotlib.pyplot as plt`
			`import seaborn as sns`


			`def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray,`
			`y_proba: Optional[np.ndarray] = None,`
			`average: str = 'macro') -> Dict[str, float]:`
			`"""计算分类指标"""`
			`metrics = {`
			`'accuracy': accuracy_score(y_true, y_pred),`
			`'precision': precision_score(y_true, y_pred, average=average, zero_division=0),`
			`'recall': recall_score(y_true, y_pred, average=average, zero_division=0),`
			`'f1': f1_score(y_true, y_pred, average=average, zero_division=0)`
			`}`

			`if y_proba is not None:`
			`try:`
			`if y_proba.ndim == 1 or y_proba.shape[1] == 2:`
			`# 二分类`
			`proba = y_proba[:, 1] if y_proba.ndim == 2 else y_proba`
			`metrics['auc'] = roc_auc_score(y_true, proba)`
			`else:`
			`# 多分类`
			`metrics['auc'] = roc_auc_score(y_true, y_proba, multi_class='ovr', average=average)`
			`except Exception:`
			`pass`

			`return metrics`


			`def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:`
			`"""计算回归指标"""`
			`return {`
			`'mse': mean_squared_error(y_true, y_pred),`
			`'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),`
			`'mae': mean_absolute_error(y_true, y_pred),`
			`'r2': r2_score(y_true, y_pred),`
			`'mape': np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100`
			`}`


			`def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray,`
			`class_names: Optional[List[str]] = None,`
			`normalize: bool = True,`
			`figsize: tuple = (10, 8)) -> plt.Figure:`
			`"""绘制混淆矩阵"""`
			`cm = confusion_matrix(y_true, y_pred)`
			`if normalize:`
			`cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]`

			`fig, ax = plt.subplots(figsize=figsize)`
			`sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',`
			`cmap='Blues', ax=ax,`
			`xticklabels=class_names, yticklabels=class_names)`
			`ax.set_xlabel('预测标签')`
			`ax.set_ylabel('真实标签')`
			`ax.set_title('混淆矩阵' + (' (归一化)' if normalize else ''))`
			`plt.tight_layout()`
			`return fig`


			`def plot_roc_curve(y_true: np.ndarray, y_proba: np.ndarray,`
			`class_names: Optional[List[str]] = None,`
			`figsize: tuple = (10, 8)) -> plt.Figure:`
			`"""绘制 ROC 曲线"""`
			`from sklearn.metrics import roc_curve, auc`
			`from sklearn.preprocessing import label_binarize`

			`fig, ax = plt.subplots(figsize=figsize)`

			`if y_proba.ndim == 1 or y_proba.shape[1] == 2:`
			`# 二分类`
			`proba = y_proba[:, 1] if y_proba.ndim == 2 else y_proba`
			`fpr, tpr, _ = roc_curve(y_true, proba)`
			`roc_auc = auc(fpr, tpr)`
			`ax.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.3f})')`
			`else:`
			`# 多分类`
			`n_classes = y_proba.shape[1]`
			`y_bin = label_binarize(y_true, classes=range(n_classes))`

			`for i in range(n_classes):`
			`fpr, tpr, _ = roc_curve(y_bin[:, i], y_proba[:, i])`
			`roc_auc = auc(fpr, tpr)`
			`label = class_names[i] if class_names else f'Class {i}'`
			`ax.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.3f})')`

			`ax.plot([0, 1], [0, 1], 'k--', label='随机')`
			`ax.set_xlabel('False Positive Rate')`
			`ax.set_ylabel('True Positive Rate')`
			`ax.set_title('ROC 曲线')`
			`ax.legend(loc='lower right')`
			`plt.tight_layout()`
			`return fig`


			`def plot_precision_recall_curve(y_true: np.ndarray, y_proba: np.ndarray,`
			`figsize: tuple = (10, 8)) -> plt.Figure:`
			`"""绘制 PR 曲线"""`
			`from sklearn.metrics import precision_recall_curve, average_precision_score`

			`fig, ax = plt.subplots(figsize=figsize)`

			`proba = y_proba[:, 1] if y_proba.ndim == 2 else y_proba`
			`precision, recall, _ = precision_recall_curve(y_true, proba)`
			`ap = average_precision_score(y_true, proba)`

			`ax.plot(recall, precision, label=f'PR (AP = {ap:.3f})')`
			`ax.set_xlabel('Recall')`
			`ax.set_ylabel('Precision')`
			`ax.set_title('Precision-Recall 曲线')`
			`ax.legend()`
			`plt.tight_layout()`
			`return fig`


			`def plot_learning_curves(history: Dict[str, List[float]],`
			`figsize: tuple = (12, 4)) -> plt.Figure:`
			`"""绘制学习曲线"""`
			`fig, axes = plt.subplots(1, 2, figsize=figsize)`

			`# Loss`
			`axes[0].plot(history['train_loss'], label='Train')`
			`if 'val_loss' in history:`
			`axes[0].plot(history['val_loss'], label='Validation')`
			`axes[0].set_xlabel('Epoch')`
			`axes[0].set_ylabel('Loss')`
			`axes[0].set_title('Loss 曲线')`
			`axes[0].legend()`

			`# Accuracy`
			`if 'train_acc' in history:`
			`axes[1].plot(history['train_acc'], label='Train')`
			`if 'val_acc' in history:`
			`axes[1].plot(history['val_acc'], label='Validation')`
			`axes[1].set_xlabel('Epoch')`
			`axes[1].set_ylabel('Accuracy')`
			`axes[1].set_title('Accuracy 曲线')`
			`axes[1].legend()`

			`plt.tight_layout()`
			`return fig`


			`def plot_feature_importance(importance: np.ndarray, feature_names: List[str],`
			`top_k: int = 20, figsize: tuple = (10, 8)) -> plt.Figure:`
			`"""绘制特征重要性"""`
			`idx = np.argsort(importance)[-top_k:]`

			`fig, ax = plt.subplots(figsize=figsize)`
			`ax.barh(range(len(idx)), importance[idx])`
			`ax.set_yticks(range(len(idx)))`
			`ax.set_yticklabels([feature_names[i] for i in idx])`
			`ax.set_xlabel('重要性')`
			`ax.set_title(f'特征重要性 Top {top_k}')`
			`plt.tight_layout()`
			`return fig`


			`def print_classification_report(y_true: np.ndarray, y_pred: np.ndarray,`
			`class_names: Optional[List[str]] = None):`
			`"""打印分类报告"""`
			`print("\n" + "="*60)`
			`print("分类报告")`
			`print("="*60)`
			`print(classification_report(y_true, y_pred, target_names=class_names, digits=4))`


			`# NER 评估`
			`def ner_metrics(y_true: List[List[str]], y_pred: List[List[str]]) -> Dict[str, float]:`
			`"""NER 实体级别评估"""`
			`from seqeval.metrics import f1_score as seq_f1, precision_score as seq_p, recall_score as seq_r`

			`return {`
			`'precision': seq_p(y_true, y_pred),`
			`'recall': seq_r(y_true, y_pred),`
			`'f1': seq_f1(y_true, y_pred)`
			`}`


			`# 目标检测评估`
			`def compute_iou(box1: np.ndarray, box2: np.ndarray) -> float:`
			`"""计算 IoU"""`
			`x1 = max(box1[0], box2[0])`
			`y1 = max(box1[1], box2[1])`
			`x2 = min(box1[2], box2[2])`
			`y2 = min(box1[3], box2[3])`

			`intersection = max(0, x2 - x1) * max(0, y2 - y1)`
			`area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])`
			`area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])`
			`union = area1 + area2 - intersection`

			`return intersection / union if union > 0 else 0`


			`if __name__ == '__main__':`
			`# 测试分类指标`
			`y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])`
			`y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1])`
			`y_proba = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1],`
			`[0.2, 0.8], [0.4, 0.6], [0.3, 0.7], [0.1, 0.9]])`

			`metrics = classification_metrics(y_true, y_pred, y_proba)`
			`print("分类指标:", metrics)`

			`# 测试回归指标`
			`y_true_reg = np.array([3.0, 5.0, 2.5, 7.0])`
			`y_pred_reg = np.array([2.8, 5.2, 2.3, 6.8])`

			`metrics_reg = regression_metrics(y_true_reg, y_pred_reg)`
			`print("回归指标:", metrics_reg)`