bookworm-smart-assistant/scripts/archive/p1-benchmark.js

#!/usr/bin/env node
/**
 * P1 基准测试 — 对比 P0 基线 vs P1 (分段 IDF + reranking) 的路由质量
 *
 * 测试指标:
 *   1. 置信度分布 (top-1 平均置信度)
 *   2. Top-1/Top-2 gap (区分度)
 *   3. 消歧命中率
 *   4. 冷启动触发频率
 *   5. Core keyword 覆盖率 (P1-B 精排指标)
 *
 * 用法: node scripts/p1-benchmark.js
 */

const fs = require('fs');
const path = require('path');
const ra = require('./route-analyzer.js');
const rt = require('./route-telemetry.js');

// Location of the skills index: one directory above this script's directory.
// NOTE(review): confirm this relative location still holds if the script has been moved.
const INDEX_FILE = path.join(__dirname, '..', 'skills-index.json');

/**
 * Read and parse the skills index, terminating the process with exit code 1
 * and a one-line message when the file is missing, unreadable, not valid
 * JSON, or has no `skills` array (the benchmark loop iterates `index.skills`).
 *
 * @param {string} file - Absolute path of the index file.
 * @returns {object} The parsed index object.
 */
function loadSkillsIndex(file) {
  let raw;
  try {
    // Read directly instead of existsSync + read, so there is no window
    // in which the file can disappear between the check and the read.
    raw = fs.readFileSync(file, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      console.error('缺少 skills-index.json');
    } else {
      console.error(`无法读取 skills-index.json: ${err.message}`);
    }
    process.exit(1);
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    console.error(`skills-index.json 不是合法的 JSON: ${err.message}`);
    process.exit(1);
  }

  if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.skills)) {
    console.error('skills-index.json 缺少 skills 数组');
    process.exit(1);
  }
  return parsed;
}

const index = loadSkillsIndex(INDEX_FILE);

// Test cases: cover the different domains plus ambiguous (disambiguation) scenarios.
// Each row is [query, expected top-1 skill name]; rows are expanded into
// { query, expected } objects at the end.
const testCases = [
  // domain: development
  ['React component performance optimization', 'frontend-expert'],
  ['debug memory leak in Node.js API', 'debugger-expert'],
  ['write a Python web scraper', 'browser-automation-expert'],
  ['build REST API with Express', 'backend-builder'],
  // domain: architecture
  ['optimize slow SQL queries with indexes', 'database-tuning-expert'],
  ['deploy microservices to Kubernetes', 'cloud-native-expert'],
  ['system design for high availability', 'architect-expert'],
  // domain: devops
  ['setup CI/CD pipeline with Docker', 'devops-expert'],
  ['fix Git merge conflict on feature branch', 'git-operation-master'],
  // domain: quality
  ['write unit tests for React components', 'tester-expert'],
  ['code review this pull request', 'reviewer-expert'],
  // domain: ai-data
  ['fine-tune LLM with LoRA', 'ai-ml-expert'],
  ['build ETL pipeline with Spark', 'data-engineer-expert'],
  // domain: business
  ['撰写商业计划书融资 BP', 'business-plan-skill'],
  ['SaaS pricing model freemium', 'pricing-strategist'],
  // domain: content
  ['write API documentation README', 'tech-writer-expert'],
  ['SEO optimize sitemap robots.txt', 'technical-seo-expert'],
  // domain: imported-development (10 new skills)
  ['TypeScript generic conditional types utility', 'typescript-pro'],
  ['Python async await asyncio coroutine patterns', 'python-pro'],
  ['Go goroutine channel concurrency patterns', 'golang-pro'],
  ['Rust ownership borrow checker lifetime annotations', 'rust-engineer'],
  ['Angular standalone components Signals RxJS', 'angular-architect'],
  ['Vue 3 Composition API Pinia state management', 'vue-expert'],
  ['Next.js App Router Server Components RSC', 'nextjs-developer'],
  ['Flutter Widget Riverpod state management Dart', 'flutter-expert'],
  ['Swift SwiftUI async await Core Data iOS', 'swift-expert'],
  ['WebSocket Socket.IO real-time heartbeat reconnect', 'websocket-engineer'],
  // domain: imported-architecture (3 new skills)
  ['design REST API OpenAPI 3.1 specification versioning', 'api-designer'],
  ['GraphQL schema resolver subscription Apollo DataLoader', 'graphql-architect'],
  ['AWS Lambda serverless cloud architecture FinOps', 'cloud-architect'],
  // domain: imported-devops (2 new skills)
  ['Kubernetes Helm chart RBAC NetworkPolicy Pod', 'kubernetes-specialist'],
  ['Terraform HCL infrastructure as code modules state', 'terraform-engineer'],
  // Ambiguous scenarios (the disambiguation rules are expected to fire correctly)
  ['fix React bug component not rendering', 'debugger-expert'],
  ['review code for security vulnerabilities', 'reviewer-expert'],
  ['K8s deployment scaling strategy', 'cloud-native-expert'],
].map(([query, expected]) => ({ query, expected }));

// Run the benchmark
const bm25Params = ra.buildBM25Params(index);
const routeStats = rt.getSkillRouteStats(30);

// Running totals; the report and telemetry sections further down read these.
let correct = 0;
let totalTop1Conf = 0;
let totalGap12 = 0;
let totalDisambHits = 0;
let totalColdStart = 0;
let totalRerankBoost = 0;
let rerankCount = 0;

const results = [];

/**
 * Push one test query through the routing stages in order:
 * tokenize, score every skill, disambiguate, cold-start boost,
 * rerank (K = 10), normalize, and keep the first five entries.
 *
 * @param {{query: string, expected: string}} testCase - One benchmark case.
 * @returns {{top1: object|undefined, top2: object|undefined, firedRules: Array, boostedSkills: *}}
 *   The two best normalized candidates plus the disambiguation rules and
 *   cold-start boosted skills reported by the analyzer.
 */
function routeQuery(testCase) {
  const tokens = ra.tokenize(testCase.query);

  // Scoring: one entry per skill, score rounded to two decimals, best first.
  const ranked = index.skills.map((skill) => {
    const scoring = ra.scoreSkill(skill, tokens, bm25Params);
    return {
      name: skill.name,
      score: Math.round(scoring.totalScore * 100) / 100,
      matchedKeywords: scoring.matchedKeywords,
    };
  });
  ranked.sort((left, right) => right.score - left.score);

  // Disambiguation
  const disambiguation = ra.applyDisambiguation(ranked, testCase.query, index);
  const disamb = disambiguation.results;
  const firedRules = disambiguation.firedRules;

  // Cold start: only the list of boosted skills is read from the return value;
  // the rerank stage below is fed the disambiguated list.
  // NOTE(review): whether applyColdStartBoost mutates its input is not visible here.
  const coldStart = ra.applyColdStartBoost(disamb, routeStats);
  const boostedSkills = coldStart.boostedSkills;

  // Reranking, then re-sort by score (descending)
  const reranked = ra.rerankTopK(disamb, tokens, index, 10);
  reranked.sort((left, right) => right.score - left.score);

  // Normalization
  const topFive = ra.normalizeScores(reranked).slice(0, 5);

  return { top1: topFive[0], top2: topFive[1], firedRules, boostedSkills };
}

for (const testCase of testCases) {
  const { top1, top2, firedRules, boostedSkills } = routeQuery(testCase);

  const isCorrect = Boolean(top1) && top1.name === testCase.expected;
  const conf = top1 ? top1.confidence : 0;
  const runnerUpConf = top2 ? top2.confidence : 0;
  const gap = conf - runnerUpConf;

  if (isCorrect) {
    correct += 1;
  }
  totalTop1Conf += conf;
  totalGap12 += gap;
  if (firedRules.length > 0) {
    totalDisambHits += 1;
  }
  if (boostedSkills && boostedSkills.length > 0) {
    totalColdStart += 1;
  }

  // Rerank-boost statistics: only counted when the winner carries a truthy _rerankBoost
  if (top1 && top1._rerankBoost) {
    totalRerankBoost += top1._rerankBoost;
    rerankCount += 1;
  }

  results.push({
    status: isCorrect ? '✓' : '✗',
    query: testCase.query.slice(0, 40),
    expected: testCase.expected,
    actual: top1 ? top1.name : 'none',
    conf,
    gap,
    rules: firedRules,
  });
}

// Print the report
const n = testCases.length;

/** Right-pad a summary row to 59 characters and append the closing box border. */
const summaryRow = (text) => text.padEnd(59) + '║';

/** Express count / n as a percentage string with one decimal place. */
const percentOfCases = (count) => (count / n * 100).toFixed(1);

console.log('╔══════════════════════════════════════════════════════════╗');
console.log('║          P1 路由质量基准测试报告                         ║');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log('');

// One line per test case; failed cases also show the skill that actually ranked first.
results.forEach((entry) => {
  let line = `  ${entry.status} ${entry.query.padEnd(42)} → ${entry.expected}`;
  if (entry.status === '✗') {
    line += ` ← WRONG (got: ${entry.actual})`;
  }
  console.log(line);
  if (entry.rules.length > 0) {
    console.log(`    rules: ${entry.rules.join(', ')}`);
  }
});

console.log('');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log(summaryRow(`║  准确率:       ${correct}/${n} (${percentOfCases(correct)}%)`));
console.log(summaryRow(`║  平均置信度:   ${(totalTop1Conf / n).toFixed(3)}`));
console.log(summaryRow(`║  平均 gap1-2:  ${(totalGap12 / n).toFixed(3)}`));
console.log(summaryRow(`║  消歧命中:     ${totalDisambHits}/${n} (${percentOfCases(totalDisambHits)}%)`));
console.log(summaryRow(`║  冷启动触发:   ${totalColdStart}/${n} (${percentOfCases(totalColdStart)}%)`));
// The rerank row is shown only when at least one top-1 result carried a rerank boost.
if (rerankCount > 0) {
  console.log(summaryRow(`║  平均 rerank:  ×${(totalRerankBoost / rerankCount).toFixed(3)}`));
}
console.log('╚══════════════════════════════════════════════════════════╝');

// Write telemetry: append one JSON line summarising this run to
// <script dir>/../debug/route-metrics.jsonl.
const benchmarkEntry = {
  ts: new Date().toISOString(),
  type: 'p1-benchmark',
  accuracy: correct / n,
  avgConfidence: totalTop1Conf / n,
  avgGap12: totalGap12 / n,
  disambHitRate: totalDisambHits / n,
  coldStartRate: totalColdStart / n,
  avgRerankBoost: rerankCount > 0 ? totalRerankBoost / rerankCount : 0,
  testCount: n,
};
try {
  const debugDir = path.join(__dirname, '..', 'debug');
  // With recursive: true, mkdirSync does not fail when the directory already
  // exists, so no separate existence check is needed.
  fs.mkdirSync(debugDir, { recursive: true });
  fs.appendFileSync(path.join(debugDir, 'route-metrics.jsonl'), JSON.stringify(benchmarkEntry) + '\n');
  console.log('\n基准数据已追加到 debug/route-metrics.jsonl');
} catch (err) {
  // Telemetry stays best-effort: a failed write must not change the exit
  // status, but it is reported instead of being silently dropped.
  console.warn(`\n基准数据写入失败 (已忽略): ${err.message}`);
}

// Exit 0 only when every test case routed to its expected skill.
process.exit(correct === n ? 0 : 1);