bookworm-smart-assistant/scripts/archive/p1-benchmark.js

185 lines
8.5 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* P1 基准测试 — 对比 P0 基线 vs P1 (分段 IDF + reranking) 的路由质量
*
* 测试指标:
* 1. 置信度分布 (top-1 平均置信度)
* 2. Top-1/Top-2 gap (区分度)
* 3. 消歧命中率
* 4. 冷启动触发频率
* 5. Core keyword 覆盖率 (P1-B 精排指标)
*
* 用法: node scripts/p1-benchmark.js
*/
const fs = require('fs');
const path = require('path');
const ra = require('./route-analyzer.js');
const rt = require('./route-telemetry.js');
const INDEX_FILE = path.join(__dirname, '..', 'skills-index.json');

/**
 * Read and parse the skills index that the benchmark scores queries against.
 *
 * Any failure (missing file, unreadable file, invalid JSON, or no `skills`
 * array) prints a short message to stderr and terminates the process with
 * exit code 1, so the script never continues with an unusable index.
 *
 * @param {string} file - Absolute path of the JSON index file.
 * @returns {{ skills: Array<object> }} The parsed index object.
 */
function loadSkillsIndex(file) {
  let raw;
  try {
    // Read directly instead of existsSync + read: one syscall, no window in
    // which the file can disappear between the check and the read.
    raw = fs.readFileSync(file, 'utf8');
  } catch (err) {
    if (err.code === 'ENOENT') {
      console.error('缺少 skills-index.json');
    } else {
      console.error(`无法读取 skills-index.json: ${err.message}`);
    }
    process.exit(1);
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    console.error(`skills-index.json 不是合法的 JSON: ${err.message}`);
    process.exit(1);
  }

  // The scoring loop calls index.skills.map(...), so fail early and clearly
  // if that array is absent rather than with a TypeError mid-benchmark.
  if (!parsed || !Array.isArray(parsed.skills)) {
    console.error('skills-index.json 缺少 skills 数组');
    process.exit(1);
  }
  return parsed;
}

const index = loadSkillsIndex(INDEX_FILE);
// Benchmark cases: each row is [query, name of the skill expected at rank 1].
// Rows cover the different skill domains plus a few ambiguous queries, and are
// expanded below into { query, expected } objects.
const testCases = [
  // domain: development
  ['React component performance optimization', 'frontend-expert'],
  ['debug memory leak in Node.js API', 'debugger-expert'],
  ['write a Python web scraper', 'browser-automation-expert'],
  ['build REST API with Express', 'backend-builder'],
  // domain: architecture
  ['optimize slow SQL queries with indexes', 'database-tuning-expert'],
  ['deploy microservices to Kubernetes', 'cloud-native-expert'],
  ['system design for high availability', 'architect-expert'],
  // domain: devops
  ['setup CI/CD pipeline with Docker', 'devops-expert'],
  ['fix Git merge conflict on feature branch', 'git-operation-master'],
  // domain: quality
  ['write unit tests for React components', 'tester-expert'],
  ['code review this pull request', 'reviewer-expert'],
  // domain: ai-data
  ['fine-tune LLM with LoRA', 'ai-ml-expert'],
  ['build ETL pipeline with Spark', 'data-engineer-expert'],
  // domain: business
  ['撰写商业计划书融资 BP', 'business-plan-skill'],
  ['SaaS pricing model freemium', 'pricing-strategist'],
  // domain: content
  ['write API documentation README', 'tech-writer-expert'],
  ['SEO optimize sitemap robots.txt', 'technical-seo-expert'],
  // domain: imported-development (10 new skills)
  ['TypeScript generic conditional types utility', 'typescript-pro'],
  ['Python async await asyncio coroutine patterns', 'python-pro'],
  ['Go goroutine channel concurrency patterns', 'golang-pro'],
  ['Rust ownership borrow checker lifetime annotations', 'rust-engineer'],
  ['Angular standalone components Signals RxJS', 'angular-architect'],
  ['Vue 3 Composition API Pinia state management', 'vue-expert'],
  ['Next.js App Router Server Components RSC', 'nextjs-developer'],
  ['Flutter Widget Riverpod state management Dart', 'flutter-expert'],
  ['Swift SwiftUI async await Core Data iOS', 'swift-expert'],
  ['WebSocket Socket.IO real-time heartbeat reconnect', 'websocket-engineer'],
  // domain: imported-architecture (3 new skills)
  ['design REST API OpenAPI 3.1 specification versioning', 'api-designer'],
  ['GraphQL schema resolver subscription Apollo DataLoader', 'graphql-architect'],
  ['AWS Lambda serverless cloud architecture FinOps', 'cloud-architect'],
  // domain: imported-devops (2 new skills)
  ['Kubernetes Helm chart RBAC NetworkPolicy Pod', 'kubernetes-specialist'],
  ['Terraform HCL infrastructure as code modules state', 'terraform-engineer'],
  // Ambiguous queries (the disambiguation rules are expected to fire here)
  ['fix React bug component not rendering', 'debugger-expert'],
  ['review code for security vulnerabilities', 'reviewer-expert'],
  ['K8s deployment scaling strategy', 'cloud-native-expert'],
].map(([query, expected]) => ({ query, expected }));
// Run the benchmark: push every test query through the routing pipeline
// (score -> disambiguate -> cold-start boost -> rerank -> normalize) and
// accumulate the quality metrics reported further down.
const bm25Params = ra.buildBM25Params(index);
// NOTE(review): 30 is presumably a look-back window in days — confirm in route-telemetry.
const routeStats = rt.getSkillRouteStats(30);

// Running totals across all test cases.
let correct = 0;           // queries whose top-1 skill equals the expected one
let totalTop1Conf = 0;     // sum of top-1 confidence values
let totalGap12 = 0;        // sum of (top-1 confidence - top-2 confidence)
let totalDisambHits = 0;   // queries where at least one disambiguation rule fired
let totalColdStart = 0;    // queries where at least one skill got a cold-start boost
let totalRerankBoost = 0;  // sum of _rerankBoost over top-1 results carrying one
let rerankCount = 0;       // number of top-1 results that carried a _rerankBoost
const results = [];        // one row per test case, consumed by the report

testCases.forEach(({ query, expected }) => {
  const queryTokens = ra.tokenize(query);

  // Score every skill in the index, rounding to two decimals, best first.
  const ranked = index.skills.map((skill) => {
    const scoring = ra.scoreSkill(skill, queryTokens, bm25Params);
    return {
      name: skill.name,
      score: Math.round(scoring.totalScore * 100) / 100,
      matchedKeywords: scoring.matchedKeywords,
    };
  });
  ranked.sort((left, right) => right.score - left.score);

  // Disambiguation
  const disambiguation = ra.applyDisambiguation(ranked, query, index);
  const disamb = disambiguation.results;
  const firedRules = disambiguation.firedRules;

  // Cold start: only the list of boosted skills is read from the return
  // value; reranking below continues from the disambiguation output.
  const coldStart = ra.applyColdStartBoost(disamb, routeStats);
  const boostedSkills = coldStart.boostedSkills;

  // Reranking, followed by a re-sort on the (possibly adjusted) scores.
  // NOTE(review): 10 is presumably the K of rerankTopK — confirm in route-analyzer.
  const reranked = ra.rerankTopK(disamb, queryTokens, index, 10);
  reranked.sort((left, right) => right.score - left.score);

  // Normalization; only the five best candidates are kept.
  const topFive = ra.normalizeScores(reranked).slice(0, 5);
  const best = topFive[0];
  const runnerUp = topFive[1];

  // A missing candidate counts as confidence 0.
  const bestConfidence = best ? best.confidence : 0;
  const runnerUpConfidence = runnerUp ? runnerUp.confidence : 0;
  const gap = bestConfidence - runnerUpConfidence;
  const hit = Boolean(best) && best.name === expected;

  if (hit) {
    correct += 1;
  }
  totalTop1Conf += bestConfidence;
  totalGap12 += gap;
  if (firedRules.length > 0) {
    totalDisambHits += 1;
  }
  if (boostedSkills && boostedSkills.length > 0) {
    totalColdStart += 1;
  }

  // Rerank boost statistics: only top-1 results with a truthy boost count.
  if (best && best._rerankBoost) {
    totalRerankBoost += best._rerankBoost;
    rerankCount += 1;
  }

  results.push({
    status: hit ? '✓' : '✗',
    query: query.slice(0, 40), // truncated for the report column
    expected,
    actual: best ? best.name : 'none',
    conf: bestConfidence,
    gap,
    rules: firedRules,
  });
});
// Print the report: a boxed title, one line per test case (plus the fired
// disambiguation rules, if any), then a boxed summary of the aggregates.
const BOX_WIDTH = 59;
// Pads a summary line to the box width and appends the right-hand border.
const boxRow = (text) => `${text.padEnd(BOX_WIDTH)}║`;

console.log('╔══════════════════════════════════════════════════════════╗');
console.log('║ P1 路由质量基准测试报告 ║');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log('');

results.forEach((row) => {
  // Misrouted cases additionally show which skill actually ranked first.
  const wrongNote = row.status === '✗' ? ` ← WRONG (got: ${row.actual})` : '';
  console.log(` ${row.status} ${row.query.padEnd(42)}${row.expected}${wrongNote}`);
  if (row.rules.length > 0) {
    console.log(` rules: ${row.rules.join(', ')}`);
  }
});

const n = testCases.length;
// Share of n as a percentage with one decimal place.
const percent = (count) => (count / n * 100).toFixed(1);
// Per-test-case average with three decimal places.
const average = (total) => (total / n).toFixed(3);

console.log('');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log(boxRow(`║ 准确率: ${correct}/${n} (${percent(correct)}%)`));
console.log(boxRow(`║ 平均置信度: ${average(totalTop1Conf)}`));
console.log(boxRow(`║ 平均 gap1-2: ${average(totalGap12)}`));
console.log(boxRow(`║ 消歧命中: ${totalDisambHits}/${n} (${percent(totalDisambHits)}%)`));
console.log(boxRow(`║ 冷启动触发: ${totalColdStart}/${n} (${percent(totalColdStart)}%)`));
// The rerank line is omitted when no top-1 result carried a rerank boost.
if (rerankCount > 0) {
  console.log(boxRow(`║ 平均 rerank: ×${(totalRerankBoost / rerankCount).toFixed(3)}`));
}
console.log('╚══════════════════════════════════════════════════════════╝');
// Persist the aggregate metrics as one JSON line in debug/route-metrics.jsonl,
// then exit with 0 only when every test case routed to its expected skill.
const benchmarkEntry = {
  ts: new Date().toISOString(),
  type: 'p1-benchmark',
  accuracy: correct / n,
  avgConfidence: totalTop1Conf / n,
  avgGap12: totalGap12 / n,
  disambHitRate: totalDisambHits / n,
  coldStartRate: totalColdStart / n,
  // 0 when no top-1 result carried a rerank boost (avoids dividing by zero).
  avgRerankBoost: rerankCount > 0 ? totalRerankBoost / rerankCount : 0,
  testCount: n,
};
try {
  const debugDir = path.join(__dirname, '..', 'debug');
  // With { recursive: true } mkdirSync is a no-op for an existing directory,
  // so no separate existence check is needed.
  fs.mkdirSync(debugDir, { recursive: true });
  fs.appendFileSync(path.join(debugDir, 'route-metrics.jsonl'), JSON.stringify(benchmarkEntry) + '\n');
  console.log('\n基准数据已追加到 debug/route-metrics.jsonl');
} catch (err) {
  // Writing telemetry stays best-effort (it must not change the exit code),
  // but the failure is reported instead of being silently discarded.
  console.warn(`\n警告: 无法写入 debug/route-metrics.jsonl: ${err.message}`);
}
process.exit(correct === n ? 0 : 1);