bookworm-smart-assistant/scripts/archive/p1-benchmark.js

185 lines
8.5 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
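/**
 * P1 benchmark: compares routing quality of the P0 baseline vs. P1
 * (segmented IDF + reranking).
 *
 * Metrics:
 *   1. Confidence distribution (average top-1 confidence)
 *   2. Top-1/Top-2 gap (how well the best match is separated from the runner-up)
 *   3. Disambiguation hit rate
 *   4. Cold-start trigger frequency
 *   5. Core keyword coverage (P1-B fine-ranking metric)
 *
 * Usage: node scripts/p1-benchmark.js
 */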
const fs = require('fs');
const path = require('path');
const ra = require('./route-analyzer.js');
const rt = require('./route-telemetry.js');
// Location of the skills index, one directory above this script.
const INDEX_FILE = path.join(__dirname, '..', 'skills-index.json');

/**
 * Read and validate the skills index used by the benchmark.
 *
 * Every failure mode (file missing, unreadable, invalid JSON, or no `skills`
 * array) prints a short message to stderr and terminates the process with
 * exit code 1, instead of surfacing later as an opaque stack trace.
 *
 * @param {string} indexFile - Absolute path of the JSON index file.
 * @returns {{ skills: Array<object> }} The parsed index.
 */
function loadSkillsIndex(indexFile) {
  if (!fs.existsSync(indexFile)) {
    console.error('缺少 skills-index.json');
    process.exit(1);
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(indexFile, 'utf8'));
  } catch (err) {
    console.error(`skills-index.json 读取或解析失败: ${err.message}`);
    process.exit(1);
  }

  // The benchmark iterates `index.skills`, so reject any other shape up front.
  if (!parsed || !Array.isArray(parsed.skills)) {
    console.error('skills-index.json 缺少 skills 数组');
    process.exit(1);
  }
  return parsed;
}

const index = loadSkillsIndex(INDEX_FILE);
// Test cases: cover the different skill domains plus a few ambiguous queries.
// Each row is [query, expected skill name]; rows are expanded into
// `{ query, expected }` objects below.
const testCases = [
  // domain: development
  ['React component performance optimization', 'frontend-expert'],
  ['debug memory leak in Node.js API', 'debugger-expert'],
  ['write a Python web scraper', 'browser-automation-expert'],
  ['build REST API with Express', 'backend-builder'],

  // domain: architecture
  ['optimize slow SQL queries with indexes', 'database-tuning-expert'],
  ['deploy microservices to Kubernetes', 'cloud-native-expert'],
  ['system design for high availability', 'architect-expert'],

  // domain: devops
  ['setup CI/CD pipeline with Docker', 'devops-expert'],
  ['fix Git merge conflict on feature branch', 'git-operation-master'],

  // domain: quality
  ['write unit tests for React components', 'tester-expert'],
  ['code review this pull request', 'reviewer-expert'],

  // domain: ai-data
  ['fine-tune LLM with LoRA', 'ai-ml-expert'],
  ['build ETL pipeline with Spark', 'data-engineer-expert'],

  // domain: business
  ['撰写商业计划书融资 BP', 'business-plan-skill'],
  ['SaaS pricing model freemium', 'pricing-strategist'],

  // domain: content
  ['write API documentation README', 'tech-writer-expert'],
  ['SEO optimize sitemap robots.txt', 'technical-seo-expert'],

  // domain: imported-development (10 new skills)
  ['TypeScript generic conditional types utility', 'typescript-pro'],
  ['Python async await asyncio coroutine patterns', 'python-pro'],
  ['Go goroutine channel concurrency patterns', 'golang-pro'],
  ['Rust ownership borrow checker lifetime annotations', 'rust-engineer'],
  ['Angular standalone components Signals RxJS', 'angular-architect'],
  ['Vue 3 Composition API Pinia state management', 'vue-expert'],
  ['Next.js App Router Server Components RSC', 'nextjs-developer'],
  ['Flutter Widget Riverpod state management Dart', 'flutter-expert'],
  ['Swift SwiftUI async await Core Data iOS', 'swift-expert'],
  ['WebSocket Socket.IO real-time heartbeat reconnect', 'websocket-engineer'],

  // domain: imported-architecture (3 new skills)
  ['design REST API OpenAPI 3.1 specification versioning', 'api-designer'],
  ['GraphQL schema resolver subscription Apollo DataLoader', 'graphql-architect'],
  ['AWS Lambda serverless cloud architecture FinOps', 'cloud-architect'],

  // domain: imported-devops (2 new skills)
  ['Kubernetes Helm chart RBAC NetworkPolicy Pod', 'kubernetes-specialist'],
  ['Terraform HCL infrastructure as code modules state', 'terraform-engineer'],

  // Ambiguous queries (the disambiguation rules are expected to fire here)
  ['fix React bug component not rendering', 'debugger-expert'],
  ['review code for security vulnerabilities', 'reviewer-expert'],
  ['K8s deployment scaling strategy', 'cloud-native-expert'],
].map(([query, expected]) => ({ query, expected }));
// Run the benchmark: every test query goes through scoring, disambiguation,
// cold-start boosting, reranking and normalisation, in that order.
const bm25Params = ra.buildBM25Params(index);
const routeStats = rt.getSkillRouteStats(30);

// Running totals; the report and telemetry sections read these afterwards.
let correct = 0;
let totalTop1Conf = 0;
let totalGap12 = 0;
let totalDisambHits = 0;
let totalColdStart = 0;
let totalRerankBoost = 0;
let rerankCount = 0;
const results = [];

// Confidence of a ranked entry, or 0 when the entry is absent.
const confidenceOf = (entry) => (entry ? entry.confidence : 0);

// Order candidates by descending score.
const byScoreDesc = (left, right) => right.score - left.score;

testCases.forEach((testCase) => {
  const queryTokens = ra.tokenize(testCase.query);

  // Scoring: one entry per skill, score rounded to two decimals, best first.
  const ranked = index.skills
    .map((skill) => {
      const outcome = ra.scoreSkill(skill, queryTokens, bm25Params);
      return {
        name: skill.name,
        score: Math.round(outcome.totalScore * 100) / 100,
        matchedKeywords: outcome.matchedKeywords,
      };
    })
    .sort(byScoreDesc);

  // Disambiguation
  const disambiguation = ra.applyDisambiguation(ranked, testCase.query, index);
  const candidates = disambiguation.results;
  const firedRules = disambiguation.firedRules;

  // Cold start: only `boostedSkills` is read from the return value.
  // NOTE(review): reranking below is fed `candidates`, not any list returned
  // here — confirm applyColdStartBoost is meant to act in place (or not at all).
  const boostedSkills = ra.applyColdStartBoost(candidates, routeStats).boostedSkills;

  // Reranking, followed by a fresh sort on the resulting scores.
  const reranked = ra.rerankTopK(candidates, queryTokens, index, 10);
  reranked.sort(byScoreDesc);

  // Normalisation: keep the five leading entries and inspect the first two.
  const shortlist = ra.normalizeScores(reranked).slice(0, 5);
  const top1 = shortlist[0];
  const top2 = shortlist[1];

  const conf = confidenceOf(top1);
  const gap = conf - confidenceOf(top2);
  const isCorrect = Boolean(top1) && top1.name === testCase.expected;

  if (isCorrect) {
    correct += 1;
  }
  totalTop1Conf += conf;
  totalGap12 += gap;
  if (firedRules.length > 0) {
    totalDisambHits += 1;
  }
  if (boostedSkills && boostedSkills.length > 0) {
    totalColdStart += 1;
  }

  // Rerank boost statistics: only counted when the winner carries a truthy boost.
  if (top1 && top1._rerankBoost) {
    totalRerankBoost += top1._rerankBoost;
    rerankCount += 1;
  }

  results.push({
    status: isCorrect ? '✓' : '✗',
    query: testCase.query.slice(0, 40), // truncated for the report column
    expected: testCase.expected,
    actual: top1 ? top1.name : 'none',
    conf,
    gap,
    rules: firedRules,
  });
});
// Print the report: a banner, one line per test case, then a boxed summary.
[
  '╔══════════════════════════════════════════════════════════╗',
  '║ P1 路由质量基准测试报告 ║',
  '╠══════════════════════════════════════════════════════════╣',
  '',
].forEach((bannerLine) => console.log(bannerLine));

results.forEach((entry) => {
  let line = ` ${entry.status} ${entry.query.padEnd(42)}${entry.expected}`;
  // Failed cases also show which skill was actually ranked first.
  if (entry.status === '✗') {
    line += ` ← WRONG (got: ${entry.actual})`;
  }
  console.log(line);
  if (entry.rules.length > 0) {
    console.log(` rules: ${entry.rules.join(', ')}`);
  }
});

const n = testCases.length;

// Share of `count` among all test cases, as a percentage with one decimal.
const asPercent = (count) => (count / n * 100).toFixed(1);

// Pad a summary row to 59 characters and close it with the box border.
const boxRow = (text) => text.padEnd(59) + '║';

const summaryRows = [
  `║ 准确率: ${correct}/${n} (${asPercent(correct)}%)`,
  `║ 平均置信度: ${(totalTop1Conf / n).toFixed(3)}`,
  `║ 平均 gap1-2: ${(totalGap12 / n).toFixed(3)}`,
  `║ 消歧命中: ${totalDisambHits}/${n} (${asPercent(totalDisambHits)}%)`,
  `║ 冷启动触发: ${totalColdStart}/${n} (${asPercent(totalColdStart)}%)`,
];
// The rerank row only appears when at least one winner carried a rerank boost.
if (rerankCount > 0) {
  summaryRows.push(`║ 平均 rerank: ×${(totalRerankBoost / rerankCount).toFixed(3)}`);
}

console.log('');
console.log('╠══════════════════════════════════════════════════════════╣');
summaryRows.forEach((row) => console.log(boxRow(row)));
console.log('╚══════════════════════════════════════════════════════════╝');
// Write telemetry: append one summary record per run to debug/route-metrics.jsonl.
const benchmarkEntry = {
  ts: new Date().toISOString(),
  type: 'p1-benchmark',
  accuracy: correct / n,
  avgConfidence: totalTop1Conf / n,
  avgGap12: totalGap12 / n,
  disambHitRate: totalDisambHits / n,
  coldStartRate: totalColdStart / n,
  // Averaged only over the cases whose winner carried a rerank boost.
  avgRerankBoost: rerankCount > 0 ? totalRerankBoost / rerankCount : 0,
  testCount: n,
};

try {
  const debugDir = path.join(__dirname, '..', 'debug');
  // With `recursive: true`, mkdirSync does not throw when the directory already
  // exists, so no separate existence check is needed.
  fs.mkdirSync(debugDir, { recursive: true });
  fs.appendFileSync(
    path.join(debugDir, 'route-metrics.jsonl'),
    JSON.stringify(benchmarkEntry) + '\n',
  );
  console.log('\n基准数据已追加到 debug/route-metrics.jsonl');
} catch (err) {
  // Telemetry stays best effort: a failed write must not change the benchmark
  // outcome, but it is reported instead of being silently discarded.
  console.warn(`\n基准数据写入失败: ${err.message}`);
}

// Exit code 0 only when every test case was routed to its expected skill.
process.exit(correct === n ? 0 : 1);