185 lines
8.5 KiB
JavaScript
185 lines
8.5 KiB
JavaScript
|
|
#!/usr/bin/env node
|
|||
|
|
/**
|
|||
|
|
* P1 基准测试 — 对比 P0 基线 vs P1 (分段 IDF + reranking) 的路由质量
|
|||
|
|
*
|
|||
|
|
* 测试指标:
|
|||
|
|
* 1. 置信度分布 (top-1 平均置信度)
|
|||
|
|
* 2. Top-1/Top-2 gap (区分度)
|
|||
|
|
* 3. 消歧命中率
|
|||
|
|
* 4. 冷启动触发频率
|
|||
|
|
* 5. Core keyword 覆盖率 (P1-B 精排指标)
|
|||
|
|
*
|
|||
|
|
* 用法: node scripts/p1-benchmark.js
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
const fs = require('fs');
|
|||
|
|
const path = require('path');
|
|||
|
|
const ra = require('./route-analyzer.js');
|
|||
|
|
const rt = require('./route-telemetry.js');
|
|||
|
|
|
|||
|
|
const INDEX_FILE = path.join(__dirname, '..', 'skills-index.json');

/**
 * Load the skills index that every benchmark case is scored against.
 *
 * Any problem (file missing, unreadable, invalid JSON, or no `skills` array)
 * prints a short message to stderr and terminates the process with exit
 * code 1, so the script never fails later with an opaque TypeError on
 * `index.skills`.
 *
 * @param {string} file - Absolute path of the index JSON file.
 * @returns {{ skills: Array<object> }} The parsed index.
 */
function loadSkillsIndex(file) {
  if (!fs.existsSync(file)) {
    console.error('缺少 skills-index.json');
    process.exit(1);
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(file, 'utf8'));
  } catch (err) {
    // Covers both read failures and malformed JSON.
    console.error(`无法读取或解析 skills-index.json: ${err.message}`);
    process.exit(1);
  }

  // The scoring loop iterates `index.skills`, so reject anything else up front.
  if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.skills)) {
    console.error('skills-index.json 格式无效: 缺少 skills 数组');
    process.exit(1);
  }

  return parsed;
}

const index = loadSkillsIndex(INDEX_FILE);
|
|||
|
|
|
|||
|
|
// Benchmark fixtures: one query per skill domain plus a few ambiguous
// queries. Each tuple is [query, name of the skill expected at rank 1] and is
// expanded into a { query, expected } object below.
const testCases = [
  // domain: development
  ['React component performance optimization', 'frontend-expert'],
  ['debug memory leak in Node.js API', 'debugger-expert'],
  ['write a Python web scraper', 'browser-automation-expert'],
  ['build REST API with Express', 'backend-builder'],

  // domain: architecture
  ['optimize slow SQL queries with indexes', 'database-tuning-expert'],
  ['deploy microservices to Kubernetes', 'cloud-native-expert'],
  ['system design for high availability', 'architect-expert'],

  // domain: devops
  ['setup CI/CD pipeline with Docker', 'devops-expert'],
  ['fix Git merge conflict on feature branch', 'git-operation-master'],

  // domain: quality
  ['write unit tests for React components', 'tester-expert'],
  ['code review this pull request', 'reviewer-expert'],

  // domain: ai-data
  ['fine-tune LLM with LoRA', 'ai-ml-expert'],
  ['build ETL pipeline with Spark', 'data-engineer-expert'],

  // domain: business
  ['撰写商业计划书融资 BP', 'business-plan-skill'],
  ['SaaS pricing model freemium', 'pricing-strategist'],

  // domain: content
  ['write API documentation README', 'tech-writer-expert'],
  ['SEO optimize sitemap robots.txt', 'technical-seo-expert'],

  // domain: imported-development (10 new skills)
  ['TypeScript generic conditional types utility', 'typescript-pro'],
  ['Python async await asyncio coroutine patterns', 'python-pro'],
  ['Go goroutine channel concurrency patterns', 'golang-pro'],
  ['Rust ownership borrow checker lifetime annotations', 'rust-engineer'],
  ['Angular standalone components Signals RxJS', 'angular-architect'],
  ['Vue 3 Composition API Pinia state management', 'vue-expert'],
  ['Next.js App Router Server Components RSC', 'nextjs-developer'],
  ['Flutter Widget Riverpod state management Dart', 'flutter-expert'],
  ['Swift SwiftUI async await Core Data iOS', 'swift-expert'],
  ['WebSocket Socket.IO real-time heartbeat reconnect', 'websocket-engineer'],

  // domain: imported-architecture (3 new skills)
  ['design REST API OpenAPI 3.1 specification versioning', 'api-designer'],
  ['GraphQL schema resolver subscription Apollo DataLoader', 'graphql-architect'],
  ['AWS Lambda serverless cloud architecture FinOps', 'cloud-architect'],

  // domain: imported-devops (2 new skills)
  ['Kubernetes Helm chart RBAC NetworkPolicy Pod', 'kubernetes-specialist'],
  ['Terraform HCL infrastructure as code modules state', 'terraform-engineer'],

  // Ambiguous queries (the disambiguation rules should fire correctly)
  ['fix React bug component not rendering', 'debugger-expert'],
  ['review code for security vulnerabilities', 'reviewer-expert'],
  ['K8s deployment scaling strategy', 'cloud-native-expert'],
].map(([query, expected]) => ({ query, expected }));
|
|||
|
|
|
|||
|
|
// Run the benchmark: push every test case through the routing stages
// (score -> disambiguate -> cold-start check -> rerank -> normalize) and
// accumulate the aggregate metrics reported afterwards.
const bm25Params = ra.buildBM25Params(index);
// NOTE(review): 30 is presumably a look-back window for route stats — confirm
// against route-telemetry.js.
const routeStats = rt.getSkillRouteStats(30);

// Running totals over all test cases.
let correct = 0;          // cases whose top-1 skill equals the expected skill
let totalTop1Conf = 0;    // sum of top-1 confidence values
let totalGap12 = 0;       // sum of (top-1 confidence - top-2 confidence)
let totalDisambHits = 0;  // cases where at least one disambiguation rule fired
let totalColdStart = 0;   // cases where applyColdStartBoost reported boosted skills
let totalRerankBoost = 0; // sum of truthy top-1 `_rerankBoost` values
let rerankCount = 0;      // number of cases contributing to totalRerankBoost

// One row per test case, consumed by the report section.
const results = [];

testCases.forEach(({ query, expected }) => {
  const tokens = ra.tokenize(query);

  // Base scoring: score every skill, round to two decimals, best first.
  const ranked = index.skills.map((skill) => {
    const scoreInfo = ra.scoreSkill(skill, tokens, bm25Params);
    return {
      name: skill.name,
      score: Math.round(scoreInfo.totalScore * 100) / 100,
      matchedKeywords: scoreInfo.matchedKeywords,
    };
  });
  ranked.sort((left, right) => right.score - left.score);

  // Disambiguation.
  const disambiguation = ra.applyDisambiguation(ranked, query, index);
  const disamb = disambiguation.results;
  const firedRules = disambiguation.firedRules;

  // Cold start.
  // NOTE(review): only `boostedSkills` is read here and the reranker below is
  // fed `disamb`; if applyColdStartBoost returns boosted results instead of
  // mutating its input, the boost is not reflected in the ranking — confirm.
  const coldStart = ra.applyColdStartBoost(disamb, routeStats);
  const boostedSkills = coldStart.boostedSkills;

  // Reranking over the top 10, then re-sort by the resulting scores.
  const reranked = ra.rerankTopK(disamb, tokens, index, 10);
  reranked.sort((left, right) => right.score - left.score);

  // Normalization; only the five best candidates are kept.
  const [top1, top2] = ra.normalizeScores(reranked).slice(0, 5);

  const isCorrect = top1 && top1.name === expected;
  const conf = top1 ? top1.confidence : 0;
  const runnerUpConf = top2 ? top2.confidence : 0;
  const gap = conf - runnerUpConf;

  if (isCorrect) correct += 1;
  totalTop1Conf += conf;
  totalGap12 += gap;
  if (firedRules.length > 0) totalDisambHits += 1;
  if (boostedSkills && boostedSkills.length > 0) totalColdStart += 1;

  // Rerank boost statistics: only cases with a truthy boost on the winner count.
  if (top1 && top1._rerankBoost) {
    totalRerankBoost += top1._rerankBoost;
    rerankCount += 1;
  }

  results.push({
    status: isCorrect ? '✓' : '✗',
    query: query.slice(0, 40),
    expected,
    actual: top1 ? top1.name : 'none',
    conf,
    gap,
    rules: firedRules,
  });
});
|
|||
|
|
|
|||
|
|
// Print the report: a boxed header, one line per test case (plus the
// disambiguation rules that fired for it), then the aggregate metrics.
const BOX_RULE = '══════════════════════════════════════════════════════════';

console.log(`╔${BOX_RULE}╗`);
console.log('║ P1 路由质量基准测试报告 ║');
console.log(`╠${BOX_RULE}╣`);
console.log('');

results.forEach((row) => {
  // Failed cases additionally show which skill actually ranked first.
  const marker = row.status !== '✗' ? '' : ` ← WRONG (got: ${row.actual})`;
  console.log(` ${row.status} ${row.query.padEnd(42)} → ${row.expected}${marker}`);
  if (row.rules.length > 0) {
    console.log(` rules: ${row.rules.join(', ')}`);
  }
});

const n = testCases.length;

/** Pad a summary line to the box width and close it with the right border. */
const boxLine = (text) => text.padEnd(59) + '║';
/** Format `count` as "count/n (percent%)" with one decimal place. */
const share = (count) => `${count}/${n} (${(count / n * 100).toFixed(1)}%)`;
/** Format the per-case average of `total` with three decimal places. */
const average = (total) => (total / n).toFixed(3);

const summaryLines = [
  `║ 准确率: ${share(correct)}`,
  `║ 平均置信度: ${average(totalTop1Conf)}`,
  `║ 平均 gap1-2: ${average(totalGap12)}`,
  `║ 消歧命中: ${share(totalDisambHits)}`,
  `║ 冷启动触发: ${share(totalColdStart)}`,
];
// The rerank average is only shown when at least one case carried a boost.
if (rerankCount > 0) {
  summaryLines.push(`║ 平均 rerank: ×${(totalRerankBoost / rerankCount).toFixed(3)}`);
}

console.log('');
console.log(`╠${BOX_RULE}╣`);
for (const line of summaryLines) {
  console.log(boxLine(line));
}
console.log(`╚${BOX_RULE}╝`);
|
|||
|
|
|
|||
|
|
// Write telemetry: append one JSON line summarising this run to
// debug/route-metrics.jsonl, then exit with a status reflecting accuracy.
const benchmarkEntry = {
  ts: new Date().toISOString(),
  type: 'p1-benchmark',
  accuracy: correct / n,
  avgConfidence: totalTop1Conf / n,
  avgGap12: totalGap12 / n,
  disambHitRate: totalDisambHits / n,
  coldStartRate: totalColdStart / n,
  // Averaged only over the cases that carried a rerank boost; 0 when none did.
  avgRerankBoost: rerankCount > 0 ? totalRerankBoost / rerankCount : 0,
  testCount: n,
};

try {
  const debugDir = path.join(__dirname, '..', 'debug');
  // `recursive: true` makes this a no-op when the directory already exists,
  // so no separate existence check is needed.
  fs.mkdirSync(debugDir, { recursive: true });
  fs.appendFileSync(path.join(debugDir, 'route-metrics.jsonl'), JSON.stringify(benchmarkEntry) + '\n');
  console.log('\n基准数据已追加到 debug/route-metrics.jsonl');
} catch (err) {
  // Telemetry is best-effort: a failed write must not change the benchmark's
  // exit status, but it should not go unnoticed either.
  console.warn(`\n无法写入 debug/route-metrics.jsonl: ${err.message}`);
}

// Exit 0 only when every test case routed to its expected skill.
process.exit(correct === n ? 0 : 1);
|