#!/usr/bin/env node
/**
 * P1 benchmark — routing quality of the P0 baseline vs. P1
 * (segmented IDF + reranking).
 *
 * Metrics:
 *   1. Confidence distribution (mean top-1 confidence)
 *   2. Top-1 / Top-2 gap (how well the winner is separated)
 *   3. Disambiguation hit rate
 *   4. Cold-start trigger frequency
 *   5. Core keyword coverage (P1-B fine-ranking metric)
 *      NOTE(review): this script reports the average `_rerankBoost` of the
 *      top-1 result; no separate coverage figure is computed here.
 *
 * Side effects: prints a report to stdout, appends one JSON line to
 * debug/route-metrics.jsonl (best effort), and exits with status 0 only when
 * every test case routes to its expected skill (1 otherwise).
 *
 * Usage: node scripts/p1-benchmark.js
 */

const fs = require('fs');
const path = require('path');
const ra = require('./route-analyzer.js');
const rt = require('./route-telemetry.js');

const INDEX_FILE = path.join(__dirname, '..', 'skills-index.json');

if (!fs.existsSync(INDEX_FILE)) {
  console.error('缺少 skills-index.json');
  process.exit(1);
}

const index = JSON.parse(fs.readFileSync(INDEX_FILE, 'utf8'));

// Fail with a clear message instead of a TypeError deep inside the scoring loop.
if (!index || !Array.isArray(index.skills)) {
  console.error('skills-index.json 格式无效: 缺少 skills 数组');
  process.exit(1);
}

// Test cases: cover the different domains plus ambiguous queries.
const testCases = [
  // domain: development
  { query: 'React component performance optimization', expected: 'frontend-expert' },
  { query: 'debug memory leak in Node.js API', expected: 'debugger-expert' },
  { query: 'write a Python web scraper', expected: 'browser-automation-expert' },
  { query: 'build REST API with Express', expected: 'backend-builder' },
  // domain: architecture
  { query: 'optimize slow SQL queries with indexes', expected: 'database-tuning-expert' },
  { query: 'deploy microservices to Kubernetes', expected: 'cloud-native-expert' },
  { query: 'system design for high availability', expected: 'architect-expert' },
  // domain: devops
  { query: 'setup CI/CD pipeline with Docker', expected: 'devops-expert' },
  { query: 'fix Git merge conflict on feature branch', expected: 'git-operation-master' },
  // domain: quality
  { query: 'write unit tests for React components', expected: 'tester-expert' },
  { query: 'code review this pull request', expected: 'reviewer-expert' },
  // domain: ai-data
  { query: 'fine-tune LLM with LoRA', expected: 'ai-ml-expert' },
  { query: 'build ETL pipeline with Spark', expected: 'data-engineer-expert' },
  // domain: business
  { query: '撰写商业计划书融资 BP', expected: 'business-plan-skill' },
  { query: 'SaaS pricing model freemium', expected: 'pricing-strategist' },
  // domain: content
  { query: 'write API documentation README', expected: 'tech-writer-expert' },
  { query: 'SEO optimize sitemap robots.txt', expected: 'technical-seo-expert' },
  // domain: imported-development (10 new skills)
  { query: 'TypeScript generic conditional types utility', expected: 'typescript-pro' },
  { query: 'Python async await asyncio coroutine patterns', expected: 'python-pro' },
  { query: 'Go goroutine channel concurrency patterns', expected: 'golang-pro' },
  { query: 'Rust ownership borrow checker lifetime annotations', expected: 'rust-engineer' },
  { query: 'Angular standalone components Signals RxJS', expected: 'angular-architect' },
  { query: 'Vue 3 Composition API Pinia state management', expected: 'vue-expert' },
  { query: 'Next.js App Router Server Components RSC', expected: 'nextjs-developer' },
  { query: 'Flutter Widget Riverpod state management Dart', expected: 'flutter-expert' },
  { query: 'Swift SwiftUI async await Core Data iOS', expected: 'swift-expert' },
  { query: 'WebSocket Socket.IO real-time heartbeat reconnect', expected: 'websocket-engineer' },
  // domain: imported-architecture (3 new skills)
  { query: 'design REST API OpenAPI 3.1 specification versioning', expected: 'api-designer' },
  { query: 'GraphQL schema resolver subscription Apollo DataLoader', expected: 'graphql-architect' },
  { query: 'AWS Lambda serverless cloud architecture FinOps', expected: 'cloud-architect' },
  // domain: imported-devops (2 new skills)
  { query: 'Kubernetes Helm chart RBAC NetworkPolicy Pod', expected: 'kubernetes-specialist' },
  { query: 'Terraform HCL infrastructure as code modules state', expected: 'terraform-engineer' },
  // Ambiguous queries (the disambiguation rules should fire correctly)
  { query: 'fix React bug component not rendering', expected: 'debugger-expert' },
  { query: 'review code for security vulnerabilities', expected: 'reviewer-expert' },
  { query: 'K8s deployment scaling strategy', expected: 'cloud-native-expert' },
];

// ---- Run the benchmark ----------------------------------------------------

const bm25Params = ra.buildBM25Params(index);
const routeStats = rt.getSkillRouteStats(30);

let correct = 0;
let totalTop1Conf = 0;
let totalGap12 = 0;
let totalDisambHits = 0;
let totalColdStart = 0;
let totalRerankBoost = 0;
let rerankCount = 0;
const results = [];

for (const tc of testCases) {
  const tokens = ra.tokenize(tc.query);

  // Score every skill (rounded to two decimals) and rank best-first.
  const scored = index.skills
    .map((skill) => {
      const { totalScore, matchedKeywords } = ra.scoreSkill(skill, tokens, bm25Params);
      return {
        name: skill.name,
        score: Math.round(totalScore * 100) / 100,
        matchedKeywords,
      };
    })
    .sort((a, b) => b.score - a.score);

  // Disambiguation.
  const { results: disamb, firedRules } = ra.applyDisambiguation(scored, tc.query, index);

  // Cold start: only the list of boosted skills is consumed, for the
  // trigger-frequency metric.
  // NOTE(review): the reranking step below is fed `disamb`, not a result
  // returned by applyColdStartBoost — confirm against route-analyzer.js whether
  // the boost is applied in place or is intentionally excluded from this ranking.
  const { boostedSkills } = ra.applyColdStartBoost(disamb, routeStats);

  // Rerank the top 10, then restore best-first order.
  const reranked = ra.rerankTopK(disamb, tokens, index, 10);
  reranked.sort((a, b) => b.score - a.score);

  // Normalize and keep the five best candidates.
  const normalized = ra.normalizeScores(reranked).slice(0, 5);
  const top1 = normalized[0];
  const top2 = normalized[1];

  const isCorrect = Boolean(top1) && top1.name === tc.expected;
  const conf = top1 ? top1.confidence : 0;
  const gap = conf - (top2 ? top2.confidence : 0);

  if (isCorrect) correct++;
  totalTop1Conf += conf;
  totalGap12 += gap;
  if (firedRules.length > 0) totalDisambHits++;
  if (boostedSkills && boostedSkills.length > 0) totalColdStart++;

  // Rerank-boost statistics: only winners that actually carry a boost count.
  if (top1 && top1._rerankBoost) {
    totalRerankBoost += top1._rerankBoost;
    rerankCount++;
  }

  results.push({
    status: isCorrect ? '✓' : '✗',
    query: tc.query.slice(0, 40),
    expected: tc.expected,
    actual: top1 ? top1.name : 'none',
    conf,
    gap,
    rules: firedRules,
  });
}

// ---- Print the report -----------------------------------------------------

const n = testCases.length;

/** Pads a summary row to the box width and closes it with the right border. */
const boxLine = (text) => `${text.padEnd(59)}║`;

/** Formats a hit count as "count/n (pp.p%)". */
const rate = (count) => `${count}/${n} (${((count / n) * 100).toFixed(1)}%)`;

console.log('╔══════════════════════════════════════════════════════════╗');
console.log('║ P1 路由质量基准测试报告 ║');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log('');

for (const r of results) {
  const marker = r.status === '✗' ? ` ← WRONG (got: ${r.actual})` : '';
  console.log(` ${r.status} ${r.query.padEnd(42)} → ${r.expected}${marker}`);
  if (r.rules.length > 0) console.log(` rules: ${r.rules.join(', ')}`);
}

console.log('');
console.log('╠══════════════════════════════════════════════════════════╣');
console.log(boxLine(`║ 准确率: ${rate(correct)}`));
console.log(boxLine(`║ 平均置信度: ${(totalTop1Conf / n).toFixed(3)}`));
console.log(boxLine(`║ 平均 gap1-2: ${(totalGap12 / n).toFixed(3)}`));
console.log(boxLine(`║ 消歧命中: ${rate(totalDisambHits)}`));
console.log(boxLine(`║ 冷启动触发: ${rate(totalColdStart)}`));
if (rerankCount > 0) {
  console.log(boxLine(`║ 平均 rerank: ×${(totalRerankBoost / rerankCount).toFixed(3)}`));
}
console.log('╚══════════════════════════════════════════════════════════╝');

// ---- Persist telemetry ----------------------------------------------------

const benchmarkEntry = {
  ts: new Date().toISOString(),
  type: 'p1-benchmark',
  accuracy: correct / n,
  avgConfidence: totalTop1Conf / n,
  avgGap12: totalGap12 / n,
  disambHitRate: totalDisambHits / n,
  coldStartRate: totalColdStart / n,
  avgRerankBoost: rerankCount > 0 ? totalRerankBoost / rerankCount : 0,
  testCount: n,
};

try {
  const debugDir = path.join(__dirname, '..', 'debug');
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(debugDir, { recursive: true });
  fs.appendFileSync(
    path.join(debugDir, 'route-metrics.jsonl'),
    `${JSON.stringify(benchmarkEntry)}\n`,
  );
  console.log('\n基准数据已追加到 debug/route-metrics.jsonl');
} catch (err) {
  // Telemetry is best effort: it must not change the exit status, but a
  // failed write should be visible rather than silently dropped.
  console.warn(`警告: 无法写入 debug/route-metrics.jsonl: ${err.message}`);
}

process.exit(correct === n ? 0 : 1);