#!/usr/bin/env node
/**
 * IR evaluation module (P1-FIX)
 *
 * Computes standard IR metrics for the routing system:
 * - MRR (Mean Reciprocal Rank): mean reciprocal rank of the correct skill
 * - P@1 (Precision at 1): fraction of queries whose correct skill is ranked first
 *
 * Data source: scripts/golden-set.json when it has entries, otherwise the
 * correction records in debug/route-feedback.jsonl.
 *
 * Usage:
 *   node scripts/ir-eval.js                # evaluate the current system
 *   node scripts/ir-eval.js --json         # JSON output
 *   node scripts/ir-eval.js --reject 0.5   # non-zero exit code when MRR is below 0.5
 */

const fs = require('fs');
const path = require('path');

const CLAUDE_ROOT = require('./paths.config.js').PATHS.root;
const FEEDBACK_FILE = path.join(CLAUDE_ROOT, 'debug', 'route-feedback.jsonl');

/**
 * Reads the routing feedback log and returns the records that describe a
 * real correction (a `correctedTo` value that differs from `routedTo`).
 * Lines that are not valid JSON are ignored.
 *
 * @returns {object[]} Correction records; empty when the log does not exist.
 */
function loadCorrections() {
  if (!fs.existsSync(FEEDBACK_FILE)) return [];

  return fs
    .readFileSync(FEEDBACK_FILE, 'utf8')
    .trim()
    .split('\n')
    .map((line) => {
      try {
        return JSON.parse(line);
      } catch {
        // A malformed line must not invalidate the rest of the log.
        return null;
      }
    })
    .filter((e) => e && e.correctedTo && e.routedTo !== e.correctedTo);
}

/**
 * Reads scripts/golden-set.json and maps its `entries` to the same shape the
 * evaluator uses for corrections (`query`, `correctedTo`, `source`).
 *
 * @returns {object[]} Golden entries; empty when the file is missing or unreadable.
 */
function loadGoldenSet() {
  const gsFile = path.join(CLAUDE_ROOT, 'scripts', 'golden-set.json');
  if (!fs.existsSync(gsFile)) return [];

  try {
    const gs = JSON.parse(fs.readFileSync(gsFile, 'utf8'));
    return (gs.entries || []).map((e) => ({
      query: e.query,
      correctedTo: e.expectedSkill,
      source: e.source || 'golden',
    }));
  } catch {
    return [];
  }
}

/**
 * Reads and parses skills-index-lite.json from the project root.
 *
 * @returns {object|null} The parsed index, or null when the file is missing
 *   or does not contain valid JSON.
 */
function loadSkillIndex() {
  const idxFile = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
  if (!fs.existsSync(idxFile)) return null;

  try {
    return JSON.parse(fs.readFileSync(idxFile, 'utf8'));
  } catch {
    return null;
  }
}

/**
 * Scores every skill in the index against each evaluation query and reports
 * MRR and P@1 for the expected skill.
 *
 * The golden set is used when it has entries; otherwise the feedback
 * corrections are used. When nothing can be evaluated the metrics are
 * reported as 1.0 with `samples: 0` and an explanatory `note`.
 *
 * @returns {object} Either `{ error }` when route-analyzer cannot be loaded,
 *   or `{ mrr, p1, samples, ... }`. `scoringErrors` counts the entries whose
 *   scoring threw and were therefore left out of the metrics.
 */
function evaluate() {
  const corrections = loadCorrections();
  const goldenEntries = loadGoldenSet();

  // The golden set takes precedence as the evaluation data source.
  const allEntries = goldenEntries.length > 0 ? goldenEntries : corrections;

  if (allEntries.length === 0) {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No data' };
  }

  // Load the routing engine for offline evaluation.
  let routeAnalyzer;
  try {
    routeAnalyzer = require('./route-analyzer.js');
  } catch {
    return { error: 'Cannot load route-analyzer' };
  }

  // The index and its BM25 parameters do not depend on the query, so they are
  // prepared once instead of being re-read from disk for every entry.
  const index = loadSkillIndex();
  if (index === null) {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
  }

  let bm25Params;
  try {
    bm25Params = routeAnalyzer.buildBM25Params
      ? routeAnalyzer.buildBM25Params(index)
      : null;
  } catch {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
  }

  const skills = index.skills || [];

  let rrSum = 0; // sum of reciprocal ranks
  let p1Hits = 0; // number of P@1 hits
  let evaluated = 0;
  let scoringErrors = 0;

  for (const entry of allEntries) {
    const query = entry.query || entry.prompt;
    if (!query) continue;

    const correctSkill = entry.correctedTo;

    try {
      const tokens = routeAnalyzer.tokenize
        ? routeAnalyzer.tokenize(query)
        : new Set(query.toLowerCase().split(/\s+/));

      const scores = [];
      for (const skill of skills) {
        const result = routeAnalyzer.scoreSkill
          ? routeAnalyzer.scoreSkill(skill, tokens, bm25Params)
          : { totalScore: 0 };
        scores.push({ name: skill.name, score: result.totalScore });
      }
      scores.sort((a, b) => b.score - a.score);

      // 1-based rank of the expected skill; 0 means it is not in the index,
      // which counts as a miss (contributes 0 to both metrics).
      const rank = scores.findIndex((s) => s.name === correctSkill) + 1;
      if (rank > 0) {
        rrSum += 1.0 / rank;
        if (rank === 1) p1Hits++;
      }
      evaluated++;
    } catch {
      // Leave the entry out of the metrics, but keep the failure visible.
      scoringErrors++;
    }
  }

  if (evaluated === 0) {
    return {
      mrr: 1.0,
      p1: 1.0,
      samples: 0,
      note: 'No evaluable corrections',
      scoringErrors,
    };
  }

  return {
    mrr: Math.round((rrSum / evaluated) * 1000) / 1000,
    p1: Math.round((p1Hits / evaluated) * 1000) / 1000,
    samples: evaluated,
    totalCorrections: allEntries.length,
    goldenSetSize: goldenEntries.length,
    scoringErrors,
  };
}

// CLI entry point
if (require.main === module) {
  const isJson = process.argv.includes('--json');
  const rejectIdx = process.argv.indexOf('--reject');

  let rejectThreshold = null;
  if (rejectIdx >= 0) {
    rejectThreshold = parseFloat(process.argv[rejectIdx + 1]);
    // A NaN threshold makes every comparison false, which would turn the
    // gate into a silent pass; treat it as a usage error instead.
    if (Number.isNaN(rejectThreshold)) {
      console.error('--reject requires a numeric threshold');
      process.exit(2);
    }
  }

  const result = evaluate();

  if (isJson) {
    console.log(JSON.stringify(result, null, 2));
  } else {
    // Explicit type checks so a legitimate score of 0 is not shown as N/A.
    const mrrText = typeof result.mrr === 'number' ? result.mrr : 'N/A';
    const p1Text = typeof result.p1 === 'number' ? result.p1 : 'N/A';

    console.log('=== IR Evaluation ===');
    console.log('MRR: ' + mrrText);
    console.log('P@1: ' + p1Text);
    console.log('Samples: ' + (result.samples || 0));
    if (result.note) console.log('Note: ' + result.note);
    if (result.error) console.log('Error: ' + result.error);
  }

  if (rejectThreshold !== null) {
    // An evaluation that could not run must not pass the quality gate.
    if (result.error) {
      console.error('Evaluation failed: ' + result.error);
      process.exit(1);
    }
    if (result.mrr < rejectThreshold) {
      console.error('MRR ' + result.mrr + ' below threshold ' + rejectThreshold);
      process.exit(1);
    }
  }
}

module.exports = { evaluate, loadCorrections };