127 lines
4.3 KiB
JavaScript
127 lines
4.3 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
 * IR evaluation module (P1-FIX)
 *
 * Computes standard IR metrics for the routing system:
 * - MRR (Mean Reciprocal Rank): mean reciprocal rank of the correct skill
 * - P@1 (Precision at 1): top-1 hit rate
 *
 * Data source: scripts/golden-set.json when it contains entries; otherwise
 * the correction records in debug/route-feedback.jsonl.
 *
 * Usage:
 *   node scripts/ir-eval.js               # evaluate the current system
 *   node scripts/ir-eval.js --json        # JSON output
 *   node scripts/ir-eval.js --reject 0.5  # exit non-zero when MRR is below 0.5
 */
|
||
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
const CLAUDE_ROOT = require('./paths.config.js').PATHS.root;
|
||
const FEEDBACK_FILE = path.join(CLAUDE_ROOT, 'debug', 'route-feedback.jsonl');
|
||
|
||
/**
 * Read the routing feedback log and keep only the entries that record an
 * actual correction.
 *
 * The file is treated as JSON Lines. Lines that are not valid JSON are
 * dropped. An entry is kept when it has a truthy `correctedTo` that differs
 * from its `routedTo`.
 *
 * @returns {Array<Object>} The correction entries, or an empty array when the
 *   feedback file does not exist.
 */
function loadCorrections() {
  if (!fs.existsSync(FEEDBACK_FILE)) {
    return [];
  }

  const rawLines = fs.readFileSync(FEEDBACK_FILE, 'utf8').trim().split('\n');
  const corrections = [];

  for (const rawLine of rawLines) {
    let entry;
    try {
      entry = JSON.parse(rawLine);
    } catch {
      // Not a valid JSON record (this includes the empty line produced by an
      // empty file): ignore it.
      continue;
    }

    if (!entry || !entry.correctedTo) continue;
    if (entry.routedTo === entry.correctedTo) continue;
    corrections.push(entry);
  }

  return corrections;
}
|
||
|
||
|
||
/**
 * Load the golden evaluation set and normalise it to the same shape used for
 * feedback corrections.
 *
 * Each item of the file's `entries` list is mapped to
 * `{ query, correctedTo, source }`, where `correctedTo` is taken from
 * `expectedSkill` and `source` falls back to 'golden' when absent or falsy.
 *
 * @returns {Array<{query: *, correctedTo: *, source: *}>} Normalised entries,
 *   or an empty array when the file is missing, cannot be parsed, or cannot be
 *   mapped.
 */
function loadGoldenSet() {
  const goldenPath = path.join(CLAUDE_ROOT, 'scripts', 'golden-set.json');
  if (!fs.existsSync(goldenPath)) {
    return [];
  }

  try {
    const rawText = fs.readFileSync(goldenPath, 'utf8');
    const parsed = JSON.parse(rawText);
    const rawEntries = parsed.entries || [];

    const toEvalEntry = (item) => {
      const normalised = {
        query: item.query,
        correctedTo: item.expectedSkill,
        source: item.source || 'golden',
      };
      return normalised;
    };

    return rawEntries.map(toEvalEntry);
  } catch {
    // Any read, parse or mapping failure yields an empty golden set.
    return [];
  }
}
|
||
|
||
/**
 * Run an offline evaluation of the router and compute MRR and P@1.
 *
 * Evaluation data is the golden set when it is non-empty, otherwise the
 * feedback corrections (the two sources are not merged). For every entry that
 * has a `query` or `prompt`, all skills in `skills-index-lite.json` are scored
 * with the route analyzer, sorted by descending score, and the 1-based rank of
 * the entry's `correctedTo` skill is recorded. An expected skill that is not
 * in the index contributes 0 to both metrics but still counts as a sample.
 *
 * The skills index and the BM25 parameters do not depend on the entry, so they
 * are loaded once up front instead of once per entry.
 *
 * @returns {Object} One of:
 *   - `{ mrr, p1, samples, totalCorrections, goldenSetSize, scoringErrors }`
 *     on a normal run (`mrr` and `p1` rounded to three decimals;
 *     `scoringErrors` is the number of entries skipped because scoring threw);
 *   - `{ mrr: 1, p1: 1, samples: 0, note }` when there is no data or nothing
 *     could be evaluated (vacuous pass);
 *   - `{ error }` when `./route-analyzer.js` cannot be loaded.
 */
function evaluate() {
  const corrections = loadCorrections();
  const goldenEntries = loadGoldenSet();
  // The golden set takes precedence: when present it replaces the corrections.
  const allEntries = goldenEntries.length > 0 ? goldenEntries : corrections;

  if (allEntries.length === 0) {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No data' };
  }

  // Load the routing engine for offline evaluation.
  let routeAnalyzer;
  try {
    routeAnalyzer = require('./route-analyzer.js');
  } catch {
    return { error: 'Cannot load route-analyzer' };
  }

  // Load the skills index and derive the BM25 parameters once. A missing or
  // unusable index means no entry can be evaluated.
  const idxFile = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
  let skills;
  let bm25Params;
  try {
    if (!fs.existsSync(idxFile)) {
      return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
    }
    const index = JSON.parse(fs.readFileSync(idxFile, 'utf8'));
    bm25Params = routeAnalyzer.buildBM25Params ? routeAnalyzer.buildBM25Params(index) : null;
    skills = index.skills || [];
  } catch {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
  }

  let rrSum = 0; // Sum of reciprocal ranks
  let p1Hits = 0; // Number of top-1 hits
  let evaluated = 0;
  let scoringErrors = 0; // Entries dropped because tokenizing/scoring threw

  for (const entry of allEntries) {
    const query = entry.query || entry.prompt;
    if (!query) continue;
    const correctSkill = entry.correctedTo;

    try {
      const tokens = routeAnalyzer.tokenize
        ? routeAnalyzer.tokenize(query)
        : new Set(query.toLowerCase().split(/\s+/));

      const scores = [];
      for (const skill of skills) {
        const result = routeAnalyzer.scoreSkill
          ? routeAnalyzer.scoreSkill(skill, tokens, bm25Params)
          : { totalScore: 0 };
        scores.push({ name: skill.name, score: result.totalScore });
      }
      scores.sort((a, b) => b.score - a.score);

      // Locate the rank of the correct skill (0 means it is not in the index).
      const rank = scores.findIndex((s) => s.name === correctSkill) + 1;
      if (rank > 0) {
        rrSum += 1.0 / rank;
        if (rank === 1) p1Hits++;
      }
      evaluated++;
    } catch {
      // Best effort: one bad entry must not abort the run, but record it so a
      // shrinking sample is visible in the result.
      scoringErrors++;
    }
  }

  if (evaluated === 0) {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
  }

  return {
    mrr: Math.round((rrSum / evaluated) * 1000) / 1000,
    p1: Math.round((p1Hits / evaluated) * 1000) / 1000,
    samples: evaluated,
    totalCorrections: allEntries.length,
    goldenSetSize: goldenEntries.length,
    scoringErrors,
  };
}
|
||
|
||
// CLI entry point.
//   --json          print the raw result object as JSON
//   --reject <num>  exit with status 1 when MRR is below <num>, or when no MRR
//                   could be computed; exit with status 2 when <num> is
//                   missing or not a number
if (require.main === module) {
  const isJson = process.argv.includes('--json');
  const rejectIdx = process.argv.indexOf('--reject');
  const rejectThreshold = rejectIdx >= 0 ? parseFloat(process.argv[rejectIdx + 1]) : null;

  // A missing or non-numeric threshold parses to NaN, and every comparison
  // against NaN is false, so the gate would silently pass. Fail loudly instead.
  if (rejectThreshold !== null && Number.isNaN(rejectThreshold)) {
    console.error('--reject requires a numeric threshold');
    process.exit(2);
  }

  const result = evaluate();

  // Show 'N/A' only when the metric is absent; a score of 0 is a real value.
  const formatMetric = (value) => (typeof value === 'number' ? value : 'N/A');

  if (isJson) {
    console.log(JSON.stringify(result, null, 2));
  } else {
    console.log('=== IR Evaluation ===');
    console.log('MRR: ' + formatMetric(result.mrr));
    console.log('P@1: ' + formatMetric(result.p1));
    console.log('Samples: ' + (result.samples || 0));
    if (result.note) console.log('Note: ' + result.note);
    if (result.error) console.log('Error: ' + result.error);
  }

  if (rejectThreshold !== null) {
    // Without an MRR (e.g. the route analyzer failed to load) the gate cannot
    // be evaluated, so it must not pass.
    if (typeof result.mrr !== 'number') {
      console.error('MRR unavailable' + (result.error ? ': ' + result.error : ''));
      process.exit(1);
    }
    if (result.mrr < rejectThreshold) {
      console.error('MRR ' + result.mrr + ' below threshold ' + rejectThreshold);
      process.exit(1);
    }
  }
}
|
||
|
||
// Expose the evaluation entry point and the feedback-log loader to modules
// that require this file.
module.exports = { evaluate, loadCorrections };
|