127 lines
4.3 KiB
JavaScript
127 lines
4.3 KiB
JavaScript
|
|
#!/usr/bin/env node
|
|||
|
|
/**
 * IR evaluation module (P1-FIX)
 *
 * Computes standard information-retrieval metrics for the routing system:
 * - MRR (Mean Reciprocal Rank): mean of 1/rank of the correct skill in the ranking
 * - P@1 (Precision at 1): fraction of queries whose top-ranked skill is the correct one
 *
 * Data source: correction records in route-feedback.jsonl
 * (scripts/golden-set.json is used instead when it contains entries).
 *
 * Usage:
 *   node scripts/ir-eval.js               # evaluate the current system
 *   node scripts/ir-eval.js --json        # JSON output
 *   node scripts/ir-eval.js --reject 0.5  # exit with a non-zero code when MRR is below 0.5
 */
|
|||
|
|
|
|||
|
|
const fs = require('fs');
|
|||
|
|
const path = require('path');
|
|||
|
|
// Root directory for all data files used here, taken from PATHS.root as exported by ./paths.config.js.
const CLAUDE_ROOT = require('./paths.config.js').PATHS.root;
|
|||
|
|
// Location of the route feedback log: <root>/debug/route-feedback.jsonl (read line by line as JSON).
const FEEDBACK_FILE = path.join(CLAUDE_ROOT, 'debug', 'route-feedback.jsonl');
|
|||
|
|
|
|||
|
|
/**
 * Load routing corrections from a JSONL feedback log.
 *
 * Every line is parsed independently as JSON. Blank lines and lines that are
 * not valid JSON are skipped. A record is kept only when it has a truthy
 * `correctedTo` that differs from its `routedTo`.
 *
 * @param {string} [feedbackFile=FEEDBACK_FILE] - Path of the JSONL file to read.
 * @returns {object[]} The parsed correction records, in file order; an empty
 *   array when the file does not exist.
 */
function loadCorrections(feedbackFile = FEEDBACK_FILE) {
  if (!fs.existsSync(feedbackFile)) return [];

  const corrections = [];
  for (const line of fs.readFileSync(feedbackFile, 'utf8').split(/\r?\n/)) {
    if (line.trim() === '') continue;

    let record;
    try {
      record = JSON.parse(line);
    } catch {
      // A corrupt line must not invalidate the rest of the log.
      continue;
    }

    if (record && record.correctedTo && record.routedTo !== record.correctedTo) {
      corrections.push(record);
    }
  }
  return corrections;
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
 * Load the golden evaluation set and normalise it to the record shape used for
 * corrections (`query`, `correctedTo`, `source`).
 *
 * The file is expected to be a JSON document with an `entries` array whose
 * items carry `query`, `expectedSkill` and optionally `source`. Items that are
 * `null` or not objects are skipped so that one bad item does not discard the
 * whole set.
 *
 * @param {string} [goldenSetFile] - Path of the golden-set JSON file; defaults
 *   to `<CLAUDE_ROOT>/scripts/golden-set.json`.
 * @returns {{query: *, correctedTo: *, source: *}[]} Normalised entries; an
 *   empty array when the file is missing, unreadable, or not valid JSON.
 */
function loadGoldenSet(goldenSetFile = path.join(CLAUDE_ROOT, 'scripts', 'golden-set.json')) {
  if (!fs.existsSync(goldenSetFile)) return [];

  try {
    const goldenSet = JSON.parse(fs.readFileSync(goldenSetFile, 'utf8'));
    const rawEntries = goldenSet.entries || [];

    const normalised = [];
    for (const entry of rawEntries) {
      if (entry === null || typeof entry !== 'object') continue;
      normalised.push({
        query: entry.query,
        correctedTo: entry.expectedSkill,
        source: entry.source || 'golden',
      });
    }
    return normalised;
  } catch {
    // Best-effort: an unreadable or malformed golden set is treated as absent.
    return [];
  }
}
|
|||
|
|
|
|||
|
|
/**
 * Run an offline evaluation of the router and report MRR and P@1.
 *
 * Evaluation data is the golden set when it has entries; otherwise the logged
 * corrections are used (the two sources are not merged). For each entry that
 * has a `query` or `prompt`, every skill in `skills-index-lite.json` is scored
 * through `./route-analyzer.js`, the skills are sorted by descending score, and
 * the 1-based rank of the entry's `correctedTo` skill is recorded. An entry
 * whose expected skill is not in the index still counts as a sample and
 * contributes 0 to both metrics.
 *
 * @returns {object} One of:
 *   - `{ mrr, p1, samples, totalCorrections, goldenSetSize }` with `mrr` and
 *     `p1` rounded to three decimals;
 *   - `{ mrr: 1, p1: 1, samples: 0, note }` when there is no data or nothing
 *     could be evaluated (including a missing or unreadable skills index);
 *   - `{ error }` when `./route-analyzer.js` cannot be loaded.
 */
function evaluate() {
  const corrections = loadCorrections();
  const goldenEntries = loadGoldenSet();
  // The golden set takes precedence: when present it replaces the corrections.
  const allEntries = goldenEntries.length > 0 ? goldenEntries : corrections;
  if (allEntries.length === 0) return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No data' };

  // Load the routing engine for offline scoring.
  let routeAnalyzer;
  try {
    routeAnalyzer = require('./route-analyzer.js');
  } catch {
    return { error: 'Cannot load route-analyzer' };
  }

  // The index and the BM25 parameters do not depend on the entry being
  // evaluated, so they are prepared once instead of once per entry. Any
  // failure here would have failed for every entry, hence the same result as
  // "nothing could be evaluated".
  let skills;
  let bm25Params;
  try {
    const idxFile = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
    if (!fs.existsSync(idxFile)) {
      return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
    }
    const index = JSON.parse(fs.readFileSync(idxFile, 'utf8'));
    bm25Params = routeAnalyzer.buildBM25Params ? routeAnalyzer.buildBM25Params(index) : null;
    skills = index.skills || [];
  } catch {
    return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
  }

  let rrSum = 0; // sum of reciprocal ranks
  let p1Hits = 0; // number of entries whose expected skill ranked first
  let evaluated = 0;

  for (const entry of allEntries) {
    const query = entry.query || entry.prompt;
    if (!query) continue;
    const correctSkill = entry.correctedTo;

    try {
      const tokens = routeAnalyzer.tokenize
        ? routeAnalyzer.tokenize(query)
        : new Set(query.toLowerCase().split(/\s+/));

      const scores = [];
      for (const skill of skills) {
        const result = routeAnalyzer.scoreSkill
          ? routeAnalyzer.scoreSkill(skill, tokens, bm25Params)
          : { totalScore: 0 };
        scores.push({ name: skill.name, score: result.totalScore });
      }
      scores.sort((a, b) => b.score - a.score);

      // findIndex yields -1 when the skill is absent, so rank 0 means "not found".
      const rank = scores.findIndex((s) => s.name === correctSkill) + 1;
      if (rank > 0) {
        rrSum += 1.0 / rank;
        if (rank === 1) p1Hits++;
      }
      evaluated++;
    } catch {
      // Best-effort: an entry that cannot be scored is left out of the sample.
    }
  }

  if (evaluated === 0) return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };

  return {
    mrr: Math.round((rrSum / evaluated) * 1000) / 1000,
    p1: Math.round((p1Hits / evaluated) * 1000) / 1000,
    samples: evaluated,
    totalCorrections: allEntries.length,
    goldenSetSize: goldenEntries.length,
  };
}
|
|||
|
|
|
|||
|
|
// CLI entry point: runs only when this file is executed directly, not when it
// is required as a module.
//
// Flags:
//   --json          print the raw result object as JSON instead of text
//   --reject <num>  exit with code 1 when MRR is below <num>, or when no MRR
//                   could be computed; exit with code 2 when <num> is missing
//                   or not a number
if (require.main === module) {
  const isJson = process.argv.includes('--json');
  const rejectIdx = process.argv.indexOf('--reject');

  let rejectThreshold = null;
  if (rejectIdx >= 0) {
    rejectThreshold = Number.parseFloat(process.argv[rejectIdx + 1]);
    // A NaN threshold would make every "<" comparison false and silently
    // disable the gate, so refuse to run with one.
    if (Number.isNaN(rejectThreshold)) {
      console.error('--reject requires a numeric MRR threshold');
      process.exit(2);
    }
  }

  const result = evaluate();

  // 0 is a legitimate metric value and must not be shown as missing.
  const formatMetric = (value) => (typeof value === 'number' ? value : 'N/A');

  if (isJson) {
    console.log(JSON.stringify(result, null, 2));
  } else {
    console.log('=== IR Evaluation ===');
    console.log('MRR: ' + formatMetric(result.mrr));
    console.log('P@1: ' + formatMetric(result.p1));
    console.log('Samples: ' + (result.samples || 0));
    if (result.note) console.log('Note: ' + result.note);
    if (result.error) console.error('Error: ' + result.error);
  }

  if (rejectThreshold !== null) {
    // evaluate() returns no mrr when it fails; a quality gate must not pass
    // just because nothing could be measured.
    if (typeof result.mrr !== 'number') {
      console.error('MRR unavailable; cannot verify threshold ' + rejectThreshold);
      process.exit(1);
    }
    if (result.mrr < rejectThreshold) {
      console.error('MRR ' + result.mrr + ' below threshold ' + rejectThreshold);
      process.exit(1);
    }
  }
}
|
|||
|
|
|
|||
|
|
// Public API of this module: the evaluation entry point and the feedback-log loader.
module.exports = { evaluate, loadCorrections };
|