bookworm-smart-assistant/scripts/ir-eval.js

127 lines
4.3 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
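/**
 * IR evaluation module (P1-FIX)
 *
 * Computes standard IR metrics for the routing system:
 * - MRR (Mean Reciprocal Rank): mean of 1/rank of the correct skill in the ranking
 * - P@1 (Precision at 1): fraction of queries whose top-ranked skill is the correct one
 *
 * Data source: scripts/golden-set.json when it contains entries; otherwise the
 * correction records in debug/route-feedback.jsonl (the two are not merged).
 *
 * Usage:
 *   node scripts/ir-eval.js                # evaluate the current system
 *   node scripts/ir-eval.js --json         # JSON output
 *   node scripts/ir-eval.js --reject 0.5   # exit non-zero when MRR is below 0.5
 */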
const fs = require('fs');
const path = require('path');
// Base directory taken from the project's path configuration; every data file
// this script reads is resolved relative to it.
const CLAUDE_ROOT = require('./paths.config.js').PATHS.root;
// Feedback log read by loadCorrections(), parsed as one JSON record per line.
const FEEDBACK_FILE = path.join(CLAUDE_ROOT, 'debug', 'route-feedback.jsonl');
/**
 * Read the routing feedback log and return only the genuine corrections.
 *
 * The log is parsed line by line as JSON. A record is kept when it parses,
 * has a truthy `correctedTo`, and its `routedTo` differs from `correctedTo`.
 * Lines that are not valid JSON are skipped.
 *
 * @returns {object[]} Parsed correction records; an empty array when the log
 *   file does not exist.
 */
function loadCorrections() {
  if (!fs.existsSync(FEEDBACK_FILE)) {
    return [];
  }

  const rawLines = fs.readFileSync(FEEDBACK_FILE, 'utf8').trim().split('\n');
  const kept = [];

  for (const rawLine of rawLines) {
    let record;
    try {
      record = JSON.parse(rawLine);
    } catch {
      // Unparseable line: ignore it and keep going.
      continue;
    }

    if (!record || !record.correctedTo) {
      continue;
    }
    if (record.routedTo === record.correctedTo) {
      // The router was already right, so this is not a correction.
      continue;
    }
    kept.push(record);
  }

  return kept;
}
/**
 * Load the golden set from `scripts/golden-set.json` under the root directory
 * and reshape each entry to the same fields the evaluation loop reads from
 * correction records.
 *
 * Each entry becomes `{ query, correctedTo, source }`, where `correctedTo` is
 * the entry's `expectedSkill` and `source` falls back to `'golden'`.
 *
 * @returns {object[]} Normalised entries; an empty array when the file is
 *   missing, cannot be parsed, or cannot be mapped.
 */
function loadGoldenSet() {
  const goldenPath = path.join(CLAUDE_ROOT, 'scripts', 'golden-set.json');
  if (!fs.existsSync(goldenPath)) {
    return [];
  }

  const toEvalEntry = ({ query, expectedSkill, source }) => ({
    query,
    correctedTo: expectedSkill,
    source: source || 'golden',
  });

  try {
    const parsed = JSON.parse(fs.readFileSync(goldenPath, 'utf8'));
    const rawEntries = parsed.entries || [];
    return rawEntries.map(toEvalEntry);
  } catch {
    // Any read/parse/shape problem is treated as "no golden set".
    return [];
  }
}
function evaluate() {
const corrections = loadCorrections();
const goldenEntries = loadGoldenSet();
// 合并: golden set + corrections (golden set 优先作为评估数据源)
const allEntries = goldenEntries.length > 0 ? goldenEntries : corrections;
// 如果有 golden set忽略 vacuous check
if (allEntries.length === 0) return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No data' };
// 加载路由引擎进行离线评估
let routeAnalyzer;
try { routeAnalyzer = require('./route-analyzer.js'); } catch { return { error: 'Cannot load route-analyzer' }; }
let rrSum = 0; // 倒数排名之和
let p1Hits = 0; // P@1 命中数
let evaluated = 0;
for (const c of allEntries) {
if (!c.query && !c.prompt) continue;
const query = c.query || c.prompt || '';
const correctSkill = c.correctedTo;
try {
// 加载索引
const idxFile = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
if (!fs.existsSync(idxFile)) continue;
const index = JSON.parse(fs.readFileSync(idxFile, 'utf8'));
const tokens = routeAnalyzer.tokenize ? routeAnalyzer.tokenize(query) : new Set(query.toLowerCase().split(/\s+/));
const bm25Params = routeAnalyzer.buildBM25Params ? routeAnalyzer.buildBM25Params(index) : null;
const scores = [];
for (const skill of (index.skills || [])) {
const result = routeAnalyzer.scoreSkill
? routeAnalyzer.scoreSkill(skill, tokens, bm25Params)
: { totalScore: 0 };
scores.push({ name: skill.name, score: result.totalScore });
}
scores.sort((a, b) => b.score - a.score);
// 找正确技能的排名
const rank = scores.findIndex(s => s.name === correctSkill) + 1;
if (rank > 0) {
rrSum += 1.0 / rank;
if (rank === 1) p1Hits++;
}
evaluated++;
} catch {}
}
if (evaluated === 0) return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' };
return {
mrr: Math.round((rrSum / evaluated) * 1000) / 1000,
p1: Math.round((p1Hits / evaluated) * 1000) / 1000,
samples: evaluated,
totalCorrections: allEntries.length,
goldenSetSize: goldenEntries.length,
};
}
// CLI 入口
// CLI entry point: runs only when this file is executed directly.
//   --json         print the raw result object as JSON
//   --reject <n>   exit with code 1 when MRR is below <n> or cannot be computed
// Exit code 2 signals a usage error (missing or non-numeric --reject value).
if (require.main === module) {
  const isJson = process.argv.includes('--json');
  const rejectIdx = process.argv.indexOf('--reject');
  const rejectThreshold = rejectIdx >= 0 ? parseFloat(process.argv[rejectIdx + 1]) : null;

  // A missing or non-numeric value parses to NaN, and every comparison with
  // NaN is false, so without this check the gate would silently pass.
  if (rejectThreshold !== null && Number.isNaN(rejectThreshold)) {
    console.error('--reject requires a numeric threshold');
    process.exit(2);
  }

  const result = evaluate();

  // A score of 0 is a real value; only a missing metric is shown as N/A.
  const formatMetric = (value) => (typeof value === 'number' ? value : 'N/A');

  if (isJson) {
    console.log(JSON.stringify(result, null, 2));
  } else {
    console.log('=== IR Evaluation ===');
    console.log('MRR: ' + formatMetric(result.mrr));
    console.log('P@1: ' + formatMetric(result.p1));
    console.log('Samples: ' + (result.samples || 0));
    if (result.note) console.log('Note: ' + result.note);
    if (result.error) console.error('Error: ' + result.error);
  }

  if (rejectThreshold !== null) {
    // An evaluation that produced no MRR (e.g. the analyzer failed to load)
    // must fail the gate rather than pass it.
    if (typeof result.mrr !== 'number') {
      console.error('MRR unavailable' + (result.error ? ': ' + result.error : ''));
      process.exit(1);
    }
    if (result.mrr < rejectThreshold) {
      console.error('MRR ' + result.mrr + ' below threshold ' + rejectThreshold);
      process.exit(1);
    }
  }
}
// Exports for programmatic use; loadGoldenSet is not exported.
module.exports = { evaluate, loadCorrections };