bookworm-smart-assistant/scripts/ir-eval.js

127 lines
4.3 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
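/**
* IR evaluation module (P1-FIX)
*
* Computes standard IR metrics for the routing system:
* - MRR (Mean Reciprocal Rank): mean of 1/rank of the correct skill in the ranking
* - P@1 (Precision at 1): fraction of queries whose top-ranked skill is the correct one
*
* Data source: scripts/golden-set.json when it contains entries; otherwise the
* correction records in debug/route-feedback.jsonl (both under the configured root)
*
* Usage:
* node scripts/ir-eval.js # evaluate the current system
* node scripts/ir-eval.js --json # JSON output
* node scripts/ir-eval.js --reject 0.5 # exit with a non-zero code when MRR is below 0.5
*/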
const fs = require('fs');
const path = require('path');
// Root directory, taken from PATHS.root in the sibling paths.config.js module.
const CLAUDE_ROOT = require('./paths.config.js').PATHS.root;
// JSONL feedback log read by loadCorrections(): <root>/debug/route-feedback.jsonl
const FEEDBACK_FILE = path.join(CLAUDE_ROOT, 'debug', 'route-feedback.jsonl');
/**
 * Read the feedback log and return the records that describe a routing correction.
 *
 * The log is read as one JSON document per line. Lines that fail to parse are
 * dropped. A record is kept only when it has a truthy `correctedTo` that
 * differs from its `routedTo`.
 *
 * @returns {Array<Object>} The correction records; empty when the log file does not exist.
 */
function loadCorrections() {
  if (!fs.existsSync(FEEDBACK_FILE)) {
    return [];
  }

  const rawLines = fs.readFileSync(FEEDBACK_FILE, 'utf8').trim().split('\n');
  const kept = [];
  for (const rawLine of rawLines) {
    let record;
    try {
      record = JSON.parse(rawLine);
    } catch {
      // Best-effort: an unparsable line is ignored rather than failing the load.
      continue;
    }
    if (record && record.correctedTo && record.routedTo !== record.correctedTo) {
      kept.push(record);
    }
  }
  return kept;
}
/**
 * Load the golden set from <root>/scripts/golden-set.json and reshape each
 * entry into the same form the evaluator uses for corrections.
 *
 * Each returned object has `query`, `correctedTo` (copied from the entry's
 * `expectedSkill`) and `source` (the entry's own `source`, or 'golden' when
 * that is falsy).
 *
 * @returns {Array<{query: *, correctedTo: *, source: *}>} The reshaped entries;
 *   empty when the file is missing, cannot be read or parsed, or has no `entries`.
 */
function loadGoldenSet() {
  const goldenPath = path.join(CLAUDE_ROOT, 'scripts', 'golden-set.json');
  if (!fs.existsSync(goldenPath)) {
    return [];
  }

  try {
    const parsed = JSON.parse(fs.readFileSync(goldenPath, 'utf8'));
    const rawEntries = parsed.entries || [];
    return rawEntries.map((item) => {
      const source = item.source || 'golden';
      return { query: item.query, correctedTo: item.expectedSkill, source };
    });
  } catch {
    // Any read/parse/shape problem is treated the same as "no golden set".
    return [];
  }
}
/**
 * Run an offline evaluation of the routing engine and compute MRR and P@1.
 *
 * Evaluation entries come from the golden set when it is non-empty; otherwise
 * from the correction records. The two sources are NOT merged.
 *
 * For each entry that has a `query` (or `prompt`), every skill in
 * <root>/skills-index-lite.json is scored through ./route-analyzer.js, the
 * skills are sorted by descending score, and the 1-based rank of the entry's
 * `correctedTo` skill is looked up. An entry whose correct skill does not
 * appear in the ranking still counts as a sample, contributing 0 to both
 * metrics.
 *
 * @returns {Object} One of:
 *   - `{ mrr: 1, p1: 1, samples: 0, note }` when there is nothing to evaluate
 *     (no entries, no usable index, or no entry could be scored);
 *   - `{ error }` when ./route-analyzer.js cannot be loaded;
 *   - `{ mrr, p1, samples, totalCorrections, goldenSetSize, scoringErrors }`
 *     with `mrr` and `p1` rounded to three decimals. `scoringErrors` is the
 *     number of entries skipped because scoring threw.
 */
function evaluate() {
  const corrections = loadCorrections();
  const goldenEntries = loadGoldenSet();

  // The golden set takes precedence; corrections are only a fallback.
  const allEntries = goldenEntries.length > 0 ? goldenEntries : corrections;

  // With no data at all the metrics are reported as vacuously perfect.
  if (allEntries.length === 0) return { mrr: 1.0, p1: 1.0, samples: 0, note: 'No data' };

  // Load the routing engine for offline scoring.
  let routeAnalyzer;
  try {
    routeAnalyzer = require('./route-analyzer.js');
  } catch {
    return { error: 'Cannot load route-analyzer' };
  }

  const nothingEvaluable = () => ({ mrr: 1.0, p1: 1.0, samples: 0, note: 'No evaluable corrections' });

  // The index and the BM25 parameters do not depend on the query, so they are
  // loaded once instead of once per entry. A missing or unusable index means
  // no entry can be scored, which is reported as "no evaluable corrections".
  const idxFile = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
  if (!fs.existsSync(idxFile)) return nothingEvaluable();
  let index;
  let bm25Params;
  try {
    index = JSON.parse(fs.readFileSync(idxFile, 'utf8'));
    bm25Params = routeAnalyzer.buildBM25Params ? routeAnalyzer.buildBM25Params(index) : null;
  } catch {
    return nothingEvaluable();
  }

  let rrSum = 0; // sum of reciprocal ranks
  let p1Hits = 0; // number of entries whose correct skill ranked first
  let evaluated = 0; // entries that were scored successfully
  let scoringErrors = 0; // entries skipped because scoring threw

  for (const entry of allEntries) {
    if (!entry.query && !entry.prompt) continue;
    const query = entry.query || entry.prompt || '';
    const correctSkill = entry.correctedTo;

    try {
      // Fall back to lower-cased whitespace tokens when the analyzer has no tokenizer.
      const tokens = routeAnalyzer.tokenize
        ? routeAnalyzer.tokenize(query)
        : new Set(query.toLowerCase().split(/\s+/));

      const scores = [];
      for (const skill of (index.skills || [])) {
        const result = routeAnalyzer.scoreSkill
          ? routeAnalyzer.scoreSkill(skill, tokens, bm25Params)
          : { totalScore: 0 };
        scores.push({ name: skill.name, score: result.totalScore });
      }
      scores.sort((a, b) => b.score - a.score);

      // 1-based rank of the correct skill; 0 means it is not in the ranking.
      const rank = scores.findIndex(s => s.name === correctSkill) + 1;
      if (rank > 0) {
        rrSum += 1.0 / rank;
        if (rank === 1) p1Hits++;
      }
      evaluated++;
    } catch {
      // Best-effort: one failing entry must not abort the run, but it is
      // counted so the failure is visible in the result.
      scoringErrors++;
    }
  }

  if (evaluated === 0) return nothingEvaluable();

  return {
    mrr: Math.round((rrSum / evaluated) * 1000) / 1000,
    p1: Math.round((p1Hits / evaluated) * 1000) / 1000,
    samples: evaluated,
    totalCorrections: allEntries.length,
    goldenSetSize: goldenEntries.length,
    scoringErrors,
  };
}
// CLI entry point: runs only when this file is executed directly, not when it
// is loaded through require().
//
// Flags:
//   --json         print the evaluation result as pretty-printed JSON
//   --reject <t>   exit with code 1 when MRR is below <t> or could not be computed
//
// Exit codes: 0 on success, 1 when the --reject gate fails, 2 when --reject
// is given without a numeric value.
if (require.main === module) {
  const isJson = process.argv.includes('--json');
  const rejectIdx = process.argv.indexOf('--reject');
  const rejectThreshold = rejectIdx >= 0 ? parseFloat(process.argv[rejectIdx + 1]) : null;

  // A missing or non-numeric value parses to NaN. Every comparison against NaN
  // is false, so without this check the gate would silently never fire.
  if (rejectThreshold !== null && Number.isNaN(rejectThreshold)) {
    console.error('--reject requires a numeric threshold');
    process.exit(2);
  }

  const result = evaluate();

  // Type check instead of truthiness so a real score of 0 prints as 0, not "N/A".
  const show = (value) => (typeof value === 'number' ? value : 'N/A');

  if (isJson) {
    console.log(JSON.stringify(result, null, 2));
  } else {
    console.log('=== IR Evaluation ===');
    console.log('MRR: ' + show(result.mrr));
    console.log('P@1: ' + show(result.p1));
    console.log('Samples: ' + (result.samples || 0));
    if (result.note) console.log('Note: ' + result.note);
    if (result.error) console.log('Error: ' + result.error);
  }

  if (rejectThreshold !== null) {
    // When evaluation failed there is no MRR; `undefined < threshold` is false,
    // so the gate must fail explicitly instead of passing by accident.
    if (typeof result.mrr !== 'number') {
      console.error('MRR unavailable' + (result.error ? ' (' + result.error + ')' : '') + '; threshold ' + rejectThreshold + ' not met');
      process.exit(1);
    }
    if (result.mrr < rejectThreshold) {
      console.error('MRR ' + result.mrr + ' below threshold ' + rejectThreshold);
      process.exit(1);
    }
  }
}
// Public API of this module. Only evaluate and loadCorrections are exported;
// loadGoldenSet stays module-private.
module.exports = { evaluate, loadCorrections };