142 lines
5.2 KiB
JavaScript
142 lines
5.2 KiB
JavaScript
|
|
#!/usr/bin/env node
|
|||
|
|
/**
|
|||
|
|
* A/B 回测脚本: 用历史纠正数据验证消歧规则效果
|
|||
|
|
*
|
|||
|
|
* 读取 route-feedback.jsonl 的纠正记录,
|
|||
|
|
* 对每个 query 分别用 "原始 BM25" 和 "BM25 + 消歧" 评分,
|
|||
|
|
* 对比两者的路由准确率变化。
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
const fs = require('fs');
|
|||
|
|
const path = require('path');
|
|||
|
|
|
|||
|
|
/**
 * Resolve the project root from the sibling `paths.config.js` module.
 *
 * @returns {*} The value that module exposes as `PATHS.root`; the rest of the
 *   script passes it to `path.join` as the base directory.
 */
function detectRoot() {
  const { PATHS } = require('./paths.config.js');
  return PATHS.root;
}

const ROOT = detectRoot();
|
|||
|
|
|
|||
|
|
// Pull in the scoring helpers exported by scripts/route-analyzer.js.
const analyzer = require(path.join(ROOT, 'scripts', 'route-analyzer.js'));
const tokenize = analyzer.tokenize;
const scoreSkill = analyzer.scoreSkill;
const buildBM25Params = analyzer.buildBM25Params;
const normalizeScores = analyzer.normalizeScores;
const applyDisambiguation = analyzer.applyDisambiguation;

// Read and parse skills-index.json from the project root.
const indexFile = path.join(ROOT, 'skills-index.json');
const indexJson = fs.readFileSync(indexFile, 'utf8');
const index = JSON.parse(indexJson);

// Build the global BM25 parameters once from the parsed index.
const bm25Params = buildBM25Params(index);
|
|||
|
|
|
|||
|
|
// Load the feedback log: one JSON record per line (JSON Lines).
const feedbackFile = path.join(ROOT, 'debug', 'route-feedback.jsonl');
const lines = fs.readFileSync(feedbackFile, 'utf8')
  .split(/\r?\n/)
  .filter((line) => line.trim() !== '');

// Keep only records that can actually be replayed. A line that fails to parse,
// parses to a non-object value (number, string, array), or has no string
// `query` is skipped and counted. Without this check a bare value such as `1`
// would be classified as a "confirm" below (undefined === undefined) and
// inflate the baseline accuracy.
let skippedLines = 0;
const entries = [];
for (const line of lines) {
  let record = null;
  try {
    record = JSON.parse(line);
  } catch {
    record = null;
  }
  const isReplayable =
    record !== null &&
    typeof record === 'object' &&
    !Array.isArray(record) &&
    typeof record.query === 'string';
  if (isReplayable) {
    entries.push(record);
  } else {
    skippedLines += 1;
  }
}
if (skippedLines > 0) {
  // Report dropped lines on stderr instead of swallowing them silently.
  console.warn(`[WARN] 跳过 ${skippedLines} 行无法解析或缺少 query 的反馈记录`);
}

// Split into corrections (the routed skill differs from the corrected one)
// and confirmations (the two are identical).
const corrections = entries.filter((e) => e.routedTo !== e.correctedTo);
const confirms = entries.filter((e) => e.routedTo === e.correctedTo);
|
|||
|
|
|
|||
|
|
/**
 * Run the scoring pipeline for one query and return the ranked results.
 *
 * Every skill in `index.skills` is scored with `scoreSkill`, the score is
 * rounded to two decimals, and the list is sorted by descending score. When
 * `withDisambig` is truthy the sorted list is passed through
 * `applyDisambiguation` before `normalizeScores` is applied.
 *
 * @param {string} query - The query text to score.
 * @param {boolean} [withDisambig=true] - Whether to apply the disambiguation rules.
 * @returns {*} Whatever `normalizeScores` returns for the ranked list.
 */
function scoreQuery(query, withDisambig = true) {
  const tokens = tokenize(query);

  // BM25-score every skill in the index.
  const ranked = [];
  for (const skill of index.skills) {
    const { totalScore, matchedKeywords } = scoreSkill(skill, tokens, bm25Params);
    const rounded = Math.round(totalScore * 100) / 100;
    ranked.push({ name: skill.name, score: rounded, matchedKeywords });
  }
  ranked.sort((left, right) => right.score - left.score);

  // Baseline path: no disambiguation.
  if (!withDisambig) {
    return normalizeScores(ranked);
  }

  const disambiguated = applyDisambiguation(ranked, query, index);
  return normalizeScores(disambiguated);
}
|
|||
|
|
|
|||
|
|
// Banner plus the baseline numbers taken straight from the feedback log.
console.log('=== A/B 回测: 消歧规则效果验证 ===\n');
console.log(`反馈总数: ${entries.length} (确认 ${confirms.length}, 纠正 ${corrections.length})`);

// Guard the ratio: with zero feedback entries the division yields NaN and the
// line would print "NaN%". An empty log is reported as 0.0% instead.
const baselineAccuracyPct = entries.length === 0
  ? 0
  : (confirms.length / entries.length) * 100;
console.log(`原始准确率: ${baselineAccuracyPct.toFixed(1)}%\n`);
|
|||
|
|
|
|||
|
|
// Replay every corrected case twice: once with plain BM25 and once with
// BM25 + disambiguation. These counters are also updated by the confirmation
// pass and read by the summary later in the script.
let fixedByDisambig = 0;
let stillWrong = 0;
let confirmStillCorrect = 0;
let confirmRegressed = 0;
let alreadyCorrectWithout = 0;

console.log('--- 纠正 case 回测 ---');
for (const { query, correctedTo: expected } of corrections) {
  const plainRanking = scoreQuery(query, false);
  const disambigRanking = scoreQuery(query, true);

  // Top-ranked skill name for each variant, or a placeholder when the ranking is empty.
  const oldTop = plainRanking[0]?.name || '(none)';
  const newTop = disambigRanking[0]?.name || '(none)';

  // Plain BM25 already yields the corrected skill.
  if (oldTop === expected) {
    alreadyCorrectWithout += 1;
    console.log(` [ALREADY] "${query}" → ${oldTop} (BM25 已修复,可能因关键词降级)`);
    continue;
  }

  // Only the disambiguated ranking yields the corrected skill.
  if (newTop === expected) {
    fixedByDisambig += 1;
    console.log(` [FIXED] "${query}"`);
    console.log(` BM25: ${oldTop} → 消歧: ${newTop} (正确)`);
    continue;
  }

  // Neither variant puts the corrected skill on top.
  stillWrong += 1;
  console.log(` [MISS] "${query}"`);
  console.log(` BM25: ${oldTop}, 消歧: ${newTop}, 期望: ${expected}`);
}
|
|||
|
|
|
|||
|
|
// Regression check: every confirmed case should still rank its confirmed
// skill first once disambiguation is enabled.
console.log('\n--- 确认 case 回归检查 ---');
for (const confirmed of confirms) {
  const ranking = scoreQuery(confirmed.query, true);
  const newTop = ranking[0]?.name || '(none)';

  if (newTop !== confirmed.correctedTo) {
    confirmRegressed += 1;
    console.log(` [REGRESS] "${confirmed.query}"`);
    console.log(` 原: ${confirmed.correctedTo} → 新: ${newTop}`);
    continue;
  }

  confirmStillCorrect += 1;
}
|
|||
|
|
|
|||
|
|
// Summary.
// Percentage with one decimal, returned as a string; a zero denominator gives
// "0.0" instead of "NaN" so an empty feedback log still produces a usable report.
const toPercent = (count, total) => (total === 0 ? 0 : (count / total) * 100).toFixed(1);

// NOTE(review): corrected cases that plain BM25 already gets right
// (alreadyCorrectWithout) are not counted here, so newAccuracy only credits
// fixes attributed to disambiguation — confirm this is the intended metric.
const newCorrectCount = confirms.length - confirmRegressed + fixedByDisambig;
const newAccuracy = toPercent(newCorrectCount, entries.length);
const oldAccuracy = toPercent(confirms.length, entries.length);

console.log('\n=== 汇总 ===');
console.log(`纠正修复: ${fixedByDisambig}/${corrections.length}`);
console.log(`仍然错误: ${stillWrong}/${corrections.length}`);
// Surface the third outcome bucket so the three correction lines add up to the total.
console.log(`BM25 已正确: ${alreadyCorrectWithout}/${corrections.length}`);
console.log(`确认保持: ${confirmStillCorrect}/${confirms.length}`);
console.log(`确认回归: ${confirmRegressed}/${confirms.length}`);
console.log(`\n准确率: ${oldAccuracy}% → ${newAccuracy}% (${newCorrectCount}/${entries.length})`);

// Only prefix "+" for non-negative deltas; a negative number already carries
// its own sign (the hard-coded "+" used to print "+-1.2%").
const accuracyDelta = Number(newAccuracy) - Number(oldAccuracy);
const deltaSign = accuracyDelta < 0 ? '' : '+';
console.log(`提升: ${deltaSign}${accuracyDelta.toFixed(1)}%`);

if (confirmRegressed > 0) {
  console.log(`\n[WARN] 发现 ${confirmRegressed} 个回归,需要检查消歧规则`);
}
|
|||
|
|
|
|||
|
|
// Machine-readable report. The subtraction coerces both accuracy values to
// numbers before the difference is rounded to one decimal.
const improvementText = (newAccuracy - oldAccuracy).toFixed(1);
const correctionStats = {
  total: corrections.length,
  fixed: fixedByDisambig,
  missed: stillWrong,
};
const confirmStats = {
  total: confirms.length,
  kept: confirmStillCorrect,
  regressed: confirmRegressed,
};
const report = {
  ts: new Date().toISOString(),
  totalEntries: entries.length,
  originalAccuracy: Number.parseFloat(oldAccuracy),
  newAccuracy: Number.parseFloat(newAccuracy),
  improvement: Number.parseFloat(improvementText),
  corrections: correctionStats,
  confirms: confirmStats,
};

// Write the report next to the feedback log, pretty-printed with a trailing newline.
const reportFile = path.join(ROOT, 'debug', 'ab-backtest-report.json');
const serializedReport = `${JSON.stringify(report, null, 2)}\n`;
fs.writeFileSync(reportFile, serializedReport);
console.log(`\n报告已保存: ${reportFile}`);
|