bookworm-smart-assistant/scripts/patches/test-route-regression-0427.js

158 lines
6.2 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
'use strict';
/**
 * v6.6.1 routing-accuracy regression test (5 test cases).
 * Calls route-engine + intent-classifier + disambiguation directly to verify them.
 */
// Node built-ins. NOTE(review): `fs` is not referenced in the visible part of this script.
const path = require('path');
const fs = require('fs');
// ROOT is the directory two levels above this script's own directory.
const ROOT = path.join(__dirname, '..', '..');
// Load the core modules under test from <ROOT>/scripts.
const routeEngine = require(path.join(ROOT, 'scripts', 'route-engine.js'));
const intentClassifier = require(path.join(ROOT, 'scripts', 'intent-classifier.js'));
// Current working directory; passed as the second argument to routeEngine.runRouteEngine().
const cwd = process.cwd();
// Running tallies, incremented by test() and by the inline checks, then reported in the final summary.
let passed = 0, failed = 0;
/**
 * Run one route-engine regression case, print a report and record the verdict
 * in the module-level `passed` / `failed` counters.
 *
 * The prompt is classified with `intentClassifier.classify` (a neutral
 * placeholder intent is used when the module does not expose `classify`) and
 * then handed to `routeEngine.runRouteEngine(prompt, cwd, intent)`.
 *
 * A case passes when the expected skill is the primary route, or — with
 * `opts.allowTop3` — appears among the first three candidates. With
 * `opts.checkCap`, a cold-start result whose confidence exceeds 0.65 fails the
 * case instead of merely being printed as a bug.
 *
 * An exception thrown by the classifier or the engine is reported as a FAIL for
 * this case so that the remaining cases and the final summary still run.
 *
 * @param {string} name - Label printed in the report.
 * @param {string} prompt - Prompt handed to the classifier and the route engine.
 * @param {string} expectPrimary - Skill name expected as the routing result.
 * @param {object} [opts]
 * @param {boolean} [opts.allowTop3] - Also accept the expected skill within the top-3 candidates.
 * @param {boolean} [opts.checkCap] - When cold start was applied, require confidence <= 0.65.
 * @returns {void}
 */
function test(name, prompt, expectPrimary, opts = {}) {
  let result;
  try {
    const intent = intentClassifier.classify
      ? intentClassifier.classify(prompt)
      : { intents: [], entities: [], modifiers: [], complexity: 'medium' };
    result = routeEngine.runRouteEngine(prompt, cwd, intent);
  } catch (err) {
    // Record the crash as a failure of this case rather than aborting the whole run.
    failed++;
    console.log(`\n[FAIL] ${name}`);
    console.log(` prompt: "${prompt}"`);
    console.log(` expect: ${expectPrimary}`);
    console.log(` ** ERROR: ${err instanceof Error ? err.stack : String(err)}`);
    return;
  }

  // A missing result is treated as an empty one, which surfaces as a MISMATCH below.
  const route = result ?? {};
  const primary = route.primary;
  const confidence = route.confidence;
  const coldStart = route._coldStartApplied || false;
  const firedRules = (route._firedRules || []).map((r) => r.id || r.rule || '').filter(Boolean);

  // Check whether the expected skill was hit (primary route, or top-3 candidate).
  const top3Names = (route.candidates || []).slice(0, 3).map((c) => c.name);
  const isPrimaryHit = primary === expectPrimary;
  const isTop3Hit = top3Names.includes(expectPrimary);
  const hit = isPrimaryHit || Boolean(opts.allowTop3 && isTop3Hit);

  // The cap is only meaningful when the caller asked for it and cold start was applied.
  const capChecked = Boolean(opts.checkCap && coldStart);
  const capApplied = confidence <= 0.65;
  const capViolated = capChecked && !capApplied;

  const ok = hit && !capViolated;
  if (ok) {
    passed++;
  } else {
    failed++;
  }

  console.log(`\n[${ok ? 'PASS' : 'FAIL'}] ${name}`);
  console.log(` prompt: "${prompt}"`);
  console.log(` expect: ${expectPrimary}`);
  console.log(` got: ${primary} (cf: ${confidence})`);
  console.log(` top-3: ${top3Names.join(', ')}`);
  console.log(` rules: ${firedRules.length > 0 ? firedRules.join(', ') : '(none)'}`);
  console.log(` coldStart: ${coldStart}`);
  if (capChecked) {
    console.log(` cap@0.65: ${capApplied ? 'YES' : 'NO (BUG!)'}`);
  }
  if (!hit) {
    console.log(` ** MISMATCH: expected ${expectPrimary}, got ${primary}`);
    if (isTop3Hit) console.log(` ** (但在 top-3 候选中)`);
  }
}
console.log('=== Bookworm v6.6.1 Route Regression Test ===\n');

// Route-engine level cases, run in order. Each row is
// [label, prompt, expected skill, options]; a row without options uses test()'s defaults.
const routeCases = [
  // TC1 (R90): SLI monitoring/alerting prompt is expected to route to sre-expert.
  ['TC1: SLI 监控告警 → sre-expert (R90)', 'SLI 监控告警配置', 'sre-expert'],
  // TC2 (R91): "which modules does changing this function affect" → impact-analyst.
  ['TC2: 函数影响分析 → impact-analyst (R91)', '改这个函数会影响哪些模块', 'impact-analyst'],
  // TC3 (R92): analysing sales data from Google Sheets → data-analyst-expert.
  ['TC3: Google Sheets 数据分析 → data-analyst-expert (R92)', '从 Google Sheets 分析销售数据', 'data-analyst-expert'],
  // TC4: a bare confirmation word. At the route-engine layer the expected answer is
  // 'none' (inheritance is handled at the bundle layer); this guards against the
  // engine confidently picking an unrelated skill.
  ['TC4: 确认词 "执行" (路由引擎层)', '执行', 'none', { allowTop3: true }],
  // TC5: image query with accompanying text, which may still produce a semantic hit.
  // NOTE(review): the original remark said the engine should return 'none' here, but
  // the coded expectation is 'debugger-expert' (primary or top-3) — confirm which is intended.
  ['TC5: 图片查询 (路由引擎层)', '[Image #1] 看看这个报错', 'debugger-expert', { allowTop3: true }],
];
for (const [label, prompt, expected, options] of routeCases) {
  test(label, prompt, expected, options);
}
// === Supplement: cold-start cap verification ===
// Runs one prompt with a hand-built 'general' intent. If the engine reports that
// cold start was applied and returns at least two candidates, the gap between the
// top two candidate confidences decides whether the 0.65 cap is checked.
console.log('\n--- 补充: 冷启动 cap 机制验证 ---');
const csIntent = { intents: ['general'], entities: [], modifiers: [], complexity: 'medium' };
const csResult = routeEngine.runRouteEngine('帮我检查一下系统健康状态', cwd, csIntent);
const csApplied = csResult._coldStartApplied || false;
const csConf = csResult.confidence;
const csCandidates = csResult.candidates;
const csCheckable = csApplied && csCandidates && csCandidates.length >= 2;
if (!csCheckable) {
  // Nothing to verify: cold start not applied, or fewer than two candidates.
  console.log(` coldStart: ${csApplied}, confidence: ${csConf} — cap 验证跳过`);
} else {
  const topConf = csCandidates[0]?.confidence || 0;
  const runnerUpConf = csCandidates[1]?.confidence || 0;
  const gap = topConf - runnerUpConf;
  console.log(` coldStart: true, gap: ${gap.toFixed(3)}, confidence: ${csConf}`);
  const capHeld = gap < 0.15 && csConf <= 0.65;
  if (capHeld) {
    console.log(' [PASS] cap 在 route-engine 层生效');
    passed++;
  } else if (gap >= 0.15) {
    // Wide gap: reported as SKIP and not counted either way.
    console.log(' [SKIP] gap >= 0.15, cap 不需要触发');
  } else {
    console.log(` [FAIL] cap 应为 0.65 但实际为 ${csConf}`);
    failed++;
  }
}
// === TC4/TC5 inheritance checks (simulating the bundle layer) ===
console.log('\n--- 补充: 继承逻辑模拟验证 ---');

// Stand-in for route-state-current.json holding a valid previous-turn route.
const mockPrevState = {
  ts: new Date().toISOString(),
  routing: {
    primary: 'debugger-expert',
    candidates: [{ name: 'debugger-expert', confidence: 0.85 }],
    confidence: 0.85,
    chain: [],
    lastValidPrimary: 'debugger-expert',
  },
  lastValidPrimary: 'debugger-expert',
};

// TC4-inherit: confirmation-word inheritance.
// NOTE(review): matching is by substring against a hard-coded prompt, so this check
// cannot fail as written; confirm it mirrors the real bundle-layer matching.
const confirmWords = ['执行', '开始', '继续', '确认', '好的', '行', '可以', 'go', 'yes', 'proceed', 'ok'];
const tc4prompt = '执行';
const isConfirm = confirmWords.some((word) => tc4prompt.includes(word));
if (!isConfirm) {
  console.log(` [FAIL] TC4-inherit: "${tc4prompt}" 未匹配确认词`);
  failed++;
} else {
  const { primary: inheritedSkill, confidence: prevConfidence } = mockPrevState.routing;
  // The printed inherited confidence is the previous confidence scaled by 0.7.
  const inheritedConf = (prevConfidence * 0.7).toFixed(2);
  console.log(` [PASS] TC4-inherit: "${tc4prompt}" 匹配确认词列表, bundle 层会触发 tryInherit()`);
  console.log(` → 继承结果: ${inheritedSkill} (cf: ${inheritedConf})`);
  passed++;
}

// TC5-inherit: image prompt inherits via lastValidPrimary
// (top-level field first, then the copy nested under routing).
const tc5prompt = '[Image #1] 看看这个报错';
const isImage = /\[Image\s*#?\d+\]/.test(tc5prompt);
const lvp = mockPrevState.lastValidPrimary || mockPrevState.routing?.lastValidPrimary;
let tc5Ok = false;
let tc5Line;
if (!isImage) {
  tc5Line = ` [FAIL] TC5-inherit: 未检测到图片模式`;
} else if (lvp && lvp !== 'none') {
  tc5Ok = true;
  tc5Line = ` [PASS] TC5-inherit: 图片检测 + lastValidPrimary="${lvp}" → 继承成功`;
} else {
  tc5Line = ` [FAIL] TC5-inherit: 图片检测成功但 lastValidPrimary 为空`;
}
console.log(tc5Line);
if (tc5Ok) {
  passed++;
} else {
  failed++;
}
// === Summary ===
// Print the totals and exit non-zero when any check failed.
const totalCount = passed + failed;
const verdictText = failed === 0 ? 'ALL PASS ✓' : `${failed} FAILURES ✗`;
const exitCode = failed > 0 ? 1 : 0;
console.log(`\n${'='.repeat(50)}`);
console.log(`TOTAL: ${totalCount} tests, ${passed} PASS, ${failed} FAIL`);
console.log(`VERDICT: ${verdictText}`);
process.exit(exitCode);