bookworm-smart-assistant/scripts/archive/keyword-gap-detector.js

241 lines
7.3 KiB
JavaScript

#!/usr/bin/env node
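/**
 * Keyword Gap Detector (关键词缺口检测器)
 *
 * Analyzes the miss cases recorded in route-feedback.jsonl, detects keyword
 * coverage that is missing from skills-index.json, and outputs a report of
 * suggested keywords.
 *
 * Principle: analysis only — never modifies any SKILL.md or skills-index.json
 * automatically. The report is also saved to debug/keyword-gap-report.json.
 *
 * Usage:
 *   node scripts/keyword-gap-detector.js          # text report
 *   node scripts/keyword-gap-detector.js --json   # JSON output
 */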
const fs = require('fs');
const path = require('path');
/**
 * Resolve the root directory that holds skills-index.json and debug/.
 *
 * Resolution order:
 *   1. `PATHS.root` exported by the sibling paths.config.js module;
 *   2. if that lookup throws for any reason, this script's own directory
 *      (with one trailing "scripts" segment stripped) when its path
 *      contains ".claude";
 *   3. otherwise a hard-coded fallback path.
 *
 * @returns {string} root directory path
 */
function detectRoot() {
  let configuredRoot;
  let configLoaded = true;
  try {
    configuredRoot = require('./paths.config.js').PATHS.root;
  } catch {
    // Any failure (missing module, missing PATHS, ...) selects the fallback.
    configLoaded = false;
  }
  if (configLoaded) return configuredRoot;

  const scriptDir = __dirname;
  const insideClaudeTree = scriptDir.includes('.claude');
  return insideClaudeTree
    ? scriptDir.replace(/[/\\]scripts$/, '')
    : '/mnt/c/Users/janson9527us/.claude';
}
// Root directory resolved once at module load; every path below derives from it.
const ROOT = detectRoot();
// Directory holding the feedback log (input) and the generated report (output).
const DEBUG_DIR = path.join(ROOT, 'debug');
// Skill index that supplies each skill's keyword list.
const INDEX_FILE = path.join(ROOT, 'skills-index.json');
// Routing feedback log, parsed as one JSON document per line.
const FEEDBACK_FILE = path.join(DEBUG_DIR, 'route-feedback.jsonl');
// Destination of the JSON report written by detect().
const REPORT_FILE = path.join(DEBUG_DIR, 'keyword-gap-report.json');
// True when --json appears anywhere on the command line.
const JSON_MODE = process.argv.includes('--json');
// === 数据加载 ===
/**
 * Read and parse the skill index at INDEX_FILE.
 *
 * @returns {Object|null} the parsed JSON, or null when the file does not exist
 * @throws {SyntaxError} when the file exists but is not valid JSON
 */
function loadIndex() {
  const indexPresent = fs.existsSync(INDEX_FILE);
  if (indexPresent) {
    const rawText = fs.readFileSync(INDEX_FILE, 'utf8');
    return JSON.parse(rawText);
  }
  return null;
}
/**
 * Read the routing feedback log at FEEDBACK_FILE, one JSON document per line.
 *
 * Empty lines, lines that fail to parse, and lines that parse to a falsy
 * value (null, 0, false, "") are dropped.
 *
 * @returns {Array} parsed entries in file order; [] when the file is missing
 */
function loadFeedback() {
  if (!fs.existsSync(FEEDBACK_FILE)) return [];

  const entries = [];
  const lines = fs.readFileSync(FEEDBACK_FILE, 'utf8').split('\n');
  for (const line of lines) {
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // A malformed line is skipped so the remaining entries stay usable.
      continue;
    }
    if (parsed) entries.push(parsed);
  }
  return entries;
}
// === 分析引擎 ===
/**
 * Build a lookup from skill name to the lower-cased set of its keywords.
 *
 * Incomplete entries are tolerated so that one bad skill does not abort the
 * whole analysis: a skill whose `keywords` is not an array maps to an empty
 * set, and keyword entries without a string `keyword` field are skipped.
 *
 * @param {Object} index - parsed skills-index.json; `index.skills` is iterated
 * @returns {Object} { skillName: Set<keyword> }
 */
function buildSkillKeywordMap(index) {
  const map = {};
  for (const skill of index.skills) {
    const entries = Array.isArray(skill.keywords) ? skill.keywords : [];
    const keywords = new Set();
    for (const entry of entries) {
      if (entry && typeof entry.keyword === 'string') {
        // Lower-cased so lookups can be case-insensitive.
        keywords.add(entry.keyword.toLowerCase());
      }
    }
    map[skill.name] = keywords;
  }
  return map;
}
// Generic tokens that are never suggested as new keywords. Matched against the
// lower-cased token, so "Docker" and "docker" are treated alike.
const NOISE_PATTERNS = /^(test|testing|e2e|security|防护|加固|hardening|漏洞|vulnerability|性能|performance|optimization|调优|tuning|加速|接口|interface|rest|restful|graphql|endpoint|部署|deploy|deployment|发布|上线|release|publish|前端|frontend|front-end|客户端|client-side|界面|容器|container|docker|镜像|image|容器化|containerize|机器学习|machine learning|ml|model|训练|training|深度学习|deep learning|dl|神经网络|neural network|tensorflow)$/;

/**
 * Analyze the keyword gap of a single feedback entry.
 *
 * An entry is analyzed only when it records a correction (routedTo differs
 * from correctedTo), was not routed to 'unknown', and the corrected skill has
 * a keyword set in `skillKeywords`. Each distinct query token is then
 * classified as covered or uncovered by that keyword set. Matching is
 * case-insensitive because the keyword sets are stored lower-cased.
 *
 * @param {Object} fb - feedback entry; reads query, routedTo, correctedTo, queryTokens
 * @param {Object} skillKeywords - { skillName: Set<keyword> }
 * @returns {Object|null} gap description { query, routedTo, correctedTo,
 *   coverageRate, coveredCount, totalTokens, suggestedKeywords, covered }, or
 *   null when the entry is skipped or has no meaningful uncovered token
 */
function analyzeGap(fb, skillKeywords) {
  if (fb.routedTo === fb.correctedTo) return null; // not a correction
  if (fb.routedTo === 'unknown') return null;

  const correctSkillKw = skillKeywords[fb.correctedTo];
  // instanceof also rejects inherited Object.prototype members (for example a
  // correctedTo of "constructor"), which are truthy but are not keyword sets.
  if (!(correctSkillKw instanceof Set)) return null;

  const queryTokens = new Set(fb.queryTokens || []);

  // Split the distinct query tokens by whether the corrected skill covers them.
  const covered = [];
  const uncovered = [];
  for (const token of queryTokens) {
    const isCovered = correctSkillKw.has(token)
      || (typeof token === 'string' && correctSkillKw.has(token.toLowerCase()));
    if (isCovered) {
      covered.push(token);
    } else {
      uncovered.push(token);
    }
  }

  // Drop noise: non-string tokens, single-character tokens, generic words.
  const meaningfulUncovered = uncovered.filter(t =>
    typeof t === 'string' && t.length >= 2 && !NOISE_PATTERNS.test(t.toLowerCase())
  );

  // Report only when at least one meaningful token is left uncovered.
  if (meaningfulUncovered.length === 0) return null;

  const coverageRate = queryTokens.size > 0
    ? Math.round(covered.length / queryTokens.size * 100)
    : 0;

  return {
    query: fb.query,
    routedTo: fb.routedTo,
    correctedTo: fb.correctedTo,
    coverageRate,
    coveredCount: covered.length,
    totalTokens: queryTokens.size,
    suggestedKeywords: meaningfulUncovered,
    covered,
  };
}
/**
 * Group gap results by corrected skill and count suggested keywords.
 *
 * Counting uses Map rather than plain objects so that tokens or skill names
 * that collide with Object.prototype members (e.g. "constructor", "toString")
 * are counted correctly.
 *
 * @param {Array} gaps - gap results; reads correctedTo, suggestedKeywords, query
 * @returns {Object} { skillName: { missCount, suggestedKeywords: [{ keyword,
 *   frequency }], queries } } with suggestedKeywords sorted by frequency,
 *   highest first
 */
function aggregateBySkill(gaps) {
  const bySkill = new Map();
  for (const gap of gaps) {
    let bucket = bySkill.get(gap.correctedTo);
    if (!bucket) {
      bucket = { gaps: [], keywordFreq: new Map() };
      bySkill.set(gap.correctedTo, bucket);
    }
    bucket.gaps.push(gap);
    for (const kw of gap.suggestedKeywords) {
      bucket.keywordFreq.set(kw, (bucket.keywordFreq.get(kw) || 0) + 1);
    }
  }

  // Object.fromEntries defines own properties, so no skill name can reach the
  // result object's prototype.
  return Object.fromEntries(
    Array.from(bySkill, ([skill, data]) => [
      skill,
      {
        missCount: data.gaps.length,
        suggestedKeywords: Array.from(data.keywordFreq)
          .sort((a, b) => b[1] - a[1])
          .map(([keyword, frequency]) => ({ keyword, frequency })),
        queries: data.gaps.map(g => g.query),
      },
    ])
  );
}
// === 主流程 ===
/**
 * Run the full analysis: load the skill index and feedback log, compute the
 * keyword gap of every feedback entry, aggregate per skill, and write the
 * report to REPORT_FILE.
 *
 * Calls process.exit(1) when the skill index cannot be loaded.
 *
 * @returns {Object|null} the report { generated, summary, bySkill, details },
 *   or null when the feedback log has no entries
 */
function detect() {
  const index = loadIndex();
  if (!index) {
    console.error('skills-index.json 不可用');
    process.exit(1);
  }

  const feedback = loadFeedback();
  if (feedback.length === 0) {
    console.log('无反馈数据');
    return null;
  }

  // Per-entry gap analysis; entries that yield no gap are dropped.
  const skillKeywords = buildSkillKeywordMap(index);
  const gaps = [];
  for (const entry of feedback) {
    const gap = analyzeGap(entry, skillKeywords);
    if (gap) gaps.push(gap);
  }
  const bySkill = aggregateBySkill(gaps);

  // Global statistics. A correction is any entry whose route was changed and
  // was not originally 'unknown'.
  let correctionCount = 0;
  for (const entry of feedback) {
    const unchanged = entry.routedTo === entry.correctedTo;
    if (!unchanged && entry.routedTo !== 'unknown') correctionCount += 1;
  }

  // With no gaps at all, coverage is reported as 100.
  let avgCoverage = 100;
  if (gaps.length > 0) {
    let coverageSum = 0;
    for (const gap of gaps) coverageSum += gap.coverageRate;
    avgCoverage = Math.round(coverageSum / gaps.length);
  }

  const summary = {
    totalFeedback: feedback.length,
    totalCorrections: correctionCount,
    gapsDetected: gaps.length,
    avgCoverageRate: `${avgCoverage}%`,
    skillsAffected: Object.keys(bySkill).length,
  };
  const report = {
    generated: new Date().toISOString(),
    summary,
    bySkill,
    details: gaps,
  };

  // Persist the report, creating the debug directory on first use.
  const debugDirMissing = !fs.existsSync(DEBUG_DIR);
  if (debugDirMissing) {
    fs.mkdirSync(DEBUG_DIR, { recursive: true });
  }
  const serialized = JSON.stringify(report, null, 2);
  fs.writeFileSync(REPORT_FILE, serialized + '\n');

  return report;
}
/**
 * CLI entry point: run detect() and print the report to stdout, either as
 * JSON (when JSON_MODE is set) or as a human-readable text summary.
 * Prints nothing further when detect() returns no report.
 */
function main() {
  const report = detect();
  if (!report) return;

  if (JSON_MODE) {
    console.log(JSON.stringify(report, null, 2));
    return;
  }

  const { summary, bySkill } = report;

  console.log('=== 关键词缺口检测报告 ===\n');
  console.log(`反馈总数: ${summary.totalFeedback}`);
  console.log(`纠正总数: ${summary.totalCorrections}`);
  console.log(`缺口数: ${summary.gapsDetected}`);
  console.log(`平均覆盖: ${summary.avgCoverageRate}`);
  console.log(`涉及技能: ${summary.skillsAffected}`);
  console.log('');

  Object.entries(bySkill).forEach(([skill, data]) => {
    console.log(`## ${skill} (${data.missCount} miss)`);

    // Only the ten most frequent suggestions are printed per skill.
    console.log(` 建议添加关键词:`);
    data.suggestedKeywords.slice(0, 10).forEach((kw) => {
      console.log(` + "${kw.keyword}" (出现 ${kw.frequency} 次)`);
    });

    console.log(` 相关查询:`);
    data.queries.forEach((q) => {
      console.log(` - "${q}"`);
    });

    console.log('');
  });

  console.log(`报告已保存: ${REPORT_FILE}`);
}
// Module exports: expose the loaders, the analysis functions and the entry
// points so other scripts can call them.
if (typeof module !== 'undefined') {
module.exports = {
loadIndex, loadFeedback, buildSkillKeywordMap,
analyzeGap, aggregateBySkill, detect, main,
};
}
// Run the CLI only when this file is executed directly, not when it is
// loaded through require().
if (require.main === module) {
main();
}