241 lines
7.3 KiB
JavaScript
241 lines
7.3 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* 关键词缺口检测器 (Keyword Gap Detector)
|
|
*
|
|
* 分析 route-feedback.jsonl 中的 miss 案例,检测 skills-index.json 中
|
|
* 缺失的关键词覆盖,输出建议报告。
|
|
*
|
|
* 原则: 只读分析,不自动修改任何 SKILL.md 或 skills-index.json。
|
|
*
|
|
* 用法:
|
|
* node scripts/keyword-gap-detector.js # 文本报告
|
|
* node scripts/keyword-gap-detector.js --json # JSON 输出
|
|
*/
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
function detectRoot() {
|
|
try { return require('./paths.config.js').PATHS.root; } catch {
|
|
const selfDir = __dirname;
|
|
if (selfDir.includes('.claude')) return selfDir.replace(/[/\\]scripts$/, '');
|
|
return '/mnt/c/Users/janson9527us/.claude';
|
|
}
|
|
}
|
|
|
|
// Root directory; every other location below is derived from it.
const ROOT = detectRoot();

// Directory holding the feedback log and the generated report.
const DEBUG_DIR = path.join(ROOT, 'debug');

// Input: the skills index and the routing feedback log (one JSON object per line).
const INDEX_FILE = path.join(ROOT, 'skills-index.json');
const FEEDBACK_FILE = path.join(ROOT, 'debug', 'route-feedback.jsonl');

// Output: the gap report written by detect().
const REPORT_FILE = path.join(ROOT, 'debug', 'keyword-gap-report.json');

// True when the process was started with the --json flag.
const JSON_MODE = process.argv.some((arg) => arg === '--json');
|
|
|
|
// === 数据加载 ===
|
|
|
|
/**
 * Read and parse the skills index file (INDEX_FILE).
 *
 * A leading UTF-8 byte-order mark is stripped before parsing: JSON.parse does
 * not treat U+FEFF as whitespace, so an index saved "with BOM" would otherwise
 * be rejected even though its content is valid.
 *
 * @returns {Object|null} The parsed index, or null when the file does not exist.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
function loadIndex() {
  if (!fs.existsSync(INDEX_FILE)) return null;
  const text = fs.readFileSync(INDEX_FILE, 'utf8').replace(/^\uFEFF/, '');
  return JSON.parse(text);
}
|
|
|
|
/**
 * Read the routing feedback log (FEEDBACK_FILE), one JSON value per line.
 *
 * Blank lines, lines that fail to parse, and lines that parse to a falsy
 * value are skipped on purpose so that one bad record does not discard the
 * rest of the log. A leading UTF-8 byte-order mark is stripped first;
 * otherwise it would make the first record unparseable and silently lost.
 *
 * @returns {Array} Parsed feedback entries in file order; empty when the file
 *   does not exist.
 */
function loadFeedback() {
  if (!fs.existsSync(FEEDBACK_FILE)) return [];

  const text = fs.readFileSync(FEEDBACK_FILE, 'utf8').replace(/^\uFEFF/, '');
  const entries = [];
  for (const line of text.split('\n')) {
    if (!line) continue;
    try {
      const entry = JSON.parse(line);
      if (entry) entries.push(entry);
    } catch {
      // Best-effort log reader: malformed lines are ignored deliberately.
    }
  }
  return entries;
}
|
|
|
|
// === 分析引擎 ===
|
|
|
|
/**
 * Build a lookup from skill name to the set of its lower-cased keywords.
 *
 * Tolerates incomplete index data instead of throwing: a missing `skills`
 * array yields an empty map, a skill without a `keywords` array gets an empty
 * set, and keyword entries that carry no string are skipped. A keyword entry
 * may be an object with a string `keyword` property or a bare string.
 *
 * @param {Object} index - Parsed skills index; expected shape is
 *   `{ skills: [{ name, keywords: [{ keyword }] }] }`.
 * @returns {Object} Map of the form `{ skillName: Set<string> }`.
 */
function buildSkillKeywordMap(index) {
  const map = {};
  const skills = index && Array.isArray(index.skills) ? index.skills : [];

  for (const skill of skills) {
    if (!skill) continue;

    const entries = Array.isArray(skill.keywords) ? skill.keywords : [];
    const keywords = new Set();
    for (const entry of entries) {
      const text = typeof entry === 'string' ? entry : entry && entry.keyword;
      if (typeof text === 'string') keywords.add(text.toLowerCase());
    }
    map[skill.name] = keywords;
  }

  return map;
}
|
|
|
|
// Generic tokens that are never worth suggesting as skill keywords.
// Matching is exact and whole-token, against the lower-cased token.
const GAP_NOISE_TOKENS = new Set([
  'test', 'testing', 'e2e',
  'security', '防护', '加固', 'hardening', '漏洞', 'vulnerability',
  '性能', 'performance', 'optimization', '调优', 'tuning', '加速',
  '接口', 'interface', 'rest', 'restful', 'graphql', 'endpoint',
  '部署', 'deploy', 'deployment', '发布', '上线', 'release', 'publish',
  '前端', 'frontend', 'front-end', '客户端', 'client-side', '界面',
  '容器', 'container', 'docker', '镜像', 'image', '容器化', 'containerize',
  '机器学习', 'machine learning', 'ml', 'model', '训练', 'training',
  '深度学习', 'deep learning', 'dl', '神经网络', 'neural network', 'tensorflow',
]);

/**
 * Analyze one feedback entry and report which of its query tokens the
 * corrected-to skill does not yet list as keywords.
 *
 * Query tokens are lower-cased before comparison because the keyword sets are
 * lower-cased; non-string tokens are ignored. The reported tokens are
 * therefore lower-case as well.
 *
 * @param {Object} fb - Feedback entry; reads `query`, `routedTo`,
 *   `correctedTo` and `queryTokens`.
 * @param {Object} skillKeywords - Map of skill name to Set of lower-cased keywords.
 * @returns {Object|null} `{ query, routedTo, correctedTo, coverageRate,
 *   coveredCount, totalTokens, suggestedKeywords, covered }`, or null when:
 *   the entry is not a correction, it was routed to 'unknown', the corrected
 *   skill is not in the map, or no meaningful uncovered token remains.
 */
function analyzeGap(fb, skillKeywords) {
  if (!fb || fb.routedTo === fb.correctedTo) return null; // not a correction
  if (fb.routedTo === 'unknown') return null;

  // Own-property check: a skill name such as "constructor" must not resolve
  // to a member inherited from Object.prototype.
  if (!Object.prototype.hasOwnProperty.call(skillKeywords, fb.correctedTo)) return null;
  const correctSkillKw = skillKeywords[fb.correctedTo];
  if (!correctSkillKw) return null; // target skill does not exist

  const queryTokens = new Set();
  for (const token of fb.queryTokens || []) {
    if (typeof token === 'string') queryTokens.add(token.toLowerCase());
  }

  // Split the query tokens by whether the correct skill already lists them.
  const covered = [];
  const uncovered = [];
  for (const token of queryTokens) {
    (correctSkillKw.has(token) ? covered : uncovered).push(token);
  }

  // Drop noise: single-character tokens and generic words.
  const meaningfulUncovered = uncovered.filter(
    (token) => token.length >= 2 && !GAP_NOISE_TOKENS.has(token)
  );

  // Only report entries that leave something worth suggesting.
  if (meaningfulUncovered.length === 0) return null;

  const coverageRate = queryTokens.size > 0
    ? Math.round((covered.length / queryTokens.size) * 100)
    : 0;

  return {
    query: fb.query,
    routedTo: fb.routedTo,
    correctedTo: fb.correctedTo,
    coverageRate,
    coveredCount: covered.length,
    totalTokens: queryTokens.size,
    suggestedKeywords: meaningfulUncovered,
    covered,
  };
}
|
|
|
|
/**
 * Group gap results by the skill they were corrected to and count how often
 * each suggested keyword occurs for that skill.
 *
 * Accumulation uses Map rather than plain objects so that skill names or
 * keywords that coincide with Object.prototype members (for example
 * "constructor") are counted like any other string.
 *
 * @param {Array} gaps - Gap results; each entry provides `correctedTo`,
 *   `suggestedKeywords` and `query`.
 * @returns {Object} `{ skillName: { missCount, suggestedKeywords:
 *   [{ keyword, frequency }], queries } }`, with suggested keywords sorted by
 *   descending frequency (ties keep first-seen order).
 */
function aggregateBySkill(gaps) {
  const grouped = new Map(); // skill name -> { gaps: [], keywordFreq: Map }

  for (const gap of gaps) {
    const skill = String(gap.correctedTo);
    let bucket = grouped.get(skill);
    if (!bucket) {
      bucket = { gaps: [], keywordFreq: new Map() };
      grouped.set(skill, bucket);
    }
    bucket.gaps.push(gap);

    for (const kw of gap.suggestedKeywords || []) {
      const keyword = String(kw);
      bucket.keywordFreq.set(keyword, (bucket.keywordFreq.get(keyword) || 0) + 1);
    }
  }

  // Object.fromEntries defines own properties, so any skill name is a safe key.
  return Object.fromEntries(
    [...grouped].map(([skill, bucket]) => [
      skill,
      {
        missCount: bucket.gaps.length,
        suggestedKeywords: [...bucket.keywordFreq]
          .sort((a, b) => b[1] - a[1])
          .map(([keyword, frequency]) => ({ keyword, frequency })),
        queries: bucket.gaps.map((g) => g.query),
      },
    ])
  );
}
|
|
|
|
// === 主流程 ===
|
|
|
|
/**
 * Run the full gap analysis and persist the report.
 *
 * Steps: load the skills index and the feedback log, analyze every feedback
 * entry, aggregate the resulting gaps per skill, compute summary statistics,
 * and write the report as pretty-printed JSON to REPORT_FILE (creating
 * DEBUG_DIR first when it does not exist).
 *
 * Exits the process with status 1 when the index cannot be loaded.
 *
 * @returns {Object|null} The report `{ generated, summary, bySkill, details }`,
 *   or null when there is no feedback to analyze.
 */
function detect() {
  const index = loadIndex();
  if (!index) {
    console.error('skills-index.json 不可用');
    process.exit(1);
  }

  const feedback = loadFeedback();
  if (feedback.length === 0) {
    console.log('无反馈数据');
    return null;
  }

  // Per-entry analysis; entries that yield no gap are dropped.
  const skillKeywords = buildSkillKeywordMap(index);
  const gaps = [];
  for (const entry of feedback) {
    const gap = analyzeGap(entry, skillKeywords);
    if (gap) gaps.push(gap);
  }

  const bySkill = aggregateBySkill(gaps);

  // Global statistics. A correction is an entry whose route was changed and
  // whose original route was not 'unknown'.
  const totalCorrections = feedback.reduce(
    (count, f) => (f.routedTo !== f.correctedTo && f.routedTo !== 'unknown' ? count + 1 : count),
    0
  );

  let avgCoverage = 100; // reported when no gap was detected
  if (gaps.length > 0) {
    let coverageSum = 0;
    for (const gap of gaps) coverageSum += gap.coverageRate;
    avgCoverage = Math.round(coverageSum / gaps.length);
  }

  const summary = {
    totalFeedback: feedback.length,
    totalCorrections,
    gapsDetected: gaps.length,
    avgCoverageRate: `${avgCoverage}%`,
    skillsAffected: Object.keys(bySkill).length,
  };
  const report = {
    generated: new Date().toISOString(),
    summary,
    bySkill,
    details: gaps,
  };

  // Persist the report.
  if (!fs.existsSync(DEBUG_DIR)) {
    fs.mkdirSync(DEBUG_DIR, { recursive: true });
  }
  fs.writeFileSync(REPORT_FILE, `${JSON.stringify(report, null, 2)}\n`);

  return report;
}
|
|
|
|
/**
 * CLI entry point: run the detection and print the result.
 *
 * Prints nothing further when detect() returns no report. With JSON_MODE set,
 * prints the report as pretty-printed JSON; otherwise prints a text summary
 * followed, for each affected skill, by up to ten suggested keywords and the
 * related queries, and finally the location of the saved report.
 */
function main() {
  const report = detect();
  if (!report) {
    return;
  }

  if (JSON_MODE) {
    console.log(JSON.stringify(report, null, 2));
    return;
  }

  const { summary, bySkill } = report;

  console.log('=== 关键词缺口检测报告 ===\n');
  console.log(`反馈总数: ${summary.totalFeedback}`);
  console.log(`纠正总数: ${summary.totalCorrections}`);
  console.log(`缺口数: ${summary.gapsDetected}`);
  console.log(`平均覆盖: ${summary.avgCoverageRate}`);
  console.log(`涉及技能: ${summary.skillsAffected}`);
  console.log('');

  Object.entries(bySkill).forEach(([skill, data]) => {
    console.log(`## ${skill} (${data.missCount} miss)`);

    console.log(` 建议添加关键词:`);
    // Show only the ten most frequent suggestions per skill.
    data.suggestedKeywords.slice(0, 10).forEach(({ keyword, frequency }) => {
      console.log(` + "${keyword}" (出现 ${frequency} 次)`);
    });

    console.log(` 相关查询:`);
    for (const relatedQuery of data.queries) {
      console.log(` - "${relatedQuery}"`);
    }

    console.log('');
  });

  console.log(`报告已保存: ${REPORT_FILE}`);
}
|
|
|
|
// 模块导出
|
|
if (typeof module !== 'undefined') {
|
|
module.exports = {
|
|
loadIndex, loadFeedback, buildSkillKeywordMap,
|
|
analyzeGap, aggregateBySkill, detect, main,
|
|
};
|
|
}
|
|
|
|
if (require.main === module) {
|
|
main();
|
|
}
|