bookworm-smart-assistant/scripts/synonym-miner.js

161 lines
5.8 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
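/**
 * synonym-miner.js — automatically mines candidate synonym groups from skills-index-lite.json.
 *
 * Algorithm:
 * 1. Load the keywords of every skill.
 * 2. Count how many skills each keyword appears in.
 * 3. Keep keywords that appear in 2-4 skills as candidates (keywords in more skills are
 *    too generic and are excluded; keywords in fewer are too specific to have synonym value).
 * 4. Compute the Jaccard similarity between candidate keywords to find co-occurrence groups.
 * 5. Compare with the existing synonyms.json and output only new candidates it does not cover yet.
 *
 * Usage: node synonym-miner.js [--min-skills 2] [--max-skills 4] [--threshold 0.25]
 */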
const fs = require('fs');
const path = require('path');
// Configuration: command-line arguments after the script path.
const args = process.argv.slice(2);

/**
 * Read a numeric command-line option of the form `--<name> <value>`.
 *
 * @param {string} name - Option name without the leading `--`.
 * @param {number} def - Value returned when the option is absent or unusable.
 * @returns {number} The parsed value, or `def` when the flag is missing, has no
 *   value after it, or its value does not parse as a number.
 */
const getArg = (name, def) => {
  const idx = args.indexOf(`--${name}`);
  const raw = idx >= 0 ? args[idx + 1] : undefined;
  // A missing flag, or a flag with no (or an empty) value, means "use the default".
  if (!raw) return def;
  const value = Number(raw);
  // Number('abc') and Number('--other-flag') are NaN. Letting NaN through would make
  // every numeric comparison against the option false, so fall back to the default.
  if (Number.isNaN(value)) {
    console.warn(`Ignoring non-numeric value "${raw}" for --${name}; using default ${def}`);
    return def;
  }
  return value;
};
// Tunables, each overridable from the command line via getArg.
const MIN_SKILLS = getArg('min-skills', 2);
const MAX_SKILLS = getArg('max-skills', 4);
const JACCARD_THRESHOLD = getArg('threshold', 0.25);

// The skills index sits one directory above this script; synonyms.json sits beside it.
const BASE = path.resolve(__dirname, '..');
const skillsPath = path.join(BASE, 'skills-index-lite.json');
const synonymsPath = path.join(__dirname, 'synonyms.json');

// Load both JSON inputs (skills index first, then synonyms).
const [skillsData, synonymsData] = [skillsPath, synonymsPath].map((file) =>
  JSON.parse(fs.readFileSync(file, 'utf8')),
);
// Lower-cased set of every word that already belongs to a synonym group.
const existingWords = new Set();
for (const group of synonymsData.groups) {
  for (const word of group.words) {
    existingWords.add(word.toLowerCase());
  }
}
// Noise keywords to filter out (too short, pure symbols, generic words).
// The list is split into categories and merged into one Set in the same order.
const GENERIC_TECH_TERMS_ZH = [
  '测试', '架构', '性能', '配置', '日志', '组件', '认证', '安全',
];
const GENERIC_TECH_TERMS_EN = [
  'test', 'architecture', 'performance', 'config', 'log', 'component',
  'auth', 'security', 'deploy', 'monitoring', 'database', 'template',
  'interface', 'routing', 'cache', 'microservice', 'cluster',
  'container', 'frontend', 'backend', 'api', 'queue',
];
// Too short to be meaningful.
const SHORT_ABBREVIATIONS = ['pt', 'ts', 'ui', 'ux', 'ml', 'dl', 'db', 'ci', 'cd'];
// Generic Chinese words.
const GENERIC_WORDS_ZH = [
  '使用', '语言', '应用', '描述', '示例', '当用户需要', '专家', '技术栈',
  '流程', '策略', '场景', '优化', '设计', '管理', '系统', '工具',
];
const NOISE = new Set([
  ...GENERIC_TECH_TERMS_ZH,
  ...GENERIC_TECH_TERMS_EN,
  ...SHORT_ABBREVIATIONS,
  ...GENERIC_WORDS_ZH,
]);
// Step 1: build the keyword -> skill-names map and the skill-name -> keywords map.
const kwToSkills = new Map();
const skillToKws = new Map();

// Substrings that mark a keyword as descriptive text rather than a real term.
const descriptiveMarkers = ['当用户', '专家', '推荐', '适用'];

// A keyword is kept when its length is between 2 and 20, it is not a noise word,
// and it contains none of the descriptive markers.
const isMinableKeyword = (kw) =>
  kw.length >= 2 &&
  kw.length <= 20 &&
  !NOISE.has(kw) &&
  !descriptiveMarkers.some((marker) => kw.includes(marker));

for (const skill of skillsData.skills) {
  const kws = new Set();
  for (const entry of skill.keywords) {
    // Keywords are compared lower-cased and trimmed.
    const kw = entry.keyword.toLowerCase().trim();
    if (!isMinableKeyword(kw)) continue;
    kws.add(kw);
    let owners = kwToSkills.get(kw);
    if (!owners) {
      owners = new Set();
      kwToSkills.set(kw, owners);
    }
    owners.add(skill.name);
  }
  skillToKws.set(skill.name, kws);
}
// Step 2: keep keywords that occur in MIN_SKILLS..MAX_SKILLS skills (inclusive)
// and are not already listed in synonyms.json.
const candidates = [...kwToSkills]
  .filter(([kw, skills]) => {
    const withinRange = skills.size >= MIN_SKILLS && skills.size <= MAX_SKILLS;
    return withinRange && !existingWords.has(kw);
  })
  .map(([kw, skills]) => ({ keyword: kw, skills: [...skills], count: skills.size }));
// Report header: overall counts, followed by a blank line.
const reportHeaderLines = [
  `=== Synonym Miner Report ===`,
  `技能总数: ${skillsData.skills.length}`,
  `关键词总数: ${kwToSkills.size}`,
  `出现在 ${MIN_SKILLS}-${MAX_SKILLS} 个技能中的候选词: ${candidates.length}`,
  `已有同义词覆盖的词数: ${existingWords.size}`,
];
for (const line of reportHeaderLines) {
  console.log(line);
}
console.log();
// Step 3: greedily cluster candidates by the Jaccard similarity of their skill sets.
//
// Candidates are visited from most to least widely used. Each candidate not yet
// assigned to a group becomes a seed; every later unassigned candidate whose skill
// set has Jaccard similarity >= JACCARD_THRESHOLD with the seed joins the seed's
// group. Members are compared with the seed only, not with each other.
const groups = [];
const used = new Set();

// Sorted in place on purpose: the later report of unclustered candidates prints the
// first 30 entries of `candidates` in this order.
candidates.sort((a, b) => b.count - a.count);

// Build each candidate's skill Set once instead of once per pairwise comparison.
const candidateSkillSets = candidates.map((c) => new Set(c.skills));

for (let i = 0; i < candidates.length; i++) {
  const seed = candidates[i];
  if (used.has(seed.keyword)) continue;

  const seedSkills = candidateSkillSets[i];
  const members = [seed];
  let jaccardSum = 0;

  for (let j = i + 1; j < candidates.length; j++) {
    const other = candidates[j];
    if (used.has(other.keyword)) continue;

    const otherSkills = candidateSkillSets[j];
    let intersection = 0;
    for (const skillName of seedSkills) {
      if (otherSkills.has(skillName)) intersection++;
    }
    // |A ∪ B| = |A| + |B| - |A ∩ B|; never zero because every keyword has at least one skill.
    const union = seedSkills.size + otherSkills.size - intersection;
    const jaccard = intersection / union;

    if (jaccard >= JACCARD_THRESHOLD) {
      members.push(other);
      jaccardSum += jaccard;
    }
  }

  // Keep only groups with at least 2 words.
  if (members.length >= 2) {
    members.forEach((m) => used.add(m.keyword));
    groups.push({
      words: members.map((m) => m.keyword),
      // Despite the name, this is the union of all members' skills.
      sharedSkills: [...new Set(members.flatMap((m) => m.skills))],
      // Mean Jaccard similarity between the seed and each word that joined it
      // (previously a placeholder string).
      avgJaccard: jaccardSum / (members.length - 1),
    });
  }
}
// Step 4: print every suggested synonym group with the skills it draws on.
console.log(`--- 建议新增同义词组 (${groups.length} 组) ---\n`);
for (const [idx, { words, sharedSkills }] of groups.entries()) {
  console.log(`[组 ${idx + 1}] 词汇: ${words.join(', ')}`);
  console.log(` 关联技能: ${sharedSkills.join(', ')}`);
  console.log();
}
// Step 5: list candidates that did not end up in any group (first 30 only).
const unclustered = candidates.filter(({ keyword }) => !used.has(keyword));
if (unclustered.length !== 0) {
  console.log(`--- 未聚类的候选词 (前 30) ---\n`);
  for (const { keyword, count, skills } of unclustered.slice(0, 30)) {
    console.log(` ${keyword} (${count} 技能): ${skills.join(', ')}`);
  }
}
// Step 6: coverage statistics — how many distinct keywords are covered by the
// existing synonyms, and how many would be covered once the suggested groups are added.
const allKws = [...kwToSkills.keys()];
const coveredBySynonyms = allKws.filter((kw) => existingWords.has(kw)).length;
const coveredAfterMining = allKws.filter((kw) => existingWords.has(kw) || used.has(kw)).length;

// Percentage of all keywords, one decimal place. With no keywords at all, report 0.0
// instead of letting 0 / 0 print as "NaN".
const coveragePercent = (covered) =>
  (allKws.length === 0 ? 0 : (covered / allKws.length) * 100).toFixed(1);

console.log(`\n--- 覆盖率统计 ---`);
console.log(`当前同义词覆盖率: ${coveragePercent(coveredBySynonyms)}% (${coveredBySynonyms}/${allKws.length})`);
console.log(`加入建议后覆盖率: ${coveragePercent(coveredAfterMining)}% (${coveredAfterMining}/${allKws.length})`);
console.log(`提升: +${(coveredAfterMining - coveredBySynonyms)} 个关键词`);