#!/usr/bin/env node
/**
 * synonym-miner.js — 从 skills-index-lite.json 自动挖掘同义词候选组
 *
 * 算法:
 * 1. 加载所有技能的关键词
 * 2. 统计每个关键词出现在多少个技能中
 * 3. 出现在 2-4 个技能中的关键词为候选(太通用的排除,太独特的无同义价值)
 * 4. 对候选关键词,计算 Jaccard 相似度找到共现组
 * 5. 与现有 synonyms.json 对比,只输出尚未覆盖的新候选
 *
 * 用法: node synonym-miner.js [--min-skills 2] [--max-skills 4] [--threshold 0.25]
 */

const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
// Configuration: numeric options are read from the command line.
const args = process.argv.slice(2);

/**
 * Read a numeric command-line option of the form `--<name> <value>`.
 *
 * @param {string} name - Option name without the leading `--`.
 * @param {number} def - Value returned when the option is absent or unusable.
 * @returns {number} The parsed number, or `def` when the flag is missing, has
 *   no value, or its value does not parse as a number (for example when the
 *   next token is another flag such as `--max-skills`).
 */
const getArg = (name, def) => {
  const idx = args.indexOf(`--${name}`);
  if (idx < 0) return def;

  const raw = args[idx + 1];
  // A missing or empty value keeps the default.
  if (raw === undefined || raw === '') return def;

  const value = Number(raw);
  if (Number.isNaN(value)) {
    // NaN would make every later numeric comparison false, so the run would
    // silently report nothing; warn and fall back to the default instead.
    console.error(`Ignoring invalid value for --${name}: ${raw} (using default ${def})`);
    return def;
  }
  return value;
};

// Tunable limits; each one can be overridden with the matching command-line flag.
const MIN_SKILLS = getArg('min-skills', 2);
const MAX_SKILLS = getArg('max-skills', 4);
const JACCARD_THRESHOLD = getArg('threshold', 0.25);

// Input locations: the skills index sits one directory above this script,
// while synonyms.json sits next to the script itself.
const BASE = path.resolve(__dirname, '..');
const skillsPath = path.resolve(BASE, 'skills-index-lite.json');
const synonymsPath = path.resolve(__dirname, 'synonyms.json');

// Load both JSON inputs up front (skills index first, then the synonym list).
// A missing or malformed file throws here and aborts the run.
const [skillsData, synonymsData] = [skillsPath, synonymsPath].map((file) =>
  JSON.parse(fs.readFileSync(file, 'utf8')),
);

// Lower-cased vocabulary of every word that already belongs to a group in synonyms.json.
const existingWords = new Set(
  synonymsData.groups.flatMap((group) => group.words.map((word) => word.toLowerCase())),
);

// Keywords that are dropped before counting because they carry no synonym
// signal: generic technical terms, two-letter abbreviations and generic
// Chinese words.
const NOISE = new Set(
  [
    // Generic technical terms (Chinese)
    ['测试', '架构', '性能', '配置', '日志', '组件', '认证', '安全'],
    // Generic technical terms (English)
    ['test', 'architecture', 'performance', 'config', 'log', 'component'],
    ['auth', 'security', 'deploy', 'monitoring', 'database', 'template'],
    ['interface', 'routing', 'cache', 'microservice', 'cluster'],
    ['container', 'frontend', 'backend', 'api', 'queue'],
    // Too short to be meaningful
    ['pt', 'ts', 'ui', 'ux', 'ml', 'dl', 'db', 'ci', 'cd'],
    // Generic Chinese words
    ['使用', '语言', '应用', '描述', '示例', '当用户需要', '专家', '技术栈'],
    ['流程', '策略', '场景', '优化', '设计', '管理', '系统', '工具'],
  ].flat(),
);

// Step 1: build the keyword -> skill-names index, plus the reverse map from
// each skill name to the keywords that survived filtering.
const kwToSkills = new Map();
const skillToKws = new Map();

for (const skill of skillsData.skills) {
  const accepted = new Set();

  for (const entry of skill.keywords) {
    const kw = entry.keyword.toLowerCase().trim();

    // Drop keywords that are too short (< 2 chars), too long (> 20 chars,
    // i.e. descriptive text) or listed as noise.
    const hasUsableLength = kw.length >= 2 && kw.length <= 20;
    if (!hasUsableLength || NOISE.has(kw)) continue;

    // Drop keywords containing fragments typical of descriptive sentences.
    if (['当用户', '专家', '推荐', '适用'].some((marker) => kw.includes(marker))) continue;

    accepted.add(kw);

    let owners = kwToSkills.get(kw);
    if (owners === undefined) {
      owners = new Set();
      kwToSkills.set(kw, owners);
    }
    owners.add(skill.name);
  }

  skillToKws.set(skill.name, accepted);
}

// Step 2: keep the keywords used by MIN_SKILLS..MAX_SKILLS skills (inclusive)
// that are not already present in synonyms.json.
const candidates = [...kwToSkills]
  .filter(
    ([kw, skills]) =>
      skills.size >= MIN_SKILLS && skills.size <= MAX_SKILLS && !existingWords.has(kw),
  )
  .map(([kw, skills]) => ({ keyword: kw, skills: [...skills], count: skills.size }));

// Print the report summary; the trailing empty entry produces the blank
// separator line after the header.
console.log(
  [
    `=== Synonym Miner Report ===`,
    `技能总数: ${skillsData.skills.length}`,
    `关键词总数: ${kwToSkills.size}`,
    `出现在 ${MIN_SKILLS}-${MAX_SKILLS} 个技能中的候选词: ${candidates.length}`,
    `已有同义词覆盖的词数: ${existingWords.size}`,
    '',
  ].join('\n'),
);

// Step 3: greedily cluster candidates by the Jaccard similarity of their
// skill sets. Each not-yet-used candidate acts as a seed; every later unused
// candidate whose skill set is similar enough to the seed's joins its group.
// Similarity is measured against the seed only, not between other members.
const groups = [];
const used = new Set();

// Visit the most widely shared keywords first so they become the seeds.
// The sort is in place, so later code sees `candidates` in this order too.
candidates.sort((a, b) => b.count - a.count);

for (let i = 0; i < candidates.length; i++) {
  const seed = candidates[i];
  if (used.has(seed.keyword)) continue;

  const seedSkills = new Set(seed.skills);
  const members = [seed];
  // Similarity of each accepted member to the seed, kept for the group average.
  const similarities = [];

  for (let j = i + 1; j < candidates.length; j++) {
    const other = candidates[j];
    if (used.has(other.keyword)) continue;

    // Jaccard = |A ∩ B| / |A ∪ B|, with |A ∪ B| = |A| + |B| - |A ∩ B|.
    // Both skill lists were spread from Sets, so they hold no duplicates.
    const shared = other.skills.filter((s) => seedSkills.has(s)).length;
    const total = seedSkills.size + other.skills.length - shared;
    const jaccard = shared / total;

    if (jaccard >= JACCARD_THRESHOLD) {
      members.push(other);
      similarities.push(jaccard);
    }
  }

  // Only groups with at least two words are worth suggesting.
  if (members.length >= 2) {
    for (const member of members) used.add(member.keyword);
    groups.push({
      words: members.map((member) => member.keyword),
      // Union of the skills of all members (not only the skills they share).
      sharedSkills: [...new Set(members.flatMap((member) => member.skills))],
      // Mean similarity between the seed and every other member of the group.
      avgJaccard: similarities.reduce((sum, value) => sum + value, 0) / similarities.length,
    });
  }
}

// Step 4: print every suggested group with the skills its words appear in,
// followed by a blank line.
console.log(`--- 建议新增同义词组 (${groups.length} 组) ---\n`);

for (const [position, { words, sharedSkills }] of groups.entries()) {
  console.log(`[组 ${position + 1}] 词汇: ${words.join(', ')}`);
  console.log(` 关联技能: ${sharedSkills.join(', ')}`);
  console.log();
}

// Step 5: list the candidates that joined no group. `candidates` was sorted
// by skill count (descending) before clustering, so the first 30 entries are
// the most widely shared ones.
const unclustered = candidates.filter(({ keyword }) => !used.has(keyword));

if (unclustered.length > 0) {
  console.log(`--- 未聚类的候选词 (前 30) ---\n`);
  for (const { keyword, count, skills } of unclustered.slice(0, 30)) {
    console.log(` ${keyword} (${count} 技能): ${skills.join(', ')}`);
  }
}

// Step 6: coverage statistics — how many of the indexed keywords are covered
// by synonym groups today, and how many would be after adopting the
// suggested groups.
const allKws = [...kwToSkills.keys()];
const coveredBySynonyms = allKws.filter((kw) => existingWords.has(kw)).length;
const coveredAfterMining = allKws.filter((kw) => existingWords.has(kw) || used.has(kw)).length;

// Format covered/total as a percentage with one decimal. When no keyword
// survived filtering the ratio is undefined, so report 0.0 instead of NaN.
const toPercent = (covered) =>
  (allKws.length === 0 ? 0 : (covered / allKws.length) * 100).toFixed(1);

console.log(`\n--- 覆盖率统计 ---`);
console.log(`当前同义词覆盖率: ${toPercent(coveredBySynonyms)}% (${coveredBySynonyms}/${allKws.length})`);
console.log(`加入建议后覆盖率: ${toPercent(coveredAfterMining)}% (${coveredAfterMining}/${allKws.length})`);
console.log(`提升: +${coveredAfterMining - coveredBySynonyms} 个关键词`);