bookworm-smart-assistant/scripts/synonym-miner.js

161 lines
5.8 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
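/**
* synonym-miner.js — automatically mines candidate synonym groups from skills-index-lite.json
*
* Algorithm:
* 1. Load the keywords of every skill
* 2. Count how many skills each keyword appears in
* 3. Keywords that appear in 2-4 skills are candidates (overly generic ones are excluded; overly unique ones have no synonym value)
* 4. For the candidate keywords, compute Jaccard similarity to find co-occurrence groups
* 5. Compare against the existing synonyms.json and output only new candidates that are not yet covered
*
* Usage: node synonym-miner.js [--min-skills 2] [--max-skills 4] [--threshold 0.25]
*/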
const fs = require('fs');
const path = require('path');
// Configuration: numeric command-line options, each with a default.
const args = process.argv.slice(2);

/**
 * Reads the numeric value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} def - Value returned when the option is absent, has no
 *   value after it, or its value does not parse as a number.
 * @returns {number} The parsed option value, or `def`.
 */
const getArg = (name, def) => {
  const idx = args.indexOf(`--${name}`);
  if (idx < 0) return def;

  const raw = args[idx + 1];
  if (raw === undefined || raw === '') return def;

  const value = Number(raw);
  if (Number.isNaN(value)) {
    // NaN makes every later `>=` / `<=` comparison false, which would silently
    // produce an empty report (e.g. `--threshold --min-skills 3`), so fall
    // back to the default and tell the user instead.
    console.warn(`Ignoring non-numeric value "${raw}" for --${name}; using ${def}`);
    return def;
  }
  return value;
};

const MIN_SKILLS = getArg('min-skills', 2);
const MAX_SKILLS = getArg('max-skills', 4);
const JACCARD_THRESHOLD = getArg('threshold', 0.25);
// Input locations: the skills index sits one directory above this script,
// while synonyms.json sits next to it.
const BASE = path.resolve(__dirname, '..');
const skillsPath = path.join(BASE, 'skills-index-lite.json');
const synonymsPath = path.join(__dirname, 'synonyms.json');

// Load both JSON documents synchronously (skills index first, then synonyms).
const [skillsData, synonymsData] = [skillsPath, synonymsPath].map(
  (file) => JSON.parse(fs.readFileSync(file, 'utf8')),
);
// Lower-cased set of every word that already appears in some group of
// synonyms.json; used later to skip keywords that are already covered.
const existingWords = new Set(
  synonymsData.groups.flatMap(({ words }) => words.map((word) => word.toLowerCase())),
);
// Keywords that are never considered as synonym candidates: overly generic
// terms, very short abbreviations, and common filler words. The categories
// below are flattened into a single lookup set, in declaration order.
const NOISE = new Set(
  Object.values({
    // Generic Chinese engineering terms
    genericChinese: ['测试', '架构', '性能', '配置', '日志', '组件', '认证', '安全'],
    // Generic English engineering terms
    genericEnglish: [
      'test', 'architecture', 'performance', 'config', 'log', 'component',
      'auth', 'security', 'deploy', 'monitoring', 'database', 'template',
      'interface', 'routing', 'cache', 'microservice', 'cluster',
      'container', 'frontend', 'backend', 'api', 'queue',
    ],
    // Too short to be meaningful
    tooShort: ['pt', 'ts', 'ui', 'ux', 'ml', 'dl', 'db', 'ci', 'cd'],
    // Common Chinese filler words
    commonChinese: [
      '使用', '语言', '应用', '描述', '示例', '当用户需要', '专家', '技术栈',
      '流程', '策略', '场景', '优化', '设计', '管理', '系统', '工具',
    ],
  }).flat(),
);
// Step 1: build the keyword -> skills and skill -> keywords mappings.
const kwToSkills = new Map();
const skillToKws = new Map();
// Substrings that mark a "keyword" as descriptive text rather than a term.
const descriptiveMarkers = ['当用户', '专家', '推荐', '适用'];

for (const skill of skillsData.skills) {
  const accepted = new Set();

  for (const entry of skill.keywords) {
    const kw = entry.keyword.toLowerCase().trim();

    // Skip keywords that are too short (< 2 chars), too long (> 20 chars,
    // i.e. descriptive), or listed as noise.
    const badLength = kw.length < 2 || kw.length > 20;
    if (badLength || NOISE.has(kw)) continue;

    // Skip keywords that contain obviously descriptive text.
    if (descriptiveMarkers.some((marker) => kw.includes(marker))) continue;

    accepted.add(kw);

    let owners = kwToSkills.get(kw);
    if (owners === undefined) {
      owners = new Set();
      kwToSkills.set(kw, owners);
    }
    owners.add(skill.name);
  }

  skillToKws.set(skill.name, accepted);
}
// Step 2: keep keywords that appear in MIN_SKILLS..MAX_SKILLS skills
// (inclusive) and are not already present in synonyms.json.
const candidates = [...kwToSkills]
  .filter(([kw, owners]) => {
    const inRange = owners.size >= MIN_SKILLS && owners.size <= MAX_SKILLS;
    return inRange && !existingWords.has(kw);
  })
  .map(([kw, owners]) => ({
    keyword: kw,
    skills: [...owners],
    count: owners.size,
  }));
// Report header: overall counts, followed by one blank line.
const summaryLines = [
  `=== Synonym Miner Report ===`,
  `技能总数: ${skillsData.skills.length}`,
  `关键词总数: ${kwToSkills.size}`,
  `出现在 ${MIN_SKILLS}-${MAX_SKILLS} 个技能中的候选词: ${candidates.length}`,
  `已有同义词覆盖的词数: ${existingWords.size}`,
  '',
];
for (const line of summaryLines) {
  console.log(line);
}
// Step 3: greedily cluster candidates by the Jaccard similarity of the sets
// of skills they appear in.
//
// Candidates are visited from most to fewest skills. Each candidate that is
// not yet in a group becomes a seed; every later ungrouped candidate whose
// skill set is similar enough to the SEED (not to the other members) joins
// the seed's group. Groups with a single word are discarded.
const groups = [];
const used = new Set();

/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.
 *
 * @param {Set<string>} setA
 * @param {Set<string>} setB
 * @returns {number} Similarity in [0, 1]; NaN if both sets are empty.
 */
function jaccardSimilarity(setA, setB) {
  let intersection = 0;
  for (const item of setA) {
    if (setB.has(item)) intersection += 1;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const union = setA.size + setB.size - intersection;
  return intersection / union;
}

// NOTE: sorts `candidates` in place; later steps see this order.
candidates.sort((a, b) => b.count - a.count);

// Build each candidate's skill set once instead of once per compared pair.
const skillSets = candidates.map((c) => new Set(c.skills));

for (let i = 0; i < candidates.length; i++) {
  if (used.has(candidates[i].keyword)) continue;

  const members = [candidates[i]];
  const scores = [];

  for (let j = i + 1; j < candidates.length; j++) {
    if (used.has(candidates[j].keyword)) continue;

    const score = jaccardSimilarity(skillSets[i], skillSets[j]);
    if (score >= JACCARD_THRESHOLD) {
      members.push(candidates[j]);
      scores.push(score);
    }
  }

  // Keep only groups with at least two words.
  if (members.length >= 2) {
    members.forEach((m) => used.add(m.keyword));
    groups.push({
      words: members.map((m) => m.keyword),
      // Despite the name, this is the UNION of the members' skills.
      sharedSkills: [...new Set(members.flatMap((m) => m.skills))],
      // Mean similarity between the seed and each joined member
      // (previously the placeholder string 'computed').
      avgJaccard: scores.reduce((sum, s) => sum + s, 0) / scores.length,
    });
  }
}
// Step 4: print every suggested synonym group, numbered from 1, each
// followed by a blank line.
console.log(`--- 建议新增同义词组 (${groups.length} 组) ---\n`);
for (const [idx, { words, sharedSkills }] of groups.entries()) {
  console.log(`[组 ${idx + 1}] 词汇: ${words.join(', ')}`);
  console.log(` 关联技能: ${sharedSkills.join(', ')}`);
  console.log();
}
// Step 5: list candidates that did not end up in any group. Only the first
// 30 are printed, in the current order of `candidates`.
const unclustered = candidates.filter(({ keyword }) => !used.has(keyword));
if (unclustered.length > 0) {
  console.log(`--- 未聚类的候选词 (前 30) ---\n`);
  for (const { keyword, count, skills } of unclustered.slice(0, 30)) {
    console.log(` ${keyword} (${count} 技能): ${skills.join(', ')}`);
  }
}
// Step 6: coverage statistics — how many retained keywords are covered by
// synonyms.json today, and how many would be covered if every suggested
// group were added.
const allKws = [...kwToSkills.keys()];
const coveredBySynonyms = allKws.filter((kw) => existingWords.has(kw)).length;
const coveredAfterMining = allKws.filter((kw) => existingWords.has(kw) || used.has(kw)).length;

/**
 * Formats a coverage count as "P% (covered/total)" with one decimal place.
 * Reports 0.0% when there are no keywords at all, instead of dividing by
 * zero and printing "NaN%".
 *
 * @param {number} covered - Number of covered keywords.
 * @returns {string} Formatted coverage figure.
 */
const formatCoverage = (covered) => {
  const total = allKws.length;
  const pct = total === 0 ? 0 : covered / total * 100;
  return `${pct.toFixed(1)}% (${covered}/${total})`;
};

console.log(`\n--- 覆盖率统计 ---`);
console.log(`当前同义词覆盖率: ${formatCoverage(coveredBySynonyms)}`);
console.log(`加入建议后覆盖率: ${formatCoverage(coveredAfterMining)}`);
console.log(`提升: +${(coveredAfterMining - coveredBySynonyms)} 个关键词`);