121 lines
3.9 KiB
JavaScript
121 lines
3.9 KiB
JavaScript
|
|
#!/usr/bin/env node
|
|||
|
|
/**
|
|||
|
|
* TF-IDF 关键词加权引擎 (v4.9)
|
|||
|
|
*
|
|||
|
|
* 为 skills-index.json 中的关键词计算 TF-IDF 权重,
|
|||
|
|
* 区分高区分度关键词 (如 "pytorch") 和低区分度关键词 (如 "部署")。
|
|||
|
|
*
|
|||
|
|
* 核心函数:
|
|||
|
|
* buildCorpus(skills) → 倒排索引 {keyword → Set<skillName>}
|
|||
|
|
* computeIDF(df, N) → 平滑 IDF 值
|
|||
|
|
* applyTFIDFWeights(index) → 为每个关键词附加 tfidfWeight 字段
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
const fs = require('fs');
|
|||
|
|
const path = require('path');
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 构建倒排索引: 每个关键词出现在哪些技能中
|
|||
|
|
* @param {Array} skills - skills-index.json 中的 skills 数组
|
|||
|
|
* @returns {Map<string, Set<string>>} 关键词 → 技能名集合
|
|||
|
|
*/
|
|||
|
|
function buildCorpus(skills) {
|
|||
|
|
const corpus = new Map();
|
|||
|
|
for (const skill of skills) {
|
|||
|
|
for (const { keyword } of (skill.keywords || [])) {
|
|||
|
|
const kw = keyword.toLowerCase();
|
|||
|
|
if (!corpus.has(kw)) corpus.set(kw, new Set());
|
|||
|
|
corpus.get(kw).add(skill.name);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return corpus;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 计算平滑 IDF 值
|
|||
|
|
* 公式: log((N-df+0.5)/(df+0.5) + 1) — BM25 Robertson-Walker IDF
|
|||
|
|
* @param {number} df - 文档频率 (出现在多少个技能中)
|
|||
|
|
* @param {number} N - 技能总数
|
|||
|
|
* @returns {number} IDF 值
|
|||
|
|
*/
|
|||
|
|
function computeIDF(df, N) {
|
|||
|
|
// P1-FIX: 统一为 BM25 Robertson-Walker IDF (与 route-analyzer.js 一致)
|
|||
|
|
return Math.log((N - df + 0.5) / (df + 0.5) + 1);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* 为索引中的每个关键词附加 tfidfWeight 字段
|
|||
|
|
* tfidfWeight = 原始 weight * IDF
|
|||
|
|
* @param {Object} index - skills-index.json 完整对象
|
|||
|
|
*/
|
|||
|
|
function applyTFIDFWeights(index) {
|
|||
|
|
const skills = index.skills || [];
|
|||
|
|
const N = skills.length;
|
|||
|
|
const corpus = buildCorpus(skills);
|
|||
|
|
|
|||
|
|
for (const skill of skills) {
|
|||
|
|
for (const kwEntry of (skill.keywords || [])) {
|
|||
|
|
const kw = kwEntry.keyword.toLowerCase();
|
|||
|
|
const df = corpus.has(kw) ? corpus.get(kw).size : 0;
|
|||
|
|
const idf = computeIDF(df, N);
|
|||
|
|
// TF 简化为 1 (布尔频率: 关键词在技能中出现即为 1)
|
|||
|
|
kwEntry.tfidfWeight = Math.round(kwEntry.weight * idf * 100) / 100;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 模块导出
|
|||
|
|
if (typeof module !== 'undefined') {
|
|||
|
|
module.exports = { buildCorpus, computeIDF, applyTFIDFWeights };
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CLI 入口: 可独立运行查看统计
|
|||
|
|
if (require.main === module) {
|
|||
|
|
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
|
|||
|
|
|
|||
|
|
const ROOT = detectClaudeRoot();
|
|||
|
|
const indexFile = path.join(ROOT, 'skills-index.json');
|
|||
|
|
|
|||
|
|
if (!fs.existsSync(indexFile)) {
|
|||
|
|
console.error('skills-index.json 不存在,请先运行 generate-skill-index.js');
|
|||
|
|
process.exit(1);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
const index = JSON.parse(fs.readFileSync(indexFile, 'utf8'));
|
|||
|
|
const corpus = buildCorpus(index.skills);
|
|||
|
|
const N = index.skills.length;
|
|||
|
|
|
|||
|
|
console.log(`=== TF-IDF 统计 ===`);
|
|||
|
|
console.log(`技能总数: ${N}`);
|
|||
|
|
console.log(`唯一关键词: ${corpus.size}`);
|
|||
|
|
console.log('');
|
|||
|
|
|
|||
|
|
// 高区分度关键词 (仅出现在 1-2 个技能中)
|
|||
|
|
const highIDF = [];
|
|||
|
|
for (const [kw, skills] of corpus) {
|
|||
|
|
if (skills.size <= 2) {
|
|||
|
|
highIDF.push({ keyword: kw, df: skills.size, idf: computeIDF(skills.size, N) });
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
highIDF.sort((a, b) => b.idf - a.idf);
|
|||
|
|
|
|||
|
|
console.log('高区分度关键词 (df<=2, Top 20):');
|
|||
|
|
for (const item of highIDF.slice(0, 20)) {
|
|||
|
|
console.log(` ${item.keyword.padEnd(25)} df=${item.df} idf=${item.idf.toFixed(2)}`);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 低区分度关键词 (出现在 >10 个技能中)
|
|||
|
|
const lowIDF = [];
|
|||
|
|
for (const [kw, skills] of corpus) {
|
|||
|
|
if (skills.size > 10) {
|
|||
|
|
lowIDF.push({ keyword: kw, df: skills.size, idf: computeIDF(skills.size, N) });
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
lowIDF.sort((a, b) => a.idf - b.idf);
|
|||
|
|
|
|||
|
|
console.log('\n低区分度关键词 (df>10):');
|
|||
|
|
for (const item of lowIDF) {
|
|||
|
|
console.log(` ${item.keyword.padEnd(25)} df=${item.df} idf=${item.idf.toFixed(2)}`);
|
|||
|
|
}
|
|||
|
|
}
|