bookworm-smart-assistant/scripts/semantic-scorer.js

261 lines
7.3 KiB
JavaScript

#!/usr/bin/env node
/**
* 语义评分引擎 (v5.9)
*
* 基于 TF-IDF 向量余弦相似度的轻量级语义匹配。
* 无需外部 embedding 模型,纯 JS 实现。
*
* 原理:
* 1. 为每个技能构建 TF-IDF 向量 (基于 keywords + description)
* 2. 用户查询也生成 TF-IDF 向量
* 3. 余弦相似度作为语义分数
*
* 模块导出:
* buildVectors(index) → { skillVectors, idfMap, vocabulary }
* queryVector(tokens, idfMap, vocabulary) → Float64Array
* cosineSimilarity(vecA, vecB) → number (0~1)
* semanticScore(query, index, cache) → { name, score }[]
*/
const path = require('path');
const fs = require('fs');
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
/**
* 简易分词 (与 route-analyzer.js tokenize 保持一致)
*/
function tokenize(text) {
const tokens = [];
const cnChars = (text || '').match(/[\u4e00-\u9fff]+/g) || [];
for (const chunk of cnChars) {
for (let len = 2; len <= Math.min(4, chunk.length); len++) {
for (let i = 0; i <= chunk.length - len; i++) {
tokens.push(chunk.slice(i, i + len).toLowerCase());
}
}
}
const enWords = (text || '').match(/[A-Za-z][\w.-]*/g) || [];
for (const w of enWords) {
tokens.push(w.toLowerCase());
}
return tokens;
}
/**
* P1-3: 安全求 Map/迭代器最大值 (避免 Math.max(...) 在大集合时栈溢出)
* @param {Iterable<number>} iterable - 可迭代数值集合
* @param {number} defaultVal - 无元素时返回的默认值
* @returns {number}
*/
function iterableMax(iterable, defaultVal) {
let max = defaultVal;
for (const v of iterable) {
if (v > max) max = v;
}
return max;
}
/**
* 构建 TF-IDF 向量空间
* @param {Object} index - skills-index.json
* @returns {{ skillVectors: Map, idfMap: Map, vocabulary: string[], vocabIndex: Map }}
*/
function buildVectors(index) {
const skills = index.skills || [];
const N = skills.length;
// Phase 1: 收集所有文档的词频
const docTokens = []; // Array<Map<token, count>>
const df = new Map(); // document frequency
for (const skill of skills) {
const text = [
skill.name,
skill.description || '',
...(skill.keywords || []).map(k => k.keyword),
].join(' ');
const tokens = tokenize(text);
const tf = new Map();
for (const t of tokens) {
tf.set(t, (tf.get(t) || 0) + 1);
}
docTokens.push(tf);
// 统计 document frequency
for (const t of tf.keys()) {
df.set(t, (df.get(t) || 0) + 1);
}
}
// Phase 2: 构建词汇表 (只保留出现在 2+ 文档中但不超过 80% 文档的词)
const vocabulary = [];
const vocabIndex = new Map();
for (const [token, count] of df.entries()) {
if (count >= 2 && count <= N * 0.8) {
vocabIndex.set(token, vocabulary.length);
vocabulary.push(token);
}
}
// Phase 3: IDF 计算
const idfMap = new Map();
for (const token of vocabulary) {
const docFreq = df.get(token) || 0;
idfMap.set(token, Math.log((N + 1) / (docFreq + 1)) + 1);
}
// Phase 4: 为每个技能构建 TF-IDF 向量
const dim = vocabulary.length;
const skillVectors = new Map();
for (let i = 0; i < skills.length; i++) {
const vec = new Float64Array(dim);
const tf = docTokens[i];
// P1-3 修复: 用循环求最大值,避免 Math.max(...tf.values()) 在大 Map 时栈溢出
const maxTF = iterableMax(tf.values(), 1);
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
// 归一化 TF * IDF
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
skillVectors.set(skills[i].name, vec);
}
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
}
/**
* 将查询文本转为 TF-IDF 向量
* @param {string[]} tokens - 查询分词结果
* @param {Map} idfMap - IDF 表
* @param {Map} vocabIndex - 词汇→索引映射
* @param {number} dim - 向量维度
* @returns {Float64Array}
*/
function queryVector(tokens, idfMap, vocabIndex, dim) {
const vec = new Float64Array(dim);
const tf = new Map();
for (const t of tokens) tf.set(t, (tf.get(t) || 0) + 1);
// P1-3 修复: 用循环求最大值,避免 Math.max(...tf.values()) 在大 Map 时栈溢出
const maxTF = iterableMax(tf.values(), 1);
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
return vec;
}
/**
* 余弦相似度
*/
function cosineSimilarity(vecA, vecB) {
if (vecA.length !== vecB.length) return 0;
let dot = 0;
for (let i = 0; i < vecA.length; i++) dot += vecA[i] * vecB[i];
return Math.max(0, Math.min(1, dot)); // L2 归一化后 dot = cosine
}
// 缓存向量空间 (V05 修复: 添加 mtime 检查防止缓存过期)
let _cache = null;
let _cacheMtime = 0;
function _getIndexMtime() {
try {
const ROOT = typeof detectClaudeRoot === 'function' ? detectClaudeRoot() : '';
const indexFile = path.join(ROOT, 'skills-index.json');
return require('fs').statSync(indexFile).mtimeMs;
} catch { return 0; }
}
/**
* 语义评分
* @param {string} query - 用户查询
* @param {Object} index - skills-index.json
* @returns {{ name: string, score: number }[]}
*/
function semanticScore(query, index) {
const currentMtime = _getIndexMtime();
if (!_cache || (currentMtime > 0 && currentMtime !== _cacheMtime)) {
_cache = buildVectors(index);
_cacheMtime = currentMtime;
}
const tokens = tokenize(query);
const qVec = queryVector(tokens, _cache.idfMap, _cache.vocabIndex, _cache.dim);
const scores = [];
for (const [name, sVec] of _cache.skillVectors.entries()) {
const sim = cosineSimilarity(qVec, sVec);
if (sim > 0.01) {
scores.push({ name, score: Math.round(sim * 100) / 100 });
}
}
return scores.sort((a, b) => b.score - a.score);
}
function clearCache() {
_cache = null;
}
// 模块导出
if (typeof module !== 'undefined') {
module.exports = {
buildVectors, queryVector, cosineSimilarity, semanticScore,
tokenize, clearCache,
};
}
// CLI 入口
if (require.main === module) {
const ROOT = detectClaudeRoot();
const indexFile = path.join(ROOT, 'skills-index.json');
try {
const index = JSON.parse(fs.readFileSync(indexFile, 'utf8'));
const vectors = buildVectors(index);
console.log(`向量空间: ${vectors.vocabulary.length} 维, ${vectors.skillVectors.size} 技能`);
const testQueries = [
'帮我用 React 写一个表单组件',
'PyTorch 训练图像分类模型',
'Docker 部署到生产环境',
'写一份 BP 商业计划书',
'数据库查询太慢了怎么办',
];
for (const q of testQueries) {
const results = semanticScore(q, index);
const top3 = results.slice(0, 3).map(r => `${r.name}(${r.score})`).join(', ');
console.log(`\n"${q}"\n${top3}`);
}
} catch (e) {
console.error('Error:', e.message);
}
}