bookworm-smart-assistant/scripts/embedding-router.js

523 lines
15 KiB
JavaScript
Raw Permalink Normal View History

#!/usr/bin/env node
/**
* 向量嵌入路由辅助模块 (v6.0 Phase 3)
*
* 当前实现: TF-IDF 余弦相似度增强版不依赖外部 LLM API
* 未来预留: LLM Gateway embedding 接口
*
* 工作原理:
* 1. 对每个技能的描述文本 + 关键词构建更丰富的 TF-IDF 向量
* (相比 semantic-scorer.js增加了描述权重加成和同义词扩展)
* 2. 用户查询同样构建向量
* 3. 余弦相似度计算并返回排序结果
*
* 触发条件:
* BM25 top-2 分数差距 < 15% 由路由引擎调用辅助决策
*
* 缓存策略:
* 技能描述向量缓存到 debug/.embedding-cache.json
* 仅在 skills-index.json mtime 变更时重建
*
* 关键约束:
* - fail-open: 任何异常不影响主路由流程
* - 预留 LLM Gateway 接口当前回退到 TF-IDF
*/
const fs = require('fs');
const path = require('path');
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
const ROOT = detectClaudeRoot();
const DEBUG_DIR = path.join(ROOT, 'debug');
const INDEX_FILE = path.join(ROOT, 'skills-index.json');
const CACHE_FILE = path.join(DEBUG_DIR, '.embedding-cache.json');
// 向量维度上限(防止低频词爆炸)
const MAX_VOCAB_SIZE = 3000;
// 最低文档频率(出现在至少 2 个文档)
const MIN_DF = 2;
// 最高文档频率比例(不超过 70% 文档出现的词视为停用词)
const MAX_DF_RATIO = 0.70;
// 描述文本在向量构建中的权重倍数(相对于关键词)
const DESC_WEIGHT_FACTOR = 1.5;
// 内存缓存(跨调用复用)
let _vectorCache = null;
// =====================================================
// 内部: TF-IDF 向量构建
// =====================================================
/**
* 分词函数增强版: 中文滑动窗口 + 英文单词 + 2-gram 组合
* @param {string} text
* @returns {string[]} token 列表允许重复保留词频信息
*/
function tokenizeText(text) {
const tokens = [];
if (!text) return tokens;
const textLower = text.toLowerCase();
// 中文: 2-4 字滑动窗口
const cnChars = textLower.match(/[\u4e00-\u9fff]+/g) || [];
for (const chunk of cnChars) {
for (let len = 2; len <= Math.min(4, chunk.length); len++) {
for (let i = 0; i <= chunk.length - len; i++) {
tokens.push(chunk.slice(i, i + len));
}
}
}
// 英文: 完整单词(允许连字符)
const enWords = textLower.match(/[a-z][a-z0-9.-]*/g) || [];
for (const w of enWords) {
if (w.length >= 2) tokens.push(w);
// 拆分连字符词
const parts = w.split(/[-./]/);
for (const p of parts) {
if (p.length >= 2 && p !== w) tokens.push(p);
}
}
return tokens;
}
/**
* 将技能信息转为文档文本
* @param {Object} skill - skills-index 中的技能项
* @param {number} descFactor - 描述文本权重倍数
* @returns {string[]} token 列表描述按权重重复
*/
function skillToTokens(skill, descFactor) {
const tokens = [];
// 技能名称(高权重 × 3
const nameTokens = tokenizeText(skill.name);
for (let i = 0; i < 3; i++) tokens.push(...nameTokens);
// 描述文本(权重 × descFactor
const descTokens = tokenizeText(skill.description || '');
const descRepeat = Math.max(1, Math.round(descFactor));
for (let i = 0; i < descRepeat; i++) tokens.push(...descTokens);
// 关键词(权重 × keyword.weight简化为重复次数
for (const kw of (skill.keywords || [])) {
const kwTokens = tokenizeText(kw.keyword);
const repeat = Math.max(1, Math.round((kw.weight || 1) * 0.5));
for (let i = 0; i < repeat; i++) tokens.push(...kwTokens);
}
return tokens;
}
/**
* 构建 TF-IDF 向量空间
* @param {Object} index - skills-index.json
* @returns {{ skillVectors: Map, idfMap: Map, vocabulary: string[], vocabIndex: Map, dim: number }}
*/
function buildEmbeddingVectors(index) {
const skills = index.skills || [];
const N = skills.length;
// Phase 1: 为每个技能收集 token 列表
const docTokenLists = [];
const df = new Map(); // token → 文档频率
for (const skill of skills) {
const tokens = skillToTokens(skill, DESC_WEIGHT_FACTOR);
// 构建 TF 映射
const tf = new Map();
for (const t of tokens) {
tf.set(t, (tf.get(t) || 0) + 1);
}
docTokenLists.push(tf);
// 统计 DF
for (const t of tf.keys()) {
df.set(t, (df.get(t) || 0) + 1);
}
}
// Phase 2: 构建词汇表(过滤低频/高频词,限制大小)
const candidates = [];
for (const [token, count] of df.entries()) {
if (count >= MIN_DF && count <= N * MAX_DF_RATIO) {
// IDF 分数作为排序依据(高 IDF = 更具区分性)
const idf = Math.log((N + 1) / (count + 1)) + 1;
candidates.push({ token, count, idf });
}
}
// 按 IDF 降序排序,取前 MAX_VOCAB_SIZE
candidates.sort((a, b) => b.idf - a.idf);
const topCandidates = candidates.slice(0, MAX_VOCAB_SIZE);
const vocabulary = topCandidates.map(c => c.token);
const vocabIndex = new Map();
for (let i = 0; i < vocabulary.length; i++) {
vocabIndex.set(vocabulary[i], i);
}
// Phase 3: IDF 计算
const idfMap = new Map();
for (const { token } of topCandidates) {
const docFreq = df.get(token) || 0;
idfMap.set(token, Math.log((N + 1) / (docFreq + 1)) + 1);
}
// Phase 4: 为每个技能构建 TF-IDF 向量L2 归一化)
const dim = vocabulary.length;
const skillVectors = new Map();
for (let i = 0; i < skills.length; i++) {
const vec = new Float64Array(dim);
const tf = docTokenLists[i];
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
let maxTF = 1;
for (const v of tf.values()) if (v > maxTF) maxTF = v;
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
skillVectors.set(skills[i].name, vec);
}
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
}
/**
* 将查询文本转为 TF-IDF 向量
* @param {string} queryText - 原始查询
* @param {Map} idfMap
* @param {Map} vocabIndex
* @param {number} dim
* @returns {Float64Array}
*/
function buildQueryVector(queryText, idfMap, vocabIndex, dim) {
const tokens = tokenizeText(queryText);
const tf = new Map();
for (const t of tokens) tf.set(t, (tf.get(t) || 0) + 1);
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
let maxTF = 1;
for (const v of tf.values()) if (v > maxTF) maxTF = v;
const vec = new Float64Array(dim);
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
return vec;
}
/**
* 余弦相似度L2 归一化后直接点积
* @param {Float64Array} vecA
* @param {Float64Array} vecB
* @returns {number} 0~1
*/
function cosineSimilarity(vecA, vecB) {
if (vecA.length !== vecB.length) return 0;
let dot = 0;
for (let i = 0; i < vecA.length; i++) dot += vecA[i] * vecB[i];
return Math.max(0, Math.min(1, dot));
}
// =====================================================
// 内部: 向量缓存管理
// =====================================================
/**
* 获取 skills-index.json mtime用于缓存失效判断
* @returns {number} 时间戳毫秒失败返回 0
*/
function getIndexMtime() {
try {
return fs.statSync(INDEX_FILE).mtimeMs;
} catch {
return 0;
}
}
/**
* 从磁盘加载缓存如果 mtime 匹配且格式有效
* @param {number} currentMtime
* @returns {Object|null} 向量数据或 null
*/
function loadCacheFromDisk(currentMtime) {
try {
if (!fs.existsSync(CACHE_FILE)) return null;
const raw = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
// 缓存校验: mtime 必须匹配
if (raw.indexMtime !== currentMtime) return null;
if (!raw.vocabulary || !raw.idfData || !raw.vectorData) return null;
// 反序列化向量数据
const vocabIndex = new Map();
const vocabulary = raw.vocabulary;
for (let i = 0; i < vocabulary.length; i++) {
vocabIndex.set(vocabulary[i], i);
}
const idfMap = new Map(raw.idfData);
const skillVectors = new Map();
const dim = vocabulary.length;
for (const [name, arr] of Object.entries(raw.vectorData)) {
skillVectors.set(name, new Float64Array(arr));
}
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
} catch {
return null;
}
}
/**
* 将向量数据序列化到磁盘缓存
* @param {Object} vectors - buildEmbeddingVectors 的返回值
* @param {number} indexMtime
*/
function saveCacheToDisk(vectors, indexMtime) {
try {
if (!fs.existsSync(DEBUG_DIR)) {
fs.mkdirSync(DEBUG_DIR, { recursive: true });
}
// 序列化向量Float64Array → 普通数组)
const vectorData = {};
for (const [name, vec] of vectors.skillVectors.entries()) {
vectorData[name] = Array.from(vec);
}
const cache = {
indexMtime,
generatedAt: new Date().toISOString(),
vocabulary: vectors.vocabulary,
idfData: Array.from(vectors.idfMap.entries()),
vectorData,
};
fs.writeFileSync(CACHE_FILE, JSON.stringify(cache));
} catch {
// 缓存写失败不影响主流程
}
}
/**
* 获取或重建向量空间优先使用内存缓存
* @returns {Object|null} 向量数据失败返回 null
*/
function getVectors() {
try {
const mtime = getIndexMtime();
// 1. 内存缓存命中
if (_vectorCache && _vectorCache._indexMtime === mtime) {
return _vectorCache;
}
// 2. 磁盘缓存命中
const diskCache = loadCacheFromDisk(mtime);
if (diskCache) {
diskCache._indexMtime = mtime;
_vectorCache = diskCache;
return _vectorCache;
}
// 3. 重建向量空间
if (!fs.existsSync(INDEX_FILE)) return null;
const index = JSON.parse(fs.readFileSync(INDEX_FILE, 'utf8'));
const vectors = buildEmbeddingVectors(index);
vectors._indexMtime = mtime;
// 异步写缓存(不阻塞返回)
setImmediate(() => saveCacheToDisk(vectors, mtime));
_vectorCache = vectors;
return _vectorCache;
} catch {
return null;
}
}
// =====================================================
// 公共 API
// =====================================================
/**
* 判断是否应该激活 embedding 辅助路由
* 触发条件: BM25 top-2 分数差距 < 15%
*
* @param {number[]} top2Scores - top-2 候选的分数数组 [score1, score2]
* @returns {boolean}
*/
function shouldActivate(top2Scores) {
if (!Array.isArray(top2Scores) || top2Scores.length < 2) return false;
const [score1, score2] = top2Scores;
if (score1 <= 0) return false;
const gap = (score1 - score2) / score1;
return gap < 0.15;
}
/**
* 基于 TF-IDF 余弦相似度计算技能相关性
* 当前实现不依赖外部 LLM API
*
* @param {string} query - 用户查询文本
* @param {string[]} [skillNames] - 限定的技能名列表为空时对所有技能评分
* @returns {{ skill: string, similarity: number }[]} 按相似度降序排列
*/
function computeSimilarity(query, skillNames) {
try {
const vectors = getVectors();
if (!vectors) return [];
const qVec = buildQueryVector(query, vectors.idfMap, vectors.vocabIndex, vectors.dim);
const results = [];
const targetNames = (skillNames && skillNames.length > 0)
? new Set(skillNames)
: null;
for (const [name, sVec] of vectors.skillVectors.entries()) {
if (targetNames && !targetNames.has(name)) continue;
const similarity = cosineSimilarity(qVec, sVec);
if (similarity > 0.005) {
results.push({ skill: name, similarity: Math.round(similarity * 1000) / 1000 });
}
}
return results.sort((a, b) => b.similarity - a.similarity);
} catch {
return []; // fail-open
}
}
/**
* 预留接口: 基于 LLM 真实 embedding 的相似度计算
* llmGateway 可用时使用真实 embedding否则回退到 TF-IDF
*
* @param {string} query - 用户查询文本
* @param {string[]} skillDescriptions - 技能描述列表格式: [{skill, description}]
* @param {Object|null} llmGateway - LLM Gateway MCP 实例可选
* @returns {Promise<{ skill: string, similarity: number }[]>}
*/
async function computeEmbeddingSimilarity(query, skillDescriptions, llmGateway) {
// 当 LLM Gateway 可用时,使用真实 embedding未来实现
if (llmGateway && typeof llmGateway.embed === 'function') {
try {
// 预留: 调用 LLM Gateway 的 embedding 接口
// const queryEmb = await llmGateway.embed(query);
// const results = [];
// for (const { skill, description } of skillDescriptions) {
// const skillEmb = await llmGateway.embed(description);
// const sim = cosineSimilarity(queryEmb, skillEmb);
// results.push({ skill, similarity: sim });
// }
// return results.sort((a, b) => b.similarity - a.similarity);
throw new Error('LLM Gateway embedding 接口尚未实现,回退到 TF-IDF');
} catch {
// 回退到 TF-IDF
}
}
// 回退: 使用 TF-IDF 余弦相似度
const skillNames = (skillDescriptions || []).map(s =>
typeof s === 'string' ? s : (s.skill || s.name)
).filter(Boolean);
return computeSimilarity(query, skillNames.length > 0 ? skillNames : undefined);
}
/**
* 清除内存缓存供测试使用
*/
function clearCache() {
_vectorCache = null;
}
/**
* 获取向量空间元信息供调试使用
* @returns {{ vocabSize: number, skillCount: number, fromCache: boolean } | null}
*/
function getVectorStats() {
try {
const vectors = getVectors();
if (!vectors) return null;
return {
vocabSize: vectors.vocabulary.length,
skillCount: vectors.skillVectors.size,
dim: vectors.dim,
};
} catch {
return null;
}
}
// =====================================================
// 模块导出
// =====================================================
if (typeof module !== 'undefined') {
module.exports = {
// 核心接口
shouldActivate,
computeSimilarity,
computeEmbeddingSimilarity,
// 工具函数
clearCache,
getVectorStats,
// 底层函数(供测试)
tokenizeText,
cosineSimilarity,
buildQueryVector,
};
}
// CLI 入口
if (require.main === module) {
const query = process.argv.slice(2).join(' ') || '帮我优化首屏加载速度';
console.log(`=== embedding-router 测试 ===`);
console.log(`查询: "${query}"`);
const stats = getVectorStats();
if (stats) {
console.log(`向量空间: ${stats.vocabSize} 词, ${stats.skillCount} 技能, ${stats.dim}`);
}
// 测试 shouldActivate
console.log(`\nshouldActivate([0.85, 0.82]): ${shouldActivate([0.85, 0.82])}`);
console.log(`shouldActivate([0.85, 0.60]): ${shouldActivate([0.85, 0.60])}`);
// 测试 computeSimilarity
const results = computeSimilarity(query);
console.log(`\nTop-5 相似度:`);
results.slice(0, 5).forEach((r, i) => {
console.log(` ${i + 1}. ${r.skill.padEnd(30)} ${r.similarity}`);
});
}