bookworm-smart-assistant/scripts/embedding-router.js

523 lines
15 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* 向量嵌入路由辅助模块 (v6.0 Phase 3)
*
* 当前实现: TF-IDF 余弦相似度增强版(不依赖外部 LLM API
* 未来预留: LLM Gateway embedding 接口
*
* 工作原理:
* 1. 对每个技能的描述文本 + 关键词构建更丰富的 TF-IDF 向量
* (相比 semantic-scorer.js增加了描述权重加成和同义词扩展)
* 2. 用户查询同样构建向量
* 3. 余弦相似度计算并返回排序结果
*
* 触发条件:
* 当 BM25 top-2 分数差距 < 15% 时,由路由引擎调用辅助决策
*
* 缓存策略:
* 技能描述向量缓存到 debug/.embedding-cache.json
* 仅在 skills-index.json 的 mtime 变更时重建
*
* 关键约束:
* - fail-open: 任何异常不影响主路由流程
* - 预留 LLM Gateway 接口(当前回退到 TF-IDF
*/
const fs = require('fs');
const path = require('path');
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
const ROOT = detectClaudeRoot();
const DEBUG_DIR = path.join(ROOT, 'debug');
const INDEX_FILE = path.join(ROOT, 'skills-index.json');
const CACHE_FILE = path.join(DEBUG_DIR, '.embedding-cache.json');
// 向量维度上限(防止低频词爆炸)
const MAX_VOCAB_SIZE = 3000;
// 最低文档频率(出现在至少 2 个文档)
const MIN_DF = 2;
// 最高文档频率比例(不超过 70% 文档出现的词视为停用词)
const MAX_DF_RATIO = 0.70;
// 描述文本在向量构建中的权重倍数(相对于关键词)
const DESC_WEIGHT_FACTOR = 1.5;
// 内存缓存(跨调用复用)
let _vectorCache = null;
// =====================================================
// 内部: TF-IDF 向量构建
// =====================================================
/**
* 分词函数(增强版: 中文滑动窗口 + 英文单词 + 2-gram 组合)
* @param {string} text
* @returns {string[]} token 列表(允许重复,保留词频信息)
*/
function tokenizeText(text) {
const tokens = [];
if (!text) return tokens;
const textLower = text.toLowerCase();
// 中文: 2-4 字滑动窗口
const cnChars = textLower.match(/[\u4e00-\u9fff]+/g) || [];
for (const chunk of cnChars) {
for (let len = 2; len <= Math.min(4, chunk.length); len++) {
for (let i = 0; i <= chunk.length - len; i++) {
tokens.push(chunk.slice(i, i + len));
}
}
}
// 英文: 完整单词(允许连字符)
const enWords = textLower.match(/[a-z][a-z0-9.-]*/g) || [];
for (const w of enWords) {
if (w.length >= 2) tokens.push(w);
// 拆分连字符词
const parts = w.split(/[-./]/);
for (const p of parts) {
if (p.length >= 2 && p !== w) tokens.push(p);
}
}
return tokens;
}
/**
* 将技能信息转为文档文本
* @param {Object} skill - skills-index 中的技能项
* @param {number} descFactor - 描述文本权重倍数
* @returns {string[]} token 列表(描述按权重重复)
*/
function skillToTokens(skill, descFactor) {
const tokens = [];
// 技能名称(高权重 × 3
const nameTokens = tokenizeText(skill.name);
for (let i = 0; i < 3; i++) tokens.push(...nameTokens);
// 描述文本(权重 × descFactor
const descTokens = tokenizeText(skill.description || '');
const descRepeat = Math.max(1, Math.round(descFactor));
for (let i = 0; i < descRepeat; i++) tokens.push(...descTokens);
// 关键词(权重 × keyword.weight简化为重复次数
for (const kw of (skill.keywords || [])) {
const kwTokens = tokenizeText(kw.keyword);
const repeat = Math.max(1, Math.round((kw.weight || 1) * 0.5));
for (let i = 0; i < repeat; i++) tokens.push(...kwTokens);
}
return tokens;
}
/**
* 构建 TF-IDF 向量空间
* @param {Object} index - skills-index.json
* @returns {{ skillVectors: Map, idfMap: Map, vocabulary: string[], vocabIndex: Map, dim: number }}
*/
function buildEmbeddingVectors(index) {
const skills = index.skills || [];
const N = skills.length;
// Phase 1: 为每个技能收集 token 列表
const docTokenLists = [];
const df = new Map(); // token → 文档频率
for (const skill of skills) {
const tokens = skillToTokens(skill, DESC_WEIGHT_FACTOR);
// 构建 TF 映射
const tf = new Map();
for (const t of tokens) {
tf.set(t, (tf.get(t) || 0) + 1);
}
docTokenLists.push(tf);
// 统计 DF
for (const t of tf.keys()) {
df.set(t, (df.get(t) || 0) + 1);
}
}
// Phase 2: 构建词汇表(过滤低频/高频词,限制大小)
const candidates = [];
for (const [token, count] of df.entries()) {
if (count >= MIN_DF && count <= N * MAX_DF_RATIO) {
// IDF 分数作为排序依据(高 IDF = 更具区分性)
const idf = Math.log((N + 1) / (count + 1)) + 1;
candidates.push({ token, count, idf });
}
}
// 按 IDF 降序排序,取前 MAX_VOCAB_SIZE
candidates.sort((a, b) => b.idf - a.idf);
const topCandidates = candidates.slice(0, MAX_VOCAB_SIZE);
const vocabulary = topCandidates.map(c => c.token);
const vocabIndex = new Map();
for (let i = 0; i < vocabulary.length; i++) {
vocabIndex.set(vocabulary[i], i);
}
// Phase 3: IDF 计算
const idfMap = new Map();
for (const { token } of topCandidates) {
const docFreq = df.get(token) || 0;
idfMap.set(token, Math.log((N + 1) / (docFreq + 1)) + 1);
}
// Phase 4: 为每个技能构建 TF-IDF 向量L2 归一化)
const dim = vocabulary.length;
const skillVectors = new Map();
for (let i = 0; i < skills.length; i++) {
const vec = new Float64Array(dim);
const tf = docTokenLists[i];
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
let maxTF = 1;
for (const v of tf.values()) if (v > maxTF) maxTF = v;
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
skillVectors.set(skills[i].name, vec);
}
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
}
/**
* 将查询文本转为 TF-IDF 向量
* @param {string} queryText - 原始查询
* @param {Map} idfMap
* @param {Map} vocabIndex
* @param {number} dim
* @returns {Float64Array}
*/
function buildQueryVector(queryText, idfMap, vocabIndex, dim) {
const tokens = tokenizeText(queryText);
const tf = new Map();
for (const t of tokens) tf.set(t, (tf.get(t) || 0) + 1);
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
let maxTF = 1;
for (const v of tf.values()) if (v > maxTF) maxTF = v;
const vec = new Float64Array(dim);
for (const [token, count] of tf.entries()) {
const idx = vocabIndex.get(token);
if (idx !== undefined) {
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
}
}
// L2 归一化
let norm = 0;
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
norm = Math.sqrt(norm);
if (norm > 0) {
for (let j = 0; j < dim; j++) vec[j] /= norm;
}
return vec;
}
/**
* 余弦相似度L2 归一化后直接点积)
* @param {Float64Array} vecA
* @param {Float64Array} vecB
* @returns {number} 0~1
*/
function cosineSimilarity(vecA, vecB) {
if (vecA.length !== vecB.length) return 0;
let dot = 0;
for (let i = 0; i < vecA.length; i++) dot += vecA[i] * vecB[i];
return Math.max(0, Math.min(1, dot));
}
// =====================================================
// 内部: 向量缓存管理
// =====================================================
/**
* 获取 skills-index.json 的 mtime用于缓存失效判断
* @returns {number} 时间戳毫秒,失败返回 0
*/
function getIndexMtime() {
try {
return fs.statSync(INDEX_FILE).mtimeMs;
} catch {
return 0;
}
}
/**
* 从磁盘加载缓存(如果 mtime 匹配且格式有效)
* @param {number} currentMtime
* @returns {Object|null} 向量数据或 null
*/
function loadCacheFromDisk(currentMtime) {
try {
if (!fs.existsSync(CACHE_FILE)) return null;
const raw = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
// 缓存校验: mtime 必须匹配
if (raw.indexMtime !== currentMtime) return null;
if (!raw.vocabulary || !raw.idfData || !raw.vectorData) return null;
// 反序列化向量数据
const vocabIndex = new Map();
const vocabulary = raw.vocabulary;
for (let i = 0; i < vocabulary.length; i++) {
vocabIndex.set(vocabulary[i], i);
}
const idfMap = new Map(raw.idfData);
const skillVectors = new Map();
const dim = vocabulary.length;
for (const [name, arr] of Object.entries(raw.vectorData)) {
skillVectors.set(name, new Float64Array(arr));
}
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
} catch {
return null;
}
}
/**
* 将向量数据序列化到磁盘缓存
* @param {Object} vectors - buildEmbeddingVectors 的返回值
* @param {number} indexMtime
*/
function saveCacheToDisk(vectors, indexMtime) {
try {
if (!fs.existsSync(DEBUG_DIR)) {
fs.mkdirSync(DEBUG_DIR, { recursive: true });
}
// 序列化向量Float64Array → 普通数组)
const vectorData = {};
for (const [name, vec] of vectors.skillVectors.entries()) {
vectorData[name] = Array.from(vec);
}
const cache = {
indexMtime,
generatedAt: new Date().toISOString(),
vocabulary: vectors.vocabulary,
idfData: Array.from(vectors.idfMap.entries()),
vectorData,
};
fs.writeFileSync(CACHE_FILE, JSON.stringify(cache));
} catch {
// 缓存写失败不影响主流程
}
}
/**
* 获取(或重建)向量空间,优先使用内存缓存
* @returns {Object|null} 向量数据,失败返回 null
*/
function getVectors() {
try {
const mtime = getIndexMtime();
// 1. 内存缓存命中
if (_vectorCache && _vectorCache._indexMtime === mtime) {
return _vectorCache;
}
// 2. 磁盘缓存命中
const diskCache = loadCacheFromDisk(mtime);
if (diskCache) {
diskCache._indexMtime = mtime;
_vectorCache = diskCache;
return _vectorCache;
}
// 3. 重建向量空间
if (!fs.existsSync(INDEX_FILE)) return null;
const index = JSON.parse(fs.readFileSync(INDEX_FILE, 'utf8'));
const vectors = buildEmbeddingVectors(index);
vectors._indexMtime = mtime;
// 异步写缓存(不阻塞返回)
setImmediate(() => saveCacheToDisk(vectors, mtime));
_vectorCache = vectors;
return _vectorCache;
} catch {
return null;
}
}
// =====================================================
// 公共 API
// =====================================================
/**
* 判断是否应该激活 embedding 辅助路由
* 触发条件: BM25 top-2 分数差距 < 15%
*
* @param {number[]} top2Scores - top-2 候选的分数数组 [score1, score2]
* @returns {boolean}
*/
function shouldActivate(top2Scores) {
if (!Array.isArray(top2Scores) || top2Scores.length < 2) return false;
const [score1, score2] = top2Scores;
if (score1 <= 0) return false;
const gap = (score1 - score2) / score1;
return gap < 0.15;
}
/**
* 基于 TF-IDF 余弦相似度计算技能相关性
* (当前实现,不依赖外部 LLM API
*
* @param {string} query - 用户查询文本
* @param {string[]} [skillNames] - 限定的技能名列表;为空时对所有技能评分
* @returns {{ skill: string, similarity: number }[]} 按相似度降序排列
*/
function computeSimilarity(query, skillNames) {
try {
const vectors = getVectors();
if (!vectors) return [];
const qVec = buildQueryVector(query, vectors.idfMap, vectors.vocabIndex, vectors.dim);
const results = [];
const targetNames = (skillNames && skillNames.length > 0)
? new Set(skillNames)
: null;
for (const [name, sVec] of vectors.skillVectors.entries()) {
if (targetNames && !targetNames.has(name)) continue;
const similarity = cosineSimilarity(qVec, sVec);
if (similarity > 0.005) {
results.push({ skill: name, similarity: Math.round(similarity * 1000) / 1000 });
}
}
return results.sort((a, b) => b.similarity - a.similarity);
} catch {
return []; // fail-open
}
}
/**
* 预留接口: 基于 LLM 真实 embedding 的相似度计算
* 当 llmGateway 可用时使用真实 embedding否则回退到 TF-IDF
*
* @param {string} query - 用户查询文本
* @param {string[]} skillDescriptions - 技能描述列表(格式: [{skill, description}]
* @param {Object|null} llmGateway - LLM Gateway MCP 实例(可选)
* @returns {Promise<{ skill: string, similarity: number }[]>}
*/
async function computeEmbeddingSimilarity(query, skillDescriptions, llmGateway) {
// 当 LLM Gateway 可用时,使用真实 embedding未来实现
if (llmGateway && typeof llmGateway.embed === 'function') {
try {
// 预留: 调用 LLM Gateway 的 embedding 接口
// const queryEmb = await llmGateway.embed(query);
// const results = [];
// for (const { skill, description } of skillDescriptions) {
// const skillEmb = await llmGateway.embed(description);
// const sim = cosineSimilarity(queryEmb, skillEmb);
// results.push({ skill, similarity: sim });
// }
// return results.sort((a, b) => b.similarity - a.similarity);
throw new Error('LLM Gateway embedding 接口尚未实现,回退到 TF-IDF');
} catch {
// 回退到 TF-IDF
}
}
// 回退: 使用 TF-IDF 余弦相似度
const skillNames = (skillDescriptions || []).map(s =>
typeof s === 'string' ? s : (s.skill || s.name)
).filter(Boolean);
return computeSimilarity(query, skillNames.length > 0 ? skillNames : undefined);
}
/**
* 清除内存缓存(供测试使用)
*/
function clearCache() {
_vectorCache = null;
}
/**
* 获取向量空间元信息(供调试使用)
* @returns {{ vocabSize: number, skillCount: number, fromCache: boolean } | null}
*/
function getVectorStats() {
try {
const vectors = getVectors();
if (!vectors) return null;
return {
vocabSize: vectors.vocabulary.length,
skillCount: vectors.skillVectors.size,
dim: vectors.dim,
};
} catch {
return null;
}
}
// =====================================================
// 模块导出
// =====================================================
if (typeof module !== 'undefined') {
module.exports = {
// 核心接口
shouldActivate,
computeSimilarity,
computeEmbeddingSimilarity,
// 工具函数
clearCache,
getVectorStats,
// 底层函数(供测试)
tokenizeText,
cosineSimilarity,
buildQueryVector,
};
}
// CLI 入口
if (require.main === module) {
const query = process.argv.slice(2).join(' ') || '帮我优化首屏加载速度';
console.log(`=== embedding-router 测试 ===`);
console.log(`查询: "${query}"`);
const stats = getVectorStats();
if (stats) {
console.log(`向量空间: ${stats.vocabSize} 词, ${stats.skillCount} 技能, ${stats.dim}`);
}
// 测试 shouldActivate
console.log(`\nshouldActivate([0.85, 0.82]): ${shouldActivate([0.85, 0.82])}`);
console.log(`shouldActivate([0.85, 0.60]): ${shouldActivate([0.85, 0.60])}`);
// 测试 computeSimilarity
const results = computeSimilarity(query);
console.log(`\nTop-5 相似度:`);
results.slice(0, 5).forEach((r, i) => {
console.log(` ${i + 1}. ${r.skill.padEnd(30)} ${r.similarity}`);
});
}