523 lines
15 KiB
JavaScript
523 lines
15 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
|
||
* 向量嵌入路由辅助模块 (v6.0 Phase 3)
|
||
*
|
||
* 当前实现: TF-IDF 余弦相似度增强版(不依赖外部 LLM API)
|
||
* 未来预留: LLM Gateway embedding 接口
|
||
*
|
||
* 工作原理:
|
||
* 1. 对每个技能的描述文本 + 关键词构建更丰富的 TF-IDF 向量
|
||
* (相比 semantic-scorer.js,增加了描述权重加成和同义词扩展)
|
||
* 2. 用户查询同样构建向量
|
||
* 3. 余弦相似度计算并返回排序结果
|
||
*
|
||
* 触发条件:
|
||
* 当 BM25 top-2 分数差距 < 15% 时,由路由引擎调用辅助决策
|
||
*
|
||
* 缓存策略:
|
||
* 技能描述向量缓存到 debug/.embedding-cache.json
|
||
* 仅在 skills-index.json 的 mtime 变更时重建
|
||
*
|
||
* 关键约束:
|
||
* - fail-open: 任何异常不影响主路由流程
|
||
* - 预留 LLM Gateway 接口(当前回退到 TF-IDF)
|
||
*/
|
||
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
|
||
|
||
const ROOT = detectClaudeRoot();
|
||
const DEBUG_DIR = path.join(ROOT, 'debug');
|
||
const INDEX_FILE = path.join(ROOT, 'skills-index.json');
|
||
const CACHE_FILE = path.join(DEBUG_DIR, '.embedding-cache.json');
|
||
|
||
// 向量维度上限(防止低频词爆炸)
|
||
const MAX_VOCAB_SIZE = 3000;
|
||
// 最低文档频率(出现在至少 2 个文档)
|
||
const MIN_DF = 2;
|
||
// 最高文档频率比例(不超过 70% 文档出现的词视为停用词)
|
||
const MAX_DF_RATIO = 0.70;
|
||
|
||
// 描述文本在向量构建中的权重倍数(相对于关键词)
|
||
const DESC_WEIGHT_FACTOR = 1.5;
|
||
|
||
// 内存缓存(跨调用复用)
|
||
let _vectorCache = null;
|
||
|
||
// =====================================================
|
||
// 内部: TF-IDF 向量构建
|
||
// =====================================================
|
||
|
||
/**
|
||
* 分词函数(增强版: 中文滑动窗口 + 英文单词 + 2-gram 组合)
|
||
* @param {string} text
|
||
* @returns {string[]} token 列表(允许重复,保留词频信息)
|
||
*/
|
||
function tokenizeText(text) {
|
||
const tokens = [];
|
||
if (!text) return tokens;
|
||
|
||
const textLower = text.toLowerCase();
|
||
|
||
// 中文: 2-4 字滑动窗口
|
||
const cnChars = textLower.match(/[\u4e00-\u9fff]+/g) || [];
|
||
for (const chunk of cnChars) {
|
||
for (let len = 2; len <= Math.min(4, chunk.length); len++) {
|
||
for (let i = 0; i <= chunk.length - len; i++) {
|
||
tokens.push(chunk.slice(i, i + len));
|
||
}
|
||
}
|
||
}
|
||
|
||
// 英文: 完整单词(允许连字符)
|
||
const enWords = textLower.match(/[a-z][a-z0-9.-]*/g) || [];
|
||
for (const w of enWords) {
|
||
if (w.length >= 2) tokens.push(w);
|
||
// 拆分连字符词
|
||
const parts = w.split(/[-./]/);
|
||
for (const p of parts) {
|
||
if (p.length >= 2 && p !== w) tokens.push(p);
|
||
}
|
||
}
|
||
|
||
return tokens;
|
||
}
|
||
|
||
/**
|
||
* 将技能信息转为文档文本
|
||
* @param {Object} skill - skills-index 中的技能项
|
||
* @param {number} descFactor - 描述文本权重倍数
|
||
* @returns {string[]} token 列表(描述按权重重复)
|
||
*/
|
||
function skillToTokens(skill, descFactor) {
|
||
const tokens = [];
|
||
|
||
// 技能名称(高权重 × 3)
|
||
const nameTokens = tokenizeText(skill.name);
|
||
for (let i = 0; i < 3; i++) tokens.push(...nameTokens);
|
||
|
||
// 描述文本(权重 × descFactor)
|
||
const descTokens = tokenizeText(skill.description || '');
|
||
const descRepeat = Math.max(1, Math.round(descFactor));
|
||
for (let i = 0; i < descRepeat; i++) tokens.push(...descTokens);
|
||
|
||
// 关键词(权重 × keyword.weight,简化为重复次数)
|
||
for (const kw of (skill.keywords || [])) {
|
||
const kwTokens = tokenizeText(kw.keyword);
|
||
const repeat = Math.max(1, Math.round((kw.weight || 1) * 0.5));
|
||
for (let i = 0; i < repeat; i++) tokens.push(...kwTokens);
|
||
}
|
||
|
||
return tokens;
|
||
}
|
||
|
||
/**
|
||
* 构建 TF-IDF 向量空间
|
||
* @param {Object} index - skills-index.json
|
||
* @returns {{ skillVectors: Map, idfMap: Map, vocabulary: string[], vocabIndex: Map, dim: number }}
|
||
*/
|
||
function buildEmbeddingVectors(index) {
|
||
const skills = index.skills || [];
|
||
const N = skills.length;
|
||
|
||
// Phase 1: 为每个技能收集 token 列表
|
||
const docTokenLists = [];
|
||
const df = new Map(); // token → 文档频率
|
||
|
||
for (const skill of skills) {
|
||
const tokens = skillToTokens(skill, DESC_WEIGHT_FACTOR);
|
||
// 构建 TF 映射
|
||
const tf = new Map();
|
||
for (const t of tokens) {
|
||
tf.set(t, (tf.get(t) || 0) + 1);
|
||
}
|
||
docTokenLists.push(tf);
|
||
|
||
// 统计 DF
|
||
for (const t of tf.keys()) {
|
||
df.set(t, (df.get(t) || 0) + 1);
|
||
}
|
||
}
|
||
|
||
// Phase 2: 构建词汇表(过滤低频/高频词,限制大小)
|
||
const candidates = [];
|
||
for (const [token, count] of df.entries()) {
|
||
if (count >= MIN_DF && count <= N * MAX_DF_RATIO) {
|
||
// IDF 分数作为排序依据(高 IDF = 更具区分性)
|
||
const idf = Math.log((N + 1) / (count + 1)) + 1;
|
||
candidates.push({ token, count, idf });
|
||
}
|
||
}
|
||
// 按 IDF 降序排序,取前 MAX_VOCAB_SIZE
|
||
candidates.sort((a, b) => b.idf - a.idf);
|
||
const topCandidates = candidates.slice(0, MAX_VOCAB_SIZE);
|
||
|
||
const vocabulary = topCandidates.map(c => c.token);
|
||
const vocabIndex = new Map();
|
||
for (let i = 0; i < vocabulary.length; i++) {
|
||
vocabIndex.set(vocabulary[i], i);
|
||
}
|
||
|
||
// Phase 3: IDF 计算
|
||
const idfMap = new Map();
|
||
for (const { token } of topCandidates) {
|
||
const docFreq = df.get(token) || 0;
|
||
idfMap.set(token, Math.log((N + 1) / (docFreq + 1)) + 1);
|
||
}
|
||
|
||
// Phase 4: 为每个技能构建 TF-IDF 向量(L2 归一化)
|
||
const dim = vocabulary.length;
|
||
const skillVectors = new Map();
|
||
|
||
for (let i = 0; i < skills.length; i++) {
|
||
const vec = new Float64Array(dim);
|
||
const tf = docTokenLists[i];
|
||
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
|
||
let maxTF = 1;
|
||
for (const v of tf.values()) if (v > maxTF) maxTF = v;
|
||
|
||
for (const [token, count] of tf.entries()) {
|
||
const idx = vocabIndex.get(token);
|
||
if (idx !== undefined) {
|
||
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
|
||
}
|
||
}
|
||
|
||
// L2 归一化
|
||
let norm = 0;
|
||
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
|
||
norm = Math.sqrt(norm);
|
||
if (norm > 0) {
|
||
for (let j = 0; j < dim; j++) vec[j] /= norm;
|
||
}
|
||
|
||
skillVectors.set(skills[i].name, vec);
|
||
}
|
||
|
||
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
|
||
}
|
||
|
||
/**
|
||
* 将查询文本转为 TF-IDF 向量
|
||
* @param {string} queryText - 原始查询
|
||
* @param {Map} idfMap
|
||
* @param {Map} vocabIndex
|
||
* @param {number} dim
|
||
* @returns {Float64Array}
|
||
*/
|
||
function buildQueryVector(queryText, idfMap, vocabIndex, dim) {
|
||
const tokens = tokenizeText(queryText);
|
||
const tf = new Map();
|
||
for (const t of tokens) tf.set(t, (tf.get(t) || 0) + 1);
|
||
// 用循环求最大值,避免大 Map 时 Math.max(...tf.values()) 栈溢出
|
||
let maxTF = 1;
|
||
for (const v of tf.values()) if (v > maxTF) maxTF = v;
|
||
|
||
const vec = new Float64Array(dim);
|
||
for (const [token, count] of tf.entries()) {
|
||
const idx = vocabIndex.get(token);
|
||
if (idx !== undefined) {
|
||
vec[idx] = (count / maxTF) * (idfMap.get(token) || 0);
|
||
}
|
||
}
|
||
|
||
// L2 归一化
|
||
let norm = 0;
|
||
for (let j = 0; j < dim; j++) norm += vec[j] * vec[j];
|
||
norm = Math.sqrt(norm);
|
||
if (norm > 0) {
|
||
for (let j = 0; j < dim; j++) vec[j] /= norm;
|
||
}
|
||
|
||
return vec;
|
||
}
|
||
|
||
/**
|
||
* 余弦相似度(L2 归一化后直接点积)
|
||
* @param {Float64Array} vecA
|
||
* @param {Float64Array} vecB
|
||
* @returns {number} 0~1
|
||
*/
|
||
function cosineSimilarity(vecA, vecB) {
|
||
if (vecA.length !== vecB.length) return 0;
|
||
let dot = 0;
|
||
for (let i = 0; i < vecA.length; i++) dot += vecA[i] * vecB[i];
|
||
return Math.max(0, Math.min(1, dot));
|
||
}
|
||
|
||
// =====================================================
|
||
// 内部: 向量缓存管理
|
||
// =====================================================
|
||
|
||
/**
|
||
* 获取 skills-index.json 的 mtime(用于缓存失效判断)
|
||
* @returns {number} 时间戳毫秒,失败返回 0
|
||
*/
|
||
function getIndexMtime() {
|
||
try {
|
||
return fs.statSync(INDEX_FILE).mtimeMs;
|
||
} catch {
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 从磁盘加载缓存(如果 mtime 匹配且格式有效)
|
||
* @param {number} currentMtime
|
||
* @returns {Object|null} 向量数据或 null
|
||
*/
|
||
function loadCacheFromDisk(currentMtime) {
|
||
try {
|
||
if (!fs.existsSync(CACHE_FILE)) return null;
|
||
const raw = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8'));
|
||
// 缓存校验: mtime 必须匹配
|
||
if (raw.indexMtime !== currentMtime) return null;
|
||
if (!raw.vocabulary || !raw.idfData || !raw.vectorData) return null;
|
||
|
||
// 反序列化向量数据
|
||
const vocabIndex = new Map();
|
||
const vocabulary = raw.vocabulary;
|
||
for (let i = 0; i < vocabulary.length; i++) {
|
||
vocabIndex.set(vocabulary[i], i);
|
||
}
|
||
|
||
const idfMap = new Map(raw.idfData);
|
||
const skillVectors = new Map();
|
||
const dim = vocabulary.length;
|
||
|
||
for (const [name, arr] of Object.entries(raw.vectorData)) {
|
||
skillVectors.set(name, new Float64Array(arr));
|
||
}
|
||
|
||
return { skillVectors, idfMap, vocabulary, vocabIndex, dim };
|
||
} catch {
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 将向量数据序列化到磁盘缓存
|
||
* @param {Object} vectors - buildEmbeddingVectors 的返回值
|
||
* @param {number} indexMtime
|
||
*/
|
||
function saveCacheToDisk(vectors, indexMtime) {
|
||
try {
|
||
if (!fs.existsSync(DEBUG_DIR)) {
|
||
fs.mkdirSync(DEBUG_DIR, { recursive: true });
|
||
}
|
||
|
||
// 序列化向量(Float64Array → 普通数组)
|
||
const vectorData = {};
|
||
for (const [name, vec] of vectors.skillVectors.entries()) {
|
||
vectorData[name] = Array.from(vec);
|
||
}
|
||
|
||
const cache = {
|
||
indexMtime,
|
||
generatedAt: new Date().toISOString(),
|
||
vocabulary: vectors.vocabulary,
|
||
idfData: Array.from(vectors.idfMap.entries()),
|
||
vectorData,
|
||
};
|
||
|
||
fs.writeFileSync(CACHE_FILE, JSON.stringify(cache));
|
||
} catch {
|
||
// 缓存写失败不影响主流程
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 获取(或重建)向量空间,优先使用内存缓存
|
||
* @returns {Object|null} 向量数据,失败返回 null
|
||
*/
|
||
function getVectors() {
|
||
try {
|
||
const mtime = getIndexMtime();
|
||
|
||
// 1. 内存缓存命中
|
||
if (_vectorCache && _vectorCache._indexMtime === mtime) {
|
||
return _vectorCache;
|
||
}
|
||
|
||
// 2. 磁盘缓存命中
|
||
const diskCache = loadCacheFromDisk(mtime);
|
||
if (diskCache) {
|
||
diskCache._indexMtime = mtime;
|
||
_vectorCache = diskCache;
|
||
return _vectorCache;
|
||
}
|
||
|
||
// 3. 重建向量空间
|
||
if (!fs.existsSync(INDEX_FILE)) return null;
|
||
const index = JSON.parse(fs.readFileSync(INDEX_FILE, 'utf8'));
|
||
const vectors = buildEmbeddingVectors(index);
|
||
vectors._indexMtime = mtime;
|
||
|
||
// 异步写缓存(不阻塞返回)
|
||
setImmediate(() => saveCacheToDisk(vectors, mtime));
|
||
|
||
_vectorCache = vectors;
|
||
return _vectorCache;
|
||
} catch {
|
||
return null;
|
||
}
|
||
}
|
||
|
||
// =====================================================
|
||
// 公共 API
|
||
// =====================================================
|
||
|
||
/**
|
||
* 判断是否应该激活 embedding 辅助路由
|
||
* 触发条件: BM25 top-2 分数差距 < 15%
|
||
*
|
||
* @param {number[]} top2Scores - top-2 候选的分数数组 [score1, score2]
|
||
* @returns {boolean}
|
||
*/
|
||
function shouldActivate(top2Scores) {
|
||
if (!Array.isArray(top2Scores) || top2Scores.length < 2) return false;
|
||
const [score1, score2] = top2Scores;
|
||
if (score1 <= 0) return false;
|
||
const gap = (score1 - score2) / score1;
|
||
return gap < 0.15;
|
||
}
|
||
|
||
/**
|
||
* 基于 TF-IDF 余弦相似度计算技能相关性
|
||
* (当前实现,不依赖外部 LLM API)
|
||
*
|
||
* @param {string} query - 用户查询文本
|
||
* @param {string[]} [skillNames] - 限定的技能名列表;为空时对所有技能评分
|
||
* @returns {{ skill: string, similarity: number }[]} 按相似度降序排列
|
||
*/
|
||
function computeSimilarity(query, skillNames) {
|
||
try {
|
||
const vectors = getVectors();
|
||
if (!vectors) return [];
|
||
|
||
const qVec = buildQueryVector(query, vectors.idfMap, vectors.vocabIndex, vectors.dim);
|
||
|
||
const results = [];
|
||
const targetNames = (skillNames && skillNames.length > 0)
|
||
? new Set(skillNames)
|
||
: null;
|
||
|
||
for (const [name, sVec] of vectors.skillVectors.entries()) {
|
||
if (targetNames && !targetNames.has(name)) continue;
|
||
const similarity = cosineSimilarity(qVec, sVec);
|
||
if (similarity > 0.005) {
|
||
results.push({ skill: name, similarity: Math.round(similarity * 1000) / 1000 });
|
||
}
|
||
}
|
||
|
||
return results.sort((a, b) => b.similarity - a.similarity);
|
||
} catch {
|
||
return []; // fail-open
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 预留接口: 基于 LLM 真实 embedding 的相似度计算
|
||
* 当 llmGateway 可用时使用真实 embedding,否则回退到 TF-IDF
|
||
*
|
||
* @param {string} query - 用户查询文本
|
||
* @param {string[]} skillDescriptions - 技能描述列表(格式: [{skill, description}])
|
||
* @param {Object|null} llmGateway - LLM Gateway MCP 实例(可选)
|
||
* @returns {Promise<{ skill: string, similarity: number }[]>}
|
||
*/
|
||
async function computeEmbeddingSimilarity(query, skillDescriptions, llmGateway) {
|
||
// 当 LLM Gateway 可用时,使用真实 embedding(未来实现)
|
||
if (llmGateway && typeof llmGateway.embed === 'function') {
|
||
try {
|
||
// 预留: 调用 LLM Gateway 的 embedding 接口
|
||
// const queryEmb = await llmGateway.embed(query);
|
||
// const results = [];
|
||
// for (const { skill, description } of skillDescriptions) {
|
||
// const skillEmb = await llmGateway.embed(description);
|
||
// const sim = cosineSimilarity(queryEmb, skillEmb);
|
||
// results.push({ skill, similarity: sim });
|
||
// }
|
||
// return results.sort((a, b) => b.similarity - a.similarity);
|
||
throw new Error('LLM Gateway embedding 接口尚未实现,回退到 TF-IDF');
|
||
} catch {
|
||
// 回退到 TF-IDF
|
||
}
|
||
}
|
||
|
||
// 回退: 使用 TF-IDF 余弦相似度
|
||
const skillNames = (skillDescriptions || []).map(s =>
|
||
typeof s === 'string' ? s : (s.skill || s.name)
|
||
).filter(Boolean);
|
||
|
||
return computeSimilarity(query, skillNames.length > 0 ? skillNames : undefined);
|
||
}
|
||
|
||
/**
|
||
* 清除内存缓存(供测试使用)
|
||
*/
|
||
function clearCache() {
|
||
_vectorCache = null;
|
||
}
|
||
|
||
/**
|
||
* 获取向量空间元信息(供调试使用)
|
||
* @returns {{ vocabSize: number, skillCount: number, fromCache: boolean } | null}
|
||
*/
|
||
function getVectorStats() {
|
||
try {
|
||
const vectors = getVectors();
|
||
if (!vectors) return null;
|
||
return {
|
||
vocabSize: vectors.vocabulary.length,
|
||
skillCount: vectors.skillVectors.size,
|
||
dim: vectors.dim,
|
||
};
|
||
} catch {
|
||
return null;
|
||
}
|
||
}
|
||
|
||
// =====================================================
|
||
// 模块导出
|
||
// =====================================================
|
||
if (typeof module !== 'undefined') {
|
||
module.exports = {
|
||
// 核心接口
|
||
shouldActivate,
|
||
computeSimilarity,
|
||
computeEmbeddingSimilarity,
|
||
// 工具函数
|
||
clearCache,
|
||
getVectorStats,
|
||
// 底层函数(供测试)
|
||
tokenizeText,
|
||
cosineSimilarity,
|
||
buildQueryVector,
|
||
};
|
||
}
|
||
|
||
// CLI 入口
|
||
if (require.main === module) {
|
||
const query = process.argv.slice(2).join(' ') || '帮我优化首屏加载速度';
|
||
console.log(`=== embedding-router 测试 ===`);
|
||
console.log(`查询: "${query}"`);
|
||
|
||
const stats = getVectorStats();
|
||
if (stats) {
|
||
console.log(`向量空间: ${stats.vocabSize} 词, ${stats.skillCount} 技能, ${stats.dim} 维`);
|
||
}
|
||
|
||
// 测试 shouldActivate
|
||
console.log(`\nshouldActivate([0.85, 0.82]): ${shouldActivate([0.85, 0.82])}`);
|
||
console.log(`shouldActivate([0.85, 0.60]): ${shouldActivate([0.85, 0.60])}`);
|
||
|
||
// 测试 computeSimilarity
|
||
const results = computeSimilarity(query);
|
||
console.log(`\nTop-5 相似度:`);
|
||
results.slice(0, 5).forEach((r, i) => {
|
||
console.log(` ${i + 1}. ${r.skill.padEnd(30)} ${r.similarity}`);
|
||
});
|
||
}
|