bookworm-smart-assistant/scripts/synonym-expander.js

135 lines
3.8 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
 * Synonym expander (v4.9)
 *
 * Loads synonyms.json and expands query tokens into their synonym groups.
 * Used by tokenize() in route-analyzer.js and route-feedback.js.
 *
 * Core functions:
 *   loadSynonymMap()        → Map<word, string[]>, cached as a singleton
 *   expandSynonyms(tokens)  → expanded Set<string>
 */
const fs = require('fs');
const path = require('path');
// Singleton cache: null until the first loadSynonymMap() call builds the table.
let _synonymMap = null;

/**
 * Load the synonym lookup table (singleton).
 *
 * Reads `synonyms.json` from the directory that contains this script. The file
 * is expected to look like `{ "groups": [ { "words": ["a", "b", ...] }, ... ] }`.
 * Every word is lower-cased and mapped to all other words of each group it
 * appears in; a word that appears in several groups gets the union of them.
 *
 * Loading is best-effort and never throws:
 *   - a missing file yields an empty map;
 *   - an unreadable or unparsable file yields an empty map and a warning;
 *   - malformed entries (non-object groups, non-array `words`, non-string
 *     words) are skipped without affecting the remaining groups.
 *
 * The result is cached, so the file is read at most once until the cache
 * variable is cleared again.
 *
 * @returns {Map<string, string[]>} lower-cased word → its synonyms, without
 *   duplicates and without the word itself, in first-seen order.
 */
function loadSynonymMap() {
  if (_synonymMap) return _synonymMap;

  // Collect into Sets first: merging overlapping groups stays O(1) per word
  // and duplicates (e.g. "Release" and "release" in one group) never reach
  // the final arrays, whose order downstream weighting depends on.
  const collected = new Map();
  try {
    const synFile = path.join(path.dirname(__filename), 'synonyms.json');
    if (fs.existsSync(synFile)) {
      const data = JSON.parse(fs.readFileSync(synFile, 'utf8'));
      const groups = data && Array.isArray(data.groups) ? data.groups : [];
      for (const group of groups) {
        if (!group || !Array.isArray(group.words)) continue;

        const words = new Set();
        for (const raw of group.words) {
          if (typeof raw === 'string') words.add(raw.toLowerCase());
        }

        for (const word of words) {
          let bucket = collected.get(word);
          if (!bucket) {
            bucket = new Set();
            collected.set(word, bucket);
          }
          for (const other of words) {
            if (other !== word) bucket.add(other);
          }
        }
      }
    }
  } catch (err) {
    // Keep the best-effort contract (no throw), but neither hide the problem
    // nor cache a half-built table.
    collected.clear();
    const reason = err && err.message ? err.message : String(err);
    console.warn(`[synonym-expander] failed to load synonyms.json: ${reason}`);
  }

  _synonymMap = new Map();
  for (const [word, bucket] of collected) {
    _synonymMap.set(word, [...bucket]);
  }
  return _synonymMap;
}
/**
 * Expand a token collection into a larger set that also contains synonyms.
 *
 * Each token is looked up by its lower-cased form in the table returned by
 * loadSynonymMap(); the original tokens are kept exactly as given.
 *
 * @param {Set<string>|Array<string>} tokens - original tokens
 * @returns {Set<string>} a new Set holding the original tokens plus every
 *   synonym found for them
 */
function expandSynonyms(tokens) {
  const lookup = loadSynonymMap();
  const result = new Set(tokens);

  for (const original of tokens) {
    const aliases = lookup.get(original.toLowerCase());
    if (!aliases) continue;
    aliases.forEach((alias) => {
      result.add(alias);
    });
  }

  return result;
}
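/*
 * NOTE: the helper that resets the singleton cache (intended for tests) is
 * resetCache(), defined further down, after expandSynonymsWeighted().
 */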
/**
 * P1-FIX: weighted synonym expansion.
 *
 * Original tokens get weight 1.0. For each token, the first three entries of
 * its synonym list count as primary synonyms (0.7) and the remaining ones as
 * secondary synonyms (0.4). When a word is reached more than once, the highest
 * weight seen for it is kept.
 *
 * @param {Set<string>} tokens - original tokens
 * @returns {{ expanded: Set<string>, weights: Map<string, number> }} the
 *   expanded token set and the weight assigned to each of its members
 */
function expandSynonymsWeighted(tokens) {
  const lookup = loadSynonymMap();
  const expanded = new Set(tokens);
  const weights = new Map();

  // Seed: every original token carries full weight.
  for (const original of tokens) {
    weights.set(original, 1.0);
  }

  for (const original of tokens) {
    const aliases = lookup.get(original.toLowerCase());
    if (!aliases) continue;

    aliases.forEach((alias, rank) => {
      expanded.add(alias);

      // Position in the synonym list decides primary vs. secondary weight.
      let candidate = 0.4;
      if (rank < 3) {
        candidate = 0.7;
      }

      // Only ever raise a weight, never lower one already recorded.
      const alreadyAsGood = weights.has(alias) && !(weights.get(alias) < candidate);
      if (!alreadyAsGood) {
        weights.set(alias, candidate);
      }
    });
  }

  return { expanded, weights };
}
/**
 * Reset the singleton cache by setting `_synonymMap` back to null, so the
 * next loadSynonymMap() call rebuilds the table instead of returning the
 * cached one. Marked in this file as a helper for tests.
 */
function resetCache() {
_synonymMap = null;
}
// Module exports. The `typeof module` guard skips the assignment when no
// `module` binding exists in the evaluating environment.
if (typeof module !== 'undefined') {
module.exports = { loadSynonymMap, expandSynonyms, expandSynonymsWeighted, resetCache };
}
// CLI entry point: runs only when this file is executed directly
// (`node synonym-expander.js <tokens...>`), not when it is require()d.
if (require.main === module) {
  // Lower-case, split on whitespace and drop empty pieces so padded or
  // whitespace-only arguments never turn into an empty-string token.
  const words = process.argv
    .slice(2)
    .join(' ')
    .toLowerCase()
    .split(/\s+/)
    .filter(Boolean);

  if (words.length === 0) {
    console.log('Usage: node synonym-expander.js <tokens...>');
    console.log('Example: node synonym-expander.js 前端 部署');
    process.exit(0);
  }

  const tokens = new Set(words);
  const expanded = expandSynonyms(tokens);
  console.log('原始 tokens:', Array.from(tokens).join(', '));
  console.log('展开后:', Array.from(expanded).join(', '));
  // Separate the before/after counts; without a separator 2 and 5 print as "25".
  console.log(`展开: ${tokens.size} → ${expanded.size} (+${expanded.size - tokens.size})`);
}