bookworm-smart-assistant/scripts/generate-skill-index.js

448 lines
17 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
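/**
 * Skill semantic index generator
 *
 * Reads every SKILL.md and extracts its description and trigger keywords,
 * then generates skills-index.json, a weighted keyword map (used by route-analyzer).
 *
 * Usage:
 *   node scripts/generate-skill-index.js          # generate the index
 *   node scripts/generate-skill-index.js --stats  # generate the index and print statistics
 */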
const fs = require('fs');
const path = require('path');
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
const CLAUDE_ROOT = detectClaudeRoot();
const SKILLS_DIR = path.join(CLAUDE_ROOT, 'skills');
const OUTPUT_FILE = path.join(CLAUDE_ROOT, 'skills-index.json');
// AGENTS_DIR_SCAN_PATCH_2026_04_20
const AGENTS_DIR = path.join(CLAUDE_ROOT, 'agents');
const OUTPUT_LITE = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
// === YAML frontmatter 解析 (轻量, 无依赖) ===
function parseFrontmatter(content) {
const match = content.match(/^---\n([\s\S]*?)\n---/);
if (!match) return {};
const yaml = match[1];
const result = {};
// 简单 YAML 解析: key: value 或 key: >\n multiline
let currentKey = null;
let currentValue = '';
let isMultiline = false;
for (const line of yaml.split('\n')) {
if (!isMultiline) {
const kvMatch = line.match(/^(\w[\w-]*):\s*(.*)/);
if (kvMatch) {
if (currentKey) result[currentKey] = currentValue.trim();
currentKey = kvMatch[1];
const val = kvMatch[2].trim();
if (val === '>' || val === '|') {
isMultiline = true;
currentValue = '';
} else {
currentValue = val;
isMultiline = false;
}
}
} else {
if (/^\S/.test(line) && !line.startsWith(' ')) {
// 新的顶层 key
if (currentKey) result[currentKey] = currentValue.trim();
const kvMatch = line.match(/^(\w[\w-]*):\s*(.*)/);
if (kvMatch) {
currentKey = kvMatch[1];
const val = kvMatch[2].trim();
if (val === '>' || val === '|') {
isMultiline = true;
currentValue = '';
} else {
currentValue = val;
isMultiline = false;
}
}
} else {
currentValue += ' ' + line.trim();
}
}
}
if (currentKey) result[currentKey] = currentValue.trim();
return result;
}
// === 关键词层级分类 (v4.9) ===
// core: 表格/引号 = 1.0, strong: 列表 = 0.8, extended: 描述 = 0.5
function classifyKeywordTier(source) {
switch (source) {
case 'table': return { tier: 'core', weight: 1.0 };
case 'quoted': return { tier: 'core', weight: 1.0 };
case 'list': return { tier: 'strong', weight: 0.8 };
case 'description': return { tier: 'extended', weight: 0.5 };
default: return { tier: 'extended', weight: 0.5 };
}
}
// === 分隔符/噪声过滤 (v5.9.1: 增强) ===
const SEPARATOR_RE = /^[-|:\s.…]+$/;
function isNoise(str) {
if (SEPARATOR_RE.test(str)) return true;
if (str.length < 2 || str.length > 30) return true;
// 过滤描述文本碎片: 含句式连词的中文长词组
if (str.length > 8 && /[的了或在与及和是从到]/.test(str) && /[\u4e00-\u9fff]/.test(str)) return true;
// 过滤文件路径碎片
if (/^references?\//.test(str) || /\.md$/.test(str)) return true;
// 过滤省略号
if (str === '...' || str === '…') return true;
return false;
}
// === 关键词提取 ===
function extractKeywords(content) {
const keywords = new Map(); // keyword -> { weight, tier }
// 1. 从触发关键词表提取 (core 层 1.0)
const tableRows = content.matchAll(/\|\s*[^|]+\|\s*([^|]+)\|/g);
for (const row of tableRows) {
const cell = row[1].trim();
if (cell === '关键词' || cell === '---') continue;
const { weight } = classifyKeywordTier('table');
for (const kw of cell.split(/[,、]/)) {
const clean = kw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase();
if (clean && !isNoise(clean)) {
const prev = keywords.get(clean);
if (!prev || prev.weight < weight) {
keywords.set(clean, { weight, tier: 'core' });
}
}
}
}
// 列表格式: - **类别**: `kw1`, `kw2` (strong 层 0.8, v5.9.1: 加句式过滤)
const listItems = content.matchAll(/-\s*\*\*[^*]+\*\*\s*[:]\s*(.+)/g);
for (const item of listItems) {
const kwStr = item[1];
const { weight } = classifyKeywordTier('list');
for (const kw of kwStr.split(/[,、]/)) {
const clean = kw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase();
if (clean && !isNoise(clean)) {
// 过滤指令性句子 (含动宾结构的长文本不是关键词)
if (clean.length > 12 && /[\u4e00-\u9fff]/.test(clean)) continue;
const prev = keywords.get(clean);
if (!prev || prev.weight < weight) {
keywords.set(clean, { weight, tier: 'strong' });
}
}
}
}
// 2. 从 description 提取 (extended 层 0.5)
const frontmatter = parseFrontmatter(content);
if (frontmatter.description) {
const desc = frontmatter.description;
const { weight } = classifyKeywordTier('description');
// 提取中文词组 (2-6 字符)
const cnWords = desc.matchAll(/[\u4e00-\u9fff]{2,6}/g);
for (const w of cnWords) {
const kw = w[0].toLowerCase();
if (!isNoise(kw)) {
const prev = keywords.get(kw);
if (!prev || prev.weight < weight) {
keywords.set(kw, { weight, tier: 'extended' });
}
}
}
// 提取英文词/短语
const enWords = desc.matchAll(/[A-Za-z][\w.-]*(?:\s+[A-Za-z][\w.-]*){0,2}/g);
for (const w of enWords) {
const kw = w[0].toLowerCase().trim();
if (kw.length >= 3 && !isNoise(kw)) {
const prev = keywords.get(kw);
if (!prev || prev.weight < weight) {
keywords.set(kw, { weight, tier: 'extended' });
}
}
}
}
// 3. 提取带引号的触发示例 (core 层 1.0, v5.9.1: 加句式过滤)
const quoted = content.matchAll(/["\u201c]([^"\u201d]{2,20})["\u201d]/g);
for (const q of quoted) {
const kw = q[1].toLowerCase().trim();
const { weight } = classifyKeywordTier('quoted');
if (kw.length >= 2 && !isNoise(kw)) {
// 过滤句子级内容: 含标点或动宾结构的完整句子不是关键词
if (kw.length > 10 && /[,。!?、;:]/.test(kw)) continue;
// 过滤含"的"的描述性短语 (如 "阻力最小的路径")
if (kw.length > 6 && /[\u4e00-\u9fff].*的.*[\u4e00-\u9fff]/.test(kw)) continue;
const prev = keywords.get(kw);
if (!prev || prev.weight < weight) {
keywords.set(kw, { weight, tier: 'core' });
}
}
}
// 过滤噪声词 (v5.9.1: 大幅增强停用词表)
const STOP_WORDS = new Set([
// 英文通用
'the', 'and', 'for', 'this', 'that', 'with', 'from', 'when', 'use',
'output', 'style', 'reference', 'references',
// 中文描述碎片 (从 description 泄漏)
'当用户', '使用此', '时使用', '需要', '或说', '此技能', '当', '的', '了',
'本技能', '内联', '输出', '规范', '深度专家', '专家',
'时使用此技能', '当用户需要', '当用户需要进', '使用此技能',
// 通用动词/泛化词 (区分度极低)
'开发', '设计', '优化', '分析', '管理', '处理', '实现', '构建',
]);
for (const kw of keywords.keys()) {
if (STOP_WORDS.has(kw)) keywords.delete(kw);
}
// 4. 中英同义词自动补齐 (v5.9.1: 提升双语路由精度)
const SYNONYM_PAIRS = [
['测试', 'test'], ['部署', 'deploy'], ['安全', 'security'],
['数据库', 'database'], ['前端', 'frontend'], ['后端', 'backend'],
['架构', 'architecture'], ['性能', 'performance'], ['监控', 'monitoring'],
['容器', 'container'], ['微服务', 'microservice'], ['缓存', 'cache'],
['日志', 'log'], ['调试', 'debug'], ['重构', 'refactor'],
['认证', 'auth'], ['加密', 'encryption'], ['中间件', 'middleware'],
['索引', 'index'], ['事务', 'transaction'], ['队列', 'queue'],
['集群', 'cluster'], ['负载均衡', 'load balancing'], ['网关', 'gateway'],
['回滚', 'rollback'], ['迁移', 'migration'], ['接口', 'interface'],
['组件', 'component'], ['模板', 'template'], ['路由', 'routing'],
['钩子', 'hook'], ['插件', 'plugin'], ['配置', 'config'],
];
for (const [cn, en] of SYNONYM_PAIRS) {
const hasCn = [...keywords.keys()].some(k => k.includes(cn));
const hasEn = [...keywords.keys()].some(k => k.includes(en));
if (hasCn && !hasEn && !STOP_WORDS.has(en)) {
keywords.set(en, { weight: 0.5, tier: 'alias' });
} else if (!hasCn && hasEn && !STOP_WORDS.has(cn)) {
keywords.set(cn, { weight: 0.5, tier: 'alias' });
}
}
// 转为排序数组 (包含 tier 字段)
return Array.from(keywords.entries())
.sort((a, b) => b[1].weight - a[1].weight)
.map(([keyword, { weight, tier }]) => ({
keyword,
weight: Math.round(weight * 100) / 100,
tier,
}));
}
// === 提取 composable 字段 (v4.1) ===
function extractComposable(content) {
// 先检查布尔形式: composable: true
const boolMatch = content.match(/composable:\s*true\b/);
if (boolMatch) {
return { isComposable: true, enhances: [], requires: [], conflicts: [] };
}
// 再检查嵌套块形式
const blockMatch = content.match(/composable:\s*\n((?:\s+\w+:.*\n?)*)/);
if (!blockMatch) return null;
const block = blockMatch[1];
const composable = { isComposable: true };
const reqMatch = block.match(/requires:\s*\[([^\]]*)\]/);
composable.requires = reqMatch ? reqMatch[1].split(',').map(s => s.trim()).filter(Boolean) : [];
const enhMatch = block.match(/enhances:\s*\[([^\]]*)\]/);
composable.enhances = enhMatch ? enhMatch[1].split(',').map(s => s.trim()).filter(Boolean) : [];
const confMatch = block.match(/conflicts:\s*\[([^\]]*)\]/);
composable.conflicts = confMatch ? confMatch[1].split(',').map(s => s.trim()).filter(Boolean) : [];
return composable;
}
// === 第 3 层防线: 关键词冲突检测 (编译期消歧) ===
function detectKeywordCollisions(index) {
// 收集所有 core 层关键词 → 对应技能列表
const coreMap = new Map(); // keyword → [skill1, skill2, ...]
for (const skill of index.skills) {
for (const kw of skill.keywords) {
if (kw.tier === 'core') {
const list = coreMap.get(kw.keyword) || [];
list.push(skill.name);
coreMap.set(kw.keyword, list);
}
}
}
// 找出被 2+ 个技能共享的 core 关键词
const collisions = [];
for (const [keyword, skills] of coreMap.entries()) {
if (skills.length >= 2) {
collisions.push({ keyword, skills, count: skills.length });
}
}
return collisions.sort((a, b) => b.count - a.count);
}
// === 主流程 ===
function main() {
const showStats = process.argv.includes('--stats');
const skillDirs = fs.readdirSync(SKILLS_DIR).filter(d => {
return fs.existsSync(path.join(SKILLS_DIR, d, 'SKILL.md'));
}).sort();
const index = {
generated: new Date().toISOString(),
version: (function() {
try {
var md = require('fs').readFileSync(require('path').join(CLAUDE_ROOT, 'CLAUDE.md'), 'utf8');
var m = md.match(/Smart Assistant.*?(v[\d.]+)/);
return m ? m[1] : 'v6.2';
} catch { return 'v6.2'; }
})(),
skillCount: skillDirs.length,
skills: [],
};
let totalKeywords = 0;
for (const dir of skillDirs) {
const skillFile = path.join(SKILLS_DIR, dir, 'SKILL.md');
const content = fs.readFileSync(skillFile, 'utf8');
const frontmatter = parseFrontmatter(content);
const keywords = extractKeywords(content);
const composable = extractComposable(content);
const entry = {
name: frontmatter.name || dir,
description: (frontmatter.description || '').slice(0, 200),
maturity: frontmatter.maturity || 'unknown',
isComposable: !!(composable || frontmatter.composable === 'true'), // v5.8: 布尔或嵌套块均计入
allowedTools: (frontmatter['allowed-tools'] || '').split(',').map(s => s.trim()).filter(Boolean),
keywords,
};
if (composable) entry.composable = composable;
index.skills.push(entry);
totalKeywords += keywords.length;
}
// v4.9: TF-IDF 加权 (如果引擎可用)
try {
const tfidf = require('./tfidf-engine.js');
tfidf.applyTFIDFWeights(index);
} catch (e) {
// tfidf-engine.js 尚未创建时 fallback 静默
if (!e.code || e.code !== 'MODULE_NOT_FOUND') {
console.warn(' [warn] TF-IDF 加权失败:', e.message);
}
}
// v5.9.1: 长期休眠技能降权标记 (B+C 类: 导入语言 + 商业技能)
// 从 route-stats.json 读取活跃技能30天零命中的 imported/business 技能标记 coldPenalty
const BUSINESS_SKILLS = new Set([
'business-plan-skill', 'copywriter-expert', 'customer-success-expert',
'email-communicator', 'finance-advisor', 'industry-research-cn',
'investor-review-guide', 'pricing-strategist', 'product-manager-expert',
'project-coordinator', 'sales-consultant', 'social-media-manager',
'technical-seo-expert', 'ux-researcher',
]);
try {
const statsPath = path.join(CLAUDE_ROOT, 'debug', 'route-stats.json');
if (fs.existsSync(statsPath)) {
const routeStats = JSON.parse(fs.readFileSync(statsPath, 'utf8'));
const activeSkills = new Set(Object.keys(routeStats.stats || {}));
let penalizedCount = 0;
for (const skill of index.skills) {
const isImported = skill.maturity === 'imported';
const isBusiness = BUSINESS_SKILLS.has(skill.name);
if ((isImported || isBusiness) && !activeSkills.has(skill.name)) {
skill.coldPenalty = 0.5; // BM25 评分 ×0.5
penalizedCount++;
}
}
if (penalizedCount > 0) {
console.log(` [cold-penalty] ${penalizedCount} 个休眠技能已标记 coldPenalty=0.5`);
}
}
} catch {}
// 第 3 层防线: 编译期关键词冲突检测
const collisions = detectKeywordCollisions(index);
if (collisions.length > 0) {
console.warn('\n [COLLISION] 以下 core 关键词被多个技能共享,可能导致路由歧义:');
for (const c of collisions) {
console.warn(` "${c.keyword}" → ${c.skills.join(', ')}`);
}
console.warn(`${collisions.length} 个冲突,建议在 SKILL.md 中消歧或调整权重层级\n`);
}
// 写入索引文件 (附带冲突报告)
index.collisions = collisions;
fs.writeFileSync(OUTPUT_FILE, JSON.stringify(index, null, 2) + '\n');
// === AGENTS_DIR_SCAN_PATCH_2026_04_20 ===
// 扫描 agents/*.md将 agent 条目注入 skills 数组type:'agent'),同时写出 skills-index-lite.json
try {
if (fs.existsSync(AGENTS_DIR)) {
const agentFiles = fs.readdirSync(AGENTS_DIR).filter(f => f.endsWith('.md')).sort();
let agentCount = 0;
for (const file of agentFiles) {
const agentPath = path.join(AGENTS_DIR, file);
const content = fs.readFileSync(agentPath, 'utf8');
const fm = parseFrontmatter(content);
if (!fm.name) continue;
const keywords = extractKeywords(content);
const entry = {
name: fm.name,
description: (fm.description || '').slice(0, 200),
maturity: 'agent',
type: 'agent',
isComposable: false,
allowedTools: (fm['allowed-tools'] || '').split(',').map(s => s.trim()).filter(Boolean),
keywords,
};
index.skills.push(entry);
agentCount++;
}
console.log(' [agents] ' + agentCount + ' agent(s) injected into index');
}
} catch (e) {
console.warn(' [agents] 扫描失败 (非致命):', e.message);
}
// 写出 skills-index-lite.json (路由引擎实际使用的是 lite 版本)
fs.writeFileSync(OUTPUT_LITE, JSON.stringify(index, null, 2) + '\n');
console.log(' [lite] skills-index-lite.json 已同步 (' + index.skills.length + ' entries total)');
const composableCount = index.skills.filter(s => s.isComposable).length;
console.log(`skills-index.json generated:`);
console.log(` Skills: ${index.skillCount} (${composableCount} composable)`);
console.log(` Keywords: ${totalKeywords} (avg ${Math.round(totalKeywords / index.skillCount)} per skill)`);
console.log(` Output: ${OUTPUT_FILE}`);
if (showStats) {
console.log('\nPer-skill keyword count:');
for (const s of index.skills) {
const bar = '\u2588'.repeat(Math.min(s.keywords.length, 40));
console.log(` ${s.name.padEnd(35)} ${String(s.keywords.length).padStart(3)} ${bar}`);
}
}
}
// 导出核心函数供测试使用
if (typeof module !== 'undefined') {
module.exports = { parseFrontmatter, extractKeywords, extractComposable, classifyKeywordTier, isNoise, detectKeywordCollisions };
}
if (require.main === module) {
main();
}