bookworm-smart-assistant/scripts/generate-skill-index.js

448 lines
17 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
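/**
 * Skill semantic index generator
 *
 * Reads every SKILL.md, extracts the description and trigger keywords,
 * and generates skills-index.json, a weighted keyword map (for use by route-analyzer).
 *
 * Usage:
 *   node scripts/generate-skill-index.js          # generate the index
 *   node scripts/generate-skill-index.js --stats  # generate the index and print statistics
 */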
const fs = require('fs');
const path = require('path');
// Resolve the installation root from the project-local paths.config.js
// (its `PATHS.root` export). The require is wrapped in a function but is
// invoked immediately on the next line, i.e. at module load time.
const detectClaudeRoot = () => require('./paths.config.js').PATHS.root;
const CLAUDE_ROOT = detectClaudeRoot();
// Directory scanned for `<skill>/SKILL.md` files.
const SKILLS_DIR = path.join(CLAUDE_ROOT, 'skills');
// Full index output file.
const OUTPUT_FILE = path.join(CLAUDE_ROOT, 'skills-index.json');
// AGENTS_DIR_SCAN_PATCH_2026_04_20
// Directory scanned for agent definitions (`*.md`).
const AGENTS_DIR = path.join(CLAUDE_ROOT, 'agents');
// Second output file; written after agent entries have been appended to the index.
const OUTPUT_LITE = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
// === YAML frontmatter parsing (lightweight, dependency-free) ===
/**
 * Parse the leading `---` ... `---` frontmatter block of a markdown document.
 *
 * This is a deliberately minimal YAML subset:
 *   - flat `key: value` pairs (values are returned as trimmed raw strings;
 *     surrounding quotes are NOT stripped);
 *   - block scalars introduced by `>` or `|`, optionally followed by a
 *     chomping indicator (`>-`, `>+`, `|-`, `|+`); their continuation lines
 *     are trimmed and joined with single spaces.
 * Nested mappings and sequences are not interpreted: outside a block scalar,
 * any line that is not a top-level `key: value` is ignored.
 *
 * A leading UTF-8 BOM and CRLF line endings are normalised first so that
 * Windows-authored files parse the same as LF files.
 *
 * @param {string} content Full file content.
 * @returns {Object<string, string>} Parsed keys, or `{}` when no frontmatter is found.
 */
function parseFrontmatter(content) {
  // Without this normalisation the `^---\n` anchor below never matches
  // files saved with a BOM or with CRLF line endings.
  const text = content.replace(/^\uFEFF/, '').replace(/\r\n/g, '\n');
  const match = text.match(/^---\n([\s\S]*?)\n---/);
  if (!match) return {};

  const result = {};
  let currentKey = null;
  let currentValue = '';
  let isMultiline = false;

  // Store the value accumulated so far for the key being parsed.
  const flush = () => {
    if (currentKey) result[currentKey] = currentValue.trim();
  };

  // Begin a new top-level key; a block scalar header switches to multiline mode.
  const beginKey = (kvMatch) => {
    currentKey = kvMatch[1];
    const val = kvMatch[2].trim();
    isMultiline = /^[>|][+-]?$/.test(val);
    currentValue = isMultiline ? '' : val;
  };

  for (const line of match[1].split('\n')) {
    const kvMatch = line.match(/^(\w[\w-]*):\s*(.*)/);

    if (!isMultiline) {
      // Single-line mode: only `key: value` lines are meaningful.
      if (kvMatch) {
        flush();
        beginKey(kvMatch);
      }
      continue;
    }

    if (/^\S/.test(line)) {
      // A non-indented line terminates the block scalar.
      flush();
      if (kvMatch) beginKey(kvMatch);
    } else {
      // Indented (or blank) continuation line of the block scalar.
      currentValue += ' ' + line.trim();
    }
  }

  flush();
  return result;
}
// === Keyword tier classification (v4.9) ===
// core: table / quoted = 1.0, strong: list = 0.8, extended: description = 0.5
/**
 * Map the place a keyword was found to its tier name and base weight.
 *
 * @param {string} source One of 'table', 'quoted', 'list', 'description'.
 * @returns {{tier: string, weight: number}} A fresh object on every call;
 *   any unrecognised source gets the 'extended' tier.
 */
function classifyKeywordTier(source) {
  if (source === 'table' || source === 'quoted') {
    return { tier: 'core', weight: 1.0 };
  }
  if (source === 'list') {
    return { tier: 'strong', weight: 0.8 };
  }
  // 'description' and every unknown source share the lowest tier.
  return { tier: 'extended', weight: 0.5 };
}
// === Separator / noise filtering (v5.9.1: strengthened) ===
const SEPARATOR_RE = /^[-|:\s.…]+$/;
/**
 * Decide whether a candidate keyword should be discarded.
 *
 * A candidate is noise when any of the following holds (checked in this order):
 *   - it consists only of separator characters (dashes, pipes, colons, whitespace, dots, ellipsis);
 *   - it is shorter than 2 or longer than 30 characters;
 *   - it is longer than 8 characters, contains a CJK character and one of the
 *     listed connective characters (treated as a fragment of descriptive prose);
 *   - it looks like a file path fragment (`reference/`, `references/`, or ends in `.md`);
 *   - it is a bare ellipsis.
 *
 * @param {string} str Candidate keyword.
 * @returns {boolean} true when the candidate should be dropped.
 */
function isNoise(str) {
  return (
    SEPARATOR_RE.test(str) ||
    str.length < 2 ||
    str.length > 30 ||
    (str.length > 8 && /[的了或在与及和是从到]/.test(str) && /[\u4e00-\u9fff]/.test(str)) ||
    /^references?\//.test(str) ||
    /\.md$/.test(str) ||
    str === '...' ||
    str === '…'
  );
}
// === Keyword extraction ===
/**
 * Extract weighted routing keywords from a markdown document (frontmatter included).
 *
 * Sources, in processing order. When a keyword is found more than once, the
 * entry with the strictly higher weight wins; on equal weight the first one stays.
 *   1. second column of markdown table rows                      -> core (1.0)
 *      list items of the form `- **Category**: kw1, kw2`          -> strong (0.8)
 *   2. frontmatter `description`: CJK runs of 2-6 characters and
 *      English phrases of up to three words                       -> extended (0.5)
 *   3. short (2-20 character) quoted snippets                     -> core (1.0)
 *   4. the CN/EN counterpart of any keyword containing a known
 *      synonym, when the counterpart is not already present       -> alias (0.5)
 * Stop words are removed after step 3 and before step 4.
 *
 * Delimiters are recognised in both ASCII and full-width form (`,`/`\uFF0C`,
 * `:`/`\uFF1A`, ...) because the scanned documents are largely written in
 * Chinese; the full-width forms are spelled as escapes so they cannot be
 * confused with their ASCII look-alikes.
 *
 * @param {string} content Full markdown text.
 * @returns {Array<{keyword: string, weight: number, tier: string}>}
 *   Keywords sorted by descending weight (ties keep discovery order).
 */
function extractKeywords(content) {
  const keywords = new Map(); // keyword -> { weight, tier }

  // Keyword list separators: ASCII comma, full-width comma, ideographic comma.
  const LIST_DELIMITER_RE = /[,\uFF0C、]/;
  // Punctuation that marks a quoted snippet as a sentence rather than a keyword.
  const SENTENCE_PUNCT_RE = /[,\uFF0C。!\uFF01?\uFF1F、;\uFF1B:\uFF1A]/;

  // Insert the keyword, or upgrade it when the new source carries more weight.
  const upsert = (keyword, source) => {
    const { weight, tier } = classifyKeywordTier(source);
    const prev = keywords.get(keyword);
    if (!prev || prev.weight < weight) {
      keywords.set(keyword, { weight, tier });
    }
  };

  // Strip backticks, collapse whitespace and lowercase a raw list/table token.
  const normalizeToken = (raw) =>
    raw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase();

  // 1a. Trigger-keyword tables: the second cell of each row (core, 1.0).
  for (const row of content.matchAll(/\|\s*[^|]+\|\s*([^|]+)\|/g)) {
    const cell = row[1].trim();
    // Skip the header cell and the markdown separator row.
    if (cell === '关键词' || cell === '---') continue;
    for (const part of cell.split(LIST_DELIMITER_RE)) {
      const clean = normalizeToken(part);
      if (clean && !isNoise(clean)) upsert(clean, 'table');
    }
  }

  // 1b. List format: - **Category**: `kw1`, `kw2` (strong, 0.8).
  for (const item of content.matchAll(/-\s*\*\*[^*]+\*\*\s*[:\uFF1A]\s*(.+)/g)) {
    for (const part of item[1].split(LIST_DELIMITER_RE)) {
      const clean = normalizeToken(part);
      if (!clean || isNoise(clean)) continue;
      // Long CJK text in a list item is an instruction sentence, not a keyword.
      if (clean.length > 12 && /[\u4e00-\u9fff]/.test(clean)) continue;
      upsert(clean, 'list');
    }
  }

  // 2. Frontmatter description (extended, 0.5).
  const frontmatter = parseFrontmatter(content);
  if (frontmatter.description) {
    const desc = frontmatter.description;

    // CJK runs of 2-6 characters.
    for (const hit of desc.matchAll(/[\u4e00-\u9fff]{2,6}/g)) {
      const kw = hit[0].toLowerCase();
      if (!isNoise(kw)) upsert(kw, 'description');
    }

    // English words / phrases of up to three whitespace-separated words.
    for (const hit of desc.matchAll(/[A-Za-z][\w.-]*(?:\s+[A-Za-z][\w.-]*){0,2}/g)) {
      const kw = hit[0].toLowerCase().trim();
      if (kw.length >= 3 && !isNoise(kw)) upsert(kw, 'description');
    }
  }

  // 3. Quoted trigger examples, straight or curly double quotes (core, 1.0).
  for (const hit of content.matchAll(/["\u201c]([^"\u201d]{2,20})["\u201d]/g)) {
    const kw = hit[1].toLowerCase().trim();
    if (kw.length < 2 || isNoise(kw)) continue;
    // Sentence-level content (contains punctuation) is not a keyword.
    if (kw.length > 10 && SENTENCE_PUNCT_RE.test(kw)) continue;
    // Descriptive phrases built around "的" (e.g. "阻力最小的路径") are not keywords.
    if (kw.length > 6 && /[\u4e00-\u9fff].*的.*[\u4e00-\u9fff]/.test(kw)) continue;
    upsert(kw, 'quoted');
  }

  // Remove stop words (v5.9.1: greatly extended list).
  const STOP_WORDS = new Set([
    // Generic English
    'the', 'and', 'for', 'this', 'that', 'with', 'from', 'when', 'use',
    'output', 'style', 'reference', 'references',
    // Chinese fragments leaking from descriptions
    '当用户', '使用此', '时使用', '需要', '或说', '此技能', '当', '的', '了',
    '本技能', '内联', '输出', '规范', '深度专家', '专家',
    '时使用此技能', '当用户需要', '当用户需要进', '使用此技能',
    // Generic verbs / catch-all words with very low discriminating power
    '开发', '设计', '优化', '分析', '管理', '处理', '实现', '构建',
  ]);
  // Deleting entries from a Map while iterating over it is well-defined in JavaScript.
  for (const kw of keywords.keys()) {
    if (STOP_WORDS.has(kw)) keywords.delete(kw);
  }

  // 4. CN/EN synonym completion (v5.9.1: improves bilingual routing precision).
  const SYNONYM_PAIRS = [
    ['测试', 'test'], ['部署', 'deploy'], ['安全', 'security'],
    ['数据库', 'database'], ['前端', 'frontend'], ['后端', 'backend'],
    ['架构', 'architecture'], ['性能', 'performance'], ['监控', 'monitoring'],
    ['容器', 'container'], ['微服务', 'microservice'], ['缓存', 'cache'],
    ['日志', 'log'], ['调试', 'debug'], ['重构', 'refactor'],
    ['认证', 'auth'], ['加密', 'encryption'], ['中间件', 'middleware'],
    ['索引', 'index'], ['事务', 'transaction'], ['队列', 'queue'],
    ['集群', 'cluster'], ['负载均衡', 'load balancing'], ['网关', 'gateway'],
    ['回滚', 'rollback'], ['迁移', 'migration'], ['接口', 'interface'],
    ['组件', 'component'], ['模板', 'template'], ['路由', 'routing'],
    ['钩子', 'hook'], ['插件', 'plugin'], ['配置', 'config'],
  ];
  for (const [cn, en] of SYNONYM_PAIRS) {
    // Snapshot once per pair; aliases added for earlier pairs are visible here.
    const existing = [...keywords.keys()];
    const hasCn = existing.some((k) => k.includes(cn));
    const hasEn = existing.some((k) => k.includes(en));
    if (hasCn && !hasEn && !STOP_WORDS.has(en)) {
      keywords.set(en, { weight: 0.5, tier: 'alias' });
    } else if (!hasCn && hasEn && !STOP_WORDS.has(cn)) {
      keywords.set(cn, { weight: 0.5, tier: 'alias' });
    }
  }

  // Convert to an array sorted by descending weight (tier field included).
  return Array.from(keywords.entries())
    .sort((a, b) => b[1].weight - a[1].weight)
    .map(([keyword, { weight, tier }]) => ({
      keyword,
      weight: Math.round(weight * 100) / 100,
      tier,
    }));
}
// === Extract the composable field (v4.1) ===
/**
 * Read the `composable` declaration from a document.
 *
 * Two forms are recognised, searched anywhere in `content`:
 *   - boolean: `composable: true`            -> all relation lists empty;
 *   - nested block: `composable:` followed by indented `key: [a, b]` lines,
 *     from which `requires`, `enhances` and `conflicts` are read.
 *
 * @param {string} content Full markdown text.
 * @returns {?{isComposable: boolean, requires: string[], enhances: string[], conflicts: string[]}}
 *   The composable descriptor, or null when neither form is present.
 */
function extractComposable(content) {
  // Boolean form takes precedence over the nested block form.
  if (content.match(/composable:\s*true\b/)) {
    return { isComposable: true, enhances: [], requires: [], conflicts: [] };
  }

  const nested = content.match(/composable:\s*\n((?:\s+\w+:.*\n?)*)/);
  if (!nested) return null;

  const body = nested[1];
  // Inline-array fields, in the order they appear on the returned object.
  const fieldPatterns = [
    ['requires', /requires:\s*\[([^\]]*)\]/],
    ['enhances', /enhances:\s*\[([^\]]*)\]/],
    ['conflicts', /conflicts:\s*\[([^\]]*)\]/],
  ];

  const descriptor = { isComposable: true };
  for (const [field, pattern] of fieldPatterns) {
    const hit = body.match(pattern);
    descriptor[field] = hit
      ? hit[1].split(',').map((item) => item.trim()).filter(Boolean)
      : [];
  }
  return descriptor;
}
// === Third line of defence: keyword collision detection (build-time disambiguation) ===
/**
 * Find core-tier keywords that are claimed by two or more skills.
 *
 * @param {{skills: Array<{name: string, keywords: Array<{keyword: string, tier: string}>}>}} index
 * @returns {Array<{keyword: string, skills: string[], count: number}>}
 *   One entry per shared core keyword, sorted by descending number of skills.
 */
function detectKeywordCollisions(index) {
  // core keyword -> names of the skills that declare it
  const ownersByKeyword = new Map();
  for (const skill of index.skills) {
    const coreKeywords = skill.keywords.filter((kw) => kw.tier === 'core');
    for (const { keyword } of coreKeywords) {
      if (!ownersByKeyword.has(keyword)) ownersByKeyword.set(keyword, []);
      ownersByKeyword.get(keyword).push(skill.name);
    }
  }

  // Keep only keywords shared by 2+ skills, most contested first.
  return [...ownersByKeyword]
    .filter(([, skills]) => skills.length >= 2)
    .map(([keyword, skills]) => ({ keyword, skills, count: skills.length }))
    .sort((a, b) => b.count - a.count);
}
// === Main flow ===
/**
 * Build the skill index and write it to disk.
 *
 * Steps:
 *   1. Index every `<SKILLS_DIR>/<dir>/SKILL.md` (frontmatter, keywords, composable info).
 *   2. Apply TF-IDF weighting when `./tfidf-engine.js` can be loaded.
 *   3. Mark imported / business skills that are absent from `debug/route-stats.json`
 *      with `coldPenalty = 0.5`.
 *   4. Detect core keywords shared by several skills and attach the report.
 *   5. Write OUTPUT_FILE, then append agent entries from AGENTS_DIR and write OUTPUT_LITE.
 *   6. Print a summary; with `--stats`, also a per-entry keyword histogram.
 *
 * Failures in steps 2, 3 and the agent scan are reported as warnings and do not
 * abort generation. Errors while reading skills or writing output files propagate.
 */
function main() {
  const showStats = process.argv.includes('--stats');
  const skillDirs = fs
    .readdirSync(SKILLS_DIR)
    .filter((d) => fs.existsSync(path.join(SKILLS_DIR, d, 'SKILL.md')))
    .sort();

  const index = {
    generated: new Date().toISOString(),
    // Version is scraped from CLAUDE.md; falls back to a fixed default when
    // the file is missing or carries no recognisable version string.
    version: (() => {
      try {
        const md = fs.readFileSync(path.join(CLAUDE_ROOT, 'CLAUDE.md'), 'utf8');
        const m = md.match(/Smart Assistant.*?(v[\d.]+)/);
        return m ? m[1] : 'v6.2';
      } catch {
        return 'v6.2';
      }
    })(),
    skillCount: skillDirs.length,
    skills: [],
  };

  let totalKeywords = 0;
  for (const dir of skillDirs) {
    const skillFile = path.join(SKILLS_DIR, dir, 'SKILL.md');
    const content = fs.readFileSync(skillFile, 'utf8');
    const frontmatter = parseFrontmatter(content);
    const keywords = extractKeywords(content);
    const composable = extractComposable(content);
    const entry = {
      name: frontmatter.name || dir,
      description: (frontmatter.description || '').slice(0, 200),
      maturity: frontmatter.maturity || 'unknown',
      // v5.8: both the boolean form and the nested block form count.
      isComposable: !!(composable || frontmatter.composable === 'true'),
      allowedTools: (frontmatter['allowed-tools'] || '').split(',').map((s) => s.trim()).filter(Boolean),
      keywords,
    };
    if (composable) entry.composable = composable;
    index.skills.push(entry);
    totalKeywords += keywords.length;
  }

  // v4.9: TF-IDF weighting (when the engine is available).
  try {
    const tfidf = require('./tfidf-engine.js');
    tfidf.applyTFIDFWeights(index);
  } catch (e) {
    // Stay silent only when the module could not be resolved (engine not created yet).
    if (e.code !== 'MODULE_NOT_FOUND') {
      console.warn(' [warn] TF-IDF 加权失败:', e.message);
    }
  }

  // v5.9.1: down-weight long-dormant skills (imported skills + business skills).
  // A skill counts as dormant when its name is not a key of `stats` in
  // route-stats.json. NOTE(review): the original note says that file holds
  // skills hit within the last 30 days; that window is not enforced here.
  const BUSINESS_SKILLS = new Set([
    'business-plan-skill', 'copywriter-expert', 'customer-success-expert',
    'email-communicator', 'finance-advisor', 'industry-research-cn',
    'investor-review-guide', 'pricing-strategist', 'product-manager-expert',
    'project-coordinator', 'sales-consultant', 'social-media-manager',
    'technical-seo-expert', 'ux-researcher',
  ]);
  try {
    const statsPath = path.join(CLAUDE_ROOT, 'debug', 'route-stats.json');
    if (fs.existsSync(statsPath)) {
      const routeStats = JSON.parse(fs.readFileSync(statsPath, 'utf8'));
      const activeSkills = new Set(Object.keys(routeStats.stats || {}));
      let penalizedCount = 0;
      for (const skill of index.skills) {
        const isImported = skill.maturity === 'imported';
        const isBusiness = BUSINESS_SKILLS.has(skill.name);
        if ((isImported || isBusiness) && !activeSkills.has(skill.name)) {
          skill.coldPenalty = 0.5; // BM25 score x0.5
          penalizedCount++;
        }
      }
      if (penalizedCount > 0) {
        console.log(` [cold-penalty] ${penalizedCount} 个休眠技能已标记 coldPenalty=0.5`);
      }
    }
  } catch (e) {
    // Best-effort: an unreadable or corrupt stats file must not abort index
    // generation, but it is reported instead of being skipped silently.
    console.warn(' [cold-penalty] 读取 route-stats.json 失败 (非致命):', e.message);
  }

  // Third line of defence: build-time keyword collision detection.
  const collisions = detectKeywordCollisions(index);
  if (collisions.length > 0) {
    console.warn('\n [COLLISION] 以下 core 关键词被多个技能共享,可能导致路由歧义:');
    for (const c of collisions) {
      console.warn(` "${c.keyword}" → ${c.skills.join(', ')}`);
    }
    console.warn(`${collisions.length} 个冲突,建议在 SKILL.md 中消歧或调整权重层级\n`);
  }

  // Write the index file (collision report attached). This happens BEFORE agent
  // entries are appended, so OUTPUT_FILE contains skills only.
  index.collisions = collisions;
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(index, null, 2) + '\n');

  // === AGENTS_DIR_SCAN_PATCH_2026_04_20 ===
  // Scan agents/*.md and inject each agent into the skills array (type: 'agent');
  // the result is written to skills-index-lite.json below.
  try {
    if (fs.existsSync(AGENTS_DIR)) {
      const agentFiles = fs.readdirSync(AGENTS_DIR).filter((f) => f.endsWith('.md')).sort();
      let agentCount = 0;
      for (const file of agentFiles) {
        const agentPath = path.join(AGENTS_DIR, file);
        const content = fs.readFileSync(agentPath, 'utf8');
        const fm = parseFrontmatter(content);
        // Files without a frontmatter `name` are not treated as agents.
        if (!fm.name) continue;
        const keywords = extractKeywords(content);
        const entry = {
          name: fm.name,
          description: (fm.description || '').slice(0, 200),
          maturity: 'agent',
          type: 'agent',
          isComposable: false,
          allowedTools: (fm['allowed-tools'] || '').split(',').map((s) => s.trim()).filter(Boolean),
          keywords,
        };
        index.skills.push(entry);
        agentCount++;
      }
      console.log(' [agents] ' + agentCount + ' agent(s) injected into index');
    }
  } catch (e) {
    console.warn(' [agents] 扫描失败 (非致命):', e.message);
  }

  // Write skills-index-lite.json (per the original note, the routing engine
  // actually consumes the lite file).
  fs.writeFileSync(OUTPUT_LITE, JSON.stringify(index, null, 2) + '\n');
  console.log(' [lite] skills-index-lite.json 已同步 (' + index.skills.length + ' entries total)');

  const composableCount = index.skills.filter((s) => s.isComposable).length;
  // Guard the average so an empty skills directory prints 0 rather than NaN.
  const avgKeywords = index.skillCount > 0 ? Math.round(totalKeywords / index.skillCount) : 0;
  console.log(`skills-index.json generated:`);
  console.log(` Skills: ${index.skillCount} (${composableCount} composable)`);
  console.log(` Keywords: ${totalKeywords} (avg ${avgKeywords} per skill)`);
  console.log(` Output: ${OUTPUT_FILE}`);

  if (showStats) {
    console.log('\nPer-skill keyword count:');
    for (const s of index.skills) {
      // Histogram bar, capped at 40 blocks.
      const bar = '\u2588'.repeat(Math.min(s.keywords.length, 40));
      console.log(` ${s.name.padEnd(35)} ${String(s.keywords.length).padStart(3)} ${bar}`);
    }
  }
}
// Export the core functions so tests can call them directly.
if (typeof module !== 'undefined') {
module.exports = { parseFrontmatter, extractKeywords, extractComposable, classifyKeywordTier, isNoise, detectKeywordCollisions };
}
// Run the generator only when this file is executed directly, not when it is require()d.
if (require.main === module) {
main();
}