#!/usr/bin/env node /** * 技能语义索引生成器 * * 读取全部 SKILL.md,提取 description + 触发关键词, * 生成 skills-index.json 加权关键词映射 (供 route-analyzer 使用) * * 用法: * node scripts/generate-skill-index.js # 生成索引 * node scripts/generate-skill-index.js --stats # 生成并显示统计 */ const fs = require('fs'); const path = require('path'); const detectClaudeRoot = () => require('./paths.config.js').PATHS.root; const CLAUDE_ROOT = detectClaudeRoot(); const SKILLS_DIR = path.join(CLAUDE_ROOT, 'skills'); const OUTPUT_FILE = path.join(CLAUDE_ROOT, 'skills-index.json'); // AGENTS_DIR_SCAN_PATCH_2026_04_20 const AGENTS_DIR = path.join(CLAUDE_ROOT, 'agents'); const OUTPUT_LITE = path.join(CLAUDE_ROOT, 'skills-index-lite.json'); // === YAML frontmatter 解析 (轻量, 无依赖) === function parseFrontmatter(content) { const match = content.match(/^---\n([\s\S]*?)\n---/); if (!match) return {}; const yaml = match[1]; const result = {}; // 简单 YAML 解析: key: value 或 key: >\n multiline let currentKey = null; let currentValue = ''; let isMultiline = false; for (const line of yaml.split('\n')) { if (!isMultiline) { const kvMatch = line.match(/^(\w[\w-]*):\s*(.*)/); if (kvMatch) { if (currentKey) result[currentKey] = currentValue.trim(); currentKey = kvMatch[1]; const val = kvMatch[2].trim(); if (val === '>' || val === '|') { isMultiline = true; currentValue = ''; } else { currentValue = val; isMultiline = false; } } } else { if (/^\S/.test(line) && !line.startsWith(' ')) { // 新的顶层 key if (currentKey) result[currentKey] = currentValue.trim(); const kvMatch = line.match(/^(\w[\w-]*):\s*(.*)/); if (kvMatch) { currentKey = kvMatch[1]; const val = kvMatch[2].trim(); if (val === '>' || val === '|') { isMultiline = true; currentValue = ''; } else { currentValue = val; isMultiline = false; } } } else { currentValue += ' ' + line.trim(); } } } if (currentKey) result[currentKey] = currentValue.trim(); return result; } // === 关键词层级分类 (v4.9) === // core: 表格/引号 = 1.0, strong: 列表 = 0.8, extended: 描述 = 0.5 function classifyKeywordTier(source) { switch (source) { case 'table': return { tier: 'core', weight: 1.0 }; case 'quoted': return { tier: 'core', weight: 1.0 }; case 'list': return { tier: 'strong', weight: 0.8 }; case 'description': return { tier: 'extended', weight: 0.5 }; default: return { tier: 'extended', weight: 0.5 }; } } // === 分隔符/噪声过滤 (v5.9.1: 增强) === const SEPARATOR_RE = /^[-|:\s.…]+$/; function isNoise(str) { if (SEPARATOR_RE.test(str)) return true; if (str.length < 2 || str.length > 30) return true; // 过滤描述文本碎片: 含句式连词的中文长词组 if (str.length > 8 && /[的了或在与及和是从到]/.test(str) && /[\u4e00-\u9fff]/.test(str)) return true; // 过滤文件路径碎片 if (/^references?\//.test(str) || /\.md$/.test(str)) return true; // 过滤省略号 if (str === '...' || str === '…') return true; return false; } // === 关键词提取 === function extractKeywords(content) { const keywords = new Map(); // keyword -> { weight, tier } // 1. 从触发关键词表提取 (core 层 1.0) const tableRows = content.matchAll(/\|\s*[^|]+\|\s*([^|]+)\|/g); for (const row of tableRows) { const cell = row[1].trim(); if (cell === '关键词' || cell === '---') continue; const { weight } = classifyKeywordTier('table'); for (const kw of cell.split(/[,、]/)) { const clean = kw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase(); if (clean && !isNoise(clean)) { const prev = keywords.get(clean); if (!prev || prev.weight < weight) { keywords.set(clean, { weight, tier: 'core' }); } } } } // 列表格式: - **类别**: `kw1`, `kw2` (strong 层 0.8, v5.9.1: 加句式过滤) const listItems = content.matchAll(/-\s*\*\*[^*]+\*\*\s*[::]\s*(.+)/g); for (const item of listItems) { const kwStr = item[1]; const { weight } = classifyKeywordTier('list'); for (const kw of kwStr.split(/[,、]/)) { const clean = kw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase(); if (clean && !isNoise(clean)) { // 过滤指令性句子 (含动宾结构的长文本不是关键词) if (clean.length > 12 && /[\u4e00-\u9fff]/.test(clean)) continue; const prev = keywords.get(clean); if (!prev || prev.weight < weight) { keywords.set(clean, { weight, tier: 'strong' }); } } } } // 2. 从 description 提取 (extended 层 0.5) const frontmatter = parseFrontmatter(content); if (frontmatter.description) { const desc = frontmatter.description; const { weight } = classifyKeywordTier('description'); // 提取中文词组 (2-6 字符) const cnWords = desc.matchAll(/[\u4e00-\u9fff]{2,6}/g); for (const w of cnWords) { const kw = w[0].toLowerCase(); if (!isNoise(kw)) { const prev = keywords.get(kw); if (!prev || prev.weight < weight) { keywords.set(kw, { weight, tier: 'extended' }); } } } // 提取英文词/短语 const enWords = desc.matchAll(/[A-Za-z][\w.-]*(?:\s+[A-Za-z][\w.-]*){0,2}/g); for (const w of enWords) { const kw = w[0].toLowerCase().trim(); if (kw.length >= 3 && !isNoise(kw)) { const prev = keywords.get(kw); if (!prev || prev.weight < weight) { keywords.set(kw, { weight, tier: 'extended' }); } } } } // 3. 提取带引号的触发示例 (core 层 1.0, v5.9.1: 加句式过滤) const quoted = content.matchAll(/["\u201c]([^"\u201d]{2,20})["\u201d]/g); for (const q of quoted) { const kw = q[1].toLowerCase().trim(); const { weight } = classifyKeywordTier('quoted'); if (kw.length >= 2 && !isNoise(kw)) { // 过滤句子级内容: 含标点或动宾结构的完整句子不是关键词 if (kw.length > 10 && /[,。!?、;:]/.test(kw)) continue; // 过滤含"的"的描述性短语 (如 "阻力最小的路径") if (kw.length > 6 && /[\u4e00-\u9fff].*的.*[\u4e00-\u9fff]/.test(kw)) continue; const prev = keywords.get(kw); if (!prev || prev.weight < weight) { keywords.set(kw, { weight, tier: 'core' }); } } } // 过滤噪声词 (v5.9.1: 大幅增强停用词表) const STOP_WORDS = new Set([ // 英文通用 'the', 'and', 'for', 'this', 'that', 'with', 'from', 'when', 'use', 'output', 'style', 'reference', 'references', // 中文描述碎片 (从 description 泄漏) '当用户', '使用此', '时使用', '需要', '或说', '此技能', '当', '的', '了', '本技能', '内联', '输出', '规范', '深度专家', '专家', '时使用此技能', '当用户需要', '当用户需要进', '使用此技能', // 通用动词/泛化词 (区分度极低) '开发', '设计', '优化', '分析', '管理', '处理', '实现', '构建', ]); for (const kw of keywords.keys()) { if (STOP_WORDS.has(kw)) keywords.delete(kw); } // 4. 中英同义词自动补齐 (v5.9.1: 提升双语路由精度) const SYNONYM_PAIRS = [ ['测试', 'test'], ['部署', 'deploy'], ['安全', 'security'], ['数据库', 'database'], ['前端', 'frontend'], ['后端', 'backend'], ['架构', 'architecture'], ['性能', 'performance'], ['监控', 'monitoring'], ['容器', 'container'], ['微服务', 'microservice'], ['缓存', 'cache'], ['日志', 'log'], ['调试', 'debug'], ['重构', 'refactor'], ['认证', 'auth'], ['加密', 'encryption'], ['中间件', 'middleware'], ['索引', 'index'], ['事务', 'transaction'], ['队列', 'queue'], ['集群', 'cluster'], ['负载均衡', 'load balancing'], ['网关', 'gateway'], ['回滚', 'rollback'], ['迁移', 'migration'], ['接口', 'interface'], ['组件', 'component'], ['模板', 'template'], ['路由', 'routing'], ['钩子', 'hook'], ['插件', 'plugin'], ['配置', 'config'], ]; for (const [cn, en] of SYNONYM_PAIRS) { const hasCn = [...keywords.keys()].some(k => k.includes(cn)); const hasEn = [...keywords.keys()].some(k => k.includes(en)); if (hasCn && !hasEn && !STOP_WORDS.has(en)) { keywords.set(en, { weight: 0.5, tier: 'alias' }); } else if (!hasCn && hasEn && !STOP_WORDS.has(cn)) { keywords.set(cn, { weight: 0.5, tier: 'alias' }); } } // 转为排序数组 (包含 tier 字段) return Array.from(keywords.entries()) .sort((a, b) => b[1].weight - a[1].weight) .map(([keyword, { weight, tier }]) => ({ keyword, weight: Math.round(weight * 100) / 100, tier, })); } // === 提取 composable 字段 (v4.1) === function extractComposable(content) { // 先检查布尔形式: composable: true const boolMatch = content.match(/composable:\s*true\b/); if (boolMatch) { return { isComposable: true, enhances: [], requires: [], conflicts: [] }; } // 再检查嵌套块形式 const blockMatch = content.match(/composable:\s*\n((?:\s+\w+:.*\n?)*)/); if (!blockMatch) return null; const block = blockMatch[1]; const composable = { isComposable: true }; const reqMatch = block.match(/requires:\s*\[([^\]]*)\]/); composable.requires = reqMatch ? reqMatch[1].split(',').map(s => s.trim()).filter(Boolean) : []; const enhMatch = block.match(/enhances:\s*\[([^\]]*)\]/); composable.enhances = enhMatch ? enhMatch[1].split(',').map(s => s.trim()).filter(Boolean) : []; const confMatch = block.match(/conflicts:\s*\[([^\]]*)\]/); composable.conflicts = confMatch ? confMatch[1].split(',').map(s => s.trim()).filter(Boolean) : []; return composable; } // === 第 3 层防线: 关键词冲突检测 (编译期消歧) === function detectKeywordCollisions(index) { // 收集所有 core 层关键词 → 对应技能列表 const coreMap = new Map(); // keyword → [skill1, skill2, ...] for (const skill of index.skills) { for (const kw of skill.keywords) { if (kw.tier === 'core') { const list = coreMap.get(kw.keyword) || []; list.push(skill.name); coreMap.set(kw.keyword, list); } } } // 找出被 2+ 个技能共享的 core 关键词 const collisions = []; for (const [keyword, skills] of coreMap.entries()) { if (skills.length >= 2) { collisions.push({ keyword, skills, count: skills.length }); } } return collisions.sort((a, b) => b.count - a.count); } // === 主流程 === function main() { const showStats = process.argv.includes('--stats'); const skillDirs = fs.readdirSync(SKILLS_DIR).filter(d => { return fs.existsSync(path.join(SKILLS_DIR, d, 'SKILL.md')); }).sort(); const index = { generated: new Date().toISOString(), version: (function() { try { var md = require('fs').readFileSync(require('path').join(CLAUDE_ROOT, 'CLAUDE.md'), 'utf8'); var m = md.match(/Smart Assistant.*?(v[\d.]+)/); return m ? m[1] : 'v6.2'; } catch { return 'v6.2'; } })(), skillCount: skillDirs.length, skills: [], }; let totalKeywords = 0; for (const dir of skillDirs) { const skillFile = path.join(SKILLS_DIR, dir, 'SKILL.md'); const content = fs.readFileSync(skillFile, 'utf8'); const frontmatter = parseFrontmatter(content); const keywords = extractKeywords(content); const composable = extractComposable(content); const entry = { name: frontmatter.name || dir, description: (frontmatter.description || '').slice(0, 200), maturity: frontmatter.maturity || 'unknown', isComposable: !!(composable || frontmatter.composable === 'true'), // v5.8: 布尔或嵌套块均计入 allowedTools: (frontmatter['allowed-tools'] || '').split(',').map(s => s.trim()).filter(Boolean), keywords, }; if (composable) entry.composable = composable; index.skills.push(entry); totalKeywords += keywords.length; } // v4.9: TF-IDF 加权 (如果引擎可用) try { const tfidf = require('./tfidf-engine.js'); tfidf.applyTFIDFWeights(index); } catch (e) { // tfidf-engine.js 尚未创建时 fallback 静默 if (!e.code || e.code !== 'MODULE_NOT_FOUND') { console.warn(' [warn] TF-IDF 加权失败:', e.message); } } // v5.9.1: 长期休眠技能降权标记 (B+C 类: 导入语言 + 商业技能) // 从 route-stats.json 读取活跃技能,30天零命中的 imported/business 技能标记 coldPenalty const BUSINESS_SKILLS = new Set([ 'business-plan-skill', 'copywriter-expert', 'customer-success-expert', 'email-communicator', 'finance-advisor', 'industry-research-cn', 'investor-review-guide', 'pricing-strategist', 'product-manager-expert', 'project-coordinator', 'sales-consultant', 'social-media-manager', 'technical-seo-expert', 'ux-researcher', ]); try { const statsPath = path.join(CLAUDE_ROOT, 'debug', 'route-stats.json'); if (fs.existsSync(statsPath)) { const routeStats = JSON.parse(fs.readFileSync(statsPath, 'utf8')); const activeSkills = new Set(Object.keys(routeStats.stats || {})); let penalizedCount = 0; for (const skill of index.skills) { const isImported = skill.maturity === 'imported'; const isBusiness = BUSINESS_SKILLS.has(skill.name); if ((isImported || isBusiness) && !activeSkills.has(skill.name)) { skill.coldPenalty = 0.5; // BM25 评分 ×0.5 penalizedCount++; } } if (penalizedCount > 0) { console.log(` [cold-penalty] ${penalizedCount} 个休眠技能已标记 coldPenalty=0.5`); } } } catch {} // 第 3 层防线: 编译期关键词冲突检测 const collisions = detectKeywordCollisions(index); if (collisions.length > 0) { console.warn('\n [COLLISION] 以下 core 关键词被多个技能共享,可能导致路由歧义:'); for (const c of collisions) { console.warn(` "${c.keyword}" → ${c.skills.join(', ')}`); } console.warn(` 共 ${collisions.length} 个冲突,建议在 SKILL.md 中消歧或调整权重层级\n`); } // 写入索引文件 (附带冲突报告) index.collisions = collisions; fs.writeFileSync(OUTPUT_FILE, JSON.stringify(index, null, 2) + '\n'); // === AGENTS_DIR_SCAN_PATCH_2026_04_20 === // 扫描 agents/*.md,将 agent 条目注入 skills 数组(type:'agent'),同时写出 skills-index-lite.json try { if (fs.existsSync(AGENTS_DIR)) { const agentFiles = fs.readdirSync(AGENTS_DIR).filter(f => f.endsWith('.md')).sort(); let agentCount = 0; for (const file of agentFiles) { const agentPath = path.join(AGENTS_DIR, file); const content = fs.readFileSync(agentPath, 'utf8'); const fm = parseFrontmatter(content); if (!fm.name) continue; const keywords = extractKeywords(content); const entry = { name: fm.name, description: (fm.description || '').slice(0, 200), maturity: 'agent', type: 'agent', isComposable: false, allowedTools: (fm['allowed-tools'] || '').split(',').map(s => s.trim()).filter(Boolean), keywords, }; index.skills.push(entry); agentCount++; } console.log(' [agents] ' + agentCount + ' agent(s) injected into index'); } } catch (e) { console.warn(' [agents] 扫描失败 (非致命):', e.message); } // 写出 skills-index-lite.json (路由引擎实际使用的是 lite 版本) fs.writeFileSync(OUTPUT_LITE, JSON.stringify(index, null, 2) + '\n'); console.log(' [lite] skills-index-lite.json 已同步 (' + index.skills.length + ' entries total)'); const composableCount = index.skills.filter(s => s.isComposable).length; console.log(`skills-index.json generated:`); console.log(` Skills: ${index.skillCount} (${composableCount} composable)`); console.log(` Keywords: ${totalKeywords} (avg ${Math.round(totalKeywords / index.skillCount)} per skill)`); console.log(` Output: ${OUTPUT_FILE}`); if (showStats) { console.log('\nPer-skill keyword count:'); for (const s of index.skills) { const bar = '\u2588'.repeat(Math.min(s.keywords.length, 40)); console.log(` ${s.name.padEnd(35)} ${String(s.keywords.length).padStart(3)} ${bar}`); } } } // 导出核心函数供测试使用 if (typeof module !== 'undefined') { module.exports = { parseFrontmatter, extractKeywords, extractComposable, classifyKeywordTier, isNoise, detectKeywordCollisions }; } if (require.main === module) { main(); }