448 lines
17 KiB
JavaScript
448 lines
17 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
 * Skill semantic index generator
 *
 * Reads every SKILL.md, extracts the description and trigger keywords,
 * and generates skills-index.json, a weighted keyword map (used by route-analyzer).
 *
 * Usage:
 *   node scripts/generate-skill-index.js          # generate the index
 *   node scripts/generate-skill-index.js --stats  # generate and print statistics
 */
|
||
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
/**
 * Look up the root directory from the shared path configuration module.
 *
 * @returns {*} `PATHS.root` as exported by ./paths.config.js
 *   (presumably an absolute path string — it is passed to path.join below)
 */
function detectClaudeRoot() {
  const { PATHS } = require('./paths.config.js');
  return PATHS.root;
}

const CLAUDE_ROOT = detectClaudeRoot();

// Directory whose sub-directories are scanned for SKILL.md files.
const SKILLS_DIR = path.join(CLAUDE_ROOT, 'skills');

// Index file written from the skill scan.
const OUTPUT_FILE = path.join(CLAUDE_ROOT, 'skills-index.json');

// AGENTS_DIR_SCAN_PATCH_2026_04_20
// Directory scanned for agent *.md files, and the second index file that also contains them.
const AGENTS_DIR = path.join(CLAUDE_ROOT, 'agents');
const OUTPUT_LITE = path.join(CLAUDE_ROOT, 'skills-index-lite.json');
|
||
|
||
// === YAML frontmatter 解析 (轻量, 无依赖) ===
|
||
/**
 * Minimal, dependency-free parser for the YAML frontmatter of a markdown file.
 *
 * Only flat `key: value` pairs and block scalars are understood. A block scalar
 * is introduced by `>` or `|`, optionally followed by a chomping and/or
 * indentation indicator (`>-`, `|+`, `|2`, ...); its lines are joined into a
 * single space-separated string. Indented lines outside a block scalar (nested
 * maps, list items) are ignored, and values are returned as raw strings —
 * surrounding quotes are NOT stripped.
 *
 * Both LF and CRLF line endings are accepted, as is a leading UTF-8 BOM.
 *
 * @param {string} content - Full text of the file.
 * @returns {Object<string, string>} Parsed top-level keys, or `{}` when the
 *   text does not start with a `---` fenced frontmatter section.
 */
function parseFrontmatter(content) {
  const match = content.match(/^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/);
  if (!match) return {};

  const KEY_VALUE_RE = /^(\w[\w-]*):\s*(.*)/;
  // `>` or `|`, plus optional chomping (+/-) and indentation (1-9) indicators in either order.
  const BLOCK_SCALAR_RE = /^[>|](?:[+-]?[1-9]?|[1-9][+-]?)$/;

  const result = {};
  let currentKey = null;
  let currentValue = '';
  let isMultiline = false;

  for (const line of match[1].split(/\r?\n/)) {
    // Inside a block scalar, every line that does not start at column 0
    // (including blank lines) is a continuation of the current value.
    if (isMultiline && !/^\S/.test(line)) {
      currentValue += ` ${line.trim()}`;
      continue;
    }

    const kvMatch = line.match(KEY_VALUE_RE);
    // Anything that is not a top-level `key: value` line is skipped without
    // ending the current key (comments, list items, nested maps).
    if (!kvMatch) continue;

    // A new top-level key closes the previous one.
    if (currentKey) result[currentKey] = currentValue.trim();

    currentKey = kvMatch[1];
    const val = kvMatch[2].trim();
    isMultiline = BLOCK_SCALAR_RE.test(val);
    currentValue = isMultiline ? '' : val;
  }

  if (currentKey) result[currentKey] = currentValue.trim();

  return result;
}
|
||
|
||
// === 关键词层级分类 (v4.9) ===
|
||
// core: 表格/引号 = 1.0, strong: 列表 = 0.8, extended: 描述 = 0.5
|
||
/**
 * Map the place a keyword was found to its routing tier and base weight.
 *
 *   'table' / 'quoted'  -> core     (1.0)
 *   'list'              -> strong   (0.8)
 *   anything else       -> extended (0.5), which includes 'description'
 *
 * @param {string} source - Origin label of the keyword.
 * @returns {{tier: string, weight: number}} A fresh descriptor object per call.
 */
function classifyKeywordTier(source) {
  if (source === 'table' || source === 'quoted') {
    return { tier: 'core', weight: 1.0 };
  }
  if (source === 'list') {
    return { tier: 'strong', weight: 0.8 };
  }
  return { tier: 'extended', weight: 0.5 };
}
|
||
|
||
// === 分隔符/噪声过滤 (v5.9.1: 增强) ===
|
||
// Strings made up only of separator characters (table rules, ellipses, whitespace).
const SEPARATOR_RE = /^[-|:\s.…]+$/;

/**
 * Decide whether a candidate keyword is noise and should be discarded.
 *
 * A candidate is noise when it is:
 *   - not a string at all (null/undefined/other), since it cannot be a keyword;
 *   - shorter than 2 or longer than 30 characters;
 *   - composed only of separator characters (this already covers '...' and '…');
 *   - a Chinese phrase longer than 8 characters that contains a common
 *     function word, i.e. most likely a fragment of descriptive prose;
 *   - a file-path fragment (`reference/...`, `references/...`, or `*.md`).
 *
 * @param {string} str - Candidate keyword.
 * @returns {boolean} `true` when the candidate should be dropped.
 */
function isNoise(str) {
  if (typeof str !== 'string') return true;
  if (str.length < 2 || str.length > 30) return true;
  if (SEPARATOR_RE.test(str)) return true;

  // Sentence fragments: long Chinese text containing connective/function words.
  if (str.length > 8 && /[的了或在与及和是从到]/.test(str) && /[\u4e00-\u9fff]/.test(str)) return true;

  // File-path fragments.
  if (/^references?\//.test(str) || /\.md$/.test(str)) return true;

  return false;
}
|
||
|
||
// === 关键词提取 ===
|
||
/**
 * Extract weighted routing keywords from the text of a skill/agent markdown file.
 *
 * Sources, in the order they are scanned:
 *   1. second cell of markdown table rows            -> core     (weight from classifyKeywordTier('table'))
 *   2. `- **Category**: kw1, kw2` list items         -> strong   (classifyKeywordTier('list'))
 *   3. frontmatter `description` (Chinese runs of 2-6 chars, English phrases of 1-3 words)
 *                                                    -> extended (classifyKeywordTier('description'))
 *   4. short quoted trigger examples                 -> core     (classifyKeywordTier('quoted'))
 * A keyword seen more than once keeps the entry with the highest weight.
 * Afterwards stop words are removed and Chinese/English synonyms are added as
 * `alias` entries with weight 0.5.
 *
 * Differences from the previous implementation:
 *   - table rows are matched within a single line, so the captured cell no longer
 *     drifts across rows on tables that are not exactly two columns wide;
 *   - the fullwidth colon (U+FF1A), fullwidth comma (U+FF0C) and fullwidth
 *     sentence punctuation are accepted next to their ASCII forms.
 *
 * @param {string} content - Full markdown text, including frontmatter.
 * @returns {Array<{keyword: string, weight: number, tier: string}>} Keywords
 *   sorted by descending weight (ties keep discovery order).
 */
function extractKeywords(content) {
  const keywords = new Map(); // keyword -> { weight, tier }

  // Insert a keyword, or upgrade it when the new weight is strictly higher.
  const upsert = (keyword, weight, tier) => {
    const prev = keywords.get(keyword);
    if (!prev || prev.weight < weight) {
      keywords.set(keyword, { weight, tier });
    }
  };

  // Strip backticks, collapse whitespace and lowercase one keyword candidate.
  const normalize = (raw) => raw.trim().replace(/`/g, '').replace(/\s+/g, ' ').toLowerCase();

  // Keyword separators: ASCII comma, fullwidth comma, ideographic comma.
  const SEPARATORS = /[,\uFF0C、]/;

  // 1. Trigger-keyword tables (core tier). `[^|\n]` keeps every match on one row.
  const tableWeight = classifyKeywordTier('table').weight;
  for (const row of content.matchAll(/\|[^|\n]+\|([^|\n]+)\|/g)) {
    const cell = row[1].trim();
    if (cell === '关键词' || cell === '---') continue; // header / separator row
    for (const part of cell.split(SEPARATORS)) {
      const clean = normalize(part);
      if (clean && !isNoise(clean)) upsert(clean, tableWeight, 'core');
    }
  }

  // List format: - **Category**: `kw1`, `kw2` (strong tier).
  const listWeight = classifyKeywordTier('list').weight;
  for (const item of content.matchAll(/-\s*\*\*[^*]+\*\*\s*[:\uFF1A]\s*(.+)/g)) {
    for (const part of item[1].split(SEPARATORS)) {
      const clean = normalize(part);
      if (!clean || isNoise(clean)) continue;
      // Long text containing Chinese is an instruction sentence, not a keyword.
      if (clean.length > 12 && /[\u4e00-\u9fff]/.test(clean)) continue;
      upsert(clean, listWeight, 'strong');
    }
  }

  // 2. Frontmatter description (extended tier).
  const { description } = parseFrontmatter(content);
  if (description) {
    const descWeight = classifyKeywordTier('description').weight;

    // Runs of 2-6 Chinese characters.
    for (const [word] of description.matchAll(/[\u4e00-\u9fff]{2,6}/g)) {
      const kw = word.toLowerCase();
      if (!isNoise(kw)) upsert(kw, descWeight, 'extended');
    }

    // English words and phrases of up to three words.
    for (const [phrase] of description.matchAll(/[A-Za-z][\w.-]*(?:\s+[A-Za-z][\w.-]*){0,2}/g)) {
      const kw = phrase.toLowerCase().trim();
      if (kw.length >= 3 && !isNoise(kw)) upsert(kw, descWeight, 'extended');
    }
  }

  // 3. Quoted trigger examples (core tier), straight or curly double quotes.
  const quotedWeight = classifyKeywordTier('quoted').weight;
  for (const quote of content.matchAll(/["\u201c]([^"\u201d]{2,20})["\u201d]/g)) {
    const kw = quote[1].toLowerCase().trim();
    if (kw.length < 2 || isNoise(kw)) continue;
    // Sentence-level content (contains punctuation) is not a keyword.
    if (kw.length > 10 && /[,。!?、;:\uFF0C\uFF01\uFF1F\uFF1B\uFF1A]/.test(kw)) continue;
    // Descriptive phrases built around "的" are not keywords either.
    if (kw.length > 6 && /[\u4e00-\u9fff].*的.*[\u4e00-\u9fff]/.test(kw)) continue;
    upsert(kw, quotedWeight, 'core');
  }

  // Stop words: generic English words, description fragments, low-signal verbs.
  const STOP_WORDS = new Set([
    // Generic English
    'the', 'and', 'for', 'this', 'that', 'with', 'from', 'when', 'use',
    'output', 'style', 'reference', 'references',
    // Chinese fragments leaking from descriptions
    '当用户', '使用此', '时使用', '需要', '或说', '此技能', '当', '的', '了',
    '本技能', '内联', '输出', '规范', '深度专家', '专家',
    '时使用此技能', '当用户需要', '当用户需要进', '使用此技能',
    // Generic verbs with very low discriminating power
    '开发', '设计', '优化', '分析', '管理', '处理', '实现', '构建',
  ]);
  for (const stopWord of STOP_WORDS) {
    keywords.delete(stopWord);
  }

  // 4. Chinese/English synonym completion for bilingual routing.
  const SYNONYM_PAIRS = [
    ['测试', 'test'], ['部署', 'deploy'], ['安全', 'security'],
    ['数据库', 'database'], ['前端', 'frontend'], ['后端', 'backend'],
    ['架构', 'architecture'], ['性能', 'performance'], ['监控', 'monitoring'],
    ['容器', 'container'], ['微服务', 'microservice'], ['缓存', 'cache'],
    ['日志', 'log'], ['调试', 'debug'], ['重构', 'refactor'],
    ['认证', 'auth'], ['加密', 'encryption'], ['中间件', 'middleware'],
    ['索引', 'index'], ['事务', 'transaction'], ['队列', 'queue'],
    ['集群', 'cluster'], ['负载均衡', 'load balancing'], ['网关', 'gateway'],
    ['回滚', 'rollback'], ['迁移', 'migration'], ['接口', 'interface'],
    ['组件', 'component'], ['模板', 'template'], ['路由', 'routing'],
    ['钩子', 'hook'], ['插件', 'plugin'], ['配置', 'config'],
  ];
  for (const [cn, en] of SYNONYM_PAIRS) {
    // Snapshot per pair: aliases added for earlier pairs are visible to later ones.
    const known = [...keywords.keys()];
    const hasCn = known.some((k) => k.includes(cn));
    const hasEn = known.some((k) => k.includes(en));
    if (hasCn && !hasEn && !STOP_WORDS.has(en)) {
      keywords.set(en, { weight: 0.5, tier: 'alias' });
    } else if (!hasCn && hasEn && !STOP_WORDS.has(cn)) {
      keywords.set(cn, { weight: 0.5, tier: 'alias' });
    }
  }

  // Flatten to an array sorted by descending weight, keeping the tier.
  return Array.from(keywords.entries())
    .sort((a, b) => b[1].weight - a[1].weight)
    .map(([keyword, { weight, tier }]) => ({
      keyword,
      weight: Math.round(weight * 100) / 100,
      tier,
    }));
}
|
||
|
||
// === 提取 composable 字段 (v4.1) ===
|
||
/**
 * Extract the `composable` declaration from a skill file.
 *
 * Two forms are recognised:
 *   - boolean:  `composable: true`  -> all relation lists are empty;
 *   - nested:   `composable:` followed by indented `requires:` / `enhances:` /
 *               `conflicts:` lines holding flow sequences such as `[a, "b"]`.
 *
 * Only indented lines directly below `composable:` belong to the nested block,
 * so a later top-level key with the same name (e.g. a top-level `conflicts:`)
 * is not mistaken for a nested one. Surrounding single or double quotes are
 * removed from list items.
 *
 * @param {string} content - Full file text (the whole text is searched, not just the frontmatter).
 * @returns {{isComposable: boolean, requires: string[], enhances: string[], conflicts: string[]}|null}
 *   The declaration, or `null` when the file has none.
 */
function extractComposable(content) {
  // Boolean form takes precedence over the nested form.
  if (/composable:\s*true\b/.test(content)) {
    return { isComposable: true, enhances: [], requires: [], conflicts: [] };
  }

  // Nested form: capture the run of indented `key: ...` lines (blank lines allowed in between).
  const blockMatch = content.match(
    /composable:\s*\n((?:(?:[ \t]*\r?\n)*[ \t]+\w+:.*(?:\r?\n|$))*)/
  );
  if (!blockMatch) return null;

  const block = blockMatch[1];

  // Read `key: [a, b, c]` from the nested block; missing key -> empty list.
  const readList = (key) => {
    const listMatch = block.match(new RegExp(`${key}:\\s*\\[([^\\]]*)\\]`));
    if (!listMatch) return [];
    return listMatch[1]
      .split(',')
      .map((item) => item.trim().replace(/^(["'])(.*)\1$/, '$2').trim())
      .filter(Boolean);
  };

  return {
    isComposable: true,
    requires: readList('requires'),
    enhances: readList('enhances'),
    conflicts: readList('conflicts'),
  };
}
|
||
|
||
// === 第 3 层防线: 关键词冲突检测 (编译期消歧) ===
|
||
/**
 * Find core-tier keywords that are claimed by more than one skill.
 *
 * @param {{skills: Array<{name: string, keywords: Array<{keyword: string, tier: string}>}>}} index
 *   Index object; only `skills[].name` and `skills[].keywords` are read.
 * @returns {Array<{keyword: string, skills: string[], count: number}>}
 *   One entry per keyword shared by two or more skills, ordered by descending
 *   `count`; entries with equal counts keep first-seen order.
 */
function detectKeywordCollisions(index) {
  // core keyword -> names of the skills that declare it, in scan order
  const ownersByKeyword = new Map();

  for (const { name, keywords } of index.skills) {
    for (const { keyword, tier } of keywords) {
      if (tier !== 'core') continue;
      if (!ownersByKeyword.has(keyword)) ownersByKeyword.set(keyword, []);
      ownersByKeyword.get(keyword).push(name);
    }
  }

  return [...ownersByKeyword]
    .filter(([, skills]) => skills.length >= 2)
    .map(([keyword, skills]) => ({ keyword, skills, count: skills.length }))
    .sort((left, right) => right.count - left.count);
}
|
||
|
||
// === 主流程 ===
|
||
/**
 * CLI entry point: build the skill index and write it to disk.
 *
 * Steps:
 *   1. scan SKILLS_DIR for `<dir>/SKILL.md` and build one entry per skill;
 *   2. apply TF-IDF weights when ./tfidf-engine.js can be loaded;
 *   3. flag imported/business skills absent from debug/route-stats.json with `coldPenalty`;
 *   4. detect core-keyword collisions and write OUTPUT_FILE;
 *   5. append entries for AGENTS_DIR/*.md and write OUTPUT_LITE;
 *   6. print a summary (plus per-entry keyword counts when `--stats` is passed).
 *
 * Note that OUTPUT_FILE is written before the agent entries are appended, so
 * only OUTPUT_LITE contains them; `skillCount` counts skills only.
 *
 * @returns {void}
 */
function main() {
  const showStats = process.argv.includes('--stats');
  const skillDirs = fs
    .readdirSync(SKILLS_DIR)
    .filter((d) => fs.existsSync(path.join(SKILLS_DIR, d, 'SKILL.md')))
    .sort();

  // Version is taken from CLAUDE.md when available, otherwise a fixed fallback.
  let version = 'v6.2';
  try {
    const claudeMd = fs.readFileSync(path.join(CLAUDE_ROOT, 'CLAUDE.md'), 'utf8');
    const versionMatch = claudeMd.match(/Smart Assistant.*?(v[\d.]+)/);
    if (versionMatch) version = versionMatch[1];
  } catch {
    // CLAUDE.md missing or unreadable: keep the fallback version.
  }

  const index = {
    generated: new Date().toISOString(),
    version,
    skillCount: skillDirs.length,
    skills: [],
  };

  let totalKeywords = 0;

  for (const dir of skillDirs) {
    const skillFile = path.join(SKILLS_DIR, dir, 'SKILL.md');
    const content = fs.readFileSync(skillFile, 'utf8');
    const frontmatter = parseFrontmatter(content);
    const keywords = extractKeywords(content);
    const composable = extractComposable(content);

    const entry = {
      name: frontmatter.name || dir,
      description: (frontmatter.description || '').slice(0, 200),
      maturity: frontmatter.maturity || 'unknown',
      // v5.8: both the boolean and the nested-block form count as composable.
      isComposable: !!(composable || frontmatter.composable === 'true'),
      allowedTools: (frontmatter['allowed-tools'] || '').split(',').map((s) => s.trim()).filter(Boolean),
      keywords,
    };

    if (composable) entry.composable = composable;

    index.skills.push(entry);
    totalKeywords += keywords.length;
  }

  // v4.9: TF-IDF weighting, when the engine is available.
  try {
    const tfidf = require('./tfidf-engine.js');
    tfidf.applyTFIDFWeights(index);
  } catch (e) {
    // A missing module is an expected, silent fallback; anything else is reported.
    if (e.code !== 'MODULE_NOT_FOUND') {
      console.warn(' [warn] TF-IDF 加权失败:', e.message);
    }
  }

  // v5.9.1: down-weight dormant skills (imported skills and business skills).
  // Skills of those kinds that have no entry in route-stats.json get coldPenalty.
  const BUSINESS_SKILLS = new Set([
    'business-plan-skill', 'copywriter-expert', 'customer-success-expert',
    'email-communicator', 'finance-advisor', 'industry-research-cn',
    'investor-review-guide', 'pricing-strategist', 'product-manager-expert',
    'project-coordinator', 'sales-consultant', 'social-media-manager',
    'technical-seo-expert', 'ux-researcher',
  ]);
  try {
    const statsPath = path.join(CLAUDE_ROOT, 'debug', 'route-stats.json');
    if (fs.existsSync(statsPath)) {
      const routeStats = JSON.parse(fs.readFileSync(statsPath, 'utf8'));
      const activeSkills = new Set(Object.keys(routeStats.stats || {}));
      let penalizedCount = 0;
      for (const skill of index.skills) {
        const isImported = skill.maturity === 'imported';
        const isBusiness = BUSINESS_SKILLS.has(skill.name);
        if ((isImported || isBusiness) && !activeSkills.has(skill.name)) {
          skill.coldPenalty = 0.5; // BM25 score x0.5
          penalizedCount++;
        }
      }
      if (penalizedCount > 0) {
        console.log(` [cold-penalty] ${penalizedCount} 个休眠技能已标记 coldPenalty=0.5`);
      }
    }
  } catch (e) {
    // Still best-effort, but an unreadable or corrupt stats file is no longer hidden.
    console.warn(' [cold-penalty] 跳过 (非致命):', e.message);
  }

  // Third line of defence: build-time detection of shared core keywords.
  const collisions = detectKeywordCollisions(index);
  if (collisions.length > 0) {
    console.warn('\n [COLLISION] 以下 core 关键词被多个技能共享,可能导致路由歧义:');
    for (const c of collisions) {
      console.warn(` "${c.keyword}" → ${c.skills.join(', ')}`);
    }
    console.warn(` 共 ${collisions.length} 个冲突,建议在 SKILL.md 中消歧或调整权重层级\n`);
  }

  // Write the index file together with the collision report.
  index.collisions = collisions;
  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(index, null, 2) + '\n');

  // === AGENTS_DIR_SCAN_PATCH_2026_04_20 ===
  // Scan agents/*.md and append each agent as a `type: 'agent'` entry; the
  // result is then written to skills-index-lite.json.
  try {
    if (fs.existsSync(AGENTS_DIR)) {
      const agentFiles = fs.readdirSync(AGENTS_DIR).filter((f) => f.endsWith('.md')).sort();
      let agentCount = 0;
      for (const file of agentFiles) {
        const content = fs.readFileSync(path.join(AGENTS_DIR, file), 'utf8');
        const fm = parseFrontmatter(content);
        if (!fm.name) continue; // files without a frontmatter name are skipped
        index.skills.push({
          name: fm.name,
          description: (fm.description || '').slice(0, 200),
          maturity: 'agent',
          type: 'agent',
          isComposable: false,
          allowedTools: (fm['allowed-tools'] || '').split(',').map((s) => s.trim()).filter(Boolean),
          keywords: extractKeywords(content),
        });
        agentCount++;
      }
      console.log(` [agents] ${agentCount} agent(s) injected into index`);
    }
  } catch (e) {
    console.warn(' [agents] 扫描失败 (非致命):', e.message);
  }

  // Write skills-index-lite.json (per the original note, the file the routing engine actually uses).
  fs.writeFileSync(OUTPUT_LITE, JSON.stringify(index, null, 2) + '\n');
  console.log(` [lite] skills-index-lite.json 已同步 (${index.skills.length} entries total)`);

  const composableCount = index.skills.filter((s) => s.isComposable).length;
  // Guard the average so an empty skills directory prints 0 instead of NaN.
  const avgKeywords = index.skillCount > 0 ? Math.round(totalKeywords / index.skillCount) : 0;
  console.log(`skills-index.json generated:`);
  console.log(` Skills: ${index.skillCount} (${composableCount} composable)`);
  console.log(` Keywords: ${totalKeywords} (avg ${avgKeywords} per skill)`);
  console.log(` Output: ${OUTPUT_FILE}`);

  if (showStats) {
    console.log('\nPer-skill keyword count:');
    for (const s of index.skills) {
      const bar = '\u2588'.repeat(Math.min(s.keywords.length, 40));
      console.log(` ${s.name.padEnd(35)} ${String(s.keywords.length).padStart(3)} ${bar}`);
    }
  }
}
|
||
|
||
// 导出核心函数供测试使用
|
||
if (typeof module !== 'undefined') {
|
||
module.exports = { parseFrontmatter, extractKeywords, extractComposable, classifyKeywordTier, isNoise, detectKeywordCollisions };
|
||
}
|
||
|
||
if (require.main === module) {
|
||
main();
|
||
}
|