bookworm-smart-assistant/scripts/intent-classifier.js

372 lines
13 KiB
JavaScript
Raw Normal View History

#!/usr/bin/env node
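/**
 * Intent classification engine (v5.2 Neural Gateway)
 *
 * A lightweight regex rule engine that sorts user input into three tiers:
 *   simple  -> plain Q&A, routing is skipped
 *   medium  -> standard routing (BM25 + context fusion)
 *   complex -> orchestrated routing (orchestrator / multi-skill collaboration)
 *
 * Module exports:
 *   classifyIntent(text)                         -> { intents, modifiers, entities, complexity }
 *   extractEntities(text)                        -> string[]
 *   scoreComplexity(intents, modifiers, entities) -> 'simple' | 'medium' | 'complex'
 *
 * Performance budget: < 30ms
 */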
const path = require('path');
// === 12 种意图标签 (正则规则) ===
const INTENT_RULES = [
{
label: 'debug',
patterns: [
/(?:调试|debug|报错|bug|异常|错误|error|crash|崩溃|修复|fix|排查|trace|stacktrace|故障)/i,
],
},
{
label: 'performance',
patterns: [
/(?:性能|performance|优化|optimize|慢|卡顿|内存泄漏|memory\s*leak|profil|benchmark|瓶颈|bottleneck|缓存|cache|加速|提速)/i,
],
},
{
label: 'security',
patterns: [
/(?:安全|security|漏洞|vulnerab|XSS|CSRF|注入|inject|认证|auth|鉴权|加密|encrypt|权限|permission|OWASP|渗透|pentest)/i,
],
},
{
label: 'architecture',
patterns: [
/(?:架构|architect|设计模式|design\s*pattern|微服务|microservice|重构|refactor|解耦|decouple|分层|layer|模块化|modular|系统设计)/i,
],
},
{
label: 'review',
patterns: [
/(?:代码审查|code\s*review|PR\s*review|评审|会审|review|审计|audit|检查代码|代码质量|lint|规范|专家组)/i,
],
},
{
label: 'create',
patterns: [
/(?:从零|从头|新建|创建|搭建|初始化|init|scaffold|bootstrap|generate|新项目|starter|template|脚手架|生成)/i,
],
},
{
label: 'deploy',
patterns: [
/(?:部署|deploy|上线|发布|release|CI\/?CD|pipeline|构建|build|打包|bundle|Docker|容器|K8s|Kubernetes)/i,
],
},
{
label: 'test',
patterns: [
/(?:测试|test|单元测试|unit\s*test|集成测试|integration|E2E|端到端|覆盖率|coverage|mock|stub|TDD|BDD|vitest|jest|playwright)/i,
],
},
{
label: 'data',
patterns: [
/(?:数据库|database|SQL|查询|query|表|table|索引|index|迁移|migration|ORM|Redis|MongoDB|PostgreSQL|MySQL|数据模型)/i,
],
},
{
label: 'research',
patterns: [
/(?:调研|research|对比|compare|选型|evaluate|评估|分析|analyz|竞品|技术方案|方案设计|可行性)/i,
],
},
{
label: 'explain',
patterns: [
/(?:解释|explain|什么是|what\s*is|怎么理解|区别|difference|原理|principle|概念|concept|为什么|why|how\s*does)/i,
],
},
{
label: 'continue',
patterns: [
/^(?:继续|接着|下一步|往下|go\s+on|proceed)/i,
],
},
{
label: 'select',
patterns: [
/^(?:选[第1-9一二三四五六七八九十]|方案[A-Za-z]|用这个|就这个|[1-9]号$)/i,
],
},
{
label: 'confirm',
patterns: [
/^(?:好的|可以|行|确认|ok|yes|是的|对|没问题|同意)/i,
],
},
{
label: 'general',
patterns: [
/(?:你好|hello|hi|帮我|请问|麻烦|谢谢|thanks|嗯)/i,
],
},
];
// === 3 种修饰符 ===
const MODIFIER_RULES = [
{
label: 'urgent',
patterns: [/(?:紧急|urgent|马上|立刻|赶紧|ASAP|immediately|尽快)/i],
},
{
label: 'complex',
patterns: [/(?:全面|comprehensive|完整|complete|端到端|end.to.end|全链路|full.stack|整个|entire|深入|deep|详细|detail)/i],
},
{
label: 'simple',
patterns: [/(?:简单|simple|快速|quick|直接|直|高效|efficient|简要|brief)/i],
},
];
// === 实体提取 (框架/工具/语言) ===
const ENTITY_PATTERNS = {
// 框架
frameworks: [
/\b(?:React|Vue(?:\.js)?|Angular|Next\.?js|Nuxt(?:\.js)?|Svelte|Solid(?:JS)?|Remix|Gatsby|Astro)\b/gi,
/\b(?:FastAPI|Django|Flask|Express(?:\.js)?|Nest(?:JS|\.js)?|Spring\s*Boot|Laravel|Rails|Gin|Echo|Fiber)\b/gi,
/\b(?:Electron|React\s*Native|Flutter|SwiftUI|Jetpack\s*Compose|Tauri|Expo)\b/gi,
],
// 工具
tools: [
/\b(?:Docker|Kubernetes|K8s|Webpack|Vite|Rollup|esbuild|SWC|Turbopack|Bun)\b/gi,
/\b(?:Git(?:Hub)?|GitLab|Terraform|Ansible|Prometheus|Grafana|Nginx|Caddy)\b/gi,
/\b(?:PostgreSQL|MySQL|Redis|MongoDB|ElasticSearch|Milvus|Qdrant|SQLite|Prisma|Drizzle)\b/gi,
/\b(?:pnpm|npm|yarn|pip|cargo|go\s+mod)\b/gi,
],
// 语言
languages: [
/\b(?:TypeScript|JavaScript|Python|Go(?:lang)?|Rust|Java|C\+\+|C#|Swift|Kotlin|Ruby|PHP|Dart|Elixir|Zig)\b/gi,
],
};
// === 编排触发词 (标记为 complex) ===
const ORCHESTRATOR_TRIGGERS = /(?:从零开发|全面优化|端到端实现|帮我搭建|整个链路|全生命周期|多步骤|全栈项目|完整项目|多技能协作)/i;
// === 预编译联合正则 (模块加载时一次性构建,用于快速预筛选) ===
// 将 12 条意图 + 3 条修饰符的核心关键词合并为单一正则
// 无匹配时直接短路为 general/simple避免逐条遍历
// continue/select/confirm 用 ^ 锚定只匹配开头,不需要参与联合预筛选
const _SKIP_PRESCREEN = new Set(['general', 'continue', 'select', 'confirm']);
const _intentKeywords = INTENT_RULES
.filter(r => !_SKIP_PRESCREEN.has(r.label))
.flatMap(r => r.patterns.map(p => p.source.replace(/^\(\?:/, '').replace(/\)$/, '').replace(/^\^/, '')));
const _modifierKeywords = MODIFIER_RULES
.flatMap(r => r.patterns.map(p => p.source.replace(/^\(\?:/, '').replace(/\)$/, '')));
const FAST_INTENT_CHECK = new RegExp(`(?:${_intentKeywords.join('|')})`, 'i');
const FAST_MODIFIER_CHECK = new RegExp(`(?:${_modifierKeywords.join('|')})`, 'i');
/**
* 提取实体 (框架/工具/语言名)
* @param {string} text - 用户输入
* @returns {string[]} 去重后的实体列表
*/
function extractEntities(text) {
const entities = new Set();
for (const category of Object.values(ENTITY_PATTERNS)) {
for (const pattern of category) {
const matches = text.match(pattern) || [];
for (const m of matches) {
entities.add(m.trim());
}
}
}
return Array.from(entities);
}
/**
* 评估复杂度
* @param {string[]} intents - 意图标签列表
* @param {string[]} modifiers - 修饰符列表
* @param {string[]} entities - 实体列表
* @returns {'simple'|'medium'|'complex'}
*/
// v5.9: 相邻意图对 — 这些 2-意图组合不应升级为 complex用 medium 处理即可
const ADJACENT_INTENT_PAIRS = new Set([
'data:test', 'debug:performance', 'deploy:test',
'create:deploy', 'review:test', 'data:deploy',
'debug:data', 'debug:security', 'performance:test',
'architecture:data', 'create:test', 'review:security',
// v5.9: explain 与其他意图组合天然相邻 (如"解释一下这个 bug")
'debug:explain', 'explain:performance', 'architecture:explain',
'deploy:explain', 'explain:security', 'create:explain',
]);
function scoreComplexity(intents, modifiers, entities) {
// complex: 有 complex 修饰符 或 匹配编排触发词
if (modifiers.includes('complex')) return 'complex';
// V-01: confirm/continue 后有实质内容 → 强制 medium
if (modifiers.includes('_force_medium')) return 'medium';
// v5.9: simple 修饰符优先于多意图判定 (如"简单说一下性能问题")
if (modifiers.includes('simple')) return 'simple';
// v5.9: 2 意图时检查是否为相邻意图对 (medium 而非 complex)
if (intents.length === 2 && !intents.includes('general')) {
const pair = intents.slice().sort().join(':');
if (!ADJACENT_INTENT_PAIRS.has(pair)) return 'complex';
// 相邻意图对 → 继续往下判定为 medium
}
// 3+ 意图 → complex
if (intents.length >= 3 && !intents.includes('general')) return 'complex';
// simple: 仅 explain/general/continue/select/confirm 且无框架实体
const simpleIntents = new Set(['explain', 'general', 'continue', 'select', 'confirm']);
const allSimple = intents.every(i => simpleIntents.has(i));
if (allSimple && entities.length === 0) {
return 'simple';
}
// medium: 其余
return 'medium';
}
/**
* 意图分类主函数
* @param {string} text - 用户输入文本
* @returns {{ intents: string[], modifiers: string[], entities: string[], complexity: 'simple'|'medium'|'complex' }}
*/
function classifyIntent(text) {
if (!text || typeof text !== 'string') {
return { intents: ['general'], modifiers: [], entities: [], complexity: 'simple' };
}
// 截断超长输入,防止正则性能退化 (2000 字符覆盖 99.9% 正常 prompt)
const input = text.slice(0, 2000);
// 快速预筛选: 联合正则一次判定是否有任何意图/修饰符匹配
const hasIntent = FAST_INTENT_CHECK.test(input);
const hasModifier = FAST_MODIFIER_CHECK.test(input);
// 匹配意图 (仅在预筛选命中时逐条匹配)
const intents = [];
if (hasIntent) {
for (const rule of INTENT_RULES) {
if (_SKIP_PRESCREEN.has(rule.label)) continue; // 跳过锚定规则,下面单独检查
for (const pattern of rule.patterns) {
if (pattern.test(input)) {
intents.push(rule.label);
break;
}
}
}
}
// 单独检查 ^ 锚定意图 (continue/select/confirm),不依赖预筛选
for (const rule of INTENT_RULES) {
if (!_SKIP_PRESCREEN.has(rule.label) || rule.label === 'general') continue;
for (const pattern of rule.patterns) {
if (pattern.test(input)) {
intents.push(rule.label);
break;
}
}
}
// 无匹配时回退 general
if (intents.length === 0) {
for (const pattern of INTENT_RULES[INTENT_RULES.length - 1].patterns) {
if (pattern.test(input)) { intents.push('general'); break; }
}
if (intents.length === 0) intents.push('general');
}
// 匹配修饰符 (仅在预筛选命中时逐条匹配)
const modifiers = [];
if (hasModifier) {
for (const rule of MODIFIER_RULES) {
for (const pattern of rule.patterns) {
if (pattern.test(input)) {
modifiers.push(rule.label);
break;
}
}
}
}
// 编排触发词检测
if (ORCHESTRATOR_TRIGGERS.test(input) && !modifiers.includes('complex')) {
modifiers.push('complex');
}
// 实体提取
const entities = extractEntities(input);
// V-01 修复: confirm/continue/select 前缀后若有实质性后续内容,移除该标签走全分类
// 防止 "好的,帮我写支付接口" 被 confirm 吞没
const _PREFIX_INTENTS = ['confirm', 'continue', 'select'];
const _TRANSITION_WORDS = /[,。.、!!?]\s*|但是|不过|but|however|换成|改为|另外|还有/i;
for (const pi of _PREFIX_INTENTS) {
if (intents.includes(pi) && intents.length === 1) {
// 提取前缀匹配后的剩余文本
const rule = INTENT_RULES.find(r => r.label === pi);
if (rule) {
const match = rule.patterns[0].exec(input);
if (match) {
const remaining = input.slice(match[0].length).replace(/^[\s,。.、]+/, '');
// 剩余文本 > 8 字符或含转折词 → 移除前缀标签,重新分类剩余文本
if (remaining.length > 8 || _TRANSITION_WORDS.test(remaining)) {
intents.length = 0;
// 重新检测剩余文本中的实质性意图
for (const r of INTENT_RULES) {
if (_SKIP_PRESCREEN.has(r.label)) continue;
for (const p of r.patterns) {
if (p.test(remaining)) { intents.push(r.label); break; }
}
}
// 未命中任何专业意图但剩余文本有实质内容 → 标记 general 并强制 medium
// 原理: "好的,帮我写支付接口" 中 "帮我写支付接口" 是新任务不应继承
if (intents.length === 0) {
intents.push('general');
modifiers.push('_force_medium');
}
}
}
}
}
}
// 复杂度评分
const complexity = scoreComplexity(intents, modifiers, entities);
return { intents, modifiers, entities, complexity };
}
// 模块导出
if (typeof module !== 'undefined') {
module.exports = {
classifyIntent,
extractEntities,
scoreComplexity,
// 导出规则供测试
INTENT_RULES,
MODIFIER_RULES,
ENTITY_PATTERNS,
FAST_INTENT_CHECK,
FAST_MODIFIER_CHECK,
ADJACENT_INTENT_PAIRS,
};
}
// CLI 入口
if (require.main === module) {
const query = process.argv.slice(2).join(' ');
if (!query) {
console.log('Usage: node intent-classifier.js "<query>"');
process.exit(0);
}
const result = classifyIntent(query);
console.log(JSON.stringify(result, null, 2));
}