bookworm-smart-assistant/scripts/intent-classifier.js

372 lines
13 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* 意图分类引擎 (v5.2 Neural Gateway)
*
* 轻量级正则规则引擎,对用户输入做三级分流:
* simple — 简单问答,跳过路由
* medium — 标准路由 (BM25 + 上下文融合)
* complex — 编排路由 (orchestrator / 多技能协作)
*
* 模块导出:
* classifyIntent(text) → { intents, modifiers, entities, complexity }
* extractEntities(text) → string[]
* scoreComplexity(intents, modifiers, entities) → 'simple'|'medium'|'complex'
*
* 性能预算: < 30ms
*/
const path = require('path');
// === Intent labels (regex rules) ===
// 15 rules in total: 11 topic intents (debug … explain), three conversational
// intents anchored to the start of the input (continue / select / confirm) and
// the catch-all `general` rule, which is the last entry.
// Each rule pairs a `label` with a list of case-insensitive patterns.
const INTENT_RULES = [
{
label: 'debug',
patterns: [
/(?:调试|debug|报错|bug|异常|错误|error|crash|崩溃|修复|fix|排查|trace|stacktrace|故障)/i,
],
},
{
label: 'performance',
patterns: [
/(?:性能|performance|优化|optimize|慢|卡顿|内存泄漏|memory\s*leak|profil|benchmark|瓶颈|bottleneck|缓存|cache|加速|提速)/i,
],
},
{
label: 'security',
patterns: [
/(?:安全|security|漏洞|vulnerab|XSS|CSRF|注入|inject|认证|auth|鉴权|加密|encrypt|权限|permission|OWASP|渗透|pentest)/i,
],
},
{
label: 'architecture',
patterns: [
/(?:架构|architect|设计模式|design\s*pattern|微服务|microservice|重构|refactor|解耦|decouple|分层|layer|模块化|modular|系统设计)/i,
],
},
{
label: 'review',
patterns: [
/(?:代码审查|code\s*review|PR\s*review|评审|会审|review|审计|audit|检查代码|代码质量|lint|规范|专家组)/i,
],
},
{
label: 'create',
patterns: [
/(?:从零|从头|新建|创建|搭建|初始化|init|scaffold|bootstrap|generate|新项目|starter|template|脚手架|生成)/i,
],
},
{
label: 'deploy',
patterns: [
/(?:部署|deploy|上线|发布|release|CI\/?CD|pipeline|构建|build|打包|bundle|Docker|容器|K8s|Kubernetes)/i,
],
},
{
label: 'test',
patterns: [
/(?:测试|test|单元测试|unit\s*test|集成测试|integration|E2E|端到端|覆盖率|coverage|mock|stub|TDD|BDD|vitest|jest|playwright)/i,
],
},
{
label: 'data',
patterns: [
/(?:数据库|database|SQL|查询|query|表|table|索引|index|迁移|migration|ORM|Redis|MongoDB|PostgreSQL|MySQL|数据模型)/i,
],
},
{
label: 'research',
patterns: [
/(?:调研|research|对比|compare|选型|evaluate|评估|分析|analyz|竞品|技术方案|方案设计|可行性)/i,
],
},
{
label: 'explain',
patterns: [
/(?:解释|explain|什么是|what\s*is|怎么理解|区别|difference|原理|principle|概念|concept|为什么|why|how\s*does)/i,
],
},
{
// Anchored with ^: only matches when the input starts with one of these words.
label: 'continue',
patterns: [
/^(?:继续|接着|下一步|往下|go\s+on|proceed)/i,
],
},
{
// Anchored with ^; the `[1-9]号$` alternative additionally requires the input to end there.
label: 'select',
patterns: [
/^(?:选[第1-9一二三四五六七八九十]|方案[A-Za-z]|用这个|就这个|[1-9]号$)/i,
],
},
{
// Anchored with ^: only matches when the input starts with one of these words.
// NOTE(review): the single-character alternatives also match longer words that
// begin with them, e.g. `对` matches an input starting with `对比`, which is a
// `research` keyword — confirm whether that overlap is intended.
label: 'confirm',
patterns: [
/^(?:好的|可以|行|确认|ok|yes|是的|对|没问题|同意)/i,
],
},
{
// Catch-all greetings / polite phrases.
label: 'general',
patterns: [
/(?:你好|hello|hi|帮我|请问|麻烦|谢谢|thanks|嗯)/i,
],
},
];
// === Modifiers (3 kinds) ===
// Each rule pairs a modifier `label` with a list of case-insensitive patterns.
const MODIFIER_RULES = [
{
label: 'urgent',
patterns: [/(?:紧急|urgent|马上|立刻|赶紧|ASAP|immediately|尽快)/i],
},
{
// The unescaped `.` in `end.to.end` / `full.stack` matches any single
// character, so "end-to-end", "end to end" and "full-stack" all match.
label: 'complex',
patterns: [/(?:全面|comprehensive|完整|complete|端到端|end.to.end|全链路|full.stack|整个|entire|深入|deep|详细|detail)/i],
},
{
// NOTE(review): the single-character alternative `直` already covers `直接`
// and matches any input containing that character — confirm this is intended.
label: 'simple',
patterns: [/(?:简单|simple|快速|quick|直接|直|高效|efficient|简要|brief)/i],
},
];
// === Entity extraction patterns (frameworks / tools / languages) ===
// Every pattern carries the `g` flag so that `String.prototype.match` returns
// all occurrences, and the `i` flag so matching is case-insensitive.
const ENTITY_PATTERNS = {
  // Frameworks
  frameworks: [
    /\b(?:React|Vue(?:\.js)?|Angular|Next\.?js|Nuxt(?:\.js)?|Svelte|Solid(?:JS)?|Remix|Gatsby|Astro)\b/gi,
    /\b(?:FastAPI|Django|Flask|Express(?:\.js)?|Nest(?:JS|\.js)?|Spring\s*Boot|Laravel|Rails|Gin|Echo|Fiber)\b/gi,
    /\b(?:Electron|React\s*Native|Flutter|SwiftUI|Jetpack\s*Compose|Tauri|Expo)\b/gi,
  ],
  // Tools
  tools: [
    /\b(?:Docker|Kubernetes|K8s|Webpack|Vite|Rollup|esbuild|SWC|Turbopack|Bun)\b/gi,
    /\b(?:Git(?:Hub)?|GitLab|Terraform|Ansible|Prometheus|Grafana|Nginx|Caddy)\b/gi,
    /\b(?:PostgreSQL|MySQL|Redis|MongoDB|ElasticSearch|Milvus|Qdrant|SQLite|Prisma|Drizzle)\b/gi,
    /\b(?:pnpm|npm|yarn|pip|cargo|go\s+mod)\b/gi,
  ],
  // Languages
  languages: [
    // `C++` and `C#` end in a non-word character, so a trailing `\b` would only
    // hold when a word character follows (e.g. "C++11") and would miss the
    // common "C++ " / "C#." / end-of-input cases. They therefore live in their
    // own alternative with a leading `\b` only.
    /\b(?:TypeScript|JavaScript|Python|Go(?:lang)?|Rust|Java|Swift|Kotlin|Ruby|PHP|Dart|Elixir|Zig)\b|\bC(?:\+\+|#)/gi,
  ],
};
// === Orchestrator trigger phrases ===
// When one of these phrases occurs in the input, classifyIntent adds the
// `complex` modifier (unless it is already present).
const ORCHESTRATOR_TRIGGERS = /(?:从零开发|全面优化|端到端实现|帮我搭建|整个链路|全生命周期|多步骤|全栈项目|完整项目|多技能协作)/i;
// === Precompiled union regexes (built once at module load, used as a fast prescreen) ===
// The keyword alternatives of every prescreened intent rule are merged into one
// regex, and those of every modifier rule into another. When a union regex does
// not match, the per-rule loop for that group can be skipped entirely.
// general / continue / select / confirm are excluded from the intent union:
// the three ^-anchored rules are always checked on their own and `general`
// is only used as the fallback label.
const _SKIP_PRESCREEN = new Set(['general', 'continue', 'select', 'confirm']);

// Strip the outer `(?:` … `)` wrapper (and a leading `^`, if any) from each
// pattern source so the alternatives can be re-joined under a single group.
const _intentKeywords = [];
for (const rule of INTENT_RULES) {
  if (_SKIP_PRESCREEN.has(rule.label)) continue;
  for (const pattern of rule.patterns) {
    const alternatives = pattern.source
      .replace(/^\(\?:/, '')
      .replace(/\)$/, '')
      .replace(/^\^/, '');
    _intentKeywords.push(alternatives);
  }
}

const _modifierKeywords = [];
for (const rule of MODIFIER_RULES) {
  for (const pattern of rule.patterns) {
    const alternatives = pattern.source
      .replace(/^\(\?:/, '')
      .replace(/\)$/, '');
    _modifierKeywords.push(alternatives);
  }
}

const FAST_INTENT_CHECK = new RegExp(`(?:${_intentKeywords.join('|')})`, 'i');
const FAST_MODIFIER_CHECK = new RegExp(`(?:${_modifierKeywords.join('|')})`, 'i');
/**
 * Collect the framework / tool / language names mentioned in a text.
 *
 * Every pattern of every ENTITY_PATTERNS category is applied in declaration
 * order. Matches are trimmed and de-duplicated by exact string (so differently
 * cased spellings are kept as separate entries), preserving first-seen order.
 *
 * @param {string} text - User input.
 * @returns {string[]} De-duplicated list of matched entity names.
 */
function extractEntities(text) {
  const hits = Object.values(ENTITY_PATTERNS)
    .flat()
    .flatMap((pattern) => text.match(pattern) || [])
    .map((hit) => hit.trim());
  return [...new Set(hits)];
}
// v5.9: adjacent intent pairs — these two-intent combinations are handled as
// `medium` instead of being escalated to `complex`.
// Keys are the two labels sorted alphabetically and joined with ':'.
const ADJACENT_INTENT_PAIRS = new Set([
  'data:test', 'debug:performance', 'deploy:test',
  'create:deploy', 'review:test', 'data:deploy',
  'debug:data', 'debug:security', 'performance:test',
  'architecture:data', 'create:test', 'review:security',
  // v5.9: explain combined with another intent is naturally adjacent
  // (e.g. "explain this bug").
  'debug:explain', 'explain:performance', 'architecture:explain',
  'deploy:explain', 'explain:security', 'create:explain',
]);

// Conversational prefix labels. They describe how a message opens ("ok, …",
// "go on, …"), not a task, so they must not count towards the multi-intent
// escalation below.
const CONVERSATIONAL_INTENTS = new Set(['continue', 'select', 'confirm']);

/**
 * Score the complexity tier of a classified request.
 *
 * Decision order:
 *   1. `complex` modifier                      → 'complex'
 *   2. `_force_medium` modifier                → 'medium'
 *   3. `simple` modifier                       → 'simple'
 *   4. (no `general` intent) 3+ task intents   → 'complex'
 *   5. (no `general` intent) 2 task intents that are not an adjacent pair → 'complex'
 *   6. only explain/general/continue/select/confirm and no entities → 'simple'
 *   7. everything else                         → 'medium'
 *
 * "Task intents" are all intents except continue / select / confirm. Counting
 * those prefix labels used to turn e.g. confirm + explain into a non-adjacent
 * "pair" and escalate a plain question to 'complex'.
 *
 * @param {string[]} intents - Intent labels.
 * @param {string[]} modifiers - Modifier labels.
 * @param {string[]} entities - Extracted entity names.
 * @returns {'simple'|'medium'|'complex'}
 */
function scoreComplexity(intents, modifiers, entities) {
  // Modifiers take precedence over any intent-based rule.
  if (modifiers.includes('complex')) return 'complex';
  // V-01: substantive content after a confirm/continue prefix → forced medium.
  if (modifiers.includes('_force_medium')) return 'medium';
  // v5.9: the simple modifier wins over the multi-intent rules
  // (e.g. "briefly describe the performance problem").
  if (modifiers.includes('simple')) return 'simple';

  if (!intents.includes('general')) {
    const taskIntents = intents.filter((label) => !CONVERSATIONAL_INTENTS.has(label));
    if (taskIntents.length >= 3) return 'complex';
    if (taskIntents.length === 2) {
      const pairKey = [...taskIntents].sort().join(':');
      if (!ADJACENT_INTENT_PAIRS.has(pairKey)) return 'complex';
      // Adjacent pair → fall through; it ends up as 'medium'.
    }
  }

  // simple: only lightweight intents and no framework/tool/language entity.
  const simpleIntents = new Set(['explain', 'general', 'continue', 'select', 'confirm']);
  const onlySimpleIntents = intents.every((label) => simpleIntents.has(label));
  if (onlySimpleIntents && entities.length === 0) {
    return 'simple';
  }

  return 'medium';
}
/**
 * Main entry point: classify a user message.
 *
 * Steps: truncate the input, prescreen it with the union regexes, collect the
 * matching intent and modifier labels, add the `complex` modifier when an
 * orchestrator trigger phrase is present, extract entities, re-classify the
 * text that follows a lone confirm/continue/select prefix (V-01), and finally
 * score the complexity.
 *
 * @param {string} text - User input text.
 * @returns {{ intents: string[], modifiers: string[], entities: string[], complexity: 'simple'|'medium'|'complex' }}
 *   `modifiers` may additionally contain the internal `_force_medium` marker.
 */
function classifyIntent(text) {
  if (typeof text !== 'string' || text === '') {
    return { intents: ['general'], modifiers: [], entities: [], complexity: 'simple' };
  }

  // A rule applies as soon as any one of its patterns matches.
  const ruleMatches = (rule, candidate) => rule.patterns.some((re) => re.test(candidate));

  // Cap the input length so regex matching stays cheap on very long prompts.
  const input = text.slice(0, 2000);

  // Fast prescreen: one union regex per group decides whether the per-rule
  // loops are worth running at all.
  const intentPrescreenHit = FAST_INTENT_CHECK.test(input);
  const modifierPrescreenHit = FAST_MODIFIER_CHECK.test(input);

  // Topic rules take part in the prescreen; the ^-anchored conversational
  // rules (continue/select/confirm) are always evaluated on their own.
  const topicRules = INTENT_RULES.filter((rule) => !_SKIP_PRESCREEN.has(rule.label));
  const anchoredRules = INTENT_RULES.filter(
    (rule) => _SKIP_PRESCREEN.has(rule.label) && rule.label !== 'general',
  );

  const matchedTopicRules = intentPrescreenHit
    ? topicRules.filter((rule) => ruleMatches(rule, input))
    : [];
  const matchedAnchoredRules = anchoredRules.filter((rule) => ruleMatches(rule, input));
  const intents = [...matchedTopicRules, ...matchedAnchoredRules].map((rule) => rule.label);

  // Nothing matched → fall back to the general label.
  if (intents.length === 0) {
    intents.push('general');
  }

  const modifiers = modifierPrescreenHit
    ? MODIFIER_RULES.filter((rule) => ruleMatches(rule, input)).map((rule) => rule.label)
    : [];

  // Orchestrator trigger phrases imply the complex modifier.
  if (ORCHESTRATOR_TRIGGERS.test(input) && !modifiers.includes('complex')) {
    modifiers.push('complex');
  }

  const entities = extractEntities(input);

  // V-01: when a confirm/continue/select prefix is the only intent but is
  // followed by substantive text, drop the prefix label and classify the rest,
  // so a new task introduced after "ok, …" is not swallowed by `confirm`.
  const prefixLabels = ['confirm', 'continue', 'select'];
  const transitionWords = /[,。.、!!?]\s*|但是|不过|but|however|换成|改为|另外|还有/i;
  if (intents.length === 1 && prefixLabels.includes(intents[0])) {
    const prefixRule = INTENT_RULES.find((rule) => rule.label === intents[0]);
    const prefixMatch = prefixRule ? prefixRule.patterns[0].exec(input) : null;
    if (prefixMatch) {
      // Text after the matched prefix, minus leading whitespace/punctuation.
      const remaining = input.slice(prefixMatch[0].length).replace(/^[\s,。.、]+/, '');
      // More than 8 characters left, or a transition word → treat as new content.
      if (remaining.length > 8 || transitionWords.test(remaining)) {
        const reclassified = topicRules
          .filter((rule) => ruleMatches(rule, remaining))
          .map((rule) => rule.label);
        intents.length = 0;
        if (reclassified.length > 0) {
          intents.push(...reclassified);
        } else {
          // No topic intent found but the remainder is substantive:
          // label it general and force the medium tier.
          intents.push('general');
          modifiers.push('_force_medium');
        }
      }
    }
  }

  const complexity = scoreComplexity(intents, modifiers, entities);
  return { intents, modifiers, entities, complexity };
}
// Module exports (guarded so the file can also be evaluated where `module` is absent).
if (typeof module !== 'undefined') {
  const publicApi = {
    classifyIntent,
    extractEntities,
    scoreComplexity,
    // Rule tables and prescreen regexes are exported for tests.
    INTENT_RULES,
    MODIFIER_RULES,
    ENTITY_PATTERNS,
    FAST_INTENT_CHECK,
    FAST_MODIFIER_CHECK,
    ADJACENT_INTENT_PAIRS,
  };
  module.exports = publicApi;
}
// CLI entry point: classify the query given on the command line and print the
// result as pretty-printed JSON; print a usage line when no query is given.
if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  const query = cliArgs.join(' ');
  if (query) {
    console.log(JSON.stringify(classifyIntent(query), null, 2));
  } else {
    console.log('Usage: node intent-classifier.js "<query>"');
    process.exit(0);
  }
}