#!/usr/bin/env node
// Bookworm Smart Assistant - pre-sync sensitive-content scanner
// Purpose: scan the code layer that is about to be pushed to Gitea and detect
//          credentials / paths / brand names / IPs / e-mail addresses, etc.
// Default is dry-run: read-only scan + report; no scanned file is modified.
// Usage: node tools/scrubber.mjs [--json] [--apply]
//   --json   emit JSON only (pipe friendly)
//   --apply  really modify files (high risk, needs a second confirmation);
//            currently not implemented: the script prints an error and exits 2

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Scan root is the parent directory of the directory holding this script.
const CLAUDE_ROOT = path.resolve(__dirname, '..');

// ---------- Sync allow-list (keep in step with the push script) ----------
const INCLUDE_DIRS = [
  'agents',
  'hooks',
  'skills',
  'lib',
  'scripts',
  'constitution',
  'docs',
  'templates',
  'config',
  'tests',
  'tools',
];

const INCLUDE_FILES = [
  'CLAUDE.md',
  'package.json',
  'feature-flags.json',
  'feature-flags.json.sig',
  // legacy 'integrity.sha256' is not synced (case-conflicts with the new INTEGRITY.sha256)
  'settings.template.json',
  'settings.local.template.json',
  'SKILL-REGISTRY.md',
  'skills-index.json',
  'skills-index-lite.json',
  'stats-compiled.json',
];

// Second-level exclusions (sub-paths inside allow-listed directories that must still be skipped)
const EXCLUDE_SUBPATHS = [
  /[\\/]_archived([\\/]|$)/i,
  /[\\/]_deprecated([\\/]|$)/i,
  /[\\/]node_modules([\\/]|$)/,
  /[\\/]\.git([\\/]|$)/,
  /[\\/]__pycache__([\\/]|$)/,
  // P0.2: active-projects.md contains customer domains + IPs, never pushed
  /docs[\\/]active-projects\.md$/i,
  // Decision Q1=B: test directories are not synced (fixtures contain real paths / fake keys)
  /hooks[\\/]tests([\\/]|$)/i,
  /hooks[\\/].*__tests__([\\/]|$)/i,
  // Decision Q3.1: exclude the scrubber itself (its rule regexes would match themselves)
  /tools[\\/]scrubber\.mjs$/i,
  /tools[\\/]scrubber-report\.json$/i,
  // Backup / temporary files (not synced)
  /\.bak(\..+)?$/i,
  /\.tmp(\..+)?$/i,
  // De-sensitising patch scripts keep the original strings as anchors (by design)
  /scripts[\\/]patches[\\/]patch-sync-[^\\/]+\.js$/i,
  // Machine-bound auto-sync configuration (not pushed)
  /config[\\/]auto-sync-repos\.json$/i,
  // Historical one-off script (already executed, not pushed)
  /scripts[\\/]apply-settings-patch\.py$/i,
];

// Files with these extensions are treated as binary and skipped
const BINARY_EXTS = new Set([
  '.png', '.jpg', '.jpeg', '.gif', '.webp', '.ico', '.bmp',
  '.mp3', '.mp4', '.wav', '.ogg', '.webm',
  '.pdf', '.zip', '.tar', '.gz', '.7z', '.rar',
  '.exe', '.dll', '.so', '.dylib', '.bin',
  '.woff', '.woff2', '.ttf', '.otf', '.eot',
]);

const MAX_BYTES = 2 * 1024 * 1024; // files larger than 2MB are skipped

// ---------- Sensitive-content rule definitions ----------
// severity: CRITICAL > HIGH > MEDIUM > LOW
// replacement: null means a human must decide; otherwise it is the suggested placeholder
// whitelist: path regexes; the rule is not applied to files whose relative path matches
// matchWhitelist: regexes tested against the matched text; matching hits are dropped
// multiline: when true the rule is run against the whole file instead of line by line
// NOTE(review): several rules below carry an empty-string replacement; confirm whether
// a non-empty placeholder token was intended there.
const RULES = [
  // ===== CRITICAL: credentials / keys / tokens =====
  {
    id: 'ANTHROPIC_API_KEY',
    severity: 'CRITICAL',
    re: /sk-ant-[a-zA-Z0-9_\-]{20,}/g,
    desc: 'Anthropic API Key 明文',
    replacement: null,
    // Drop obvious placeholder fixtures
    matchWhitelist: [/very-long/i, /DUMMY/i, /placeholder/i, /example/i, /fake/i, /test[_\-]?key/i],
  },
  {
    id: 'OPENAI_API_KEY',
    severity: 'CRITICAL',
    re: /\bsk-[a-zA-Z0-9]{32,}\b/g,
    desc: 'OpenAI/通用 sk- 密钥',
    replacement: null,
  },
  {
    id: 'GITHUB_TOKEN',
    severity: 'CRITICAL',
    re: /\bgh[pousr]_[A-Za-z0-9]{36,}\b/g,
    desc: 'GitHub PAT Token',
    replacement: null,
  },
  {
    id: 'GITEA_TOKEN',
    severity: 'CRITICAL',
    // Q3.3: require token/secret/authorization/key context to avoid ETag / commit-hash false positives
    re: /\b(?:token|secret|authorization|api[_\-]?key|access[_\-]?token|bearer)["':= ]+[a-f0-9]{40}\b/gi,
    desc: 'Gitea/长 hex token (带上下文)',
    replacement: null,
    whitelist: [/\.sha256$/, /integrity/i, /CHANGELOG/i, /\.version$/],
  },
  {
    id: 'PRIVATE_KEY_PEM',
    severity: 'CRITICAL',
    // Q3.4: require a real body (at least 40 base64 characters + END marker) to skip pure format descriptions
    re: /-----BEGIN (?:RSA |EC |OPENSSH |ENCRYPTED |)PRIVATE KEY-----[\s\S]*?[A-Za-z0-9+/=]{40,}[\s\S]*?-----END/g,
    desc: '私钥 PEM 块 (含 body)',
    replacement: null,
    multiline: true,
  },
  {
    id: 'USER_EMAIL_PERSONAL',
    severity: 'CRITICAL',
    re: /timoteofatima283@gmail\.com/gi,
    desc: '用户个人邮箱',
    replacement: 'user@example.com',
  },
  {
    id: 'OLD_GITEA_PASSWORD',
    severity: 'CRITICAL',
    re: /\bmybio668\b/g,
    desc: '已轮换的 Gitea 旧密码 mybio668',
    replacement: null,
  },

  // ===== HIGH: hard-coded paths =====
  {
    id: 'HARDCODED_PATH_WIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+leesu/gi,
    desc: '硬编码 Windows 用户路径 C:\\Users\\leesu',
    replacement: '',
  },
  {
    id: 'HARDCODED_PATH_MSYS',
    severity: 'HIGH',
    re: /\/c\/Users\/leesu/g,
    desc: '硬编码 MSYS/Git Bash 路径 /c/Users/leesu',
    replacement: '',
  },
  {
    id: 'HARDCODED_PATH_ADMIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+Administrator[\\/]+\.claude/gi,
    desc: '硬编码 Administrator .claude 路径',
    replacement: '',
  },

  // ===== HIGH: private infrastructure IPs / hosts =====
  {
    id: 'INFRA_IP_XINLIN',
    severity: 'HIGH',
    re: /\b8\.138\.11\.105\b/g,
    desc: '阿里云鑫霖服务器 IP',
    replacement: '',
  },
  {
    id: 'INFRA_IP_MINGYUAN',
    severity: 'HIGH',
    re: /\b8\.134\.58\.157\b/g,
    desc: '阿里云明远服务器 IP',
    replacement: '',
  },
  {
    id: 'INFRA_IP_PROXY',
    severity: 'HIGH',
    re: /\b175\.29\.205\.124\b/g,
    desc: 'Claude 注册代理 IP',
    replacement: '',
  },
  {
    id: 'GITEA_HOST',
    severity: 'HIGH',
    re: /code\.letcareme\.com/gi,
    desc: '私有 Gitea 主机名',
    replacement: '',
  },
  {
    id: 'PROXY_HOSTS',
    severity: 'HIGH',
    re: /\b(floppydata|kuailemon)\b[a-zA-Z0-9._\-]*/gi,
    desc: '代理服务商名称',
    replacement: '',
  },

  // ===== MEDIUM: customer / project brand words =====
  {
    id: 'BRAND_MINGYUAN',
    severity: 'MEDIUM',
    re: /明远生物|鑫霖|mingyuan|mybioweb|mybiooa|mybiollm|mybiolearn/gi,
    desc: '客户/项目品牌词 (明远生物体系)',
    replacement: 'ExampleCorp',
  },
  {
    id: 'BRAND_VAX',
    severity: 'MEDIUM',
    re: /\b(vaxpolicy|vaxclinic|vaxcoldchain|vaxfuture)(?:\.(?:cn|com))?\b/gi,
    desc: 'vax 系列客户域名',
    replacement: 'ProjectX',
  },
  {
    id: 'BRAND_BOOTREPO',
    severity: 'MEDIUM',
    re: /bookworm-boot|bookworm-admin-private/gi,
    desc: '引导仓库/私密档案路径名',
    replacement: '',
  },

  // ===== LOW: information disclosure, but not secret =====
  {
    id: 'CLIENT_DOMAIN_LETCAREME',
    severity: 'LOW',
    re: /letcareme\.com/gi,
    desc: '私域域名 letcareme.com',
    replacement: '',
  },
];

// ---------- File traversal ----------

/**
 * Tells whether a root-relative path matches any EXCLUDE_SUBPATHS pattern.
 * @param {string} relPath - path relative to CLAUDE_ROOT
 * @returns {boolean} true when the path must be skipped
 */
function isExcluded(relPath) {
  return EXCLUDE_SUBPATHS.some((re) => re.test(relPath));
}

/**
 * Tells whether a file is considered binary, judged by its extension only
 * (case-insensitive lookup in BINARY_EXTS); the file content is not inspected.
 * @param {string} filePath
 * @returns {boolean}
 */
function isBinary(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  return BINARY_EXTS.has(ext);
}

/**
 * Recursively yields every regular file below `dir`.
 * Excluded entries (see isExcluded) are pruned, so an excluded directory is
 * never descended into. Entries that are neither a directory nor a regular
 * file according to their Dirent are ignored.
 * @param {string} dir - absolute directory to list
 * @param {string} [rootRel=''] - path of `dir` relative to the scan root
 * @yields {{abs: string, rel: string}} absolute and root-relative file path
 */
function* walk(dir, rootRel = '') {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    // Best effort: a directory that cannot be listed contributes no files.
    return;
  }
  for (const ent of entries) {
    const abs = path.join(dir, ent.name);
    const rel = rootRel ? path.join(rootRel, ent.name) : ent.name;
    if (isExcluded(rel)) continue;
    if (ent.isDirectory()) {
      yield* walk(abs, rel);
    } else if (ent.isFile()) {
      yield { abs, rel };
    }
  }
}

/**
 * Builds the list of files to scan: everything walk() yields for each
 * existing INCLUDE_DIRS entry, followed by each INCLUDE_FILES entry that
 * exists as a regular file. INCLUDE_FILES entries are not passed through
 * isExcluded.
 * @returns {Array<{abs: string, rel: string}>}
 */
function collectTargets() {
  const targets = [];
  for (const d of INCLUDE_DIRS) {
    const abs = path.join(CLAUDE_ROOT, d);
    if (!fs.existsSync(abs)) continue;
    for (const f of walk(abs, d)) targets.push(f);
  }
  for (const f of INCLUDE_FILES) {
    const abs = path.join(CLAUDE_ROOT, f);
    if (fs.existsSync(abs) && fs.statSync(abs).isFile()) {
      targets.push({ abs, rel: f });
    }
  }
  return targets;
}

// ---------- Scanning a single file ----------

/**
 * Tells whether a rule should run on a file: a rule without a path
 * `whitelist` always applies; otherwise it applies only when no whitelist
 * pattern matches the relative path.
 * @param {{whitelist?: RegExp[]}} rule
 * @param {string} relPath
 * @returns {boolean}
 */
function ruleApplicable(rule, relPath) {
  if (!rule.whitelist) return true;
  return !rule.whitelist.some((re) => re.test(relPath));
}

/**
 * Scans one file against every applicable rule in RULES.
 * Returns an empty list (without reporting anything) when the file cannot be
 * stat'ed or read, is larger than MAX_BYTES, or has a binary extension.
 * @param {{abs: string, rel: string}} target - absolute and root-relative path
 * @returns {Array<{file: string, line: number, ruleId: string, severity: string,
 *   desc: string, match: string, snippet: string, replacement: (string|null)}>}
 *   one entry per match; `line` is 1-based
 */
function scanFile({ abs, rel }) {
  const findings = [];
  let stat;
  try {
    stat = fs.statSync(abs);
  } catch {
    return findings;
  }
  if (stat.size > MAX_BYTES) return findings;
  if (isBinary(abs)) return findings;

  let content;
  try {
    content = fs.readFileSync(abs, 'utf8');
  } catch {
    return findings;
  }

  const lines = content.split(/\r?\n/);

  // Pre-compute the start offset of every line (multiline rules use it to map
  // match.index back to a line number).
  const lineOffsets = [0];
  for (let i = 0; i < content.length; i++) {
    if (content[i] === '\n') lineOffsets.push(i + 1);
  }

  // Binary search for the last line whose start offset is <= off; 1-based result.
  const offsetToLine = (off) => {
    let lo = 0;
    let hi = lineOffsets.length - 1;
    while (lo < hi) {
      const mid = (lo + hi + 1) >> 1;
      if (lineOffsets[mid] <= off) lo = mid;
      else hi = mid - 1;
    }
    return lo + 1;
  };

  // A match is dropped when the rule has a matchWhitelist and any of its
  // patterns matches the matched text itself.
  const matchBlocked = (rule, matchText) => {
    if (!rule.matchWhitelist) return false;
    return rule.matchWhitelist.some((re) => re.test(matchText));
  };

  for (const rule of RULES) {
    if (!ruleApplicable(rule, rel)) continue;

    if (rule.multiline) {
      // Whole-file scan; the reported line is the one where the match starts.
      const matches = [...content.matchAll(rule.re)];
      for (const mm of matches) {
        if (matchBlocked(rule, mm[0])) continue;
        const lineNum = offsetToLine(mm.index);
        const snippet = (lines[lineNum - 1] || '').slice(0, 200);
        findings.push({
          file: rel,
          line: lineNum,
          ruleId: rule.id,
          severity: rule.severity,
          desc: rule.desc,
          // Matched text is truncated to 80 characters in the report.
          match: mm[0].slice(0, 80) + (mm[0].length > 80 ? '…' : ''),
          snippet,
          replacement: rule.replacement,
        });
      }
    } else {
      // Match line by line so the line number comes for free.
      lines.forEach((line, idx) => {
        const m = [...line.matchAll(rule.re)];
        for (const mm of m) {
          if (matchBlocked(rule, mm[0])) continue;
          findings.push({
            file: rel,
            line: idx + 1,
            ruleId: rule.id,
            severity: rule.severity,
            desc: rule.desc,
            match: mm[0],
            snippet: line.length > 200 ? line.slice(0, 200) + '…' : line,
            replacement: rule.replacement,
          });
        }
      });
    }
  }
  return findings;
}

// ---------- Main flow ----------

/**
 * CLI entry point. Collects the targets, scans them, and then either
 *   - with --json: writes the JSON report to stdout and nothing else, or
 *   - otherwise: prints a human-readable report and also writes the JSON
 *     report to scrubber-report.json next to this script.
 * With --apply it prints an error to stderr and exits with code 2.
 */
function main() {
  const args = new Set(process.argv.slice(2));
  const jsonOnly = args.has('--json');
  const apply = args.has('--apply');

  if (apply) {
    console.error('ERROR: --apply 未实现 (保持 dry-run 安全)。如需真改,请用户确认后再启用。');
    process.exit(2);
  }

  const started = Date.now();
  const targets = collectTargets();
  const allFindings = [];
  for (const t of targets) {
    allFindings.push(...scanFile(t));
  }
  const elapsed = Date.now() - started;

  // Statistics
  const bySev = { CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0 };
  const byRule = {};
  const byFile = {};
  for (const f of allFindings) {
    bySev[f.severity]++;
    byRule[f.ruleId] = (byRule[f.ruleId] || 0) + 1;
    byFile[f.file] = (byFile[f.file] || 0) + 1;
  }

  // The same JSON document is used for stdout (--json) and for the report file.
  const jsonReport = JSON.stringify({
    scannedFiles: targets.length,
    elapsedMs: elapsed,
    summary: { bySeverity: bySev, byRule, fileCount: Object.keys(byFile).length },
    findings: allFindings,
  }, null, 2);

  if (jsonOnly) {
    process.stdout.write(jsonReport);
    return;
  }

  // Human-readable report
  const report = [];
  report.push('=== SCRUBBER DRY-RUN REPORT ===');
  report.push(`扫描根目录: ${CLAUDE_ROOT}`);
  report.push(`扫描文件数: ${targets.length}`);
  report.push(`发现数: ${allFindings.length} (用时 ${elapsed}ms)`);
  report.push(`严重度分布: CRITICAL=${bySev.CRITICAL} HIGH=${bySev.HIGH} MEDIUM=${bySev.MEDIUM} LOW=${bySev.LOW}`);
  report.push(`命中文件数: ${Object.keys(byFile).length}`);
  report.push('');
  report.push('--- 规则命中 Top ---');
  Object.entries(byRule)
    .sort((a, b) => b[1] - a[1])
    .forEach(([id, n]) => report.push(` ${id.padEnd(28)} ${n}`));
  report.push('');
  report.push('--- 命中文件 Top 20 ---');
  Object.entries(byFile)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 20)
    .forEach(([f, n]) => report.push(` ${String(n).padStart(4)} ${f}`));
  report.push('');
  report.push('--- CRITICAL / HIGH 详单 (最多 40 条) ---');
  const critHigh = allFindings.filter((f) => f.severity === 'CRITICAL' || f.severity === 'HIGH');
  critHigh.slice(0, 40).forEach((f) => {
    report.push(`[${f.severity}] ${f.file}:${f.line} ${f.ruleId} → "${f.match}"`);
    report.push(` ${f.snippet.trim()}`);
  });
  if (critHigh.length > 40) report.push(` ... 还有 ${critHigh.length - 40} 条 CRITICAL/HIGH 未显示`);
  report.push('');
  report.push('=== END ===');

  console.log(report.join('\n'));

  // Also write the JSON report to scrubber-report.json beside this script
  const jsonPath = path.join(__dirname, 'scrubber-report.json');
  fs.writeFileSync(jsonPath, jsonReport);
  console.log(`\n完整 JSON 已写入: ${path.relative(CLAUDE_ROOT, jsonPath)}`);
}

main();