bookworm-smart-assistant/tools/scrubber.mjs

451 lines
13 KiB
JavaScript
Raw Permalink Normal View History

#!/usr/bin/env node
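// Bookworm Smart Assistant - pre-sync sensitive-content scanner
// Purpose: scan the code layer that is about to be pushed to Gitea and detect
//          credentials / paths / brand names / IPs / emails and similar sensitive data.
// Default is dry-run: a read-only scan that prints a report; scanned files are never
//          modified. Without --json the full report is also written to
//          tools/scrubber-report.json.
// Usage: node tools/scrubber.mjs [--json] [--apply]
//   --json   print JSON only (pipe-friendly)
//   --apply  really modify files (high risk, requires a second confirmation);
//            not implemented yet - the script prints an error and exits with code 2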
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// Directory that contains this script, derived from the module URL
// (ES modules do not provide a built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root of the tree to scan: the parent directory of this script's directory.
const CLAUDE_ROOT = path.resolve(__dirname, '..');
// ---------- Sync allowlist (keep consistent with the push script) ----------
// Top-level directory names (relative to the scan root) that are scanned recursively.
const INCLUDE_DIRS = [
  'agents', 'hooks', 'skills', 'lib',
  'scripts', 'constitution', 'docs', 'templates',
  'config', 'tests', 'tools',
];
// Individual top-level files (relative to the scan root) that are scanned.
const INCLUDE_FILES = [
  'CLAUDE.md', 'package.json',
  'feature-flags.json', 'feature-flags.json.sig',
  // The legacy 'integrity.sha256' is not synced (case-conflicts with the new INTEGRITY.sha256).
  'settings.template.json', 'settings.local.template.json',
  'SKILL-REGISTRY.md',
  'skills-index.json', 'skills-index-lite.json',
  'stats-compiled.json',
];
// Second-level exclusions: sub-paths that must still be skipped even though they
// live inside an allowlisted directory. Each pattern is tested against a relative path.
const EXCLUDE_SUBPATHS = [
/[\\/]_archived([\\/]|$)/i,
/[\\/]_deprecated([\\/]|$)/i,
/[\\/]node_modules([\\/]|$)/,
/[\\/]\.git([\\/]|$)/,
/[\\/]__pycache__([\\/]|$)/,
// P0.2: active-projects.md contains client domains + IPs; it is not pushed
/docs[\\/]active-projects\.md$/i,
// Decision Q1=B: test directories are not synced (fixtures contain real paths / fake keys)
/hooks[\\/]tests([\\/]|$)/i,
/hooks[\\/].*__tests__([\\/]|$)/i,
// Decision Q3.1: exclude the scrubber itself (its rule regexes would match themselves)
/tools[\\/]scrubber\.mjs$/i,
/tools[\\/]scrubber-report\.json$/i,
// Backup / temporary files (not synced)
/\.bak(\..+)?$/i,
/\.tmp(\..+)?$/i,
// Desensitization patch scripts contain the original strings as anchors (by design)
/scripts[\\/]patches[\\/]patch-sync-[^\\/]+\.js$/i,
// Machine-bound auto-sync configuration (not pushed)
/config[\\/]auto-sync-repos\.json$/i,
// Historical one-off script (already executed; not pushed)
/scripts[\\/]apply-settings-patch\.py$/i,
];
// Extensions (lower-case, with leading dot) of binary files that are skipped.
const BINARY_EXTS = new Set(
  [
    'png jpg jpeg gif webp ico bmp', // images
    'mp3 mp4 wav ogg webm', // audio / video
    'pdf zip tar gz 7z rar', // documents / archives
    'exe dll so dylib bin', // executables / libraries
    'woff woff2 ttf otf eot', // fonts
  ].flatMap((group) => group.split(' ').map((ext) => `.${ext}`)),
);
// Size ceiling in bytes (2 MiB); larger files are skipped.
const MAX_BYTES = 2 * 1024 ** 2;
// ---------- Sensitive-content rule definitions ----------
// Each rule is a plain object:
//   id             unique rule identifier reported with every finding
//   severity       CRITICAL > HIGH > MEDIUM > LOW
//   re             global regex that detects the sensitive text
//   desc           human-readable description (emitted in reports)
//   replacement    null means a human must decide; otherwise the suggested placeholder
//   matchWhitelist optional patterns tested against the MATCHED TEXT; a hit suppresses the finding
//   whitelist      optional patterns tested against the RELATIVE FILE PATH; a hit skips the rule
//   multiline      optional; when true the regex is run against the whole file content
//                  instead of line by line
const RULES = [
  // ===== CRITICAL: credentials / keys / tokens =====
  {
    id: 'ANTHROPIC_API_KEY',
    severity: 'CRITICAL',
    re: /sk-ant-[a-zA-Z0-9_\-]{20,}/g,
    desc: 'Anthropic API Key 明文',
    replacement: null,
    // Exclude obvious placeholder fixtures.
    matchWhitelist: [/very-long/i, /DUMMY/i, /placeholder/i, /example/i, /fake/i, /test[_\-]?key/i],
  },
  {
    id: 'OPENAI_API_KEY',
    severity: 'CRITICAL',
    // Two shapes: prefixed keys (sk-proj- / sk-svcacct- / sk-admin-), whose body may
    // contain '-' and '_', and the legacy form "sk-" + 32 or more alphanumerics.
    re: /\bsk-(?:(?:proj|svcacct|admin)-[A-Za-z0-9_\-]{20,}|[a-zA-Z0-9]{32,}\b)/g,
    desc: 'OpenAI/通用 sk- 密钥',
    replacement: null,
  },
  {
    id: 'GITHUB_TOKEN',
    severity: 'CRITICAL',
    // Classic prefixed tokens (ghp_/gho_/ghu_/ghs_/ghr_) plus fine-grained PATs (github_pat_).
    re: /\b(?:gh[pousr]_[A-Za-z0-9]{36,}|github_pat_[A-Za-z0-9_]{36,})\b/g,
    desc: 'GitHub PAT Token',
    replacement: null,
  },
  {
    id: 'GITEA_TOKEN',
    severity: 'CRITICAL',
    // Q3.3: require token/secret/authorization/key context to avoid false positives
    // on ETags and commit hashes.
    re: /\b(?:token|secret|authorization|api[_\-]?key|access[_\-]?token|bearer)["':= ]+[a-f0-9]{40}\b/gi,
    desc: 'Gitea/长 hex token (带上下文)',
    replacement: null,
    whitelist: [/\.sha256$/, /integrity/i, /CHANGELOG/i, /\.version$/],
  },
  {
    id: 'PRIVATE_KEY_PEM',
    severity: 'CRITICAL',
    // Q3.4: require an actual body (at least 40 base64 characters plus an END marker)
    // so that pure format descriptions are not reported. Covers the RSA, DSA, EC,
    // OPENSSH, ENCRYPTED and unlabelled "PRIVATE KEY" PEM labels.
    re: /-----BEGIN (?:RSA |DSA |EC |OPENSSH |ENCRYPTED |)PRIVATE KEY-----[\s\S]*?[A-Za-z0-9+/=]{40,}[\s\S]*?-----END/g,
    desc: '私钥 PEM 块 (含 body)',
    replacement: null,
    multiline: true,
  },
  {
    id: 'USER_EMAIL_PERSONAL',
    severity: 'CRITICAL',
    re: /timoteofatima283@gmail\.com/gi,
    desc: '用户个人邮箱',
    replacement: 'user@example.com',
  },
  {
    id: 'OLD_GITEA_PASSWORD',
    severity: 'CRITICAL',
    re: /\bmybio668\b/g,
    desc: '已轮换的 Gitea 旧密码 mybio668',
    replacement: null,
  },
  // ===== HIGH: hard-coded paths =====
  {
    id: 'HARDCODED_PATH_WIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+leesu/gi,
    desc: '硬编码 Windows 用户路径 C:\\Users\\leesu',
    replacement: '<CLAUDE_ROOT>',
  },
  {
    id: 'HARDCODED_PATH_MSYS',
    severity: 'HIGH',
    re: /\/c\/Users\/leesu/g,
    desc: '硬编码 MSYS/Git Bash 路径 /c/Users/leesu',
    replacement: '<CLAUDE_ROOT_MSYS>',
  },
  {
    id: 'HARDCODED_PATH_ADMIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+Administrator[\\/]+\.claude/gi,
    desc: '硬编码 Administrator .claude 路径',
    replacement: '<CLAUDE_ROOT>',
  },
  // ===== HIGH: private infrastructure IPs / hosts =====
  {
    id: 'INFRA_IP_XINLIN',
    severity: 'HIGH',
    re: /\b8\.138\.11\.105\b/g,
    desc: '阿里云鑫霖服务器 IP',
    replacement: '<SERVER_A_IP>',
  },
  {
    id: 'INFRA_IP_MINGYUAN',
    severity: 'HIGH',
    re: /\b8\.134\.58\.157\b/g,
    desc: '阿里云明远服务器 IP',
    replacement: '<SERVER_B_IP>',
  },
  {
    id: 'INFRA_IP_PROXY',
    severity: 'HIGH',
    re: /\b175\.29\.205\.124\b/g,
    desc: 'Claude 注册代理 IP',
    replacement: '<PROXY_IP>',
  },
  {
    id: 'GITEA_HOST',
    severity: 'HIGH',
    re: /code\.letcareme\.com/gi,
    desc: '私有 Gitea 主机名',
    replacement: '<GITEA_HOST>',
  },
  {
    id: 'PROXY_HOSTS',
    severity: 'HIGH',
    re: /\b(floppydata|kuailemon)\b[a-zA-Z0-9._\-]*/gi,
    desc: '代理服务商名称',
    replacement: '<PROXY_VENDOR>',
  },
  // ===== MEDIUM: client / project brand words =====
  {
    id: 'BRAND_MINGYUAN',
    severity: 'MEDIUM',
    re: /明远生物|鑫霖|mingyuan|mybioweb|mybiooa|mybiollm|mybiolearn/gi,
    desc: '客户/项目品牌词 (明远生物体系)',
    replacement: 'ExampleCorp',
  },
  {
    id: 'BRAND_VAX',
    severity: 'MEDIUM',
    re: /\b(vaxpolicy|vaxclinic|vaxcoldchain|vaxfuture)(?:\.(?:cn|com))?\b/gi,
    desc: 'vax 系列客户域名',
    replacement: 'ProjectX',
  },
  {
    id: 'BRAND_BOOTREPO',
    severity: 'MEDIUM',
    re: /bookworm-boot|bookworm-admin-private/gi,
    desc: '引导仓库/私密档案路径名',
    replacement: '<BOOT_REPO>',
  },
  // ===== LOW: information leakage that is not secret =====
  {
    id: 'CLIENT_DOMAIN_LETCAREME',
    severity: 'LOW',
    re: /letcareme\.com/gi,
    desc: '私域域名 letcareme.com',
    replacement: '<PRIVATE_DOMAIN>',
  },
];
// ---------- File traversal ----------
/**
 * Reports whether a relative path matches any pattern in EXCLUDE_SUBPATHS.
 * @param {string} relPath - path relative to the scan root
 * @returns {boolean} true when the path must be skipped
 */
function isExcluded(relPath) {
  for (const pattern of EXCLUDE_SUBPATHS) {
    if (pattern.test(relPath)) return true;
  }
  return false;
}
/**
 * Reports whether a file is treated as binary, judged only by its extension
 * (compared case-insensitively against BINARY_EXTS).
 * @param {string} filePath
 * @returns {boolean}
 */
function isBinary(filePath) {
  return BINARY_EXTS.has(path.extname(filePath).toLowerCase());
}
/**
 * Recursively yields every regular file below `dir`, skipping excluded paths.
 * Entries that are neither directories nor regular files are ignored.
 * @param {string} dir - absolute directory to read
 * @param {string} [rootRel=''] - relative path that corresponds to `dir`
 * @yields {{abs: string, rel: string}} absolute and relative path of each file
 */
function* walk(dir, rootRel = '') {
  let dirents;
  try {
    dirents = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    // Best effort: a directory that cannot be read contributes no files.
    return;
  }
  for (const dirent of dirents) {
    const rel = rootRel ? path.join(rootRel, dirent.name) : dirent.name;
    if (isExcluded(rel)) continue;
    const abs = path.join(dir, dirent.name);
    if (dirent.isDirectory()) {
      yield* walk(abs, rel);
      continue;
    }
    if (dirent.isFile()) yield { abs, rel };
  }
}
/**
 * Builds the list of files to scan: everything found by walking the existing
 * INCLUDE_DIRS directories, followed by each existing INCLUDE_FILES entry that
 * is a regular file.
 * @returns {Array<{abs: string, rel: string}>}
 */
function collectTargets() {
  const perDirectory = [];
  for (const dirName of INCLUDE_DIRS) {
    const dirAbs = path.join(CLAUDE_ROOT, dirName);
    if (!fs.existsSync(dirAbs)) continue;
    perDirectory.push(Array.from(walk(dirAbs, dirName)));
  }
  const found = perDirectory.flat();

  for (const fileName of INCLUDE_FILES) {
    const fileAbs = path.join(CLAUDE_ROOT, fileName);
    if (!fs.existsSync(fileAbs)) continue;
    if (!fs.statSync(fileAbs).isFile()) continue;
    found.push({ abs: fileAbs, rel: fileName });
  }
  return found;
}
// ---------- Single-file scanning ----------
/**
 * Decides whether a rule should run for a given file.
 * A rule without a `whitelist` always applies; otherwise it applies only when
 * none of the whitelist patterns matches the relative path.
 * @param {{whitelist?: RegExp[]}} rule
 * @param {string} relPath
 * @returns {boolean}
 */
function ruleApplicable(rule, relPath) {
  const exemptPaths = rule.whitelist;
  if (exemptPaths) {
    return exemptPaths.every((pattern) => !pattern.test(relPath));
  }
  return true;
}
/**
 * Scans one file against every applicable rule in RULES.
 *
 * Returns an empty list when the file cannot be stat'ed or read, is larger than
 * MAX_BYTES, or has a binary extension. Rules flagged `multiline` are matched
 * against the whole content; all other rules are matched line by line.
 *
 * @param {{abs: string, rel: string}} target - absolute and relative file path
 * @returns {Array<object>} findings with file, line (1-based), ruleId, severity,
 *   desc, match, snippet and replacement
 */
function scanFile({ abs, rel }) {
  const findings = [];

  let size;
  try {
    ({ size } = fs.statSync(abs));
  } catch {
    return findings;
  }
  if (size > MAX_BYTES || isBinary(abs)) return findings;

  let content;
  try {
    content = fs.readFileSync(abs, 'utf8');
  } catch {
    return findings;
  }

  const lines = content.split(/\r?\n/);

  // Start offset of every line, so a match index from a whole-content
  // (multiline) rule can be mapped back to a line number.
  const lineOffsets = [0];
  for (let nl = content.indexOf('\n'); nl !== -1; nl = content.indexOf('\n', nl + 1)) {
    lineOffsets.push(nl + 1);
  }

  // 1-based line number = number of line starts located at or before `off`.
  const offsetToLine = (off) => {
    let lo = 0;
    let hi = lineOffsets.length;
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (lineOffsets[mid] <= off) lo = mid + 1;
      else hi = mid;
    }
    return lo;
  };

  // True when the matched text itself hits one of the rule's matchWhitelist patterns.
  const isAllowedMatch = (rule, text) =>
    Boolean(rule.matchWhitelist) && rule.matchWhitelist.some((re) => re.test(text));

  const record = (rule, line, match, snippet) => {
    findings.push({
      file: rel,
      line,
      ruleId: rule.id,
      severity: rule.severity,
      desc: rule.desc,
      match,
      snippet,
      replacement: rule.replacement,
    });
  };

  for (const rule of RULES) {
    if (!ruleApplicable(rule, rel)) continue;

    if (rule.multiline) {
      // Whole-content scan; the reported match text is capped at 80 characters.
      for (const hit of content.matchAll(rule.re)) {
        const text = hit[0];
        if (isAllowedMatch(rule, text)) continue;
        const lineNum = offsetToLine(hit.index);
        const shown = text.length > 80 ? `${text.slice(0, 80)}…` : text;
        record(rule, lineNum, shown, (lines[lineNum - 1] || '').slice(0, 200));
      }
      continue;
    }

    // Line-by-line scan so the line number comes directly from the index.
    for (const [idx, line] of lines.entries()) {
      const snippet = line.length > 200 ? `${line.slice(0, 200)}…` : line;
      for (const hit of line.matchAll(rule.re)) {
        if (isAllowedMatch(rule, hit[0])) continue;
        record(rule, idx + 1, hit[0], snippet);
      }
    }
  }
  return findings;
}
// ---------- Main flow ----------
/**
 * CLI entry point.
 *
 * - `--apply` is rejected: an error is printed and the process exits with code 2.
 * - `--json` writes only the JSON result to stdout.
 * - Otherwise a human-readable report is printed and the same JSON is written
 *   to scrubber-report.json next to this script.
 */
function main() {
  const flags = new Set(process.argv.slice(2));
  if (flags.has('--apply')) {
    console.error('ERROR: --apply 未实现 (保持 dry-run 安全)。如需真改,请用户确认后再启用。');
    process.exit(2);
  }
  const jsonOnly = flags.has('--json');

  const started = Date.now();
  const targets = collectTargets();
  const allFindings = targets.flatMap((target) => scanFile(target));
  const elapsed = Date.now() - started;

  // Tallies per severity, per rule and per file.
  const bySev = { CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0 };
  const byRule = {};
  const byFile = {};
  for (const { severity, ruleId, file } of allFindings) {
    bySev[severity] += 1;
    byRule[ruleId] = (byRule[ruleId] || 0) + 1;
    byFile[file] = (byFile[file] || 0) + 1;
  }
  const hitFileCount = Object.keys(byFile).length;

  // The JSON document is identical for stdout (--json) and for the report file.
  const payload = JSON.stringify({
    scannedFiles: targets.length,
    elapsedMs: elapsed,
    summary: { bySeverity: bySev, byRule, fileCount: hitFileCount },
    findings: allFindings,
  }, null, 2);

  if (jsonOnly) {
    process.stdout.write(payload);
    return;
  }

  // Human-readable report.
  const byCountDesc = (a, b) => b[1] - a[1];
  const report = [
    '=== SCRUBBER DRY-RUN REPORT ===',
    `扫描根目录: ${CLAUDE_ROOT}`,
    `扫描文件数: ${targets.length}`,
    `发现数: ${allFindings.length} (用时 ${elapsed}ms)`,
    `严重度分布: CRITICAL=${bySev.CRITICAL} HIGH=${bySev.HIGH} MEDIUM=${bySev.MEDIUM} LOW=${bySev.LOW}`,
    `命中文件数: ${hitFileCount}`,
    '',
    '--- 规则命中 Top ---',
  ];
  for (const [id, n] of Object.entries(byRule).sort(byCountDesc)) {
    report.push(` ${id.padEnd(28)} ${n}`);
  }
  report.push('');
  report.push('--- 命中文件 Top 20 ---');
  for (const [f, n] of Object.entries(byFile).sort(byCountDesc).slice(0, 20)) {
    report.push(` ${String(n).padStart(4)} ${f}`);
  }
  report.push('');
  report.push('--- CRITICAL / HIGH 详单 (最多 40 条) ---');
  const critHigh = allFindings.filter((f) => ['CRITICAL', 'HIGH'].includes(f.severity));
  for (const f of critHigh.slice(0, 40)) {
    report.push(`[${f.severity}] ${f.file}:${f.line} ${f.ruleId} → "${f.match}"`);
    report.push(` ${f.snippet.trim()}`);
  }
  if (critHigh.length > 40) {
    report.push(` ... 还有 ${critHigh.length - 40} 条 CRITICAL/HIGH 未显示`);
  }
  report.push('');
  report.push('=== END ===');
  console.log(report.join('\n'));

  // Also persist the JSON next to this script (tools/scrubber-report.json).
  const jsonPath = path.join(__dirname, 'scrubber-report.json');
  fs.writeFileSync(jsonPath, payload);
  console.log(`\n完整 JSON 已写入: ${path.relative(CLAUDE_ROOT, jsonPath)}`);
}
// Entry point: run the scan as soon as this script is executed.
main();