451 lines
13 KiB
JavaScript
451 lines
13 KiB
JavaScript
|
|
#!/usr/bin/env node
|
||
|
|
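// Bookworm Smart Assistant - pre-sync sensitive-content scanner
// Purpose: scan the code layer that is about to be pushed to Gitea and detect
//          credentials, paths, brand names, IPs, e-mail addresses and similar sensitive data
// Default is dry-run: read-only scan + report; no scanned file is modified
//          (without --json the report is also written to tools/scrubber-report.json)
// Usage: node tools/scrubber.mjs [--json] [--apply]
//   --json   print JSON only (pipe-friendly)
//   --apply  really modify files (high risk, needs a second confirmation);
//            not implemented yet - the script currently exits with code 2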
|
||
|
|
|
||
|
|
import fs from 'node:fs';
|
||
|
|
import path from 'node:path';
|
||
|
|
import { fileURLToPath } from 'node:url';
|
||
|
|
|
||
|
|
// Directory containing this script (ES modules have no built-in __dirname).
const thisFilePath = fileURLToPath(import.meta.url);
const __dirname = path.dirname(thisFilePath);

// Scan root: the parent directory of the directory holding this script.
const CLAUDE_ROOT = path.resolve(__dirname, '..');
|
||
|
|
|
||
|
|
// ---------- Sync whitelist (kept in line with the push script) ----------
// Top-level directories, relative to CLAUDE_ROOT, that are walked recursively.
const INCLUDE_DIRS = [
  'agents', 'hooks', 'skills', 'lib',
  'scripts', 'constitution', 'docs', 'templates',
  'config', 'tests', 'tools',
];
|
||
|
|
// Individual files, relative to CLAUDE_ROOT, that are scanned in addition to INCLUDE_DIRS.
// The legacy 'integrity.sha256' is intentionally not listed: its name
// case-conflicts with the newer INTEGRITY.sha256.
const INCLUDE_FILES = [
  'CLAUDE.md',
  'package.json',
  'feature-flags.json', 'feature-flags.json.sig',
  'settings.template.json', 'settings.local.template.json',
  'SKILL-REGISTRY.md',
  'skills-index.json', 'skills-index-lite.json',
  'stats-compiled.json',
];
|
||
|
|
|
||
|
|
// Second-level exclusions: sub-paths inside the whitelisted directories that
// must still be skipped. Each pattern is tested against the relative path and
// accepts both '/' and '\' as separator.
const EXCLUDE_SUBPATHS = [
  // Archived / deprecated material and tooling directories
  /[\\/]_archived([\\/]|$)/i,
  /[\\/]_deprecated([\\/]|$)/i,
  /[\\/]node_modules([\\/]|$)/,
  /[\\/]\.git([\\/]|$)/,
  /[\\/]__pycache__([\\/]|$)/,

  // P0.2: active-projects.md contains customer domains and IPs, never pushed
  /docs[\\/]active-projects\.md$/i,

  // Decision Q1=B: hook test directories are not synced
  // (fixtures contain real paths / fake keys)
  /hooks[\\/]tests([\\/]|$)/i,
  /hooks[\\/].*__tests__([\\/]|$)/i,

  // Decision Q3.1: exclude the scrubber itself (its rule regexes would match
  // their own source) and the report it writes
  /tools[\\/]scrubber\.mjs$/i,
  /tools[\\/]scrubber-report\.json$/i,

  // Backup / temporary files (not synced)
  /\.bak(\..+)?$/i,
  /\.tmp(\..+)?$/i,

  // Redaction patch scripts keep the original strings as anchors (by design)
  /scripts[\\/]patches[\\/]patch-sync-[^\\/]+\.js$/i,

  // Machine-bound auto-sync configuration (not pushed)
  /config[\\/]auto-sync-repos\.json$/i,

  // Historical one-off script (already executed, not pushed)
  /scripts[\\/]apply-settings-patch\.py$/i,
];
|
||
|
|
|
||
|
|
// File extensions (lower-case, with leading dot) that are treated as binary
// and therefore never scanned.
const BINARY_EXTS = new Set([
  // images
  '.png', '.jpg', '.jpeg', '.gif', '.webp', '.ico', '.bmp',
  // audio / video
  '.mp3', '.mp4', '.wav', '.ogg', '.webm',
  // documents / archives
  '.pdf', '.zip', '.tar', '.gz', '.7z', '.rar',
  // executables / libraries / raw blobs
  '.exe', '.dll', '.so', '.dylib', '.bin',
  // fonts
  '.woff', '.woff2', '.ttf', '.otf', '.eot',
]);

// Files larger than this many bytes (2 MiB) are skipped.
const MAX_BYTES = 2 * 1024 ** 2;
|
||
|
|
|
||
|
|
// ---------- Sensitive-content rule definitions ----------
// Fields of each rule:
//   id             unique rule identifier used in reports
//   severity       CRITICAL > HIGH > MEDIUM > LOW
//   re             global regex that detects the sensitive text
//   desc           human-readable description shown in reports
//   replacement    suggested placeholder; null means a human must decide
//   whitelist      (optional) path patterns for which the rule is skipped
//   matchWhitelist (optional) patterns that, when found in the matched text,
//                  suppress that single hit
//   multiline      (optional) match against the whole file instead of per line
const RULES = [
  // ===== CRITICAL: credentials / keys / tokens =====
  {
    id: 'ANTHROPIC_API_KEY',
    severity: 'CRITICAL',
    re: /sk-ant-[a-zA-Z0-9_\-]{20,}/g,
    desc: 'Anthropic API Key 明文',
    replacement: null,
    // Suppress obvious placeholder fixtures
    matchWhitelist: [/very-long/i, /DUMMY/i, /placeholder/i, /example/i, /fake/i, /test[_\-]?key/i],
  },
  {
    id: 'OPENAI_API_KEY',
    severity: 'CRITICAL',
    // NOTE(review): requires 32+ alphanumerics directly after "sk-", so key
    // formats with further hyphenated segments are not matched by this rule.
    re: /\bsk-[a-zA-Z0-9]{32,}\b/g,
    desc: 'OpenAI/通用 sk- 密钥',
    replacement: null,
  },
  {
    id: 'GITHUB_TOKEN',
    severity: 'CRITICAL',
    re: /\bgh[pousr]_[A-Za-z0-9]{36,}\b/g,
    desc: 'GitHub PAT Token',
    replacement: null,
  },
  {
    id: 'GITEA_TOKEN',
    severity: 'CRITICAL',
    // Q3.3: a token/secret/authorization/key context is required so that
    // ETags and commit hashes do not produce false positives
    re: /\b(?:token|secret|authorization|api[_\-]?key|access[_\-]?token|bearer)["':= ]+[a-f0-9]{40}\b/gi,
    desc: 'Gitea/长 hex token (带上下文)',
    replacement: null,
    whitelist: [/\.sha256$/, /integrity/i, /CHANGELOG/i, /\.version$/],
  },
  {
    id: 'PRIVATE_KEY_PEM',
    severity: 'CRITICAL',
    // Q3.4: a real body is required (40+ base64 characters followed by an END
    // marker) so that pure format descriptions are not reported.
    // "DSA " is accepted alongside the other key-type prefixes so that
    // traditional DSA private key blocks are detected as well.
    re: /-----BEGIN (?:RSA |DSA |EC |OPENSSH |ENCRYPTED |)PRIVATE KEY-----[\s\S]*?[A-Za-z0-9+/=]{40,}[\s\S]*?-----END/g,
    desc: '私钥 PEM 块 (含 body)',
    replacement: null,
    multiline: true,
  },
  {
    id: 'USER_EMAIL_PERSONAL',
    severity: 'CRITICAL',
    re: /timoteofatima283@gmail\.com/gi,
    desc: '用户个人邮箱',
    replacement: 'user@example.com',
  },
  {
    id: 'OLD_GITEA_PASSWORD',
    severity: 'CRITICAL',
    re: /\bmybio668\b/g,
    desc: '已轮换的 Gitea 旧密码 mybio668',
    replacement: null,
  },

  // ===== HIGH: hard-coded paths =====
  {
    id: 'HARDCODED_PATH_WIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+leesu/gi,
    desc: '硬编码 Windows 用户路径 C:\\Users\\leesu',
    replacement: '<CLAUDE_ROOT>',
  },
  {
    id: 'HARDCODED_PATH_MSYS',
    severity: 'HIGH',
    // Case-insensitive like the Windows path rules, so /C/Users/... and
    // /c/users/... spellings are caught as well.
    re: /\/c\/Users\/leesu/gi,
    desc: '硬编码 MSYS/Git Bash 路径 /c/Users/leesu',
    replacement: '<CLAUDE_ROOT_MSYS>',
  },
  {
    id: 'HARDCODED_PATH_ADMIN',
    severity: 'HIGH',
    re: /C:[\\/]+Users[\\/]+Administrator[\\/]+\.claude/gi,
    desc: '硬编码 Administrator .claude 路径',
    replacement: '<CLAUDE_ROOT>',
  },

  // ===== HIGH: private infrastructure IPs / hosts =====
  {
    id: 'INFRA_IP_XINLIN',
    severity: 'HIGH',
    re: /\b8\.138\.11\.105\b/g,
    desc: '阿里云鑫霖服务器 IP',
    replacement: '<SERVER_A_IP>',
  },
  {
    id: 'INFRA_IP_MINGYUAN',
    severity: 'HIGH',
    re: /\b8\.134\.58\.157\b/g,
    desc: '阿里云明远服务器 IP',
    replacement: '<SERVER_B_IP>',
  },
  {
    id: 'INFRA_IP_PROXY',
    severity: 'HIGH',
    re: /\b175\.29\.205\.124\b/g,
    desc: 'Claude 注册代理 IP',
    replacement: '<PROXY_IP>',
  },
  {
    id: 'GITEA_HOST',
    severity: 'HIGH',
    re: /code\.letcareme\.com/gi,
    desc: '私有 Gitea 主机名',
    replacement: '<GITEA_HOST>',
  },
  {
    id: 'PROXY_HOSTS',
    severity: 'HIGH',
    re: /\b(floppydata|kuailemon)\b[a-zA-Z0-9._\-]*/gi,
    desc: '代理服务商名称',
    replacement: '<PROXY_VENDOR>',
  },

  // ===== MEDIUM: customer / project brand words =====
  {
    id: 'BRAND_MINGYUAN',
    severity: 'MEDIUM',
    re: /明远生物|鑫霖|mingyuan|mybioweb|mybiooa|mybiollm|mybiolearn/gi,
    desc: '客户/项目品牌词 (明远生物体系)',
    replacement: 'ExampleCorp',
  },
  {
    id: 'BRAND_VAX',
    severity: 'MEDIUM',
    re: /\b(vaxpolicy|vaxclinic|vaxcoldchain|vaxfuture)(?:\.(?:cn|com))?\b/gi,
    desc: 'vax 系列客户域名',
    replacement: 'ProjectX',
  },
  {
    id: 'BRAND_BOOTREPO',
    severity: 'MEDIUM',
    re: /bookworm-boot|bookworm-admin-private/gi,
    desc: '引导仓库/私密档案路径名',
    replacement: '<BOOT_REPO>',
  },

  // ===== LOW: information disclosure, but not secret =====
  {
    id: 'CLIENT_DOMAIN_LETCAREME',
    severity: 'LOW',
    re: /letcareme\.com/gi,
    desc: '私域域名 letcareme.com',
    replacement: '<PRIVATE_DOMAIN>',
  },
];
|
||
|
|
|
||
|
|
// ---------- 文件遍历 ----------
|
||
|
|
/**
 * Tell whether a relative path hits any pattern in EXCLUDE_SUBPATHS.
 *
 * @param {string} relPath - path relative to the scan root.
 * @returns {boolean} true when at least one exclusion pattern matches.
 */
function isExcluded(relPath) {
  for (const pattern of EXCLUDE_SUBPATHS) {
    if (pattern.test(relPath)) {
      return true;
    }
  }
  return false;
}
|
||
|
|
|
||
|
|
/**
 * Tell whether a file should be treated as binary, judged only by its
 * extension (compared case-insensitively against BINARY_EXTS).
 *
 * @param {string} filePath - path or file name to classify.
 * @returns {boolean} true when the extension is listed in BINARY_EXTS.
 */
function isBinary(filePath) {
  const normalizedExt = path.extname(filePath).toLowerCase();
  const listedAsBinary = BINARY_EXTS.has(normalizedExt);
  return listedAsBinary;
}
|
||
|
|
|
||
|
|
/**
 * Recursively yield every regular file below `dir`, skipping any entry whose
 * relative path is rejected by isExcluded().
 *
 * A directory that cannot be listed is reported on stderr and skipped, so a
 * coverage gap in the scan is visible instead of silent; the walk itself
 * continues with the remaining directories.
 *
 * @param {string} dir - absolute directory to list.
 * @param {string} [rootRel=''] - relative path of `dir`, used as prefix for
 *   the relative paths of its entries.
 * @yields {{abs: string, rel: string}} absolute and relative path of each file.
 */
function* walk(dir, rootRel = '') {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (err) {
    // Best effort: keep scanning, but make the unscanned directory visible.
    // stderr is used so that --json output on stdout stays clean.
    console.error(`WARN: 无法读取目录 ${rootRel || dir}, 已跳过: ${err.message}`);
    return;
  }

  for (const ent of entries) {
    const abs = path.join(dir, ent.name);
    const rel = rootRel ? path.join(rootRel, ent.name) : ent.name;
    if (isExcluded(rel)) continue;

    if (ent.isDirectory()) {
      yield* walk(abs, rel);
    } else if (ent.isFile()) {
      // Entries that are neither directory nor regular file are ignored.
      yield { abs, rel };
    }
  }
}
|
||
|
|
|
||
|
|
/**
 * Build the list of files to scan: everything walk() yields for each existing
 * directory in INCLUDE_DIRS, followed by every entry of INCLUDE_FILES that
 * exists as a regular file. All paths are resolved against CLAUDE_ROOT.
 *
 * @returns {Array<{abs: string, rel: string}>} targets, directories first.
 */
function collectTargets() {
  const fromDirs = INCLUDE_DIRS.flatMap((dirName) => {
    const dirAbs = path.join(CLAUDE_ROOT, dirName);
    return fs.existsSync(dirAbs) ? [...walk(dirAbs, dirName)] : [];
  });

  const fromFiles = INCLUDE_FILES
    .map((fileName) => ({ abs: path.join(CLAUDE_ROOT, fileName), rel: fileName }))
    .filter(({ abs }) => fs.existsSync(abs) && fs.statSync(abs).isFile());

  return [...fromDirs, ...fromFiles];
}
|
||
|
|
|
||
|
|
// ---------- 扫描单文件 ----------
|
||
|
|
/**
 * Decide whether a rule should be evaluated for a given file.
 *
 * @param {{whitelist?: RegExp[]}} rule - rule whose optional `whitelist`
 *   holds path patterns that exempt a file from the rule.
 * @param {string} relPath - relative path of the file being scanned.
 * @returns {boolean} false when any whitelist pattern matches the path;
 *   true otherwise, including when the rule has no whitelist.
 */
function ruleApplicable(rule, relPath) {
  const exemptPaths = rule.whitelist;
  if (!exemptPaths) {
    return true;
  }
  // Applicable only if none of the exemption patterns matches.
  return exemptPaths.every((pattern) => !pattern.test(relPath));
}
|
||
|
|
|
||
|
|
/**
 * Scan a single file against every applicable entry of RULES.
 *
 * Files larger than MAX_BYTES or with a binary extension are skipped. Rules
 * flagged `multiline` are matched against the whole file content; all other
 * rules are matched line by line. A file that cannot be stat'ed or read is
 * reported on stderr and yields no findings.
 *
 * @param {{abs: string, rel: string}} target - `abs` is used for file I/O,
 *   `rel` for rule whitelists and for the `file` field of each finding.
 * @returns {Array<object>} findings, each with the keys file, line, ruleId,
 *   severity, desc, match, snippet and replacement.
 */
function scanFile({ abs, rel }) {
  const SNIPPET_MAX = 200; // max characters of the source line kept in a finding
  const MATCH_MAX = 80; // max characters of a multiline match kept in a finding
  const findings = [];

  let stat;
  try {
    stat = fs.statSync(abs);
  } catch (err) {
    // Best effort, but never silent: an unscanned file is a coverage gap.
    console.error(`WARN: 无法读取文件状态 ${rel}, 未扫描: ${err.message}`);
    return findings;
  }
  if (stat.size > MAX_BYTES) return findings;
  if (isBinary(abs)) return findings;

  let content;
  try {
    content = fs.readFileSync(abs, 'utf8');
  } catch (err) {
    console.error(`WARN: 无法读取文件 ${rel}, 未扫描: ${err.message}`);
    return findings;
  }
  const lines = content.split(/\r?\n/);

  // Cut `text` to `max` characters and mark the cut with an ellipsis.
  const clip = (text, max) => (text.length > max ? `${text.slice(0, max)}…` : text);

  // Start offset of every line, built on first use: only hits of multiline
  // rules need to map a match offset back to a line number.
  let lineOffsets = null;
  const offsetToLine = (off) => {
    if (lineOffsets === null) {
      lineOffsets = [0];
      for (let nl = content.indexOf('\n'); nl !== -1; nl = content.indexOf('\n', nl + 1)) {
        lineOffsets.push(nl + 1);
      }
    }
    // Binary search for the last line whose start offset is <= off.
    let lo = 0;
    let hi = lineOffsets.length - 1;
    while (lo < hi) {
      const mid = (lo + hi + 1) >> 1;
      if (lineOffsets[mid] <= off) lo = mid;
      else hi = mid - 1;
    }
    return lo + 1; // 1-based line number
  };

  // True when the matched text itself is whitelisted for this rule.
  const matchBlocked = (rule, matchText) => {
    if (!rule.matchWhitelist) return false;
    return rule.matchWhitelist.some((re) => re.test(matchText));
  };

  // Append one finding; shared by both matching modes so their shape agrees.
  const record = (rule, lineNum, matchText, snippet) => {
    findings.push({
      file: rel,
      line: lineNum,
      ruleId: rule.id,
      severity: rule.severity,
      desc: rule.desc,
      match: matchText,
      snippet,
      replacement: rule.replacement,
    });
  };

  for (const rule of RULES) {
    if (!ruleApplicable(rule, rel)) continue;

    if (rule.multiline) {
      // Whole-content scan; the reported line is where the match starts.
      for (const hit of content.matchAll(rule.re)) {
        if (matchBlocked(rule, hit[0])) continue;
        const lineNum = offsetToLine(hit.index);
        record(
          rule,
          lineNum,
          clip(hit[0], MATCH_MAX),
          clip(lines[lineNum - 1] || '', SNIPPET_MAX),
        );
      }
      continue;
    }

    // Line-by-line scan so the line number comes for free.
    for (const [idx, line] of lines.entries()) {
      for (const hit of line.matchAll(rule.re)) {
        if (matchBlocked(rule, hit[0])) continue;
        record(rule, idx + 1, hit[0], clip(line, SNIPPET_MAX));
      }
    }
  }
  return findings;
}
|
||
|
|
|
||
|
|
// ---------- 主流程 ----------
|
||
|
|
/**
 * CLI entry point.
 *
 * `--apply` is rejected with an error message and exit code 2. Otherwise all
 * targets from collectTargets() are scanned with scanFile(). With `--json`
 * the result is written to stdout as JSON and nothing else happens; without
 * it a human-readable report is printed and the same JSON is written to
 * scrubber-report.json next to this script.
 */
function main() {
  const cliFlags = process.argv.slice(2);
  if (cliFlags.includes('--apply')) {
    console.error('ERROR: --apply 未实现 (保持 dry-run 安全)。如需真改,请用户确认后再启用。');
    process.exit(2);
  }
  const jsonOnly = cliFlags.includes('--json');

  // Scan
  const t0 = Date.now();
  const targets = collectTargets();
  const allFindings = targets.flatMap((target) => scanFile(target));
  const elapsedMs = Date.now() - t0;

  // Tallies per severity, per rule and per file
  const bySeverity = { CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0 };
  const byRule = {};
  const byFile = {};
  allFindings.forEach(({ severity, ruleId, file }) => {
    bySeverity[severity]++;
    byRule[ruleId] = (byRule[ruleId] || 0) + 1;
    byFile[file] = (byFile[file] || 0) + 1;
  });
  const hitFileCount = Object.keys(byFile).length;

  // The same payload serves stdout (--json) and the report file.
  const payloadJson = JSON.stringify({
    scannedFiles: targets.length,
    elapsedMs,
    summary: { bySeverity, byRule, fileCount: hitFileCount },
    findings: allFindings,
  }, null, 2);

  if (jsonOnly) {
    process.stdout.write(payloadJson);
    return;
  }

  // Human-readable report
  const byCountDesc = (a, b) => b[1] - a[1];
  const ruleLines = Object.entries(byRule)
    .sort(byCountDesc)
    .map(([id, n]) => ` ${id.padEnd(28)} ${n}`);
  const fileLines = Object.entries(byFile)
    .sort(byCountDesc)
    .slice(0, 20)
    .map(([f, n]) => ` ${String(n).padStart(4)} ${f}`);
  const critHigh = allFindings.filter(
    ({ severity }) => severity === 'CRITICAL' || severity === 'HIGH',
  );
  const detailLines = critHigh.slice(0, 40).flatMap((f) => [
    `[${f.severity}] ${f.file}:${f.line} ${f.ruleId} → "${f.match}"`,
    ` ${f.snippet.trim()}`,
  ]);
  const overflowLines = critHigh.length > 40
    ? [` ... 还有 ${critHigh.length - 40} 条 CRITICAL/HIGH 未显示`]
    : [];

  const report = [
    '=== SCRUBBER DRY-RUN REPORT ===',
    `扫描根目录: ${CLAUDE_ROOT}`,
    `扫描文件数: ${targets.length}`,
    `发现数: ${allFindings.length} (用时 ${elapsedMs}ms)`,
    `严重度分布: CRITICAL=${bySeverity.CRITICAL} HIGH=${bySeverity.HIGH} MEDIUM=${bySeverity.MEDIUM} LOW=${bySeverity.LOW}`,
    `命中文件数: ${hitFileCount}`,
    '',
    '--- 规则命中 Top ---',
    ...ruleLines,
    '',
    '--- 命中文件 Top 20 ---',
    ...fileLines,
    '',
    '--- CRITICAL / HIGH 详单 (最多 40 条) ---',
    ...detailLines,
    ...overflowLines,
    '',
    '=== END ===',
  ];
  console.log(report.join('\n'));

  // Also persist the JSON next to this script (tools/scrubber-report.json)
  const jsonPath = path.join(__dirname, 'scrubber-report.json');
  fs.writeFileSync(jsonPath, payloadJson);
  console.log(`\n完整 JSON 已写入: ${path.relative(CLAUDE_ROOT, jsonPath)}`);
}
|
||
|
|
|
||
|
|
main();
|