#!/usr/bin/env node
// patch-x02-cjk-three-segment-sample.js
//
// P1: sampleBytesPerToken() samples only the first 8 KB of the file, which
//     makes the measured CJK share come out ~25% too low.
// Fix: sample three segments (first 8 KB + middle 8 KB + last 8 KB) and use
//      the byte-weighted average across them.
//
// Behaviour of this script:
//   - target missing            -> prints [SKIP], exit code 0
//   - sentinel already present  -> prints [SKIP], exit code 0
//   - old function not found    -> prints [ERROR], exit code 1 (nothing written)
//   - patched and verified      -> prints [DONE], exit code 0
//   - verification failed       -> prints [ERROR], exit code 1
'use strict';

const fs = require('fs');
const path = require('path');

// Marker comment embedded in the patched function; its presence means the
// patch has already been applied.
const SENTINEL = '// [PATCH-X02-THREE-SEGMENT-SAMPLE]';
const target = path.join(__dirname, '..', '..', 'hooks', 'context-pressure-monitor.js');

/**
 * Build a RegExp that matches `text` token-for-token while accepting any
 * non-empty run of whitespace wherever `text` contains whitespace.
 *
 * This keeps the "has the function been modified?" safety check (every
 * non-whitespace character must still match, in order) without failing on
 * CRLF line endings or re-indented code.
 *
 * @param {string} text - Reference source text to look for.
 * @returns {RegExp} Pattern matching `text` modulo whitespace differences.
 */
function buildWhitespaceTolerantPattern(text) {
  const pieces = text
    .trim()
    .split(/\s+/)
    .map((piece) => piece.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  return new RegExp(pieces.join('\\s+'));
}

if (!fs.existsSync(target)) {
  process.stdout.write('[SKIP] target not found\n');
  process.exit(0);
}

const content = fs.readFileSync(target, 'utf8');
if (content.includes(SENTINEL)) {
  process.stdout.write('[SKIP] patch-x02 already applied\n');
  process.exit(0);
}

// Text of the function being replaced. It is matched modulo whitespace (see
// buildWhitespaceTolerantPattern), so only the non-whitespace characters
// below matter. The comments inside are part of the target file's text and
// must stay exactly as they appear there.
const OLD_FN = `function sampleBytesPerToken(tp) {
  try {
    const fd = fs.openSync(tp, 'r');
    const buf = Buffer.alloc(8192);
    const n = fs.readSync(fd, buf, 0, 8192, 0);
    fs.closeSync(fd);
    if (n < 200) return BYTES_PER_TOKEN; // 样本太小, 用默认
    let cjkBytes = 0;
    // CJK Unified Ideographs (U+4E00-U+9FFF) UTF-8: E4-E9 起始的 3 字节序列
    // CJK ext A (U+3400-U+4DBF) UTF-8: E3 起始 + 第二字节 90-9F
    // 简化: 统计 0xE3-0xE9 起始的 3 字节序列首字节即可
    for (let i = 0; i < n; i++) {
      const b = buf[i];
      if (b >= 0xE3 && b <= 0xE9) cjkBytes += 3;
    }
    const cjkRatio = cjkBytes / n;
    if (cjkRatio >= 0.40) return 2.2;
    if (cjkRatio >= 0.15) return 2.8;
    return 3.5;
  } catch { return BYTES_PER_TOKEN; }
}`;

// Replacement function. The sentinel is a line comment, so it must sit on its
// own line. The descriptor is closed in `finally` so that a failing
// fstatSync/readSync cannot leak it.
const NEW_FN = `function sampleBytesPerToken(tp) {
  ${SENTINEL}
  let fd = null;
  try {
    fd = fs.openSync(tp, 'r');
    const fileSize = fs.fstatSync(fd).size;
    if (fileSize < 200) return BYTES_PER_TOKEN;
    const CHUNK = 8192;
    const offsets = [0];
    if (fileSize > CHUNK * 3) {
      offsets.push(Math.floor(fileSize / 2) - Math.floor(CHUNK / 2));
      offsets.push(Math.max(0, fileSize - CHUNK));
    } else if (fileSize > CHUNK) {
      offsets.push(Math.max(0, fileSize - CHUNK));
    }
    let totalBytes = 0, totalCjk = 0;
    const buf = Buffer.alloc(CHUNK);
    for (const off of offsets) {
      const n = fs.readSync(fd, buf, 0, CHUNK, off);
      if (n < 100) continue;
      let cjk = 0;
      for (let i = 0; i < n; i++) {
        const b = buf[i];
        if (b >= 0xE3 && b <= 0xE9) cjk += 3;
      }
      totalBytes += n;
      totalCjk += cjk;
    }
    if (totalBytes < 200) return BYTES_PER_TOKEN;
    const cjkRatio = totalCjk / totalBytes;
    if (cjkRatio >= 0.40) return 2.2;
    if (cjkRatio >= 0.15) return 2.8;
    return 3.5;
  } catch {
    return BYTES_PER_TOKEN;
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do */ }
    }
  }
}`;

const found = buildWhitespaceTolerantPattern(OLD_FN).exec(content);
if (found === null) {
  process.stdout.write('[ERROR] old function signature not found — file may have been modified\n');
  process.exit(1);
}

// Back up only once we know the file is about to change, and never overwrite
// an existing backup.
const bak = target + '.bak.x02';
if (!fs.existsSync(bak)) fs.writeFileSync(bak, content);

// Keep the target's line-ending convention, and splice by index rather than
// String.prototype.replace so that `$` sequences in the replacement text can
// never be interpreted as substitution patterns.
const oldText = found[0];
const replacement = oldText.includes('\r\n') ? NEW_FN.replace(/\n/g, '\r\n') : NEW_FN;
const patched =
  content.slice(0, found.index) + replacement + content.slice(found.index + oldText.length);
fs.writeFileSync(target, patched, 'utf8');

// Verify by re-reading the file from disk.
const verify = fs.readFileSync(target, 'utf8');
const ok = verify.includes(SENTINEL) && verify.includes('offsets.push(Math.floor(fileSize / 2)');
process.stdout.write(ok
  ? '[DONE] patch-x02 applied: three-segment CJK sampling\n'
  : '[ERROR] verification failed\n');
process.exit(ok ? 0 : 1);