bookworm-smart-assistant/scripts/patches/patch-x02-cjk-three-segment-sample.js

98 lines
3.2 KiB
JavaScript
Raw Permalink Normal View History

#!/usr/bin/env node
// patch-x02-cjk-three-segment-sample.js
// P1: sampleBytesPerToken() 仅采样头8KB导致CJK占比偏低~25%
// 修复: 三段采样(头8KB + 中8KB + 尾8KB)加权平均
'use strict';
const fs = require('fs');
const path = require('path');
// Marker embedded in the patched function; finding it in the target means
// the patch has already been applied.
const SENTINEL = '// [PATCH-X02-THREE-SEGMENT-SAMPLE]';
// The hook file to patch, resolved two directories above this script under hooks/.
const target = path.join(__dirname, '..', '..', 'hooks', 'context-pressure-monitor.js');

// Print a notice and exit with status 0: a missing target or an
// already-applied patch is treated as a skip, not a failure.
const skipWith = (notice) => {
  process.stdout.write(notice);
  process.exit(0);
};

if (fs.existsSync(target) === false) {
  skipWith('[SKIP] target not found\n');
}

let content = fs.readFileSync(target, 'utf8');
if (content.indexOf(SENTINEL) !== -1) {
  skipWith('[SKIP] patch-x02 already applied\n');
}

// Backup: write the unpatched content next to the target, but never
// overwrite a backup that already exists.
const backupPath = target + '.bak.x02';
const backupExists = fs.existsSync(backupPath);
if (!backupExists) {
  fs.writeFileSync(backupPath, content);
}
// Exact source text of the pre-patch sampleBytesPerToken() that this script
// searches for in the target file. It is located with a literal substring
// match, so every character inside the template literal (whitespace, line
// breaks and the embedded comments included) is part of the match pattern and
// must not be edited. The old implementation reads only the first 8192 bytes
// of the file and derives the bytes-per-token estimate from the share of
// bytes that belong to 3-byte sequences starting with 0xE3-0xE9.
const OLD_FN = `function sampleBytesPerToken(tp) {
try {
const fd = fs.openSync(tp, 'r');
const buf = Buffer.alloc(8192);
const n = fs.readSync(fd, buf, 0, 8192, 0);
fs.closeSync(fd);
if (n < 200) return BYTES_PER_TOKEN; // 样本太小, 用默认
let cjkBytes = 0;
// CJK Unified Ideographs (U+4E00-U+9FFF) UTF-8: E4-E9 起始的 3 字节序列
// CJK ext A (U+3400-U+4DBF) UTF-8: E3 起始 + 第二字节 90-9F
// 简化: 统计 0xE3-0xE9 起始的 3 字节序列首字节即可
for (let i = 0; i < n; i++) {
const b = buf[i];
if (b >= 0xE3 && b <= 0xE9) cjkBytes += 3;
}
const cjkRatio = cjkBytes / n;
if (cjkRatio >= 0.40) return 2.2;
if (cjkRatio >= 0.15) return 2.8;
return 3.5;
} catch { return BYTES_PER_TOKEN; }
}`;
// Replacement source for sampleBytesPerToken(), written into the target file
// in place of the old function. Instead of sampling only the first 8 KiB it
// samples up to three 8 KiB chunks (head, middle, tail) and computes one
// CJK-byte ratio over all sampled bytes, so longer chunks weigh more.
//
// Constraints on this text:
//  - The first line carries SENTINEL, which the script uses to detect an
//    already-patched file.
//  - The substring "offsets.push(Math.floor(fileSize / 2)" is checked by the
//    post-write verification and must stay as written.
//  - Keep it free of "$" characters and backticks: it lives in a template
//    literal and is handed to String.prototype.replace.
//
// Differences from a plain three-chunk read:
//  - The descriptor is closed in a finally block, so it is released even when
//    fstatSync/readSync throws (the error still yields BYTES_PER_TOKEN).
//  - For files between one and two chunks long the tail starts at CHUNK at the
//    earliest, so bytes already covered by the head chunk are not counted twice.
const NEW_FN = `function sampleBytesPerToken(tp) { ${SENTINEL}
  let fd = null;
  try {
    fd = fs.openSync(tp, 'r');
    const fileSize = fs.fstatSync(fd).size;
    if (fileSize < 200) return BYTES_PER_TOKEN; // sample too small, use the default
    const CHUNK = 8192;
    const offsets = [0];
    if (fileSize > CHUNK * 3) {
      offsets.push(Math.floor(fileSize / 2) - Math.floor(CHUNK / 2));
      offsets.push(Math.max(0, fileSize - CHUNK));
    } else if (fileSize > CHUNK) {
      // Never start the tail inside the head chunk: overlapping bytes would be counted twice.
      offsets.push(Math.max(CHUNK, fileSize - CHUNK));
    }
    let totalBytes = 0;
    let totalCjk = 0;
    const buf = Buffer.alloc(CHUNK);
    for (const off of offsets) {
      const n = fs.readSync(fd, buf, 0, CHUNK, off);
      if (n < 100) continue; // ignore chunks too short to be representative
      // Same heuristic as before: each lead byte 0xE3-0xE9 counts as one 3-byte sequence.
      for (let i = 0; i < n; i++) {
        const b = buf[i];
        if (b >= 0xE3 && b <= 0xE9) totalCjk += 3;
      }
      totalBytes += n;
    }
    if (totalBytes < 200) return BYTES_PER_TOKEN;
    const cjkRatio = totalCjk / totalBytes;
    if (cjkRatio >= 0.40) return 2.2;
    if (cjkRatio >= 0.15) return 2.8;
    return 3.5;
  } catch {
    return BYTES_PER_TOKEN;
  } finally {
    if (fd !== null) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do if close fails */ }
    }
  }
}`;
// Refuse to patch unless the target still contains the exact pre-patch
// function text; anything else means the file has diverged from what this
// patch was written against.
if (!content.includes(OLD_FN)) {
  process.stdout.write('[ERROR] old function signature not found — file may have been modified\n');
  process.exit(1);
}

// Swap the first occurrence of the old function for the new one. A replacer
// function is used instead of a replacement string so the new source is
// inserted verbatim: with a string, sequences such as "$&" or "$1" inside the
// injected code would be interpreted as replacement patterns.
content = content.replace(OLD_FN, () => NEW_FN);
fs.writeFileSync(target, content, 'utf8');

// Verification: re-read the file from disk and confirm that both the sentinel
// and a line unique to the three-segment implementation are present.
const verify = fs.readFileSync(target, 'utf8');
const ok = verify.includes(SENTINEL) && verify.includes('offsets.push(Math.floor(fileSize / 2)');
process.stdout.write(ok ? '[DONE] patch-x02 applied: three-segment CJK sampling\n' : '[ERROR] verification failed\n');
process.exit(ok ? 0 : 1);