#!/usr/bin/env node
// patch-x02-cjk-three-segment-sample.js
// P1: sampleBytesPerToken() 仅采样头8KB导致CJK占比偏低~25%
// 修复: 三段采样(头8KB + 中8KB + 尾8KB)加权平均
'use strict';

const fs = require('fs');
const path = require('path');

// Marker comment that the patched function carries on its first line; finding
// it in the target means this patch has already been applied.
const SENTINEL = '// [PATCH-X02-THREE-SEGMENT-SAMPLE]';

// File to patch, resolved two directories above this script's own directory.
const target = path.join(__dirname, '..', '..', 'hooks', 'context-pressure-monitor.js');

// Nothing to patch: report and exit successfully.
if (!fs.existsSync(target)) {
  process.stdout.write('[SKIP] target not found\n');
  process.exit(0);
}

let content = fs.readFileSync(target, 'utf8');

// Already patched: report and exit successfully, leaving the file untouched.
if (content.includes(SENTINEL)) {
  process.stdout.write('[SKIP] patch-x02 already applied\n');
  process.exit(0);
}

// Backup: copy the file byte-for-byte rather than re-encoding the decoded
// string, so the backup is exact even if the file contains bytes that are not
// valid UTF-8. COPYFILE_EXCL makes "create only if missing" a single atomic
// step, so an existing backup from an earlier run is never overwritten.
const bak = target + '.bak.x02';
try {
  fs.copyFileSync(target, bak, fs.constants.COPYFILE_EXCL);
} catch (err) {
  // An existing backup is expected on re-runs; anything else is a real failure.
  if (err.code !== 'EEXIST') throw err;
}

// Text of the unpatched sampleBytesPerToken() that this script looks for in
// the target file. Everything between the backticks is runtime data, including
// the comments inside it, so it is reproduced verbatim rather than translated.
// NOTE(review): the two-space indentation below is an assumption; confirm it
// against hooks/context-pressure-monitor.js, because an exact-string search
// only succeeds when the whitespace matches byte-for-byte.
const OLD_FN = `function sampleBytesPerToken(tp) {
  try {
    const fd = fs.openSync(tp, 'r');
    const buf = Buffer.alloc(8192);
    const n = fs.readSync(fd, buf, 0, 8192, 0);
    fs.closeSync(fd);
    if (n < 200) return BYTES_PER_TOKEN; // 样本太小, 用默认
    let cjkBytes = 0;
    // CJK Unified Ideographs (U+4E00-U+9FFF) UTF-8: E4-E9 起始的 3 字节序列
    // CJK ext A (U+3400-U+4DBF) UTF-8: E3 起始 + 第二字节 90-9F
    // 简化: 统计 0xE3-0xE9 起始的 3 字节序列首字节即可
    for (let i = 0; i < n; i++) {
      const b = buf[i];
      if (b >= 0xE3 && b <= 0xE9) cjkBytes += 3;
    }
    const cjkRatio = cjkBytes / n;
    if (cjkRatio >= 0.40) return 2.2;
    if (cjkRatio >= 0.15) return 2.8;
    return 3.5;
  } catch { return BYTES_PER_TOKEN; }
}`;

// Replacement source for sampleBytesPerToken(). Instead of reading only the
// first 8 KiB it samples up to three 8 KiB windows (head, middle, tail) and
// pools their byte counts before computing the CJK ratio. The first line
// carries SENTINEL so a later run can detect that the patch is in place.
// Compared with a straight three-window port, the generated code also:
//   - closes the descriptor in `finally`, so a failing fstat/read cannot leak it;
//   - starts the tail window no earlier than CHUNK for files between one and
//     two chunks long, so head and tail never overlap and double-count bytes.
// The generated code refers to `fs` and `BYTES_PER_TOKEN`, which are assumed
// to be in scope in the target file (the function being replaced uses both).
const NEW_FN = `function sampleBytesPerToken(tp) { ${SENTINEL}
  let fd;
  try {
    fd = fs.openSync(tp, 'r');
    const fileSize = fs.fstatSync(fd).size;
    if (fileSize < 200) return BYTES_PER_TOKEN;
    const CHUNK = 8192;
    const offsets = [0];
    if (fileSize > CHUNK * 3) {
      offsets.push(Math.floor(fileSize / 2) - Math.floor(CHUNK / 2));
      offsets.push(Math.max(0, fileSize - CHUNK));
    } else if (fileSize > CHUNK) {
      // Clamp to CHUNK so the tail window never overlaps the head window.
      offsets.push(Math.max(CHUNK, fileSize - CHUNK));
    }
    let totalBytes = 0, totalCjk = 0;
    const buf = Buffer.alloc(CHUNK);
    for (const off of offsets) {
      const n = fs.readSync(fd, buf, 0, CHUNK, off);
      if (n < 100) continue;
      let cjk = 0;
      for (let i = 0; i < n; i++) {
        const b = buf[i];
        if (b >= 0xE3 && b <= 0xE9) cjk += 3;
      }
      totalBytes += n;
      totalCjk += cjk;
    }
    if (totalBytes < 200) return BYTES_PER_TOKEN;
    const cjkRatio = totalCjk / totalBytes;
    if (cjkRatio >= 0.40) return 2.2;
    if (cjkRatio >= 0.15) return 2.8;
    return 3.5;
  } catch {
    return BYTES_PER_TOKEN;
  } finally {
    if (fd !== undefined) {
      try { fs.closeSync(fd); } catch { /* nothing useful to do if close fails */ }
    }
  }
}`;

/**
 * Escape every regular-expression metacharacter in `text` so it can be
 * embedded in a pattern and match literally.
 * @param {string} text - Literal text.
 * @returns {string} `text` with metacharacters backslash-escaped.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Build a pattern that matches `snippet` line by line while ignoring
 * differences in indentation, trailing whitespace, the width of whitespace
 * runs inside a line, and LF versus CRLF line endings.
 * @param {string} snippet - Multi-line source text to look for.
 * @returns {RegExp} Non-global pattern matching one occurrence of the snippet.
 */
function buildWhitespaceTolerantPattern(snippet) {
  const linePatterns = snippet
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== '')
    .map((line) => line.split(/\s+/).map(escapeRegExp).join('[ \\t]+'));
  return new RegExp(linePatterns.join('[ \\t]*\\r?\\n[ \\t]*'));
}

// Locate the old function by its tokens rather than by exact bytes, so a CRLF
// checkout or a re-indented target is still recognised.
const oldFnPattern = buildWhitespaceTolerantPattern(OLD_FN);

if (!oldFnPattern.test(content)) {
  process.stdout.write('[ERROR] old function signature not found — file may have been modified\n');
  // exitCode (not process.exit) lets pending stdout data flush before exit.
  process.exitCode = 1;
} else {
  // Keep the target's line-ending convention in the inserted code.
  const eol = content.includes('\r\n') ? '\r\n' : '\n';
  const replacement = NEW_FN.replace(/\r?\n/g, eol);

  // A replacer function inserts the text verbatim; a plain string replacement
  // would have any "$&", "$1", "$$" sequences interpreted.
  content = content.replace(oldFnPattern, () => replacement);
  fs.writeFileSync(target, content, 'utf8');

  // Verify: re-read from disk and check for both the sentinel and a line that
  // only the three-segment implementation contains.
  const verify = fs.readFileSync(target, 'utf8');
  const ok = verify.includes(SENTINEL) && verify.includes('offsets.push(Math.floor(fileSize / 2)');
  process.stdout.write(ok ? '[DONE] patch-x02 applied: three-segment CJK sampling\n' : '[ERROR] verification failed\n');
  process.exitCode = ok ? 0 : 1;
}