110 lines
3.2 KiB
JavaScript
110 lines
3.2 KiB
JavaScript
|
|
#!/usr/bin/env node
|
|||
|
|
/**
 * patch-p1-2-evolution-log-relinify.js
 *
 * Repairs the 4 historical spots in evolution-log.jsonl where JSON objects were
 * concatenated without a separating \n, so the file becomes valid JSONL again
 * and the baseline can be generated successfully.
 *
 * Algorithm: streaming JSON token balance counting — a newline is inserted after
 * each complete JSON object closes.
 *
 * Protocol: .bak backup + atomic write
 */
|
|||
|
|
|
|||
|
|
'use strict';
|
|||
|
|
const fs = require('fs');
|
|||
|
|
const path = require('path');
|
|||
|
|
|
|||
|
|
// Path of the JSONL file to repair: evolution-log.jsonl, two directory levels above this script's directory.
const TARGET = path.join(__dirname, '..', '..', 'evolution-log.jsonl');
|
|||
|
|
|
|||
|
|
/**
 * Insert a newline between JSON objects that were written back-to-back.
 *
 * Scans the text one character at a time while tracking `{` / `}` nesting
 * depth. Braces inside string literals (including after backslash escapes)
 * are not counted. Whenever an object closes at depth 0 and the next character
 * that is not a space or tab is `{`, a `\n` is inserted directly after the
 * closing `}`. Any spaces/tabs between the two objects are kept and end up at
 * the start of the new line.
 *
 * A `}` seen while the depth is already 0 (unbalanced input) does not push the
 * depth negative; it is treated as a boundary too, so the scanner can still
 * split the objects that follow it.
 *
 * @param {string} text - Raw log content.
 * @returns {string} The same content with `\n` inserted between concatenated objects.
 */
function relinify(text) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  let out = '';
  // Start of the not-yet-emitted tail of `text`.
  let segmentStart = 0;

  for (let i = 0; i < text.length; i++) {
    const ch = text[i];

    // The character right after a backslash inside a string is always literal.
    if (escaped) {
      escaped = false;
      continue;
    }

    if (inString) {
      if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }

    if (ch === '"') {
      inString = true;
      continue;
    }

    if (ch === '{') {
      depth++;
      continue;
    }

    if (ch !== '}') continue;

    // Clamp at zero: a stray `}` must not make the depth negative, otherwise
    // no later closing brace would ever be seen as a top-level boundary.
    if (depth > 0) depth--;
    if (depth !== 0) continue;

    // Top-level object closed: look past spaces/tabs for a directly following `{`.
    let j = i + 1;
    while (j < text.length && (text[j] === ' ' || text[j] === '\t')) j++;

    // If the next character is already a newline (or anything else), leave it alone.
    if (text[j] === '{') {
      out += text.slice(segmentStart, i + 1) + '\n';
      segmentStart = i + 1;
    }
  }

  return out + text.slice(segmentStart);
}
|
|||
|
|
|
|||
|
|
/**
 * Repair TARGET in place.
 *
 * Reads the file, normalizes line endings to LF, splits concatenated objects
 * with relinify(), verifies that every non-empty line parses as JSON, writes a
 * timestamped backup copy, and finally replaces the file via a temp file plus
 * rename.
 *
 * Exits with code 1 when TARGET does not exist or when any line still fails to
 * parse after the repair; exits with code 0 without touching anything when the
 * repaired text equals the file content.
 */
function main() {
  if (!fs.existsSync(TARGET)) {
    process.stderr.write('[ERROR] evolution-log.jsonl not found\n');
    process.exit(1);
  }

  const original = fs.readFileSync(TARGET, 'utf8');
  // Step 1: normalize CRLF and lone CR to LF
  const lfNormalized = original.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  // Step 2: relinify (split concatenated `}{`)
  const fixed = relinify(lfNormalized);

  if (fixed === original) {
    process.stdout.write('[SKIP] no concatenated JSON found\n');
    process.exit(0);
  }

  // Verify that every non-empty line of the repaired text is valid JSON.
  const lines = fixed.split('\n');
  let okLines = 0;
  const badLines = [];
  for (let i = 0; i < lines.length; i++) {
    if (!lines[i]) continue;
    try {
      JSON.parse(lines[i]);
      okLines++;
    } catch (_) {
      // Record the 1-based line number; reported below before aborting.
      badLines.push(i + 1);
    }
  }
  if (badLines.length > 0) {
    process.stderr.write(`[ERROR] after relinify still has bad lines: ${JSON.stringify(badLines.slice(0, 5))}\n`);
    process.exit(1);
  }

  // Backup (timestamp with ':' and '.' replaced so it is safe in a file name).
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const bakPath = TARGET + '.bak.relinify.' + ts;
  fs.copyFileSync(TARGET, bakPath);
  process.stdout.write(`[BACKUP] ${bakPath}\n`);

  // Atomic write: write a sibling temp file, then rename it over TARGET.
  const tmpPath = TARGET + '.tmp.' + process.pid;
  try {
    fs.writeFileSync(tmpPath, fixed);
    fs.renameSync(tmpPath, TARGET);
  } catch (err) {
    // Do not leave a stray temp file behind when the write or rename fails.
    try {
      fs.unlinkSync(tmpPath);
    } catch (_) {
      // Best effort: the temp file may never have been created.
    }
    throw err;
  }

  // Count against the LF-normalized text so CR / CRLF line endings in the
  // original cannot skew the "before" count and the reported delta.
  const oldLines = lfNormalized.split('\n').filter(Boolean).length;
  const newLines = okLines;
  process.stdout.write('[OK] evolution-log.jsonl relinified\n');
  process.stdout.write(` 原行数: ${oldLines} → 新行数: ${newLines} (+${newLines - oldLines} 拆开)\n`);
}
|
|||
|
|
|
|||
|
|
// Run main() only when this file is executed directly rather than loaded via require().
if (require.main === module) main();
|