bookworm-smart-assistant/scripts/patches/patch-p1-2-evolution-log-relinify.js

110 lines
3.2 KiB
JavaScript
Raw Permalink Normal View History

#!/usr/bin/env node
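/**
* patch-p1-2-evolution-log-relinify.js
*
* Repairs the 4 historical places in evolution-log.jsonl where JSON objects were
* concatenated without a \n separator, so that the file becomes valid JSONL
* again and the baseline can be generated successfully.
*
* Algorithm: streaming scan with balanced brace counting; a newline is inserted
* after each complete JSON object that is directly followed by another object.
*
* Protocol: .bak backup + atomic write
*/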
'use strict';
const fs = require('fs');
const path = require('path');
// Path of the log file to repair: evolution-log.jsonl, located two directory
// levels above the directory that contains this script.
const TARGET = path.join(__dirname, '..', '..', 'evolution-log.jsonl');
/**
 * Inserts a newline between JSON objects that were concatenated on one line.
 *
 * The text is scanned character by character while tracking brace nesting
 * depth. Braces inside JSON string literals (including after backslash
 * escapes) are not counted. Whenever a `}` brings the depth back to zero and
 * the next character after optional spaces/tabs is `{`, a `\n` is inserted
 * right after that `}`. Existing newlines are left alone, so already
 * well-formed input is returned unchanged.
 *
 * An unmatched `}` seen at depth zero is copied through but does not change
 * the depth; otherwise the depth would go negative and no later object
 * boundary would ever be detected.
 *
 * @param {string} text - Raw log content.
 * @returns {string} The same content with `\n` inserted at `}{` boundaries.
 */
function relinify(text) {
  const pieces = [];
  let pieceStart = 0;
  let depth = 0;
  let inString = false;
  let escape = false;

  for (let i = 0; i < text.length; i++) {
    const ch = text[i];

    // The character following a backslash inside a string is always literal.
    if (escape) {
      escape = false;
      continue;
    }
    if (inString) {
      if (ch === '\\') {
        escape = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }
    if (ch === '{') {
      depth++;
      continue;
    }
    if (ch !== '}') continue;

    // Stray closing brace: ignore it so the depth counter stays usable.
    if (depth === 0) continue;

    depth--;
    if (depth !== 0) continue;

    // A top-level object just closed; look past spaces/tabs for a directly
    // following object, which means the two were concatenated.
    let j = i + 1;
    while (j < text.length && (text[j] === ' ' || text[j] === '\t')) j++;
    if (j < text.length && text[j] === '{') {
      pieces.push(text.slice(pieceStart, i + 1), '\n');
      pieceStart = i + 1;
    }
    // If the next character is already a newline, nothing is inserted.
  }

  pieces.push(text.slice(pieceStart));
  return pieces.join('');
}
/**
 * Command-line entry point: repairs the file at TARGET in place.
 *
 * Steps:
 *   1. Exit with status 1 if the file does not exist.
 *   2. Normalize CRLF / lone CR to LF, then run relinify() on the text.
 *   3. Exit with status 0 if the result is identical to the file content.
 *   4. Verify that every non-empty line parses as JSON; on failure report up
 *      to five offending line numbers and exit with status 1 without writing.
 *   5. Copy the original to a timestamped .bak file, write the result to a
 *      temporary file and rename it over the target.
 *   6. Print the line counts before and after.
 */
function main() {
  if (!fs.existsSync(TARGET)) {
    process.stderr.write('[ERROR] evolution-log.jsonl not found\n');
    process.exit(1);
  }

  const original = fs.readFileSync(TARGET, 'utf8');
  // Step 1: normalize CRLF and lone CR line endings to LF.
  const lfNormalized = original.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  // Step 2: put concatenated objects ("}{") on separate lines.
  const fixed = relinify(lfNormalized);
  if (fixed === original) {
    process.stdout.write('[SKIP] no concatenated JSON found\n');
    process.exit(0);
  }

  // Verify that every non-empty line of the result is valid JSON before
  // touching the file on disk.
  const lines = fixed.split('\n');
  let okLines = 0;
  const badLines = [];
  lines.forEach((line, idx) => {
    if (!line) return;
    try {
      JSON.parse(line);
      okLines++;
    } catch (_) {
      // Record the 1-based line number; reported below.
      badLines.push(idx + 1);
    }
  });
  if (badLines.length > 0) {
    process.stderr.write('[ERROR] after relinify still has bad lines: ' + JSON.stringify(badLines.slice(0, 5)) + '\n');
    process.exit(1);
  }

  // Backup: ':' and '.' in the ISO timestamp are replaced with '-'.
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const bakPath = TARGET + '.bak.relinify.' + ts;
  fs.copyFileSync(TARGET, bakPath);
  process.stdout.write('[BACKUP] ' + bakPath + '\n');

  // Atomic write: write a sibling temp file, then rename it over the target.
  const tmpPath = TARGET + '.tmp.' + process.pid;
  try {
    fs.writeFileSync(tmpPath, fixed);
    fs.renameSync(tmpPath, TARGET);
  } catch (err) {
    // Do not leave a half-written temp file behind; the original failure is
    // what matters, so a cleanup error is deliberately ignored.
    try { fs.unlinkSync(tmpPath); } catch (_) { /* best-effort cleanup */ }
    throw err;
  }

  // Count the old lines on the LF-normalized text so that CRLF / CR-only
  // line endings are not mistaken for content or for missing line breaks.
  const oldLines = lfNormalized.split('\n').filter(Boolean).length;
  const newLines = okLines;
  process.stdout.write('[OK] evolution-log.jsonl relinified\n');
  process.stdout.write(' 原行数: ' + oldLines + ' → 新行数: ' + newLines + ' (+' + (newLines - oldLines) + ' 拆开)\n');
}
// Run the patch only when this file is executed directly, not when it is
// loaded from another module via require().
if (require.main === module) {
  main();
}