mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-02-04 13:03:12 +01:00
37 lines
1.2 KiB
Python
37 lines
1.2 KiB
Python
import argparse
|
|
import json
|
|
|
|
|
|
def rewrite_jsonl(input_path, output_path):
    """Convert a JSONL file of {input, output} records to {text} records.

    Each non-blank line of ``input_path`` is parsed as a JSON object. Its
    ``"input"`` and ``"output"`` values (each defaulting to ``""`` when the
    key is absent) are merged into a single string and written to
    ``output_path`` as ``{"text": "<user>: ... <bot>: ..."}``, one JSON
    document per line. Blank lines are skipped.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8). It is opened
            in ``"w"`` mode, so any existing content is overwritten; it
            must therefore not be the same file as ``input_path``.

    Raises:
        ValueError: If a line is not valid JSON, or is valid JSON but not
            an object. The message contains the 1-based line number.
    """
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            # Keep the try body minimal: only the parse can raise
            # JSONDecodeError.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e

            # Arrays, strings, numbers, etc. are valid JSON but have no
            # .get(); report them with the line number instead of letting
            # an AttributeError escape.
            if not isinstance(record, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(record).__name__}"
                )

            user_text = record.get("input", "")
            bot_text = record.get("output", "")

            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}

            # ensure_ascii=False keeps non-ASCII characters readable in the
            # output instead of \uXXXX escapes.
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both file paths are mandatory options.
    cli = argparse.ArgumentParser(
        description="Rewrite JSONL from {input, output} to {text: '<user>: ... <bot>: ...'} format",
    )

    # Register the two required path options from a small table so the
    # flag/help pairs sit side by side.
    for flag, help_text in (
        ("--input", "Path to input JSONL file"),
        ("--output", "Path to output JSONL file"),
    ):
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()
    rewrite_jsonl(options.input, options.output)
|