mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-02-04 05:03:11 +01:00
QLoRA stuff + datasets
This commit is contained in:
54
raft/jsonl_remapper_3.py
Normal file
54
raft/jsonl_remapper_3.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def rewrite_jsonl(input_path, output_path):
    """Convert a chat-style JSONL file into single-field ``text`` records.

    Each non-blank input line must be a JSON object. Its ``messages`` list is
    scanned for entries whose ``role`` is ``"user"`` or ``"assistant"``; the
    ``content`` strings of each role are joined with single spaces and written
    as one output line of the form
    ``{"text": "<user>: <user text> <bot>: <assistant text>"}``.

    Records lacking either a user or an assistant part are skipped, as are
    records whose ``messages`` value is missing or not a list. Message entries
    that are not objects, carry another role, or have ``content`` set to
    ``null`` contribute nothing.

    Args:
        input_path: Path of the JSONL file to read (decoded as UTF-8).
        output_path: Path of the JSONL file to write (UTF-8). It is opened
            in ``"w"`` mode, so any existing content is replaced.

    Raises:
        ValueError: If a line is not valid JSON, is not a JSON object, or
            holds a user/assistant message whose content is neither a string
            nor ``null``. The message names the offending line number.
    """
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            # Only the parse itself can raise JSONDecodeError, so keep the
            # try body to that single call.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e

            if not isinstance(record, dict):
                raise ValueError(f"Expected a JSON object on line {line_num}")

            # A missing key and an explicit null are treated alike: there is
            # nothing to convert, so the record is skipped.
            messages = record.get("messages")
            if not isinstance(messages, list):
                continue

            user_parts = []
            bot_parts = []

            for msg in messages:
                if not isinstance(msg, dict):
                    continue

                role = msg.get("role")
                if role not in ("user", "assistant"):
                    continue

                content = msg.get("content", "")
                if content is None:
                    # A null content has no text to contribute.
                    continue
                if not isinstance(content, str):
                    raise ValueError(
                        f"Non-string message content on line {line_num}"
                    )

                if role == "user":
                    user_parts.append(content)
                else:
                    bot_parts.append(content)

            # Skip entries without both sides
            if not user_parts or not bot_parts:
                continue

            user_text = " ".join(user_parts)
            bot_text = " ".join(bot_parts)

            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: both file paths are mandatory options.
    cli = argparse.ArgumentParser(
        description="Rewrite messages-based JSONL to {text: '<user>: ... <bot>: ...'} format"
    )
    for flag, help_text in (
        ("--input", "Path to input JSONL file"),
        ("--output", "Path to output JSONL file"),
    ):
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()
    rewrite_jsonl(options.input, options.output)
|
||||
Reference in New Issue
Block a user