QLoRA stuff + datasets

This commit is contained in:
2025-12-27 16:38:45 +01:00
parent edafc06cab
commit ef99f152ac
8 changed files with 30253 additions and 0 deletions

36
raft/jsonl_remapper_2.py Normal file
View File

@@ -0,0 +1,36 @@
import argparse
import json
def rewrite_jsonl(input_path, output_path):
    """Convert a JSONL file of ``{input, output}`` records to ``{text}`` records.

    Each non-blank line of ``input_path`` is parsed as JSON and written to
    ``output_path`` as ``{"text": "<user>: <input> <bot>: <output>"}``, one
    JSON object per line. Blank (whitespace-only) lines are skipped, and a
    missing ``input`` or ``output`` key is treated as an empty string.
    Non-ASCII characters are written literally (UTF-8), not ``\\u``-escaped.

    Args:
        input_path: Path to the source JSONL file (read as UTF-8).
        output_path: Path to the destination file (opened in ``"w"`` mode,
            so any existing content is replaced).

    Raises:
        ValueError: If a line is not valid JSON, or is valid JSON but not a
            JSON object. The message contains the 1-based line number.
    """
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue
            # Keep the try body to the single call that can raise
            # JSONDecodeError so unrelated errors are not masked.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e
            # A bare list/string/number parses fine but has no .get();
            # report it with the line number instead of an AttributeError.
            if not isinstance(record, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(record).__name__}"
                )
            user_text = record.get("input", "")
            bot_text = record.get("output", "")
            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory and are passed
    # straight through to rewrite_jsonl.
    cli = argparse.ArgumentParser(
        description="Rewrite JSONL from {input, output} to {text: '<user>: ... <bot>: ...'} format"
    )
    for flag, help_text in (
        ("--input", "Path to input JSONL file"),
        ("--output", "Path to output JSONL file"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()
    rewrite_jsonl(options.input, options.output)