mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-05-08 05:25:47 +02:00
QLoRA stuff + datasets
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rewrite chat-style JSONL into {"input": ..., "output": ...} JSONL for LLM tuning.
|
||||
|
||||
Expected input line shape (example):
|
||||
{
|
||||
"messages": [
|
||||
{"role":"system","content":"..."},
|
||||
{"role":"user","content":"..."},
|
||||
{"role":"assistant","content":"..."}
|
||||
],
|
||||
"meta": {...} # optional
|
||||
}
|
||||
|
||||
Output line shape:
|
||||
{"input": "<user text>", "output": "<assistant text>"}
|
||||
|
||||
By default:
|
||||
- Ignores all non-user/assistant roles (e.g., system).
|
||||
- Emits one record per (user -> next assistant) pair in the conversation.
|
||||
- Drops all other fields (including meta) unless --keep-meta is set.
|
||||
|
||||
Usage:
|
||||
python rewrite_jsonl.py in.jsonl out.jsonl
|
||||
cat in.jsonl | python rewrite_jsonl.py - - > out.jsonl
|
||||
python rewrite_jsonl.py in.jsonl out.jsonl --only-last
|
||||
python rewrite_jsonl.py in.jsonl out.jsonl --keep-meta
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def iter_user_assistant_pairs(messages: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
Return list of (user_content, assistant_content) pairs.
|
||||
Pairing rule: whenever a 'user' message is followed later by the next 'assistant'
|
||||
message, emit a pair. Intermediate system/tool messages are ignored.
|
||||
"""
|
||||
pairs: List[Tuple[str, str]] = []
|
||||
pending_user: Optional[str] = None
|
||||
|
||||
for m in messages:
|
||||
role = m.get("role")
|
||||
content = m.get("content")
|
||||
|
||||
if role == "user":
|
||||
# Start (or restart) a pending user turn
|
||||
if isinstance(content, str) and content.strip():
|
||||
pending_user = content
|
||||
else:
|
||||
pending_user = ""
|
||||
elif role == "assistant":
|
||||
if pending_user is not None:
|
||||
assistant_text = content if isinstance(content, str) else ""
|
||||
pairs.append((pending_user, assistant_text))
|
||||
pending_user = None
|
||||
else:
|
||||
# ignore system/tool/developer/etc.
|
||||
continue
|
||||
|
||||
return pairs
|
||||
|
||||
|
||||
def read_lines(path: str) -> List[str]:
|
||||
if path == "-":
|
||||
return sys.stdin.read().splitlines()
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return f.read().splitlines()
|
||||
|
||||
|
||||
def write_lines(path: str, lines: List[str]) -> None:
|
||||
if path == "-":
|
||||
sys.stdout.write("\n".join(lines) + ("\n" if lines else ""))
|
||||
return
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(lines) + ("\n" if lines else ""))
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("infile", help="Input JSONL path, or '-' for stdin")
|
||||
ap.add_argument("outfile", help="Output JSONL path, or '-' for stdout")
|
||||
ap.add_argument(
|
||||
"--only-last",
|
||||
action="store_true",
|
||||
help="Emit only the last (user -> assistant) pair per input line.",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--keep-meta",
|
||||
action="store_true",
|
||||
help="If input line has 'meta', copy it through to output records.",
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
in_lines = read_lines(args.infile)
|
||||
out_lines: List[str] = []
|
||||
|
||||
for idx, line in enumerate(in_lines, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except json.JSONDecodeError as e:
|
||||
sys.stderr.write(f"[line {idx}] JSON decode error: {e}\n")
|
||||
continue
|
||||
|
||||
messages = obj.get("messages")
|
||||
if not isinstance(messages, list):
|
||||
# Not in expected format; skip silently (or log if desired)
|
||||
continue
|
||||
|
||||
pairs = iter_user_assistant_pairs(messages)
|
||||
if not pairs:
|
||||
continue
|
||||
|
||||
if args.only_last:
|
||||
pairs = [pairs[-1]]
|
||||
|
||||
for user_text, assistant_text in pairs:
|
||||
out_obj: Dict[str, Any] = {
|
||||
"input": user_text,
|
||||
"output": assistant_text,
|
||||
}
|
||||
if args.keep_meta and isinstance(obj.get("meta"), dict):
|
||||
out_obj["meta"] = obj["meta"]
|
||||
out_lines.append(json.dumps(out_obj, ensure_ascii=False))
|
||||
|
||||
write_lines(args.outfile, out_lines)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,36 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def rewrite_jsonl(input_path, output_path):
|
||||
with open(input_path, "r", encoding="utf-8") as infile, open(
|
||||
output_path, "w", encoding="utf-8"
|
||||
) as outfile:
|
||||
|
||||
for line_num, line in enumerate(infile, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
record = json.loads(line)
|
||||
user_text = record.get("input", "")
|
||||
bot_text = record.get("output", "")
|
||||
|
||||
new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
|
||||
|
||||
outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON on line {line_num}") from e
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Rewrite JSONL from {input, output} to {text: '<user>: ... <bot>: ...'} format"
|
||||
)
|
||||
parser.add_argument("--input", required=True, help="Path to input JSONL file")
|
||||
parser.add_argument("--output", required=True, help="Path to output JSONL file")
|
||||
|
||||
args = parser.parse_args()
|
||||
rewrite_jsonl(args.input, args.output)
|
||||
@@ -0,0 +1,54 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def rewrite_jsonl(input_path, output_path):
|
||||
with open(input_path, "r", encoding="utf-8") as infile, open(
|
||||
output_path, "w", encoding="utf-8"
|
||||
) as outfile:
|
||||
|
||||
for line_num, line in enumerate(infile, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
record = json.loads(line)
|
||||
messages = record.get("messages", [])
|
||||
|
||||
user_parts = []
|
||||
bot_parts = []
|
||||
|
||||
for msg in messages:
|
||||
role = msg.get("role")
|
||||
content = msg.get("content", "")
|
||||
|
||||
if role == "user":
|
||||
user_parts.append(content)
|
||||
elif role == "assistant":
|
||||
bot_parts.append(content)
|
||||
|
||||
# Skip entries without both sides
|
||||
if not user_parts or not bot_parts:
|
||||
continue
|
||||
|
||||
user_text = " ".join(user_parts)
|
||||
bot_text = " ".join(bot_parts)
|
||||
|
||||
new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
|
||||
|
||||
outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON on line {line_num}") from e
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Rewrite messages-based JSONL to {text: '<user>: ... <bot>: ...'} format"
|
||||
)
|
||||
parser.add_argument("--input", required=True, help="Path to input JSONL file")
|
||||
parser.add_argument("--output", required=True, help="Path to output JSONL file")
|
||||
|
||||
args = parser.parse_args()
|
||||
rewrite_jsonl(args.input, args.output)
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user