mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-02-04 21:13:12 +01:00
QLoRA stuff + datasets
This commit is contained in:
138
raft/jsonl_remapper.py
Normal file
138
raft/jsonl_remapper.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rewrite chat-style JSONL into {"input": ..., "output": ...} JSONL for LLM tuning.
|
||||
|
||||
Expected input line shape (example):
|
||||
{
|
||||
"messages": [
|
||||
{"role":"system","content":"..."},
|
||||
{"role":"user","content":"..."},
|
||||
{"role":"assistant","content":"..."}
|
||||
],
|
||||
"meta": {...} # optional
|
||||
}
|
||||
|
||||
Output line shape:
|
||||
{"input": "<user text>", "output": "<assistant text>"}
|
||||
|
||||
By default:
|
||||
- Ignores all non-user/assistant roles (e.g., system).
|
||||
- Emits one record per (user -> next assistant) pair in the conversation.
|
||||
- Drops all other fields (including meta) unless --keep-meta is set.
|
||||
|
||||
Usage:
|
||||
python jsonl_remapper.py in.jsonl out.jsonl
|
||||
cat in.jsonl | python jsonl_remapper.py - - > out.jsonl
|
||||
python jsonl_remapper.py in.jsonl out.jsonl --only-last
|
||||
python jsonl_remapper.py in.jsonl out.jsonl --keep-meta
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def iter_user_assistant_pairs(messages: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
    """
    Return the (user_content, assistant_content) pairs found in *messages*.

    Pairing rule: every 'user' message is paired with the next 'assistant'
    message that follows it. Messages with any other role (system, tool, ...)
    are ignored and do not break a pending pair.

    Details:
    - A second 'user' message before any 'assistant' reply replaces the
      pending one; only the most recent unanswered user turn is paired.
    - An 'assistant' message with no pending 'user' message is dropped.
    - User content that is not a string, or is empty/whitespace-only, is
      emitted as "". Assistant content that is not a string is emitted as "".
    - Entries of *messages* that are not dicts are skipped instead of raising
      AttributeError, so one malformed entry does not abort the conversion.

    Args:
        messages: The "messages" list of one chat record.

    Returns:
        The pairs in conversation order (possibly empty).
    """
    pairs: List[Tuple[str, str]] = []
    pending_user: Optional[str] = None

    for m in messages:
        if not isinstance(m, dict):
            # Malformed entry (e.g. a bare string or null): nothing to pair.
            continue

        role = m.get("role")
        content = m.get("content")

        if role == "user":
            # Start (or restart) a pending user turn. The original text is
            # kept unstripped; strip() is only used to detect blank content.
            if isinstance(content, str) and content.strip():
                pending_user = content
            else:
                pending_user = ""
        elif role == "assistant":
            # `is not None` (not truthiness): an empty user turn still pairs.
            if pending_user is not None:
                assistant_text = content if isinstance(content, str) else ""
                pairs.append((pending_user, assistant_text))
                pending_user = None
        # Any other role (system/tool/developer/...) is ignored.

    return pairs
|
||||
|
||||
|
||||
def read_lines(path: str) -> List[str]:
    """
    Read all lines of a text source, without their trailing newline.

    Lines are split on real newlines only, by iterating the text stream.
    ``str.splitlines()`` is deliberately not used: it also breaks on
    characters such as U+2028, U+2029, U+0085, form feed and vertical tab,
    which may legally appear unescaped inside a JSON string and would cut a
    single JSONL record into several invalid fragments.

    Args:
        path: File path to read as UTF-8, or "-" to read from stdin.

    Returns:
        The lines in order. Blank lines are kept so that callers can report
        1-based line numbers that match the input.
    """
    if path == "-":
        return [line.rstrip("\n") for line in sys.stdin]
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]
|
||||
|
||||
|
||||
def write_lines(path: str, lines: List[str]) -> None:
    """
    Write *lines* as newline-terminated text.

    Args:
        path: Destination file path (written as UTF-8, replacing any existing
            content), or "-" to write to stdout.
        lines: Lines to emit, without trailing newlines. When empty, nothing
            is written (a destination file is still created/truncated).
    """

    def render() -> str:
        # One "\n" between lines plus a final one, but only if there are lines.
        text = "\n".join(lines)
        if lines:
            text += "\n"
        return text

    if path != "-":
        with open(path, "w", encoding="utf-8") as sink:
            sink.write(render())
        return

    sys.stdout.write(render())
|
||||
|
||||
|
||||
def main() -> int:
    """
    Command-line entry point: convert chat-style JSONL to input/output JSONL.

    Reads all lines of ``infile`` (stdin for "-"), extracts the
    (user -> assistant) pairs from each record's "messages" list, and writes
    one ``{"input": ..., "output": ...}`` JSON object per pair to ``outfile``
    (stdout for "-"). Output is written once, after all input was processed.

    Options:
        --only-last: keep only the last pair of each input record.
        --keep-meta: copy the record's "meta" dict (if it is a dict) into
            every output object produced from that record.

    Unusable lines are skipped rather than aborting the run:
    - blank lines: silently;
    - invalid JSON, or JSON whose top level is not an object: reported on
      stderr with the 1-based input line number;
    - objects without a "messages" list, or yielding no pair: silently.

    Returns:
        0, intended to be used as the process exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("infile", help="Input JSONL path, or '-' for stdin")
    ap.add_argument("outfile", help="Output JSONL path, or '-' for stdout")
    ap.add_argument(
        "--only-last",
        action="store_true",
        help="Emit only the last (user -> assistant) pair per input line.",
    )
    ap.add_argument(
        "--keep-meta",
        action="store_true",
        help="If input line has 'meta', copy it through to output records.",
    )
    args = ap.parse_args()

    in_lines = read_lines(args.infile)
    out_lines: List[str] = []

    for idx, line in enumerate(in_lines, start=1):
        line = line.strip()
        if not line:
            continue

        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            sys.stderr.write(f"[line {idx}] JSON decode error: {e}\n")
            continue

        if not isinstance(obj, dict):
            # Valid JSON but not an object (e.g. a list or a bare string):
            # obj.get() would raise AttributeError, so report and skip.
            sys.stderr.write(
                f"[line {idx}] expected a JSON object, got {type(obj).__name__}\n"
            )
            continue

        messages = obj.get("messages")
        if not isinstance(messages, list):
            # Not in expected format; skip silently (or log if desired)
            continue

        pairs = iter_user_assistant_pairs(messages)
        if not pairs:
            continue

        if args.only_last:
            pairs = [pairs[-1]]

        # Resolve the optional meta payload once per record, not once per pair.
        meta = obj.get("meta")
        copy_meta = args.keep_meta and isinstance(meta, dict)

        for user_text, assistant_text in pairs:
            out_obj: Dict[str, Any] = {
                "input": user_text,
                "output": assistant_text,
            }
            if copy_meta:
                out_obj["meta"] = meta
            out_lines.append(json.dumps(out_obj, ensure_ascii=False))

    write_lines(args.outfile, out_lines)
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
36
raft/jsonl_remapper_2.py
Normal file
36
raft/jsonl_remapper_2.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def rewrite_jsonl(input_path, output_path):
    """
    Rewrite {"input", "output"} JSONL records as single-field {"text"} records.

    Every non-blank line of *input_path* is parsed as a JSON object and one
    line ``{"text": "<user>: <input> <bot>: <output>"}`` is written to
    *output_path*. A missing "input" or "output" key contributes an empty
    string. Blank lines are skipped.

    Both files are handled as UTF-8. *output_path* is opened for writing
    (and truncated) before any input is read, so it may hold partial output
    if an error is raised midway.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write.

    Raises:
        ValueError: If a line is not valid JSON, or is valid JSON but not an
            object. The message contains the 1-based line number.
    """
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            # Keep the try body down to the only call that can raise
            # JSONDecodeError.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e

            if not isinstance(record, dict):
                # e.g. a list or bare string: record.get() would otherwise
                # fail with an AttributeError that hides the line number.
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(record).__name__}"
                )

            user_text = record.get("input", "")
            bot_text = record.get("output", "")

            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
# Script entry point: both paths are mandatory named options.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Rewrite JSONL from {input, output} to {text: '<user>: ... <bot>: ...'} format"
    )
    for flag, direction in (("--input", "input"), ("--output", "output")):
        cli.add_argument(flag, required=True, help=f"Path to {direction} JSONL file")

    options = cli.parse_args()
    rewrite_jsonl(options.input, options.output)
|
||||
54
raft/jsonl_remapper_3.py
Normal file
54
raft/jsonl_remapper_3.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import argparse
|
||||
import json
|
||||
|
||||
|
||||
def rewrite_jsonl(input_path, output_path):
    """
    Rewrite messages-based JSONL records as single-field {"text"} records.

    Every non-blank line of *input_path* is parsed as a JSON object with a
    "messages" list. The contents of all 'user' messages are joined with a
    single space, likewise for all 'assistant' messages (regardless of how
    the turns interleave), and one line
    ``{"text": "<user>: <user text> <bot>: <assistant text>"}`` is written to
    *output_path*. Messages with any other role are ignored.

    Skipped without error:
    - blank lines;
    - records whose "messages" is missing or not a list;
    - individual messages that are not dicts or whose "content" is not a
      string (e.g. ``"content": null``), which would otherwise make the join
      raise TypeError;
    - records left without at least one user part and one assistant part.

    Both files are handled as UTF-8. *output_path* is opened for writing
    (and truncated) before any input is read, so it may hold partial output
    if an error is raised midway.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write.

    Raises:
        ValueError: If a line is not valid JSON, or is valid JSON but not an
            object. The message contains the 1-based line number.
    """
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            # Keep the try body down to the only call that can raise
            # JSONDecodeError.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e

            if not isinstance(record, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(record).__name__}"
                )

            messages = record.get("messages")
            if not isinstance(messages, list):
                # Missing or null "messages": same as having neither side.
                continue

            user_parts = []
            bot_parts = []

            for msg in messages:
                if not isinstance(msg, dict):
                    continue

                role = msg.get("role")
                # A missing "content" key counts as "" (as before); an
                # explicit non-string value is dropped.
                content = msg.get("content", "")
                if not isinstance(content, str):
                    continue

                if role == "user":
                    user_parts.append(content)
                elif role == "assistant":
                    bot_parts.append(content)

            # Skip entries without both sides
            if not user_parts or not bot_parts:
                continue

            user_text = " ".join(user_parts)
            bot_text = " ".join(bot_parts)

            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
# Script entry point: both paths are mandatory named options.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Rewrite messages-based JSONL to {text: '<user>: ... <bot>: ...'} format"
    )
    for flag, direction in (("--input", "input"), ("--output", "output")):
        cli.add_argument(flag, required=True, help=f"Path to {direction} JSONL file")

    options = cli.parse_args()
    rewrite_jsonl(options.input, options.output)
|
||||
9901
raft/remap2_bali.jsonl
Normal file
9901
raft/remap2_bali.jsonl
Normal file
File diff suppressed because one or more lines are too long
9902
raft/remap3_bali.jsonl
Normal file
9902
raft/remap3_bali.jsonl
Normal file
File diff suppressed because one or more lines are too long
9901
raft/remap_bali_raft_dataset.jsonl
Normal file
9901
raft/remap_bali_raft_dataset.jsonl
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user