QLoRA stuff + datasets

2026-02-04 21:13:12 +01:00 · 2025-12-27 16:38:45 +01:00
parent edafc06cab
commit ef99f152ac
8 changed files with 30253 additions and 0 deletions
--- a/raft/jsonl_remapper.py
+++ b/raft/jsonl_remapper.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Rewrite chat-style JSONL into {"input": ..., "output": ...} JSONL for LLM tuning.
+
+Expected input line shape (example):
+{
+  "messages": [
+    {"role":"system","content":"..."},
+    {"role":"user","content":"..."},
+    {"role":"assistant","content":"..."}
+  ],
+  "meta": {...}   # optional
+}
+
+Output line shape:
+{"input": "<user text>", "output": "<assistant text>"}
+
+By default:
+- Ignores all non-user/assistant roles (e.g., system).
+- Emits one record per (user -> next assistant) pair in the conversation.
+- Drops all other fields (including meta) unless --keep-meta is set.
+
+Usage:
+  python jsonl_remapper.py in.jsonl out.jsonl
+  cat in.jsonl | python jsonl_remapper.py - - > out.jsonl
+  python jsonl_remapper.py in.jsonl out.jsonl --only-last
+  python jsonl_remapper.py in.jsonl out.jsonl --keep-meta
+"""
+
+import argparse
+import json
+import sys
+from typing import Any, Dict, List, Optional, Tuple
+
+
+def iter_user_assistant_pairs(messages: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
+    """
+    Return the (user_content, assistant_content) pairs found in ``messages``.
+
+    Pairing rule: each 'user' message is paired with the next 'assistant'
+    message that follows it. Messages with any other role (system, tool,
+    developer, ...) are ignored, and so are entries that are not dicts.
+
+    Details:
+    - If several 'user' messages occur before an 'assistant' message, only the
+      most recent one is paired.
+    - An 'assistant' message with no pending 'user' message is dropped.
+    - A trailing 'user' message without a reply produces no pair.
+    - User content that is not a non-blank string, and assistant content that
+      is not a string, is replaced by "".
+    """
+    pairs: List[Tuple[str, str]] = []
+    pending_user: Optional[str] = None
+
+    for m in messages:
+        # A malformed entry (e.g. a bare string or null) has no role; skip it
+        # rather than raising AttributeError on ``.get``.
+        if not isinstance(m, dict):
+            continue
+
+        role = m.get("role")
+        content = m.get("content")
+
+        if role == "user":
+            # Start (or restart) a pending user turn
+            if isinstance(content, str) and content.strip():
+                pending_user = content
+            else:
+                pending_user = ""
+        elif role == "assistant":
+            if pending_user is not None:
+                assistant_text = content if isinstance(content, str) else ""
+                pairs.append((pending_user, assistant_text))
+                # Consume the user turn so a second assistant reply is not
+                # paired with the same prompt.
+                pending_user = None
+        # Any other role (system/tool/developer/etc.) is ignored.
+
+    return pairs
+
+
+def read_lines(path: str) -> List[str]:
+    """
+    Read all of ``path`` ('-' means stdin) and return its lines without newlines.
+
+    Lines are split on "\\n" only (a trailing "\\r" is removed as well).
+    ``str.splitlines()`` is deliberately not used: it also breaks on characters
+    such as U+2028, U+2029 and U+0085, which may appear unescaped inside JSON
+    strings and would tear a single JSONL record into invalid fragments.
+    """
+    if path == "-":
+        text = sys.stdin.read()
+    else:
+        with open(path, "r", encoding="utf-8") as f:
+            text = f.read()
+
+    lines = text.split("\n")
+    # A final newline yields one empty trailing element; drop it so that
+    # "a\nb\n" and "a\nb" both give ["a", "b"] and "" gives [].
+    if lines and lines[-1] == "":
+        lines.pop()
+    # stdin may not translate CRLF to LF, so strip a leftover "\r" here.
+    return [ln[:-1] if ln.endswith("\r") else ln for ln in lines]
+
+
+def write_lines(path: str, lines: List[str]) -> None:
+    """
+    Write ``lines`` to ``path`` ('-' means stdout), each terminated by a newline.
+
+    An empty list produces empty output.
+    """
+
+    def render() -> str:
+        # Terminating every entry is the same as joining with "\n" and
+        # appending one final "\n" when there is at least one entry.
+        return "".join(entry + "\n" for entry in lines)
+
+    if path != "-":
+        with open(path, "w", encoding="utf-8") as sink:
+            sink.write(render())
+        return
+    sys.stdout.write(render())
+
+
+def main() -> int:
+    """
+    Command-line entry point: convert chat-style JSONL to input/output JSONL.
+
+    Reads every line of ``infile``, extracts the (user -> assistant) pairs from
+    its "messages" list and writes one {"input", "output"} record per pair to
+    ``outfile``.
+
+    Skipped silently: blank lines, lines that are valid JSON but not an object,
+    objects whose "messages" is not a list, and conversations with no pairs.
+    Lines that fail to parse as JSON are reported on stderr and skipped.
+
+    Returns:
+        0 always (used as the process exit status).
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("infile", help="Input JSONL path, or '-' for stdin")
+    ap.add_argument("outfile", help="Output JSONL path, or '-' for stdout")
+    ap.add_argument(
+        "--only-last",
+        action="store_true",
+        help="Emit only the last (user -> assistant) pair per input line.",
+    )
+    ap.add_argument(
+        "--keep-meta",
+        action="store_true",
+        help="If input line has 'meta', copy it through to output records.",
+    )
+    args = ap.parse_args()
+
+    in_lines = read_lines(args.infile)
+    out_lines: List[str] = []
+
+    # idx is 1-based so error messages match editor line numbers.
+    for idx, line in enumerate(in_lines, start=1):
+        line = line.strip()
+        if not line:
+            continue
+
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError as e:
+            sys.stderr.write(f"[line {idx}] JSON decode error: {e}\n")
+            continue
+
+        # A line can be valid JSON without being an object (e.g. a list or a
+        # number). Such values have no .get(), so treat them like any other
+        # record that is not in the expected format and skip them.
+        messages = obj.get("messages") if isinstance(obj, dict) else None
+        if not isinstance(messages, list):
+            continue
+
+        pairs = iter_user_assistant_pairs(messages)
+        if not pairs:
+            continue
+
+        if args.only_last:
+            pairs = [pairs[-1]]
+
+        for user_text, assistant_text in pairs:
+            out_obj: Dict[str, Any] = {
+                "input": user_text,
+                "output": assistant_text,
+            }
+            # The same meta dict is attached to every pair from this line.
+            if args.keep_meta and isinstance(obj.get("meta"), dict):
+                out_obj["meta"] = obj["meta"]
+            out_lines.append(json.dumps(out_obj, ensure_ascii=False))
+
+    write_lines(args.outfile, out_lines)
+    return 0
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    sys.exit(main())
--- a/raft/jsonl_remapper_2.py
+++ b/raft/jsonl_remapper_2.py
@@ -0,0 +1,36 @@
+import argparse
+import json
+
+
+def rewrite_jsonl(input_path, output_path):
+    """
+    Convert {"input", "output"} JSONL records into {"text"} JSONL records.
+
+    Each non-blank line of ``input_path`` must be a JSON object. It is written
+    to ``output_path`` as {"text": "<user>: <input> <bot>: <output>"}. A
+    missing or null "input"/"output" is rendered as an empty string.
+
+    Args:
+        input_path: Path of the JSONL file to read (UTF-8).
+        output_path: Path of the JSONL file to write (UTF-8, overwritten).
+
+    Raises:
+        ValueError: If a non-blank line is not valid JSON, or is valid JSON
+            but not an object. The message contains the 1-based line number.
+    """
+    with open(input_path, "r", encoding="utf-8") as infile, open(
+        output_path, "w", encoding="utf-8"
+    ) as outfile:
+
+        for line_num, line in enumerate(infile, start=1):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Only the parse itself can raise JSONDecodeError, so nothing else
+            # belongs inside the try.
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON on line {line_num}") from e
+
+            if not isinstance(record, dict):
+                raise ValueError(
+                    f"Expected a JSON object on line {line_num}, "
+                    f"got {type(record).__name__}"
+                )
+
+            # A JSON null would otherwise be formatted as the text "None".
+            user_text = record.get("input")
+            if user_text is None:
+                user_text = ""
+            bot_text = record.get("output")
+            if bot_text is None:
+                bot_text = ""
+
+            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
+
+            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
+
+
+if __name__ == "__main__":
+    # Script entry point: both file paths are mandatory named options.
+    parser = argparse.ArgumentParser(
+        description="Rewrite JSONL from {input, output} to {text: '<user>: ... <bot>: ...'} format"
+    )
+    for option, help_text in (
+        ("--input", "Path to input JSONL file"),
+        ("--output", "Path to output JSONL file"),
+    ):
+        parser.add_argument(option, required=True, help=help_text)
+
+    args = parser.parse_args()
+    rewrite_jsonl(input_path=args.input, output_path=args.output)
--- a/raft/jsonl_remapper_3.py
+++ b/raft/jsonl_remapper_3.py
@@ -0,0 +1,54 @@
+import argparse
+import json
+
+
+def rewrite_jsonl(input_path, output_path):
+    """
+    Flatten chat-style JSONL ({"messages": [...]}) into {"text"} JSONL records.
+
+    For each non-blank input line, the string contents of all 'user' messages
+    are joined with single spaces, likewise for all 'assistant' messages, and
+    one record {"text": "<user>: <user text> <bot>: <assistant text>"} is
+    written. Messages with any other role are ignored.
+
+    A line produces no output when it is not a JSON object, when "messages" is
+    not a list, or when it lacks either a user or an assistant part. Messages
+    that are not objects, and messages whose "content" is present but not a
+    string (e.g. null), contribute nothing; a missing "content" counts as "".
+
+    Args:
+        input_path: Path of the JSONL file to read (UTF-8).
+        output_path: Path of the JSONL file to write (UTF-8, overwritten).
+
+    Raises:
+        ValueError: If a non-blank line is not valid JSON. The message
+            contains the 1-based line number.
+    """
+    with open(input_path, "r", encoding="utf-8") as infile, open(
+        output_path, "w", encoding="utf-8"
+    ) as outfile:
+
+        for line_num, line in enumerate(infile, start=1):
+            line = line.strip()
+            if not line:
+                continue
+
+            # Only the parse itself can raise JSONDecodeError, so nothing else
+            # belongs inside the try.
+            try:
+                record = json.loads(line)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON on line {line_num}") from e
+
+            messages = record.get("messages", []) if isinstance(record, dict) else None
+            if not isinstance(messages, list):
+                # Same outcome as an entry without both sides: nothing usable.
+                continue
+
+            user_parts = []
+            bot_parts = []
+
+            for msg in messages:
+                if not isinstance(msg, dict):
+                    continue
+
+                content = msg.get("content", "")
+                # str.join() rejects non-strings, so null or structured
+                # content cannot take part in the flattened text.
+                if not isinstance(content, str):
+                    continue
+
+                role = msg.get("role")
+                if role == "user":
+                    user_parts.append(content)
+                elif role == "assistant":
+                    bot_parts.append(content)
+
+            # Skip entries without both sides
+            if not user_parts or not bot_parts:
+                continue
+
+            user_text = " ".join(user_parts)
+            bot_text = " ".join(bot_parts)
+
+            new_record = {"text": f"<user>: {user_text} <bot>: {bot_text}"}
+
+            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")
+
+
+if __name__ == "__main__":
+    # Script entry point: --input and --output are both required.
+    parser = argparse.ArgumentParser(
+        description="Rewrite messages-based JSONL to {text: '<user>: ... <bot>: ...'} format"
+    )
+    for option, help_text in (
+        ("--input", "Path to input JSONL file"),
+        ("--output", "Path to output JSONL file"),
+    ):
+        parser.add_argument(option, required=True, help=help_text)
+
+    args = parser.parse_args()
+    rewrite_jsonl(input_path=args.input, output_path=args.output)
--- a/raft/remap2_bali.jsonl
+++ b/raft/remap2_bali.jsonl
--- a/raft/remap3_bali.jsonl
+++ b/raft/remap3_bali.jsonl
--- a/raft/remap_bali_raft_dataset.jsonl
+++ b/raft/remap_bali_raft_dataset.jsonl