import argparse
import json
import os


def _same_file(path_a, path_b):
    """Return True when both paths resolve to the same filesystem location."""
    try:
        return os.path.realpath(path_a) == os.path.realpath(path_b)
    except TypeError:
        # Non-path arguments (e.g. integer file descriptors) cannot be
        # resolved; let open() deal with them.
        return False


def _build_text(record):
    """Collapse one parsed record into its output text.

    Contents of "user" messages and of "assistant" messages are each joined
    with a single space; messages with any other role are ignored. A missing
    or null ``content`` counts as the empty string.

    Returns:
        The formatted text, or ``None`` when the record lacks either a user
        or an assistant message and should be skipped.

    Raises:
        ValueError: If the record does not have the expected structure.
    """
    if not isinstance(record, dict):
        raise ValueError("record is not a JSON object")

    messages = record.get("messages")
    if messages is None:
        # A missing or null "messages" is treated as an empty conversation.
        messages = []
    if not isinstance(messages, list):
        raise ValueError("'messages' is not a list")

    user_parts = []
    bot_parts = []
    for msg in messages:
        if not isinstance(msg, dict):
            raise ValueError("message is not a JSON object")

        role = msg.get("role")
        if role == "user":
            parts = user_parts
        elif role == "assistant":
            parts = bot_parts
        else:
            continue

        content = msg.get("content")
        if content is None:
            # `"content": null` would otherwise break the str.join below.
            content = ""
        if not isinstance(content, str):
            raise ValueError(f"content of a {role!r} message is not a string")
        parts.append(content)

    # Skip entries without both sides
    if not user_parts or not bot_parts:
        return None

    user_text = " ".join(user_parts)
    bot_text = " ".join(bot_parts)
    return f": {user_text} : {bot_text}"


def rewrite_jsonl(input_path, output_path):
    """Rewrite a messages-based JSONL file into single-field ``text`` records.

    Each non-blank input line is parsed as JSON. For every record that has at
    least one "user" and one "assistant" message, one line of the form
    ``{"text": ": <user text> : <assistant text>"}`` is written to the output.
    Blank lines and records missing either side are skipped.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        ValueError: If both paths refer to the same file, if a line is not
            valid JSON, or if a record is structurally malformed. Messages
            for bad lines include the 1-based line number.
    """
    # Opening the output in "w" mode truncates it immediately, which would
    # wipe the input before a single line is read.
    if _same_file(input_path, output_path):
        raise ValueError("Input and output paths must refer to different files")

    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}") from e

            try:
                text = _build_text(record)
            except ValueError as e:
                raise ValueError(
                    f"Malformed record on line {line_num}: {e}"
                ) from e

            if text is None:
                continue

            new_record = {"text": text}
            outfile.write(json.dumps(new_record, ensure_ascii=False) + "\n")


def main():
    """Parse command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser(
        description="Rewrite messages-based JSONL to {text: ': ... : ...'} format"
    )
    parser.add_argument("--input", required=True, help="Path to input JSONL file")
    parser.add_argument("--output", required=True, help="Path to output JSONL file")
    args = parser.parse_args()
    rewrite_jsonl(args.input, args.output)


if __name__ == "__main__":
    main()