#!/usr/bin/env python3
"""
Script to create cleaned copies of the RAFT JSONL datasets.

For every record the top-level 'meta' key is dropped and any message whose
role is 'system' is removed from the 'messages' list.

Usage:
    script.py                      # process the default train and val files
    script.py INPUT                # write INPUT's sibling '<stem>_no_meta<suffix>'
    script.py INPUT OUTPUT         # process one explicit input/output pair
"""

import json
import sys
from pathlib import Path

# (input, output) pairs processed when no command-line arguments are given.
DEFAULT_JOBS = (
    (
        "../data/intermediate/raft_train.jsonl",
        "../data/intermediate/raft_train_no_meta.jsonl",
    ),
    (
        "../data/intermediate/raft_val.jsonl",
        "../data/intermediate/raft_val_no_meta.jsonl",
    ),
)


def remove_meta_column(input_file, output_file):
    """
    Read a JSONL file and write a new one without the 'meta' key and 'system' role messages.

    Blank lines in the input are skipped. If the input file does not exist, or
    if input and output resolve to the same file, an error is printed to stderr
    and the process exits with status 1.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file

    Returns:
        The number of records written to the output file.
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found", file=sys.stderr)
        sys.exit(1)

    # Opening the output in "w" mode truncates it immediately; if it is the
    # same file as the input, every record would be lost before being read.
    if input_path.resolve() == output_path.resolve():
        print(
            f"Error: Output file '{output_file}' is the same as the input file",
            file=sys.stderr,
        )
        sys.exit(1)

    count = 0
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's locale encoding.
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            if not line.strip():  # Skip empty lines
                continue

            obj = json.loads(line)
            obj.pop("meta", None)  # Remove meta key if it exists

            # Remove system role messages
            if "messages" in obj:
                obj["messages"] = [
                    msg for msg in obj["messages"] if msg.get("role") != "system"
                ]

            outfile.write(json.dumps(obj) + "\n")
            count += 1

    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
    return count


def _default_output_path(input_file):
    """Return the sibling path '<stem>_no_meta<suffix>' for *input_file*."""
    src = Path(input_file)
    return src.with_name(f"{src.stem}_no_meta{src.suffix}")


def main(argv=None):
    """
    Command-line entry point.

    Args:
        argv: Argument list excluding the program name; defaults to
            sys.argv[1:]. With no arguments the default train and val files
            are processed. The first argument overrides the input path and
            the optional second argument overrides the output path; any
            further arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args:
        jobs = list(DEFAULT_JOBS)
    else:
        input_file = args[0]
        output_file = args[1] if len(args) > 1 else _default_output_path(input_file)
        jobs = [(input_file, output_file)]

    for input_file, output_file in jobs:
        remove_meta_column(input_file, output_file)


if __name__ == "__main__":
    main()