mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 08:22:43 +01:00
61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to create copies of raft_train.jsonl and raft_val.jsonl without the 'meta' key and without 'system' role messages
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def remove_meta_column(input_file, output_file):
    """
    Read a JSONL file and write a new one without the 'meta' key and 'system' role messages.

    Each non-blank input line is parsed as one JSON object. The top-level
    'meta' key is dropped if present, and if the object has a 'messages'
    list, every message whose 'role' is 'system' is filtered out. Blank
    lines are skipped and are not counted. Both files are handled as UTF-8.

    If the input file does not exist, or if input and output resolve to the
    same path, an error is printed and the process exits with status 1.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    # Opening the output in "w" mode truncates it immediately; if it is the
    # same file as the input, all records would be lost before being read.
    if input_path.resolve() == output_path.resolve():
        print(f"Error: Output file '{output_file}' is the same as the input file")
        sys.exit(1)

    count = 0
    # Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():  # Skip empty lines
                continue

            obj = json.loads(line)
            obj.pop("meta", None)  # Remove meta key if it exists

            # Remove system role messages
            if "messages" in obj:
                obj["messages"] = [
                    msg for msg in obj["messages"] if msg.get("role") != "system"
                ]

            outfile.write(json.dumps(obj) + "\n")
            count += 1

    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage:
    #   script.py                      -> process the default train and val files
    #   script.py INPUT                -> process INPUT, write INPUT's stem + "_no_meta"
    #   script.py INPUT OUTPUT         -> process INPUT, write OUTPUT

    # Default paths
    input_train_file = "../data/intermediate/raft_train.jsonl"
    output_train_file = "../data/intermediate/raft_train_no_meta.jsonl"
    input_val_file = "../data/intermediate/raft_val.jsonl"
    output_val_file = "../data/intermediate/raft_val_no_meta.jsonl"

    # Allow command-line overrides. Previously the parsed arguments were
    # assigned to variables that were never used, so they had no effect.
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            # No explicit output: follow the "<name>_no_meta<ext>" convention
            # of the default paths, next to the input file.
            source = Path(input_file)
            output_file = str(source.with_name(f"{source.stem}_no_meta{source.suffix}"))
        jobs = [(input_file, output_file)]
    else:
        jobs = [
            (input_train_file, output_train_file),
            (input_val_file, output_val_file),
        ]

    for job_input, job_output in jobs:
        remove_meta_column(job_input, job_output)
|