mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
RAFT shenanigans
This commit is contained in:
60
raft/remove_meta.py
Normal file
60
raft/remove_meta.py
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to create copies of the RAFT train/val JSONL files without the 'meta' key and without 'system' role messages
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def remove_meta_column(input_file, output_file):
    """
    Copy a JSONL file, dropping the 'meta' key and all 'system' role messages.

    Each non-blank input line is parsed as one JSON object. The top-level
    'meta' key is removed if present, and if the object has a 'messages'
    list, every entry whose 'role' is 'system' is filtered out. The result
    is written as one JSON object per line. Blank lines are skipped and
    are not counted.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file

    Raises:
        SystemExit: With exit code 1 if the input file does not exist or
            if input and output refer to the same file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    # Opening the output in "w" mode truncates it immediately; if it is the
    # same file as the input, all records would be lost before being read.
    if input_path.resolve() == output_path.resolve():
        print(f"Error: Output file '{output_file}' must differ from the input file")
        sys.exit(1)

    count = 0
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(input_path, "r", encoding="utf-8") as infile, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            if not line.strip():  # Skip empty lines
                continue

            obj = json.loads(line)
            obj.pop("meta", None)  # Remove meta key if it exists

            # Remove system role messages
            if "messages" in obj:
                obj["messages"] = [
                    msg for msg in obj["messages"] if msg.get("role") != "system"
                ]

            outfile.write(json.dumps(obj) + "\n")
            count += 1

    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default input/output pairs, processed when no arguments are given
    default_jobs = [
        (
            "../data/intermediate/raft_train.jsonl",
            "../data/intermediate/raft_train_no_meta.jsonl",
        ),
        (
            "../data/intermediate/raft_val.jsonl",
            "../data/intermediate/raft_val_no_meta.jsonl",
        ),
    ]

    # Allow command-line overrides: `script.py INPUT [OUTPUT]`.
    # Previously the arguments were stored in variables that were never
    # used, so they had no effect on which files were processed.
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            # No explicit output: write next to the input with the same
            # "_no_meta" naming used by the default pairs.
            source = Path(input_file)
            output_file = str(source.with_name(f"{source.stem}_no_meta{source.suffix}"))
        jobs = [(input_file, output_file)]
    else:
        jobs = default_jobs

    for job_input, job_output in jobs:
        remove_meta_column(job_input, job_output)
|
||||
Reference in New Issue
Block a user