RAFT shenanigans

This commit is contained in:
2026-02-21 23:47:12 +01:00
parent 49c622db08
commit 61edb35f70
14 changed files with 2943 additions and 6 deletions

60
raft/remove_meta.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""
Script to create copies of raft_train.jsonl and raft_val.jsonl without the 'meta' key and without 'system' role messages
"""
import json
import sys
from pathlib import Path
def remove_meta_column(input_file, output_file):
    """
    Copy a JSONL file, dropping the 'meta' key and 'system' role messages.

    Each non-blank input line is parsed as one JSON object. The top-level
    'meta' key is removed if present, and when the object has a 'messages'
    key, every message whose 'role' is 'system' is filtered out. The result
    is written as one JSON object per line. Blank lines are skipped and are
    not counted.

    Both files are opened explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.

    Args:
        input_file: Path to input JSONL file
        output_file: Path to output JSONL file (overwritten if it exists)

    Raises:
        SystemExit: with exit code 1 if the input file does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        print(f"Error: Input file '{input_file}' not found")
        sys.exit(1)

    count = 0
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():
                # Skip empty lines
                continue

            obj = json.loads(line)
            obj.pop("meta", None)  # Remove meta key if it exists

            # Remove system role messages
            if "messages" in obj:
                obj["messages"] = [
                    msg for msg in obj["messages"] if msg.get("role") != "system"
                ]

            outfile.write(json.dumps(obj) + "\n")
            count += 1

    print(f"✓ Processed {count} records")
    print(f"✓ Output saved to: {output_path}")
if __name__ == "__main__":
    # Default paths, used when no command-line arguments are given
    input_train_file = "../data/intermediate/raft_train.jsonl"
    output_train_file = "../data/intermediate/raft_train_no_meta.jsonl"
    input_val_file = "../data/intermediate/raft_val.jsonl"
    output_val_file = "../data/intermediate/raft_val_no_meta.jsonl"

    # Allow command-line overrides:
    #   remove_meta.py INPUT [OUTPUT]
    # Previously the arguments were read into variables that were never used,
    # so the defaults were always processed regardless of what was passed.
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
        else:
            # No explicit output: follow the default naming scheme and write
            # "<stem>_no_meta<suffix>" next to the input file.
            src = Path(input_file)
            output_file = str(src.with_name(f"{src.stem}_no_meta{src.suffix}"))
        remove_meta_column(input_file, output_file)
    else:
        remove_meta_column(input_train_file, output_train_file)
        remove_meta_column(input_val_file, output_val_file)