diff --git a/raft/README.md b/raft/README.md index e881e36..45e8be3 100644 --- a/raft/README.md +++ b/raft/README.md @@ -9,7 +9,7 @@ ## Vorbereiten des Retrieval-Corpus ```bash -python prepare_corpus.py --input_tab ../data/intermediate/culture_reviews.csv --out_dir out +python prepare_corpus.py --input_csv ../data/intermediate/culture_reviews.csv --out_dir out ``` ## Erstellen des RAFT-Datensatzes diff --git a/raft/prepare_corpus.py b/raft/prepare_corpus.py index ca16eb2..88c05e9 100644 --- a/raft/prepare_corpus.py +++ b/raft/prepare_corpus.py @@ -48,13 +48,13 @@ def detect_text_col(df: pd.DataFrame) -> str: best_score = avg_len best_col = col if best_col is None: - raise ValueError("Could not detect a text column in the .tab file.") + raise ValueError("Could not detect a text column in the .csv file.") return best_col def main(): ap = argparse.ArgumentParser() - ap.add_argument("--input_tab", required=True, help="Tripadvisor reviews .tab file") + ap.add_argument("--input_csv", required=True, help="Tripadvisor reviews .csv file") ap.add_argument("--out_dir", default="out") ap.add_argument( "--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2" @@ -65,11 +65,9 @@ def main(): os.makedirs(args.out_dir, exist_ok=True) - # Many .tab files are TSV - df = pd.read_csv(args.input_tab, sep="\t", dtype=str, on_bad_lines="skip") - text_col = detect_text_col(df) + df = pd.read_csv(args.input_csv, sep=",", dtype=str, on_bad_lines="skip") - rows = df[text_col].fillna("").astype(str).tolist() + rows = df["Original"].fillna("").astype(str).tolist() corpus_path = os.path.join(args.out_dir, "corpus.jsonl") corpus = [] @@ -111,9 +109,7 @@ def main(): for i, c in enumerate(corpus): f.write(json.dumps({"faiss_id": i, **c}, ensure_ascii=False) + "\n") - print( - f"Saved:\n- {corpus_path}\n- {faiss_path}\n- {mapping_path}\nText column detected: {text_col}" - ) + print(f"Saved:\n- {corpus_path}\n- {faiss_path}\n- {mapping_path}") if __name__ == "__main__":