mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
Switch corpus input from TSV to CSV
This commit is contained in:
@@ -9,7 +9,7 @@
|
|||||||
## Vorbereiten des Retrieval-Corpus
|
## Vorbereiten des Retrieval-Corpus
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python prepare_corpus.py --input_tab ../data/intermediate/culture_reviews.csv --out_dir out
|
python prepare_corpus.py --input_csv ../data/intermediate/culture_reviews.csv --out_dir out
|
||||||
```
|
```
|
||||||
|
|
||||||
## Erstellen des RAFT-Datensatzes
|
## Erstellen des RAFT-Datensatzes
|
||||||
|
|||||||
@@ -48,13 +48,13 @@ def detect_text_col(df: pd.DataFrame) -> str:
|
|||||||
best_score = avg_len
|
best_score = avg_len
|
||||||
best_col = col
|
best_col = col
|
||||||
if best_col is None:
|
if best_col is None:
|
||||||
raise ValueError("Could not detect a text column in the .tab file.")
|
raise ValueError("Could not detect a text column in the .csv file.")
|
||||||
return best_col
|
return best_col
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
ap.add_argument("--input_tab", required=True, help="Tripadvisor reviews .tab file")
|
ap.add_argument("--input_csv", required=True, help="Tripadvisor reviews .csv file")
|
||||||
ap.add_argument("--out_dir", default="out")
|
ap.add_argument("--out_dir", default="out")
|
||||||
ap.add_argument(
|
ap.add_argument(
|
||||||
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
"--embedding_model", default="sentence-transformers/all-MiniLM-L6-v2"
|
||||||
@@ -65,11 +65,9 @@ def main():
|
|||||||
|
|
||||||
os.makedirs(args.out_dir, exist_ok=True)
|
os.makedirs(args.out_dir, exist_ok=True)
|
||||||
|
|
||||||
# Many .tab files are TSV
|
df = pd.read_csv(args.input_csv, sep=",", dtype=str, on_bad_lines="skip")
|
||||||
df = pd.read_csv(args.input_tab, sep="\t", dtype=str, on_bad_lines="skip")
|
|
||||||
text_col = detect_text_col(df)
|
|
||||||
|
|
||||||
rows = df[text_col].fillna("").astype(str).tolist()
|
rows = df["Original"].fillna("").astype(str).tolist()
|
||||||
|
|
||||||
corpus_path = os.path.join(args.out_dir, "corpus.jsonl")
|
corpus_path = os.path.join(args.out_dir, "corpus.jsonl")
|
||||||
corpus = []
|
corpus = []
|
||||||
@@ -111,9 +109,7 @@ def main():
|
|||||||
for i, c in enumerate(corpus):
|
for i, c in enumerate(corpus):
|
||||||
f.write(json.dumps({"faiss_id": i, **c}, ensure_ascii=False) + "\n")
|
f.write(json.dumps({"faiss_id": i, **c}, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
print(
|
print(f"Saved:\n- {corpus_path}\n- {faiss_path}\n- {mapping_path}")
|
||||||
f"Saved:\n- {corpus_path}\n- {faiss_path}\n- {mapping_path}\nText column detected: {text_col}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user