mirror of
https://github.com/marvinscham/masterthesis-playground.git
synced 2026-03-22 00:12:42 +01:00
268 lines
8.7 KiB
Python
268 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to extract questions and contexts from questions.md,
|
|
query DeepSeek API, and insert answers above the context sections.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Optional, Tuple
|
|
|
|
import requests
|
|
|
|
|
|
def extract_questions_and_contexts(file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Parse a markdown question catalogue into per-question records.

    The file is split on level-2 headings (``## ``) into sections, and each
    section is split on level-3 headings (``### ``) into questions. A question
    is kept only when its body contains a ``Context:`` label followed by a
    fenced code block; questions without such a block are skipped.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read.

    Returns:
        One tuple per kept question, in file order:
        (section_title, question_text, context_text, question_body), where
        question_body is the raw text between the question heading and the
        next heading.
    """
    section_heading = re.compile(r"^## (.+?)$", re.MULTILINE)
    question_heading = re.compile(r"^### (.+?)$", re.MULTILINE)
    context_block = re.compile(r"Context:\s*\n\s*```\n(.*?)\n```", re.DOTALL)

    with open(file_path, "r", encoding="utf-8") as handle:
        pieces = section_heading.split(handle.read())

    extracted = []

    # Splitting on a pattern with one capture group yields
    # [preamble, title, body, title, body, ...]; pair titles with bodies.
    for title, body in zip(pieces[1::2], pieces[2::2]):
        parts = question_heading.split(body)

        # Same alternating layout one level down: heading, body, heading, ...
        for heading, question_body in zip(parts[1::2], parts[2::2]):
            found = context_block.search(question_body)
            if found is None:
                continue
            extracted.append((title, heading, found.group(1), question_body))

    return extracted
|
|
|
|
|
|
def query_deepseek_api(
    question: str,
    context: str,
    system_prompt: str,
    api_key: str,
    *,
    model: str = "deepseek-chat",
    temperature: float = 0.7,
    max_tokens: int = 1000,
    timeout: float = 30,
) -> Optional[str]:
    """
    Query the DeepSeek chat-completions endpoint with a question and context.

    Args:
        question: The question text.
        context: The context/documents placed after the question in the
            user message.
        system_prompt: System prompt sent as the first message.
        api_key: DeepSeek API key, sent as a Bearer token.
        model: Model name placed in the request payload.
        temperature: Sampling temperature placed in the request payload.
        max_tokens: Completion token limit placed in the request payload.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The content of the first returned choice, or None when the request
        fails, the body is not valid JSON, or the payload does not have the
        expected ``choices[0].message.content`` shape. Failures are reported
        on stderr instead of being raised.
    """
    url = "https://api.deepseek.com/chat/completions"

    user_message = f"Question: {question}\n\nContext:\n{context}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}", file=sys.stderr)
        return None
    except ValueError as e:
        # Depending on the installed requests version, a non-JSON body can
        # surface as a plain ValueError rather than a RequestException.
        print(f"API returned invalid JSON: {e}", file=sys.stderr)
        return None

    try:
        return result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # Missing/empty "choices" or a malformed choice entry: report it on
        # stderr like every other failure and honour the None contract.
        print(f"Unexpected API response: {result}", file=sys.stderr)
        return None
|
|
|
|
|
|
def format_answer(answer: str) -> str:
    """
    Wrap an answer in the markdown layout used for response sections.

    Args:
        answer: The answer text from the API.

    Returns:
        A bold "Answer:" label, a blank line, the answer text and a
        trailing blank line.
    """
    template = "**Answer:**\n\n{}\n\n"
    return template.format(answer)
|
|
|
|
|
|
# Prompt used when --system-prompt is not given. It is defined once here so
# the command-line option can genuinely override it.
_DEFAULT_SYSTEM_PROMPT = """You are a culturally interested Bali traveler in a lead user interview with a marketer.

Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.

When answering:
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
- Avoid generic travel advice and avoid promotional language.
- Do not exaggerate.
- Provide nuanced, reflective reasoning rather than bullet lists.
- Keep answers concise but specific.

Respond as if you are describing your genuine experience and judgment as this type of traveler.

Use the provided CONTEXT to inform your answer, but do not feel obligated to use all of it. If the CONTEXT is not relevant to the question, you can ignore it.
NEVER directly quote the CONTEXT verbatim.
NEVER mention DOC or any context sources you are referring to. Instead, use it to synthesize your own understanding and response.
"""


def _build_arg_parser():
    """Create the command-line parser for the script."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract questions from markdown and query DeepSeek API"
    )
    parser.add_argument("input_file", help="Input markdown file with questions")
    parser.add_argument(
        "--output",
        "-o",
        help="Output file (default: <input name>_answered.md next to the input file)",
        default=None,
    )
    parser.add_argument(
        "--system-prompt",
        "-s",
        help="System prompt for the API (default: built-in Bali traveler persona)",
        default=_DEFAULT_SYSTEM_PROMPT,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making API calls",
    )
    parser.add_argument(
        "--api-key",
        help="DeepSeek API key (can also be set via DEEPSEEK_API_KEY env var)",
    )
    return parser


def _render_markdown(results) -> str:
    """Render (section, question, answer, context) tuples as the answered markdown document."""
    chunks = ["# Fragenkatalog Evaluation - Answered\n\n"]
    current_section = None

    for section, question, answer, context in results:
        # Emit each section heading once, when the section changes.
        if section != current_section:
            chunks.append(f"## {section}\n\n")
            current_section = section

        chunks.append(f"### {question}\n\n")

        if answer:
            chunks.append(format_answer(answer))
        else:
            chunks.append("**Answer:** _(Failed to generate)_\n\n")

        chunks.append(f"Context:\n\n```\n{context}\n```\n\n")

    return "".join(chunks)


def main():
    """
    Run the command-line workflow.

    Parses the arguments, extracts the questions from the input markdown
    file, and then either lists them (--dry-run) or queries the DeepSeek API
    for each one and writes an answered markdown file. The output goes to
    --output when given, otherwise to ``<input stem>_answered.md`` in the
    input file's directory.

    Exits with status 1 when the input file is missing or when no API key is
    available outside of dry-run mode.
    """
    import os

    args = _build_arg_parser().parse_args()

    # is_file() also rejects directories, which would otherwise fail later
    # when the path is opened for reading.
    input_path = Path(args.input_file)
    if not input_path.is_file():
        print(f"Error: Input file '{args.input_file}' not found", file=sys.stderr)
        sys.exit(1)

    # The key is only required when API calls will actually be made.
    api_key = args.api_key or os.getenv("DEEPSEEK_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: DeepSeek API key not provided", file=sys.stderr)
        print("Set DEEPSEEK_API_KEY env var or use --api-key", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting questions from: {args.input_file}")
    questions_data = extract_questions_and_contexts(str(input_path))
    print(f"Found {len(questions_data)} questions\n")

    if args.dry_run:
        print("DRY RUN MODE - No API calls will be made\n")
        for i, (section, question, context, _) in enumerate(questions_data, 1):
            print(f"--- Question {i} ---")
            print(f"Section: {section}")
            print(f"Question: {question}")
            print(f"Context length: {len(context)} characters")
            print()
        return

    print("Processing questions with DeepSeek API...\n")

    total = len(questions_data)
    results = []

    for i, (section, question, context, _) in enumerate(questions_data, 1):
        print(f"Processing question {i}/{total}: {question[:60]}...")

        answer = query_deepseek_api(question, context, args.system_prompt, api_key)

        if answer:
            print(f" ✓ Got answer ({len(answer)} chars)")
        else:
            print(" ✗ Failed to get answer")

        # Failed questions are kept so they still appear in the output,
        # marked as not generated.
        results.append((section, question, answer, context))

    output_path = (
        Path(args.output)
        if args.output
        else input_path.parent / f"{input_path.stem}_answered.md"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_render_markdown(results))

    print(f"\n✓ Complete! Written to: {output_path}")
|
|
|
|
|
|
# Run the command-line workflow only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == "__main__":
    main()
|