Files
masterthesis-playground/questionnaire/extract_and_answer_questions.py
2026-02-27 16:45:03 +01:00

268 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Script to extract questions and contexts from questions.md,
query DeepSeek API, and insert answers above the context sections.
"""
import json
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple
import requests
def extract_questions_and_contexts(file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Extract questions and their contexts from the markdown file.

    The file is split on ``## `` lines (sections) and, inside each section,
    on ``### `` lines (questions). A question is returned only when its body
    contains a ``Context:`` label followed by a fenced code block; questions
    without such a block, and anything before the first ``## `` heading, are
    skipped.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to parse.

    Returns:
        List of tuples in document order:
        (section_title, question_text, context_text, question_body), where
        question_body is the raw text between the question heading and the
        next heading (it still contains the context block).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    section_pattern = re.compile(r"^## (.+?)$", re.MULTILINE)
    question_pattern = re.compile(r"^### (.+?)$", re.MULTILINE)
    # Opening fence may carry an info string (e.g. "```text"); the closing
    # fence must start a line. The optional newline before it lets an empty
    # block ("```" directly followed by "```") match with an empty context
    # instead of being dropped or running on to a later fence.
    context_pattern = re.compile(
        r"Context:\s*\n\s*```[^\n]*\n(.*?)\n?^```",
        re.DOTALL | re.MULTILINE,
    )

    results = []
    # With one capture group, split() yields
    # [preamble, title_1, body_1, title_2, body_2, ...], so titles sit at the
    # odd indices and their bodies right after them.
    sections = section_pattern.split(content)
    for section_title, section_body in zip(sections[1::2], sections[2::2]):
        questions = question_pattern.split(section_body)
        for question_text, question_body in zip(questions[1::2], questions[2::2]):
            context_match = context_pattern.search(question_body)
            if context_match is None:
                continue
            results.append(
                (
                    # Headings are captured up to end of line; drop stray
                    # trailing whitespace so it does not leak into output.
                    section_title.strip(),
                    question_text.strip(),
                    context_match.group(1),
                    question_body,
                )
            )
    return results
def query_deepseek_api(
    question: str, context: str, system_prompt: str, api_key: str
) -> Optional[str]:
    """
    Query DeepSeek API with the question and context.

    Sends one chat-completion request whose user message combines the
    question and the context, and returns the text of the first choice.
    Every failure is reported on stderr and turned into a ``None`` return so
    the caller can keep processing the remaining questions.

    Args:
        question: The question text
        context: The context/documents
        system_prompt: System prompt to use
        api_key: DeepSeek API key

    Returns:
        The answer text, or None if the request failed or the response did
        not have the expected shape.
    """
    url = "https://api.deepseek.com/chat/completions"
    user_message = f"Question: {question}\n\nContext:\n{context}"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.7,
        "max_tokens": 1000,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}", file=sys.stderr)
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        print(f"API returned invalid JSON: {e}", file=sys.stderr)
        return None

    # Walk the expected structure defensively: a missing key, an empty
    # "choices" list or a non-dict payload must not crash the whole run.
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        content = None
    if not isinstance(content, str):
        print(f"Unexpected API response: {result}", file=sys.stderr)
        return None
    return content
def format_answer(answer: str) -> str:
    """
    Build the markdown answer block that is placed above a context section.

    Args:
        answer: The answer text from the API

    Returns:
        The bold "Answer:" label, a blank line, the answer text and a
        trailing blank line, as one string.
    """
    label = "**Answer:**\n\n"
    separator = "\n\n"
    return f"{label}{answer}{separator}"
# Persona prompt sent as the system message unless --system-prompt overrides
# it. This is the prompt the script has always sent in practice, so runs
# without the flag behave as before.
_DEFAULT_SYSTEM_PROMPT = """You are a culturally interested Bali traveler in a lead user interview with a marketer.
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.
When answering:
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
- Avoid generic travel advice and avoid promotional language.
- Do not exaggerate.
- Provide nuanced, reflective reasoning rather than bullet lists.
- Keep answers concise but specific.
Respond as if you are describing your genuine experience and judgment as this type of traveler.
Use the provided CONTEXT to inform your answer, but do not feel obligated to use all of it. If the CONTEXT is not relevant to the question, you can ignore it.
NEVER directly quote the CONTEXT verbatim.
NEVER mention DOC or any context sources you are referring to. Instead, use it to synthesize your own understanding and response.
"""


def main() -> None:
    """
    Main execution function.

    Parses the command line, extracts every question/context pair from the
    input markdown file, asks the DeepSeek API for an answer to each one and
    writes a new markdown document in which every question is followed by
    its answer and then its original context.

    With --dry-run the extracted questions are only listed; no API key is
    required and nothing is written.

    Exits with status 1 when the input file is missing or when no API key is
    available for a real run.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser(
        description="Extract questions from markdown and query DeepSeek API"
    )
    parser.add_argument("input_file", help="Input markdown file with questions")
    parser.add_argument(
        "--output",
        "-o",
        help="Output file (default: input file name with '_answered.md' "
        "in place of its suffix, next to the input file)",
        default=None,
    )
    parser.add_argument(
        "--system-prompt",
        "-s",
        help="System prompt for the API (default: built-in Bali traveler persona)",
        default=_DEFAULT_SYSTEM_PROMPT,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making API calls",
    )
    parser.add_argument(
        "--api-key",
        help="DeepSeek API key (can also be set via DEEPSEEK_API_KEY env var)",
    )
    args = parser.parse_args()

    # Validate input file; is_file() also rejects directories, which would
    # otherwise pass the check and fail later with a traceback on open().
    input_path = Path(args.input_file)
    if not input_path.is_file():
        print(f"Error: Input file '{args.input_file}' not found", file=sys.stderr)
        sys.exit(1)

    # The key is only mandatory when API calls will actually be made.
    api_key = args.api_key or os.getenv("DEEPSEEK_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: DeepSeek API key not provided", file=sys.stderr)
        print("Set DEEPSEEK_API_KEY env var or use --api-key", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting questions from: {args.input_file}")
    questions_data = extract_questions_and_contexts(str(input_path))
    total = len(questions_data)
    print(f"Found {total} questions\n")

    if args.dry_run:
        print("DRY RUN MODE - No API calls will be made\n")
        for i, (section, question, context, _) in enumerate(questions_data, 1):
            print(f"--- Question {i} ---")
            print(f"Section: {section}")
            print(f"Question: {question}")
            print(f"Context length: {len(context)} characters")
            print()
        return

    print("Processing questions with DeepSeek API...\n")

    # Query the API once per question; a failed or empty answer is recorded
    # as None so the question still appears in the output document.
    results = []
    for i, (section, question, context, _) in enumerate(questions_data, 1):
        print(f"Processing question {i}/{total}: {question[:60]}...")
        answer = query_deepseek_api(question, context, args.system_prompt, api_key)
        if answer:
            print(f" ✓ Got answer ({len(answer)} chars)")
        else:
            print(" ✗ Failed to get answer")
            answer = None
        results.append((section, question, answer, context))

    # Build the output markdown: a section heading whenever the section
    # changes, then question, answer and the original context block.
    output_lines = ["# Fragenkatalog Evaluation - Answered\n\n"]
    current_section = None
    for section, question, answer, context in results:
        if section != current_section:
            output_lines.append(f"## {section}\n\n")
            current_section = section
        output_lines.append(f"### {question}\n\n")
        if answer:
            output_lines.append(format_answer(answer))
        else:
            output_lines.append("**Answer:** _(Failed to generate)_\n\n")
        output_lines.append("Context:\n\n```\n")
        output_lines.append(context)
        output_lines.append("\n```\n\n")

    # Without --output the result goes next to the input under a new name,
    # so the source document is never overwritten by default.
    output_path = (
        Path(args.output)
        if args.output
        else input_path.parent / f"{input_path.stem}_answered.md"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(output_lines))

    failed = sum(1 for _, _, answer, _ in results if answer is None)
    if failed:
        print(
            f"Warning: {failed} of {total} questions could not be answered",
            file=sys.stderr,
        )
    print(f"\n✓ Complete! Written to: {output_path}")
# Run the command-line workflow only when this file is executed as a script;
# importing the module must not parse arguments or call the API.
if __name__ == "__main__":
    main()