masterthesis-playground/questionnaire/extract_and_answer_questions.py

#!/usr/bin/env python3
"""
Script to extract questions and contexts from questions.md,
query DeepSeek API, and insert answers above the context sections.
"""

import json
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple

import requests


def extract_questions_and_contexts(file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Extract questions and their contexts from the markdown file.

    Sections are introduced by ``## `` headings and questions by ``### ``
    headings inside a section. A question is returned only when its body
    contains a ``Context:`` label followed by a fenced (```) block; questions
    without such a block are skipped. An empty fenced block yields an empty
    context string instead of dropping the question.

    Args:
        file_path: Path of the markdown file to parse

    Returns:
        List of tuples: (section_title, question_text, context_text, full_section)
        where full_section is the raw text between the question heading and
        the next question heading (or the end of the section).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    section_pattern = re.compile(r"^## (.+?)$", re.MULTILINE)
    question_pattern = re.compile(r"^### (.+?)$", re.MULTILINE)
    # The closing fence is anchored to a line start so that a fence directly
    # after the opening one (an empty block) is recognised as well.
    context_pattern = re.compile(
        r"Context:\s*\n\s*```\n(.*?)^```", re.DOTALL | re.MULTILINE
    )

    results = []

    # Splitting on a pattern with one capture group yields
    # [preamble, title, body, title, body, ...], so titles sit at the odd
    # indices and their bodies at the following even indices.
    sections = section_pattern.split(content)
    for section_title, section_content in zip(sections[1::2], sections[2::2]):
        questions = question_pattern.split(section_content)
        for question_text, question_content in zip(questions[1::2], questions[2::2]):
            context_match = context_pattern.search(question_content)
            if context_match is None:
                continue

            context_text = context_match.group(1)
            # A non-empty capture always ends with the newline that precedes
            # the closing fence; that newline is not part of the context.
            if context_text.endswith("\n"):
                context_text = context_text[:-1]

            results.append(
                (section_title, question_text, context_text, question_content)
            )

    return results


def query_deepseek_api(
    question: str, context: str, system_prompt: str, api_key: str
) -> Optional[str]:
    """
    Query DeepSeek API with the question and context.

    Sends one chat-completion request whose user message combines the
    question and the context. Every failure mode (transport/HTTP error,
    non-JSON body, payload without a usable first choice) is reported on
    stderr and turned into a ``None`` return rather than an exception.

    Args:
        question: The question text
        context: The context/documents
        system_prompt: System prompt to use
        api_key: DeepSeek API key

    Returns:
        The content of the first returned choice, or None if there's an error
    """
    url = "https://api.deepseek.com/chat/completions"

    # Prepare the message
    user_message = f"Question: {question}\n\nContext:\n{context}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.7,
        "max_tokens": 1000,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"API request failed: {e}", file=sys.stderr)
        return None
    except ValueError as e:
        # Depending on the installed requests version, a non-JSON body may
        # surface as a plain ValueError instead of a RequestException.
        print(f"API returned invalid JSON: {e}", file=sys.stderr)
        return None

    # The payload shape is not under our control: guard against a missing or
    # empty "choices" list and against non-dict payloads in one place.
    try:
        return result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        print(f"Unexpected API response: {result}", file=sys.stderr)
        return None


def format_answer(answer: str) -> str:
    """
    Wrap an answer in the markdown block that precedes the context section.

    Args:
        answer: Answer text as returned by the API

    Returns:
        A bold "Answer:" label, a blank line, the answer, and a trailing
        blank line
    """
    template = "**Answer:**\n\n{}\n\n"
    return template.format(answer)


# Persona prompt sent as the system message unless --system-prompt overrides it.
_DEFAULT_SYSTEM_PROMPT = """You are a culturally interested Bali traveler in a lead user interview with a marketer.

Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.

When answering:
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
- Avoid generic travel advice and avoid promotional language.
- Do not exaggerate.
- Provide nuanced, reflective reasoning rather than bullet lists.
- Keep answers concise but specific.

Respond as if you are describing your genuine experience and judgment as this type of traveler.

Use the provided CONTEXT to inform your answer, but do not feel obligated to use all of it. If the CONTEXT is not relevant to the question, you can ignore it.
NEVER directly quote the CONTEXT verbatim.
NEVER mention DOC or any context sources you are referring to. Instead, use it to synthesize your own understanding and response.
"""


def _build_arg_parser():
    """Create the command-line parser for this script."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract questions from markdown and query DeepSeek API"
    )
    parser.add_argument("input_file", help="Input markdown file with questions")
    parser.add_argument(
        "--output",
        "-o",
        help="Output file (default: <input stem>_answered.md next to the input file)",
        default=None,
    )
    parser.add_argument(
        "--system-prompt",
        "-s",
        help="System prompt for the API (default: built-in Bali traveler persona)",
        default=_DEFAULT_SYSTEM_PROMPT,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making API calls",
    )
    parser.add_argument(
        "--api-key",
        help="DeepSeek API key (can also be set via DEEPSEEK_API_KEY env var)",
    )
    return parser


def _render_markdown(results: List[Tuple[str, str, Optional[str], str]]) -> str:
    """Render (section, question, answer, context) tuples as the answered markdown document."""
    parts = ["# Fragenkatalog Evaluation - Answered\n\n"]
    current_section = None

    for section, question, answer, context in results:
        # Emit a section heading only when the section changes.
        if section != current_section:
            parts.append(f"## {section}\n\n")
            current_section = section

        parts.append(f"### {question}\n\n")

        if answer:
            parts.append(format_answer(answer))
        else:
            parts.append("**Answer:** _(Failed to generate)_\n\n")

        parts.append(f"Context:\n\n```\n{context}\n```\n\n")

    return "".join(parts)


def main():
    """
    Main execution function.

    Parses the command line, extracts every question that has a context
    block from the input markdown, asks the DeepSeek API for an answer to
    each one, and writes a new markdown file with the answer placed above
    each context. With --dry-run only the extracted questions are listed.
    Exits with status 1 when the input file is missing or no API key is
    available for a real run.
    """
    import os

    args = _build_arg_parser().parse_args()

    # Validate input file
    input_path = Path(args.input_file)
    if not input_path.exists():
        print(f"Error: Input file '{args.input_file}' not found", file=sys.stderr)
        sys.exit(1)

    # The key is only required when API calls will actually be made.
    api_key = args.api_key or os.getenv("DEEPSEEK_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: DeepSeek API key not provided", file=sys.stderr)
        print("Set DEEPSEEK_API_KEY env var or use --api-key", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting questions from: {args.input_file}")
    questions_data = extract_questions_and_contexts(str(input_path))
    print(f"Found {len(questions_data)} questions\n")

    if args.dry_run:
        print("DRY RUN MODE - No API calls will be made\n")
        for i, (section, question, context, _) in enumerate(questions_data, 1):
            print(f"--- Question {i} ---")
            print(f"Section: {section}")
            print(f"Question: {question}")
            print(f"Context length: {len(context)} characters")
            print()
        return

    print("Processing questions with DeepSeek API...\n")

    total = len(questions_data)
    results = []
    for i, (section, question, context, _) in enumerate(questions_data, 1):
        print(f"Processing question {i}/{total}: {question[:60]}...")

        # Honour the --system-prompt option instead of a value fixed in the loop.
        answer = query_deepseek_api(question, context, args.system_prompt, api_key)

        if answer:
            print(f"  ✓ Got answer ({len(answer)} chars)")
        else:
            print("  ✗ Failed to get answer")
        results.append((section, question, answer, context))

    # Write next to the input unless an explicit output path was given.
    output_path = (
        Path(args.output)
        if args.output
        else input_path.parent / f"{input_path.stem}_answered.md"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_render_markdown(results))

    print(f"\n✓ Complete! Written to: {output_path}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()