#!/usr/bin/env python3
"""
Extract questions and their contexts from a markdown file, query the
DeepSeek chat-completions API for each question, and write a new markdown
file in which every question is followed by its answer and then its context.

Expected input layout::

    ## Section title
    ### Question text
    Context:
    ```
    ...context documents...
    ```
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, List, Optional, Tuple

import requests

DEEPSEEK_API_URL = "https://api.deepseek.com/chat/completions"

# Title line written at the top of the generated markdown file.
OUTPUT_TITLE = "# Fragenkatalog Evaluation - Answered\n\n"

# Patterns are compiled once instead of on every section/question.
SECTION_RE = re.compile(r"^## (.+?)$", flags=re.MULTILINE)
QUESTION_RE = re.compile(r"^### (.+?)$", flags=re.MULTILINE)
CONTEXT_RE = re.compile(r"Context:\s*\n\s*```\n(.*?)\n```", flags=re.DOTALL)

# Prompt used when --system-prompt is not given on the command line.
DEFAULT_SYSTEM_PROMPT = """You are a culturally interested Bali traveler in a lead user interview with a marketer.
Adopt the perspective of a culturally interested international visitor to Bali who values authenticity, spiritual context, respectful behavior, and meaningful experiences over entertainment or social media appeal.

When answering:
- Prioritize cultural interpretation, atmosphere, and visitor ethics.
- Weigh trade-offs thoughtfully (e.g., crowds vs. significance).
- Avoid generic travel advice and avoid promotional language.
- Do not exaggerate.
- Provide nuanced, reflective reasoning rather than bullet lists.
- Keep answers concise but specific.

Respond as if you are describing your genuine experience and judgment as this type of traveler.

Use the provided CONTEXT to inform your answer, but do not feel obligated to use all of it.
If the CONTEXT is not relevant to the question, you can ignore it.
NEVER directly quote the CONTEXT verbatim.
NEVER mention DOC or any context sources you are referring to.
Instead, use it to synthesize your own understanding and response.
"""


def extract_questions_and_contexts(file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Extract questions and their contexts from the markdown file.

    Sections are introduced by ``## `` headings and questions by ``### ``
    headings. A question is only returned when its body contains a
    ``Context:`` label followed by a bare fenced code block.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read

    Returns:
        List of tuples: (section_title, question_text, context_text,
        question_body), where question_body is the raw text between this
        question heading and the next heading.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    results = []

    # re.split with one capture group yields
    # ['preamble', 'Section 1', 'content...', 'Section 2', 'content...', ...],
    # so titles sit at odd indices and their bodies at the following even index.
    sections = SECTION_RE.split(content)
    for section_title, section_content in zip(sections[1::2], sections[2::2]):
        questions = QUESTION_RE.split(section_content)
        for question_text, question_content in zip(questions[1::2], questions[2::2]):
            context_match = CONTEXT_RE.search(question_content)
            if context_match is None:
                # Questions without a context block are skipped.
                continue
            results.append(
                (
                    section_title,
                    question_text,
                    context_match.group(1),
                    question_content,
                )
            )

    return results


def extract_answer_text(result: Any) -> Optional[str]:
    """
    Pull the assistant message text out of a decoded chat-completions payload.

    Args:
        result: The decoded JSON body of the API response

    Returns:
        The content of the first choice, or None when the payload does not
        have the expected ``choices[0].message.content`` string.
    """
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return None
    return content if isinstance(content, str) else None


def query_deepseek_api(
    question: str, context: str, system_prompt: str, api_key: str
) -> Optional[str]:
    """
    Query DeepSeek API with the question and context.

    Args:
        question: The question text
        context: The context/documents
        system_prompt: System prompt to use
        api_key: DeepSeek API key

    Returns:
        The API response text, or None if the request fails, the body is
        not valid JSON, or the payload has an unexpected shape. Failures
        are reported on stderr.
    """
    user_message = f"Question: {question}\n\nContext:\n{context}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.7,
        "max_tokens": 1000,
    }

    try:
        response = requests.post(
            DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30
        )
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"API request failed: {e}", file=sys.stderr)
        return None

    answer = extract_answer_text(result)
    if answer is None:
        print(f"Unexpected API response: {result}", file=sys.stderr)
    return answer


def format_answer(answer: str) -> str:
    """
    Format the answer as a response section above the context.

    Args:
        answer: The answer text from the API

    Returns:
        Formatted answer string
    """
    return f"**Answer:**\n\n{answer}\n\n"


def build_output_markdown(
    results: List[Tuple[str, str, Optional[str], str]],
) -> str:
    """
    Render the answered questions as one markdown document.

    Args:
        results: Tuples of (section, question, answer, context); answer is
            None (or empty) when no answer could be generated

    Returns:
        The complete markdown text, starting with the document title
    """
    parts = [OUTPUT_TITLE]
    current_section = None

    for section, question, answer, context in results:
        # Emit the section heading only when the section changes.
        if section != current_section:
            parts.append(f"## {section}\n\n")
            current_section = section

        parts.append(f"### {question}\n\n")

        if answer:
            parts.append(format_answer(answer))
        else:
            parts.append("**Answer:** _(Failed to generate)_\n\n")

        parts.append(f"Context:\n\n```\n{context}\n```\n\n")

    return "".join(parts)


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description="Extract questions from markdown and query DeepSeek API"
    )
    parser.add_argument("input_file", help="Input markdown file with questions")
    parser.add_argument(
        "--output",
        "-o",
        help="Output file (default: <input name>_answered.md next to the input file)",
        default=None,
    )
    parser.add_argument(
        "--system-prompt",
        "-s",
        help="System prompt for the API (default: built-in Bali traveler persona)",
        default=None,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making API calls",
    )
    parser.add_argument(
        "--api-key",
        help="DeepSeek API key (can also be set via DEEPSEEK_API_KEY env var)",
    )
    return parser


def main(argv: Optional[List[str]] = None) -> None:
    """
    Main execution function.

    Args:
        argv: Command-line arguments without the program name; when None,
            argparse reads them from sys.argv

    Exits with status 1 when the input file does not exist or when no API
    key is available outside of dry-run mode.
    """
    args = build_parser().parse_args(argv)

    # Validate input file
    input_path = Path(args.input_file)
    if not input_path.exists():
        print(f"Error: Input file '{args.input_file}' not found", file=sys.stderr)
        sys.exit(1)

    # The key is only required when API calls will actually be made.
    api_key = args.api_key or os.getenv("DEEPSEEK_API_KEY")
    if not api_key and not args.dry_run:
        print("Error: DeepSeek API key not provided", file=sys.stderr)
        print("Set DEEPSEEK_API_KEY env var or use --api-key", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting questions from: {args.input_file}")
    questions_data = extract_questions_and_contexts(str(input_path))
    print(f"Found {len(questions_data)} questions\n")

    if args.dry_run:
        print("DRY RUN MODE - No API calls will be made\n")
        for i, (section, question, context, _) in enumerate(questions_data, 1):
            print(f"--- Question {i} ---")
            print(f"Section: {section}")
            print(f"Question: {question}")
            print(f"Context length: {len(context)} characters")
            print()
        return

    print("Processing questions with DeepSeek API...\n")

    # Honour --system-prompt; fall back to the built-in persona otherwise.
    if args.system_prompt is None:
        system_prompt = DEFAULT_SYSTEM_PROMPT
    else:
        system_prompt = args.system_prompt

    results = []
    total = len(questions_data)
    for i, (section, question, context, _) in enumerate(questions_data, 1):
        print(f"Processing question {i}/{total}: {question[:60]}...")

        answer = query_deepseek_api(question, context, system_prompt, api_key)

        if answer:
            print(f" ✓ Got answer ({len(answer)} chars)")
            results.append((section, question, answer, context))
        else:
            print(" ✗ Failed to get answer")
            results.append((section, question, None, context))

    # The input file is never overwritten: without --output the result goes
    # to a sibling file named after the input.
    output_path = (
        Path(args.output)
        if args.output
        else input_path.parent / f"{input_path.stem}_answered.md"
    )

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(build_output_markdown(results))

    print(f"\n✓ Complete! Written to: {output_path}")


if __name__ == "__main__":
    main()