From 7d742ec4f9979fd3b45276339b477f3d522b10e7 Mon Sep 17 00:00:00 2001
From: Jooha Yoo <jy49@calvin.edu>
Date: Mon, 8 Dec 2025 16:38:19 -0500
Subject: [PATCH 1/3] Add Langfuse observability, update traces with metadata

---
 backend/nlp.py | 136 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 118 insertions(+), 18 deletions(-)

diff --git a/backend/nlp.py b/backend/nlp.py
index 776cdc6c..622100e4 100644
--- a/backend/nlp.py
+++ b/backend/nlp.py
@@ -7,6 +7,9 @@
 
 from dotenv import load_dotenv
 from pydantic import BaseModel
+from pathlib import Path
+
+from langfuse import Langfuse, observe, get_client
 
 import openai
 from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
@@ -14,38 +17,51 @@
 
 MODEL_PARAMS = {
     "model": "gpt-4o",
-    # "model": "gpt-5-mini",
-    # "reasoning_effort": "minimal",
-    # "text_verbosity": "medium"
 }
 DEBUG_PROMPTS = False
 
-load_dotenv()
+# Load .env from backend directory
+env_path = Path(__file__).parent / ".env"
+load_dotenv(dotenv_path=env_path)
 openai_api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
 
 if openai_api_key == "":
     raise Exception("OPENAI_API_KEY is not set. Please set it in a .env file.")
 
+# Validate Langfuse configuration
+langfuse_public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
+langfuse_secret_key = os.getenv("LANGFUSE_SECRET_KEY")
+langfuse_base_url = os.getenv("LANGFUSE_BASE_URL")
+
+if not langfuse_public_key or not langfuse_secret_key:
+    raise Exception(
+        "LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY must be set in .env file. "
+        f"Current values: PUBLIC_KEY={'set' if langfuse_public_key else 'MISSING'}, "
+        f"SECRET_KEY={'set' if langfuse_secret_key else 'MISSING'}, "
+        f"BASE_URL={langfuse_base_url or 'MISSING'}"
+    )
+
 openai_client = AsyncOpenAI(
     api_key=openai_api_key,
 )
+ 
+langfuse = Langfuse(
+    public_key=langfuse_public_key,
+    secret_key=langfuse_secret_key,
+    host=langfuse_base_url 
+)
 
 
 async def warmup_nlp():
-    # Warm up the OpenAI client by making a dummy request
     dummy_client = openai.AsyncOpenAI(
         base_url="https://localhost:8000/v1", api_key="", timeout=0.01, max_retries=0
     )
-    # make a dummy request to make sure everything is imported
     try:
         await dummy_client.chat.completions.create(
             model="gpt-4o",
-            # model="gpt-5-mini",
-            # reasoning_effort="minimal",
             messages=[{"role": "user", "content": "Hello"}],
         )
     except openai.APIConnectionError:
-        # We expect this error because we're connecting to a non-existent server
         pass
 
 
@@ -125,7 +141,6 @@ def get_full_prompt(
 ) -> str:
     prompt = prompts[prompt_name]
 
-    # Choose which context to use based on the flag
     context_data = (
         doc_context.falseContextData if use_false_context else doc_context.contextData
     )
@@ -157,15 +172,29 @@ class ListResponse(BaseModel):
     responses: List[str]
 
 
+@observe(name="llm_parse_context", as_type="generation")
 async def _get_suggestions_from_context(
-    prompt_name: str, doc_context: DocContext, use_false_context: bool = False
+    prompt_name: str, 
+    doc_context: DocContext, 
+    use_false_context: bool = False
 ) -> List[str]:
     """Helper function to get suggestions from a specific context"""
+    context_type = "false" if use_false_context else "true"
+    
+    # Update current observation with metadata (v3 pattern because of langfuse_decorator version issues)
+    langfuse = get_client()
+    langfuse.update_current_observation(
+        metadata={
+            "prompt_name": prompt_name,
+            "context_type": context_type,
+            "use_false_context": use_false_context
+        }
+    )
+    
     full_prompt = get_full_prompt(
         prompt_name, doc_context, use_false_context=use_false_context
     )
     if DEBUG_PROMPTS:
-        context_type = "false" if use_false_context else "true"
         print(f"Prompt for {prompt_name} ({context_type} context):\n{full_prompt}\n")
 
     completion = await openai_client.chat.completions.parse(
@@ -187,12 +216,34 @@ async def _get_suggestions_from_context(
     return suggestion_response.responses
 
 
+@observe(name="get_suggestion")
 async def get_suggestion(prompt_name: str, doc_context: DocContext) -> GenerationResult:
+    """
+    Main function to get suggestions with Langfuse tracing.
+    This creates a trace for each suggestion request.
+    """
+    # Update trace with metadata for filtering in Langfuse (v3 pattern)
+    langfuse = get_client()
+    langfuse.update_current_trace(
+        name=f"suggestion_{prompt_name}",
+        metadata={
+            "suggestion_type": prompt_name,  # Primary field for evaluation filtering
+            "has_false_context": doc_context.falseContextData is not None and len(doc_context.falseContextData) > 0,
+            "has_true_context": doc_context.contextData is not None and len(doc_context.contextData) > 0,
+            "document_length": len(doc_context.beforeCursor + doc_context.selectedText + doc_context.afterCursor),
+            "has_selection": len(doc_context.selectedText) > 0,
+            "model": MODEL_PARAMS["model"]
+        },
+        tags=[prompt_name, "suggestion"],
+        session_id=prompt_name  # Groups all traces of same type together
+    )
+    
     # Special handling for complete_document: always use false context only, plain completion
     if prompt_name == "complete_document":
         full_prompt = get_full_prompt(prompt_name, doc_context, use_false_context=True)
         if DEBUG_PROMPTS:
             print(f"Prompt for {prompt_name} (false context only):\n{full_prompt}\n")
+        
         completion = await openai_client.chat.completions.create(
             **MODEL_PARAMS,
             messages=[
@@ -204,13 +255,23 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
         result = completion.choices[0].message.content
         if not result:
             raise ValueError("No response found from complete_document.")
+        
+        langfuse.update_current_trace(
+            output={"result": result}
+        )
+        
         return GenerationResult(generation_type=prompt_name, result=result, extra_data={})
 
-    # If falseContextData is None/empty, use baseline behavior
+    # If falseContextData is none/empty, use baseline behavior
     if not doc_context.falseContextData:
+        langfuse.update_current_trace(
+            metadata={"execution_mode": "baseline"}
+        )
+        
         full_prompt = get_full_prompt(prompt_name, doc_context)
         if DEBUG_PROMPTS:
             print(f"Prompt for {prompt_name} (baseline):\n{full_prompt}\n")
+        
         completion = await openai_client.chat.completions.parse(
             **MODEL_PARAMS,
             messages=[
@@ -224,16 +285,29 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
         )
 
         suggestion_response = completion.choices[0].message.parsed
-        if not suggestion_response or not suggestion_response:
+        if not suggestion_response or not suggestion_response.responses:
             raise ValueError("No suggestions found in the response.")
+        
         markdown_response = "\n\n".join(
             [f"- {item}" for item in suggestion_response.responses]
         )
+        
+        langfuse.update_current_trace(
+            output={
+                "result": markdown_response,
+                "suggestions": suggestion_response.responses
+            }
+        )
+        
         return GenerationResult(
             generation_type=prompt_name, result=markdown_response, extra_data={}
         )
 
     # Study mode: parallel calls with mixing
+    langfuse.update_current_trace(
+        metadata={"execution_mode": "study_mode_with_mixing"}
+    )
+    
     true_suggestions_task = _get_suggestions_from_context(
         prompt_name, doc_context, use_false_context=False
     )
@@ -248,6 +322,9 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
 
     if len(true_suggestions) == 0 or len(false_suggestions) == 0:
         # One or both of the queries refused.
+        langfuse.update_current_trace(
+            metadata={"refusal": True}
+        )
         return GenerationResult(generation_type=prompt_name, result="", extra_data={})
 
     if len(true_suggestions) != 3 or len(false_suggestions) != 3:
@@ -274,7 +351,7 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
     request_hash = hashlib.sha256(
         json.dumps(request_body, sort_keys=True).encode()
     ).hexdigest()
-    shuffle_seed = int(request_hash[:8], 16)  # Use first 8 hex chars as seed
+    shuffle_seed = int(request_hash[:8], 16)
 
     # Combine and shuffle suggestions
     all_suggestions = []
@@ -321,6 +398,22 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
             1 for item in selected_suggestions if item["source"] == "false"
         ),
     }
+    
+    # Add mixing results to trace metadata
+    langfuse.update_current_trace(
+        metadata={
+            "mixing_stats": {
+                "true_count": extra_data["total_true_suggestions"],
+                "false_count": extra_data["total_false_suggestions"],
+                "shuffle_seed": shuffle_seed
+            }
+        },
+        output={
+            "result": markdown_response,
+            "suggestions": [item["content"] for item in selected_suggestions],
+            "sources": [item["source"] for item in selected_suggestions]
+        }
+    )
 
     return GenerationResult(
         generation_type=prompt_name, result=markdown_response, extra_data=extra_data
@@ -342,8 +435,6 @@ async def chat(
     )
 
     result = response.choices[0].message.content
-
-    # FIXME: figure out why result might ever be None
     return result or ""
 
 
@@ -356,8 +447,17 @@ def chat_stream(messages: Iterable[ChatCompletionMessageParam], temperature: flo
     )
 
 
+@observe(name="reflection", as_type="generation")
 async def reflection(userDoc: str, paragraph: str) -> GenerationResult:
     temperature = 1.0
+    
+    langfuse = get_client()
+    langfuse.update_current_observation(
+        metadata={
+            "temperature": temperature,
+            "generation_type": "reflection"
+        }
+    )
 
     questions = await chat(
         messages=[
@@ -374,4 +474,4 @@ async def reflection(userDoc: str, paragraph: str) -> GenerationResult:
             "prompt": userDoc,
             "temperature": temperature,
         },
-    )
+    )
\ No newline at end of file

From 4b3f9ad4cfc54ea61bde3e9c8b4432d2c03c5d2f Mon Sep 17 00:00:00 2001
From: Jooha Yoo <jy49@calvin.edu>
Date: Sat, 13 Dec 2025 19:34:28 -0500
Subject: [PATCH 2/3] commit

---
 backend/nlp.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/nlp.py b/backend/nlp.py
index 622100e4..04551be9 100644
--- a/backend/nlp.py
+++ b/backend/nlp.py
@@ -20,7 +20,7 @@
 }
 DEBUG_PROMPTS = False
 
-# Load .env from backend directory
+# Load .env s
 env_path = Path(__file__).parent / ".env"
 load_dotenv(dotenv_path=env_path)
 openai_api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
@@ -224,6 +224,7 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
     """
     # Update trace with metadata for filtering in Langfuse (v3 pattern)
     langfuse = get_client()
+
     langfuse.update_current_trace(
         name=f"suggestion_{prompt_name}",
         metadata={
@@ -236,8 +237,7 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
         },
         tags=[prompt_name, "suggestion"],
         session_id=prompt_name  # Groups all traces of same type together
-    )
-    
+       )
     # Special handling for complete_document: always use false context only, plain completion
     if prompt_name == "complete_document":
         full_prompt = get_full_prompt(prompt_name, doc_context, use_false_context=True)

From b3273dd1800d6f794984f63dbbd3002bab4b3798 Mon Sep 17 00:00:00 2001
From: Jooha Yoo <jy49@calvin.edu>
Date: Sat, 13 Dec 2025 19:42:21 -0500
Subject: [PATCH 3/3] c

---
 backend/nlp.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/backend/nlp.py b/backend/nlp.py
index 04551be9..52d522be 100644
--- a/backend/nlp.py
+++ b/backend/nlp.py
@@ -17,10 +17,13 @@
 
 MODEL_PARAMS = {
     "model": "gpt-4o",
+    # "model": "gpt-5-mini",
+    # "reasoning_effort": "minimal",
+    # "text_verbosity": "medium"
 }
 DEBUG_PROMPTS = False
 
-# Load .env s
+# Load .env 
 env_path = Path(__file__).parent / ".env"
 load_dotenv(dotenv_path=env_path)
 openai_api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
@@ -53,15 +56,20 @@
 
 
 async def warmup_nlp():
+    # Warm up the OpenAI client by making a dummy request
     dummy_client = openai.AsyncOpenAI(
         base_url="https://localhost:8000/v1", api_key="", timeout=0.01, max_retries=0
     )
+    # make a dummy request to make sure everything is imported
     try:
         await dummy_client.chat.completions.create(
             model="gpt-4o",
+            # model="gpt-5-mini",
+            # reasoning_effort="minimal",
             messages=[{"role": "user", "content": "Hello"}],
         )
     except openai.APIConnectionError:
+        # We expect this error because we're connecting to a non-existent server
         pass
 
 
@@ -141,6 +149,7 @@ def get_full_prompt(
 ) -> str:
     prompt = prompts[prompt_name]
 
+    # Choose which context to use based on the flag
     context_data = (
         doc_context.falseContextData if use_false_context else doc_context.contextData
     )
@@ -262,7 +271,7 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
         
         return GenerationResult(generation_type=prompt_name, result=result, extra_data={})
 
-    # If falseContextData is none/empty, use baseline behavior
+    # If falseContextData is None/empty, use baseline behavior
     if not doc_context.falseContextData:
         langfuse.update_current_trace(
             metadata={"execution_mode": "baseline"}
@@ -351,7 +360,7 @@ async def get_suggestion(prompt_name: str, doc_context: DocContext) -> Generatio
     request_hash = hashlib.sha256(
         json.dumps(request_body, sort_keys=True).encode()
     ).hexdigest()
-    shuffle_seed = int(request_hash[:8], 16)
+    shuffle_seed = int(request_hash[:8], 16)  # Use first 8 hex digits as seed
 
     # Combine and shuffle suggestions
     all_suggestions = []