From ffc7b36b67b8d80178e483089f1526cf3bd3a171 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 5 May 2026 18:20:19 +0000
Subject: [PATCH 1/2] Upgrade OpenAI model to gpt-5.4-mini across all
 components

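Replaces gpt-4o (backend/nlp.py, scripts/llm_analysis.py defaults),
gpt-5.2 (experiment API routes and experiment/scripts/evalColleague.ts),
and the gpt-4o-mini judge default (experiment/lib/eval/colleagueEval.ts)
with gpt-5.4-mini. Also removes the commented-out gpt-5-mini
alternatives in backend/nlp.py.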

https://claude.ai/code/session_01D4Tdbmy3kzxCzce3cQpFya
---
 backend/nlp.py                              |  6 ++----
 experiment/app/api/chat/route.ts            |  2 +-
 experiment/app/api/writing-support/route.ts |  2 +-
 experiment/lib/eval/colleagueEval.ts        |  2 +-
 experiment/scripts/evalColleague.ts         |  2 +-
 scripts/llm_analysis.py                     | 12 ++++++------
 6 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/backend/nlp.py b/backend/nlp.py
index 1517487f..f7ea47ed 100644
--- a/backend/nlp.py
+++ b/backend/nlp.py
@@ -15,8 +15,7 @@
 from posthog_client import posthog_client as ph_client
 
 MODEL_PARAMS = {
-    "model": "gpt-4o",
-    # "model": "gpt-5-mini",
+    "model": "gpt-5.4-mini",
     # "reasoning_effort": "minimal",
     # "text_verbosity": "medium"
 }
@@ -43,8 +42,7 @@ async def warmup_nlp():
     # make a dummy request to make sure everything is imported
     try:
         await dummy_client.chat.completions.create(
-            model="gpt-4o",
-            # model="gpt-5-mini",
+            model="gpt-5.4-mini",
             # reasoning_effort="minimal",
             messages=[{"role": "user", "content": "Hello"}],
         )
diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts
index 0544bc61..d14ad9fb 100644
--- a/experiment/app/api/chat/route.ts
+++ b/experiment/app/api/chat/route.ts
@@ -9,7 +9,7 @@ export async function POST(req: Request) {
   const scenario = getScenario(scenarioId);
 
   const result = streamText({
-    model: openai('gpt-5.2'),
+    model: openai('gpt-5.4-mini'),
     system: scenario.chat.systemPrompt,
     messages: convertToModelMessages(messages),
     maxOutputTokens: 300,
diff --git a/experiment/app/api/writing-support/route.ts b/experiment/app/api/writing-support/route.ts
index e12d3db9..779f9be3 100644
--- a/experiment/app/api/writing-support/route.ts
+++ b/experiment/app/api/writing-support/route.ts
@@ -142,7 +142,7 @@ export async function POST(req: Request) {
     }
 
     const result = await generateObject({
-      model: openai('gpt-5.2'),
+      model: openai('gpt-5.4-mini'),
       schema: listResponseSchema,
       prompt: fullPrompt,
       system: 'You are a helpful and insightful writing assistant.',
diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts
index 07509e61..cde20b62 100644
--- a/experiment/lib/eval/colleagueEval.ts
+++ b/experiment/lib/eval/colleagueEval.ts
@@ -49,7 +49,7 @@ export async function evalColleagueResponse(
   testInput: string,
   colleagueResponse: string,
   criterion: EvalCriterion,
-  model: string = 'gpt-4o-mini'
+  model: string = 'gpt-5.4-mini'
 ): Promise<EvalResult> {
   const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario.
 
diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts
index f3f59e61..2244a5c9 100644
--- a/experiment/scripts/evalColleague.ts
+++ b/experiment/scripts/evalColleague.ts
@@ -41,7 +41,7 @@ async function callColleague(
   ];
 
   const result = await generateText({
-    model: openai('gpt-5.2'),
+    model: openai('gpt-5.4-mini'),
     system: systemPrompt,
     messages,
     maxOutputTokens: 300,
diff --git a/scripts/llm_analysis.py b/scripts/llm_analysis.py
index e491a783..0af94630 100644
--- a/scripts/llm_analysis.py
+++ b/scripts/llm_analysis.py
@@ -46,7 +46,7 @@ def get_scenario_context(scenario_id: str) -> dict:
     return SCENARIOS.get(scenario_id, SCENARIOS['roomDoubleBooking'])
 
 
-def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict:
+def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict:
     """
     Analyze email quality on multiple dimensions.
 
@@ -114,7 +114,7 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4
     return json.loads(response.choices[0].message.content)
 
 
-def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict:
+def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict:
     """
     Analyze how well the email addresses the recipient's likely emotions.
 
@@ -174,7 +174,7 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = '
     return json.loads(response.choices[0].message.content)
 
 
-def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> list[str]:
+def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> list[str]:
     """
     Generate a list of factual questions a careful reader would want to verify.
 
@@ -224,7 +224,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g
 def compare_questions_to_chat(
     questions: list[str],
     chat_messages: list[dict],
-    model: str = 'gpt-4o'
+    model: str = 'gpt-5.4-mini'
 ) -> dict:
     """
     Compare the factual questions that should be asked against
@@ -278,7 +278,7 @@ def compare_questions_to_chat(
 def analyze_ai_influence(
     email_text: str,
     ai_suggestions: list[dict],
-    model: str = 'gpt-4o'
+    model: str = 'gpt-5.4-mini'
 ) -> dict:
     """
     Analyze how much of the AI suggestions made it into the final email.
@@ -355,7 +355,7 @@ def analyze_ai_influence(
 
 def run_full_analysis(
     participant_data: dict,
-    model: str = 'gpt-4o',
+    model: str = 'gpt-5.4-mini',
     cache=None
 ) -> dict:
     """

From 59440b213795781d5fcc743f49d1f65577142b75 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 5 May 2026 18:28:33 +0000
Subject: [PATCH 2/2] Set reasoning_effort=low for gpt-5.4-mini across all call
 sites

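Without an explicit setting, the reasoning model uses its default reasoning
effort, which significantly increases latency and cost compared with "low".
Also adds the missing maxOutputTokens cap (500) to the writing-support route,
which previously had no output limit.

Note: the warmup request in backend/nlp.py is not touched by this patch; its
reasoning_effort argument remains commented out.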

https://claude.ai/code/session_01D4Tdbmy3kzxCzce3cQpFya
---
 backend/nlp.py                              | 2 +-
 experiment/app/api/chat/route.ts            | 2 +-
 experiment/app/api/writing-support/route.ts | 3 ++-
 experiment/lib/eval/colleagueEval.ts        | 2 +-
 experiment/scripts/evalColleague.ts         | 2 +-
 scripts/llm_analysis.py                     | 5 +++++
 6 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/backend/nlp.py b/backend/nlp.py
index f7ea47ed..06261cf2 100644
--- a/backend/nlp.py
+++ b/backend/nlp.py
@@ -16,7 +16,7 @@
 
 MODEL_PARAMS = {
     "model": "gpt-5.4-mini",
-    # "reasoning_effort": "minimal",
+    "reasoning_effort": "low",
     # "text_verbosity": "medium"
 }
 DEBUG_PROMPTS = False
diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts
index d14ad9fb..a08792ba 100644
--- a/experiment/app/api/chat/route.ts
+++ b/experiment/app/api/chat/route.ts
@@ -9,7 +9,7 @@ export async function POST(req: Request) {
   const scenario = getScenario(scenarioId);
 
   const result = streamText({
-    model: openai('gpt-5.4-mini'),
+    model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
     system: scenario.chat.systemPrompt,
     messages: convertToModelMessages(messages),
     maxOutputTokens: 300,
diff --git a/experiment/app/api/writing-support/route.ts b/experiment/app/api/writing-support/route.ts
index 779f9be3..8a0324e0 100644
--- a/experiment/app/api/writing-support/route.ts
+++ b/experiment/app/api/writing-support/route.ts
@@ -142,10 +142,11 @@ export async function POST(req: Request) {
     }
 
     const result = await generateObject({
-      model: openai('gpt-5.4-mini'),
+      model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
       schema: listResponseSchema,
       prompt: fullPrompt,
       system: 'You are a helpful and insightful writing assistant.',
+      maxOutputTokens: 500,
     });
 
     const suggestions = result.object.responses.length > 0
diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts
index cde20b62..32ab97ba 100644
--- a/experiment/lib/eval/colleagueEval.ts
+++ b/experiment/lib/eval/colleagueEval.ts
@@ -62,7 +62,7 @@ Think step by step, then answer with ONLY a JSON object (no markdown, no code bl
 {"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`;
 
   const result = await generateText({
-    model: openai(model),
+    model: openai(model, { reasoningEffort: 'low' }),
     prompt: judgePrompt,
     maxOutputTokens: 150,
   });
diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts
index 2244a5c9..fec78192 100644
--- a/experiment/scripts/evalColleague.ts
+++ b/experiment/scripts/evalColleague.ts
@@ -41,7 +41,7 @@ async function callColleague(
   ];
 
   const result = await generateText({
-    model: openai('gpt-5.4-mini'),
+    model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
     system: systemPrompt,
     messages,
     maxOutputTokens: 300,
diff --git a/scripts/llm_analysis.py b/scripts/llm_analysis.py
index 0af94630..f4187224 100644
--- a/scripts/llm_analysis.py
+++ b/scripts/llm_analysis.py
@@ -109,6 +109,7 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5
         model=model,
         messages=[{'role': 'user', 'content': prompt}],
         response_format={'type': 'json_object'},
+        reasoning_effort='low',
     )
 
     return json.loads(response.choices[0].message.content)
@@ -169,6 +170,7 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = '
         model=model,
         messages=[{'role': 'user', 'content': prompt}],
         response_format={'type': 'json_object'},
+        reasoning_effort='low',
     )
 
     return json.loads(response.choices[0].message.content)
@@ -215,6 +217,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g
         model=model,
         messages=[{'role': 'user', 'content': prompt}],
         response_format={'type': 'json_object'},
+        reasoning_effort='low',
     )
 
     result = json.loads(response.choices[0].message.content)
@@ -270,6 +273,7 @@ def compare_questions_to_chat(
         model=model,
         messages=[{'role': 'user', 'content': prompt}],
         response_format={'type': 'json_object'},
+        reasoning_effort='low',
     )
 
     return json.loads(response.choices[0].message.content)
@@ -345,6 +349,7 @@ def analyze_ai_influence(
         model=model,
         messages=[{'role': 'user', 'content': prompt}],
         response_format={'type': 'json_object'},
+        reasoning_effort='low',
     )
 
     result = json.loads(response.choices[0].message.content)