From ffc7b36b67b8d80178e483089f1526cf3bd3a171 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:20:19 +0000 Subject: [PATCH 1/2] Upgrade OpenAI model to gpt-5.4-mini across all components Replaces gpt-4o (backend/scripts), gpt-4o-mini (colleague eval judge default), and gpt-5.2 (experiment app) with gpt-5.4-mini in backend/nlp.py, experiment API routes, eval scripts, and llm_analysis.py defaults. https://claude.ai/code/session_01D4Tdbmy3kzxCzce3cQpFya --- backend/nlp.py | 6 ++---- experiment/app/api/chat/route.ts | 2 +- experiment/app/api/writing-support/route.ts | 2 +- experiment/lib/eval/colleagueEval.ts | 2 +- experiment/scripts/evalColleague.ts | 2 +- scripts/llm_analysis.py | 12 ++++++------ 6 files changed, 12 insertions(+), 14 deletions(-) diff --git a/backend/nlp.py b/backend/nlp.py index 1517487f..f7ea47ed 100644 --- a/backend/nlp.py +++ b/backend/nlp.py @@ -15,8 +15,7 @@ from posthog_client import posthog_client as ph_client MODEL_PARAMS = { - "model": "gpt-4o", - # "model": "gpt-5-mini", + "model": "gpt-5.4-mini", # "reasoning_effort": "minimal", # "text_verbosity": "medium" } @@ -43,8 +42,7 @@ async def warmup_nlp(): # make a dummy request to make sure everything is imported try: await dummy_client.chat.completions.create( - model="gpt-4o", - # model="gpt-5-mini", + model="gpt-5.4-mini", # reasoning_effort="minimal", messages=[{"role": "user", "content": "Hello"}], ) diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts index 0544bc61..d14ad9fb 100644 --- a/experiment/app/api/chat/route.ts +++ b/experiment/app/api/chat/route.ts @@ -9,7 +9,7 @@ export async function POST(req: Request) { const scenario = getScenario(scenarioId); const result = streamText({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini'), system: scenario.chat.systemPrompt, messages: convertToModelMessages(messages), maxOutputTokens: 300, diff --git a/experiment/app/api/writing-support/route.ts b/experiment/app/api/writing-support/route.ts index e12d3db9..779f9be3 100644 --- 
a/experiment/app/api/writing-support/route.ts +++ b/experiment/app/api/writing-support/route.ts @@ -142,7 +142,7 @@ export async function POST(req: Request) { } const result = await generateObject({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini'), schema: listResponseSchema, prompt: fullPrompt, system: 'You are a helpful and insightful writing assistant.', diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts index 07509e61..cde20b62 100644 --- a/experiment/lib/eval/colleagueEval.ts +++ b/experiment/lib/eval/colleagueEval.ts @@ -49,7 +49,7 @@ export async function evalColleagueResponse( testInput: string, colleagueResponse: string, criterion: EvalCriterion, - model: string = 'gpt-4o-mini' + model: string = 'gpt-5.4-mini' ): Promise { const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario. diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts index f3f59e61..2244a5c9 100644 --- a/experiment/scripts/evalColleague.ts +++ b/experiment/scripts/evalColleague.ts @@ -41,7 +41,7 @@ async function callColleague( ]; const result = await generateText({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini'), system: systemPrompt, messages, maxOutputTokens: 300, diff --git a/scripts/llm_analysis.py b/scripts/llm_analysis.py index e491a783..0af94630 100644 --- a/scripts/llm_analysis.py +++ b/scripts/llm_analysis.py @@ -46,7 +46,7 @@ def get_scenario_context(scenario_id: str) -> dict: return SCENARIOS.get(scenario_id, SCENARIOS['roomDoubleBooking']) -def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict: +def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict: """ Analyze email quality on multiple dimensions. 
@@ -114,7 +114,7 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4 return json.loads(response.choices[0].message.content) -def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict: +def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict: """ Analyze how well the email addresses the recipient's likely emotions. @@ -174,7 +174,7 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = ' return json.loads(response.choices[0].message.content) -def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> list[str]: +def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> list[str]: """ Generate a list of factual questions a careful reader would want to verify. @@ -224,7 +224,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g def compare_questions_to_chat( questions: list[str], chat_messages: list[dict], - model: str = 'gpt-4o' + model: str = 'gpt-5.4-mini' ) -> dict: """ Compare the factual questions that should be asked against @@ -278,7 +278,7 @@ def compare_questions_to_chat( def analyze_ai_influence( email_text: str, ai_suggestions: list[dict], - model: str = 'gpt-4o' + model: str = 'gpt-5.4-mini' ) -> dict: """ Analyze how much of the AI suggestions made it into the final email. @@ -355,7 +355,7 @@ def analyze_ai_influence( def run_full_analysis( participant_data: dict, - model: str = 'gpt-4o', + model: str = 'gpt-5.4-mini', cache=None ) -> dict: """ From 59440b213795781d5fcc743f49d1f65577142b75 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 18:28:33 +0000 Subject: [PATCH 2/2] Set reasoning_effort=low for gpt-5.4-mini across call sites Without this, the reasoning model defaults to a higher reasoning budget, significantly increasing latency and cost. The dummy request in warmup_nlp() (backend/nlp.py) is not changed by this patch. 
Also adds missing maxOutputTokens cap to writing-support route, which had no output limit. https://claude.ai/code/session_01D4Tdbmy3kzxCzce3cQpFya --- backend/nlp.py | 2 +- experiment/app/api/chat/route.ts | 2 +- experiment/app/api/writing-support/route.ts | 3 ++- experiment/lib/eval/colleagueEval.ts | 2 +- experiment/scripts/evalColleague.ts | 2 +- scripts/llm_analysis.py | 5 +++++ 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/nlp.py b/backend/nlp.py index f7ea47ed..06261cf2 100644 --- a/backend/nlp.py +++ b/backend/nlp.py @@ -16,7 +16,7 @@ MODEL_PARAMS = { "model": "gpt-5.4-mini", - # "reasoning_effort": "minimal", + "reasoning_effort": "low", # "text_verbosity": "medium" } DEBUG_PROMPTS = False diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts index d14ad9fb..a08792ba 100644 --- a/experiment/app/api/chat/route.ts +++ b/experiment/app/api/chat/route.ts @@ -9,7 +9,7 @@ export async function POST(req: Request) { const scenario = getScenario(scenarioId); const result = streamText({ - model: openai('gpt-5.4-mini'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), system: scenario.chat.systemPrompt, messages: convertToModelMessages(messages), maxOutputTokens: 300, diff --git a/experiment/app/api/writing-support/route.ts b/experiment/app/api/writing-support/route.ts index 779f9be3..8a0324e0 100644 --- a/experiment/app/api/writing-support/route.ts +++ b/experiment/app/api/writing-support/route.ts @@ -142,10 +142,11 @@ export async function POST(req: Request) { } const result = await generateObject({ - model: openai('gpt-5.4-mini'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), schema: listResponseSchema, prompt: fullPrompt, system: 'You are a helpful and insightful writing assistant.', + maxOutputTokens: 500, }); const suggestions = result.object.responses.length > 0 diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts index cde20b62..32ab97ba 100644 
--- a/experiment/lib/eval/colleagueEval.ts +++ b/experiment/lib/eval/colleagueEval.ts @@ -62,7 +62,7 @@ Think step by step, then answer with ONLY a JSON object (no markdown, no code bl {"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`; const result = await generateText({ - model: openai(model), + model: openai(model, { reasoningEffort: 'low' }), prompt: judgePrompt, maxOutputTokens: 150, }); diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts index 2244a5c9..fec78192 100644 --- a/experiment/scripts/evalColleague.ts +++ b/experiment/scripts/evalColleague.ts @@ -41,7 +41,7 @@ async function callColleague( ]; const result = await generateText({ - model: openai('gpt-5.4-mini'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), system: systemPrompt, messages, maxOutputTokens: 300, diff --git a/scripts/llm_analysis.py b/scripts/llm_analysis.py index 0af94630..f4187224 100644 --- a/scripts/llm_analysis.py +++ b/scripts/llm_analysis.py @@ -109,6 +109,7 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5 model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) @@ -169,6 +170,7 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = ' model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) @@ -215,6 +217,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) result = json.loads(response.choices[0].message.content) @@ -270,6 +273,7 @@ def compare_questions_to_chat( model=model, messages=[{'role': 'user', 'content': prompt}], 
response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) @@ -345,6 +349,7 @@ def analyze_ai_influence( model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) result = json.loads(response.choices[0].message.content)