diff --git a/backend/nlp.py b/backend/nlp.py index b93db48f..c6460fa5 100644 --- a/backend/nlp.py +++ b/backend/nlp.py @@ -15,9 +15,8 @@ from posthog_client import posthog_client as ph_client MODEL_PARAMS = { - "model": "gpt-4o", - # "model": "gpt-5-mini", - # "reasoning_effort": "minimal", + "model": "gpt-5.4-mini", + "reasoning_effort": "low", # "text_verbosity": "medium" } DEBUG_PROMPTS = False @@ -43,8 +42,7 @@ async def warmup_nlp(): # make a dummy request to make sure everything is imported try: await dummy_client.chat.completions.create( - model="gpt-4o", - # model="gpt-5-mini", + model="gpt-5.4-mini", # reasoning_effort="minimal", messages=[{"role": "user", "content": "Hello"}], ) diff --git a/experiment/app/api/chat/route.ts b/experiment/app/api/chat/route.ts index 0544bc61..a08792ba 100644 --- a/experiment/app/api/chat/route.ts +++ b/experiment/app/api/chat/route.ts @@ -9,7 +9,7 @@ export async function POST(req: Request) { const scenario = getScenario(scenarioId); const result = streamText({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), system: scenario.chat.systemPrompt, messages: convertToModelMessages(messages), maxOutputTokens: 300, diff --git a/experiment/app/api/writing-support/route.ts b/experiment/app/api/writing-support/route.ts index e12d3db9..8a0324e0 100644 --- a/experiment/app/api/writing-support/route.ts +++ b/experiment/app/api/writing-support/route.ts @@ -142,10 +142,11 @@ export async function POST(req: Request) { } const result = await generateObject({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), schema: listResponseSchema, prompt: fullPrompt, system: 'You are a helpful and insightful writing assistant.', + maxOutputTokens: 500, }); const suggestions = result.object.responses.length > 0 diff --git a/experiment/lib/eval/colleagueEval.ts b/experiment/lib/eval/colleagueEval.ts index 07509e61..32ab97ba 100644 --- a/experiment/lib/eval/colleagueEval.ts +++ b/experiment/lib/eval/colleagueEval.ts @@ -49,7 +49,7 @@ export async function evalColleagueResponse( testInput: string, colleagueResponse: string, criterion: EvalCriterion, - model: string = 'gpt-4o-mini' + model: string = 'gpt-5.4-mini' ): Promise { const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario. @@ -62,7 +62,7 @@ Think step by step, then answer with ONLY a JSON object (no markdown, no code bl {"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`; const result = await generateText({ - model: openai(model), + model: openai(model, { reasoningEffort: 'low' }), prompt: judgePrompt, maxOutputTokens: 150, }); diff --git a/experiment/scripts/evalColleague.ts b/experiment/scripts/evalColleague.ts index f3f59e61..fec78192 100644 --- a/experiment/scripts/evalColleague.ts +++ b/experiment/scripts/evalColleague.ts @@ -41,7 +41,7 @@ async function callColleague( ]; const result = await generateText({ - model: openai('gpt-5.2'), + model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }), system: systemPrompt, messages, maxOutputTokens: 300, diff --git a/scripts/llm_analysis.py b/scripts/llm_analysis.py index e491a783..f4187224 100644 --- a/scripts/llm_analysis.py +++ b/scripts/llm_analysis.py @@ -46,7 +46,7 @@ def get_scenario_context(scenario_id: str) -> dict: return SCENARIOS.get(scenario_id, SCENARIOS['roomDoubleBooking']) -def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict: +def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict: """ Analyze email quality on multiple dimensions. @@ -109,12 +109,13 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4 model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) -def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict: +def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict: """ Analyze how well the email addresses the recipient's likely emotions. @@ -169,12 +170,13 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = ' model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) -def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> list[str]: +def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> list[str]: """ Generate a list of factual questions a careful reader would want to verify. @@ -215,6 +217,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) result = json.loads(response.choices[0].message.content) @@ -224,7 +227,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g def compare_questions_to_chat( questions: list[str], chat_messages: list[dict], - model: str = 'gpt-4o' + model: str = 'gpt-5.4-mini' ) -> dict: """ Compare the factual questions that should be asked against @@ -270,6 +273,7 @@ def compare_questions_to_chat( model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) return json.loads(response.choices[0].message.content) @@ -278,7 +282,7 @@ def compare_questions_to_chat( def analyze_ai_influence( email_text: str, ai_suggestions: list[dict], - model: str = 'gpt-4o' + model: str = 'gpt-5.4-mini' ) -> dict: """ Analyze how much of the AI suggestions made it into the final email. @@ -345,6 +349,7 @@ def analyze_ai_influence( model=model, messages=[{'role': 'user', 'content': prompt}], response_format={'type': 'json_object'}, + reasoning_effort='low', ) result = json.loads(response.choices[0].message.content) @@ -355,7 +360,7 @@ def analyze_ai_influence( def run_full_analysis( participant_data: dict, - model: str = 'gpt-4o', + model: str = 'gpt-5.4-mini', cache=None ) -> dict: """