Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions backend/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
from posthog_client import posthog_client as ph_client

MODEL_PARAMS = {
"model": "gpt-4o",
# "model": "gpt-5-mini",
# "reasoning_effort": "minimal",
"model": "gpt-5.4-mini",
"reasoning_effort": "low",
# "text_verbosity": "medium"
}
DEBUG_PROMPTS = False
Expand All @@ -43,8 +42,7 @@ async def warmup_nlp():
# make a dummy request to make sure everything is imported
try:
await dummy_client.chat.completions.create(
model="gpt-4o",
# model="gpt-5-mini",
model="gpt-5.4-mini",
# reasoning_effort="minimal",
messages=[{"role": "user", "content": "Hello"}],
)
Expand Down
2 changes: 1 addition & 1 deletion experiment/app/api/chat/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export async function POST(req: Request) {
const scenario = getScenario(scenarioId);

const result = streamText({
model: openai('gpt-5.2'),
model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
system: scenario.chat.systemPrompt,
messages: convertToModelMessages(messages),
maxOutputTokens: 300,
Expand Down
3 changes: 2 additions & 1 deletion experiment/app/api/writing-support/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,11 @@ export async function POST(req: Request) {
}

const result = await generateObject({
model: openai('gpt-5.2'),
model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
schema: listResponseSchema,
prompt: fullPrompt,
system: 'You are a helpful and insightful writing assistant.',
maxOutputTokens: 500,
});

const suggestions = result.object.responses.length > 0
Expand Down
4 changes: 2 additions & 2 deletions experiment/lib/eval/colleagueEval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ export async function evalColleagueResponse(
testInput: string,
colleagueResponse: string,
criterion: EvalCriterion,
model: string = 'gpt-4o-mini'
model: string = 'gpt-5.4-mini'
): Promise<EvalResult> {
const judgePrompt = `You are evaluating an AI colleague's response in a workplace chat scenario.

Expand All @@ -62,7 +62,7 @@ Think step by step, then answer with ONLY a JSON object (no markdown, no code bl
{"answer": "yes" or "no", "reasoning": "brief 1-sentence explanation"}`;

const result = await generateText({
model: openai(model),
model: openai(model, { reasoningEffort: 'low' }),
prompt: judgePrompt,
maxOutputTokens: 150,
});
Expand Down
2 changes: 1 addition & 1 deletion experiment/scripts/evalColleague.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async function callColleague(
];

const result = await generateText({
model: openai('gpt-5.2'),
model: openai('gpt-5.4-mini', { reasoningEffort: 'low' }),
system: systemPrompt,
messages,
maxOutputTokens: 300,
Expand Down
17 changes: 11 additions & 6 deletions scripts/llm_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_scenario_context(scenario_id: str) -> dict:
return SCENARIOS.get(scenario_id, SCENARIOS['roomDoubleBooking'])


def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict:
def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict:
"""
Analyze email quality on multiple dimensions.

Expand Down Expand Up @@ -109,12 +109,13 @@ def analyze_email_quality(email_text: str, scenario_id: str, model: str = 'gpt-4
model=model,
messages=[{'role': 'user', 'content': prompt}],
response_format={'type': 'json_object'},
reasoning_effort='low',
)

return json.loads(response.choices[0].message.content)


def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> dict:
def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> dict:
"""
Analyze how well the email addresses the recipient's likely emotions.

Expand Down Expand Up @@ -169,12 +170,13 @@ def analyze_recipient_feelings(email_text: str, scenario_id: str, model: str = '
model=model,
messages=[{'role': 'user', 'content': prompt}],
response_format={'type': 'json_object'},
reasoning_effort='low',
)

return json.loads(response.choices[0].message.content)


def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-4o') -> list[str]:
def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'gpt-5.4-mini') -> list[str]:
"""
Generate a list of factual questions a careful reader would want to verify.

Expand Down Expand Up @@ -215,6 +217,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g
model=model,
messages=[{'role': 'user', 'content': prompt}],
response_format={'type': 'json_object'},
reasoning_effort='low',
)

result = json.loads(response.choices[0].message.content)
Expand All @@ -224,7 +227,7 @@ def extract_factual_questions(email_text: str, scenario_id: str, model: str = 'g
def compare_questions_to_chat(
questions: list[str],
chat_messages: list[dict],
model: str = 'gpt-4o'
model: str = 'gpt-5.4-mini'
) -> dict:
"""
Compare the factual questions that should be asked against
Expand Down Expand Up @@ -270,6 +273,7 @@ def compare_questions_to_chat(
model=model,
messages=[{'role': 'user', 'content': prompt}],
response_format={'type': 'json_object'},
reasoning_effort='low',
)

return json.loads(response.choices[0].message.content)
Expand All @@ -278,7 +282,7 @@ def compare_questions_to_chat(
def analyze_ai_influence(
email_text: str,
ai_suggestions: list[dict],
model: str = 'gpt-4o'
model: str = 'gpt-5.4-mini'
) -> dict:
"""
Analyze how much of the AI suggestions made it into the final email.
Expand Down Expand Up @@ -345,6 +349,7 @@ def analyze_ai_influence(
model=model,
messages=[{'role': 'user', 'content': prompt}],
response_format={'type': 'json_object'},
reasoning_effort='low',
)

result = json.loads(response.choices[0].message.content)
Expand All @@ -355,7 +360,7 @@ def analyze_ai_influence(

def run_full_analysis(
participant_data: dict,
model: str = 'gpt-4o',
model: str = 'gpt-5.4-mini',
cache=None
) -> dict:
"""
Expand Down