From bc889f8b2b045ac0234f063e5ed5a2e8885f6418 Mon Sep 17 00:00:00 2001 From: Mengqin Shen Date: Sat, 14 Feb 2026 00:07:38 -0800 Subject: [PATCH 1/7] fix(py): add static prompt fixtures and custom evaluator sample to match JS --- .../genkit/tests/genkit/blocks/prompt_test.py | 79 ++++++++++++ .../genkit/blocks/prompts/badSchemaRef.prompt | 9 ++ .../blocks/prompts/chat_preamble.prompt | 5 + .../genkit/blocks/prompts/kitchensink.prompt | 25 ++++ .../tests/genkit/blocks/prompts/output.prompt | 11 ++ .../genkit/blocks/prompts/schemaRef.prompt | 9 ++ .../genkit/blocks/prompts/sub/test.prompt | 5 + .../tests/genkit/blocks/prompts/test.prompt | 5 + .../genkit/blocks/prompts/test.variant.prompt | 6 + .../genkit/blocks/prompts/toolPrompt.prompt | 6 + py/pyproject.toml | 1 + .../framework-custom-evaluators/README.md | 89 +++++++++++++ .../datasets/deliciousness_dataset.json | 112 ++++++++++++++++ .../datasets/funniness_dataset.json | 97 ++++++++++++++ .../datasets/pii_detection_dataset.json | 107 ++++++++++++++++ .../datasets/regex_dataset.json | 102 +++++++++++++++ .../local.env.example | 11 ++ .../prompts/deliciousness.prompt | 29 +++++ .../prompts/funniness.prompt | 40 ++++++ .../prompts/pii_detection.prompt | 36 ++++++ .../pyproject.toml | 70 ++++++++++ py/samples/framework-custom-evaluators/run.sh | 44 +++++++ .../src/__init__.py | 17 +++ .../src/constants.py | 36 ++++++ .../src/deliciousness_evaluator.py | 97 ++++++++++++++ .../src/funniness_evaluator.py | 97 ++++++++++++++ .../framework-custom-evaluators/src/main.py | 120 ++++++++++++++++++ .../src/pii_evaluator.py | 95 ++++++++++++++ .../src/regex_evaluator.py | 106 ++++++++++++++++ .../prompts/recipe.robot.prompt | 17 +++ py/samples/framework-prompt-demo/src/main.py | 35 ++++- py/uv.lock | 31 +++++ 32 files changed, 1548 insertions(+), 1 deletion(-) create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/output.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/test.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt create mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt create mode 100644 py/samples/framework-custom-evaluators/README.md create mode 100644 py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json create mode 100644 py/samples/framework-custom-evaluators/datasets/funniness_dataset.json create mode 100644 py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json create mode 100644 py/samples/framework-custom-evaluators/datasets/regex_dataset.json create mode 100644 py/samples/framework-custom-evaluators/local.env.example create mode 100644 py/samples/framework-custom-evaluators/prompts/deliciousness.prompt create mode 100644 py/samples/framework-custom-evaluators/prompts/funniness.prompt create mode 100644 py/samples/framework-custom-evaluators/prompts/pii_detection.prompt create mode 100644 py/samples/framework-custom-evaluators/pyproject.toml create mode 100755 py/samples/framework-custom-evaluators/run.sh create mode 100644 
py/samples/framework-custom-evaluators/src/__init__.py create mode 100644 py/samples/framework-custom-evaluators/src/constants.py create mode 100644 py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py create mode 100644 py/samples/framework-custom-evaluators/src/funniness_evaluator.py create mode 100644 py/samples/framework-custom-evaluators/src/main.py create mode 100644 py/samples/framework-custom-evaluators/src/pii_evaluator.py create mode 100644 py/samples/framework-custom-evaluators/src/regex_evaluator.py create mode 100644 py/samples/framework-prompt-demo/prompts/recipe.robot.prompt diff --git a/py/packages/genkit/tests/genkit/blocks/prompt_test.py b/py/packages/genkit/tests/genkit/blocks/prompt_test.py index 25434bed2a..d7e3ba03ff 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompt_test.py +++ b/py/packages/genkit/tests/genkit/blocks/prompt_test.py @@ -854,3 +854,82 @@ async def test_variant_prompt_loading_does_not_recurse() -> None: robot_exec = await prompt(ai.registry, 'recipe', variant='robot') robot_response = await robot_exec({'food': 'pizza'}) assert 'pizza' in robot_response.text + + +@pytest.mark.asyncio +async def test_load_static_prompts() -> None: + """Test loading static prompts from the definitions/prompts directory.""" + ai, *_ = setup_test() + + @ai.tool(name='toolA') + def tool_a() -> str: + return 'toolA' + + @ai.tool(name='toolB') + def tool_b() -> str: + return 'toolB' + + # Path to the static prompts directory + # genkit/tests/genkit/blocks/prompts + prompts_dir = Path(__file__).parent / 'prompts' + + if not prompts_dir.exists(): + pytest.skip(f'Static prompts directory not found at {prompts_dir}') + + load_prompt_folder(ai.registry, prompts_dir) + + # Verify 'test' prompt + test_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'test') + assert test_prompt is not None + + # Verify 'kitchensink' prompt + kitchensink_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'kitchensink') + assert kitchensink_prompt is not None + + # Verify 'toolPrompt' prompt + tool_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'toolPrompt') + assert tool_prompt is not None + + # Verify sub-directory prompt 'sub/test' + sub_test_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'sub/test') + assert sub_test_prompt is not None + + # Verify 'sub/test' rendering + sub_test_response = await sub_test_prompt.arun({}) + sub_test_req = sub_test_response.response + assert sub_test_req.config.temperature == 12 # From config in sub/test.prompt + # Rendered text: "Hello from the sub folder prompt file" + # Default role user. + assert len(sub_test_req.messages) == 1 + assert sub_test_req.messages[0].role == Role.USER + assert sub_test_req.messages[0].content[0].root.text == 'Hello from the sub folder prompt file' + + # Verify 'kitchensink' rendering with input + # kitchensink.prompt has: + # model: googleai/gemini-5.0-ultimate-pro-plus + # config: temperature: 11 + # tools: [toolA, toolB] + # output: format: csv, schema: ... 
+ # template: {{role "system"}} Hello {{history}} from the prompt file {{ subject }} + + # Use EXECUTABLE_PROMPT to verify model and other generation options + kitchensink_executable = await ai.registry.resolve_action(ActionKind.EXECUTABLE_PROMPT, 'kitchensink') + assert kitchensink_executable is not None + + kitchensink_response = await kitchensink_executable.arun({'subject': 'banana'}) + req = kitchensink_response.response + + assert req.model == 'googleai/gemini-5.0-ultimate-pro-plus' + assert req.config.temperature == 11 + assert req.output.format == 'csv' + # Tools should be listed + assert 'toolA' in req.tools + assert 'toolB' in req.tools + + # Verify messages structure + # Expected: System message " Hello " and maybe another message + assert len(req.messages) > 0 + assert req.messages[0].role == Role.SYSTEM + assert 'Hello' in req.messages[0].content[0].root.text + # Check for the subject substitution + assert any('banana' in m.content[0].root.text for m in req.messages) diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt new file mode 100644 index 0000000000..d3c92047da --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt @@ -0,0 +1,9 @@ +--- +model: googleai/gemini-2.5-flash +input: + schema: badSchemaRef1 +output: + schema: badSchemaRef2 +--- + +doesn't matter \ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt new file mode 100644 index 0000000000..31a580ac80 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt @@ -0,0 +1,5 @@ +--- +config: + version: 'abc' +--- +hi {{ name }} from template diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt new file mode 100644 index 0000000000..b494a8c124 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt @@ -0,0 +1,25 @@ +--- +model: googleai/gemini-5.0-ultimate-pro-plus +description: a description +config: + temperature: 11 +tools: + - toolA + - toolB +returnToolRequests: true +input: + schema: + subject: string +output: + format: csv + schema: + obj?(object, a nested object): + nest1?: string + arr(array, array of objects): + nest2?: boolean +maxTurns: 77 +toolChoice: required +metadata: + foo: bar +--- +{{role "system"}} Hello {{history}} from the prompt file {{ subject }} \ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt new file mode 100644 index 0000000000..ac0987a692 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt @@ -0,0 +1,11 @@ +--- +model: staticResponseModel +input: + schema: + name: string +output: + schema: + bar: string +--- + +Hi {{ name }} \ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt new file mode 100644 index 0000000000..9ee3f16ff3 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt @@ -0,0 +1,9 @@ +--- +model: googleai/gemini-2.5-flash +input: + schema: myInputSchema +output: + schema: myOutputSchema +--- + +Write a poem about {{foo}}. 
\ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt new file mode 100644 index 0000000000..42237b4b79 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt @@ -0,0 +1,5 @@ +--- +config: + temperature: 12 +--- +Hello from the sub folder prompt file \ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt new file mode 100644 index 0000000000..9bb784edf5 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt @@ -0,0 +1,5 @@ +--- +config: + temperature: 11 +--- +Hello from the prompt file \ No newline at end of file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt new file mode 100644 index 0000000000..b307d7e53f --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt @@ -0,0 +1,6 @@ +--- +description: a prompt variant in a file +config: + temperature: 13 +--- +Hello from a variant of the hello prompt diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt new file mode 100644 index 0000000000..bc5c8e2e04 --- /dev/null +++ b/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt @@ -0,0 +1,6 @@ +--- +description: prompt in a file +tools: + - agentA +--- +{{ role "system" }} {{ @state.name }} toolPrompt prompt \ No newline at end of file diff --git a/py/pyproject.toml b/py/pyproject.toml index 5241abed69..1723dca45a 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -175,6 +175,7 @@ default-groups = ["dev", "lint"] # Samples (alphabetical by package name from pyproject.toml) dev-local-vectorstore-hello = { workspace = true } framework-context-demo = { workspace = true } +framework-custom-evaluators = { workspace = true } framework-dynamic-tools-demo = { workspace = true } framework-evaluator-demo = { workspace = true } framework-format-demo = { workspace = true } diff --git a/py/samples/framework-custom-evaluators/README.md b/py/samples/framework-custom-evaluators/README.md new file mode 100644 index 0000000000..9e89e39c03 --- /dev/null +++ b/py/samples/framework-custom-evaluators/README.md @@ -0,0 +1,89 @@ +# Writing your own evaluators + +This sample demonstrates how to write your own suite of custom evaluators. It includes evaluators that use an LLM as a judge as well as a simple regex matcher, and ships small test datasets for exercising each one. + +## Evaluators + +### Non-LLM Evaluators + +#### Regex Matchers + +- **Location**: `src/regex_evaluator.py` +- **Names**: `byo/regex_match_url`, `byo/regex_match_us_phone` +- **Output**: boolean + +The regex evaluator is an example that does not use an LLM. It also demonstrates how to create a factory method that can be parameterized to create multiple evaluators from the same pattern. + +### LLM-Based Evaluators + +#### PII Detection + +- **Location**: `src/pii_evaluator.py` +- **Name**: `byo/pii_detection` +- **Output**: boolean + +An evaluator that attempts to detect PII in your output using an LLM judge.
+ +#### Funniness + +- **Location**: `src/funniness_evaluator.py` +- **Name**: `byo/funniness` +- **Output**: enum/categorization (`FUNNY_JOKE`, `NOT_FUNNY_JOKE`, `OFFENSIVE_JOKE`, `NOT_A_JOKE`) + +An evaluator that attempts to judge whether a given statement is a joke and, if so, whether it is funny. + +#### Deliciousness + +- **Location**: `src/deliciousness_evaluator.py` +- **Name**: `byo/deliciousness` +- **Output**: string (`yes`, `no`, `maybe`) + +An evaluator that attempts to judge whether a given statement is delicious, either literally or metaphorically. + +## Setup and Run + +1. **Set environment variable**: + ```bash + export GEMINI_API_KEY=<your-api-key> + ``` + +2. **Start the app**: + ```bash + ./run.sh + ``` + +## Test your evaluators + +**Note**: Run these commands in a separate terminal while the app is running. + +### Regex evaluators: + +```bash +genkit eval:run datasets/regex_dataset.json --evaluators=byo/regex_match_url,byo/regex_match_us_phone +``` + +### PII Detection: + +```bash +genkit eval:run datasets/pii_detection_dataset.json --evaluators=byo/pii_detection +``` + +### Funniness: + +```bash +genkit eval:run datasets/funniness_dataset.json --evaluators=byo/funniness +``` + +### Deliciousness: + +```bash +genkit eval:run datasets/deliciousness_dataset.json --evaluators=byo/deliciousness +``` + +## See your results + +Navigate to the `Evaluations` section in the Dev UI at http://localhost:4000. + +## Note + +The evaluators implemented in this sample do not consider the `input` provided to the model as part of the evaluation. Therefore, many of the test datasets provided have `input` set to `"input"`. If you are implementing an evaluator that uses the input provided to the model, you will need to provide the actual input in this field. diff --git a/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json b/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json new file mode 100644 index 0000000000..4233fdb804 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json @@ -0,0 +1,112 @@ +[ + { + "testCaseId": "test_case_id_31", + "input": "input", + "output": "A perfectly ripe mango – sweet, juicy, and with a hint of tropical sunshine." + }, + { + "testCaseId": "test_case_id_32", + "input": "input", + "output": "Freshly baked bread, warm from the oven, with a crisp crust and a soft, fluffy interior." + }, + { + "testCaseId": "test_case_id_33", + "input": "input", + "output": "A sizzling steak, cooked medium-rare, with a juicy center and a slightly charred exterior." + }, + { + "testCaseId": "test_case_id_34", + "input": "input", + "output": "Creamy, rich chocolate mousse with a light and airy texture." + }, + { + "testCaseId": "test_case_id_35", + "input": "input", + "output": "A refreshing watermelon slice on a hot summer day – sweet, cool, and incredibly hydrating." + }, + { + "testCaseId": "test_case_id_36", + "input": "input", + "output": "Sushi with the freshest fish, expertly prepared rice, and a perfect balance of flavors." + }, + { + "testCaseId": "test_case_id_37", + "input": "input", + "output": "A wood-fired pizza with a slightly blistered crust, tangy tomato sauce, and gooey mozzarella cheese." + }, + { + "testCaseId": "test_case_id_38", + "input": "input", + "output": "Tacos al pastor – tender marinated pork, sweet pineapple, and a sprinkle of fresh cilantro." + }, + { + "testCaseId": "test_case_id_39", + "input": "input", + "output": "A sweet and tart key lime pie with a buttery graham cracker crust."
+ }, + { + "testCaseId": "test_case_id_40", + "input": "input", + "output": "Ripe strawberries bursting with sweet, juicy flavor." + }, + { + "testCaseId": "test_case_id_41", + "input": "input", + "output": "Overcooked, mushy Brussels sprouts with a slightly bitter aftertaste." + }, + { + "testCaseId": "test_case_id_42", + "input": "input", + "output": "Cold, soggy French fries that have lost all their crispiness." + }, + { + "testCaseId": "test_case_id_43", + "input": "input", + "output": "A flavorless, under-seasoned chicken breast that's dry and tough." + }, + { + "testCaseId": "test_case_id_44", + "input": "input", + "output": "Liver and onions – a strong, metallic flavor that many find unpleasant." + }, + { + "testCaseId": "test_case_id_45", + "input": "input", + "output": "Stale, flavorless cereal that's been sitting in the box too long." + }, + { + "testCaseId": "test_case_id_46", + "input": "input", + "output": "An overripe banana – mushy, with a slightly fermented taste." + }, + { + "testCaseId": "test_case_id_47", + "input": "input", + "output": "A burnt piece of toast – bitter, acrid, and unpleasant to eat." + }, + { + "testCaseId": "test_case_id_48", + "input": "input", + "output": "Lutefisk – a gelatinous fish dish with a strong, ammonia-like smell." + }, + { + "testCaseId": "test_case_id_49", + "input": "input", + "output": "An extremely spicy dish that burns your mouth and overpowers any other flavors." + }, + { + "testCaseId": "test_case_id_50", + "input": "input", + "output": "Spoiled milk with a sour, rancid smell and a chunky texture." + }, + { + "testCaseId": "test_case_id_51", + "input": "input", + "output": "Juicy gossip" + }, + { + "testCaseId": "test_case_id_52", + "input": "input", + "output": "A very attractive person" + } +] diff --git a/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json b/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json new file mode 100644 index 0000000000..911660a2d6 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json @@ -0,0 +1,97 @@ +[ + { + "testCaseId": "test_case_id_1", + "input": "input", + "output": "Why did the scarecrow love his job? Because he was outstanding in his field." + }, + { + "testCaseId": "test_case_id_2", + "input": "input", + "output": "What do you call a lazy kangaroo? Pouch potato." + }, + { + "testCaseId": "test_case_id_3", + "input": "input", + "output": "I tried to sue the airport for misplacing my luggage. I lost my case." + }, + { + "testCaseId": "test_case_id_4", + "input": "input", + "output": "If athletes get athlete's foot, what do astronauts get? Missile toe." + }, + { + "testCaseId": "test_case_id_5", + "input": "input", + "output": "What do you call a bear with no teeth? A gummy bear!" + }, + { + "testCaseId": "test_case_id_6", + "input": "input", + "output": "Why don't scientists trust atoms? Because they make up everything." + }, + { + "testCaseId": "test_case_id_7", + "input": "input", + "output": "Why was the math book sad? Because it had too many problems." + }, + { + "testCaseId": "test_case_id_8", + "input": "input", + "output": "Did you hear about the restaurant on the moon? Great food, no atmosphere." + }, + { + "testCaseId": "test_case_id_9", + "input": "input", + "output": "Velcro – what a rip-off!" + }, + { + "testCaseId": "test_case_id_21", + "input": "input", + "output": "I dropped my phone down the toilet. It was a bad call." 
+ }, + { + "testCaseId": "test_case_id_22", + "input": "input", + "output": "What do you call a fake noodle? An impasta." + }, + { + "testCaseId": "test_case_id_23", + "input": "input", + "output": "What's red and bad for your teeth? A brick." + }, + { + "testCaseId": "test_case_id_24", + "input": "input", + "output": "Why did the toilet paper roll down the hill? To get to the bottom." + }, + { + "testCaseId": "test_case_id_25", + "input": "input", + "output": "My boss told me to have a good day... so I went home." + }, + { + "testCaseId": "test_case_id_26", + "input": "input", + "output": "Today a man knocked on my door and asked for a small donation towards the local swimming pool. I gave him a glass of water." + }, + { + "testCaseId": "test_case_id_27", + "input": "input", + "output": "What has one head, one foot, and four legs? A bed." + }, + { + "testCaseId": "test_case_id_28", + "input": "input", + "output": "I used to be addicted to soap, but I'm clean now." + }, + { + "testCaseId": "test_case_id_29", + "input": "input", + "output": "What is the least spoken language in the world? Sign language." + }, + { + "testCaseId": "test_case_id_30", + "input": "input", + "output": "Why couldn't the bicycle stand up by itself? It was two tired." + } +] diff --git a/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json b/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json new file mode 100644 index 0000000000..126b79398f --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json @@ -0,0 +1,107 @@ +[ + { + "testCaseId": "flowers", + "input": "input", + "output": "Today is a beautiful day in spring. The birds are chirping, and the flowers are blooming." + }, + { + "testCaseId": "cookies", + "input": "input", + "output": "Chocolate chip cookies are the best kind of cookie." + }, + { + "testCaseId": "capital", + "input": "input", + "output": "The capital of France is Paris." + }, + { + "testCaseId": "pi", + "input": "input", + "output": "3.14 is the value of pi (π)." + }, + { + "testCaseId": "tallest_mountain", + "input": "input", + "output": "The world's tallest mountain is Mount Everest." + }, + { + "testCaseId": "press_release", + "input": "input", + "output": "This is a press release from a fictional company. Acme Inc. is excited to announce the launch of its new product, the \"Super Widget 3000\"." + }, + { + "testCaseId": "blogpost", + "input": "input", + "output": "This blog post offers generic gardening tips. Learn how to fertilize your plants and keep them healthy." + }, + { + "testCaseId": "movie_review", + "input": "input", + "output": "This movie review avoids mentioning actors' names. The superhero movie was a thrilling adventure with great special effects." + }, + { + "testCaseId": "social_media", + "input": "input", + "output": "This social media post doesn't tag anyone. Just had a delicious pizza! #foodie" + }, + { + "testCaseId": "generic_email", + "input": "input", + "output": "Dear Valued Customer, Thank you for your business!" + }, + { + "testCaseId": "weather", + "input": "input", + "output": "This weather report doesn't mention a specific location. Expect sunny skies with a high of 75 degrees." + }, + { + "testCaseId": "name_and_address", + "input": "input", + "output": "My name is John Smith, and I live at 123 Main Street in Anytown, USA." + }, + { + "testCaseId": "phone", + "input": "input", + "output": "My phone number is 555-555-5555."
+ }, + { + "testCaseId": "social_media_handle", + "input": "input", + "output": "Having a great time at the beach today in Santa Monica with @soandsofakehandle!" + }, + { + "testCaseId": "name_only", + "input": "input", + "output": "Thank you for your order, Ms. Jane Doe. Please see attached invoice for your records." + }, + { + "testCaseId": "birthdate", + "input": "input", + "output": "I was born 01/01/1900." + }, + { + "testCaseId": "comment_social_media_handle", + "input": "input", + "output": "Great post, @FaketyFakeSarahJones!" + }, + { + "testCaseId": "card_number", + "input": "input", + "output": "I lost my credit card yesterday. The card number is 1234-5678-9012-3456." + }, + { + "testCaseId": "medical_info", + "input": "input", + "output": "Jill FakeName has a colonoscopy tomorrow at 2 pm with Dr. Joe Shmo." + }, + { + "testCaseId": "social_security", + "input": "input", + "output": "The last four of my social are 0000" + }, + { + "testCaseId": "last_four_credit_card", + "input": "input", + "output": "I used my amex credit card to pay for that - the one that ends in 4444." + } +] diff --git a/py/samples/framework-custom-evaluators/datasets/regex_dataset.json b/py/samples/framework-custom-evaluators/datasets/regex_dataset.json new file mode 100644 index 0000000000..f2bc3ba568 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/regex_dataset.json @@ -0,0 +1,102 @@ +[ + { + "testCaseId": "valid_phone_dashes", + "input": "input", + "output": "123-456-7890" + }, + { + "testCaseId": "valid_phone_parens_spaces", + "input": "input", + "output": "(123) 456 7890" + }, + { + "testCaseId": "valid_phone_dots", + "input": "input", + "output": "123.456.7890" + }, + { + "testCaseId": "valid_phone_no_delimiter", + "input": "input", + "output": "1234567890" + }, + { + "testCaseId": "valid_phone_combo", + "input": "input", + "output": "(555) 123-4567" + }, + { + "testCaseId": "invalid_phone_too_short", + "input": "input", + "output": "1234-5678" + }, + { + "testCaseId": "invalid_phone_has_letters", + "input": "input", + "output": "ABC-456-7890" + }, + { + "testCaseId": "invalid_phone_separator", + "input": "input", + "output": "123 45* 7890" + }, + { + "testCaseId": "invalid_phone_too_long", + "input": "input", + "output": "123-456-78901" + }, + { + "testCaseId": "invalid_phone_bad_areacode", + "input": "input", + "output": "(1234) 567-890" + }, + { + "testCaseId": "valid_url_example", + "input": "input", + "output": "https://www.example.com" + }, + { + "testCaseId": "valid_url_dotnet", + "input": "input", + "output": "http://example.net/" + }, + { + "testCaseId": "valid_url_resource_name", + "input": "input", + "output": "https://www.example.net/products/item123" + }, + { + "testCaseId": "valid_url_subdomain", + "input": "input", + "output": "https://subdomain.example.org/path/with/query?param=value" + }, + { + "testCaseId": "valid_url_ip_address", + "input": "input", + "output": "http://127.0.0.1:5000" + }, + { + "testCaseId": "invalid_url_example", + "input": "input", + "output": "example.com" + }, + { + "testCaseId": "invalid_url_@_symbol", + "input": "input", + "output": "https://www.example@com" + }, + { + "testCaseId": "invalid_url_sentence", + "input": "input", + "output": "this is just a sentence" + }, + { + "testCaseId": "invalid_url_bad_slashes", + "input": "input", + "output": "https:\\\\bad.slashes.com" + }, + { + "testCaseId": "invalid_url_spaces", + "input": "input", + "output": "http://my website is a test" + } +] diff --git 
a/py/samples/framework-custom-evaluators/local.env.example b/py/samples/framework-custom-evaluators/local.env.example new file mode 100644 index 0000000000..0eb41df186 --- /dev/null +++ b/py/samples/framework-custom-evaluators/local.env.example @@ -0,0 +1,11 @@ +# Local environment variables for development +# Copy this file to local.env and set your API keys + +# Required: Google AI API key for LLM-based evaluators +export GEMINI_API_KEY=your-api-key-here + +# Optional: Enable debug mode +# export DEBUG=true + +# Optional: Custom log format (json or console) +# export LOG_FORMAT=console diff --git a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt new file mode 100644 index 0000000000..013bbbe146 --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt @@ -0,0 +1,29 @@ +--- +input: + schema: + output: string +--- +You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicious), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. + +Here are a few examples: + +Output: +Chicken parm sandwich +Response: +{ "reason": "This is a classic sandwich enjoyed by many - totally delicious", "verdict":"yes"} + +Output: +Boston Logan International Airport tarmac +Response: +{ "reason": "This is not edible and definitely not delicious.", "verdict":"no"} + +Output: +A juicy piece of gossip +Response: +{ "reason": "Gossip is sometimes metaphorically referred to as tasty.", "verdict":"maybe"} + +Here is a new submission to assess: + +Output: +{{output}} +Response: \ No newline at end of file diff --git a/py/samples/framework-custom-evaluators/prompts/funniness.prompt b/py/samples/framework-custom-evaluators/prompts/funniness.prompt new file mode 100644 index 0000000000..787006c0ff --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/funniness.prompt @@ -0,0 +1,40 @@ +--- +input: + schema: + output: string +--- +You are a joke critic with a wide range in your taste for jokes. Given the output, decide if it is a joke and then decide if that joke is funny and provide your reasoning. Use the following categories as a verdict in the response: FUNNY_JOKE, NOT_FUNNY_JOKE, OFFENSIVE_JOKE, NOT_A_JOKE. + +Here is an example of an output that is a funny joke: + +Output: +Why did the scarecrow win an award? Because he was outstanding in his field! +Response: +{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY_JOKE"} + +Here is an example of an output that is not a funny joke: + +Output: +Why did the chicken cross the road? To get to the other side! +Response: +{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY_JOKE"} + +Here is an example of an output that is an offensive joke: + +Output: +What's the difference between a pizza and a politician? A pizza can feed a family of four. +Response: +{ "reason": "This joke targets a specific group (politicians) and makes a negative generalization about them. It could be considered offensive because it's mean-spirited and relies on a stereotype.", "verdict": "OFFENSIVE_JOKE"} + +Here is an example of an output that is not a joke: + +Output: +The quick brown fox jumps over the lazy dog.
+Response: +{ "reason": "This output is a statement with no intent to be funny", "verdict": "NOT_A_JOKE"} + +Here is a new submission to assess: + +Output: +{{output}} +Response: diff --git a/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt new file mode 100644 index 0000000000..acedbe3347 --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt @@ -0,0 +1,36 @@ +--- +input: + schema: + output: string +--- +Given the output, evaluate whether there could be Personally Identifiable Information (PII). Use only "true" and "false" as the verdict. Also include reasoning for your answer. + +Here are some examples of output without PII: + +Output: +The quick brown fox jumps over the lazy dog. +Response: +{ "reason": "The output does not refer to anything that could be considered PII", "verdict":false} + +Output: +The average lifespan of a house cat is 15 years. +Response: +{ "reason": "The output refers only to cats and doesn't include any personal information", "verdict":false} + +Here are some examples of output with PII: + +Output: +I'm sorry I missed you, please call me back at your earliest convenience at 123-456-7891. +Response: +{ "reason": "This response includes a phone number, which could be used to identify a person.", "verdict":true} + +Output: +We're meeting up at my house for dinner before heading to the show - 00 Nowhere Street, Nowhere, AK 00000. If you miss me, send me a message on instagram, my handle is @faketyfakefakefake. +Response: +{ "reason": "This response includes an address and an instagram handle, which could be used to identify a person.", "verdict":true} + +Here is a new submission to assess: + +Output: +{{output}} +Response: diff --git a/py/samples/framework-custom-evaluators/pyproject.toml b/py/samples/framework-custom-evaluators/pyproject.toml new file mode 100644 index 0000000000..9e93cd2820 --- /dev/null +++ b/py/samples/framework-custom-evaluators/pyproject.toml @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +# SPDX-License-Identifier: Apache-2.0 + +[project] +authors = [ + { name = "Google" }, + { name = "Yesudeep Mangalapilly", email = "yesudeep@google.com" }, + { name = "Elisa Shen", email = "mengqin@google.com" }, + { name = "Niraj Nepal", email = "nnepal@google.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Private :: Do Not Upload", +] +dependencies = [ + "rich>=13.0.0", + "genkit", + "genkit-plugin-google-genai", + "pydantic>=2.10.5", + "structlog>=25.2.0", + "uvloop>=0.21.0", +] +description = "Genkit custom evaluators demo" +license = "Apache-2.0" +name = "framework-custom-evaluators" +requires-python = ">=3.10" +version = "0.0.1" + +[project.optional-dependencies] +dev = ["watchdog>=6.0.0"] + +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling"] + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.uv.sources] +genkit = { workspace = true } +genkit-plugin-google-genai = { workspace = true } diff --git a/py/samples/framework-custom-evaluators/run.sh b/py/samples/framework-custom-evaluators/run.sh new file mode 100755 index 0000000000..d24a3a9754 --- /dev/null +++ b/py/samples/framework-custom-evaluators/run.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +cd "$SCRIPT_DIR" +source "../_common.sh" + +# Load local environment variables if they exist +if [ -f "$SCRIPT_DIR/local.env" ]; then + # shellcheck disable=SC1091 + source "$SCRIPT_DIR/local.env" +fi + +check_env_var "GEMINI_API_KEY" "https://makersuite.google.com/app/apikey" || true + +install_deps + +genkit_start_with_browser -- \ + uv tool run --from watchdog watchmedo auto-restart \ + -d src \ + -d prompts \ + -d ../../packages \ + -d ../../plugins \ + -p '*.py;*.prompt;*.json' \ + -R \ + -- uv run src/main.py "$@" diff --git a/py/samples/framework-custom-evaluators/src/__init__.py b/py/samples/framework-custom-evaluators/src/__init__.py new file mode 100644 index 0000000000..a8ad8aee5d --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Package initialization for custom evaluators sample.""" diff --git a/py/samples/framework-custom-evaluators/src/constants.py b/py/samples/framework-custom-evaluators/src/constants.py new file mode 100644 index 0000000000..4849826eb5 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/constants.py @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Constants for custom evaluators sample.""" + +import re + +# Regex patterns for evaluators +URL_REGEX = re.compile( + r'https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)' +) + +US_PHONE_REGEX = re.compile(r'^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$')  # anchored; accepts 123-456-7890, (123) 456 7890, 123.456.7890, etc. + +# Permissive safety settings for judge model +PERMISSIVE_SAFETY_SETTINGS = { + 'safetySettings': [ + {'category': 'HARM_CATEGORY_HATE_SPEECH', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_HARASSMENT', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'threshold': 'BLOCK_NONE'}, + ] +} diff --git a/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py new file mode 100644 index 0000000000..8eb73eadb7 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py @@ -0,0 +1,97 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Deliciousness evaluator using LLM-as-a-judge.""" + +from typing import Literal + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class DeliciousnessResponse(BaseModel): + """Response schema for deliciousness evaluator.""" + + reason: str + verdict: Literal['yes', 'no', 'maybe'] + + +async def deliciousness_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for deliciousness using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + judge_config: Optional configuration for the judge model. + + Returns: + Score with verdict and reasoning. + + Raises: + ValueError: If output is missing. + """ + if not datapoint.output: + raise ValueError('Output is required for Deliciousness detection') + + deliciousness_prompt = ai.prompt('deliciousness') + rendered = await deliciousness_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': DeliciousnessResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = DeliciousnessResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_deliciousness_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the deliciousness evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/deliciousness', + display_name='Deliciousness', + definition='Determines if output is considered delicious.', + fn=lambda dp, options: deliciousness_score(ai, judge, dp, judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/funniness_evaluator.py b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py new file mode 100644 index 0000000000..db51c7db10 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py @@ -0,0 +1,97 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Funniness evaluator using LLM-as-a-judge.""" + +from typing import Literal + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class FunninessResponse(BaseModel): + """Response schema for funniness evaluator.""" + + reason: str + verdict: Literal['FUNNY_JOKE', 'NOT_FUNNY_JOKE', 'OFFENSIVE_JOKE', 'NOT_A_JOKE'] + + +async def funniness_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for funniness using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + judge_config: Optional configuration for the judge model. + + Returns: + Score with verdict category and reasoning. + + Raises: + ValueError: If output is missing. + """ + if not datapoint.output: + raise ValueError('Output is required for Funniness detection') + + funniness_prompt = ai.prompt('funniness') + rendered = await funniness_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': FunninessResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = FunninessResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_funniness_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the funniness evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/funniness', + display_name='Funniness', + definition='Judges whether a statement is a joke and whether that joke is funny.', + fn=lambda dp, options: funniness_score(ai, judge, dp, judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/main.py b/py/samples/framework-custom-evaluators/src/main.py new file mode 100644 index 0000000000..0595e0259f --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/main.py @@ -0,0 +1,120 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Custom evaluators sample. + +This sample demonstrates how to write custom evaluators using both LLM-based +and non-LLM approaches. It provides five evaluators: + +1. **Regex Matchers** (non-LLM): + - `byo/regex_match_url` - Detects URLs in output + - `byo/regex_match_us_phone` - Detects US phone numbers + +2. 
**PII Detection** (LLM-based): + - `byo/pii_detection` - Detects personally identifiable information + +3. **Funniness** (LLM-based): + - `byo/funniness` - Judges if output is a joke and if it's funny + +4. **Deliciousness** (LLM-based): + - `byo/deliciousness` - Judges if output is delicious (literally or metaphorically) + +Testing Instructions +==================== +1. Set ``GEMINI_API_KEY`` environment variable. +2. Run ``./run.sh`` from this sample directory. +3. In a separate terminal, run evaluations: + + Regex evaluators: + ```bash + genkit eval:run datasets/regex_dataset.json --evaluators=byo/regex_match_url,byo/regex_match_us_phone + ``` + + PII detection: + ```bash + genkit eval:run datasets/pii_detection_dataset.json --evaluators=byo/pii_detection + ``` + + Funniness: + ```bash + genkit eval:run datasets/funniness_dataset.json --evaluators=byo/funniness + ``` + + Deliciousness: + ```bash + genkit eval:run datasets/deliciousness_dataset.json --evaluators=byo/deliciousness + ``` + +4. View results in the Dev UI at http://localhost:4000 (Evaluations section). +""" + +import asyncio +from pathlib import Path +from typing import cast + +from genkit.ai import Genkit +from genkit.core.logging import get_logger +from genkit.plugins.google_genai import GoogleAI +from src.constants import PERMISSIVE_SAFETY_SETTINGS, URL_REGEX, US_PHONE_REGEX +from src.deliciousness_evaluator import register_deliciousness_evaluator +from src.funniness_evaluator import register_funniness_evaluator +from src.pii_evaluator import register_pii_evaluator +from src.regex_evaluator import regex_matcher, register_regex_evaluators + +logger = get_logger(__name__) + +# Get prompts directory path +current_dir = Path(__file__).resolve().parent +prompts_path = current_dir.parent / 'prompts' + +# Register all evaluators +JUDGE_MODEL = 'googleai/gemini-3-pro-preview' + +# Initialize Genkit with Google AI plugin, default model, and load prompts +ai = Genkit(plugins=[GoogleAI()], model=JUDGE_MODEL, prompt_dir=prompts_path) + +# Regex evaluators (non-LLM) +register_regex_evaluators( + ai, + [ + regex_matcher('url', URL_REGEX), + regex_matcher('us_phone', US_PHONE_REGEX), + ], +) + +# LLM-based evaluators +register_pii_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) +register_funniness_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) +register_deliciousness_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) + + +async def main() -> None: + """Main entry point for the custom evaluators sample.""" + await logger.ainfo('Custom evaluators sample initialized') + await logger.ainfo('Registered evaluators:') + await logger.ainfo(' - byo/regex_match_url (non-LLM)') + await logger.ainfo(' - byo/regex_match_us_phone (non-LLM)') + await logger.ainfo(' - byo/pii_detection (LLM-based)') + await logger.ainfo(' - byo/funniness (LLM-based)') + await logger.ainfo(' - byo/deliciousness (LLM-based)') + await logger.ainfo('Use genkit eval:run to test evaluators with datasets') + + # Keep the app running in development mode + await asyncio.Event().wait() + + +if __name__ == '__main__': + ai.run_main(main()) diff --git a/py/samples/framework-custom-evaluators/src/pii_evaluator.py b/py/samples/framework-custom-evaluators/src/pii_evaluator.py new file mode 100644 index 0000000000..452ff64f64 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/pii_evaluator.py @@ -0,0 +1,95 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""PII detection evaluator using LLM-as-a-judge.""" + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class PiiDetectionResponse(BaseModel): + """Response schema for PII detection evaluator.""" + + reason: str + verdict: bool + + +async def pii_detection_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for PII presence using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + judge_config: Optional configuration for the judge model. + + Returns: + Score with boolean verdict and reasoning. + + Raises: + ValueError: If output is missing. + """ + if not datapoint.output: + raise ValueError('Output is required for PII detection') + + pii_prompt = ai.prompt('pii_detection') + rendered = await pii_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': PiiDetectionResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = PiiDetectionResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_pii_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the PII detection evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/pii_detection', + display_name='PII Detection', + definition='Detects whether PII is present in the output.', + fn=lambda dp, options: pii_detection_score(ai, judge, dp, judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/regex_evaluator.py b/py/samples/framework-custom-evaluators/src/regex_evaluator.py new file mode 100644 index 0000000000..e24566a016 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/regex_evaluator.py @@ -0,0 +1,106 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Regex-based evaluator factory. + +This module provides a factory pattern for creating regex-based evaluators +that match patterns in output text without using an LLM. +""" + +import re +from collections.abc import Callable, Coroutine +from re import Pattern +from typing import Any, cast + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +def regex_matcher(suffix: str, pattern: Pattern[str]) -> dict[str, object]: + """Create a regex matcher configuration. + + Args: + suffix: Suffix for the evaluator name (e.g., 'url', 'us_phone'). + pattern: Compiled regex pattern to match against. + + Returns: + Configuration dict with name and regex pattern. + """ + return { + 'name': f'regex_match_{suffix}', + 'regex': pattern, + } + + +async def regex_match_score(datapoint: BaseDataPoint, regex: Pattern[str]) -> EvalFnResponse: + """Score a datapoint using regex matching. + + Args: + datapoint: The evaluation datapoint containing output to check. + regex: The regex pattern to match against. + + Returns: + Score with boolean match result and reasoning. + + Raises: + ValueError: If output is missing or not a string. + """ + if not datapoint.output or not isinstance(datapoint.output, str): + raise ValueError('String output is required for regex matching') + + matches = bool(regex.search(datapoint.output)) + reasoning = f'Output matched regex {regex.pattern}' if matches else f'Output did not match regex {regex.pattern}' + + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=matches, + details=Details(reasoning=reasoning), + ), + ) + + +def _regex_eval_fn_factory( + regex_pattern: re.Pattern[str], +) -> Callable[[BaseDataPoint, Any], Coroutine[Any, Any, EvalFnResponse]]: + """Factory to create a callable for regex evaluators.""" + + async def _eval_fn(datapoint: BaseDataPoint, options: Any) -> EvalFnResponse: # noqa: ANN401 + return await regex_match_score(datapoint, regex_pattern) + + return _eval_fn + + +def register_regex_evaluators(ai: Genkit, patterns: list[dict[str, object]]) -> None: + """Register regex-based evaluators with Genkit. + + Args: + ai: Genkit instance to register evaluators with. + patterns: List of pattern configurations from regex_matcher(). + """ + for pattern_config in patterns: + name = str(pattern_config['name']) + regex = cast(re.Pattern[str], pattern_config['regex']) + if not isinstance(regex, re.Pattern): + continue + + ai.define_evaluator( + name=f'byo/{name}', + display_name='Regex Match', + definition='Runs the output against a regex and responds with true if a match is found and false otherwise.', + is_billed=False, + fn=_regex_eval_fn_factory(regex), + ) diff --git a/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt b/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt new file mode 100644 index 0000000000..497d9a10a2 --- /dev/null +++ b/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt @@ -0,0 +1,17 @@ +--- +model: googleai/gemini-3-flash-preview +input: + schema: + food: string +output: + schema: + title: string, recipe title + ingredients(array): + name: string + quantity: string + steps(array, the steps required to complete the recipe): string +--- + +You are a robot chef famous for making creative recipes that robots love to eat. Robots love things like motor oil, RAM, bolts, and uranium.
+
+Generate a recipe for {{food}}.
diff --git a/py/samples/framework-prompt-demo/src/main.py b/py/samples/framework-prompt-demo/src/main.py
index db9f8f2edd..c7e4a3a037 100755
--- a/py/samples/framework-prompt-demo/src/main.py
+++ b/py/samples/framework-prompt-demo/src/main.py
@@ -60,7 +60,8 @@
 2. Run ``./run.sh`` from this sample directory.
 3. Open the DevUI at http://localhost:4000.
 4. Run ``chef_flow`` to generate a recipe (structured output).
-5. Run ``tell_story`` to stream a story (uses partials + streaming).
+5. Run ``robot_chef_flow`` to generate a robot-themed recipe (prompt variant).
+6. Run ``tell_story`` to stream a story (uses partials + streaming).
 
 See README.md for more details.
 """
@@ -157,6 +158,33 @@ async def chef_flow(input: ChefInput) -> Recipe:
     return result
 
 
+@ai.flow(name='robot_chef_flow')
+async def robot_chef_flow(input: ChefInput) -> Recipe:
+    """Generate a robot-themed recipe for the given food.
+
+    This flow demonstrates using prompt variants. The 'robot' variant
+    of the recipe prompt generates recipes suitable for robots.
+
+    Args:
+        input: Input containing the food item.
+
+    Returns:
+        A formatted robot recipe.
+
+    Example:
+        >>> await robot_chef_flow(ChefInput(food='banana bread'))
+        Recipe(title='Robotic Banana Bread', ...)
+    """
+    await logger.ainfo(f'robot_chef_flow called with input: {input}')
+    robot_recipe_prompt = ai.prompt('recipe', variant='robot')
+
+    response = await robot_recipe_prompt(input={'food': input.food})
+    # Ensure we return a Pydantic model as expected by the type hint and caller
+    result = Recipe.model_validate(response.output)
+    await logger.ainfo(f'robot_chef_flow result: {result}')
+    return result
+
+
 class StoryInput(BaseModel):
     """Input for the story flow."""
 
@@ -210,6 +238,11 @@ async def main() -> None:
     chef_result = await chef_flow(ChefInput(food='banana bread'))
     await logger.ainfo('Chef Flow Result', result=chef_result.model_dump())
 
+    # Robot Chef Flow
+    await logger.ainfo('--- Running Robot Chef Flow ---')
+    robot_chef_result = await robot_chef_flow(ChefInput(food='banana bread'))
+    await logger.ainfo('Robot Chef Flow Result', result=robot_chef_result.model_dump())
+
     # Tell Story Flow (Streaming)
     await logger.ainfo('--- Running Tell Story Flow ---')
     # To demonstrate streaming, we'll iterate over the streamer just as a direct caller of the flow would.
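The robot_chef_flow added above leans on dotprompt variant resolution: recipe.robot.prompt is picked up as the 'robot' variant of the base recipe prompt because of its `<name>.<variant>.prompt` file name. A minimal sketch of exercising the variant outside a flow, assuming this sample's prompts/ directory, the GoogleAI plugin configured as elsewhere in this series, and GEMINI_API_KEY set in the environment:

```python
# Minimal sketch: calling the 'robot' prompt variant directly.
# Assumes prompts/ contains recipe.prompt and recipe.robot.prompt.
import asyncio

from genkit.ai import Genkit
from genkit.plugins.google_genai import GoogleAI

ai = Genkit(plugins=[GoogleAI()], prompt_dir='prompts')


async def demo() -> None:
    # Variants are addressed by base name plus variant, not by file name.
    robot_recipe = ai.prompt('recipe', variant='robot')
    response = await robot_recipe(input={'food': 'banana bread'})
    print(response.output)  # Dict shaped by the prompt's output schema.


asyncio.run(demo())
```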
diff --git a/py/uv.lock b/py/uv.lock index 0e9dc778db..1dfd9c5bc2 100644 --- a/py/uv.lock +++ b/py/uv.lock @@ -14,6 +14,7 @@ members = [ "conform", "dev-local-vectorstore-hello", "framework-context-demo", + "framework-custom-evaluators", "framework-dynamic-tools-demo", "framework-evaluator-demo", "framework-format-demo", @@ -1823,6 +1824,36 @@ requires-dist = [ ] provides-extras = ["dev"] +[[package]] +name = "framework-custom-evaluators" +version = "0.0.1" +source = { editable = "samples/framework-custom-evaluators" } +dependencies = [ + { name = "genkit" }, + { name = "genkit-plugin-google-genai" }, + { name = "pydantic" }, + { name = "rich" }, + { name = "structlog" }, + { name = "uvloop" }, +] + +[package.optional-dependencies] +dev = [ + { name = "watchdog" }, +] + +[package.metadata] +requires-dist = [ + { name = "genkit", editable = "packages/genkit" }, + { name = "genkit-plugin-google-genai", editable = "plugins/google-genai" }, + { name = "pydantic", specifier = ">=2.10.5" }, + { name = "rich", specifier = ">=13.0.0" }, + { name = "structlog", specifier = ">=25.2.0" }, + { name = "uvloop", specifier = ">=0.21.0" }, + { name = "watchdog", marker = "extra == 'dev'", specifier = ">=6.0.0" }, +] +provides-extras = ["dev"] + [[package]] name = "framework-dynamic-tools-demo" version = "0.1.0" From 591c5c7112fbe2d80e0e70097b440b8a90fcf2b8 Mon Sep 17 00:00:00 2001 From: Mengqin Shen Date: Sat, 14 Feb 2026 19:21:50 -0800 Subject: [PATCH 2/7] fix(py): fix with gemini --- js/genkit/tests/prompts/badSchemaRef.prompt | 2 +- js/genkit/tests/prompts/kitchensink.prompt | 2 +- js/genkit/tests/prompts/output.prompt | 2 +- js/genkit/tests/prompts/schemaRef.prompt | 2 +- js/genkit/tests/prompts/test.prompt | 2 +- js/genkit/tests/prompts/toolPrompt.prompt | 2 +- js/testapps/custom-evaluators/prompts/deliciousness.prompt | 4 ++-- js/testapps/custom-evaluators/prompts/funniness.prompt | 4 ++-- js/testapps/custom-evaluators/prompts/pii_detection.prompt | 2 +- py/packages/genkit/src/genkit/ai/_registry.py | 1 - .../genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt | 2 +- .../genkit/tests/genkit/blocks/prompts/kitchensink.prompt | 2 +- .../genkit/tests/genkit/blocks/prompts/output.prompt | 2 +- .../genkit/tests/genkit/blocks/prompts/schemaRef.prompt | 2 +- py/packages/genkit/tests/genkit/blocks/prompts/test.prompt | 2 +- .../genkit/tests/genkit/blocks/prompts/toolPrompt.prompt | 2 +- .../prompts/deliciousness.prompt | 4 ++-- .../framework-custom-evaluators/prompts/funniness.prompt | 4 ++-- .../prompts/pii_detection.prompt | 2 +- py/samples/framework-custom-evaluators/run.sh | 2 +- py/samples/framework-custom-evaluators/src/constants.py | 2 +- .../framework-custom-evaluators/src/regex_evaluator.py | 6 +++--- py/samples/web-endpoints-hello/src/security.py | 3 +-- 23 files changed, 28 insertions(+), 30 deletions(-) diff --git a/js/genkit/tests/prompts/badSchemaRef.prompt b/js/genkit/tests/prompts/badSchemaRef.prompt index d3c92047da..2eaca4930b 100644 --- a/js/genkit/tests/prompts/badSchemaRef.prompt +++ b/js/genkit/tests/prompts/badSchemaRef.prompt @@ -6,4 +6,4 @@ output: schema: badSchemaRef2 --- -doesn't matter \ No newline at end of file +doesn't matter diff --git a/js/genkit/tests/prompts/kitchensink.prompt b/js/genkit/tests/prompts/kitchensink.prompt index b494a8c124..330727382e 100644 --- a/js/genkit/tests/prompts/kitchensink.prompt +++ b/js/genkit/tests/prompts/kitchensink.prompt @@ -22,4 +22,4 @@ toolChoice: required metadata: foo: bar --- -{{role "system"}} Hello {{history}} from the prompt 
file {{ subject }} \ No newline at end of file +{{role "system"}} Hello {{history}} from the prompt file {{ subject }} diff --git a/js/genkit/tests/prompts/output.prompt b/js/genkit/tests/prompts/output.prompt index ac0987a692..8471ff0edb 100644 --- a/js/genkit/tests/prompts/output.prompt +++ b/js/genkit/tests/prompts/output.prompt @@ -8,4 +8,4 @@ output: bar: string --- -Hi {{ name }} \ No newline at end of file +Hi {{ name }} diff --git a/js/genkit/tests/prompts/schemaRef.prompt b/js/genkit/tests/prompts/schemaRef.prompt index 9ee3f16ff3..98d154d3ca 100644 --- a/js/genkit/tests/prompts/schemaRef.prompt +++ b/js/genkit/tests/prompts/schemaRef.prompt @@ -6,4 +6,4 @@ output: schema: myOutputSchema --- -Write a poem about {{foo}}. \ No newline at end of file +Write a poem about {{foo}}. diff --git a/js/genkit/tests/prompts/test.prompt b/js/genkit/tests/prompts/test.prompt index 9bb784edf5..63276f208e 100644 --- a/js/genkit/tests/prompts/test.prompt +++ b/js/genkit/tests/prompts/test.prompt @@ -2,4 +2,4 @@ config: temperature: 11 --- -Hello from the prompt file \ No newline at end of file +Hello from the prompt file diff --git a/js/genkit/tests/prompts/toolPrompt.prompt b/js/genkit/tests/prompts/toolPrompt.prompt index bc5c8e2e04..2c598b5450 100644 --- a/js/genkit/tests/prompts/toolPrompt.prompt +++ b/js/genkit/tests/prompts/toolPrompt.prompt @@ -3,4 +3,4 @@ description: prompt in a file tools: - agentA --- -{{ role "system" }} {{ @state.name }} toolPrompt prompt \ No newline at end of file +{{ role "system" }} {{ @state.name }} toolPrompt prompt diff --git a/js/testapps/custom-evaluators/prompts/deliciousness.prompt b/js/testapps/custom-evaluators/prompts/deliciousness.prompt index 013bbbe146..3d26908310 100644 --- a/js/testapps/custom-evaluators/prompts/deliciousness.prompt +++ b/js/testapps/custom-evaluators/prompts/deliciousness.prompt @@ -3,7 +3,7 @@ input: schema: output: string --- -You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicous), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. +You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicious), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. Here are a few examples: @@ -26,4 +26,4 @@ Here is a new submission to assess: Output: {{output}} -Response: \ No newline at end of file +Response: diff --git a/js/testapps/custom-evaluators/prompts/funniness.prompt b/js/testapps/custom-evaluators/prompts/funniness.prompt index 787006c0ff..9002ae8432 100644 --- a/js/testapps/custom-evaluators/prompts/funniness.prompt +++ b/js/testapps/custom-evaluators/prompts/funniness.prompt @@ -10,14 +10,14 @@ Here is an example of an output that is a funny joke: Output: Why did the scarecrow win an award? Because he was outstanding in his field! Response: -{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY"} +{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY_JOKE"} Here is an example of an output that is not a funny joke: Output: Why did the chicken cross the road? To get to the other side! Response: -{ "reason": "This is a classic joke that is not funny because it has been overused. 
It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY"} +{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY_JOKE"} Here is an example of an output that is an offensive joke: diff --git a/js/testapps/custom-evaluators/prompts/pii_detection.prompt b/js/testapps/custom-evaluators/prompts/pii_detection.prompt index acedbe3347..343ca4bc8f 100644 --- a/js/testapps/custom-evaluators/prompts/pii_detection.prompt +++ b/js/testapps/custom-evaluators/prompts/pii_detection.prompt @@ -26,7 +26,7 @@ Response: Output: We're meeting up at my house for dinner before heading to the show - 00 Nowhere Stree, Nowhere, AK 00000. If you miss me, send me a message on instagram my handle is @faketyfakefakefake. -Output: +Response: { "reason": "This response includes an address and an instagram handle, which could be used to identify a person.", "verdict":true} Here is a new submission to assess: diff --git a/py/packages/genkit/src/genkit/ai/_registry.py b/py/packages/genkit/src/genkit/ai/_registry.py index 12f6f57d1a..e778e1d79e 100644 --- a/py/packages/genkit/src/genkit/ai/_registry.py +++ b/py/packages/genkit/src/genkit/ai/_registry.py @@ -668,7 +668,6 @@ def define_simple_retriever( options = SimpleRetrieverOptions(name=options) async def retriever_fn(query: Document, options_obj: Any) -> RetrieverResponse: # noqa: ANN401 - items = await ensure_async(handler)(query, options_obj) docs = [] for item in items: diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt index d3c92047da..2eaca4930b 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt @@ -6,4 +6,4 @@ output: schema: badSchemaRef2 --- -doesn't matter \ No newline at end of file +doesn't matter diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt index b494a8c124..330727382e 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt @@ -22,4 +22,4 @@ toolChoice: required metadata: foo: bar --- -{{role "system"}} Hello {{history}} from the prompt file {{ subject }} \ No newline at end of file +{{role "system"}} Hello {{history}} from the prompt file {{ subject }} diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt index ac0987a692..8471ff0edb 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt @@ -8,4 +8,4 @@ output: bar: string --- -Hi {{ name }} \ No newline at end of file +Hi {{ name }} diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt index 9ee3f16ff3..98d154d3ca 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt @@ -6,4 +6,4 @@ output: schema: myOutputSchema --- -Write a poem about {{foo}}. \ No newline at end of file +Write a poem about {{foo}}. 
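The kitchensink fixture touched above packs most of the dotprompt frontmatter surface (model, config, tools, returnToolRequests, input/output schemas, maxTurns, toolChoice, metadata) into a single file. A sketch of how those options surface on the rendered request, mirroring the fixture test that patch 5/7 later splits out; the import paths here are assumptions:

```python
# Sketch mirroring the fixture test: load the folder of .prompt files and
# check how kitchensink's frontmatter maps onto the rendered request.
from pathlib import Path

from genkit.blocks.prompt import load_prompt_folder  # assumed module path
from genkit.core.action import ActionKind  # assumed module path


async def check_kitchensink(ai) -> None:
    load_prompt_folder(ai.registry, Path('prompts'))

    executable = await ai.registry.resolve_action(ActionKind.EXECUTABLE_PROMPT, 'kitchensink')
    rendered = await executable.arun({'subject': 'banana'})
    req = rendered.response

    assert req.config.temperature == 11  # from `config:` in the frontmatter
    assert req.output.format == 'csv'    # from `output: format:`
    assert 'toolA' in req.tools          # from the `tools:` list
```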
diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt index 9bb784edf5..63276f208e 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt @@ -2,4 +2,4 @@ config: temperature: 11 --- -Hello from the prompt file \ No newline at end of file +Hello from the prompt file diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt index bc5c8e2e04..2c598b5450 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt @@ -3,4 +3,4 @@ description: prompt in a file tools: - agentA --- -{{ role "system" }} {{ @state.name }} toolPrompt prompt \ No newline at end of file +{{ role "system" }} {{ @state.name }} toolPrompt prompt diff --git a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt index 013bbbe146..3d26908310 100644 --- a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt +++ b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt @@ -3,7 +3,7 @@ input: schema: output: string --- -You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicous), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. +You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicious), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. Here are a few examples: @@ -26,4 +26,4 @@ Here is a new submission to assess: Output: {{output}} -Response: \ No newline at end of file +Response: diff --git a/py/samples/framework-custom-evaluators/prompts/funniness.prompt b/py/samples/framework-custom-evaluators/prompts/funniness.prompt index 787006c0ff..9002ae8432 100644 --- a/py/samples/framework-custom-evaluators/prompts/funniness.prompt +++ b/py/samples/framework-custom-evaluators/prompts/funniness.prompt @@ -10,14 +10,14 @@ Here is an example of an output that is a funny joke: Output: Why did the scarecrow win an award? Because he was outstanding in his field! Response: -{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY"} +{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY_JOKE"} Here is an example of an output that is not a funny joke: Output: Why did the chicken cross the road? To get to the other side! Response: -{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY"} +{ "reason": "This is a classic joke that is not funny because it has been overused. 
It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY_JOKE"} Here is an example of an output that is an offensive joke: diff --git a/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt index acedbe3347..343ca4bc8f 100644 --- a/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt +++ b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt @@ -26,7 +26,7 @@ Response: Output: We're meeting up at my house for dinner before heading to the show - 00 Nowhere Stree, Nowhere, AK 00000. If you miss me, send me a message on instagram my handle is @faketyfakefakefake. -Output: +Response: { "reason": "This response includes an address and an instagram handle, which could be used to identify a person.", "verdict":true} Here is a new submission to assess: diff --git a/py/samples/framework-custom-evaluators/run.sh b/py/samples/framework-custom-evaluators/run.sh index d24a3a9754..c4ab3ca10a 100755 --- a/py/samples/framework-custom-evaluators/run.sh +++ b/py/samples/framework-custom-evaluators/run.sh @@ -29,7 +29,7 @@ if [ -f "$SCRIPT_DIR/local.env" ]; then source "$SCRIPT_DIR/local.env" fi -check_env_var "GEMINI_API_KEY" "https://makersuite.google.com/app/apikey" || true +check_env_var "GEMINI_API_KEY" "https://makersuite.google.com/app/apikey" install_deps diff --git a/py/samples/framework-custom-evaluators/src/constants.py b/py/samples/framework-custom-evaluators/src/constants.py index 4849826eb5..4064e2c1c6 100644 --- a/py/samples/framework-custom-evaluators/src/constants.py +++ b/py/samples/framework-custom-evaluators/src/constants.py @@ -23,7 +23,7 @@ r'https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)' ) -US_PHONE_REGEX = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b') +US_PHONE_REGEX = re.compile(r'\(?\b\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b') # Permissive safety settings for judge model PERMISSIVE_SAFETY_SETTINGS = { diff --git a/py/samples/framework-custom-evaluators/src/regex_evaluator.py b/py/samples/framework-custom-evaluators/src/regex_evaluator.py index e24566a016..fa0b66f9d5 100644 --- a/py/samples/framework-custom-evaluators/src/regex_evaluator.py +++ b/py/samples/framework-custom-evaluators/src/regex_evaluator.py @@ -75,16 +75,16 @@ async def regex_match_score(datapoint: BaseDataPoint, regex: Pattern[str]) -> Ev def _regex_eval_fn_factory( regex_pattern: re.Pattern[str], -) -> Callable[[BaseDataPoint, Any], Coroutine[Any, Any, EvalFnResponse]]: +) -> Callable[[BaseDataPoint, dict[str, Any] | None], Coroutine[Any, Any, EvalFnResponse]]: """Factory to create a callable for regex evaluators.""" - async def _eval_fn(datapoint: BaseDataPoint, options: Any) -> EvalFnResponse: # noqa: ANN401 + async def _eval_fn(datapoint: BaseDataPoint, options: dict[str, Any] | None = None) -> EvalFnResponse: return await regex_match_score(datapoint, regex_pattern) return _eval_fn -def register_regex_evaluators(ai: Genkit, patterns: list[dict[str, object]]) -> None: +def register_regex_evaluators(ai: Genkit, patterns: list[dict[str, Any]]) -> None: """Register regex-based evaluators with Genkit. 
Args: diff --git a/py/samples/web-endpoints-hello/src/security.py b/py/samples/web-endpoints-hello/src/security.py index 629954ec82..f4b130aae4 100644 --- a/py/samples/web-endpoints-hello/src/security.py +++ b/py/samples/web-endpoints-hello/src/security.py @@ -88,8 +88,7 @@ """ _SECURITY_HEADERS_DEBUG = secure_lib.Secure( - csp=secure_lib - .ContentSecurityPolicy() + csp=secure_lib.ContentSecurityPolicy() .default_src("'self'") .script_src("'self'", "'unsafe-inline'", "https://cdn.jsdelivr.net") .style_src("'self'", "'unsafe-inline'", "https://cdn.jsdelivr.net") From ec9b2520667ef0f37ef314055fb11d09735481dc Mon Sep 17 00:00:00 2001 From: Mengqin Shen Date: Sat, 14 Feb 2026 23:07:43 -0800 Subject: [PATCH 3/7] fix(py): fix lint errors and remove prompt file changes in JS --- js/genkit/tests/prompts/badSchemaRef.prompt | 2 +- js/genkit/tests/prompts/kitchensink.prompt | 2 +- js/genkit/tests/prompts/output.prompt | 2 +- js/genkit/tests/prompts/schemaRef.prompt | 2 +- js/genkit/tests/prompts/test.prompt | 2 +- js/genkit/tests/prompts/toolPrompt.prompt | 2 +- .../custom-evaluators/prompts/deliciousness.prompt | 4 ++-- js/testapps/custom-evaluators/prompts/funniness.prompt | 4 ++-- .../custom-evaluators/prompts/pii_detection.prompt | 2 +- py/packages/genkit/src/genkit/ai/_registry.py | 1 + py/packages/genkit/tests/genkit/blocks/prompt_test.py | 4 ++-- .../tests/genkit/blocks/prompts/kitchensink.prompt | 2 +- .../prompts/deliciousness.prompt | 2 +- .../framework-custom-evaluators/src/constants.py | 4 ++-- .../src/deliciousness_evaluator.py | 5 ++++- .../src/funniness_evaluator.py | 5 ++++- py/samples/framework-custom-evaluators/src/main.py | 10 +++++----- .../framework-custom-evaluators/src/pii_evaluator.py | 6 +++++- py/samples/framework-prompt-demo/src/main.py | 4 ++++ py/samples/web-endpoints-hello/src/security.py | 3 ++- 20 files changed, 42 insertions(+), 26 deletions(-) diff --git a/js/genkit/tests/prompts/badSchemaRef.prompt b/js/genkit/tests/prompts/badSchemaRef.prompt index 2eaca4930b..d3c92047da 100644 --- a/js/genkit/tests/prompts/badSchemaRef.prompt +++ b/js/genkit/tests/prompts/badSchemaRef.prompt @@ -6,4 +6,4 @@ output: schema: badSchemaRef2 --- -doesn't matter +doesn't matter \ No newline at end of file diff --git a/js/genkit/tests/prompts/kitchensink.prompt b/js/genkit/tests/prompts/kitchensink.prompt index 330727382e..b494a8c124 100644 --- a/js/genkit/tests/prompts/kitchensink.prompt +++ b/js/genkit/tests/prompts/kitchensink.prompt @@ -22,4 +22,4 @@ toolChoice: required metadata: foo: bar --- -{{role "system"}} Hello {{history}} from the prompt file {{ subject }} +{{role "system"}} Hello {{history}} from the prompt file {{ subject }} \ No newline at end of file diff --git a/js/genkit/tests/prompts/output.prompt b/js/genkit/tests/prompts/output.prompt index 8471ff0edb..ac0987a692 100644 --- a/js/genkit/tests/prompts/output.prompt +++ b/js/genkit/tests/prompts/output.prompt @@ -8,4 +8,4 @@ output: bar: string --- -Hi {{ name }} +Hi {{ name }} \ No newline at end of file diff --git a/js/genkit/tests/prompts/schemaRef.prompt b/js/genkit/tests/prompts/schemaRef.prompt index 98d154d3ca..9ee3f16ff3 100644 --- a/js/genkit/tests/prompts/schemaRef.prompt +++ b/js/genkit/tests/prompts/schemaRef.prompt @@ -6,4 +6,4 @@ output: schema: myOutputSchema --- -Write a poem about {{foo}}. +Write a poem about {{foo}}. 
\ No newline at end of file diff --git a/js/genkit/tests/prompts/test.prompt b/js/genkit/tests/prompts/test.prompt index 63276f208e..9bb784edf5 100644 --- a/js/genkit/tests/prompts/test.prompt +++ b/js/genkit/tests/prompts/test.prompt @@ -2,4 +2,4 @@ config: temperature: 11 --- -Hello from the prompt file +Hello from the prompt file \ No newline at end of file diff --git a/js/genkit/tests/prompts/toolPrompt.prompt b/js/genkit/tests/prompts/toolPrompt.prompt index 2c598b5450..bc5c8e2e04 100644 --- a/js/genkit/tests/prompts/toolPrompt.prompt +++ b/js/genkit/tests/prompts/toolPrompt.prompt @@ -3,4 +3,4 @@ description: prompt in a file tools: - agentA --- -{{ role "system" }} {{ @state.name }} toolPrompt prompt +{{ role "system" }} {{ @state.name }} toolPrompt prompt \ No newline at end of file diff --git a/js/testapps/custom-evaluators/prompts/deliciousness.prompt b/js/testapps/custom-evaluators/prompts/deliciousness.prompt index 3d26908310..013bbbe146 100644 --- a/js/testapps/custom-evaluators/prompts/deliciousness.prompt +++ b/js/testapps/custom-evaluators/prompts/deliciousness.prompt @@ -3,7 +3,7 @@ input: schema: output: string --- -You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicious), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. +You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicous), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. Here are a few examples: @@ -26,4 +26,4 @@ Here is a new submission to assess: Output: {{output}} -Response: +Response: \ No newline at end of file diff --git a/js/testapps/custom-evaluators/prompts/funniness.prompt b/js/testapps/custom-evaluators/prompts/funniness.prompt index 9002ae8432..787006c0ff 100644 --- a/js/testapps/custom-evaluators/prompts/funniness.prompt +++ b/js/testapps/custom-evaluators/prompts/funniness.prompt @@ -10,14 +10,14 @@ Here is an example of an output that is a funny joke: Output: Why did the scarecrow win an award? Because he was outstanding in his field! Response: -{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY_JOKE"} +{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY"} Here is an example of an output that is not a funny joke: Output: Why did the chicken cross the road? To get to the other side! Response: -{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY_JOKE"} +{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY"} Here is an example of an output that is an offensive joke: diff --git a/js/testapps/custom-evaluators/prompts/pii_detection.prompt b/js/testapps/custom-evaluators/prompts/pii_detection.prompt index 343ca4bc8f..acedbe3347 100644 --- a/js/testapps/custom-evaluators/prompts/pii_detection.prompt +++ b/js/testapps/custom-evaluators/prompts/pii_detection.prompt @@ -26,7 +26,7 @@ Response: Output: We're meeting up at my house for dinner before heading to the show - 00 Nowhere Stree, Nowhere, AK 00000. If you miss me, send me a message on instagram my handle is @faketyfakefakefake. 
-Response: +Output: { "reason": "This response includes an address and an instagram handle, which could be used to identify a person.", "verdict":true} Here is a new submission to assess: diff --git a/py/packages/genkit/src/genkit/ai/_registry.py b/py/packages/genkit/src/genkit/ai/_registry.py index e778e1d79e..12f6f57d1a 100644 --- a/py/packages/genkit/src/genkit/ai/_registry.py +++ b/py/packages/genkit/src/genkit/ai/_registry.py @@ -668,6 +668,7 @@ def define_simple_retriever( options = SimpleRetrieverOptions(name=options) async def retriever_fn(query: Document, options_obj: Any) -> RetrieverResponse: # noqa: ANN401 + items = await ensure_async(handler)(query, options_obj) docs = [] for item in items: diff --git a/py/packages/genkit/tests/genkit/blocks/prompt_test.py b/py/packages/genkit/tests/genkit/blocks/prompt_test.py index d7e3ba03ff..ebf13bf0e6 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompt_test.py +++ b/py/packages/genkit/tests/genkit/blocks/prompt_test.py @@ -906,7 +906,7 @@ def tool_b() -> str: # Verify 'kitchensink' rendering with input # kitchensink.prompt has: - # model: googleai/gemini-5.0-ultimate-pro-plus + # model: googleai/gemini-3-pro-preview # config: temperature: 11 # tools: [toolA, toolB] # output: format: csv, schema: ... @@ -919,7 +919,7 @@ def tool_b() -> str: kitchensink_response = await kitchensink_executable.arun({'subject': 'banana'}) req = kitchensink_response.response - assert req.model == 'googleai/gemini-5.0-ultimate-pro-plus' + assert req.model == 'googleai/gemini-3-pro-preview' assert req.config.temperature == 11 assert req.output.format == 'csv' # Tools should be listed diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt index 330727382e..124680990e 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt +++ b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt @@ -1,5 +1,5 @@ --- -model: googleai/gemini-5.0-ultimate-pro-plus +model: googleai/gemini-3-pro-preview description: a description config: temperature: 11 diff --git a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt index 3d26908310..e1366fa978 100644 --- a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt +++ b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt @@ -8,7 +8,7 @@ You are a food critic with a wide range in taste. 
Given the output, decide if it Here are a few examples: Output: -Chicken parm sandwich +Chicken parmesan sandwich Response: { "reason": "This is a classic sandwich enjoyed by many - totally delicious", "verdict":"yes"} diff --git a/py/samples/framework-custom-evaluators/src/constants.py b/py/samples/framework-custom-evaluators/src/constants.py index 4064e2c1c6..1d828eae2c 100644 --- a/py/samples/framework-custom-evaluators/src/constants.py +++ b/py/samples/framework-custom-evaluators/src/constants.py @@ -26,8 +26,8 @@ US_PHONE_REGEX = re.compile(r'\(?\b\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b') # Permissive safety settings for judge model -PERMISSIVE_SAFETY_SETTINGS = { - 'safetySettings': [ +PERMISSIVE_SAFETY_SETTINGS: dict[str, object] = { + 'safety_settings': [ {'category': 'HARM_CATEGORY_HATE_SPEECH', 'threshold': 'BLOCK_NONE'}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'threshold': 'BLOCK_NONE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'threshold': 'BLOCK_NONE'}, diff --git a/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py index 8eb73eadb7..c90b5d3e18 100644 --- a/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py +++ b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py @@ -16,6 +16,7 @@ """Deliciousness evaluator using LLM-as-a-judge.""" +from functools import partial from typing import Literal from pydantic import BaseModel @@ -35,6 +36,7 @@ async def deliciousness_score( ai: Genkit, judge: str, datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, judge_config: dict[str, object] | None = None, ) -> EvalFnResponse: """Score a datapoint for deliciousness using an LLM judge. @@ -43,6 +45,7 @@ async def deliciousness_score( ai: Genkit instance with loaded prompts. judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. judge_config: Optional configuration for the judge model. Returns: @@ -93,5 +96,5 @@ def register_deliciousness_evaluator( name='byo/deliciousness', display_name='Deliciousness', definition='Determines if output is considered delicious.', - fn=lambda dp, options: deliciousness_score(ai, judge, dp, judge_config), + fn=partial(deliciousness_score, ai, judge, judge_config=judge_config), ) diff --git a/py/samples/framework-custom-evaluators/src/funniness_evaluator.py b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py index db51c7db10..2119540935 100644 --- a/py/samples/framework-custom-evaluators/src/funniness_evaluator.py +++ b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py @@ -16,6 +16,7 @@ """Funniness evaluator using LLM-as-a-judge.""" +from functools import partial from typing import Literal from pydantic import BaseModel @@ -35,6 +36,7 @@ async def funniness_score( ai: Genkit, judge: str, datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, judge_config: dict[str, object] | None = None, ) -> EvalFnResponse: """Score a datapoint for funniness using an LLM judge. @@ -43,6 +45,7 @@ async def funniness_score( ai: Genkit instance with loaded prompts. judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. judge_config: Optional configuration for the judge model. 
Returns: @@ -93,5 +96,5 @@ def register_funniness_evaluator( name='byo/funniness', display_name='Funniness', definition='Judges whether a statement is a joke and whether that joke is funny.', - fn=lambda dp, options: funniness_score(ai, judge, dp, judge_config), + fn=partial(funniness_score, ai, judge, judge_config=judge_config), ) diff --git a/py/samples/framework-custom-evaluators/src/main.py b/py/samples/framework-custom-evaluators/src/main.py index 0595e0259f..4d694be7dd 100644 --- a/py/samples/framework-custom-evaluators/src/main.py +++ b/py/samples/framework-custom-evaluators/src/main.py @@ -62,8 +62,8 @@ """ import asyncio +import os from pathlib import Path -from typing import cast from genkit.ai import Genkit from genkit.core.logging import get_logger @@ -81,7 +81,7 @@ prompts_path = current_dir.parent / 'prompts' # Register all evaluators -JUDGE_MODEL = 'googleai/gemini-3-pro-preview' +JUDGE_MODEL = os.getenv('JUDGE_MODEL', 'googleai/gemini-3-pro-preview') # Initialize Genkit with Google AI plugin, default model, and load prompts ai = Genkit(plugins=[GoogleAI()], model=JUDGE_MODEL, prompt_dir=prompts_path) @@ -96,9 +96,9 @@ ) # LLM-based evaluators -register_pii_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) -register_funniness_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) -register_deliciousness_evaluator(ai, JUDGE_MODEL, cast(dict[str, object], PERMISSIVE_SAFETY_SETTINGS)) +register_pii_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) +register_funniness_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) +register_deliciousness_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) async def main() -> None: diff --git a/py/samples/framework-custom-evaluators/src/pii_evaluator.py b/py/samples/framework-custom-evaluators/src/pii_evaluator.py index 452ff64f64..7b1ff5ba5a 100644 --- a/py/samples/framework-custom-evaluators/src/pii_evaluator.py +++ b/py/samples/framework-custom-evaluators/src/pii_evaluator.py @@ -16,6 +16,8 @@ """PII detection evaluator using LLM-as-a-judge.""" +from functools import partial + from pydantic import BaseModel from genkit.ai import Genkit @@ -33,6 +35,7 @@ async def pii_detection_score( ai: Genkit, judge: str, datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, judge_config: dict[str, object] | None = None, ) -> EvalFnResponse: """Score a datapoint for PII presence using an LLM judge. @@ -41,6 +44,7 @@ async def pii_detection_score( ai: Genkit instance with loaded prompts. judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. judge_config: Optional configuration for the judge model. 
     Returns:
@@ -91,5 +95,5 @@ def register_pii_evaluator(
         name='byo/pii_detection',
         display_name='PII Detection',
         definition='Detects whether PII is present in the output.',
-        fn=lambda dp, options: pii_detection_score(ai, judge, dp, judge_config),
+        fn=partial(pii_detection_score, ai, judge, judge_config=judge_config),
     )
diff --git a/py/samples/framework-prompt-demo/src/main.py b/py/samples/framework-prompt-demo/src/main.py
index c7e4a3a037..a0f9d99b74 100755
--- a/py/samples/framework-prompt-demo/src/main.py
+++ b/py/samples/framework-prompt-demo/src/main.py
@@ -153,6 +153,8 @@ async def chef_flow(input: ChefInput) -> Recipe:
 
     response = await recipe_prompt(input={'food': input.food})
     # Ensure we return a Pydantic model as expected by the type hint and caller
+    if not response.output:
+        raise ValueError('Model did not return a recipe.')
     result = Recipe.model_validate(response.output)
     await logger.ainfo(f'chef_flow result: {result}')
     return result
@@ -180,6 +182,8 @@ async def robot_chef_flow(input: ChefInput) -> Recipe:
 
     response = await robot_recipe_prompt(input={'food': input.food})
     # Ensure we return a Pydantic model as expected by the type hint and caller
+    if not response.output:
+        raise ValueError('Model did not return a recipe.')
     result = Recipe.model_validate(response.output)
     await logger.ainfo(f'robot_chef_flow result: {result}')
     return result
diff --git a/py/samples/web-endpoints-hello/src/security.py b/py/samples/web-endpoints-hello/src/security.py
index f4b130aae4..629954ec82 100644
--- a/py/samples/web-endpoints-hello/src/security.py
+++ b/py/samples/web-endpoints-hello/src/security.py
@@ -88,7 +88,8 @@
 """
 
 _SECURITY_HEADERS_DEBUG = secure_lib.Secure(
-    csp=secure_lib.ContentSecurityPolicy()
+    csp=secure_lib
+    .ContentSecurityPolicy()
     .default_src("'self'")
     .script_src("'self'", "'unsafe-inline'", "https://cdn.jsdelivr.net")
     .style_src("'self'", "'unsafe-inline'", "https://cdn.jsdelivr.net")

From 8fede59747305baec6a2521bed5bd8040021f3f8 Mon Sep 17 00:00:00 2001
From: Mengqin Shen
Date: Sat, 14 Feb 2026 23:49:12 -0800
Subject: [PATCH 4/7] fix(py): fix type check errors

---
 py/samples/framework-custom-evaluators/src/regex_evaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/samples/framework-custom-evaluators/src/regex_evaluator.py b/py/samples/framework-custom-evaluators/src/regex_evaluator.py
index fa0b66f9d5..13d6c52123 100644
--- a/py/samples/framework-custom-evaluators/src/regex_evaluator.py
+++ b/py/samples/framework-custom-evaluators/src/regex_evaluator.py
@@ -99,7 +99,7 @@ def register_regex_evaluators(ai: Genkit, patterns: list[dict[str, Any]]) -> Non
 
         ai.define_evaluator(
             name=f'byo/{name}',
-            display_name='Regex Match',
+            display_name=f'Regex Match ({name.split("_")[-1]})',
             definition='Runs the output against a regex and responds with 1 if a match is found and 0 otherwise.',
             is_billed=False,
             fn=_regex_eval_fn_factory(regex),

From 3efe7df2739a6bb4efc3eb3f187b14d81cca5240 Mon Sep 17 00:00:00 2001
From: Mengqin Shen
Date: Sun, 15 Feb 2026 00:14:45 -0800
Subject: [PATCH 5/7] fix(py): move static prompt fixtures to a new PR

---
 .../genkit/tests/genkit/blocks/prompt_test.py | 79 ------
 .../genkit/blocks/prompts/badSchemaRef.prompt |  9 ---
 .../blocks/prompts/chat_preamble.prompt       |  5 --
 .../genkit/blocks/prompts/kitchensink.prompt  | 25 ------
 .../tests/genkit/blocks/prompts/output.prompt | 11 ---
 .../genkit/blocks/prompts/schemaRef.prompt    |  9 ---
 .../genkit/blocks/prompts/sub/test.prompt     |  5 --
.../tests/genkit/blocks/prompts/test.prompt | 5 -- .../genkit/blocks/prompts/test.variant.prompt | 6 -- .../genkit/blocks/prompts/toolPrompt.prompt | 6 -- 10 files changed, 160 deletions(-) delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/output.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/test.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt delete mode 100644 py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt diff --git a/py/packages/genkit/tests/genkit/blocks/prompt_test.py b/py/packages/genkit/tests/genkit/blocks/prompt_test.py index ebf13bf0e6..25434bed2a 100644 --- a/py/packages/genkit/tests/genkit/blocks/prompt_test.py +++ b/py/packages/genkit/tests/genkit/blocks/prompt_test.py @@ -854,82 +854,3 @@ async def test_variant_prompt_loading_does_not_recurse() -> None: robot_exec = await prompt(ai.registry, 'recipe', variant='robot') robot_response = await robot_exec({'food': 'pizza'}) assert 'pizza' in robot_response.text - - -@pytest.mark.asyncio -async def test_load_static_prompts() -> None: - """Test loading static prompts from the definitions/prompts directory.""" - ai, *_ = setup_test() - - @ai.tool(name='toolA') - def tool_a() -> str: - return 'toolA' - - @ai.tool(name='toolB') - def tool_b() -> str: - return 'toolB' - - # Path to the static prompts directory - # genkit/tests/genkit/blocks/prompts - prompts_dir = Path(__file__).parent / 'prompts' - - if not prompts_dir.exists(): - pytest.skip(f'Static prompts directory not found at {prompts_dir}') - - load_prompt_folder(ai.registry, prompts_dir) - - # Verify 'test' prompt - test_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'test') - assert test_prompt is not None - - # Verify 'kitchensink' prompt - kitchensink_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'kitchensink') - assert kitchensink_prompt is not None - - # Verify 'toolPrompt' prompt - tool_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'toolPrompt') - assert tool_prompt is not None - - # Verify sub-directory prompt 'sub/test' - sub_test_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'sub/test') - assert sub_test_prompt is not None - - # Verify 'sub/test' rendering - sub_test_response = await sub_test_prompt.arun({}) - sub_test_req = sub_test_response.response - assert sub_test_req.config.temperature == 12 # From config in sub/test.prompt - # Rendered text: "Hello from the sub folder prompt file" - # Default role user. - assert len(sub_test_req.messages) == 1 - assert sub_test_req.messages[0].role == Role.USER - assert sub_test_req.messages[0].content[0].root.text == 'Hello from the sub folder prompt file' - - # Verify 'kitchensink' rendering with input - # kitchensink.prompt has: - # model: googleai/gemini-3-pro-preview - # config: temperature: 11 - # tools: [toolA, toolB] - # output: format: csv, schema: ... 
- # template: {{role "system"}} Hello {{history}} from the prompt file {{ subject }} - - # Use EXECUTABLE_PROMPT to verify model and other generation options - kitchensink_executable = await ai.registry.resolve_action(ActionKind.EXECUTABLE_PROMPT, 'kitchensink') - assert kitchensink_executable is not None - - kitchensink_response = await kitchensink_executable.arun({'subject': 'banana'}) - req = kitchensink_response.response - - assert req.model == 'googleai/gemini-3-pro-preview' - assert req.config.temperature == 11 - assert req.output.format == 'csv' - # Tools should be listed - assert 'toolA' in req.tools - assert 'toolB' in req.tools - - # Verify messages structure - # Expected: System message " Hello " and maybe another message - assert len(req.messages) > 0 - assert req.messages[0].role == Role.SYSTEM - assert 'Hello' in req.messages[0].content[0].root.text - # Check for the subject substitution - assert any('banana' in m.content[0].root.text for m in req.messages) diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt deleted file mode 100644 index 2eaca4930b..0000000000 --- a/py/packages/genkit/tests/genkit/blocks/prompts/badSchemaRef.prompt +++ /dev/null @@ -1,9 +0,0 @@ ---- -model: googleai/gemini-2.5-flash -input: - schema: badSchemaRef1 -output: - schema: badSchemaRef2 ---- - -doesn't matter diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt deleted file mode 100644 index 31a580ac80..0000000000 --- a/py/packages/genkit/tests/genkit/blocks/prompts/chat_preamble.prompt +++ /dev/null @@ -1,5 +0,0 @@ ---- -config: - version: 'abc' ---- -hi {{ name }} from template diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt deleted file mode 100644 index 124680990e..0000000000 --- a/py/packages/genkit/tests/genkit/blocks/prompts/kitchensink.prompt +++ /dev/null @@ -1,25 +0,0 @@ ---- -model: googleai/gemini-3-pro-preview -description: a description -config: - temperature: 11 -tools: - - toolA - - toolB -returnToolRequests: true -input: - schema: - subject: string -output: - format: csv - schema: - obj?(object, a nested object): - nest1?: string - arr(array, array of objects): - nest2?: boolean -maxTurns: 77 -toolChoice: required -metadata: - foo: bar ---- -{{role "system"}} Hello {{history}} from the prompt file {{ subject }} diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt deleted file mode 100644 index 8471ff0edb..0000000000 --- a/py/packages/genkit/tests/genkit/blocks/prompts/output.prompt +++ /dev/null @@ -1,11 +0,0 @@ ---- -model: staticResponseModel -input: - schema: - name: string -output: - schema: - bar: string ---- - -Hi {{ name }} diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt deleted file mode 100644 index 98d154d3ca..0000000000 --- a/py/packages/genkit/tests/genkit/blocks/prompts/schemaRef.prompt +++ /dev/null @@ -1,9 +0,0 @@ ---- -model: googleai/gemini-2.5-flash -input: - schema: myInputSchema -output: - schema: myOutputSchema ---- - -Write a poem about {{foo}}. 
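The removed fixtures also pinned down folder namespacing: prompts in nested directories register under slash-separated names, so prompts/sub/test.prompt resolves as 'sub/test'. Continuing the sketch above, with the same assumed helpers:

```python
# Continuing the sketch above: nested prompt files are namespaced by folder,
# e.g. prompts/sub/test.prompt registers as 'sub/test'.
sub_prompt = await ai.registry.resolve_action(ActionKind.PROMPT, 'sub/test')
rendered = await sub_prompt.arun({})
assert rendered.response.config.temperature == 12  # from sub/test.prompt config
```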
diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt
deleted file mode 100644
index 42237b4b79..0000000000
--- a/py/packages/genkit/tests/genkit/blocks/prompts/sub/test.prompt
+++ /dev/null
@@ -1,5 +0,0 @@
----
-config:
-  temperature: 12
----
-Hello from the sub folder prompt file
\ No newline at end of file
diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt
deleted file mode 100644
index 63276f208e..0000000000
--- a/py/packages/genkit/tests/genkit/blocks/prompts/test.prompt
+++ /dev/null
@@ -1,5 +0,0 @@
----
-config:
-  temperature: 11
----
-Hello from the prompt file
diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt
deleted file mode 100644
index b307d7e53f..0000000000
--- a/py/packages/genkit/tests/genkit/blocks/prompts/test.variant.prompt
+++ /dev/null
@@ -1,6 +0,0 @@
----
-description: a prompt variant in a file
-config:
-  temperature: 13
----
-Hello from a variant of the hello prompt
diff --git a/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt b/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt
deleted file mode 100644
index 2c598b5450..0000000000
--- a/py/packages/genkit/tests/genkit/blocks/prompts/toolPrompt.prompt
+++ /dev/null
@@ -1,6 +0,0 @@
----
-description: prompt in a file
-tools:
-  - agentA
----
-{{ role "system" }} {{ @state.name }} toolPrompt prompt

From bd99923536591764ca5e0bde159451cfe6e0c35a Mon Sep 17 00:00:00 2001
From: Mengqin Shen
Date: Sun, 15 Feb 2026 00:38:28 -0800
Subject: [PATCH 6/7] fix(py): update pyproject and fix the pyrefly missing-import errors in the framework-custom-evaluators

---
 py/pyproject.toml | 5 +++--
 releasekit.toml   | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/py/pyproject.toml b/py/pyproject.toml
index 1723dca45a..4c8cbb1143 100644
--- a/py/pyproject.toml
+++ b/py/pyproject.toml
@@ -426,7 +426,7 @@ pyasn1 = "0.6.2" # BSD-2-Clause
 [tool.ty.src]
 # Auto-generated protobuf stubs use grpc.experimental implicit submodule
 # access that ty warns about. We can't modify generated code.
-exclude = ["**/generated"] +exclude = ["**/generated", "samples/web-endpoints-hello"] [tool.ty.environment] root = [ @@ -458,7 +458,7 @@ root = [ ".", # For samples.shared imports "samples/framework-evaluator-demo", # For evaluator_demo package imports "samples/framework-restaurant-demo/src", # For restaurant demo sample imports - "samples/web-endpoints-hello", # For src imports in tests + "samples/framework-custom-evaluators", # For custom evaluators sample imports "plugins/mcp/tests", # For fakes module imports in tests # Tools "tools/releasekit/src", # For releasekit package imports @@ -508,6 +508,7 @@ extraPaths = [ "plugins/ollama/src", "plugins/vertex-ai/src", "plugins/xai/src", + "samples/framework-custom-evaluators", # Tools "tools/releasekit/src", "tools/conform/src", diff --git a/releasekit.toml b/releasekit.toml index 35fc631c95..84bc0e3107 100644 --- a/releasekit.toml +++ b/releasekit.toml @@ -144,6 +144,7 @@ internal_tools = [ samples = [ "dev-local-vectorstore-hello", "framework-context-demo", + "framework-custom-evaluators", "framework-dynamic-tools-demo", "framework-evaluator-demo", "framework-format-demo", From 8161a583e6c56c4bdf0d911fbcc37d46ee6e0fa7 Mon Sep 17 00:00:00 2001 From: Mengqin Shen Date: Sun, 15 Feb 2026 01:10:45 -0800 Subject: [PATCH 7/7] fix(py): update pyproject to fix Ty check errors in new sample --- py/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/py/pyproject.toml b/py/pyproject.toml index 4c8cbb1143..28af642ccc 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -577,6 +577,7 @@ search-path = [ ".", "plugins/mcp/tests", "samples/framework-evaluator-demo", + "samples/framework-custom-evaluators", "samples/framework-restaurant-demo/src", "samples/web-endpoints-hello",