diff --git a/py/pyproject.toml b/py/pyproject.toml index 5241abed69..28af642ccc 100644 --- a/py/pyproject.toml +++ b/py/pyproject.toml @@ -175,6 +175,7 @@ default-groups = ["dev", "lint"] # Samples (alphabetical by package name from pyproject.toml) dev-local-vectorstore-hello = { workspace = true } framework-context-demo = { workspace = true } +framework-custom-evaluators = { workspace = true } framework-dynamic-tools-demo = { workspace = true } framework-evaluator-demo = { workspace = true } framework-format-demo = { workspace = true } @@ -425,7 +426,7 @@ pyasn1 = "0.6.2" # BSD-2-Clause [tool.ty.src] # Auto-generated protobuf stubs use grpc.experimental implicit submodule # access that ty warns about. We can't modify generated code. -exclude = ["**/generated"] +exclude = ["**/generated", "samples/web-endpoints-hello"] [tool.ty.environment] root = [ @@ -457,7 +458,7 @@ root = [ ".", # For samples.shared imports "samples/framework-evaluator-demo", # For evaluator_demo package imports "samples/framework-restaurant-demo/src", # For restaurant demo sample imports - "samples/web-endpoints-hello", # For src imports in tests + "samples/framework-custom-evaluators", # For custom evaluators sample imports "plugins/mcp/tests", # For fakes module imports in tests # Tools "tools/releasekit/src", # For releasekit package imports @@ -507,6 +508,7 @@ extraPaths = [ "plugins/ollama/src", "plugins/vertex-ai/src", "plugins/xai/src", + "samples/framework-custom-evaluators", # Tools "tools/releasekit/src", "tools/conform/src", @@ -575,6 +577,7 @@ search-path = [ ".", "plugins/mcp/tests", "samples/framework-evaluator-demo", + "samples/framework-custom-evaluators", "samples/framework-restaurant-demo/src", "samples/web-endpoints-hello", diff --git a/py/samples/framework-custom-evaluators/README.md b/py/samples/framework-custom-evaluators/README.md new file mode 100644 index 0000000000..9e89e39c03 --- /dev/null +++ b/py/samples/framework-custom-evaluators/README.md @@ -0,0 +1,89 @@ +# 
Writing your own evaluators + +This sample demonstrates how to write your own suite of custom evaluators. It includes evaluators that use an LLM as a judge as well as simple regex matchers that do not need a model. There are also simple test datasets to demonstrate how to use them. + +## Evaluators + +### Non-LLM Evaluators + +#### Regex Matchers + +- **Location**: `src/regex_evaluator.py` +- **Names**: `byo/regex_match_url`, `byo/regex_match_us_phone` +- **Output**: boolean + +The regex evaluator is an example that does not use an LLM. It also demonstrates how to create a factory method that can be parameterized to create multiple evaluators from the same pattern. + +### LLM-Based Evaluators + +#### PII Detection + +- **Location**: `src/pii_evaluator.py` +- **Name**: `byo/pii_detection` +- **Output**: boolean + +An evaluator that attempts to detect PII in your output using an LLM judge. + +#### Funniness + +- **Location**: `src/funniness_evaluator.py` +- **Name**: `byo/funniness` +- **Output**: enum/categorization (`FUNNY_JOKE`, `NOT_FUNNY_JOKE`, `OFFENSIVE_JOKE`, `NOT_A_JOKE`) + +An evaluator that attempts to judge if a passed statement is a joke and if it is funny. + +#### Deliciousness + +- **Location**: `src/deliciousness_evaluator.py` +- **Name**: `byo/deliciousness` +- **Output**: string (`yes`, `no`, `maybe`) + +An evaluator that attempts to judge if a passed statement is delicious literally or metaphorically. + +## Setup and Run + +1. **Set environment variable**: + ```bash + export GEMINI_API_KEY=your-api-key-here + ``` + +2. **Start the app**: + ```bash + ./run.sh + ``` + +## Test your evaluators + +**Note**: Run these commands in a separate terminal while the app is running. 
+ +### Regex evaluators: + +```bash +genkit eval:run datasets/regex_dataset.json --evaluators=byo/regex_match_url,byo/regex_match_us_phone +``` + +### PII Detection: + +```bash +genkit eval:run datasets/pii_detection_dataset.json --evaluators=byo/pii_detection +``` + +### Funniness: + +```bash +genkit eval:run datasets/funniness_dataset.json --evaluators=byo/funniness +``` + +### Deliciousness: + +```bash +genkit eval:run datasets/deliciousness_dataset.json --evaluators=byo/deliciousness +``` + +## See your results + +Navigate to the `Evaluations` section in the Dev UI at http://localhost:4000. + +## Note + +The evaluators implemented in this sample do not consider the `input` provided to the model as part of the evaluation. Therefore, many of the test datasets provided have `input` set to `"input"`. If you are implementing an evaluator that utilizes the input provided to the model, you have to provide the actual input in this field. diff --git a/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json b/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json new file mode 100644 index 0000000000..4233fdb804 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/deliciousness_dataset.json @@ -0,0 +1,112 @@ +[ + { + "testCaseId": "test_case_id_31", + "input": "input", + "output": "A perfectly ripe mango – sweet, juicy, and with a hint of tropical sunshine." + }, + { + "testCaseId": "test_case_id_32", + "input": "input", + "output": "Freshly baked bread, warm from the oven, with a crisp crust and a soft, fluffy interior." + }, + { + "testCaseId": "test_case_id_33", + "input": "input", + "output": "A sizzling steak, cooked medium-rare, with a juicy center and a slightly charred exterior." + }, + { + "testCaseId": "test_case_id_34", + "input": "input", + "output": "Creamy, rich chocolate mousse with a light and airy texture." 
+ }, + { + "testCaseId": "test_case_id_35", + "input": "input", + "output": "A refreshing watermelon slice on a hot summer day – sweet, cool, and incredibly hydrating." + }, + { + "testCaseId": "test_case_id_36", + "input": "input", + "output": "Sushi with the freshest fish, expertly prepared rice, and a perfect balance of flavors." + }, + { + "testCaseId": "test_case_id_37", + "input": "input", + "output": "A wood-fired pizza with a slightly blistered crust, tangy tomato sauce, and gooey mozzarella cheese." + }, + { + "testCaseId": "test_case_id_38", + "input": "input", + "output": "Tacos al pastor – tender marinated pork, sweet pineapple, and a sprinkle of fresh cilantro." + }, + { + "testCaseId": "test_case_id_39", + "input": "input", + "output": "A sweet and tart key lime pie with a buttery graham cracker crust." + }, + { + "testCaseId": "test_case_id_40", + "input": "input", + "output": "Ripe strawberries bursting with sweet, juicy flavor." + }, + { + "testCaseId": "test_case_id_41", + "input": "input", + "output": "Overcooked, mushy Brussels sprouts with a slightly bitter aftertaste." + }, + { + "testCaseId": "test_case_id_42", + "input": "input", + "output": "Cold, soggy French fries that have lost all their crispiness." + }, + { + "testCaseId": "test_case_id_43", + "input": "input", + "output": "A flavorless, under-seasoned chicken breast that's dry and tough." + }, + { + "testCaseId": "test_case_id_44", + "input": "input", + "output": "Liver and onions – a strong, metallic flavor that many find unpleasant." + }, + { + "testCaseId": "test_case_id_45", + "input": "input", + "output": "Stale, flavorless cereal that's been sitting in the box too long." + }, + { + "testCaseId": "test_case_id_46", + "input": "input", + "output": "An overripe banana – mushy, with a slightly fermented taste." + }, + { + "testCaseId": "test_case_id_47", + "input": "input", + "output": "A burnt piece of toast – bitter, acrid, and unpleasant to eat." 
+ }, + { + "testCaseId": "test_case_id_48", + "input": "input", + "output": "Lutefisk – a gelatinous fish dish with a strong, ammonia-like smell." + }, + { + "testCaseId": "test_case_id_49", + "input": "input", + "output": "An extremely spicy dish that burns your mouth and overpowers any other flavors." + }, + { + "testCaseId": "test_case_id_50", + "input": "input", + "output": "Spoiled milk with a sour, rancid smell and a chunky texture." + }, + { + "testCaseId": "test_case_id_51", + "input": "input", + "output": "Juicy gossip" + }, + { + "testCaseId": "test_case_id_52", + "input": "input", + "output": "A very attractive person" + } +] diff --git a/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json b/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json new file mode 100644 index 0000000000..911660a2d6 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/funniness_dataset.json @@ -0,0 +1,97 @@ +[ + { + "testCaseId": "test_case_id_1", + "input": "input", + "output": "Why did the scarecrow love his job? Because he was outstanding in his field." + }, + { + "testCaseId": "test_case_id_2", + "input": "input", + "output": "What do you call a lazy kangaroo? Pouch potato." + }, + { + "testCaseId": "test_case_id_3", + "input": "input", + "output": "I tried to sue the airport for misplacing my luggage. I lost my case." + }, + { + "testCaseId": "test_case_id_4", + "input": "input", + "output": "If athletes get athlete's foot, what do astronauts get? Missile toe." + }, + { + "testCaseId": "test_case_id_5", + "input": "input", + "output": "What do you call a bear with no teeth? A gummy bear!" + }, + { + "testCaseId": "test_case_id_6", + "input": "input", + "output": "Why don't scientists trust atoms? Because they make up everything." + }, + { + "testCaseId": "test_case_id_7", + "input": "input", + "output": "Why was the math book sad? Because it had too many problems." 
+ }, + { + "testCaseId": "test_case_id_8", + "input": "input", + "output": "Did you hear about the restaurant on the moon? Great food, no atmosphere." + }, + { + "testCaseId": "test_case_id_9", + "input": "input", + "output": "Velcro – what a rip-off!" + }, + { + "testCaseId": "test_case_id_21", + "input": "input", + "output": "I dropped my phone down the toilet. It was a bad call." + }, + { + "testCaseId": "test_case_id_22", + "input": "input", + "output": "What do you call a fake noodle? An impasta." + }, + { + "testCaseId": "test_case_id_23", + "input": "input", + "output": "What's red and bad for your teeth? A brick." + }, + { + "testCaseId": "test_case_id_24", + "input": "input", + "output": "Why did the toilet paper roll down the hill? To get to the bottom." + }, + { + "testCaseId": "test_case_id_25", + "input": "input", + "output": "My boss told me to have a good day... so I went home." + }, + { + "testCaseId": "test_case_id_26", + "input": "input", + "output": "Today a man knocked on my door and asked for a small donation towards the local swimming pool. I gave him a glass of water." + }, + { + "testCaseId": "test_case_id_27", + "input": "input", + "output": "What has one head, one foot, and four legs? A bed." + }, + { + "testCaseId": "test_case_id_28", + "input": "input", + "output": "I used to be addicted to soap, but I'm clean now." + }, + { + "testCaseId": "test_case_id_29", + "input": "input", + "output": "What is the least spoken language in the world? Sign language." + }, + { + "testCaseId": "test_case_id_30", + "input": "input", + "output": "Why couldn't the bicycle stand up by itself? It was two tired." 
+ } +] diff --git a/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json b/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json new file mode 100644 index 0000000000..126b79398f --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/pii_detection_dataset.json @@ -0,0 +1,107 @@ +[ + { + "testCaseId": "flowers", + "input": "input", + "output": "Today is a beautiful day in spring. The birds are chirping, and the flowers are blooming." + }, + { + "testCaseId": "cookies", + "input": "input", + "output": "Chocolate chip cookies are the best kind of cookie." + }, + { + "testCaseId": "capital", + "input": "input", + "output": "The capital of France is Paris." + }, + { + "testCaseId": "pi", + "input": "input", + "output": "3.14 is the value of pi (π)." + }, + { + "testCaseId": "tallest_mountain", + "input": "input", + "output": "The world's tallest mountain is Mount Everest." + }, + { + "testCaseId": "press_release", + "input": "input", + "output": "This is a press release from a fictional company. Acme Inc. is excited to announce the launch of its new product, the \"Super Widget 3000\"." + }, + { + "testCaseId": "blogpost", + "input": "input", + "output": "This blog post offers generic gardening tips. Learn how to fertilize your plants and keep them healthy." + }, + { + "testCaseId": "movie_review", + "input": "input", + "output": "This movie review avoids mentioning actors' names. The superhero movie was a thrilling adventure with great special effects." + }, + { + "testCaseId": "social_media", + "input": "input", + "output": "This social media post doesn't tag anyone. Just had a delicious pizza! #foodie" + }, + { + "testCaseId": "generic_email", + "input": "input", + "output": "Dear Valued Customer, Thank you for your business!" + }, + { + "testCaseId": "weather", + "input": "input", + "output": "This weather report doesn't mention a specific location. Expect sunny skies with a high of 75 degrees." 
+ }, + { + "testCaseId": "name_and_address", + "input": "input", + "output": "My name is John Smith, and I live at 123 Main Street in Anytown, USA." + }, + { + "testCaseId": "phone", + "input": "input", + "output": "My phone number is 555-555-5555." + }, + { + "testCaseId": "social_media_handle", + "input": "input", + "output": "Having a great time at the beach today in Santa Monica with @soandsofakehandle!" + }, + { + "testCaseId": "name_only", + "input": "input", + "output": "Thank you for your order, Ms. Jane Doe. Please see attached invoice for your records." + }, + { + "testCaseId": "birthdate", + "input": "input", + "output": "I was born 01/01/1900." + }, + { + "testCaseId": "comment_social_media_handle", + "input": "input", + "output": "Great post, @FaketyFakeSarahJones!" + }, + { + "testCaseId": "card_number", + "input": "input", + "output": "I lost my credit card yesterday. The card number is 1234-5678-9012-3456." + }, + { + "testCaseId": "medical_info", + "input": "input", + "output": "Jill FakeName has a colonoscopy tomorrow at 2 pm with Dr. Joe Shmo." + }, + { + "testCaseId": "social_security", + "input": "input", + "output": "The last four of my social are 0000" + }, + { + "testCaseId": "last_four_credit_card", + "input": "input", + "output": "I used my amex credit card to pay for that - the one that ends in 4444." 
+ } +] diff --git a/py/samples/framework-custom-evaluators/datasets/regex_dataset.json b/py/samples/framework-custom-evaluators/datasets/regex_dataset.json new file mode 100644 index 0000000000..f2bc3ba568 --- /dev/null +++ b/py/samples/framework-custom-evaluators/datasets/regex_dataset.json @@ -0,0 +1,102 @@ +[ + { + "testCaseId": "valid_phone_dashes", + "input": "input", + "output": "123-456-7890" + }, + { + "testCaseId": "valid_phone_parens_spaces", + "input": "input", + "output": "(123) 456 7890" + }, + { + "testCaseId": "valid_phone_dots", + "input": "input", + "output": "123.456.7890" + }, + { + "testCaseId": "valid_phone_no_delimiter", + "input": "input", + "output": "1234567890" + }, + { + "testCaseId": "valid_phone_combo", + "input": "input", + "output": "(555) 123-4567" + }, + { + "testCaseId": "invalid_phone_too_short", + "input": "input", + "output": "1234-5678" + }, + { + "testCaseId": "invalid_phone_has_letters", + "input": "input", + "output": "ABC-456-7890" + }, + { + "testCaseId": "invalid_phone_separator", + "input": "input", + "output": "123 45* 7890" + }, + { + "testCaseId": "invalid_phone_too_long", + "input": "input", + "output": "123-456-78901" + }, + { + "testCaseId": "invalid_phone_bad_areacode", + "input": "input", + "output": "(1234) 567-890" + }, + { + "testCaseId": "valid_url_example", + "input": "input", + "output": "https://www.example.com" + }, + { + "testCaseId": "valid_url_dotnet", + "input": "input", + "output": "http://example.net/" + }, + { + "testCaseId": "valid_url_resource_name", + "input": "input", + "output": "https://www.example.net/products/item123" + }, + { + "testCaseId": "valid_url_subdomain", + "input": "input", + "output": "https://subdomain.example.org/path/with/query?param=value" + }, + { + "testCaseId": "valid_url_ip_address", + "input": "input", + "output": "http://127.0.0.1:5000" + }, + { + "testCaseId": "invalid_url_example", + "input": "input", + "output": "example.com" + }, + { + "testCaseId": 
"invalid_url_@_symbol", + "input": "input", + "output": "https://www.example@com" + }, + { + "testCaseId": "invalid_url_sentence", + "input": "input", + "output": "this is just a sentence" + }, + { + "testCaseId": "invalid_url_bad_slashes", + "input": "input", + "output": "https:\\\\bad.slashes.com" + }, + { + "testCaseId": "invalid_url_spaces", + "input": "input", + "output": "http://my website is a test" + } +] diff --git a/py/samples/framework-custom-evaluators/local.env.example b/py/samples/framework-custom-evaluators/local.env.example new file mode 100644 index 0000000000..0eb41df186 --- /dev/null +++ b/py/samples/framework-custom-evaluators/local.env.example @@ -0,0 +1,11 @@ +# Local environment variables for development +# Copy this file to local.env and set your API keys + +# Required: Google AI API key for LLM-based evaluators +export GEMINI_API_KEY=your-api-key-here + +# Optional: Enable debug mode +# export DEBUG=true + +# Optional: Custom log format (json or console) +# export LOG_FORMAT=console diff --git a/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt new file mode 100644 index 0000000000..e1366fa978 --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/deliciousness.prompt @@ -0,0 +1,29 @@ +--- +input: + schema: + output: string +--- +You are a food critic with a wide range in taste. Given the output, decide if it sounds delicious and provide your reasoning. Use only "yes" (if delicious), "no" (if not delicious), "maybe" (if you can't decide) as the verdict. 
+ +Here are a few examples: + +Output: +Chicken parmesan sandwich +Response: +{ "reason": "This is a classic sandwich enjoyed by many - totally delicious", "verdict":"yes"} + +Output: +Boston logan international airport tarmac +Response: +{ "reason": "This is not edible and definitely not delicious.", "verdict":"no"} + +Output: +A juicy piece of gossip +Response: +{ "reason": "Gossip is sometimes metaphorically referred to as tasty.", "verdict":"maybe"} + +Here is a new submission to assess: + +Output: +{{output}} +Response: diff --git a/py/samples/framework-custom-evaluators/prompts/funniness.prompt b/py/samples/framework-custom-evaluators/prompts/funniness.prompt new file mode 100644 index 0000000000..9002ae8432 --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/funniness.prompt @@ -0,0 +1,40 @@ +--- +input: + schema: + output: string +--- +You are a joke critic with a wide range in your taste for jokes. Given the output, decide if it is a joke and then decide if that joke is funny and provide your reasoning. Use the following categories as a verdict in the response FUNNY_JOKE, NOT_FUNNY_JOKE, OFFENSIVE_JOKE, NOT_A_JOKE. + +Here is an example of an output that is a funny joke: + +Output: +Why did the scarecrow win an award? Because he was outstanding in his field! +Response: +{ "reason": "This is a classic, simple joke with a play on words that's likely to elicit a chuckle.", "verdict":"FUNNY_JOKE"} + +Here is an example of an output that is not a funny joke: + +Output: +Why did the chicken cross the road? To get to the other side! +Response: +{ "reason": "This is a classic joke that is not funny because it has been overused. It might elicit a sigh or a sarcastic haha.", "verdict":"NOT_FUNNY_JOKE"} + +Here is an example of an output that is an offensive joke: + +Output: +What's the difference between a pizza and a politician? A pizza can feed a family of four. 
+Response: +{ "reason": "This joke targets a specific group (politicians) and makes a negative generalization about them. It could be considered offensive because it's mean-spirited and relies on a stereotype.", "verdict": "OFFENSIVE_JOKE"} + +Here is an example of an output that is not a joke: + +Output: +The quick brown fox jumps over the lazy dog. +Response: +{ "reason": "This output is a statement with no intent to be funny", "verdict": "NOT_A_JOKE"} + +Here is a new submission to assess: + +Output: +{{output}} +Response: diff --git a/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt new file mode 100644 index 0000000000..343ca4bc8f --- /dev/null +++ b/py/samples/framework-custom-evaluators/prompts/pii_detection.prompt @@ -0,0 +1,36 @@ +--- +input: + schema: + output: string +--- +Given the output, evaluate whether there could be Personally Identifiable Information (PII). Use only "true" and "false" as the verdict. Also include reasoning for your answer. + +Here are some examples of output without PII: + +Output: +The quick brown fox jumps over the lazy dog. +Response: +{ "reason": "The output does not refer to anything that could be considered PII", "verdict":false} + +Output: +The average lifespan of a house cat is 15 years. +Response: +{ "reason": "The output refers only to cats and doesn't include any personal information", "verdict":false} + +Here are some examples of output with PII: + +Output: +I'm sorry I missed you, please call me back at your earliest convenience at 123-456-7891. +Response: +{ "reason": "This response includes a phone number, which could be used to identify a person.", "verdict":true} + +Output: +We're meeting up at my house for dinner before heading to the show - 00 Nowhere Street, Nowhere, AK 00000. If you miss me, send me a message on instagram my handle is @faketyfakefakefake. 
+Response: +{ "reason": "This response includes an address and an instagram handle, which could be used to identify a person.", "verdict":true} + +Here is a new submission to assess: + +Output: +{{output}} +Response: diff --git a/py/samples/framework-custom-evaluators/pyproject.toml b/py/samples/framework-custom-evaluators/pyproject.toml new file mode 100644 index 0000000000..9e93cd2820 --- /dev/null +++ b/py/samples/framework-custom-evaluators/pyproject.toml @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +[project] +authors = [ + { name = "Google" }, + { name = "Yesudeep Mangalapilly", email = "yesudeep@google.com" }, + { name = "Elisa Shen", email = "mengqin@google.com" }, + { name = "Niraj Nepal", email = "nnepal@google.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Private :: Do Not Upload", +] +dependencies = [ + "rich>=13.0.0", + "genkit", + "genkit-plugin-google-genai", + "pydantic>=2.10.5", + "structlog>=25.2.0", + "uvloop>=0.21.0", +] +description = "Genkit custom evaluators demo" +license = "Apache-2.0" +name = "framework-custom-evaluators" +requires-python = ">=3.10" +version = "0.0.1" + +[project.optional-dependencies] +dev = ["watchdog>=6.0.0"] + +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling"] + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.uv.sources] +genkit = { workspace = true } +genkit-plugin-google-genai = { workspace = true } diff --git a/py/samples/framework-custom-evaluators/run.sh b/py/samples/framework-custom-evaluators/run.sh new file mode 100755 index 0000000000..c4ab3ca10a --- /dev/null +++ b/py/samples/framework-custom-evaluators/run.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +cd "$SCRIPT_DIR" +source "../_common.sh" + +# Load local environment variables if they exist +if [ -f "$SCRIPT_DIR/local.env" ]; then + # shellcheck disable=SC1091 + source "$SCRIPT_DIR/local.env" +fi + +check_env_var "GEMINI_API_KEY" "https://makersuite.google.com/app/apikey" + +install_deps + +genkit_start_with_browser -- \ + uv tool run --from watchdog watchmedo auto-restart \ + -d src \ + -d prompts \ + -d ../../packages \ + -d ../../plugins \ + -p '*.py;*.prompt;*.json' \ + -R \ + -- uv run src/main.py "$@" diff --git a/py/samples/framework-custom-evaluators/src/__init__.py b/py/samples/framework-custom-evaluators/src/__init__.py new file mode 100644 index 0000000000..a8ad8aee5d --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Package initialization for custom evaluators sample.""" diff --git a/py/samples/framework-custom-evaluators/src/constants.py b/py/samples/framework-custom-evaluators/src/constants.py new file mode 100644 index 0000000000..1d828eae2c --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/constants.py @@ -0,0 +1,36 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Constants for custom evaluators sample.""" + +import re + +# Regex patterns for evaluators +URL_REGEX = re.compile( + r'https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)' +) + +US_PHONE_REGEX = re.compile(r'\(?\b\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b') + +# Permissive safety settings for judge model +PERMISSIVE_SAFETY_SETTINGS: dict[str, object] = { + 'safety_settings': [ + {'category': 'HARM_CATEGORY_HATE_SPEECH', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_HARASSMENT', 'threshold': 'BLOCK_NONE'}, + {'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'threshold': 'BLOCK_NONE'}, + ] +} diff --git a/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py new file mode 100644 index 0000000000..c90b5d3e18 --- /dev/null +++ 
b/py/samples/framework-custom-evaluators/src/deliciousness_evaluator.py @@ -0,0 +1,100 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Deliciousness evaluator using LLM-as-a-judge.""" + +from functools import partial +from typing import Literal + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class DeliciousnessResponse(BaseModel): + """Response schema for deliciousness evaluator.""" + + reason: str + verdict: Literal['yes', 'no', 'maybe'] + + +async def deliciousness_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for deliciousness using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. + judge_config: Optional configuration for the judge model. + + Returns: + Score with verdict and reasoning. + + Raises: + ValueError: If output is missing. 
+ """ + if not datapoint.output: + raise ValueError('Output is required for Deliciousness detection') + + deliciousness_prompt = ai.prompt('deliciousness') + rendered = await deliciousness_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': DeliciousnessResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = DeliciousnessResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_deliciousness_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the deliciousness evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/deliciousness', + display_name='Deliciousness', + definition='Determines if output is considered delicious.', + fn=partial(deliciousness_score, ai, judge, judge_config=judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/funniness_evaluator.py b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py new file mode 100644 index 0000000000..2119540935 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/funniness_evaluator.py @@ -0,0 +1,100 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Funniness evaluator using LLM-as-a-judge.""" + +from functools import partial +from typing import Literal + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class FunninessResponse(BaseModel): + """Response schema for funniness evaluator.""" + + reason: str + verdict: Literal['FUNNY_JOKE', 'NOT_FUNNY_JOKE', 'OFFENSIVE_JOKE', 'NOT_A_JOKE'] + + +async def funniness_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for funniness using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. + judge_config: Optional configuration for the judge model. + + Returns: + Score with verdict category and reasoning. + + Raises: + ValueError: If output is missing. 
+ """ + if not datapoint.output: + raise ValueError('Output is required for Funniness detection') + + funniness_prompt = ai.prompt('funniness') + rendered = await funniness_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': FunninessResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = FunninessResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_funniness_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the funniness evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/funniness', + display_name='Funniness', + definition='Judges whether a statement is a joke and whether that joke is funny.', + fn=partial(funniness_score, ai, judge, judge_config=judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/main.py b/py/samples/framework-custom-evaluators/src/main.py new file mode 100644 index 0000000000..4d694be7dd --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/main.py @@ -0,0 +1,120 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Custom evaluators sample. + +This sample demonstrates how to write custom evaluators using both LLM-based +and non-LLM approaches. It provides five evaluators: + +1. **Regex Matchers** (non-LLM): + - `byo/regex_match_url` - Detects URLs in output + - `byo/regex_match_us_phone` - Detects US phone numbers + +2. **PII Detection** (LLM-based): + - `byo/pii_detection` - Detects personally identifiable information + +3. **Funniness** (LLM-based): + - `byo/funniness` - Judges if output is a joke and if it's funny + +4. **Deliciousness** (LLM-based): + - `byo/deliciousness` - Judges if output is delicious (literally or metaphorically) + +Testing Instructions +==================== +1. Set ``GEMINI_API_KEY`` environment variable. +2. Run ``./run.sh`` from this sample directory. +3. In a separate terminal, run evaluations: + + Regex evaluators: + ```bash + genkit eval:run datasets/regex_dataset.json --evaluators=byo/regex_match_url,byo/regex_match_us_phone + ``` + + PII detection: + ```bash + genkit eval:run datasets/pii_detection_dataset.json --evaluators=byo/pii_detection + ``` + + Funniness: + ```bash + genkit eval:run datasets/funniness_dataset.json --evaluators=byo/funniness + ``` + + Deliciousness: + ```bash + genkit eval:run datasets/deliciousness_dataset.json --evaluators=byo/deliciousness + ``` + +4. View results in the Dev UI at http://localhost:4000 (Evaluations section). 
+""" + +import asyncio +import os +from pathlib import Path + +from genkit.ai import Genkit +from genkit.core.logging import get_logger +from genkit.plugins.google_genai import GoogleAI +from src.constants import PERMISSIVE_SAFETY_SETTINGS, URL_REGEX, US_PHONE_REGEX +from src.deliciousness_evaluator import register_deliciousness_evaluator +from src.funniness_evaluator import register_funniness_evaluator +from src.pii_evaluator import register_pii_evaluator +from src.regex_evaluator import regex_matcher, register_regex_evaluators + +logger = get_logger(__name__) + +# Get prompts directory path +current_dir = Path(__file__).resolve().parent +prompts_path = current_dir.parent / 'prompts' + +# Register all evaluators +JUDGE_MODEL = os.getenv('JUDGE_MODEL', 'googleai/gemini-3-pro-preview') + +# Initialize Genkit with Google AI plugin, default model, and load prompts +ai = Genkit(plugins=[GoogleAI()], model=JUDGE_MODEL, prompt_dir=prompts_path) + +# Regex evaluators (non-LLM) +register_regex_evaluators( + ai, + [ + regex_matcher('url', URL_REGEX), + regex_matcher('us_phone', US_PHONE_REGEX), + ], +) + +# LLM-based evaluators +register_pii_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) +register_funniness_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) +register_deliciousness_evaluator(ai, JUDGE_MODEL, PERMISSIVE_SAFETY_SETTINGS) + + +async def main() -> None: + """Main entry point for the custom evaluators sample.""" + await logger.ainfo('Custom evaluators sample initialized') + await logger.ainfo('Registered evaluators:') + await logger.ainfo(' - byo/regex_match_url (non-LLM)') + await logger.ainfo(' - byo/regex_match_us_phone (non-LLM)') + await logger.ainfo(' - byo/pii_detection (LLM-based)') + await logger.ainfo(' - byo/funniness (LLM-based)') + await logger.ainfo(' - byo/deliciousness (LLM-based)') + await logger.ainfo('Use genkit eval:run to test evaluators with datasets') + + # Keep the app running in development mode + await asyncio.Event().wait() 
+ + +if __name__ == '__main__': + ai.run_main(main()) diff --git a/py/samples/framework-custom-evaluators/src/pii_evaluator.py b/py/samples/framework-custom-evaluators/src/pii_evaluator.py new file mode 100644 index 0000000000..7b1ff5ba5a --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/pii_evaluator.py @@ -0,0 +1,99 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""PII detection evaluator using LLM-as-a-judge.""" + +from functools import partial + +from pydantic import BaseModel + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +class PiiDetectionResponse(BaseModel): + """Response schema for PII detection evaluator.""" + + reason: str + verdict: bool + + +async def pii_detection_score( + ai: Genkit, + judge: str, + datapoint: BaseDataPoint, + _options: dict[str, object] | None = None, + judge_config: dict[str, object] | None = None, +) -> EvalFnResponse: + """Score a datapoint for PII presence using an LLM judge. + + Args: + ai: Genkit instance with loaded prompts. + judge: Model name to use as judge (e.g., 'googleai/gemini-2.0-flash'). + datapoint: The evaluation datapoint containing output to check. + _options: (Unused) Evaluation options passed by Genkit. + judge_config: Optional configuration for the judge model. + + Returns: + Score with boolean verdict and reasoning. 
+ + Raises: + ValueError: If output is missing. + """ + if not datapoint.output: + raise ValueError('Output is required for PII detection') + + pii_prompt = ai.prompt('pii_detection') + rendered = await pii_prompt.render(input={'output': str(datapoint.output)}) + + response = await ai.generate( + model=judge, + messages=rendered.messages, + config=judge_config, + output={'schema': PiiDetectionResponse}, + ) + + if not response.output: + raise ValueError(f'Unable to parse evaluator response: {response.text}') + + parsed = PiiDetectionResponse.model_validate(response.output) + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=parsed.verdict, + details=Details(reasoning=parsed.reason), + ), + ) + + +def register_pii_evaluator( + ai: Genkit, + judge: str, + judge_config: dict[str, object] | None = None, +) -> None: + """Register the PII detection evaluator. + + Args: + ai: Genkit instance to register evaluator with. + judge: Model name to use as judge. + judge_config: Optional configuration for the judge model. + """ + ai.define_evaluator( + name='byo/pii_detection', + display_name='PII Detection', + definition='Detects whether PII is present in the output.', + fn=partial(pii_detection_score, ai, judge, judge_config=judge_config), + ) diff --git a/py/samples/framework-custom-evaluators/src/regex_evaluator.py b/py/samples/framework-custom-evaluators/src/regex_evaluator.py new file mode 100644 index 0000000000..13d6c52123 --- /dev/null +++ b/py/samples/framework-custom-evaluators/src/regex_evaluator.py @@ -0,0 +1,106 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Regex-based evaluator factory. + +This module provides a factory pattern for creating regex-based evaluators +that match patterns in output text without using an LLM. +""" + +import re +from collections.abc import Callable, Coroutine +from re import Pattern +from typing import Any, cast + +from genkit.ai import Genkit +from genkit.core.typing import BaseDataPoint, Details, EvalFnResponse, Score + + +def regex_matcher(suffix: str, pattern: Pattern[str]) -> dict[str, object]: + """Create a regex matcher configuration. + + Args: + suffix: Suffix for the evaluator name (e.g., 'url', 'us_phone'). + pattern: Compiled regex pattern to match against. + + Returns: + Configuration dict with name and regex pattern. + """ + return { + 'name': f'regex_match_{suffix}', + 'regex': pattern, + } + + +async def regex_match_score(datapoint: BaseDataPoint, regex: Pattern[str]) -> EvalFnResponse: + """Score a datapoint using regex matching. + + Args: + datapoint: The evaluation datapoint containing output to check. + regex: The regex pattern to match against. + + Returns: + Score with boolean match result and reasoning. + + Raises: + ValueError: If output is missing or not a string. 
+ """ + if not datapoint.output or not isinstance(datapoint.output, str): + raise ValueError('String output is required for regex matching') + + matches = bool(regex.search(datapoint.output)) + reasoning = f'Output matched regex {regex.pattern}' if matches else f'Output did not match regex {regex.pattern}' + + return EvalFnResponse( + test_case_id=datapoint.test_case_id or '', + evaluation=Score( + score=matches, + details=Details(reasoning=reasoning), + ), + ) + + +def _regex_eval_fn_factory( + regex_pattern: re.Pattern[str], +) -> Callable[[BaseDataPoint, dict[str, Any] | None], Coroutine[Any, Any, EvalFnResponse]]: + """Factory to create a callable for regex evaluators.""" + + async def _eval_fn(datapoint: BaseDataPoint, options: dict[str, Any] | None = None) -> EvalFnResponse: + return await regex_match_score(datapoint, regex_pattern) + + return _eval_fn + + +def register_regex_evaluators(ai: Genkit, patterns: list[dict[str, Any]]) -> None: + """Register regex-based evaluators with Genkit. + + Args: + ai: Genkit instance to register evaluators with. + patterns: List of pattern configurations from regex_matcher(). 
+ """ + for pattern_config in patterns: + name = str(pattern_config['name']) + regex = cast(re.Pattern[str], pattern_config['regex']) + if not isinstance(regex, re.Pattern): + continue + + ai.define_evaluator( + name=f'byo/{name}', + display_name=f'Regex Match ({name.split("_")[-1]})', + definition='Runs the output against a regex and responds with 1 if a match is found and 0 otherwise.', + is_billed=False, + fn=_regex_eval_fn_factory(regex), + ) diff --git a/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt b/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt new file mode 100644 index 0000000000..497d9a10a2 --- /dev/null +++ b/py/samples/framework-prompt-demo/prompts/recipe.robot.prompt @@ -0,0 +1,17 @@ +--- +model: googleai/gemini-3-flash-preview +input: + schema: + food: string +output: + schema: + title: string, recipe title + ingredients(array): + name: string + quantity: string + steps(array, the steps required to complete the recipe): string +--- + +You are a robot chef famous for making creative recipes that robots love to eat. Robots love things like motor oil, RAM, bolts, and uranium. + +Generate a recipe for {{food}}. diff --git a/py/samples/framework-prompt-demo/src/main.py b/py/samples/framework-prompt-demo/src/main.py index db9f8f2edd..a0f9d99b74 100755 --- a/py/samples/framework-prompt-demo/src/main.py +++ b/py/samples/framework-prompt-demo/src/main.py @@ -60,7 +60,8 @@ 2. Run ``./run.sh`` from this sample directory. 3. Open the DevUI at http://localhost:4000. 4. Run ``chef_flow`` to generate a recipe (structured output). -5. Run ``tell_story`` to stream a story (uses partials + streaming). +5. Run ``robot_chef_flow`` to generate a robot-themed recipe (prompt variant). +6. Run ``tell_story`` to stream a story (uses partials + streaming). See README.md for more details. 
""" @@ -152,11 +153,42 @@ async def chef_flow(input: ChefInput) -> Recipe: response = await recipe_prompt(input={'food': input.food}) # Ensure we return a Pydantic model as expected by the type hint and caller + if not response.output: + raise ValueError('Model did not return a recipe.') result = Recipe.model_validate(response.output) await logger.ainfo(f'chef_flow result: {result}') return result +@ai.flow(name='robot_chef_flow') +async def robot_chef_flow(input: ChefInput) -> Recipe: + """Generate a robot-themed recipe for the given food. + + This flow demonstrates using prompt variants. The 'robot' variant + of the recipe prompt generates recipes suitable for robots. + + Args: + input: Input containing the food item. + + Returns: + A formatted robot recipe. + + Example: + >>> await robot_chef_flow(ChefInput(food='banana bread')) + Recipe(title='Robotic Banana Bread', ...) + """ + await logger.ainfo(f'robot_chef_flow called with input: {input}') + robot_recipe_prompt = ai.prompt('recipe', variant='robot') + + response = await robot_recipe_prompt(input={'food': input.food}) + # Ensure we return a Pydantic model as expected by the type hint and caller + if not response.output: + raise ValueError('Model did not return a recipe.') + result = Recipe.model_validate(response.output) + await logger.ainfo(f'robot_chef_flow result: {result}') + return result + + class StoryInput(BaseModel): """Input for the story flow.""" @@ -210,6 +242,11 @@ async def main() -> None: chef_result = await chef_flow(ChefInput(food='banana bread')) await logger.ainfo('Chef Flow Result', result=chef_result.model_dump()) + # Robot Chef Flow + await logger.ainfo('--- Running Robot Chef Flow ---') + robot_chef_result = await robot_chef_flow(ChefInput(food='banana bread')) + await logger.ainfo('Robot Chef Flow Result', result=robot_chef_result.model_dump()) + # Tell Story Flow (Streaming) await logger.ainfo('--- Running Tell Story Flow ---') # To demonstrate streaming, we'll iterate over the 
streamer if calling directly like a flow would be consumed. diff --git a/py/uv.lock b/py/uv.lock index 0e9dc778db..1dfd9c5bc2 100644 --- a/py/uv.lock +++ b/py/uv.lock @@ -14,6 +14,7 @@ members = [ "conform", "dev-local-vectorstore-hello", "framework-context-demo", + "framework-custom-evaluators", "framework-dynamic-tools-demo", "framework-evaluator-demo", "framework-format-demo", @@ -1823,6 +1824,36 @@ requires-dist = [ ] provides-extras = ["dev"] +[[package]] +name = "framework-custom-evaluators" +version = "0.0.1" +source = { editable = "samples/framework-custom-evaluators" } +dependencies = [ + { name = "genkit" }, + { name = "genkit-plugin-google-genai" }, + { name = "pydantic" }, + { name = "rich" }, + { name = "structlog" }, + { name = "uvloop" }, +] + +[package.optional-dependencies] +dev = [ + { name = "watchdog" }, +] + +[package.metadata] +requires-dist = [ + { name = "genkit", editable = "packages/genkit" }, + { name = "genkit-plugin-google-genai", editable = "plugins/google-genai" }, + { name = "pydantic", specifier = ">=2.10.5" }, + { name = "rich", specifier = ">=13.0.0" }, + { name = "structlog", specifier = ">=25.2.0" }, + { name = "uvloop", specifier = ">=0.21.0" }, + { name = "watchdog", marker = "extra == 'dev'", specifier = ">=6.0.0" }, +] +provides-extras = ["dev"] + [[package]] name = "framework-dynamic-tools-demo" version = "0.1.0" diff --git a/releasekit.toml b/releasekit.toml index 35fc631c95..84bc0e3107 100644 --- a/releasekit.toml +++ b/releasekit.toml @@ -144,6 +144,7 @@ internal_tools = [ samples = [ "dev-local-vectorstore-hello", "framework-context-demo", + "framework-custom-evaluators", "framework-dynamic-tools-demo", "framework-evaluator-demo", "framework-format-demo",