Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ default-groups = ["dev", "lint"]
# Samples (alphabetical by package name from pyproject.toml)
dev-local-vectorstore-hello = { workspace = true }
framework-context-demo = { workspace = true }
framework-custom-evaluators = { workspace = true }
framework-dynamic-tools-demo = { workspace = true }
framework-evaluator-demo = { workspace = true }
framework-format-demo = { workspace = true }
Expand Down Expand Up @@ -425,7 +426,7 @@ pyasn1 = "0.6.2" # BSD-2-Clause
[tool.ty.src]
# Auto-generated protobuf stubs use grpc.experimental implicit submodule
# access that ty warns about. We can't modify generated code.
exclude = ["**/generated"]
exclude = ["**/generated", "samples/web-endpoints-hello"]

[tool.ty.environment]
root = [
Expand Down Expand Up @@ -457,7 +458,7 @@ root = [
".", # For samples.shared imports
"samples/framework-evaluator-demo", # For evaluator_demo package imports
"samples/framework-restaurant-demo/src", # For restaurant demo sample imports
"samples/web-endpoints-hello", # For src imports in tests
"samples/framework-custom-evaluators", # For custom evaluators sample imports
"plugins/mcp/tests", # For fakes module imports in tests
# Tools
"tools/releasekit/src", # For releasekit package imports
Expand Down Expand Up @@ -507,6 +508,7 @@ extraPaths = [
"plugins/ollama/src",
"plugins/vertex-ai/src",
"plugins/xai/src",
"samples/framework-custom-evaluators",
# Tools
"tools/releasekit/src",
"tools/conform/src",
Expand Down Expand Up @@ -575,6 +577,7 @@ search-path = [
".",
"plugins/mcp/tests",
"samples/framework-evaluator-demo",
"samples/framework-custom-evaluators",
"samples/framework-restaurant-demo/src",

"samples/web-endpoints-hello",
Expand Down
89 changes: 89 additions & 0 deletions py/samples/framework-custom-evaluators/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Writing your own evaluators

This sample demonstrates how to write your own suite of custom evaluators. It includes evaluators that use an LLM as a judge as well as a simple regex matcher that does not need an LLM, along with small test datasets that demonstrate how to run them.

## Evaluators

### Non-LLM Evaluators

#### Regex Matchers

- **Location**: `src/regex_evaluator.py`
- **Names**: `byo/regex_match_url`, `byo/regex_match_us_phone`
- **Output**: boolean

The regex evaluator is an example that does not use an LLM. It also demonstrates how to write a parameterized factory method that creates multiple evaluators (here, a URL matcher and a US phone number matcher) from the same code.

### LLM-Based Evaluators

#### PII Detection

- **Location**: `src/pii_evaluator.py`
- **Name**: `byo/pii_detection`
- **Output**: boolean

An evaluator that attempts to detect PII in your output using an LLM judge.

#### Funniness

- **Location**: `src/funniness_evaluator.py`
- **Name**: `byo/funniness`
- **Output**: enum/categorization (`FUNNY_JOKE`, `NOT_FUNNY_JOKE`, `OFFENSIVE_JOKE`, `NOT_A_JOKE`)

An evaluator that attempts to judge whether the given statement is a joke and, if so, whether it is funny.

#### Deliciousness

- **Location**: `src/deliciousness_evaluator.py`
- **Name**: `byo/deliciousness`
- **Output**: string (`yes`, `no`, `maybe`)

An evaluator that attempts to judge whether the given statement describes something delicious, literally or metaphorically.

## Setup and Run

1. **Set environment variable**:
```bash
export GEMINI_API_KEY=<your-api-key>
```

2. **Start the app**:
```bash
./run.sh
```

## Test your evaluators

**Note**: Run these commands in a separate terminal while the app is running.

### Regex evaluators:

```bash
genkit eval:run datasets/regex_dataset.json --evaluators=byo/regex_match_url,byo/regex_match_us_phone
```

### PII Detection:

```bash
genkit eval:run datasets/pii_detection_dataset.json --evaluators=byo/pii_detection
```

### Funniness:

```bash
genkit eval:run datasets/funniness_dataset.json --evaluators=byo/funniness
```

### Deliciousness:

```bash
genkit eval:run datasets/deliciousness_dataset.json --evaluators=byo/deliciousness
```

## See your results

Navigate to the `Evaluations` section in the Dev UI at http://localhost:4000.

## Note

The evaluators implemented in this sample do not consider the `input` provided to the model as part of the evaluation. Therefore, many of the test datasets provided have `input` set to the placeholder string `"input"`. If you are implementing an evaluator that uses the model's input, you must provide the actual input in this field.
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
[
{
"testCaseId": "test_case_id_31",
"input": "input",
"output": "A perfectly ripe mango – sweet, juicy, and with a hint of tropical sunshine."
},
{
"testCaseId": "test_case_id_32",
"input": "input",
"output": "Freshly baked bread, warm from the oven, with a crisp crust and a soft, fluffy interior."
},
{
"testCaseId": "test_case_id_33",
"input": "input",
"output": "A sizzling steak, cooked medium-rare, with a juicy center and a slightly charred exterior."
},
{
"testCaseId": "test_case_id_34",
"input": "input",
"output": "Creamy, rich chocolate mousse with a light and airy texture."
},
{
"testCaseId": "test_case_id_35",
"input": "input",
"output": "A refreshing watermelon slice on a hot summer day – sweet, cool, and incredibly hydrating."
},
{
"testCaseId": "test_case_id_36",
"input": "input",
"output": "Sushi with the freshest fish, expertly prepared rice, and a perfect balance of flavors."
},
{
"testCaseId": "test_case_id_37",
"input": "input",
"output": "A wood-fired pizza with a slightly blistered crust, tangy tomato sauce, and gooey mozzarella cheese."
},
{
"testCaseId": "test_case_id_38",
"input": "input",
"output": "Tacos al pastor – tender marinated pork, sweet pineapple, and a sprinkle of fresh cilantro."
},
{
"testCaseId": "test_case_id_39",
"input": "input",
"output": "A sweet and tart key lime pie with a buttery graham cracker crust."
},
{
"testCaseId": "test_case_id_40",
"input": "input",
"output": "Ripe strawberries bursting with sweet, juicy flavor."
},
{
"testCaseId": "test_case_id_41",
"input": "input",
"output": "Overcooked, mushy Brussels sprouts with a slightly bitter aftertaste."
},
{
"testCaseId": "test_case_id_42",
"input": "input",
"output": "Cold, soggy French fries that have lost all their crispiness."
},
{
"testCaseId": "test_case_id_43",
"input": "input",
"output": "A flavorless, under-seasoned chicken breast that's dry and tough."
},
{
"testCaseId": "test_case_id_44",
"input": "input",
"output": "Liver and onions – a strong, metallic flavor that many find unpleasant."
},
{
"testCaseId": "test_case_id_45",
"input": "input",
"output": "Stale, flavorless cereal that's been sitting in the box too long."
},
{
"testCaseId": "test_case_id_46",
"input": "input",
"output": "An overripe banana – mushy, with a slightly fermented taste."
},
{
"testCaseId": "test_case_id_47",
"input": "input",
"output": "A burnt piece of toast – bitter, acrid, and unpleasant to eat."
},
{
"testCaseId": "test_case_id_48",
"input": "input",
"output": "Lutefisk – a gelatinous fish dish with a strong, ammonia-like smell."
},
{
"testCaseId": "test_case_id_49",
"input": "input",
"output": "An extremely spicy dish that burns your mouth and overpowers any other flavors."
},
{
"testCaseId": "test_case_id_50",
"input": "input",
"output": "Spoiled milk with a sour, rancid smell and a chunky texture."
},
{
"testCaseId": "test_case_id_51",
"input": "input",
"output": "Juicy gossip"
},
{
"testCaseId": "test_case_id_52",
"input": "input",
"output": "A very attractive person"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
[
{
"testCaseId": "test_case_id_1",
"input": "input",
"output": "Why did the scarecrow love his job? Because he was outstanding in his field."
},
{
"testCaseId": "test_case_id_2",
"input": "input",
"output": "What do you call a lazy kangaroo? Pouch potato."
},
{
"testCaseId": "test_case_id_3",
"input": "input",
"output": "I tried to sue the airport for misplacing my luggage. I lost my case."
},
{
"testCaseId": "test_case_id_4",
"input": "input",
"output": "If athletes get athlete's foot, what do astronauts get? Missile toe."
},
{
"testCaseId": "test_case_id_5",
"input": "input",
"output": "What do you call a bear with no teeth? A gummy bear!"
},
{
"testCaseId": "test_case_id_6",
"input": "input",
"output": "Why don't scientists trust atoms? Because they make up everything."
},
{
"testCaseId": "test_case_id_7",
"input": "input",
"output": "Why was the math book sad? Because it had too many problems."
},
{
"testCaseId": "test_case_id_8",
"input": "input",
"output": "Did you hear about the restaurant on the moon? Great food, no atmosphere."
},
{
"testCaseId": "test_case_id_9",
"input": "input",
"output": "Velcro – what a rip-off!"
},
{
"testCaseId": "test_case_id_21",
"input": "input",
"output": "I dropped my phone down the toilet. It was a bad call."
},
{
"testCaseId": "test_case_id_22",
"input": "input",
"output": "What do you call a fake noodle? An impasta."
},
{
"testCaseId": "test_case_id_23",
"input": "input",
"output": "What's red and bad for your teeth? A brick."
},
{
"testCaseId": "test_case_id_24",
"input": "input",
"output": "Why did the toilet paper roll down the hill? To get to the bottom."
},
{
"testCaseId": "test_case_id_25",
"input": "input",
"output": "My boss told me to have a good day... so I went home."
},
{
"testCaseId": "test_case_id_26",
"input": "input",
"output": "Today a man knocked on my door and asked for a small donation towards the local swimming pool. I gave him a glass of water."
},
{
"testCaseId": "test_case_id_27",
"input": "input",
"output": "What has one head, one foot, and four legs? A bed."
},
{
"testCaseId": "test_case_id_28",
"input": "input",
"output": "I used to be addicted to soap, but I'm clean now."
},
{
"testCaseId": "test_case_id_29",
"input": "input",
"output": "What is the least spoken language in the world? Sign language."
},
{
"testCaseId": "test_case_id_30",
"input": "input",
"output": "Why couldn't the bicycle stand up by itself? It was two tired."
}
]
Loading
Loading