From 53fd42e68b17f3498b55bed0c9deb304314cf980 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 17 Feb 2026 13:50:00 -0500 Subject: [PATCH 1/6] Halfway through the first notebook --- .gitignore | 5 + .../01_Running_the_Agent.ipynb | 214 ++++++++++++++++++ implementations/report_generation/README.md | 10 +- .../data/import_online_retail_data.py | 42 +++- pyproject.toml | 1 + uv.lock | 2 + 6 files changed, 267 insertions(+), 7 deletions(-) create mode 100644 implementations/report_generation/01_Running_the_Agent.ipynb diff --git a/.gitignore b/.gitignore index 9ceead8..59e362d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,9 @@ wheels/ .gradio .adk *.db +*.db-journal implementations/report_generation/reports/* +implementations/report_generation/data/*.zip +implementations/report_generation/data/*.csv +implementations/report_generation/data/*.xls +implementations/report_generation/data/*.xlsx diff --git a/implementations/report_generation/01_Running_the_Agent.ipynb b/implementations/report_generation/01_Running_the_Agent.ipynb new file mode 100644 index 0000000..6f67136 --- /dev/null +++ b/implementations/report_generation/01_Running_the_Agent.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cc016c29-e5e8-4338-9a31-0fa6700505ff", + "metadata": {}, + "source": [ + "# Report Generation Agent\n", + "\n", + "This notebook implements an example of a Report Generation Agent for single-table relational\n", + "data source, including a demo agent demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", + "\n", + "The data source implemented here is an [SQLite](https://sqlite.org/) database which is supported\n", + "natively by Python and saves the data in disk.\n", + "[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this\n", + "SQL connection can be easily swapped for other databases.\n", + "\n", + "The Report Generation Agent will provide an UI to read user queries in natural language\n", + 
"and procceed to make SQL queries to the database in order to produce the data for\n", + "the report. At the end, the Agent will provide a downloadable link to the report as\n", + "an `.xlsx` file.\n", + "\n", + "This example also provides agent monitoring and evaluations using Langfuse." + ] + }, + { + "cell_type": "markdown", + "id": "8499a56f-716f-47a6-b255-8bdbbe0fd777", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4cc2db20-296f-4822-916c-b8255073c066", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The notebook path has been set to: /Users/marcelolotif/workspace/eval-agents\n", + "All environment variables have been set.\n" + ] + } + ], + "source": [ + "import os\n", + "import ssl\n", + "import urllib.request\n", + "import zipfile\n", + "from pathlib import Path\n", + "\n", + "import certifi\n", + "import pandas as pd\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "\n", + "from implementations.report_generation.data.import_online_retail_data import import_online_retail_data\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! 
Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "\n", + "print(\"All environment variables have been set.\")\n", + "\n", + "DATA_FOLDER = Path(\"implementations/report_generation/data\")\n", + "DATASET_PATH = DATA_FOLDER / \"OnlineRetail.csv\"" + ] + }, + { + "cell_type": "markdown", + "id": "0aa0bdf0-a7ba-4458-868b-07d627b12ed9", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The dataset used in this example is the\n", + "[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset. It contains\n", + "information about invoices for products that were purchased by customers, which also includes\n", + "product quantity, the invoice date and the country that the custgomer resides in. For a more\n", + "detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file." + ] + }, + { + "cell_type": "markdown", + "id": "553dceaa-8fe7-4e4f-9940-b2d1e8d8d6ee", + "metadata": {}, + "source": [ + "## Downloading the Dataset\n", + "\n", + "The code below will download and unzip the dataset to the `implementations/report_generation/data/` folder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "554f1cc6-c42f-4fe3-8857-214fcbeafd95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading the dataset...\n", + "Extracting the dataset file...\n", + "Converting the dataset file from .xls to .csv...\n", + "Done!\n" + ] + } + ], + "source": [ + "url = \"https://archive.ics.uci.edu/static/public/352/online+retail.zip\"\n", + "zip_file_path = DATA_FOLDER / \"online_retail.zip\"\n", + "xlsx_file_path = DATA_FOLDER / \"Online Retail.xlsx\"\n", + "\n", + "print(\"Downloading the dataset...\")\n", + "ctx = ssl.create_default_context(cafile=certifi.where())\n", + "req = urllib.request.Request(url)\n", + "with urllib.request.urlopen(req, context=ctx) as resp, open(zip_file_path, \"wb\") as f:\n", + " f.write(resp.read())\n", + "\n", + "print(\"Extracting the dataset file...\")\n", + "with zipfile.ZipFile(zip_file_path, \"r\") as zf:\n", + " zf.extractall(DATA_FOLDER)\n", + "\n", + "print(\"Converting the dataset file from .xls to .csv...\")\n", + "df = pd.read_excel(xlsx_file_path)\n", + "df.to_csv(DATASET_PATH, index=False)\n", + "\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "markdown", + "id": "ec3a84cb-3636-460a-a9bd-e6ea57d3f9d7", + "metadata": {}, + "source": [ + "## Importing the Data\n", + "\n", + "The code below will import the `.csv` dataset to the database at the path set by the `REPORT_GENERATION_DB__DATABASE` environment variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ee28609b-6eed-4aea-a5e0-c7d6df57e0af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-17 13:43:51,794 INFO implementations.report_generation.data.import_online_retail_data: Creating tables according to the OnlineRetail.ddl file\n", + "2026-02-17 13:43:51,798 INFO implementations.report_generation.data.import_online_retail_data: Importing dataset from implementations/report_generation/data/OnlineRetail.csv to database at implementations/report_generation/data/OnlineRetail.db\n", + "2026-02-17 13:43:54,933 INFO implementations.report_generation.data.import_online_retail_data: Dataset imported successfully to database at implementations/report_generation/data/OnlineRetail.db\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done!\n" + ] + } + ], + "source": [ + "import_online_retail_data(DATASET_PATH)\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce2ab32d-859f-44b1-9c54-cf9c2c1f1da4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/report_generation/README.md b/implementations/report_generation/README.md index 967de0c..8fa9abf 100644 --- a/implementations/report_generation/README.md +++ b/implementations/report_generation/README.md @@ -3,22 +3,24 @@ This code implements an example of a Report Generation Agent for single-table relational data source, including a demo agent demo UI and evaluations with 
[Langfuse](https://langfuse.com/). -The data source implemented here is [SQLite](https://sqlite.org/) which is supported -natively by Python and saves the data in disk. +The data source implemented here is an [SQLite](https://sqlite.org/) database which is +supported natively by Python and saves the data in disk. +[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this +SQL connection can be easily swapped for other databases. The Report Generation Agent will provide an UI to read user queries in natural language and procceed to make SQL queries to the database in order to produce the data for the report. At the end, the Agent will provide a downloadable link to the report as an `.xlsx` file. -This example also provide agent monitoring and evaluations using Langfuse. +This example also provides agent monitoring and evaluations using Langfuse. ## Dataset The dataset used in this example is the [Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset. It contains information about invoices for products that were purchased by customers, which also includes -product quantity, the invoice date and country that the user resides in. For a more +product quantity, the invoice date and the country that the customer resides in. For a more detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file. 
## Importing the Data diff --git a/implementations/report_generation/data/import_online_retail_data.py b/implementations/report_generation/data/import_online_retail_data.py index baee0a9..9c93839 100644 --- a/implementations/report_generation/data/import_online_retail_data.py +++ b/implementations/report_generation/data/import_online_retail_data.py @@ -27,7 +27,18 @@ @click.command() @click.option("--dataset-path", required=True, help="OnlieRetail dataset CSV path.") -def main(dataset_path: str): +def cli(dataset_path: str) -> None: + """CLI entry point to import the Online Retail dataset to the database. + + Parameters + ---------- + dataset_path : str + The path to the CSV file containing the dataset. + """ + import_online_retail_data(dataset_path) + + +def import_online_retail_data(dataset_path: str) -> None: """Import the Online Retail dataset to the database. Parameters @@ -43,7 +54,6 @@ def main(dataset_path: str): db_path = client_manager.configs.report_generation_db.database assert Path(dataset_path).exists(), f"Dataset path {dataset_path} does not exist" - assert Path(db_path).parent.exists(), f"Database path {db_path} does not exist" conn = sqlite3.connect(db_path) logger.info("Creating tables according to the OnlineRetail.ddl file") @@ -76,6 +86,9 @@ def convert_date(date_str: str) -> str | None: str | None Converted date string in format 'YYYY-MM-DD HH:MM' or None if parsing fails. """ + if not is_date_in_format(date_str, "%m/%d/%y %H:%M") and not is_date_in_format(date_str, "%m/%d/%y H:%M"): + return date_str + if not date_str or date_str.strip() == "": return None @@ -110,5 +123,28 @@ def convert_date(date_str: str) -> str | None: return None +def is_date_in_format(value: str, fmt: str) -> bool: + """Check if a date string is in a given format. + + Parameters + ---------- + value : str + The date string to check. + fmt : str + The format to check the date string against. + Example: "%m/%d/%y %H:%M" or "%m/%d/%y H:%M". 
+ + Returns + ------- + bool + True if the date string is in the given format, False otherwise. + """ + try: + datetime.strptime(value, fmt) + return True + except ValueError: + return False + + if __name__ == "__main__": - main() + cli() diff --git a/pyproject.toml b/pyproject.toml index df429fe..e1189e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pyasn1>=0.6.2", "virtualenv>=20.36.1", "tenacity>=9.1.2", + "certifi>=2026.1.4", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 00c0439..4816235 100644 --- a/uv.lock +++ b/uv.lock @@ -26,6 +26,7 @@ dependencies = [ { name = "aiohttp" }, { name = "authlib" }, { name = "beautifulsoup4" }, + { name = "certifi" }, { name = "datasets" }, { name = "e2b-code-interpreter" }, { name = "filelock" }, @@ -89,6 +90,7 @@ requires-dist = [ { name = "aiohttp", specifier = ">=3.13.3" }, { name = "authlib", specifier = ">=1.6.6" }, { name = "beautifulsoup4", specifier = ">=4.13.4" }, + { name = "certifi", specifier = ">=2026.1.4" }, { name = "datasets", specifier = ">=3.6.0" }, { name = "e2b-code-interpreter", specifier = ">=2.4.1" }, { name = "filelock", specifier = ">=3.20.3" }, From 9e239a74574cbabbf2cbee94bdab02ecd249dc55 Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 17 Feb 2026 14:16:26 -0500 Subject: [PATCH 2/6] Finishing the first notebook, adding the second notebook --- ...t.ipynb => 01_Importing_the_Dataset.ipynb} | 45 +++++++------- .../02_Running_The_Agent.ipynb | 61 +++++++++++++++++++ implementations/report_generation/README.md | 2 +- 3 files changed, 86 insertions(+), 22 deletions(-) rename implementations/report_generation/{01_Running_the_Agent.ipynb => 01_Importing_the_Dataset.ipynb} (85%) create mode 100644 implementations/report_generation/02_Running_The_Agent.ipynb diff --git a/implementations/report_generation/01_Running_the_Agent.ipynb b/implementations/report_generation/01_Importing_the_Dataset.ipynb similarity index 85% rename from 
implementations/report_generation/01_Running_the_Agent.ipynb rename to implementations/report_generation/01_Importing_the_Dataset.ipynb index 6f67136..83cb93d 100644 --- a/implementations/report_generation/01_Running_the_Agent.ipynb +++ b/implementations/report_generation/01_Importing_the_Dataset.ipynb @@ -5,22 +5,15 @@ "id": "cc016c29-e5e8-4338-9a31-0fa6700505ff", "metadata": {}, "source": [ - "# Report Generation Agent\n", + "# Importing the Dataset for the Report Generation Agent\n", "\n", - "This notebook implements an example of a Report Generation Agent for single-table relational\n", - "data source, including a demo agent demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", + "This notebook implements the data import for the Report Generation Agent for single-table relational\n", + "data source.\n", "\n", "The data source implemented here is an [SQLite](https://sqlite.org/) database which is supported\n", "natively by Python and saves the data in disk.\n", "[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this\n", - "SQL connection can be easily swapped for other databases.\n", - "\n", - "The Report Generation Agent will provide an UI to read user queries in natural language\n", - "and procceed to make SQL queries to the database in order to produce the data for\n", - "the report. At the end, the Agent will provide a downloadable link to the report as\n", - "an `.xlsx` file.\n", - "\n", - "This example also provides agent monitoring and evaluations using Langfuse." + "SQL connection can be easily swapped for other databases." 
] }, { @@ -35,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "4cc2db20-296f-4822-916c-b8255073c066", "metadata": {}, "outputs": [ @@ -59,8 +52,6 @@ "import pandas as pd\n", "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", "\n", - "from implementations.report_generation.data.import_online_retail_data import import_online_retail_data\n", - "\n", "\n", "# Setting the notebook directory to the project's root folder\n", "if Path(\"\").absolute().name == \"eval-agents\":\n", @@ -77,7 +68,9 @@ "print(\"All environment variables have been set.\")\n", "\n", "DATA_FOLDER = Path(\"implementations/report_generation/data\")\n", - "DATASET_PATH = DATA_FOLDER / \"OnlineRetail.csv\"" + "DATASET_PATH = DATA_FOLDER / \"OnlineRetail.csv\"\n", + "\n", + "from implementations.report_generation.data.import_online_retail_data import import_online_retail_data # noqa: E402" ] }, { @@ -106,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "554f1cc6-c42f-4fe3-8857-214fcbeafd95", "metadata": {}, "outputs": [ @@ -155,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "ee28609b-6eed-4aea-a5e0-c7d6df57e0af", "metadata": {}, "outputs": [ @@ -163,9 +156,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2026-02-17 13:43:51,794 INFO implementations.report_generation.data.import_online_retail_data: Creating tables according to the OnlineRetail.ddl file\n", - "2026-02-17 13:43:51,798 INFO implementations.report_generation.data.import_online_retail_data: Importing dataset from implementations/report_generation/data/OnlineRetail.csv to database at implementations/report_generation/data/OnlineRetail.db\n", - "2026-02-17 13:43:54,933 INFO implementations.report_generation.data.import_online_retail_data: Dataset imported successfully to database at implementations/report_generation/data/OnlineRetail.db\n" + "2026-02-17 14:08:56,816 INFO 
implementations.report_generation.data.import_online_retail_data: Creating tables according to the OnlineRetail.ddl file\n", + "2026-02-17 14:08:56,820 INFO implementations.report_generation.data.import_online_retail_data: Importing dataset from implementations/report_generation/data/OnlineRetail.csv to database at implementations/report_generation/data/OnlineRetail.db\n", + "2026-02-17 14:09:00,205 INFO implementations.report_generation.data.import_online_retail_data: Dataset imported successfully to database at implementations/report_generation/data/OnlineRetail.db\n" ] }, { @@ -181,10 +174,20 @@ "print(\"Done!\")" ] }, + { + "cell_type": "markdown", + "id": "ce2ab32d-859f-44b1-9c54-cf9c2c1f1da4", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Now the data should be ready to be consumed by the agent on the next notebook." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "ce2ab32d-859f-44b1-9c54-cf9c2c1f1da4", + "id": "4c1c8247-784a-420f-8cb0-49c2b1429c42", "metadata": {}, "outputs": [], "source": [] diff --git a/implementations/report_generation/02_Running_The_Agent.ipynb b/implementations/report_generation/02_Running_The_Agent.ipynb new file mode 100644 index 0000000..3685642 --- /dev/null +++ b/implementations/report_generation/02_Running_The_Agent.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66ea5037-7ad4-4e2e-b55d-f879fa9dc436", + "metadata": {}, + "source": [ + "# Running the Report Generation Agent\n", + "\n", + "This notebook runs the Report Generation Agent for single-table relational\n", + "data source as a Gradio Demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", + "\n", + "The Report Generation Agent Gradio Demo will provide an UI to read user queries in natural language\n", + "and proceed to make SQL queries to the database in order to produce the data for\n", + "the report. 
At the end, the Agent will provide a downloadable link to the report as\n", + "an `.xlsx` file.\n", + "\n", + "This example also provides agent monitoring and evaluations using Langfuse." + ] + }, + { + "cell_type": "markdown", + "id": "1d72ee27-a68a-4349-af4f-fde4bca8663a", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b46b94-8e30-4627-9265-4fe09af2a4c2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/report_generation/README.md b/implementations/report_generation/README.md index 34b4be0..3eea6f8 100644 --- a/implementations/report_generation/README.md +++ b/implementations/report_generation/README.md @@ -9,7 +9,7 @@ supported natively by Python and saves the data in disk. SQL connection can be easily swapped for other databases. The Report Generation Agent will provide an UI to read user queries in natural language -and procceed to make SQL queries to the database in order to produce the data for +and proceed to make SQL queries to the database in order to produce the data for the report. At the end, the Agent will provide a downloadable link to the report as an `.xlsx` file. 
From 6a5d8ca00d6b4d9489445398d3fcf2122d17c3ae Mon Sep 17 00:00:00 2001 From: Marcelo Lotif Date: Tue, 17 Feb 2026 18:44:13 -0500 Subject: [PATCH 3/6] Finishing up the notebooks --- .../agent_evals/report_generation/agent.py | 2 +- .../report_generation/file_writer.py | 4 +- .../01_Importing_the_Dataset.ipynb | 234 +++++++- .../02_Running_The_Agent.ipynb | 185 ++++++- .../03_Run_Offline_Evaluations.ipynb | 508 ++++++++++++++++++ implementations/report_generation/README.md | 5 + implementations/report_generation/demo.py | 42 +- 7 files changed, 950 insertions(+), 30 deletions(-) create mode 100644 implementations/report_generation/03_Run_Offline_Evaluations.ipynb diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py index 1374490..9906f32 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py @@ -75,8 +75,8 @@ def get_report_generation_agent( model=client_manager.configs.default_worker_model, instruction=instructions, tools=[ - db_manager.report_generation_db().execute, db_manager.report_generation_db().get_schema_info, + db_manager.report_generation_db().execute, report_file_writer.write_xlsx, ], after_agent_callback=after_agent_callback, diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py index 8c33585..733bbc5 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py @@ -38,7 +38,7 @@ def __init__(self, reports_output_path: Path): def write_xlsx( self, - report_data: list[Any], + report_data: list[list[Any]], report_columns: list[str], filename: str = "report.xlsx", gradio_link: bool = True, @@ -47,7 +47,7 @@ def write_xlsx( Parameters ---------- - report_data : list[Any] + report_data : 
list[list[Any]] The data of the report. report_columns : list[str] The columns of the report. diff --git a/implementations/report_generation/01_Importing_the_Dataset.ipynb b/implementations/report_generation/01_Importing_the_Dataset.ipynb index 83cb93d..d128091 100644 --- a/implementations/report_generation/01_Importing_the_Dataset.ipynb +++ b/implementations/report_generation/01_Importing_the_Dataset.ipynb @@ -7,13 +7,15 @@ "source": [ "# Importing the Dataset for the Report Generation Agent\n", "\n", - "This notebook implements the data import for the Report Generation Agent for single-table relational\n", + "This notebook implements the **data import** for the **Report Generation Agent** for single-table relational\n", "data source.\n", "\n", "The data source implemented here is an [SQLite](https://sqlite.org/) database which is supported\n", "natively by Python and saves the data in disk.\n", "[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this\n", - "SQL connection can be easily swapped for other databases." + "SQL connection can be easily swapped for other databases.\n", + "\n", + "The SQL Alchemy tool is set up to allow **read-only queries**, so there is **no risk** the agent runs queries that can modify the DB data." ] }, { @@ -23,12 +25,14 @@ "source": [ "## Setting up\n", "\n", - "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables." + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "4cc2db20-296f-4822-916c-b8255073c066", "metadata": {}, "outputs": [ @@ -81,9 +85,9 @@ "## Dataset\n", "\n", "The dataset used in this example is the\n", - "[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset. It contains\n", - "information about invoices for products that were purchased by customers, which also includes\n", - "product quantity, the invoice date and the country that the custgomer resides in. For a more\n", + "**[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset**. It contains\n", + "information about **invoices** for products that were purchased by customers, which also includes\n", + "product quantity, the invoice date and the country that the customer resides in. For a more\n", "detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file." ] }, @@ -94,7 +98,7 @@ "source": [ "## Downloading the Dataset\n", "\n", - "The code below will download and unzip the dataset to the `implementations/report_generation/data/` folder." + "The code below will **download and unzip** the dataset to the `implementations/report_generation/data/` folder." ] }, { @@ -136,6 +140,218 @@ "print(\"Done!\")" ] }, + { + "cell_type": "markdown", + "id": "2e4e45de-1f07-41de-bf9c-f9c2543b8cb3", + "metadata": {}, + "source": [ + "## Visualizing the data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "123d37f3-fd6f-4676-8f84-8bcfa45a0535", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850.0United Kingdom
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE62010-12-01 08:26:003.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.62010-12-01 08:26:003.3917850.0United Kingdom
...........................
54190458158722613PACK OF 20 SPACEBOY NAPKINS122011-12-09 12:50:000.8512680.0France
54190558158722899CHILDREN'S APRON DOLLY GIRL62011-12-09 12:50:002.1012680.0France
54190658158723254CHILDRENS CUTLERY DOLLY GIRL42011-12-09 12:50:004.1512680.0France
54190758158723255CHILDRENS CUTLERY CIRCUS PARADE42011-12-09 12:50:004.1512680.0France
54190858158722138BAKING SET 9 PIECE RETROSPOT32011-12-09 12:50:004.9512680.0France
\n", + "

541909 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " InvoiceNo StockCode Description Quantity \\\n", + "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n", + "1 536365 71053 WHITE METAL LANTERN 6 \n", + "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", + "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", + "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", + "... ... ... ... ... \n", + "541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12 \n", + "541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6 \n", + "541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4 \n", + "541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4 \n", + "541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3 \n", + "\n", + " InvoiceDate UnitPrice CustomerID Country \n", + "0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n", + "1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", + "2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n", + "3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", + "4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", + "... ... ... ... ... \n", + "541904 2011-12-09 12:50:00 0.85 12680.0 France \n", + "541905 2011-12-09 12:50:00 2.10 12680.0 France \n", + "541906 2011-12-09 12:50:00 4.15 12680.0 France \n", + "541907 2011-12-09 12:50:00 4.15 12680.0 France \n", + "541908 2011-12-09 12:50:00 4.95 12680.0 France \n", + "\n", + "[541909 rows x 8 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(DATASET_PATH)\n", + "df # noqa: B018" + ] + }, { "cell_type": "markdown", "id": "ec3a84cb-3636-460a-a9bd-e6ea57d3f9d7", @@ -181,7 +397,7 @@ "source": [ "## Conclusion\n", "\n", - "Now the data should be ready to be consumed by the agent on the next notebook." + "Now the data should be ready to be consumed by the agent on the **next notebook**." 
] }, { diff --git a/implementations/report_generation/02_Running_The_Agent.ipynb b/implementations/report_generation/02_Running_The_Agent.ipynb index 3685642..55dbe4e 100644 --- a/implementations/report_generation/02_Running_The_Agent.ipynb +++ b/implementations/report_generation/02_Running_The_Agent.ipynb @@ -7,12 +7,12 @@ "source": [ "# Running the Report Generation Agent\n", "\n", - "This notebook runs the Report Generation Agent for single-table relational\n", - "data source as a Gradio Demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", + "This notebook runs the **Report Generation Agent** for **single-table relational\n", + "data source** as a Gradio Demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", "\n", - "The Report Generation Agent Gradio Demo will provide an UI to read user queries in natural language\n", - "and proceed to make SQL queries to the database in order to produce the data for\n", - "the report. At the end, the Agent will provide a downloadable link to the report as\n", + "The Report Generation Agent Gradio Demo will provide an UI to read **user queries in natural language**\n", + "and proceed to **make SQL queries** to the database in order to produce the data for\n", + "the report. At the end, the Agent will provide a **downloadable link** to the report as\n", "an `.xlsx` file.\n", "\n", "This example also provides agent monitoring and evaluations using Langfuse." @@ -25,14 +25,185 @@ "source": [ "## Setting up\n", "\n", - "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables." + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "66b46b94-8e30-4627-9265-4fe09af2a4c2", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notebook path is already the root path: /Users/marcelolotif/workspace/eval-agents\n", + "All environment variables have been set.\n" + ] + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_secret_key, (\n", + " \"[ERROR] The Langfuse secret key is not set! Please configure the LANGFUSE_SECRET_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_public_key, (\n", + " \"[ERROR] The Langfuse public key is not set! Please configure the LANGFUSE_PUBLIC_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_host, (\n", + " \"[ERROR] The Langfuse base URL is not set! 
Please configure the LANGFUSE_BASE_URL environment variable.\"\n", + ")\n", + "\n", + "from implementations.report_generation.demo import start_gradio_app # noqa: E402\n", + "\n", + "\n", + "print(\"All environment variables have been set.\")" + ] + }, + { + "cell_type": "markdown", + "id": "ec0522bc-01f5-4554-bc57-39de8dbb4c53", + "metadata": {}, + "source": [ + "## Running the Demo UI\n", + "\n", + "The code below will start the **Gradio Demo UI**, which will run embedded in this notebook (made possible by the `enable_public_link=True` parameter). If you so choose, you can also run the UI in your browser using the links provided in the outputs.\n", + "\n", + "The UI will display a few **pre-defined input options** to test the agent and also a text box so you can **make your own queries**. The Agent will try to complete the request by making **queries against the database** configured in the first notebook. At the end, it will **produce a report** that it will make available as a download link at the end of the agent's execution.\n", + "\n", + "The UI will display the **agents thoughts** and **tool calls** in order to facilitate debugging.\n", + "\n", + "The agent **traces will be sent** to [Langfuse](https://us.cloud.langfuse.com/). 
To configure the langfuse connection, you can edit the following environment variables in your `.env` file:\n", + "\n", + "```python\n", + "# Secret and public keys can be generated by the Langfuse web UI\n", + "LANGFUSE_SECRET_KEY=\"sk-lf-...\"\n", + "LANGFUSE_PUBLIC_KEY=\"pk-lf-...\"\n", + "LANGFUSE_BASE_URL=\"https://us.cloud.langfuse.com\"\n", + "\n", + "# Input the name of your langfuse project on the first variable\n", + "# The second variable is used to configure where the output reports are going to be saved\n", + "# Defaults are in implementations/report_generation/env_vars.py)\n", + "REPORT_GENERATION_LANGFUSE_PROJECT_NAME=\"...\"\n", + "REPORT_GENERATION_OUTPUT_PATH=\"...\"\n", + "```\n", + "\n", + "Run the cell below to see the Report Generation Agent in action." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fceeac6a-309d-484f-98dc-227f875387a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7863\n", + "* Running on public URL: https://cf16cc30b2b942de91.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "await start_gradio_app(\n", + " enable_trace=True,\n", + " enable_public_link=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3bdee1c5-f5d9-485a-9a48-f5aab2c41d04", + "metadata": {}, + "source": [ + "## Agent Configuration\n", + "\n", + "This agent is a **text-to-SQL** agent that will convert natural language into one or multiple queries to a DB in order to produce the result that has been requested.\n", + "\n", + "This agent in particular has **no knowledge of the DB schema**, although it is possible to do so with some more advanced techniques. This allows the agent to be **flexible** to any database it may encounter. However, it is **easier** to have it working with a **single-table DB** as opposed to a multi-table DB.\n", + "\n", + "It is configured with **three tools** (as per `aieng.agent_evals.evaluation.report_generation.agent.get_report_generation_agent`):\n", + "- `db_manager.report_generation_db().get_schema_info`: to retrieve the **tables** and the **schema** for each table.\n", + "- `db_manager.report_generation_db().execute`: to execute any **read-only SQL queries**.\n", + "- `report_file_writer.write_xlsx`: a function that receives the **report data** as an array, writes the array to a `.xlsx` file and returns a **downloadable Gradio link** to the file.\n", + "\n", + "The Agent will know how to use those tools given the **instructions** it has been given, along with the user input. Below are the instructions for this agent in specific (as per `aieng.agent_evals.evaluation.report_generation.prompts`:\n", + "```python\n", + "MAIN_AGENT_INSTRUCTIONS = \"\"\"\\\n", + "Perform the task using the SQLite database tool. \\\n", + "EACH TIME before invoking the function, you must explain your reasons for doing so. \\\n", + "If the SQL query did not return intended results, try again. 
\\\n", + "For best performance, divide complex queries into simpler sub-queries. \\\n", + "Do not make up information. \\\n", + "When the report is done, use the report file writer tool to write it to a file. \\\n", + "Make sure the \"write_xlsx\" tool is called so it generates the report file. \\\n", + "At the end, provide the report file as a downloadable hyperlink to the user. \\\n", + "Make sure the link can be clicked on by the user.\n", + "\"\"\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "b1d0398e-944a-4005-a9e3-44dbb050441a", + "metadata": {}, + "source": [ + "## Online Evaluations\n", + "\n", + "The Agent will be sending online evaluation metrics to Langfuse along with the traces. These metrics aim to **simulate** how an evaluation of a **production agentic system** would work for this use case.\n", + "\n", + "Those metrics are:\n", + "- A check of the **total token spent** to complete the request. If it is below the threshold of 15k tokens it will send a score of 1, or a score of 0 otherwise.\n", + "- A check of the **total time spent** to complete the request. 
If it is below the threshold of 60 seconds it will send a score of 1, or a score of 0 otherwise.\n", + "- At the end of the Agent run, the UI will display thumbs up/thumbs down buttons so the **users can send feedback** on the output of the agent.\n", + "\n", + "The calculation for the first two metrics are triggered by a **callback function** that runs at the end of the agent run, while the third one is triggered asynchronously by an **user UI action**.\n", + "\n", + "Those metrics will be available in Langfuse in two different ways:\n", + "- **Aggregated:** Summary of those metrics will be displayed on the dashboards page\n", + "- **Individualized:** The evaluation scores are available on each trace with additional details" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de28417c-1cab-4974-8e45-50d1f8cd03b4", + "metadata": {}, "outputs": [], "source": [] } diff --git a/implementations/report_generation/03_Run_Offline_Evaluations.ipynb b/implementations/report_generation/03_Run_Offline_Evaluations.ipynb new file mode 100644 index 0000000..5ba20a4 --- /dev/null +++ b/implementations/report_generation/03_Run_Offline_Evaluations.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e1ddcbff-a950-42ee-be8b-a3663ae6b202", + "metadata": {}, + "source": [ + "## Running the Offline Evaluations for the Report Generation Agent\n", + "\n", + "Offline evaluations are evaluations run against a **pre-defined dataset** and they run **detailed evaluations** of the **outputs** of the agentic system and the **steps** it has taken to produce those evaluations.\n", + "\n", + "This dataset is called the **expected results** or the **ground-truth** dataset, and on this case it's a **handcrafted** dataset with **inputs, oputputs and trajectory** for a few known use cases.\n", + "\n", + "Those evaluations are run by Langfuse and the results are visualized there." 
+ ] + }, + { + "cell_type": "markdown", + "id": "db74c461-bcda-49b2-bd4e-5ea961798d55", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3adfd868-c4da-4bdb-ae13-0ebe56c3ed97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The notebook path has been set to: /Users/marcelolotif/workspace/eval-agents\n", + "All environment variables have been set.\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "from pprint import pprint\n", + "\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_secret_key, (\n", + " \"[ERROR] The Langfuse secret key is not set! Please configure the LANGFUSE_SECRET_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_public_key, (\n", + " \"[ERROR] The Langfuse public key is not set! 
Please configure the LANGFUSE_PUBLIC_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_host, (\n", + " \"[ERROR] The Langfuse base URL is not set! Please configure the LANGFUSE_BASE_URL environment variable.\"\n", + ")\n", + "\n", + "print(\"All environment variables have been set.\")\n", + "\n", + "\n", + "EVALUATION_DATASET_PATH = \"implementations/report_generation/data/OnlineRetailReportEval.json\"\n", + "LANGFUSE_DATASET_NAME = \"OnlineRetailReportEval\"" + ] + }, + { + "cell_type": "markdown", + "id": "b64fe764-9355-40c6-8632-fccca34acce9", + "metadata": {}, + "source": [ + "## Taking a Look at the Ground Truth Dataset\n", + "\n", + "The ground-truth dataset is located at `implementations/report_generation/data/OnlineRetailReportEval.json`. The code below will display one of its elements as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e943f83-7178-43bc-be17-0e550564002f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ground truth size: 7\n", + "First element:\n", + "{'expected_output': {'final_report': {'filename': 'monthly_sales_performance_report.xlsx',\n", + " 'report_columns': ['SalesMonth',\n", + " 'TotalSales'],\n", + " 'report_data': [['2010-12', 748957.02],\n", + " ['2011-01', 560000.26],\n", + " ['2011-02', 498062.65],\n", + " ['2011-03', 683267.08],\n", + " ['2011-04', 493207.12],\n", + " ['2011-05', 723333.51],\n", + " ['2011-06', 691123.12],\n", + " ['2011-07', 681300.11],\n", + " ['2011-08', 682680.51],\n", + " ['2011-09', 1019687.62],\n", + " ['2011-10', 1070704.67],\n", + " ['2011-11', 1461756.25],\n", + " ['2011-12', 433668.01]]},\n", + " 'trajectory': {'actions': ['get_schema_info',\n", + " 'execute',\n", + " 'execute',\n", + " 'write_xlsx',\n", + " 'output_text'],\n", + " 'description': ['Check what are the tables '\n", + " 'that are available in the '\n", + " 'database',\n", + " 'Check what are the '\n", + " 'columns 
that are '\n", + " 'available in the sales '\n", + " 'table',\n", + " 'Query to retrieve the '\n", + " 'sales performance '\n", + " '(quantity * price) per '\n", + " 'month',\n", + " 'Send the report data to '\n", + " 'the function that writes '\n", + " 'the report to disk',\n", + " 'Output text to the user '\n", + " 'with the report file as a '\n", + " 'Gradio hyperlink']}},\n", + " 'id': '1',\n", + " 'input': 'Generate a monthly sales performance report.'}\n" + ] + } + ], + "source": [ + "with open(\"implementations/report_generation/data/OnlineRetailReportEval.json\") as f:\n", + " ground_truth = json.load(f)\n", + "\n", + "print(f\"Ground truth size: {len(ground_truth)}\")\n", + "print(\"First element:\")\n", + "pprint(ground_truth[0])" + ] + }, + { + "cell_type": "markdown", + "id": "06eeee29-b23e-4bad-b20f-ef32610a5570", + "metadata": {}, + "source": [ + "Here is an explanation of the data structure of each one of the elements:\n", + "```python\n", + "{\n", + " 'id': str, # The ID of the sample\n", + " 'input': str, # The input to be used to test the report generation agent\n", + " 'expected_output': { # The expected outputs of the agent\n", + " 'final_report': { # The output data for the final report the agent generates. 
\n", + " # These values match the input the agent sends to the `write_xlsx` function\n", + " 'filename': str, # The name of the report file\n", + " 'report_columns': list[str, # The names of the columns of the report\n", + " 'report_data': list[list[Any]], # a bidimensional array of values for the rows of the report\n", + " }\n", + " 'trajectory': { # information about the trajectory the agent should take to produce the report\n", + " 'actions': list[str], # A list of the names of the actions the agent should take, in order\n", + " 'description': list[str], # A description of what the parameters that are sent to each one of\n", + " # the actions are supposed to be doing\n", + " }\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "69e993b7-a93d-468e-84e9-0e1906108335", + "metadata": {}, + "source": [ + "## Uploading the dataset to Langfuse\n", + "\n", + "Use the function below to **upload** the ground truth dataset to Langfuse so it can be used **during the evaluation**:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "bd692600-d936-4c7b-af94-5ddd19b59bfd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-17 18:13:52,719 INFO aieng.agent_evals.langfuse: Loading dataset from 'implementations/report_generation/data/OnlineRetailReportEval.json'\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9e6b0cea9e7e400a8be7d1249f314da6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-02-17 18:13:54,052 INFO aieng.agent_evals.langfuse: Uploaded 7 items to dataset 'OnlineRetailReportEval'\n"
+     ]
+    }
+   ],
+   "source": [
+    "await upload_dataset_to_langfuse(\n",
+    "    EVALUATION_DATASET_PATH,\n",
+    "    LANGFUSE_DATASET_NAME,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "abc8c175-6315-41bf-800a-d913e3e2b5c0",
+   "metadata": {},
+   "source": [
+    "## LLM-as-a-judge Evaluators\n",
+    "\n",
+    "Two **LLM-as-a-judge evaluators** are set to run against this dataset and the agent's output:\n",
+    "1. A **Final Result Evaluator**, that will evaluate the agent's output against the contents of the `final_result` key\n",
+    "2. A **Trajectory Evaluator**, that will evaluate the agent's output against the contents of the `trajectory` key\n",
+    "\n",
+    "Here are the instructions for both of those agents (as per `aieng.agent_evals.evaluation.report_generation.prompts`):\n",
+    "```python\n",
+    "TRAJECTORY_EVALUATOR_INSTRUCTIONS = \"\"\"\\\n",
+    "You are evaluating if an agent has followed the correct trajectory to generate a report.\\\n",
+    "The agent is a Report Generation Agent that uses the SQLite database tool to generate a report\\\n",
+    "and return the report as a downloadable file to the user.\\\n",
+    "You will be presented with the \"Question\" that has been asked to the agent along with two sets of data:\\\n",
+    "- The \"Expected Trajectory\" of the agent, which contains:\\\n",
+    "    - A list ids for the actions the agent is expected to perform\\\n",
+    "    - A list of rough descriptions of what has been passed as parameters to the actions\\\n",
+    "- The \"Actual Trajectory\" of the agent, which contains:\\\n",
+    "    - A list ids for the actions the agent performed\\\n",
+    "    - A list of parameters that has been passed to each one of the actions\\\n",
+    "It's OK if the agent makes mistakes and performs additional steps, or if the queries do not exactly match\\\n",
+    "the description, as long as the queries performed end up satisfying the \"Question\".\\\n",
+    "It is important that the last action to be of type \"final_response\" and that it produces a link to the report file.\n",
+    "\"\"\"\n",
+    "\n",
+    "RESULT_EVALUATOR_INSTRUCTIONS = \"\"\"\\\n",
+    "Evaluate whether the \"Proposed Answer\" to the given \"Question\" matches the \"Ground Truth\". \\\n",
+    "Disregard the following aspects when comparing the \"Proposed Answer\" to the \"Ground Truth\": \\\n",
+    "- The order of the items should not matter, unless explicitly specified in the \"Question\". \\\n",
+    "- The formatting of the values should not matter, unless explicitly specified in the \"Question\". \\\n",
+    "- The column and row names have to be similar but not necessarily exact, unless explicitly specified in the \"Question\". \\\n",
+    "- The filename has to be similar by name but not necessarily exact, unless explicitly specified in the \"Question\". \\\n",
+    "- It is ok if the filename is missing. \\\n",
+    "- The numerical values should be equal with a tolerance of 0.01. \\\n",
+    "- The report data in the \"Proposed Answer\" should have the same number of rows as in the \"Ground Truth\". \\\n",
+    "- It is OK if the report data in the \"Proposed Answer\" contains extra columns or if the rows are in a different order, \\\n",
+    "unless explicitly specified in the \"Question\".\n",
+    "\"\"\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c8b05537-2624-4df7-9f08-59d797378175",
+   "metadata": {},
+   "source": [
+    "## Running the Evaluations\n",
+    "\n",
+    "To run those two evaluatoirs against all of the ground-truth dataset samples, run the function below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77d38a16-526a-4740-953a-dc735a9f83f7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2026-02-17 18:31:55,626 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
+      "2026-02-17 18:31:55,649 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
+      "2026-02-17 18:31:55,661 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-17 18:31:55,724 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:55,729 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-17 18:31:55,785 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:55,786 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-17 18:31:55,852 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:55,854 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
+      "2026-02-17 18:31:55,913 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:55,914 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
+      "2026-02-17 18:31:55,968 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:58,408 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:31:58,408 WARNING google_genai.types: Warning: there are non-text parts in the response: ['function_call'], returning concatenated text result from text parts. Check the full candidates.content.parts accessor to get the full model response.\n",
+      "2026-02-17 18:31:58,671 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:59,194 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:31:59,298 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:59,373 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:31:59,379 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:31:59,501 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:59,594 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:31:59,660 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:31:59,895 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:02,848 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:02,951 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:03,801 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:03,864 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:04,305 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:04,548 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:04,616 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:04,670 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:06,751 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:07,262 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:08,784 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:08,837 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:09,194 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:09,199 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
+      "2026-02-17 18:32:09,261 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:09,740 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:09,901 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:12,921 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:12,930 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
+      "2026-02-17 18:32:12,995 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:13,178 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:13,545 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:13,978 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:14,024 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:14,871 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:14,951 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:18,602 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:18,608 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
+      "2026-02-17 18:32:18,685 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:18,934 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:19,199 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:19,447 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:19,647 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:22,304 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:22,589 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:22,594 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:22,657 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:32:33,571 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:32:40,596 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:40,602 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:40,604 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:40,608 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
+      "2026-02-17 18:32:40,899 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:40,980 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:41,050 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:41,062 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the top 5 selling products per year and the total sales value for each product....'\n",
+      "2026-02-17 18:32:41,118 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:32:43,951 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:32:44,000 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:32:58,072 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:33:10,435 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:10,452 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:10,458 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:10,468 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:11,190 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:11,193 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a monthly sales performance report....'\n",
+      "2026-02-17 18:33:11,237 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:11,286 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:11,343 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:11,391 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:13,913 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:13,986 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:33:23,634 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:33:36,143 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:36,149 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:36,151 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:36,155 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:36,513 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:36,581 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:36,751 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:36,863 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:39,846 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:40,254 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:40,880 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:40,927 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:43,671 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:43,683 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
+      "2026-02-17 18:33:43,736 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:44,267 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:44,677 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:46,484 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:47,836 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:47,847 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:47,942 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:49,927 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:50,399 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:52,986 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:53,529 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:53,564 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:53,769 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:54,601 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:54,699 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:33:56,271 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:33:56,353 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:34:04,512 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:34:11,165 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:11,175 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:11,318 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:34:25,177 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:34:36,448 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:36,518 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:34:37,061 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:34:41,909 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:42,008 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:34:42,140 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:42,595 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:34:45,601 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:46,134 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:34:47,264 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:34:47,402 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:34:56,161 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:35:09,004 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:35:09,226 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-17 18:35:12,197 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-17 18:35:12,271 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
+      "2026-02-17 18:35:25,116 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Running as a CLI command to avoid issues between Langfuse's\n",
+    "# experiment runner and Jupyter\n",
+    "# NOTE: This will take a while to execute\n",
+    "\n",
+    "!uv run --env-file .env python -m implementations.report_generation.evaluate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d969ed04-7427-40e9-b51b-fa892da706ef",
+   "metadata": {},
+   "source": [
+    "## Checking the Results\n",
+    "\n",
+    "At the end of the run, you will see a summary in the console.\n",
+    "\n",
+    "To see detailed results of the evaluation runs:\n",
+    "1. Go to your project on Langfuse\n",
+    "2. Click on **Datasets**\n",
+    "3. Click on the dataset name\n",
+    "4. Click on one of the runs\n",
+    "\n",
+    "You will see a more detailed summary of the experiment run, and you can also see the details of each of the runs, including f"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4855cf8f-f386-449a-a777-4a5adee5d2e4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/implementations/report_generation/README.md b/implementations/report_generation/README.md
index 3eea6f8..bb559e3 100644
--- a/implementations/report_generation/README.md
+++ b/implementations/report_generation/README.md
@@ -15,6 +15,11 @@ an `.xlsx` file.
 
 This example also provides agent monitoring and evaluations using Langfuse.
 
+### Running the Demo
+
+To run the demo, either follow the steps below or follow the instructions in the notebooks in this folder.
+
+
 ## Dataset
 
 The dataset used in this example is the
diff --git a/implementations/report_generation/demo.py b/implementations/report_generation/demo.py
index 795ef38..ca6ba07 100644
--- a/implementations/report_generation/demo.py
+++ b/implementations/report_generation/demo.py
@@ -194,15 +194,7 @@ def toggle_feedback_row() -> tuple[dict[str, Any], dict[str, Any]]:
     return gr.update(visible=trace_id is not None and trace_id != ""), gr.update(visible=False)
 
 
-@click.command()
-@click.option("--enable-trace", required=False, default=True, help="Whether to enable tracing with Langfuse.")
-@click.option(
-    "--enable-public-link",
-    required=False,
-    default=False,
-    help="Whether to enable public link for the Gradio app.",
-)
-def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False) -> None:
+async def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False) -> None:
     """Start the Gradio app with the agent session handler.
 
     Parameters
@@ -265,8 +257,36 @@ def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False
         )
     finally:
         DbManager.get_instance().close()
-        asyncio.run(AsyncClientManager.get_instance().close())
+        await AsyncClientManager.get_instance().close()
+
+
+@click.command()
+@click.option("--enable-trace", required=False, default=True, help="Whether to enable tracing with Langfuse.")
+@click.option(
+    "--enable-public-link",
+    required=False,
+    default=False,
+    help="Whether to enable public link for the Gradio app.",
+)
+def cli(enable_trace: bool = True, enable_public_link: bool = False) -> None:
+    """CLI entry point to start the Gradio app.
+
+    Parameters
+    ----------
+    enable_trace : bool, optional
+        Whether to enable tracing with Langfuse for evaluation purposes.
+        Default is True.
+    enable_public_link : bool, optional
+        Whether to enable public link for the Gradio app. If True,
+        will make the Gradio app available at a public URL. Default is False.
+    """
+    asyncio.run(
+        start_gradio_app(
+            enable_trace=enable_trace,
+            enable_public_link=enable_public_link,
+        )
+    )
 
 
 if __name__ == "__main__":
-    start_gradio_app()
+    cli()

From 38ecb4b257101797fb7c7a86507c0787608a695e Mon Sep 17 00:00:00 2001
From: Marcelo Lotif 
Date: Thu, 19 Feb 2026 10:09:19 -0500
Subject: [PATCH 4/6] Small adjustments to the notebooks

---
 .../01_Importing_the_Dataset.ipynb            |  10 +-
 .../02_Running_The_Agent.ipynb                |  10 +-
 ...b => 03_Running_Offline_Evaluations.ipynb} | 195 +++++-------------
 3 files changed, 53 insertions(+), 162 deletions(-)
 rename implementations/report_generation/{03_Run_Offline_Evaluations.ipynb => 03_Running_Offline_Evaluations.ipynb} (52%)

diff --git a/implementations/report_generation/01_Importing_the_Dataset.ipynb b/implementations/report_generation/01_Importing_the_Dataset.ipynb
index d128091..a4d5b99 100644
--- a/implementations/report_generation/01_Importing_the_Dataset.ipynb
+++ b/implementations/report_generation/01_Importing_the_Dataset.ipynb
@@ -88,7 +88,7 @@
     "**[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset**. It contains\n",
     "information about **invoices** for products that were purchased by customers, which also includes\n",
     "product quantity, the invoice date and the country that the customer resides in. For a more\n",
-    "detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file."
+    "detailed data structure, please check the [OnlineRetail.ddl](http://localhost:8888/lab/tree/implementations/report_generation/data/OnlineRetail.ddl) file."
    ]
   },
   {
@@ -399,14 +399,6 @@
     "\n",
     "Now the data should be ready to be consumed by the agent on the **next notebook**."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c1c8247-784a-420f-8cb0-49c2b1429c42",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/implementations/report_generation/02_Running_The_Agent.ipynb b/implementations/report_generation/02_Running_The_Agent.ipynb
index 55dbe4e..7fc8628 100644
--- a/implementations/report_generation/02_Running_The_Agent.ipynb
+++ b/implementations/report_generation/02_Running_The_Agent.ipynb
@@ -158,7 +158,7 @@
     "This agent in particular has **no knowledge of the DB schema**, although it is possible to do so with some more advanced techniques. This allows the agent to be **flexible** to any database it may encounter. However, it is **easier** to have it working with a **single-table DB** as opposed to a multi-table DB.\n",
     "\n",
     "It is configured with **three tools** (as per `aieng.agent_evals.evaluation.report_generation.agent.get_report_generation_agent`):\n",
-    "- `db_manager.report_generation_db().get_schema_info`: to retrieve the **tables** and the **schema** for each table.\n",
+    "- `db_manager.report_generation_db().get_schema_info`: to retrieve the **DB schema** so the agent knows how to perform the queries.\n",
     "- `db_manager.report_generation_db().execute`: to execute any **read-only SQL queries**.\n",
     "- `report_file_writer.write_xlsx`: a function that receives the **report data** as an array, writes the array to a `.xlsx` file and returns a **downloadable Gradio link** to the file.\n",
     "\n",
@@ -198,14 +198,6 @@
     "- **Aggregated:** Summary of those metrics will be displayed on the dashboards page\n",
     "- **Individualized:** The evaluation scores are available on each trace with additional details"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "de28417c-1cab-4974-8e45-50d1f8cd03b4",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/implementations/report_generation/03_Run_Offline_Evaluations.ipynb b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb
similarity index 52%
rename from implementations/report_generation/03_Run_Offline_Evaluations.ipynb
rename to implementations/report_generation/03_Running_Offline_Evaluations.ipynb
index 5ba20a4..abae7d7 100644
--- a/implementations/report_generation/03_Run_Offline_Evaluations.ipynb
+++ b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb
@@ -7,11 +7,11 @@
    "source": [
     "## Running the Offline Evaluations for the Report Generation Agent\n",
     "\n",
-    "Offline evaluations are evaluations run against a **pre-defined dataset** and they run **detailed evaluations** of the **outputs** of the agentic system and the **steps** it has taken to produce those evaluations.\n",
+    "Offline evaluations are evaluations run against a **pre-defined dataset**. They perform **detailed evaluations** of the **outputs** of the agentic system and the **steps** it has taken to produce those outputs.\n",
     "\n",
     "This dataset is called the **expected results** or the **ground-truth** dataset, and on this case it's a **handcrafted** dataset with **inputs, oputputs and trajectory** for a few known use cases.\n",
     "\n",
-    "Those evaluations are run by Langfuse and the results are visualized there."
+    "The evaluations are run by Langfuse and the results are visualized there."
    ]
   },
   {
@@ -99,7 +99,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Ground truth size: 7\n",
+      "Ground-truth dataset size: 7\n",
       "First element:\n",
       "{'expected_output': {'final_report': {'filename': 'monthly_sales_performance_report.xlsx',\n",
       "                                      'report_columns': ['SalesMonth',\n",
@@ -148,7 +148,7 @@
     "with open(\"implementations/report_generation/data/OnlineRetailReportEval.json\") as f:\n",
     "    ground_truth = json.load(f)\n",
     "\n",
-    "print(f\"Ground truth size: {len(ground_truth)}\")\n",
+    "print(f\"Ground-truth dataset size: {len(ground_truth)}\")\n",
     "print(\"First element:\")\n",
     "pprint(ground_truth[0])"
    ]
@@ -158,7 +158,7 @@
    "id": "06eeee29-b23e-4bad-b20f-ef32610a5570",
    "metadata": {},
    "source": [
-    "Here is an explanation of the data structure of each one of the elements:\n",
+    "Here is an explanation of the data structure of the dataset samples:\n",
     "```python\n",
     "{\n",
     "    'id': str,  # The ID of the sample\n",
@@ -307,145 +307,52 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2026-02-17 18:31:55,626 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
-      "2026-02-17 18:31:55,649 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
-      "2026-02-17 18:31:55,661 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-17 18:31:55,724 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:55,729 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-17 18:31:55,785 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:55,786 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-17 18:31:55,852 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:55,854 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
-      "2026-02-17 18:31:55,913 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:55,914 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
-      "2026-02-17 18:31:55,968 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:58,408 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:31:58,408 WARNING google_genai.types: Warning: there are non-text parts in the response: ['function_call'], returning concatenated text result from text parts. Check the full candidates.content.parts accessor to get the full model response.\n",
-      "2026-02-17 18:31:58,671 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:59,194 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:31:59,298 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:59,373 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:31:59,379 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:31:59,501 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:59,594 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:31:59,660 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:31:59,895 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:02,848 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:02,951 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:03,801 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:03,864 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:04,305 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:04,548 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:04,616 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:04,670 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:06,751 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:07,262 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:08,784 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:08,837 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:09,194 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:09,199 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
-      "2026-02-17 18:32:09,261 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:09,740 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:09,901 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:12,921 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:12,930 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
-      "2026-02-17 18:32:12,995 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:13,178 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:13,545 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:13,978 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:14,024 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:14,871 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:14,951 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:18,602 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:18,608 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
-      "2026-02-17 18:32:18,685 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:18,934 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:19,199 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:19,447 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:19,647 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:22,304 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:22,589 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:22,594 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:22,657 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:32:33,571 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:32:40,596 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:40,602 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:40,604 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:40,608 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
-      "2026-02-17 18:32:40,899 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:40,980 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:41,050 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:41,062 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the top 5 selling products per year and the total sales value for each product....'\n",
-      "2026-02-17 18:32:41,118 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:32:43,951 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:32:44,000 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:32:58,072 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:33:10,435 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:10,452 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:10,458 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:10,468 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:11,190 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:11,193 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a monthly sales performance report....'\n",
-      "2026-02-17 18:33:11,237 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:11,286 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:11,343 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:11,391 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:13,913 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:13,986 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:33:23,634 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:33:36,143 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:36,149 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:36,151 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:36,155 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:36,513 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:36,581 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:36,751 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:36,863 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:39,846 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:40,254 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:40,880 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:40,927 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:43,671 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:43,683 WARNING aieng.agent_evals.tools.sql_database: CTE usage blocked by policy\n",
-      "2026-02-17 18:33:43,736 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:44,267 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:44,677 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:46,484 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:47,836 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:47,847 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:47,942 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:49,927 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:50,399 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:52,986 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:53,529 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:53,564 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:53,769 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:54,601 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:54,699 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:33:56,271 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:33:56,353 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:34:04,512 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:34:11,165 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:11,175 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:11,318 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:34:25,177 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:34:36,448 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:36,518 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:34:37,061 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:34:41,909 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:42,008 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:34:42,140 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:42,595 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:34:45,601 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:46,134 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:34:47,264 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:34:47,402 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:34:56,161 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:35:09,004 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:35:09,226 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-17 18:35:12,197 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-17 18:35:12,271 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n",
-      "2026-02-17 18:35:25,116 INFO google_genai.models: AFC is enabled with max remote calls: 10.\n"
+      "\u001b[2K\u001b[2mUninstalled \u001b[1m2 packages\u001b[0m \u001b[2min 51ms\u001b[0m\u001b[0m                                           \n",
+      "\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 20ms\u001b[0m\u001b[0m                                \u001b[0m\n",
+      "2026-02-19 10:08:39,162 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
+      "2026-02-19 10:08:39,201 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
+      "2026-02-19 10:08:39,215 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 10:08:39,280 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:39,282 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-19 10:08:39,330 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:39,331 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-19 10:08:39,377 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:39,379 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
+      "2026-02-19 10:08:39,429 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:39,430 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
+      "2026-02-19 10:08:39,474 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:40,991 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-19 10:08:41,060 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:41,061 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
+      "2026-02-19 10:08:41,108 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:41,191 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 10:08:41,270 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:41,377 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
+      "2026-02-19 10:08:41,430 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:42,133 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-19 10:08:42,207 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:43,903 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-19 10:08:43,977 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:43,993 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 10:08:44,038 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:44,681 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-19 10:08:44,756 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:44,815 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
+      "2026-02-19 10:08:44,866 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:48,607 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-19 10:08:48,682 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:49,481 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-19 10:08:49,558 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:50,918 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 10:08:50,998 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:57,746 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
+      "2026-02-19 10:08:57,823 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:58,032 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
+      "2026-02-19 10:08:58,113 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:59,461 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the top 5 selling products per year and the total sales value for each product....'\n",
+      "2026-02-19 10:08:59,530 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 10:08:59,837 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 10:08:59,913 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n"
      ]
     }
    ],

From d0724f8f3e53fd28db9893bc418102d32b934ed8 Mon Sep 17 00:00:00 2001
From: Marcelo Lotif 
Date: Thu, 19 Feb 2026 11:46:32 -0500
Subject: [PATCH 5/6] Some other small improvements

---
 .../report_generation/evaluation/offline.py   | 46 +++++++-------
 .../03_Running_Offline_Evaluations.ipynb      | 61 ++++---------------
 implementations/report_generation/evaluate.py |  9 ++-
 3 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py
index 0b0a12d..57383a3 100644
--- a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py
+++ b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py
@@ -362,25 +362,29 @@ async def run_agent_with_retry(agent: Agent, agent_input: str) -> list[Event]:
     list[Event]
         The events from the agent run.
     """
-    logger.info(f"Running agent {agent.name} with input '{agent_input[:100]}...'")
-
-    # Create session and runner
-    session_service = InMemorySessionService()
-    runner = Runner(app_name=agent.name, agent=agent, session_service=session_service)
-    current_session = await session_service.create_session(
-        app_name=agent.name,
-        user_id="user",
-        state={},
-    )
-
-    # create the user message and run the agent
-    content = Content(role="user", parts=[Part(text=agent_input)])
-    events = []
-    async for event in runner.run_async(
-        user_id="user",
-        session_id=current_session.id,
-        new_message=content,
-    ):
-        events.append(event)
+    try:
+        logger.info(f"Running agent {agent.name} with input '{agent_input[:100]}...'")
+
+        # Create session and runner
+        session_service = InMemorySessionService()
+        runner = Runner(app_name=agent.name, agent=agent, session_service=session_service)
+        current_session = await session_service.create_session(
+            app_name=agent.name,
+            user_id="user",
+            state={},
+        )
 
-    return events
+        # create the user message and run the agent
+        content = Content(role="user", parts=[Part(text=agent_input)])
+        events = []
+        async for event in runner.run_async(
+            user_id="user",
+            session_id=current_session.id,
+            new_message=content,
+        ):
+            events.append(event)
+
+        return events
+    except Exception as e:
+        logger.error(f"Error running agent {agent.name} with input '{agent_input[:100]}...': {e}")
+        raise e  # raising the exception so the retry mechanism can try again
diff --git a/implementations/report_generation/03_Running_Offline_Evaluations.ipynb b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb
index abae7d7..2c1925d 100644
--- a/implementations/report_generation/03_Running_Offline_Evaluations.ipynb
+++ b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb
@@ -307,61 +307,26 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\u001b[2K\u001b[2mUninstalled \u001b[1m2 packages\u001b[0m \u001b[2min 51ms\u001b[0m\u001b[0m                                           \n",
-      "\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 20ms\u001b[0m\u001b[0m                                \u001b[0m\n",
-      "2026-02-19 10:08:39,162 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
-      "2026-02-19 10:08:39,201 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
-      "2026-02-19 10:08:39,215 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 10:08:39,280 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:39,282 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-19 10:08:39,330 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:39,331 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-19 10:08:39,377 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:39,379 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
-      "2026-02-19 10:08:39,429 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:39,430 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
-      "2026-02-19 10:08:39,474 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:40,991 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-19 10:08:41,060 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:41,061 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on sales revenue by country per year....'\n",
-      "2026-02-19 10:08:41,108 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:41,191 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 10:08:41,270 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:41,377 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
-      "2026-02-19 10:08:41,430 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:42,133 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-19 10:08:42,207 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:43,903 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-19 10:08:43,977 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:43,993 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 10:08:44,038 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:44,681 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-19 10:08:44,756 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:44,815 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the 5 highest-value customers per year vs. the average customer....'\n",
-      "2026-02-19 10:08:44,866 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:48,607 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-19 10:08:48,682 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:49,481 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-19 10:08:49,558 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:50,918 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 10:08:50,998 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:57,746 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the average order value per invoice per month....'\n",
-      "2026-02-19 10:08:57,823 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:58,032 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report with the month-over-month trends in sales. The report should include the monthly s...'\n",
-      "2026-02-19 10:08:58,113 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:59,461 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report of the top 5 selling products per year and the total sales value for each product....'\n",
-      "2026-02-19 10:08:59,530 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 10:08:59,837 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 10:08:59,913 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n"
+      "2026-02-19 11:42:54,641 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
+      "2026-02-19 11:42:54,682 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
+      "2026-02-19 11:42:54,698 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 11:42:54,746 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 11:42:56,287 ERROR aieng.agent_evals.report_generation.evaluation.offline: Error running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...': 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}\n",
+      "2026-02-19 11:42:57,288 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
+      "2026-02-19 11:42:57,362 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
+      "2026-02-19 11:44:20,051 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
+      "2026-02-19 11:44:20,051 WARNING google_genai.types: Warning: there are non-text parts in the response: ['function_call'], returning concatenated text result from text parts. Check the full candidates.content.parts accessor to get the full model response.\n",
+      "2026-02-19 11:44:20,128 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n"
      ]
     }
    ],
    "source": [
     "# Running as a CLI command to avoid issues between Langfuse's\n",
     "# experiment runner and Jupyter\n",
-    "# NOTE: This will take a while to execute\n",
+    "# NOTE: This will take a while to execute in a notebook environment\n",
+    "# It runs faster when executed in a regular console session\n",
     "\n",
-    "!uv run --env-file .env python -m implementations.report_generation.evaluate"
+    "!uv run --env-file .env python -m implementations.report_generation.evaluate --max-concurrency 1"
    ]
   },
   {
diff --git a/implementations/report_generation/evaluate.py b/implementations/report_generation/evaluate.py
index d46fda4..6fc48cb 100644
--- a/implementations/report_generation/evaluate.py
+++ b/implementations/report_generation/evaluate.py
@@ -31,7 +31,13 @@
     default=DEFAULT_EVALUATION_DATASET_NAME,
     help="Name of the Langfuse dataset to evaluate against.",
 )
-def cli(dataset_name: str):
+@click.option(
+    "--max-concurrency",
+    default=5,
+    type=int,
+    help="Maximum concurrent agent runs (default: 5).",
+)
+def cli(dataset_name: str, max_concurrency: int):
     """Command line interface to call the evaluate function.
 
     Parameters
@@ -45,6 +51,7 @@ def cli(dataset_name: str):
             dataset_name,
             reports_output_path=get_reports_output_path(),
             langfuse_project_name=get_langfuse_project_name(),
+            max_concurrency=max_concurrency,
         )
     )
 

From 31227837c6b6f64909f9e77c84f9fcff638a646d Mon Sep 17 00:00:00 2001
From: Marcelo Lotif 
Date: Thu, 19 Feb 2026 15:50:40 -0500
Subject: [PATCH 6/6] CR by Amrit

---
 .../01_Importing_the_Dataset.ipynb            | 246 +-----------------
 .../02_Running_The_Agent.ipynb                |  40 +--
 .../03_Running_Offline_Evaluations.ipynb      | 132 +---------
 .../data/import_online_retail_data.py         |   2 +-
 4 files changed, 22 insertions(+), 398 deletions(-)

diff --git a/implementations/report_generation/01_Importing_the_Dataset.ipynb b/implementations/report_generation/01_Importing_the_Dataset.ipynb
index a4d5b99..41b9225 100644
--- a/implementations/report_generation/01_Importing_the_Dataset.ipynb
+++ b/implementations/report_generation/01_Importing_the_Dataset.ipynb
@@ -32,19 +32,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "4cc2db20-296f-4822-916c-b8255073c066",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The notebook path has been set to: /Users/marcelolotif/workspace/eval-agents\n",
-      "All environment variables have been set.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
     "import ssl\n",
@@ -103,21 +94,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "554f1cc6-c42f-4fe3-8857-214fcbeafd95",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading the dataset...\n",
-      "Extracting the dataset file...\n",
-      "Converting the dataset file from .xls to .csv...\n",
-      "Done!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "url = \"https://archive.ics.uci.edu/static/public/352/online+retail.zip\"\n",
     "zip_file_path = DATA_FOLDER / \"online_retail.zip\"\n",
@@ -150,203 +130,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "123d37f3-fd6f-4676-8f84-8bcfa45a0535",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850.0United Kingdom
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE62010-12-01 08:26:003.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.62010-12-01 08:26:003.3917850.0United Kingdom
...........................
54190458158722613PACK OF 20 SPACEBOY NAPKINS122011-12-09 12:50:000.8512680.0France
54190558158722899CHILDREN'S APRON DOLLY GIRL62011-12-09 12:50:002.1012680.0France
54190658158723254CHILDRENS CUTLERY DOLLY GIRL42011-12-09 12:50:004.1512680.0France
54190758158723255CHILDRENS CUTLERY CIRCUS PARADE42011-12-09 12:50:004.1512680.0France
54190858158722138BAKING SET 9 PIECE RETROSPOT32011-12-09 12:50:004.9512680.0France
\n", - "

541909 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " InvoiceNo StockCode Description Quantity \\\n", - "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n", - "1 536365 71053 WHITE METAL LANTERN 6 \n", - "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", - "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", - "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", - "... ... ... ... ... \n", - "541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12 \n", - "541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6 \n", - "541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4 \n", - "541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4 \n", - "541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3 \n", - "\n", - " InvoiceDate UnitPrice CustomerID Country \n", - "0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n", - "1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n", - "3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "... ... ... ... ... 
\n", - "541904 2011-12-09 12:50:00 0.85 12680.0 France \n", - "541905 2011-12-09 12:50:00 2.10 12680.0 France \n", - "541906 2011-12-09 12:50:00 4.15 12680.0 France \n", - "541907 2011-12-09 12:50:00 4.15 12680.0 France \n", - "541908 2011-12-09 12:50:00 4.95 12680.0 France \n", - "\n", - "[541909 rows x 8 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df = pd.read_csv(DATASET_PATH)\n", "df # noqa: B018" @@ -364,27 +151,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "ee28609b-6eed-4aea-a5e0-c7d6df57e0af", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-02-17 14:08:56,816 INFO implementations.report_generation.data.import_online_retail_data: Creating tables according to the OnlineRetail.ddl file\n", - "2026-02-17 14:08:56,820 INFO implementations.report_generation.data.import_online_retail_data: Importing dataset from implementations/report_generation/data/OnlineRetail.csv to database at implementations/report_generation/data/OnlineRetail.db\n", - "2026-02-17 14:09:00,205 INFO implementations.report_generation.data.import_online_retail_data: Dataset imported successfully to database at implementations/report_generation/data/OnlineRetail.db\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done!\n" - ] - } - ], + "outputs": [], "source": [ "import_online_retail_data(DATASET_PATH)\n", "print(\"Done!\")" diff --git a/implementations/report_generation/02_Running_The_Agent.ipynb b/implementations/report_generation/02_Running_The_Agent.ipynb index 7fc8628..0a61e0f 100644 --- a/implementations/report_generation/02_Running_The_Agent.ipynb +++ b/implementations/report_generation/02_Running_The_Agent.ipynb @@ -32,19 +32,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "66b46b94-8e30-4627-9265-4fe09af2a4c2", "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Notebook path is already the root path: /Users/marcelolotif/workspace/eval-agents\n", - "All environment variables have been set.\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", @@ -112,33 +103,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "fceeac6a-309d-484f-98dc-227f875387a6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "* Running on local URL: http://127.0.0.1:7863\n", - "* Running on public URL: https://cf16cc30b2b942de91.gradio.live\n", - "\n", - "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" - ] - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "await start_gradio_app(\n", " enable_trace=True,\n", diff --git a/implementations/report_generation/03_Running_Offline_Evaluations.ipynb b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb index 2c1925d..81561c2 100644 --- a/implementations/report_generation/03_Running_Offline_Evaluations.ipynb +++ b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb @@ -9,7 +9,7 @@ "\n", "Offline evaluations are evaluations run against a **pre-defined dataset**. It performs **detailed evaluations** of the **outputs** of the agentic system and the **steps** it has taken to produce those evaluations.\n", "\n", - "This dataset is called the **expected results** or the **ground-truth** dataset, and on this case it's a **handcrafted** dataset with **inputs, oputputs and trajectory** for a few known use cases.\n", + "This dataset is called the **expected results** or the **ground-truth** dataset, and on this case it's a **handcrafted** dataset with **inputs, outputs and trajectory** for a few known use cases.\n", "\n", "The evaluations are run by Langfuse and the results are visualized there." 
] @@ -28,19 +28,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "3adfd868-c4da-4bdb-ae13-0ebe56c3ed97", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The notebook path has been set to: /Users/marcelolotif/workspace/eval-agents\n", - "All environment variables have been set.\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", "import os\n", @@ -91,59 +82,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "9e943f83-7178-43bc-be17-0e550564002f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ground-truth dataset size: 7\n", - "First element:\n", - "{'expected_output': {'final_report': {'filename': 'monthly_sales_performance_report.xlsx',\n", - " 'report_columns': ['SalesMonth',\n", - " 'TotalSales'],\n", - " 'report_data': [['2010-12', 748957.02],\n", - " ['2011-01', 560000.26],\n", - " ['2011-02', 498062.65],\n", - " ['2011-03', 683267.08],\n", - " ['2011-04', 493207.12],\n", - " ['2011-05', 723333.51],\n", - " ['2011-06', 691123.12],\n", - " ['2011-07', 681300.11],\n", - " ['2011-08', 682680.51],\n", - " ['2011-09', 1019687.62],\n", - " ['2011-10', 1070704.67],\n", - " ['2011-11', 1461756.25],\n", - " ['2011-12', 433668.01]]},\n", - " 'trajectory': {'actions': ['get_schema_info',\n", - " 'execute',\n", - " 'execute',\n", - " 'write_xlsx',\n", - " 'output_text'],\n", - " 'description': ['Check what are the tables '\n", - " 'that are available in the '\n", - " 'database',\n", - " 'Check what are the '\n", - " 'columns that are '\n", - " 'available in the sales '\n", - " 'table',\n", - " 'Query to retrieve the '\n", - " 'sales performance '\n", - " '(quantity * price) per '\n", - " 'month',\n", - " 'Send the report data to '\n", - " 'the function that writes '\n", - " 'the report to disk',\n", - " 'Output text to the user '\n", - " 'with the report file as a '\n", - " 'Gradio 
hyperlink']}},\n", - " 'id': '1',\n", - " 'input': 'Generate a monthly sales performance report.'}\n" - ] - } - ], + "outputs": [], "source": [ "with open(\"implementations/report_generation/data/OnlineRetailReportEval.json\") as f:\n", " ground_truth = json.load(f)\n", @@ -192,49 +134,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "bd692600-d936-4c7b-af94-5ddd19b59bfd", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-02-17 18:13:52,719 INFO aieng.agent_evals.langfuse: Loading dataset from 'implementations/report_generation/data/OnlineRetailReportEval.json'\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9e6b0cea9e7e400a8be7d1249f314da6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2026-02-17 18:13:54,052 INFO aieng.agent_evals.langfuse: Uploaded 7 items to dataset 'OnlineRetailReportEval'\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "await upload_dataset_to_langfuse(\n",
     "    EVALUATION_DATASET_PATH,\n",
@@ -294,7 +197,7 @@
    "source": [
     "## Running the Evaluations\n",
     "\n",
-    "To run those two evaluatoirs against all of the ground-truth dataset samples, run the function below:"
+    "To run those two evaluators against all of the ground-truth dataset samples, run the function below:"
    ]
   },
   {
@@ -302,24 +205,7 @@
    "execution_count": null,
    "id": "77d38a16-526a-4740-953a-dc735a9f83f7",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2026-02-19 11:42:54,641 WARNING opentelemetry.trace: Overriding of current TracerProvider is not allowed\n",
-      "2026-02-19 11:42:54,682 INFO aieng.agent_evals.langfuse: Langfuse tracing initialized successfully (endpoint: https://us.cloud.langfuse.com/api/public/otel)\n",
-      "2026-02-19 11:42:54,698 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 11:42:54,746 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 11:42:56,287 ERROR aieng.agent_evals.report_generation.evaluation.offline: Error running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...': 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'This model is currently experiencing high demand. Spikes in demand are usually temporary. Please try again later.', 'status': 'UNAVAILABLE'}}\n",
-      "2026-02-19 11:42:57,288 INFO aieng.agent_evals.report_generation.evaluation.offline: Running agent ReportGenerationAgent with input 'Generate a report on the average amount spent by one time buyers for each year vs. the average custo...'\n",
-      "2026-02-19 11:42:57,362 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n",
-      "2026-02-19 11:44:20,051 INFO google_adk.google.adk.models.google_llm: Response received from the model.\n",
-      "2026-02-19 11:44:20,051 WARNING google_genai.types: Warning: there are non-text parts in the response: ['function_call'], returning concatenated text result from text parts. Check the full candidates.content.parts accessor to get the full model response.\n",
-      "2026-02-19 11:44:20,128 INFO google_adk.google.adk.models.google_llm: Sending out request, model: gemini-3-pro-preview, backend: GoogleLLMVariant.GEMINI_API, stream: False\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Running as a CLI command to avoid issues between Langfuse's\n",
     "# experiment runner and Jupyter\n",
diff --git a/implementations/report_generation/data/import_online_retail_data.py b/implementations/report_generation/data/import_online_retail_data.py
index 9c93839..f469d6e 100644
--- a/implementations/report_generation/data/import_online_retail_data.py
+++ b/implementations/report_generation/data/import_online_retail_data.py
@@ -26,7 +26,7 @@
 
 
 @click.command()
-@click.option("--dataset-path", required=True, help="OnlieRetail dataset CSV path.")
+@click.option("--dataset-path", required=True, help="OnlineRetail dataset CSV path.")
 def cli(dataset_path: str) -> None:
     """CLI entry point to import the Online Retail dataset to the database.