diff --git a/.gitignore b/.gitignore index 9ceead8..59e362d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,9 @@ wheels/ .gradio .adk *.db +*.db-journal implementations/report_generation/reports/* +implementations/report_generation/data/*.zip +implementations/report_generation/data/*.csv +implementations/report_generation/data/*.xls +implementations/report_generation/data/*.xlsx diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py index 1374490..9906f32 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/agent.py @@ -75,8 +75,8 @@ def get_report_generation_agent( model=client_manager.configs.default_worker_model, instruction=instructions, tools=[ - db_manager.report_generation_db().execute, db_manager.report_generation_db().get_schema_info, + db_manager.report_generation_db().execute, report_file_writer.write_xlsx, ], after_agent_callback=after_agent_callback, diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py index 0b0a12d..57383a3 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/evaluation/offline.py @@ -362,25 +362,29 @@ async def run_agent_with_retry(agent: Agent, agent_input: str) -> list[Event]: list[Event] The events from the agent run. 
""" - logger.info(f"Running agent {agent.name} with input '{agent_input[:100]}...'") - - # Create session and runner - session_service = InMemorySessionService() - runner = Runner(app_name=agent.name, agent=agent, session_service=session_service) - current_session = await session_service.create_session( - app_name=agent.name, - user_id="user", - state={}, - ) - - # create the user message and run the agent - content = Content(role="user", parts=[Part(text=agent_input)]) - events = [] - async for event in runner.run_async( - user_id="user", - session_id=current_session.id, - new_message=content, - ): - events.append(event) + try: + logger.info(f"Running agent {agent.name} with input '{agent_input[:100]}...'") + + # Create session and runner + session_service = InMemorySessionService() + runner = Runner(app_name=agent.name, agent=agent, session_service=session_service) + current_session = await session_service.create_session( + app_name=agent.name, + user_id="user", + state={}, + ) - return events + # create the user message and run the agent + content = Content(role="user", parts=[Part(text=agent_input)]) + events = [] + async for event in runner.run_async( + user_id="user", + session_id=current_session.id, + new_message=content, + ): + events.append(event) + + return events + except Exception as e: + logger.error(f"Error running agent {agent.name} with input '{agent_input[:100]}...': {e}") + raise e # raising the exception so the retry mechanism can try again diff --git a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py index 8c33585..733bbc5 100644 --- a/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py +++ b/aieng-eval-agents/aieng/agent_evals/report_generation/file_writer.py @@ -38,7 +38,7 @@ def __init__(self, reports_output_path: Path): def write_xlsx( self, - report_data: list[Any], + report_data: list[list[Any]], report_columns: list[str], filename: 
str = "report.xlsx", gradio_link: bool = True, @@ -47,7 +47,7 @@ def write_xlsx( Parameters ---------- - report_data : list[Any] + report_data : list[list[Any]] The data of the report. report_columns : list[str] The columns of the report. diff --git a/implementations/report_generation/01_Importing_the_Dataset.ipynb b/implementations/report_generation/01_Importing_the_Dataset.ipynb new file mode 100644 index 0000000..41b9225 --- /dev/null +++ b/implementations/report_generation/01_Importing_the_Dataset.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cc016c29-e5e8-4338-9a31-0fa6700505ff", + "metadata": {}, + "source": [ + "# Importing the Dataset for the Report Generation Agent\n", + "\n", + "This notebook implements the **data import** for the **Report Generation Agent** for single-table relational\n", + "data source.\n", + "\n", + "The data source implemented here is an [SQLite](https://sqlite.org/) database which is supported\n", + "natively by Python and saves the data in disk.\n", + "[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this\n", + "SQL connection can be easily swapped for other databases.\n", + "\n", + "The SQL Alchemy tool is set up to allow **read-only queries**, so there is **no risk** the agent runs queries that can modify the DB data." + ] + }, + { + "cell_type": "markdown", + "id": "8499a56f-716f-47a6-b255-8bdbbe0fd777", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cc2db20-296f-4822-916c-b8255073c066", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import ssl\n", + "import urllib.request\n", + "import zipfile\n", + "from pathlib import Path\n", + "\n", + "import certifi\n", + "import pandas as pd\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "\n", + "print(\"All environment variables have been set.\")\n", + "\n", + "DATA_FOLDER = Path(\"implementations/report_generation/data\")\n", + "DATASET_PATH = DATA_FOLDER / \"OnlineRetail.csv\"\n", + "\n", + "from implementations.report_generation.data.import_online_retail_data import import_online_retail_data # noqa: E402" + ] + }, + { + "cell_type": "markdown", + "id": "0aa0bdf0-a7ba-4458-868b-07d627b12ed9", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The dataset used in this example is the\n", + "**[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset**. It contains\n", + "information about **invoices** for products that were purchased by customers, which also includes\n", + "product quantity, the invoice date and the country that the customer resides in. 
For a more\n", + "detailed data structure, please check the [OnlineRetail.ddl](http://localhost:8888/lab/tree/implementations/report_generation/data/OnlineRetail.ddl) file." + ] + }, + { + "cell_type": "markdown", + "id": "553dceaa-8fe7-4e4f-9940-b2d1e8d8d6ee", + "metadata": {}, + "source": [ + "## Downloading the Dataset\n", + "\n", + "The code below will **download and unzip** the dataset to the `implementations/report_generation/data/` folder." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554f1cc6-c42f-4fe3-8857-214fcbeafd95", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://archive.ics.uci.edu/static/public/352/online+retail.zip\"\n", + "zip_file_path = DATA_FOLDER / \"online_retail.zip\"\n", + "xlsx_file_path = DATA_FOLDER / \"Online Retail.xlsx\"\n", + "\n", + "print(\"Downloading the dataset...\")\n", + "ctx = ssl.create_default_context(cafile=certifi.where())\n", + "req = urllib.request.Request(url)\n", + "with urllib.request.urlopen(req, context=ctx) as resp, open(zip_file_path, \"wb\") as f:\n", + " f.write(resp.read())\n", + "\n", + "print(\"Extracting the dataset file...\")\n", + "with zipfile.ZipFile(zip_file_path, \"r\") as zf:\n", + " zf.extractall(DATA_FOLDER)\n", + "\n", + "print(\"Converting the dataset file from .xls to .csv...\")\n", + "df = pd.read_excel(xlsx_file_path)\n", + "df.to_csv(DATASET_PATH, index=False)\n", + "\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "markdown", + "id": "2e4e45de-1f07-41de-bf9c-f9c2543b8cb3", + "metadata": {}, + "source": [ + "## Visualizing the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "123d37f3-fd6f-4676-8f84-8bcfa45a0535", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(DATASET_PATH)\n", + "df # noqa: B018" + ] + }, + { + "cell_type": "markdown", + "id": "ec3a84cb-3636-460a-a9bd-e6ea57d3f9d7", + "metadata": {}, + "source": [ + "## Importing the Data\n", + "\n", + "The code below will import the `.csv` 
dataset to the database at the path set by the `REPORT_GENERATION_DB__DATABASE` environment variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee28609b-6eed-4aea-a5e0-c7d6df57e0af", + "metadata": {}, + "outputs": [], + "source": [ + "import_online_retail_data(DATASET_PATH)\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "markdown", + "id": "ce2ab32d-859f-44b1-9c54-cf9c2c1f1da4", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Now the data should be ready to be consumed by the agent on the **next notebook**." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/report_generation/02_Running_The_Agent.ipynb b/implementations/report_generation/02_Running_The_Agent.ipynb new file mode 100644 index 0000000..0a61e0f --- /dev/null +++ b/implementations/report_generation/02_Running_The_Agent.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66ea5037-7ad4-4e2e-b55d-f879fa9dc436", + "metadata": {}, + "source": [ + "# Running the Report Generation Agent\n", + "\n", + "This notebook runs the **Report Generation Agent** for **single-table relational\n", + "data source** as a Gradio Demo UI and evaluations with [Langfuse](https://langfuse.com/).\n", + "\n", + "The Report Generation Agent Gradio Demo will provide an UI to read **user queries in natural language**\n", + "and proceed to **make SQL queries** to the database in order to produce the data for\n", + "the report. 
At the end, the Agent will provide a **downloadable link** to the report as\n", + "an `.xlsx` file.\n", + "\n", + "This example also provides agent monitoring and evaluations using Langfuse." + ] + }, + { + "cell_type": "markdown", + "id": "1d72ee27-a68a-4349-af4f-fde4bca8663a", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b46b94-8e30-4627-9265-4fe09af2a4c2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_secret_key, (\n", + " \"[ERROR] The Langfuse secret key is not set! Please configure the LANGFUSE_SECRET_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_public_key, (\n", + " \"[ERROR] The Langfuse public key is not set! Please configure the LANGFUSE_PUBLIC_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_host, (\n", + " \"[ERROR] The Langfuse base URL is not set! 
Please configure the LANGFUSE_BASE_URL environment variable.\"\n", + ")\n", + "\n", + "from implementations.report_generation.demo import start_gradio_app # noqa: E402\n", + "\n", + "\n", + "print(\"All environment variables have been set.\")" + ] + }, + { + "cell_type": "markdown", + "id": "ec0522bc-01f5-4554-bc57-39de8dbb4c53", + "metadata": {}, + "source": [ + "## Running the Demo UI\n", + "\n", + "The code below will start the **Gradio Demo UI**, which will run embedded in this notebook (made possible by the `enable_public_link=True` parameter). If you so choose, you can also run the UI in your browser using the links provided in the outputs.\n", + "\n", + "The UI will display a few **pre-defined input options** to test the agent and also a text box so you can **make your own queries**. The Agent will try to complete the request by making **queries against the database** configured in the first notebook. At the end, it will **produce a report** that it will make available as a download link at the end of the agent's execution.\n", + "\n", + "The UI will display the **agents thoughts** and **tool calls** in order to facilitate debugging.\n", + "\n", + "The agent **traces will be sent** to [Langfuse](https://us.cloud.langfuse.com/). 
To configure the langfuse connection, you can edit the following environment variables in your `.env` file:\n", + "\n", + "```python\n", + "# Secret and public keys can be generated by the Langfuse web UI\n", + "LANGFUSE_SECRET_KEY=\"sk-lf-...\"\n", + "LANGFUSE_PUBLIC_KEY=\"pk-lf-...\"\n", + "LANGFUSE_BASE_URL=\"https://us.cloud.langfuse.com\"\n", + "\n", + "# Input the name of your langfuse project on the first variable\n", + "# The second variable is used to configure where the output reports are going to be saved\n", + "# Defaults are in implementations/report_generation/env_vars.py)\n", + "REPORT_GENERATION_LANGFUSE_PROJECT_NAME=\"...\"\n", + "REPORT_GENERATION_OUTPUT_PATH=\"...\"\n", + "```\n", + "\n", + "Run the cell below to see the Report Generation Agent in action." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fceeac6a-309d-484f-98dc-227f875387a6", + "metadata": {}, + "outputs": [], + "source": [ + "await start_gradio_app(\n", + " enable_trace=True,\n", + " enable_public_link=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3bdee1c5-f5d9-485a-9a48-f5aab2c41d04", + "metadata": {}, + "source": [ + "## Agent Configuration\n", + "\n", + "This agent is a **text-to-SQL** agent that will convert natural language into one or multiple queries to a DB in order to produce the result that has been requested.\n", + "\n", + "This agent in particular has **no knowledge of the DB schema**, although it is possible to do so with some more advanced techniques. This allows the agent to be **flexible** to any database it may encounter. 
However, it is **easier** to have it working with a **single-table DB** as opposed to a multi-table DB.\n", + "\n", + "It is configured with **three tools** (as per `aieng.agent_evals.evaluation.report_generation.agent.get_report_generation_agent`):\n", + "- `db_manager.report_generation_db().get_schema_info`: to retrieve the **DB schema** so the agent knows how to perform the queries.\n", + "- `db_manager.report_generation_db().execute`: to execute any **read-only SQL queries**.\n", + "- `report_file_writer.write_xlsx`: a function that receives the **report data** as an array, writes the array to a `.xlsx` file and returns a **downloadable Gradio link** to the file.\n", + "\n", + "The Agent will know how to use those tools given the **instructions** it has been given, along with the user input. Below are the instructions for this agent in specific (as per `aieng.agent_evals.evaluation.report_generation.prompts`:\n", + "```python\n", + "MAIN_AGENT_INSTRUCTIONS = \"\"\"\\\n", + "Perform the task using the SQLite database tool. \\\n", + "EACH TIME before invoking the function, you must explain your reasons for doing so. \\\n", + "If the SQL query did not return intended results, try again. \\\n", + "For best performance, divide complex queries into simpler sub-queries. \\\n", + "Do not make up information. \\\n", + "When the report is done, use the report file writer tool to write it to a file. \\\n", + "Make sure the \"write_xlsx\" tool is called so it generates the report file. \\\n", + "At the end, provide the report file as a downloadable hyperlink to the user. \\\n", + "Make sure the link can be clicked on by the user.\n", + "\"\"\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "b1d0398e-944a-4005-a9e3-44dbb050441a", + "metadata": {}, + "source": [ + "## Online Evaluations\n", + "\n", + "The Agent will be sending online evaluation metrics to Langfuse along with the traces. 
These metrics aim to **simulate** how an evaluation of a **production agentic system** would work for this use case.\n", + "\n", + "Those metrics are:\n", + "- A check of the **total token spent** to complete the request. If it is below the threshold of 15k tokens it will send a score of 1, or a score of 0 otherwise.\n", + "- A check of the **total time spent** to complete the request. If it is below the threshold of 60 seconds it will send a score of 1, or a score of 0 otherwise.\n", + "- At the end of the Agent run, the UI will display thumbs up/thumbs down buttons so the **users can send feedback** on the output of the agent.\n", + "\n", + "The calculations for the first two metrics are triggered by a **callback function** that runs at the end of the agent run, while the third one is triggered asynchronously by a **user UI action**.\n", + "\n", + "Those metrics will be available in Langfuse in two different ways:\n", + "- **Aggregated:** Summary of those metrics will be displayed on the dashboards page\n", + "- **Individualized:** The evaluation scores are available on each trace with additional details" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/report_generation/03_Running_Offline_Evaluations.ipynb b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb new file mode 100644 index 0000000..81561c2 --- /dev/null +++ b/implementations/report_generation/03_Running_Offline_Evaluations.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e1ddcbff-a950-42ee-be8b-a3663ae6b202", + "metadata": {}, + "source": 
[ + "## Running the Offline Evaluations for the Report Generation Agent\n", + "\n", + "Offline evaluations are evaluations run against a **pre-defined dataset**. It performs **detailed evaluations** of the **outputs** of the agentic system and the **steps** it has taken to produce those outputs.\n", + "\n", + "This dataset is called the **expected results** or the **ground-truth** dataset, and in this case it's a **handcrafted** dataset with **inputs, outputs and trajectory** for a few known use cases.\n", + "\n", + "The evaluations are run by Langfuse and the results are visualized there." + ] + }, + { + "cell_type": "markdown", + "id": "db74c461-bcda-49b2-bd4e-5ea961798d55", + "metadata": {}, + "source": [ + "## Setting up\n", + "\n", + "The code below sets the notebook default folder, sets the default constants and checks the presence of the environment variables.\n", + "\n", + "The environment variables can be set in the `.env` file in the root folder of the project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3adfd868-c4da-4bdb-ae13-0ebe56c3ed97", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "from pprint import pprint\n", + "\n", + "from aieng.agent_evals.async_client_manager import AsyncClientManager\n", + "from aieng.agent_evals.langfuse import upload_dataset_to_langfuse\n", + "\n", + "\n", + "# Setting the notebook directory to the project's root folder\n", + "if Path(\"\").absolute().name == \"eval-agents\":\n", + " print(f\"Notebook path is already the root path: {Path('').absolute()}\")\n", + "else:\n", + " os.chdir(Path(\"\").absolute().parent.parent)\n", + " print(f\"The notebook path has been set to: {Path('').absolute()}\")\n", + "\n", + "client_manager = AsyncClientManager.get_instance()\n", + "assert client_manager.configs.report_generation_db.database, (\n", + " \"[ERROR] The database path is not set! 
Please configure the REPORT_GENERATION_DB__DATABASE environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_secret_key, (\n", + " \"[ERROR] The Langfuse secret key is not set! Please configure the LANGFUSE_SECRET_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_public_key, (\n", + " \"[ERROR] The Langfuse public key is not set! Please configure the LANGFUSE_PUBLIC_KEY environment variable.\"\n", + ")\n", + "assert client_manager.configs.langfuse_host, (\n", + " \"[ERROR] The Langfuse base URL is not set! Please configure the LANGFUSE_BASE_URL environment variable.\"\n", + ")\n", + "\n", + "print(\"All environment variables have been set.\")\n", + "\n", + "\n", + "EVALUATION_DATASET_PATH = \"implementations/report_generation/data/OnlineRetailReportEval.json\"\n", + "LANGFUSE_DATASET_NAME = \"OnlineRetailReportEval\"" + ] + }, + { + "cell_type": "markdown", + "id": "b64fe764-9355-40c6-8632-fccca34acce9", + "metadata": {}, + "source": [ + "## Taking a Look at the Ground Truth Dataset\n", + "\n", + "The ground-truth dataset is located at `implementations/report_generation/data/OnlineRetailReportEval.json`. 
The code below will display one of its elements as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e943f83-7178-43bc-be17-0e550564002f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"implementations/report_generation/data/OnlineRetailReportEval.json\") as f:\n", + " ground_truth = json.load(f)\n", + "\n", + "print(f\"Ground-truth dataset size: {len(ground_truth)}\")\n", + "print(\"First element:\")\n", + "pprint(ground_truth[0])" + ] + }, + { + "cell_type": "markdown", + "id": "06eeee29-b23e-4bad-b20f-ef32610a5570", + "metadata": {}, + "source": [ + "Here is an explanation of the data structure of the dataset samples:\n", + "```python\n", + "{\n", + " 'id': str, # The ID of the sample\n", + " 'input': str, # The input to be used to test the report generation agent\n", + " 'expected_output': { # The expected outputs of the agent\n", + " 'final_report': { # The output data for the final report the agent generates. \n", + " # These values match the input the agent sends to the `write_xlsx` function\n", + " 'filename': str, # The name of the report file\n", + " 'report_columns': list[str], # The names of the columns of the report\n", + " 'report_data': list[list[Any]], # a bidimensional array of values for the rows of the report\n", + " }\n", + " 'trajectory': { # information about the trajectory the agent should take to produce the report\n", + " 'actions': list[str], # A list of the names of the actions the agent should take, in order\n", + " 'description': list[str], # A description of what the parameters that are sent to each one of\n", + " # the actions are supposed to be doing\n", + " }\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "69e993b7-a93d-468e-84e9-0e1906108335", + "metadata": {}, + "source": [ + "## Uploading the dataset to Langfuse\n", + "\n", + "Use the function below to **upload** the ground truth dataset to Langfuse so it can be used **during the evaluation**:" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd692600-d936-4c7b-af94-5ddd19b59bfd", + "metadata": {}, + "outputs": [], + "source": [ + "await upload_dataset_to_langfuse(\n", + " EVALUATION_DATASET_PATH,\n", + " LANGFUSE_DATASET_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "abc8c175-6315-41bf-800a-d913e3e2b5c0", + "metadata": {}, + "source": [ + "## LLM-as-a-judge Evaluators\n", + "\n", + "Two **LLM-as-a-judge evaluators** are set to run against this dataset and the agent's output:\n", + "1. A **Final Result Evaluator**, that will evaluate the agent's output against the contents of the `final_result` key\n", + "2. A **Trajectory Evaluator**, that will evaluate the agent's output against the contents of the `trajectory` key\n", + "\n", + "Here are the instructions for both of those agents (as per `aieng.agent_evals.evaluation.report_generation.prompts`):\n", + "```python\n", + "TRAJECTORY_EVALUATOR_INSTRUCTIONS = \"\"\"\\\n", + "You are evaluating if an agent has followed the correct trajectory to generate a report.\\\n", + "The agent is a Report Generation Agent that uses the SQLite database tool to generate a report\\\n", + "and return the report as a downloadable file to the user.\\\n", + "You will be presented with the \"Question\" that has been asked to the agent along with two sets of data:\\\n", + "- The \"Expected Trajectory\" of the agent, which contains:\\\n", + " - A list ids for the actions the agent is expected to perform\\\n", + " - A list of rough descriptions of what has been passed as parameters to the actions\\\n", + "- The \"Actual Trajectory\" of the agent, which contains:\\\n", + " - A list ids for the actions the agent performed\\\n", + " - A list of parameters that has been passed to each one of the actions\\\n", + "It's OK if the agent makes mistakes and performs additional steps, or if the queries do not exactly match\\\n", + "the description, as long as the queries performed end up satisfying the 
\"Question\".\\\n", + "It is important that the last action to be of type \"final_response\" and that it produces a link to the report file.\n", + "\"\"\"\n", + "\n", + "RESULT_EVALUATOR_INSTRUCTIONS = \"\"\"\\\n", + "Evaluate whether the \"Proposed Answer\" to the given \"Question\" matches the \"Ground Truth\". \\\n", + "Disregard the following aspects when comparing the \"Proposed Answer\" to the \"Ground Truth\": \\\n", + "- The order of the items should not matter, unless explicitly specified in the \"Question\". \\\n", + "- The formatting of the values should not matter, unless explicitly specified in the \"Question\". \\\n", + "- The column and row names have to be similar but not necessarily exact, unless explicitly specified in the \"Question\". \\\n", + "- The filename has to be similar by name but not necessarily exact, unless explicitly specified in the \"Question\". \\\n", + "- It is ok if the filename is missing. \\\n", + "- The numerical values should be equal with a tolerance of 0.01. \\\n", + "- The report data in the \"Proposed Answer\" should have the same number of rows as in the \"Ground Truth\". 
\\\n", + "- It is OK if the report data in the \"Proposed Answer\" contains extra columns or if the rows are in a different order, \\\n", + "unless explicitly specified in the \"Question\".\n", + "\"\"\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "c8b05537-2624-4df7-9f08-59d797378175", + "metadata": {}, + "source": [ + "## Running the Evaluations\n", + "\n", + "To run those two evaluators against all of the ground-truth dataset samples, run the function below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d38a16-526a-4740-953a-dc735a9f83f7", + "metadata": {}, + "outputs": [], + "source": [ + "# Running as a CLI command to avoid issues between Langfuse's\n", + "# experiment runner and Jupyter\n", + "# NOTE: This will take a while to execute in a notebook environment\n", + "# It runs faster when executed in a regular console session\n", + "\n", + "!uv run --env-file .env python -m implementations.report_generation.evaluate --max-concurrency 1" + ] + }, + { + "cell_type": "markdown", + "id": "d969ed04-7427-40e9-b51b-fa892da706ef", + "metadata": {}, + "source": [ + "## Checking the Results\n", + "\n", + "At the end of the run, you will see a summary in the console.\n", + "\n", + "To see detailed results of the evaluation runs:\n", + "1. Go to your project on Langfuse\n", + "2. Click on **Datasets**\n", + "3. Click on the dataset name\n", + "4. 
Click on one of the runs\n", + "\n", + "You will see a more detailed summary of the experiment run and also you can see the details of each of of the runs, including f" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4855cf8f-f386-449a-a777-4a5adee5d2e4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/implementations/report_generation/README.md b/implementations/report_generation/README.md index c43d3f9..bb559e3 100644 --- a/implementations/report_generation/README.md +++ b/implementations/report_generation/README.md @@ -3,22 +3,29 @@ This code implements an example of a Report Generation Agent for single-table relational data source, including a demo agent demo UI and evaluations with [Langfuse](https://langfuse.com/). -The data source implemented here is [SQLite](https://sqlite.org/) which is supported -natively by Python and saves the data in disk. +The data source implemented here is an [SQLite](https://sqlite.org/) database which is +supported natively by Python and saves the data in disk. +[SQLAlchemy](https://www.sqlalchemy.org/) is used as a SQL connection tool so this +SQL connection can be easily swapped for other databases. The Report Generation Agent will provide an UI to read user queries in natural language -and procceed to make SQL queries to the database in order to produce the data for +and proceed to make SQL queries to the database in order to produce the data for the report. At the end, the Agent will provide a downloadable link to the report as an `.xlsx` file. 
-This example also provide agent monitoring and evaluations using Langfuse. +This example also provides agent monitoring and evaluations using Langfuse. + +### Running the Demo + +To run the demo, you can choose to follow the steps below or follow the instructions in the notebooks in this folder. + ## Dataset The dataset used in this example is the [Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset. It contains information about invoices for products that were purchased by customers, which also includes -product quantity, the invoice date and country that the user resides in. For a more +product quantity, the invoice date and the country that the customer resides in. For a more detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file. ## Importing the Data diff --git a/implementations/report_generation/data/import_online_retail_data.py b/implementations/report_generation/data/import_online_retail_data.py index baee0a9..f469d6e 100644 --- a/implementations/report_generation/data/import_online_retail_data.py +++ b/implementations/report_generation/data/import_online_retail_data.py @@ -26,8 +26,19 @@ @click.command() -@click.option("--dataset-path", required=True, help="OnlieRetail dataset CSV path.") -def main(dataset_path: str): +@click.option("--dataset-path", required=True, help="OnlineRetail dataset CSV path.") +def cli(dataset_path: str) -> None: + """CLI entry point to import the Online Retail dataset to the database. + + Parameters + ---------- + dataset_path : str + The path to the CSV file containing the dataset. + """ + import_online_retail_data(dataset_path) + + +def import_online_retail_data(dataset_path: str) -> None: """Import the Online Retail dataset to the database. 
Parameters @@ -43,7 +54,6 @@ def main(dataset_path: str): db_path = client_manager.configs.report_generation_db.database assert Path(dataset_path).exists(), f"Dataset path {dataset_path} does not exist" - assert Path(db_path).parent.exists(), f"Database path {db_path} does not exist" conn = sqlite3.connect(db_path) logger.info("Creating tables according to the OnlineRetail.ddl file") @@ -76,6 +86,9 @@ def convert_date(date_str: str) -> str | None: str | None Converted date string in format 'YYYY-MM-DD HH:MM' or None if parsing fails. """ + if not is_date_in_format(date_str, "%m/%d/%y %H:%M") and not is_date_in_format(date_str, "%m/%d/%y H:%M"): + return date_str + if not date_str or date_str.strip() == "": return None @@ -110,5 +123,28 @@ def convert_date(date_str: str) -> str | None: return None +def is_date_in_format(value: str, fmt: str) -> bool: + """Check if a date string is in a given format. + + Parameters + ---------- + value : str + The date string to check. + fmt : str + The format to check the date string against. + Example: "%m/%d/%y %H:%M" or "%m/%d/%y H:%M". + + Returns + ------- + bool + True if the date string is in the given format, False otherwise. 
+ """ + try: + datetime.strptime(value, fmt) + return True + except ValueError: + return False + + if __name__ == "__main__": - main() + cli() diff --git a/implementations/report_generation/demo.py b/implementations/report_generation/demo.py index 795ef38..ca6ba07 100644 --- a/implementations/report_generation/demo.py +++ b/implementations/report_generation/demo.py @@ -194,15 +194,7 @@ def toggle_feedback_row() -> tuple[dict[str, Any], dict[str, Any]]: return gr.update(visible=trace_id is not None and trace_id != ""), gr.update(visible=False) -@click.command() -@click.option("--enable-trace", required=False, default=True, help="Whether to enable tracing with Langfuse.") -@click.option( - "--enable-public-link", - required=False, - default=False, - help="Whether to enable public link for the Gradio app.", -) -def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False) -> None: +async def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False) -> None: """Start the Gradio app with the agent session handler. Parameters @@ -265,8 +257,36 @@ def start_gradio_app(enable_trace: bool = True, enable_public_link: bool = False ) finally: DbManager.get_instance().close() - asyncio.run(AsyncClientManager.get_instance().close()) + await AsyncClientManager.get_instance().close() + + +@click.command() +@click.option("--enable-trace", required=False, default=True, help="Whether to enable tracing with Langfuse.") +@click.option( + "--enable-public-link", + required=False, + default=False, + help="Whether to enable public link for the Gradio app.", +) +def cli(enable_trace: bool = True, enable_public_link: bool = False) -> None: + """CLI entry point to start the Gradio app. + + Parameters + ---------- + enable_trace : bool, optional + Whether to enable tracing with Langfuse for evaluation purposes. + Default is True. + enable_public_link : bool, optional + Whether to enable public link for the Gradio app. 
If True, + will make the Gradio app available at a public URL. Default is False. + """ + asyncio.run( + start_gradio_app( + enable_trace=enable_trace, + enable_public_link=enable_public_link, + ) + ) if __name__ == "__main__": - start_gradio_app() + cli() diff --git a/implementations/report_generation/evaluate.py b/implementations/report_generation/evaluate.py index d46fda4..6fc48cb 100644 --- a/implementations/report_generation/evaluate.py +++ b/implementations/report_generation/evaluate.py @@ -31,7 +31,13 @@ default=DEFAULT_EVALUATION_DATASET_NAME, help="Name of the Langfuse dataset to evaluate against.", ) -def cli(dataset_name: str): +@click.option( + "--max-concurrency", + default=5, + type=int, + help="Maximum concurrent agent runs (default: 5).", +) +def cli(dataset_name: str, max_concurrency: int): """Command line interface to call the evaluate function. Parameters @@ -45,6 +51,7 @@ def cli(dataset_name: str): dataset_name, reports_output_path=get_reports_output_path(), langfuse_project_name=get_langfuse_project_name(), + max_concurrency=max_concurrency, ) ) diff --git a/pyproject.toml b/pyproject.toml index df429fe..e1189e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pyasn1>=0.6.2", "virtualenv>=20.36.1", "tenacity>=9.1.2", + "certifi>=2026.1.4", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 8d9d005..ab75598 100644 --- a/uv.lock +++ b/uv.lock @@ -26,6 +26,7 @@ dependencies = [ { name = "aiohttp" }, { name = "authlib" }, { name = "beautifulsoup4" }, + { name = "certifi" }, { name = "datasets" }, { name = "e2b-code-interpreter" }, { name = "filelock" }, @@ -89,6 +90,7 @@ requires-dist = [ { name = "aiohttp", specifier = ">=3.13.3" }, { name = "authlib", specifier = ">=1.6.6" }, { name = "beautifulsoup4", specifier = ">=4.13.4" }, + { name = "certifi", specifier = ">=2026.1.4" }, { name = "datasets", specifier = ">=3.6.0" }, { name = "e2b-code-interpreter", specifier = ">=2.4.1" }, { name = "filelock", 
specifier = ">=3.20.3" },