From f1cfe7d31799abf0b212e50be36c2925ad10ccbe Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Tue, 15 Jul 2025 15:08:28 -0400 Subject: [PATCH 01/17] change ubuntu version for stable, playwright-compatible one --- .github/workflows/unit_tests.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f5f1902..6ad0567 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,7 +11,7 @@ on: jobs: code-format: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Changed from ubuntu-latest defaults: run: shell: bash -l {0} @@ -36,7 +36,7 @@ jobs: run: black . --check browsergym-workarena-fast: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Changed from ubuntu-latest defaults: run: @@ -71,7 +71,7 @@ jobs: run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests browsergym-workarena-slow: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Changed from ubuntu-latest defaults: run: @@ -106,7 +106,7 @@ jobs: run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests end-to-end-tests: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Changed from ubuntu-latest if: github.event_name == 'schedule' defaults: run: @@ -131,4 +131,4 @@ jobs: SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} - run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests \ No newline at end of file From 69adad8906d13b4addfa67df2c27e0bf828857f9 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Tue, 15 Jul 2025 15:22:06 -0400 Subject: [PATCH 02/17] Update GitHub Actions workflow to run unit tests and E2E tests on weekdays; modify test parameterization to use dynamic task selection based on environment variable. --- .github/workflows/unit_tests.yml | 28 ++++++++++++++++++++++++---- tests/test_compositional.py | 17 ++++++++++++++++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6ad0567..2ae5a7f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -6,7 +6,11 @@ on: - main pull_request: schedule: - - cron: '59 23 * * SUN' # Runs at midnight on Sunday + - cron: '59 23 * * 1' + - cron: '59 23 * * 2' + - cron: '59 23 * * 3' + - cron: '59 23 * * 4' + - cron: '59 23 * * 5' jobs: @@ -104,10 +108,25 @@ jobs: SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests - + end-to-end-tests: - runs-on: ubuntu-22.04 # Changed from ubuntu-latest - if: github.event_name == 'schedule' + name: "E2E - ${{ matrix.test_category }}" + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + include: + - day_of_week: '1' + test_category: 'planning_and_problem_solving' + - day_of_week: '2' + test_category: 'information_retrieval' + - day_of_week: '3' + test_category: 'data_driven_decision_making_and_reasoning' + - day_of_week: '4' + test_category: 'sophisticated_memory' + - day_of_week: '5' + test_category: 'contextual_understanding_infeasible_tasks' + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * ${{ matrix.day_of_week }}' defaults: run: shell: bash -l {0} @@ -131,4 +150,5 @@ jobs: SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_CATEGORY: ${{ matrix.test_category }} run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests \ No newline at end of file diff --git a/tests/test_compositional.py b/tests/test_compositional.py index 716a499..c8f3430 100644 --- a/tests/test_compositional.py +++ b/tests/test_compositional.py @@ -4,6 +4,7 @@ """ import logging +import os import pytest @@ -12,7 +13,10 @@ from playwright.sync_api import Page, TimeoutError from tenacity import retry, stop_after_attempt, retry_if_exception_type + from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents +from browsergym.workarena.tasks.compositional.utils.curriculum import AGENT_CURRICULUM + AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2") @@ -38,13 +42,24 @@ sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET ] +test_category = os.environ.get("TEST_CATEGORY") + +if test_category: + tasks_to_test = [] + items = AGENT_CURRICULUM.get(test_category) + if items: + for bucket in items["buckets"]: + tasks_to_test.extend(bucket) +else: + tasks_to_test = ALL_COMPOSITIONAL_TASKS + @retry( stop=stop_after_attempt(5), reraise=True, before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), ) -@pytest.mark.parametrize("task_entrypoint", ALL_COMPOSITIONAL_TASKS) +@pytest.mark.parametrize("task_entrypoint", tasks_to_test) @pytest.mark.parametrize("random_seed", range(1)) @pytest.mark.parametrize("level", range(2, 4)) @pytest.mark.pricy From 9ef7267f0239576259dff38d4bf1ee95da17b475 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Tue, 15 Jul 2025 15:24:21 -0400 Subject: [PATCH 03/17] Refactor GitHub Actions workflow to replace 'day_of_week' with 'schedule' for unit test execution, ensuring correct scheduling format for each test category. --- .github/workflows/unit_tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 2ae5a7f..489183c 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -116,17 +116,17 @@ jobs: fail-fast: false matrix: include: - - day_of_week: '1' + - schedule: '59 23 * * 1' test_category: 'planning_and_problem_solving' - - day_of_week: '2' + - schedule: '59 23 * * 2' test_category: 'information_retrieval' - - day_of_week: '3' + - schedule: '59 23 * * 3' test_category: 'data_driven_decision_making_and_reasoning' - - day_of_week: '4' + - schedule: '59 23 * * 4' test_category: 'sophisticated_memory' - - day_of_week: '5' + - schedule: '59 23 * * 5' test_category: 'contextual_understanding_infeasible_tasks' - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * ${{ matrix.day_of_week }}' + if: github.event_name == 'schedule' && github.event.schedule == matrix.schedule defaults: run: shell: bash -l {0} From 6fe476babbb4ee596d95ebb5c34793d80d8d6fd9 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Tue, 15 Jul 2025 15:28:46 -0400 Subject: [PATCH 04/17] revert to explicit scheduling --- .github/workflows/unit_tests.yml | 137 +++++++++++++++++++++++++++---- 1 file changed, 119 insertions(+), 18 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 489183c..dea9c29 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -108,25 +108,126 @@ jobs: SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests + + end-to-end-tests-planning: + runs-on: ubuntu-22.04 + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1' + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + - name: Pip list + run: pip list + - name: Install Playwright + run: playwright install --with-deps + - name: Run E2E Tests + env: + SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} + SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} + SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_CATEGORY: planning_and_problem_solving + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests + + end-to-end-tests-information-retrieval: + runs-on: ubuntu-22.04 + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2' + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + - name: Pip list + run: pip list + - name: Install Playwright + run: playwright install --with-deps + - name: Run E2E Tests + env: + SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} + SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} + SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_CATEGORY: information_retrieval + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests + + end-to-end-tests-data-driven-decision-making: + runs-on: ubuntu-22.04 + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3' + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + - name: Pip list + run: pip list + - name: Install Playwright + run: playwright install --with-deps + - name: Run E2E Tests + env: + SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} + SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} + SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_CATEGORY: data_driven_decision_making_and_reasoning + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests + + end-to-end-tests-sophisticated-memory: + runs-on: ubuntu-22.04 + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4' + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + - name: Pip list + run: pip list + - name: Install Playwright + run: playwright install --with-deps + - name: Run E2E Tests + env: + SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} + SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} + SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_CATEGORY: sophisticated_memory + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests - end-to-end-tests: - name: "E2E - ${{ matrix.test_category }}" + end-to-end-tests-contextual-understanding: runs-on: ubuntu-22.04 - strategy: - fail-fast: false - matrix: - include: - - schedule: '59 23 * * 1' - test_category: 'planning_and_problem_solving' - - schedule: '59 23 * * 2' - test_category: 'information_retrieval' - - schedule: '59 23 * * 3' - test_category: 'data_driven_decision_making_and_reasoning' - - schedule: '59 23 * * 4' - test_category: 'sophisticated_memory' - - schedule: '59 23 * * 5' - test_category: 'contextual_understanding_infeasible_tasks' - if: github.event_name == 'schedule' && github.event.schedule == matrix.schedule + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5' defaults: run: shell: bash -l {0} @@ -150,5 +251,5 @@ jobs: SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} - TEST_CATEGORY: ${{ matrix.test_category }} + TEST_CATEGORY: contextual_understanding_infeasible_tasks run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests \ No newline at end of file From ba77b93c506ec1ca9566a15f6592aa2a50d0a37d Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 13:43:20 -0400 Subject: [PATCH 05/17] Add workflow_dispatch support to GitHub Actions for E2E test category selection --- .github/workflows/unit_tests.yml | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index dea9c29..07dc996 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -5,6 +5,20 @@ on: branches: - main pull_request: + workflow_dispatch: + inputs: + test_category: + description: 'E2E test category to run. Select "none" to run only unit tests.' + required: true + type: choice + default: 'none' + options: + - none + - planning_and_problem_solving + - information_retrieval + - data_driven_decision_making_and_reasoning + - sophisticated_memory + - contextual_understanding_infeasible_tasks schedule: - cron: '59 23 * * 1' - cron: '59 23 * * 2' @@ -16,6 +30,7 @@ jobs: code-format: runs-on: ubuntu-22.04 # Changed from ubuntu-latest + if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none') defaults: run: shell: bash -l {0} @@ -41,6 +56,7 @@ jobs: browsergym-workarena-fast: runs-on: ubuntu-22.04 # Changed from ubuntu-latest + if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none') defaults: run: @@ -76,6 +92,7 @@ jobs: browsergym-workarena-slow: runs-on: ubuntu-22.04 # Changed from ubuntu-latest + if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none') defaults: run: @@ -111,7 +128,7 @@ jobs: end-to-end-tests-planning: runs-on: ubuntu-22.04 - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1' + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') defaults: run: shell: bash -l {0} @@ -140,7 +157,7 @@ jobs: end-to-end-tests-information-retrieval: runs-on: ubuntu-22.04 - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2' + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval') defaults: run: shell: bash -l {0} @@ -169,7 +186,7 @@ jobs: end-to-end-tests-data-driven-decision-making: runs-on: ubuntu-22.04 - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3' + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning') defaults: run: shell: bash -l {0} @@ -198,7 +215,7 @@ jobs: end-to-end-tests-sophisticated-memory: runs-on: ubuntu-22.04 - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4' + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory') defaults: run: shell: bash -l {0} @@ -227,7 +244,7 @@ jobs: end-to-end-tests-contextual-understanding: runs-on: ubuntu-22.04 - if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5' + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks') defaults: run: shell: bash -l {0} From 6d016f77eb9701b5d414ab4f73520627d4c70f5c Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 14:21:05 -0400 Subject: [PATCH 06/17] for testing --- .github/workflows/unit_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 07dc996..1625417 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -20,7 +20,7 @@ on: - sophisticated_memory - contextual_understanding_infeasible_tasks schedule: - - cron: '59 23 * * 1' + - cron: '25 18 * * *' # Temporary schedule for testing - cron: '59 23 * * 2' - cron: '59 23 * * 3' - cron: '59 23 * * 4' @@ -128,7 +128,7 @@ jobs: end-to-end-tests-planning: runs-on: ubuntu-22.04 - if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') + if: (github.event_name == 'schedule' && github.event.schedule == '25 18 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') defaults: run: shell: bash -l {0} From 42b776ccc519f69618598f28059796280f87967f Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 14:23:06 -0400 Subject: [PATCH 07/17] Update scheduling for end-to-end tests in GitHub Actions to run at 23:59 on Mondays --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1625417..773ab62 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -128,7 +128,7 @@ jobs: end-to-end-tests-planning: runs-on: ubuntu-22.04 - if: (github.event_name == 'schedule' && github.event.schedule == '25 18 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') + if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') defaults: run: shell: bash -l {0} From e0c5ab5126a7f065b83e1d7aee55ddf798ac7a26 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 14:55:06 -0400 Subject: [PATCH 08/17] Add L1 atomic task tests with retry logic in test_task_general.py --- tests/test_task_general.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_task_general.py b/tests/test_task_general.py index b410003..799e479 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -17,6 +17,9 @@ from browsergym.workarena import ATOMIC_TASKS +L1_TASKS = [task for task in ATOMIC_TASKS if ".l1." in task.__name__] + + @retry( stop=stop_after_attempt(5), retry=retry_if_exception_type(TimeoutError), @@ -37,3 +40,26 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page): reward, done, message, info = task.validate(page, chat_messages) task.teardown() assert done is True and reward == 1.0 + + + +@retry( + stop=stop_after_attempt(5), + retry=retry_if_exception_type(TimeoutError), + reraise=True, + before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), +) +@pytest.mark.parametrize("task_entrypoint", L1_TASKS) +@pytest.mark.slow +def test_l1_atomic_cheat(task_entrypoint, page: Page): + """L1 atomic tasks have a fixed seed""" + task = task_entrypoint(seed=0) + goal, info = task.setup(page=page) + chat_messages = [] + reward, done, message, info = task.validate(page, chat_messages) + assert done is False and reward == 0.0 + assert type(message) == str and type(info) == dict + task.cheat(page=page, chat_messages=chat_messages) + reward, done, message, info = task.validate(page, chat_messages) + task.teardown() + assert done is True and reward == 1.0 From aa9bac6a38ec303aaf04ff6d469997f4f37a818e Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 15:19:03 -0400 Subject: [PATCH 09/17] Remove unnecessary blank line in test_task_general.py --- tests/test_task_general.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_task_general.py b/tests/test_task_general.py index 799e479..d7d37cf 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -42,7 +42,6 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page): assert done is True and reward == 1.0 - @retry( stop=stop_after_attempt(5), retry=retry_if_exception_type(TimeoutError), From 3ba5819134235e13635d1320482db2d936b5956f Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 15:28:23 -0400 Subject: [PATCH 10/17] Refactor L1 atomic task tests in test_task_general.py to use dynamic task and seed selection from get_all_tasks_agents --- tests/test_task_general.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_task_general.py b/tests/test_task_general.py index d7d37cf..edd784b 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -14,10 +14,11 @@ from playwright.sync_api import Page, TimeoutError from tenacity import retry, stop_after_attempt, retry_if_exception_type -from browsergym.workarena import ATOMIC_TASKS +from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents -L1_TASKS = [task for task in ATOMIC_TASKS if ".l1." in task.__name__] +L1_SET = get_all_tasks_agents(filter="l1", is_compositional=False) +L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET] @retry( @@ -48,11 +49,11 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page): reraise=True, before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), ) -@pytest.mark.parametrize("task_entrypoint", L1_TASKS) +@pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS)) @pytest.mark.slow -def test_l1_atomic_cheat(task_entrypoint, page: Page): +def test_l1_atomic_cheat(task_entrypoint, seed, page: Page): """L1 atomic tasks have a fixed seed""" - task = task_entrypoint(seed=0) + task = task_entrypoint(seed=seed) goal, info = task.setup(page=page) chat_messages = [] reward, done, message, info = task.validate(page, chat_messages) From 1f189d42929e65872afe4dba4997c279f25bdba5 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 15:40:13 -0400 Subject: [PATCH 11/17] fix test --- tests/test_task_general.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/test_task_general.py b/tests/test_task_general.py index edd784b..8ac15f6 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -17,7 +17,7 @@ from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents -L1_SET = get_all_tasks_agents(filter="l1", is_compositional=False) +L1_SET = get_all_tasks_agents(filter="l1") L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET] @@ -51,15 +51,18 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page): ) @pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS)) @pytest.mark.slow -def test_l1_atomic_cheat(task_entrypoint, seed, page: Page): - """L1 atomic tasks have a fixed seed""" +def test_l1_cheat(task_entrypoint, seed, page: Page): task = task_entrypoint(seed=seed) goal, info = task.setup(page=page) chat_messages = [] - reward, done, message, info = task.validate(page, chat_messages) - assert done is False and reward == 0.0 - assert type(message) == str and type(info) == dict - task.cheat(page=page, chat_messages=chat_messages) - reward, done, message, info = task.validate(page, chat_messages) + for i in range(len(task)): + page.wait_for_timeout(1000) + task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) + page.wait_for_timeout(1000) + reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) + if i < len(task) - 1: + assert done is False and reward == 0.0 + task.teardown() + assert done is True and reward == 1.0 From b54453ad4c646aa247ff0d7ed59928d2a49dceb4 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 15:53:22 -0400 Subject: [PATCH 12/17] add pipeline to run atomic tasks on a schedule --- .github/workflows/unit_tests.yml | 34 ++++++++++++++++++++++-- tests/test_task_general.py | 44 +++++++++----------------------- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 773ab62..95c43c3 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -20,11 +20,12 @@ on: - sophisticated_memory - contextual_understanding_infeasible_tasks schedule: - - cron: '25 18 * * *' # Temporary schedule for testing + - cron: '59 23 * * 1' - cron: '59 23 * * 2' - cron: '59 23 * * 3' - cron: '59 23 * * 4' - cron: '59 23 * * 5' + - cron: '59 23 * * 6' jobs: @@ -269,4 +270,33 @@ jobs: SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} TEST_CATEGORY: contextual_understanding_infeasible_tasks - run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests \ No newline at end of file + run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests + + l1-atomic-weekly-test: + runs-on: ubuntu-22.04 + if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6' + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + - name: Pip list + run: pip list + - name: Install Playwright + run: playwright install --with-deps + - name: Run L1 Atomic Cheat Test + env: + SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }} + SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }} + SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }} + TEST_L1_ATOMIC: "true" + run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py \ No newline at end of file diff --git a/tests/test_task_general.py b/tests/test_task_general.py index 8ac15f6..0886cbf 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -2,7 +2,7 @@ Tests that are not specific to any particular kind of task. """ - +import os import json import logging import pickle @@ -17,8 +17,14 @@ from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents -L1_SET = get_all_tasks_agents(filter="l1") -L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET] +# Prepare parameters for the test, based on the environment variable +if os.environ.get("TEST_L1_ATOMIC"): + # For the weekly scheduled job, run all L1 tasks with their specific seeds + L1_SET = get_all_tasks_agents(filter="l1") + PARAMS = [(item[0], item[1]) for item in L1_SET] +else: + # For PRs, run all atomic tasks with a single seed to be faster + PARAMS = [(task, 0) for task in ATOMIC_TASKS] @retry( @@ -27,11 +33,10 @@ reraise=True, before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), ) -@pytest.mark.parametrize("task_entrypoint", ATOMIC_TASKS) -@pytest.mark.parametrize("random_seed", range(1)) +@pytest.mark.parametrize("task_entrypoint, seed", PARAMS) @pytest.mark.slow -def test_cheat(task_entrypoint, random_seed: int, page: Page): - task = task_entrypoint(seed=random_seed) +def test_atomic_cheat(task_entrypoint, seed: int, page: Page): + task = task_entrypoint(seed=seed) goal, info = task.setup(page=page) chat_messages = [] reward, done, message, info = task.validate(page, chat_messages) @@ -41,28 +46,3 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page): reward, done, message, info = task.validate(page, chat_messages) task.teardown() assert done is True and reward == 1.0 - - -@retry( - stop=stop_after_attempt(5), - retry=retry_if_exception_type(TimeoutError), - reraise=True, - before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), -) -@pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS)) -@pytest.mark.slow -def test_l1_cheat(task_entrypoint, seed, page: Page): - task = task_entrypoint(seed=seed) - goal, info = task.setup(page=page) - chat_messages = [] - for i in range(len(task)): - page.wait_for_timeout(1000) - task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) - page.wait_for_timeout(1000) - reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) - if i < len(task) - 1: - assert done is False and reward == 0.0 - - task.teardown() - - assert done is True and reward == 1.0 From b55c8e3d3669de8c0d43c987360c994c81c79991 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 15:56:16 -0400 Subject: [PATCH 13/17] code format --- tests/test_task_general.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_task_general.py b/tests/test_task_general.py index 0886cbf..ed31ee7 100644 --- a/tests/test_task_general.py +++ b/tests/test_task_general.py @@ -2,6 +2,7 @@ Tests that are not specific to any particular kind of task. """ + import os import json import logging From cbc51d6836d5d79f07db1f7d5fa350cdfcda0b4e Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 16:30:59 -0400 Subject: [PATCH 14/17] test schedule --- .github/workflows/unit_tests.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 95c43c3..c2331ba 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -5,22 +5,8 @@ on: branches: - main pull_request: - workflow_dispatch: - inputs: - test_category: - description: 'E2E test category to run. Select "none" to run only unit tests.' - required: true - type: choice - default: 'none' - options: - - none - - planning_and_problem_solving - - information_retrieval - - data_driven_decision_making_and_reasoning - - sophisticated_memory - - contextual_understanding_infeasible_tasks schedule: - - cron: '59 23 * * 1' + - cron: '34 20 * * *' # Temporary schedule for testing - cron: '59 23 * * 2' - cron: '59 23 * * 3' - cron: '59 23 * * 4' @@ -129,7 +115,7 @@ jobs: end-to-end-tests-planning: runs-on: ubuntu-22.04 - if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') + if: (github.event_name == 'schedule' && github.event.schedule == '34 20 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving') defaults: run: shell: bash -l {0} From dca8e91246b6e2d782ed46933a856a7186fd7a55 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 17:37:57 -0400 Subject: [PATCH 15/17] revert test setting --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index c2331ba..fb86ca5 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -6,7 +6,7 @@ on: - main pull_request: schedule: - - cron: '34 20 * * *' # Temporary schedule for testing + - cron: '59 23 * * 1' # Temporary schedule for testing - cron: '59 23 * * 2' - cron: '59 23 * * 3' - cron: '59 23 * * 4' From fbd4e8e222b8f27bbfd3e744ad79f80dddbda7b9 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 18:23:21 -0400 Subject: [PATCH 16/17] refactor tests --- src/browsergym/workarena/__init__.py | 16 ++- tests/test_compositional.py | 159 ++++----------------------- tests/test_workarena_utils.py | 114 +++++++++++++++++++ 3 files changed, 145 insertions(+), 144 deletions(-) create mode 100644 tests/test_workarena_utils.py diff --git a/src/browsergym/workarena/__init__.py b/src/browsergym/workarena/__init__.py index b8767c6..3abd02e 100644 --- a/src/browsergym/workarena/__init__.py +++ b/src/browsergym/workarena/__init__.py @@ -111,18 +111,19 @@ def get_task_category(task_name): return benchmark, TASK_CATEGORY_MAP.get(task_name, None) -def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True): +def get_all_tasks_agents( + filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None +): OFFSET = 42 all_task_tuples = [] filter = filter.split(".") + rng = np.random.RandomState(meta_seed) if len(filter) > 2: raise Exception("Unsupported filter used.") if len(filter) == 1: level = filter[0] if level not in ["l1", "l2", "l3"]: raise Exception("Unsupported category of tasks.") - else: - rng = np.random.RandomState(meta_seed) if level == "l1": for task in ATOMIC_TASKS: for seed in rng.randint(0, 1000, n_seed_l1): @@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items(): if filter_category and category != filter_category: continue + # If a task_bucket is specified, check if it exists in the current category + if task_bucket and task_bucket not in items["buckets"]: + continue for curr_seed in rng.randint(0, 1000, items["num_seeds"]): random_gen = np.random.RandomState(curr_seed) - for task_set, count in zip(items["buckets"], items["weights"]): + for i, task_set in enumerate(items["buckets"]): + # if a task_bucket is specified, only select tasks from that bucket + if task_bucket and task_set != task_bucket: + continue + count = items["weights"][i] tasks = random_gen.choice(task_set, count, replace=False) for task in tasks: all_task_tuples.append((task, int(curr_seed))) diff --git a/tests/test_compositional.py b/tests/test_compositional.py index c8f3430..0f5f04b 100644 --- a/tests/test_compositional.py +++ b/tests/test_compositional.py @@ -2,7 +2,6 @@ Tests that are not specific to any particular kind of task. """ - import logging import os @@ -14,70 +13,25 @@ from playwright.sync_api import Page, TimeoutError from tenacity import retry, stop_after_attempt, retry_if_exception_type -from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents -from browsergym.workarena.tasks.compositional.utils.curriculum import AGENT_CURRICULUM - - -AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2") - -AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [ - sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET -] - -AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3") - -AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS = [sampled_set[0] for sampled_set in AGENT_L3_SAMPLED_SET], [ - sampled_set[1] for sampled_set in AGENT_L3_SAMPLED_SET -] +from browsergym.workarena import get_all_tasks_agents +from browsergym.workarena.tasks.compositional.base import CompositionalTask +# Combine all tasks into a single list for parameterization +AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=True) +AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=True) HUMAN_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=False) - -HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L2_SAMPLED_SET], [ - sampled_set[1] for sampled_set in HUMAN_L2_SAMPLED_SET -] - HUMAN_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=False) -HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L3_SAMPLED_SET], [ - sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET -] +all_tasks_to_test = ( + AGENT_L2_SAMPLED_SET + AGENT_L3_SAMPLED_SET + HUMAN_L2_SAMPLED_SET + HUMAN_L3_SAMPLED_SET +) test_category = os.environ.get("TEST_CATEGORY") - if test_category: - tasks_to_test = [] - items = AGENT_CURRICULUM.get(test_category) - if items: - for bucket in items["buckets"]: - tasks_to_test.extend(bucket) + # If a category is specified, filter the tasks to test + tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True) else: - tasks_to_test = ALL_COMPOSITIONAL_TASKS - - -@retry( - stop=stop_after_attempt(5), - reraise=True, - before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), -) -@pytest.mark.parametrize("task_entrypoint", tasks_to_test) -@pytest.mark.parametrize("random_seed", range(1)) -@pytest.mark.parametrize("level", range(2, 4)) -@pytest.mark.pricy -def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page): - task = task_entrypoint(seed=random_seed, level=level) - goal, info = task.setup(page=page) - chat_messages = [] - for i in range(len(task)): - page.wait_for_timeout(1000) - task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) - page.wait_for_timeout(1000) - reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) - if i < len(task) - 1: - assert done is False and reward == 0.0 - - task.teardown() - - assert done is True and reward == 1.0 + tasks_to_test = all_tasks_to_test @retry( @@ -86,89 +40,14 @@ def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page): reraise=True, before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), ) -@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS)) -@pytest.mark.slow -@pytest.mark.skip(reason="Tests are too slow") -def test_cheat_compositional_sampled_agent_set_l2(task_entrypoint, seed, page: Page): - task = task_entrypoint(seed=seed) - goal, info = task.setup(page=page) - chat_messages = [] - for i in range(len(task)): - page.wait_for_timeout(1000) - task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) - page.wait_for_timeout(1000) - reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) - if i < len(task) - 1: - assert done is False and reward == 0.0 - - task.teardown() - - assert done is True and reward == 1.0 - - -@retry( - stop=stop_after_attempt(5), - retry=retry_if_exception_type(TimeoutError), - reraise=True, - before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), -) -@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS)) -@pytest.mark.slow -@pytest.mark.skip(reason="Tests are too slow") -def test_cheat_compositional_sampled_agent_set_l3(task_entrypoint, seed, page: Page): - task = task_entrypoint(seed=seed) - goal, info = task.setup(page=page) - chat_messages = [] - for i in range(len(task)): - page.wait_for_timeout(1000) - task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) - page.wait_for_timeout(1000) - reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) - if i < len(task) - 1: - assert done is False and reward == 0.0 - - task.teardown() - - assert done is True and reward == 1.0 - - -@retry( - stop=stop_after_attempt(5), - retry=retry_if_exception_type(TimeoutError), - reraise=True, - before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), -) -@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS)) -@pytest.mark.slow -@pytest.mark.skip(reason="Tests are too slow") -def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: Page): - task = task_entrypoint(seed=seed) - goal, info = task.setup(page=page) - chat_messages = [] - for i in range(len(task)): - page.wait_for_timeout(1000) - task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i) - page.wait_for_timeout(1000) - reward, done, message, info = task.validate(page=page, chat_messages=chat_messages) - if i < len(task) - 1: - assert done is False and reward == 0.0 - - task.teardown() - - assert done is True and reward == 1.0 - - -@retry( - stop=stop_after_attempt(5), - retry=retry_if_exception_type(TimeoutError), - reraise=True, - before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."), -) -@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS)) -@pytest.mark.slow -@pytest.mark.skip(reason="Tests are too slow") -def test_cheat_compositional_sampled_human_set_l3(task_entrypoint, seed, page: Page): - task = task_entrypoint(seed=seed) +@pytest.mark.parametrize("task_class, seed", tasks_to_test) +@pytest.mark.pricy +def test_cheat_compositional(task_class, seed, page: Page): + """ + Test that the cheat method works for all compositional tasks. + This test is parameterized to run for all tasks in the agent and human curricula. + """ + task = task_class(seed=seed) goal, info = task.setup(page=page) chat_messages = [] for i in range(len(task)): diff --git a/tests/test_workarena_utils.py b/tests/test_workarena_utils.py new file mode 100644 index 0000000..3009a63 --- /dev/null +++ b/tests/test_workarena_utils.py @@ -0,0 +1,114 @@ +""" +Tests for workarena utility functions. +""" +import pytest +from browsergym.workarena import get_all_tasks_agents +from browsergym.workarena.tasks.compositional import ( + AGENT_CURRICULUM_L2, + AGENT_CURRICULUM_L3, + HUMAN_CURRICULUM_L2, + HUMAN_CURRICULUM_L3, + specialize_task_class_to_level, +) +from browsergym.workarena.tasks.compositional.base import CompositionalTask +from browsergym.workarena.tasks.compositional.mark_duplicate_problems import ( + BasicFilterProblemsAndMarkDuplicatesSmallTask, + PriorityFilterProblemsAndMarkDuplicatesSmallTask, +) +from browsergym.workarena.tasks.compositional.navigate_and_do_infeasible import ( + InfeasibleNavigateAndCreateUserWithReasonTask, +) + + +def get_tasks_from_curriculum(curriculum): + """Helper function to extract all unique tasks from a curriculum.""" + all_tasks = set() + for category, items in curriculum.items(): + for bucket in items["buckets"]: + for task in bucket: + all_tasks.add(task) + return all_tasks + + +def test_get_all_tasks_agents(): + """Test that get_all_tasks_agents returns the correct tasks from the curricula.""" + # Test L1 filter (atomic tasks) + tasks_with_seeds_l1 = get_all_tasks_agents(filter="l1") + assert len(tasks_with_seeds_l1) > 0 + for task, seed in tasks_with_seeds_l1: + assert not issubclass(task, CompositionalTask) + assert isinstance(seed, int) + + # Test L2 Human Curriculum + tasks_with_seeds_l2_human = get_all_tasks_agents(filter="l2", is_agent_curriculum=False) + expected_l2_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L2) + assert len(tasks_with_seeds_l2_human) > 0 + for task, seed in tasks_with_seeds_l2_human: + assert task in expected_l2_human_tasks + + # Test L3 Human Curriculum + tasks_with_seeds_l3_human = get_all_tasks_agents(filter="l3", is_agent_curriculum=False) + expected_l3_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L3) + assert len(tasks_with_seeds_l3_human) > 0 + for task, seed in tasks_with_seeds_l3_human: + assert task in expected_l3_human_tasks + + # Test category filtering + category = "planning_and_problem_solving" + tasks_with_seeds_cat = get_all_tasks_agents( + filter=f"l3.{category}", is_agent_curriculum=True + ) + assert len(tasks_with_seeds_cat) > 0 + # Expected tasks from the specified category's buckets + expected_cat_tasks = set() + for bucket in AGENT_CURRICULUM_L3[category]["buckets"]: + expected_cat_tasks.update(bucket) + + returned_tasks = {task for task, seed in tasks_with_seeds_cat} + assert returned_tasks.issubset(expected_cat_tasks) + + # Check that tasks from other categories are not present + for other_category, items in AGENT_CURRICULUM_L3.items(): + if other_category != category: + for bucket in items["buckets"]: + for task in bucket: + assert task not in returned_tasks + + # Test task_bucket filtering + category = "planning_and_problem_solving" + # This bucket contains BasicFilterProblemsAndMarkDuplicatesSmallTask + bucket_to_test = AGENT_CURRICULUM_L3[category]["buckets"][0] + + tasks_with_seeds_bucket = get_all_tasks_agents( + filter=f"l3.{category}", is_agent_curriculum=True, task_bucket=bucket_to_test + ) + assert len(tasks_with_seeds_bucket) > 0 + + returned_tasks_from_bucket = {task for task, seed in tasks_with_seeds_bucket} + + # 1. All returned tasks are from the specified bucket + assert returned_tasks_from_bucket.issubset(set(bucket_to_test)) + + # 2. A specific task from the bucket is present + expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask + # Find the specialized task in the bucket that corresponds to the base task + expected_task_specialized = next( + task + for task in bucket_to_test + if expected_task_base in task.__mro__ + ) + assert expected_task_specialized in returned_tasks_from_bucket + + # A task from a different category is not present + unexpected_task = specialize_task_class_to_level( + InfeasibleNavigateAndCreateUserWithReasonTask, level=3 + ) + assert unexpected_task not in returned_tasks_from_bucket + + # Test invalid filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="invalid") + + # Test invalid category filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="l3.invalid_category") \ No newline at end of file From ba6f5931a05d03d445c6d03e242ceef4d05a6ab0 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Wed, 16 Jul 2025 18:30:08 -0400 Subject: [PATCH 17/17] code format --- tests/test_compositional.py | 1 + tests/test_workarena_utils.py | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_compositional.py b/tests/test_compositional.py index 0f5f04b..425aa33 100644 --- a/tests/test_compositional.py +++ b/tests/test_compositional.py @@ -2,6 +2,7 @@ Tests that are not specific to any particular kind of task. """ + import logging import os diff --git a/tests/test_workarena_utils.py b/tests/test_workarena_utils.py index 3009a63..395fbd0 100644 --- a/tests/test_workarena_utils.py +++ b/tests/test_workarena_utils.py @@ -1,6 +1,7 @@ """ Tests for workarena utility functions. """ + import pytest from browsergym.workarena import get_all_tasks_agents from browsergym.workarena.tasks.compositional import ( @@ -55,9 +56,7 @@ def test_get_all_tasks_agents(): # Test category filtering category = "planning_and_problem_solving" - tasks_with_seeds_cat = get_all_tasks_agents( - filter=f"l3.{category}", is_agent_curriculum=True - ) + tasks_with_seeds_cat = get_all_tasks_agents(filter=f"l3.{category}", is_agent_curriculum=True) assert len(tasks_with_seeds_cat) > 0 # Expected tasks from the specified category's buckets expected_cat_tasks = set() @@ -93,9 +92,7 @@ def test_get_all_tasks_agents(): expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask # Find the specialized task in the bucket that corresponds to the base task expected_task_specialized = next( - task - for task in bucket_to_test - if expected_task_base in task.__mro__ + task for task in bucket_to_test if expected_task_base in task.__mro__ ) assert expected_task_specialized in returned_tasks_from_bucket @@ -111,4 +108,4 @@ def test_get_all_tasks_agents(): # Test invalid category filter with pytest.raises(Exception): - get_all_tasks_agents(filter="l3.invalid_category") \ No newline at end of file + get_all_tasks_agents(filter="l3.invalid_category")