From f1cfe7d31799abf0b212e50be36c2925ad10ccbe Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Tue, 15 Jul 2025 15:08:28 -0400
Subject: [PATCH 01/17] Pin CI runners to ubuntu-22.04, a stable,
 Playwright-compatible version

---
 .github/workflows/unit_tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index f5f1902..6ad0567 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -11,7 +11,7 @@ on:
 jobs:
 
   code-format:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -36,7 +36,7 @@ jobs:
         run: black . --check
 
   browsergym-workarena-fast:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
   
     defaults:
       run:
@@ -71,7 +71,7 @@ jobs:
         run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
 
   browsergym-workarena-slow:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
 
     defaults:
       run:
@@ -106,7 +106,7 @@ jobs:
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
   
   end-to-end-tests:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
     if: github.event_name == 'schedule'
     defaults:
       run:
@@ -131,4 +131,4 @@ jobs:
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
-        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
\ No newline at end of file

From 69adad8906d13b4addfa67df2c27e0bf828857f9 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Tue, 15 Jul 2025 15:22:06 -0400
Subject: [PATCH 02/17] Update GitHub Actions workflow to run unit tests and
 E2E tests on weekdays; modify test parameterization to use dynamic task
 selection based on environment variable.

---
 .github/workflows/unit_tests.yml | 28 ++++++++++++++++++++++++----
 tests/test_compositional.py      | 17 ++++++++++++++++-
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 6ad0567..2ae5a7f 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -6,7 +6,11 @@ on:
       - main
   pull_request:
   schedule:
-    - cron: '59 23 * * SUN'  # Runs at midnight on Sunday
+    - cron: '59 23 * * 1'
+    - cron: '59 23 * * 2'
+    - cron: '59 23 * * 3'
+    - cron: '59 23 * * 4'
+    - cron: '59 23 * * 5'
 
 jobs:
 
@@ -104,10 +108,25 @@ jobs:
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
-  
+
   end-to-end-tests:
-    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
-    if: github.event_name == 'schedule'
+    name: "E2E - ${{ matrix.test_category }}"
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - day_of_week: '1'
+            test_category: 'planning_and_problem_solving'
+          - day_of_week: '2'
+            test_category: 'information_retrieval'
+          - day_of_week: '3'
+            test_category: 'data_driven_decision_making_and_reasoning'
+          - day_of_week: '4'
+            test_category: 'sophisticated_memory'
+          - day_of_week: '5'
+            test_category: 'contextual_understanding_infeasible_tasks'
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * ${{ matrix.day_of_week }}'
     defaults:
       run:
         shell: bash -l {0}
@@ -131,4 +150,5 @@ jobs:
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: ${{ matrix.test_category }}
         run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
\ No newline at end of file
diff --git a/tests/test_compositional.py b/tests/test_compositional.py
index 716a499..c8f3430 100644
--- a/tests/test_compositional.py
+++ b/tests/test_compositional.py
@@ -4,6 +4,7 @@
 """
 
 import logging
+import os
 
 import pytest
 
@@ -12,7 +13,10 @@
 
 from playwright.sync_api import Page, TimeoutError
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
+
 from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents
+from browsergym.workarena.tasks.compositional.utils.curriculum import AGENT_CURRICULUM
+
 
 AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
 
@@ -38,13 +42,24 @@
     sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET
 ]
 
+test_category = os.environ.get("TEST_CATEGORY")
+
+if test_category:
+    tasks_to_test = []
+    items = AGENT_CURRICULUM.get(test_category)
+    if items:
+        for bucket in items["buckets"]:
+            tasks_to_test.extend(bucket)
+else:
+    tasks_to_test = ALL_COMPOSITIONAL_TASKS
+
 
 @retry(
     stop=stop_after_attempt(5),
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint", ALL_COMPOSITIONAL_TASKS)
+@pytest.mark.parametrize("task_entrypoint", tasks_to_test)
 @pytest.mark.parametrize("random_seed", range(1))
 @pytest.mark.parametrize("level", range(2, 4))
 @pytest.mark.pricy

From 9ef7267f0239576259dff38d4bf1ee95da17b475 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Tue, 15 Jul 2025 15:24:21 -0400
Subject: [PATCH 03/17] Refactor GitHub Actions workflow to replace
 'day_of_week' with 'schedule' in the E2E test matrix, so each test category
 is compared against its full cron expression.

---
 .github/workflows/unit_tests.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 2ae5a7f..489183c 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -116,17 +116,17 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - day_of_week: '1'
+          - schedule: '59 23 * * 1'
             test_category: 'planning_and_problem_solving'
-          - day_of_week: '2'
+          - schedule: '59 23 * * 2'
             test_category: 'information_retrieval'
-          - day_of_week: '3'
+          - schedule: '59 23 * * 3'
             test_category: 'data_driven_decision_making_and_reasoning'
-          - day_of_week: '4'
+          - schedule: '59 23 * * 4'
             test_category: 'sophisticated_memory'
-          - day_of_week: '5'
+          - schedule: '59 23 * * 5'
             test_category: 'contextual_understanding_infeasible_tasks'
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * ${{ matrix.day_of_week }}'
+    if: github.event_name == 'schedule' && github.event.schedule == matrix.schedule
     defaults:
       run:
         shell: bash -l {0}

From 6fe476babbb4ee596d95ebb5c34793d80d8d6fd9 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Tue, 15 Jul 2025 15:28:46 -0400
Subject: [PATCH 04/17] revert to explicit scheduling

---
 .github/workflows/unit_tests.yml | 137 +++++++++++++++++++++++++++----
 1 file changed, 119 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 489183c..dea9c29 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -108,25 +108,126 @@ jobs:
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
+  
+  end-to-end-tests-planning:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: planning_and_problem_solving
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-information-retrieval:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: information_retrieval
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-data-driven-decision-making:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: data_driven_decision_making_and_reasoning
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-sophisticated-memory:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: sophisticated_memory
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
 
-  end-to-end-tests:
-    name: "E2E - ${{ matrix.test_category }}"
+  end-to-end-tests-contextual-understanding:
     runs-on: ubuntu-22.04
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - schedule: '59 23 * * 1'
-            test_category: 'planning_and_problem_solving'
-          - schedule: '59 23 * * 2'
-            test_category: 'information_retrieval'
-          - schedule: '59 23 * * 3'
-            test_category: 'data_driven_decision_making_and_reasoning'
-          - schedule: '59 23 * * 4'
-            test_category: 'sophisticated_memory'
-          - schedule: '59 23 * * 5'
-            test_category: 'contextual_understanding_infeasible_tasks'
-    if: github.event_name == 'schedule' && github.event.schedule == matrix.schedule
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5'
     defaults:
       run:
         shell: bash -l {0}
@@ -150,5 +251,5 @@ jobs:
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
-          TEST_CATEGORY: ${{ matrix.test_category }}
+          TEST_CATEGORY: contextual_understanding_infeasible_tasks
         run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
\ No newline at end of file

From ba77b93c506ec1ca9566a15f6592aa2a50d0a37d Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 13:43:20 -0400
Subject: [PATCH 05/17] Add workflow_dispatch support to GitHub Actions for E2E
 test category selection

---
 .github/workflows/unit_tests.yml | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index dea9c29..07dc996 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -5,6 +5,20 @@ on:
     branches:
       - main
   pull_request:
+  workflow_dispatch:
+    inputs:
+      test_category:
+        description: 'E2E test category to run. Select "none" to run only unit tests.'
+        required: true
+        type: choice
+        default: 'none'
+        options:
+          - none
+          - planning_and_problem_solving
+          - information_retrieval
+          - data_driven_decision_making_and_reasoning
+          - sophisticated_memory
+          - contextual_understanding_infeasible_tasks
   schedule:
     - cron: '59 23 * * 1'
     - cron: '59 23 * * 2'
@@ -16,6 +30,7 @@ jobs:
 
   code-format:
     runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
     defaults:
       run:
         shell: bash -l {0}
@@ -41,6 +56,7 @@ jobs:
 
   browsergym-workarena-fast:
     runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
   
     defaults:
       run:
@@ -76,6 +92,7 @@ jobs:
 
   browsergym-workarena-slow:
     runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -111,7 +128,7 @@ jobs:
   
   end-to-end-tests-planning:
     runs-on: ubuntu-22.04
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1'
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}
@@ -140,7 +157,7 @@ jobs:
 
   end-to-end-tests-information-retrieval:
     runs-on: ubuntu-22.04
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2'
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval')
     defaults:
       run:
         shell: bash -l {0}
@@ -169,7 +186,7 @@ jobs:
 
   end-to-end-tests-data-driven-decision-making:
     runs-on: ubuntu-22.04
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3'
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning')
     defaults:
       run:
         shell: bash -l {0}
@@ -198,7 +215,7 @@ jobs:
 
   end-to-end-tests-sophisticated-memory:
     runs-on: ubuntu-22.04
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4'
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory')
     defaults:
       run:
         shell: bash -l {0}
@@ -227,7 +244,7 @@ jobs:
 
   end-to-end-tests-contextual-understanding:
     runs-on: ubuntu-22.04
-    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5'
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks')
     defaults:
       run:
         shell: bash -l {0}

From 6d016f77eb9701b5d414ab4f73520627d4c70f5c Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 14:21:05 -0400
Subject: [PATCH 06/17] Temporarily switch the planning E2E job to a
 '25 18 * * *' cron for testing

---
 .github/workflows/unit_tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 07dc996..1625417 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -20,7 +20,7 @@ on:
           - sophisticated_memory
           - contextual_understanding_infeasible_tasks
   schedule:
-    - cron: '59 23 * * 1'
+    - cron: '25 18 * * *' # Temporary schedule for testing
     - cron: '59 23 * * 2'
     - cron: '59 23 * * 3'
     - cron: '59 23 * * 4'
@@ -128,7 +128,7 @@ jobs:
   
   end-to-end-tests-planning:
     runs-on: ubuntu-22.04
-    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
+    if: (github.event_name == 'schedule' && github.event.schedule == '25 18 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}

From 42b776ccc519f69618598f28059796280f87967f Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 14:23:06 -0400
Subject: [PATCH 07/17] Restore the planning E2E job condition in GitHub
 Actions to the 23:59 Monday schedule ('59 23 * * 1')

---
 .github/workflows/unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 1625417..773ab62 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -128,7 +128,7 @@ jobs:
   
   end-to-end-tests-planning:
     runs-on: ubuntu-22.04
-    if: (github.event_name == 'schedule' && github.event.schedule == '25 18 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}

From e0c5ab5126a7f065b83e1d7aee55ddf798ac7a26 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 14:55:06 -0400
Subject: [PATCH 08/17] Add L1 atomic task tests with retry logic in
 test_task_general.py

---
 tests/test_task_general.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index b410003..799e479 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -17,6 +17,9 @@
 from browsergym.workarena import ATOMIC_TASKS
 
 
+L1_TASKS = [task for task in ATOMIC_TASKS if ".l1." in task.__name__]
+
+
 @retry(
     stop=stop_after_attempt(5),
     retry=retry_if_exception_type(TimeoutError),
@@ -37,3 +40,26 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page):
     reward, done, message, info = task.validate(page, chat_messages)
     task.teardown()
     assert done is True and reward == 1.0
+
+
+
+@retry(
+    stop=stop_after_attempt(5),
+    retry=retry_if_exception_type(TimeoutError),
+    reraise=True,
+    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
+)
+@pytest.mark.parametrize("task_entrypoint", L1_TASKS)
+@pytest.mark.slow
+def test_l1_atomic_cheat(task_entrypoint, page: Page):
+    """L1 atomic tasks have a fixed seed"""
+    task = task_entrypoint(seed=0)
+    goal, info = task.setup(page=page)
+    chat_messages = []
+    reward, done, message, info = task.validate(page, chat_messages)
+    assert done is False and reward == 0.0
+    assert type(message) == str and type(info) == dict
+    task.cheat(page=page, chat_messages=chat_messages)
+    reward, done, message, info = task.validate(page, chat_messages)
+    task.teardown()
+    assert done is True and reward == 1.0

From aa9bac6a38ec303aaf04ff6d469997f4f37a818e Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 15:19:03 -0400
Subject: [PATCH 09/17] Remove unnecessary blank line in test_task_general.py

---
 tests/test_task_general.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index 799e479..d7d37cf 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -42,7 +42,6 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page):
     assert done is True and reward == 1.0
 
 
-
 @retry(
     stop=stop_after_attempt(5),
     retry=retry_if_exception_type(TimeoutError),

From 3ba5819134235e13635d1320482db2d936b5956f Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 15:28:23 -0400
Subject: [PATCH 10/17] Refactor L1 atomic task tests in test_task_general.py
 to use dynamic task and seed selection from get_all_tasks_agents

---
 tests/test_task_general.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index d7d37cf..edd784b 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -14,10 +14,11 @@
 from playwright.sync_api import Page, TimeoutError
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ATOMIC_TASKS
+from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents
 
 
-L1_TASKS = [task for task in ATOMIC_TASKS if ".l1." in task.__name__]
+L1_SET = get_all_tasks_agents(filter="l1", is_compositional=False)
+L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET]
 
 
 @retry(
@@ -48,11 +49,11 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page):
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint", L1_TASKS)
+@pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS))
 @pytest.mark.slow
-def test_l1_atomic_cheat(task_entrypoint, page: Page):
+def test_l1_atomic_cheat(task_entrypoint, seed, page: Page):
     """L1 atomic tasks have a fixed seed"""
-    task = task_entrypoint(seed=0)
+    task = task_entrypoint(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     reward, done, message, info = task.validate(page, chat_messages)

From 1f189d42929e65872afe4dba4997c279f25bdba5 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 15:40:13 -0400
Subject: [PATCH 11/17] Fix L1 cheat test: drop is_compositional argument and
 cheat/validate each subtask in turn

---
 tests/test_task_general.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index edd784b..8ac15f6 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -17,7 +17,7 @@
 from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents
 
 
-L1_SET = get_all_tasks_agents(filter="l1", is_compositional=False)
+L1_SET = get_all_tasks_agents(filter="l1")
 L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET]
 
 
@@ -51,15 +51,18 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page):
 )
 @pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS))
 @pytest.mark.slow
-def test_l1_atomic_cheat(task_entrypoint, seed, page: Page):
-    """L1 atomic tasks have a fixed seed"""
+def test_l1_cheat(task_entrypoint, seed, page: Page):
     task = task_entrypoint(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
-    reward, done, message, info = task.validate(page, chat_messages)
-    assert done is False and reward == 0.0
-    assert type(message) == str and type(info) == dict
-    task.cheat(page=page, chat_messages=chat_messages)
-    reward, done, message, info = task.validate(page, chat_messages)
+    for i in range(len(task)):
+        page.wait_for_timeout(1000)
+        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
+        page.wait_for_timeout(1000)
+        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
+        if i < len(task) - 1:
+            assert done is False and reward == 0.0
+
     task.teardown()
+
     assert done is True and reward == 1.0

From b54453ad4c646aa247ff0d7ed59928d2a49dceb4 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 15:53:22 -0400
Subject: [PATCH 12/17] add pipeline to run atomic tasks on a schedule

---
 .github/workflows/unit_tests.yml | 34 ++++++++++++++++++++++--
 tests/test_task_general.py       | 44 +++++++++-----------------------
 2 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 773ab62..95c43c3 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -20,11 +20,12 @@ on:
           - sophisticated_memory
           - contextual_understanding_infeasible_tasks
   schedule:
-    - cron: '25 18 * * *' # Temporary schedule for testing
+    - cron: '59 23 * * 1'
     - cron: '59 23 * * 2'
     - cron: '59 23 * * 3'
     - cron: '59 23 * * 4'
     - cron: '59 23 * * 5'
+    - cron: '59 23 * * 6'
 
 jobs:
 
@@ -269,4 +270,33 @@ jobs:
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
           TEST_CATEGORY: contextual_understanding_infeasible_tasks
-        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
\ No newline at end of file
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  l1-atomic-weekly-test:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run L1 Atomic Cheat Test
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_L1_ATOMIC: "true"
+        run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py
\ No newline at end of file
diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index 8ac15f6..0886cbf 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -2,7 +2,7 @@
 Tests that are not specific to any particular kind of task.
 
 """
-
+import os
 import json
 import logging
 import pickle
@@ -17,8 +17,14 @@
 from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents
 
 
-L1_SET = get_all_tasks_agents(filter="l1")
-L1_TASKS, L1_SEEDS = [item[0] for item in L1_SET], [item[1] for item in L1_SET]
+# Prepare parameters for the test, based on the environment variable
+if os.environ.get("TEST_L1_ATOMIC"):
+    # For the weekly scheduled job, run all L1 tasks with their specific seeds
+    L1_SET = get_all_tasks_agents(filter="l1")
+    PARAMS = [(item[0], item[1]) for item in L1_SET]
+else:
+    # For PRs, run all atomic tasks with a single seed to be faster
+    PARAMS = [(task, 0) for task in ATOMIC_TASKS]
 
 
 @retry(
@@ -27,11 +33,10 @@
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint", ATOMIC_TASKS)
-@pytest.mark.parametrize("random_seed", range(1))
+@pytest.mark.parametrize("task_entrypoint, seed", PARAMS)
 @pytest.mark.slow
-def test_cheat(task_entrypoint, random_seed: int, page: Page):
-    task = task_entrypoint(seed=random_seed)
+def test_atomic_cheat(task_entrypoint, seed: int, page: Page):
+    task = task_entrypoint(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     reward, done, message, info = task.validate(page, chat_messages)
@@ -41,28 +46,3 @@ def test_cheat(task_entrypoint, random_seed: int, page: Page):
     reward, done, message, info = task.validate(page, chat_messages)
     task.teardown()
     assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(L1_TASKS, L1_SEEDS))
-@pytest.mark.slow
-def test_l1_cheat(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0

From b55c8e3d3669de8c0d43c987360c994c81c79991 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 15:56:16 -0400
Subject: [PATCH 13/17] Code format: add blank line after module docstring in
 test_task_general.py

---
 tests/test_task_general.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index 0886cbf..ed31ee7 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -2,6 +2,7 @@
 Tests that are not specific to any particular kind of task.
 
 """
+
 import os
 import json
 import logging

From cbc51d6836d5d79f07db1f7d5fa350cdfcda0b4e Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 16:30:59 -0400
Subject: [PATCH 14/17] test schedule

---
 .github/workflows/unit_tests.yml | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 95c43c3..c2331ba 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -5,22 +5,8 @@ on:
     branches:
       - main
   pull_request:
-  workflow_dispatch:
-    inputs:
-      test_category:
-        description: 'E2E test category to run. Select "none" to run only unit tests.'
-        required: true
-        type: choice
-        default: 'none'
-        options:
-          - none
-          - planning_and_problem_solving
-          - information_retrieval
-          - data_driven_decision_making_and_reasoning
-          - sophisticated_memory
-          - contextual_understanding_infeasible_tasks
   schedule:
-    - cron: '59 23 * * 1'
+    - cron: '34 20 * * *' # Temporary schedule for testing
     - cron: '59 23 * * 2'
     - cron: '59 23 * * 3'
     - cron: '59 23 * * 4'
@@ -129,7 +115,7 @@ jobs:
   
   end-to-end-tests-planning:
     runs-on: ubuntu-22.04
-    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
+    if: (github.event_name == 'schedule' && github.event.schedule == '34 20 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}

From dca8e91246b6e2d782ed46933a856a7186fd7a55 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 17:37:57 -0400
Subject: [PATCH 15/17] revert test setting

---
 .github/workflows/unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index c2331ba..fb86ca5 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -6,7 +6,7 @@ on:
       - main
   pull_request:
   schedule:
-    - cron: '34 20 * * *' # Temporary schedule for testing
+    - cron: '59 23 * * 1'
     - cron: '59 23 * * 2'
     - cron: '59 23 * * 3'
     - cron: '59 23 * * 4'

From fbd4e8e222b8f27bbfd3e744ad79f80dddbda7b9 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 18:23:21 -0400
Subject: [PATCH 16/17] refactor tests

---
 src/browsergym/workarena/__init__.py |  16 ++-
 tests/test_compositional.py          | 159 ++++-----------------------
 tests/test_workarena_utils.py        | 114 +++++++++++++++++++
 3 files changed, 145 insertions(+), 144 deletions(-)
 create mode 100644 tests/test_workarena_utils.py

diff --git a/src/browsergym/workarena/__init__.py b/src/browsergym/workarena/__init__.py
index b8767c6..3abd02e 100644
--- a/src/browsergym/workarena/__init__.py
+++ b/src/browsergym/workarena/__init__.py
@@ -111,18 +111,19 @@ def get_task_category(task_name):
     return benchmark, TASK_CATEGORY_MAP.get(task_name, None)
 
 
-def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+def get_all_tasks_agents(
+    filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
+):
     OFFSET = 42
     all_task_tuples = []
     filter = filter.split(".")
+    rng = np.random.RandomState(meta_seed)
     if len(filter) > 2:
         raise Exception("Unsupported filter used.")
     if len(filter) == 1:
         level = filter[0]
         if level not in ["l1", "l2", "l3"]:
             raise Exception("Unsupported category of tasks.")
-        else:
-            rng = np.random.RandomState(meta_seed)
         if level == "l1":
             for task in ATOMIC_TASKS:
                 for seed in rng.randint(0, 1000, n_seed_l1):
@@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
     for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
         if filter_category and category != filter_category:
             continue
+        # If a task_bucket is specified, skip categories that do not contain that bucket
+        if task_bucket and task_bucket not in items["buckets"]:
+            continue
         for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
             random_gen = np.random.RandomState(curr_seed)
-            for task_set, count in zip(items["buckets"], items["weights"]):
+            for i, task_set in enumerate(items["buckets"]):
+                # if a task_bucket is specified, only select tasks from that bucket
+                if task_bucket and task_set != task_bucket:
+                    continue
+                count = items["weights"][i]
                 tasks = random_gen.choice(task_set, count, replace=False)
                 for task in tasks:
                     all_task_tuples.append((task, int(curr_seed)))
diff --git a/tests/test_compositional.py b/tests/test_compositional.py
index c8f3430..0f5f04b 100644
--- a/tests/test_compositional.py
+++ b/tests/test_compositional.py
@@ -2,7 +2,6 @@
 Tests that are not specific to any particular kind of task.
 
 """
-
 import logging
 import os
 
@@ -14,70 +13,25 @@
 from playwright.sync_api import Page, TimeoutError
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents
-from browsergym.workarena.tasks.compositional.utils.curriculum import AGENT_CURRICULUM
-
-
-AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
-
-AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
-]
-
-AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3")
-
-AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS = [sampled_set[0] for sampled_set in AGENT_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L3_SAMPLED_SET
-]
+from browsergym.workarena import get_all_tasks_agents
+from browsergym.workarena.tasks.compositional.base import CompositionalTask
 
+# Combine all tasks into a single list for parameterization
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=True)
+AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=True)
 HUMAN_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
-
-HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L2_SAMPLED_SET
-]
-
 HUMAN_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
 
-HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET
-]
+all_tasks_to_test = (
+    AGENT_L2_SAMPLED_SET + AGENT_L3_SAMPLED_SET + HUMAN_L2_SAMPLED_SET + HUMAN_L3_SAMPLED_SET
+)
 
 test_category = os.environ.get("TEST_CATEGORY")
-
 if test_category:
-    tasks_to_test = []
-    items = AGENT_CURRICULUM.get(test_category)
-    if items:
-        for bucket in items["buckets"]:
-            tasks_to_test.extend(bucket)
+    # If a category is specified, filter the tasks to test
+    tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True)
 else:
-    tasks_to_test = ALL_COMPOSITIONAL_TASKS
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint", tasks_to_test)
-@pytest.mark.parametrize("random_seed", range(1))
-@pytest.mark.parametrize("level", range(2, 4))
-@pytest.mark.pricy
-def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
-    task = task_entrypoint(seed=random_seed, level=level)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
+    tasks_to_test = all_tasks_to_test
 
 
 @retry(
@@ -86,89 +40,14 @@ def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
+@pytest.mark.parametrize("task_class, seed", tasks_to_test)
+@pytest.mark.pricy
+def test_cheat_compositional(task_class, seed, page: Page):
+    """
+    Test that the cheat method works for all compositional tasks.
+    This test is parameterized to run for all tasks in the agent and human curricula.
+    """
+    task = task_class(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     for i in range(len(task)):
diff --git a/tests/test_workarena_utils.py b/tests/test_workarena_utils.py
new file mode 100644
index 0000000..3009a63
--- /dev/null
+++ b/tests/test_workarena_utils.py
@@ -0,0 +1,114 @@
+"""
+Tests for workarena utility functions.
+"""
+import pytest
+from browsergym.workarena import get_all_tasks_agents
+from browsergym.workarena.tasks.compositional import (
+    AGENT_CURRICULUM_L2,
+    AGENT_CURRICULUM_L3,
+    HUMAN_CURRICULUM_L2,
+    HUMAN_CURRICULUM_L3,
+    specialize_task_class_to_level,
+)
+from browsergym.workarena.tasks.compositional.base import CompositionalTask
+from browsergym.workarena.tasks.compositional.mark_duplicate_problems import (
+    BasicFilterProblemsAndMarkDuplicatesSmallTask,
+    PriorityFilterProblemsAndMarkDuplicatesSmallTask,
+)
+from browsergym.workarena.tasks.compositional.navigate_and_do_infeasible import (
+    InfeasibleNavigateAndCreateUserWithReasonTask,
+)
+
+
+def get_tasks_from_curriculum(curriculum):
+    """Helper function to extract all unique tasks from a curriculum."""
+    all_tasks = set()
+    for category, items in curriculum.items():
+        for bucket in items["buckets"]:
+            for task in bucket:
+                all_tasks.add(task)
+    return all_tasks
+
+
+def test_get_all_tasks_agents():
+    """Test that get_all_tasks_agents returns the correct tasks from the curricula."""
+    # Test L1 filter (atomic tasks)
+    tasks_with_seeds_l1 = get_all_tasks_agents(filter="l1")
+    assert len(tasks_with_seeds_l1) > 0
+    for task, seed in tasks_with_seeds_l1:
+        assert not issubclass(task, CompositionalTask)
+        assert isinstance(seed, int)
+
+    # Test L2 Human Curriculum
+    tasks_with_seeds_l2_human = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
+    expected_l2_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L2)
+    assert len(tasks_with_seeds_l2_human) > 0
+    for task, seed in tasks_with_seeds_l2_human:
+        assert task in expected_l2_human_tasks
+
+    # Test L3 Human Curriculum
+    tasks_with_seeds_l3_human = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
+    expected_l3_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L3)
+    assert len(tasks_with_seeds_l3_human) > 0
+    for task, seed in tasks_with_seeds_l3_human:
+        assert task in expected_l3_human_tasks
+
+    # Test category filtering
+    category = "planning_and_problem_solving"
+    tasks_with_seeds_cat = get_all_tasks_agents(
+        filter=f"l3.{category}", is_agent_curriculum=True
+    )
+    assert len(tasks_with_seeds_cat) > 0
+    # Expected tasks from the specified category's buckets
+    expected_cat_tasks = set()
+    for bucket in AGENT_CURRICULUM_L3[category]["buckets"]:
+        expected_cat_tasks.update(bucket)
+
+    returned_tasks = {task for task, seed in tasks_with_seeds_cat}
+    assert returned_tasks.issubset(expected_cat_tasks)
+
+    # Check that tasks from other categories are not present
+    for other_category, items in AGENT_CURRICULUM_L3.items():
+        if other_category != category:
+            for bucket in items["buckets"]:
+                for task in bucket:
+                    assert task not in returned_tasks
+
+    # Test task_bucket filtering
+    category = "planning_and_problem_solving"
+    # This bucket contains BasicFilterProblemsAndMarkDuplicatesSmallTask
+    bucket_to_test = AGENT_CURRICULUM_L3[category]["buckets"][0]
+
+    tasks_with_seeds_bucket = get_all_tasks_agents(
+        filter=f"l3.{category}", is_agent_curriculum=True, task_bucket=bucket_to_test
+    )
+    assert len(tasks_with_seeds_bucket) > 0
+
+    returned_tasks_from_bucket = {task for task, seed in tasks_with_seeds_bucket}
+
+    # 1. All returned tasks are from the specified bucket
+    assert returned_tasks_from_bucket.issubset(set(bucket_to_test))
+
+    # 2. A specific task from the bucket is present
+    expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask
+    # Find the specialized task in the bucket that corresponds to the base task
+    expected_task_specialized = next(
+        task
+        for task in bucket_to_test
+        if expected_task_base in task.__mro__
+    )
+    assert expected_task_specialized in returned_tasks_from_bucket
+
+    # A task from a different category is not present
+    unexpected_task = specialize_task_class_to_level(
+        InfeasibleNavigateAndCreateUserWithReasonTask, level=3
+    )
+    assert unexpected_task not in returned_tasks_from_bucket
+
+    # Test invalid filter
+    with pytest.raises(Exception):
+        get_all_tasks_agents(filter="invalid")
+
+    # Test invalid category filter
+    with pytest.raises(Exception):
+        get_all_tasks_agents(filter="l3.invalid_category") 
\ No newline at end of file

From ba6f5931a05d03d445c6d03e242ceef4d05a6ab0 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Wed, 16 Jul 2025 18:30:08 -0400
Subject: [PATCH 17/17] code format

---
 tests/test_compositional.py   |  1 +
 tests/test_workarena_utils.py | 11 ++++-------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/test_compositional.py b/tests/test_compositional.py
index 0f5f04b..425aa33 100644
--- a/tests/test_compositional.py
+++ b/tests/test_compositional.py
@@ -2,6 +2,7 @@
 Tests that are not specific to any particular kind of task.
 
 """
+
 import logging
 import os
 
diff --git a/tests/test_workarena_utils.py b/tests/test_workarena_utils.py
index 3009a63..395fbd0 100644
--- a/tests/test_workarena_utils.py
+++ b/tests/test_workarena_utils.py
@@ -1,6 +1,7 @@
 """
 Tests for workarena utility functions.
 """
+
 import pytest
 from browsergym.workarena import get_all_tasks_agents
 from browsergym.workarena.tasks.compositional import (
@@ -55,9 +56,7 @@ def test_get_all_tasks_agents():
 
     # Test category filtering
     category = "planning_and_problem_solving"
-    tasks_with_seeds_cat = get_all_tasks_agents(
-        filter=f"l3.{category}", is_agent_curriculum=True
-    )
+    tasks_with_seeds_cat = get_all_tasks_agents(filter=f"l3.{category}", is_agent_curriculum=True)
     assert len(tasks_with_seeds_cat) > 0
     # Expected tasks from the specified category's buckets
     expected_cat_tasks = set()
@@ -93,9 +92,7 @@ def test_get_all_tasks_agents():
     expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask
     # Find the specialized task in the bucket that corresponds to the base task
     expected_task_specialized = next(
-        task
-        for task in bucket_to_test
-        if expected_task_base in task.__mro__
+        task for task in bucket_to_test if expected_task_base in task.__mro__
     )
     assert expected_task_specialized in returned_tasks_from_bucket
 
@@ -111,4 +108,4 @@ def test_get_all_tasks_agents():
 
     # Test invalid category filter
     with pytest.raises(Exception):
-        get_all_tasks_agents(filter="l3.invalid_category") 
\ No newline at end of file
+        get_all_tasks_agents(filter="l3.invalid_category")