ServiceNow · jardinetsouffleton · Jul 15, 2025 · Jul 15, 2025 · Jul 15, 2025 · Jul 15, 2025
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -6,12 +6,18 @@ on:
       - main
   pull_request:
   schedule:
-    - cron: '59 23 * * SUN'  # Runs at midnight on Sunday
+    - cron: '59 23 * * 1'  # Temporary schedule for testing; keep these strings in sync with the github.event.schedule checks in the jobs' if: conditions
+    - cron: '59 23 * * 2'
+    - cron: '59 23 * * 3'
+    - cron: '59 23 * * 4'
+    - cron: '59 23 * * 5'
+    - cron: '59 23 * * 6'
 
 jobs:
 
   code-format:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
     defaults:
       run:
         shell: bash -l {0}
@@ -36,7 +42,8 @@ jobs:
         run: black . --check
 
   browsergym-workarena-fast:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -71,7 +78,8 @@ jobs:
         run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
 
   browsergym-workarena-slow:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04  # Changed from ubuntu-latest
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -105,9 +113,9 @@ jobs:
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
 
-  end-to-end-tests:
-    runs-on: ubuntu-latest
-    if: github.event_name == 'schedule'
+  end-to-end-tests-planning:  # 'pricy' E2E run for the planning_and_problem_solving category
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')  # must equal an on.schedule cron string exactly (Monday entry), or manual dispatch selecting this category
     defaults:
       run:
         shell: bash -l {0}
@@ -131,4 +139,150 @@ jobs:
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: planning_and_problem_solving
         run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-information-retrieval:  # 'pricy' E2E run with TEST_CATEGORY=information_retrieval
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval')  # Tuesday 23:59 cron entry (day-of-week 2), or manual dispatch selecting this category
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt is installed from ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list  # logs the resolved package versions
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: information_retrieval  # presumably read by the tests to pick the category -- verify
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests  # only tests marked 'pricy'; -n 5 parallel workers
+
+  end-to-end-tests-data-driven-decision-making:  # 'pricy' E2E run with TEST_CATEGORY=data_driven_decision_making_and_reasoning
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning')  # Wednesday 23:59 cron entry (day-of-week 3), or manual dispatch selecting this category
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt is installed from ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list  # logs the resolved package versions
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: data_driven_decision_making_and_reasoning  # presumably read by the tests to pick the category -- verify
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests  # only tests marked 'pricy'; -n 5 parallel workers
+
+  end-to-end-tests-sophisticated-memory:  # 'pricy' E2E run with TEST_CATEGORY=sophisticated_memory
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory')  # Thursday 23:59 cron entry (day-of-week 4), or manual dispatch selecting this category
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt is installed from ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list  # logs the resolved package versions
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: sophisticated_memory  # presumably read by the tests to pick the category -- verify
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests  # only tests marked 'pricy'; -n 5 parallel workers
+
+  end-to-end-tests-contextual-understanding:  # 'pricy' E2E run with TEST_CATEGORY=contextual_understanding_infeasible_tasks
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks')  # Friday 23:59 cron entry (day-of-week 5), or manual dispatch selecting this category
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt is installed from ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list  # logs the resolved package versions
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: contextual_understanding_infeasible_tasks  # presumably read by the tests to pick the category -- verify
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests  # only tests marked 'pricy'; -n 5 parallel workers
+
+  l1-atomic-weekly-test:  # weekly run of the 'slow' tests in tests/test_task_general.py with TEST_L1_ATOMIC set
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6'  # Saturday 23:59 cron entry (day-of-week 6) only; no manual-dispatch path
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt is installed from ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list  # logs the resolved package versions
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run L1 Atomic Cheat Test
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_L1_ATOMIC: "true"  # passed as the string "true"; presumably enables the L1 atomic tests -- verify in tests
+        run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py  # only tests marked 'slow' in this one file; -n 5 parallel workers
diff --git a/src/browsergym/workarena/__init__.py b/src/browsergym/workarena/__init__.py
@@ -111,18 +111,19 @@ def get_task_category(task_name):
     return benchmark, TASK_CATEGORY_MAP.get(task_name, None)
 
 
-def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+def get_all_tasks_agents(
+    filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
+):
     OFFSET = 42
     all_task_tuples = []
     filter = filter.split(".")
+    rng = np.random.RandomState(meta_seed)
     if len(filter) > 2:
         raise Exception("Unsupported filter used.")
     if len(filter) == 1:
         level = filter[0]
         if level not in ["l1", "l2", "l3"]:
             raise Exception("Unsupported category of tasks.")
-        else:
-            rng = np.random.RandomState(meta_seed)
         if level == "l1":
             for task in ATOMIC_TASKS:
                 for seed in rng.randint(0, 1000, n_seed_l1):
@@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
     for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
         if filter_category and category != filter_category:
             continue
+        # If a task_bucket is specified, check if it exists in the current category
+        if task_bucket and task_bucket not in items["buckets"]:
+            continue
         for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
             random_gen = np.random.RandomState(curr_seed)
-            for task_set, count in zip(items["buckets"], items["weights"]):
+            for i, task_set in enumerate(items["buckets"]):
+                # if a task_bucket is specified, only select tasks from that bucket
+                if task_bucket and task_set != task_bucket:
+                    continue
+                count = items["weights"][i]
                 tasks = random_gen.choice(task_set, count, replace=False)
                 for task in tasks:
                     all_task_tuples.append((task, int(curr_seed)))