diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index f5f1902..fb86ca5 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -6,12 +6,18 @@ on:
       - main
   pull_request:
   schedule:
-    - cron: '59 23 * * SUN' # Runs at midnight on Sunday
+    - cron: '59 23 * * 1' # Mon-Sat: one scheduled E2E test category per day (see the per-job if: guards)
+    - cron: '59 23 * * 2'
+    - cron: '59 23 * * 3'
+    - cron: '59 23 * * 4'
+    - cron: '59 23 * * 5'
+    - cron: '59 23 * * 6'
 
 jobs:
   code-format:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
     defaults:
       run:
         shell: bash -l {0}
@@ -36,7 +42,8 @@ jobs:
       run: black . --check
 
   browsergym-workarena-fast:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -71,7 +78,8 @@
       run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
 
   browsergym-workarena-slow:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -105,9 +113,9 @@
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
 
-  end-to-end-tests:
-    runs-on: ubuntu-latest
-    if: github.event_name == 'schedule'
+  end-to-end-tests-planning:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}
@@ -131,4 +139,150 @@
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: planning_and_problem_solving
         run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-information-retrieval:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: information_retrieval
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-data-driven-decision-making:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: data_driven_decision_making_and_reasoning
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-sophisticated-memory:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: sophisticated_memory
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-contextual-understanding:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: contextual_understanding_infeasible_tasks
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  l1-atomic-weekly-test:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run L1 Atomic Cheat Test
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_L1_ATOMIC: "true"
+        run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py
\ No newline at end of file
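The schedule entries and per-job if: guards above form a day-of-week routing table: each Mon-Sat cron fires exactly one end-to-end job, while pushes and pull requests run only the formatting and non-pricy test jobs. The guards also reference a workflow_dispatch input named test_category for manual runs; that trigger is assumed to be declared elsewhere under the workflow's on: block, since it is not visible in this hunk. Below is a minimal Python sketch of the routing; the cron strings, category names, and job names are copied from the workflow, while the helper itself is hypothetical.

SCHEDULE_TO_JOB = {
    "59 23 * * 1": "end-to-end-tests-planning",
    "59 23 * * 2": "end-to-end-tests-information-retrieval",
    "59 23 * * 3": "end-to-end-tests-data-driven-decision-making",
    "59 23 * * 4": "end-to-end-tests-sophisticated-memory",
    "59 23 * * 5": "end-to-end-tests-contextual-understanding",
    "59 23 * * 6": "l1-atomic-weekly-test",
}


def job_for_event(event_name: str, schedule: str = "", test_category: str = "none") -> str | None:
    """Mirror the workflow's if: guards for the scheduled and manual E2E jobs."""
    if event_name == "schedule":
        return SCHEDULE_TO_JOB.get(schedule)
    if event_name == "workflow_dispatch" and test_category != "none":
        # Each E2E job matches exactly one category name; "none" runs the regular CI jobs instead.
        return "end-to-end job for category: " + test_category
    return None


assert job_for_event("schedule", "59 23 * * 2") == "end-to-end-tests-information-retrieval"
assert job_for_event("push") is None  # pushes and PRs run only the non-E2E jobs

Because GitHub exposes github.event.schedule as the raw cron string and the guards compare it with string equality, each job's guard must repeat the schedule entry's text exactly; this is why the planning job matches '59 23 * * 1', the Monday entry.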
diff --git a/src/browsergym/workarena/__init__.py b/src/browsergym/workarena/__init__.py
index b8767c6..3abd02e 100644
--- a/src/browsergym/workarena/__init__.py
+++ b/src/browsergym/workarena/__init__.py
@@ -111,18 +111,19 @@ def get_task_category(task_name):
     return benchmark, TASK_CATEGORY_MAP.get(task_name, None)
 
 
-def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+def get_all_tasks_agents(
+    filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
+):
     OFFSET = 42
     all_task_tuples = []
     filter = filter.split(".")
+    rng = np.random.RandomState(meta_seed)
     if len(filter) > 2:
         raise Exception("Unsupported filter used.")
     if len(filter) == 1:
         level = filter[0]
         if level not in ["l1", "l2", "l3"]:
             raise Exception("Unsupported category of tasks.")
-        else:
-            rng = np.random.RandomState(meta_seed)
         if level == "l1":
             for task in ATOMIC_TASKS:
                 for seed in rng.randint(0, 1000, n_seed_l1):
@@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
         for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
             if filter_category and category != filter_category:
                 continue
+            # If a task_bucket is specified, skip categories that don't contain it
+            if task_bucket and task_bucket not in items["buckets"]:
+                continue
             for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
                 random_gen = np.random.RandomState(curr_seed)
-                for task_set, count in zip(items["buckets"], items["weights"]):
+                for i, task_set in enumerate(items["buckets"]):
+                    # If a task_bucket is specified, only draw tasks from that bucket
+                    if task_bucket and task_set != task_bucket:
+                        continue
+                    count = items["weights"][i]
                     tasks = random_gen.choice(task_set, count, replace=False)
                     for task in tasks:
                         all_task_tuples.append((task, int(curr_seed)))
diff --git a/tests/test_compositional.py b/tests/test_compositional.py
index 716a499..425aa33 100644
--- a/tests/test_compositional.py
+++ b/tests/test_compositional.py
@@ -4,6 +4,7 @@
 """
 
 import logging
+import os
 
 import pytest
 
@@ -12,135 +13,26 @@
 from playwright.sync_api import Page, TimeoutError
 
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents
-
-AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
-
-AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
-]
-
-AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3")
-
-AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS = [sampled_set[0] for sampled_set in AGENT_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L3_SAMPLED_SET
-]
+from browsergym.workarena import get_all_tasks_agents
+from browsergym.workarena.tasks.compositional.base import CompositionalTask
 
+# Combine all sampled task sets into a single list for parameterization
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=True)
+AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=True)
 HUMAN_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
-
-HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L2_SAMPLED_SET
-]
-
 HUMAN_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
-HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET
-]
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint", ALL_COMPOSITIONAL_TASKS)
-@pytest.mark.parametrize("random_seed", range(1))
-@pytest.mark.parametrize("level", range(2, 4))
-@pytest.mark.pricy
-def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
-    task = task_entrypoint(seed=random_seed, level=level)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
+all_tasks_to_test = (
+    AGENT_L2_SAMPLED_SET + AGENT_L3_SAMPLED_SET + HUMAN_L2_SAMPLED_SET + HUMAN_L3_SAMPLED_SET
 )
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-    task.teardown()
-
-    assert done is True and reward == 1.0
+test_category = os.environ.get("TEST_CATEGORY")
+if test_category:
+    # If a category is specified, only run the L3 tasks from that category
+    tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True)
+else:
+    tasks_to_test = all_tasks_to_test
@@ -149,11 +41,14 @@ def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: P
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
+@pytest.mark.parametrize("task_class, seed", tasks_to_test)
+@pytest.mark.pricy
+def test_cheat_compositional(task_class, seed, page: Page):
+    """
+    Test that the cheat method solves every sampled compositional task. Runs over the
+    sampled agent and human curricula, or a single L3 category if TEST_CATEGORY is set.
+    """
+    task = task_class(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     for i in range(len(task)):
diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index b410003..ed31ee7 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -3,6 +3,7 @@
 """
 
+import os
 import json
 import logging
 import pickle
 
@@ -14,7 +15,17 @@
 from playwright.sync_api import Page, TimeoutError
 
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ATOMIC_TASKS
+from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents
+
+
+# Choose test parameters based on the TEST_L1_ATOMIC environment variable
+if os.environ.get("TEST_L1_ATOMIC"):
+    # For the weekly scheduled job, run all L1 tasks with their sampled seeds
+    L1_SET = get_all_tasks_agents(filter="l1")
+    PARAMS = [(item[0], item[1]) for item in L1_SET]
+else:
+    # For PRs, run each atomic task with a single seed to keep the job fast
+    PARAMS = [(task, 0) for task in ATOMIC_TASKS]
 
 
 @retry(
@@ -23,11 +34,10 @@
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint", ATOMIC_TASKS)
-@pytest.mark.parametrize("random_seed", range(1))
+@pytest.mark.parametrize("task_entrypoint, seed", PARAMS)
 @pytest.mark.slow
-def test_cheat(task_entrypoint, random_seed: int, page: Page):
-    task = task_entrypoint(seed=random_seed)
+def test_atomic_cheat(task_entrypoint, seed: int, page: Page):
+    task = task_entrypoint(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     reward, done, message, info = task.validate(page, chat_messages)
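To make the new task_bucket logic in get_all_tasks_agents easier to follow, here is a minimal runnable sketch of the same bucket/weight sampling over a toy curriculum. The dict shape ("buckets", "weights", "num_seeds") mirrors the real curricula in browsergym.workarena.tasks.compositional, but the task names and numbers here are invented.

import numpy as np

TOY_CURRICULUM = {
    "planning_and_problem_solving": {
        "buckets": [["TaskA", "TaskB", "TaskC"], ["TaskD", "TaskE"]],
        "weights": [2, 1],  # how many tasks to draw from each bucket per sampled seed
        "num_seeds": 3,
    },
}


def sample_tasks(curriculum, meta_seed=42, task_bucket=None):
    rng = np.random.RandomState(meta_seed)
    sampled = []
    for items in curriculum.values():
        # Skip categories that don't contain the requested bucket
        if task_bucket and task_bucket not in items["buckets"]:
            continue
        for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
            random_gen = np.random.RandomState(curr_seed)
            for i, task_set in enumerate(items["buckets"]):
                # With a task_bucket filter, only draw from the matching bucket
                if task_bucket and task_set != task_bucket:
                    continue
                count = items["weights"][i]
                for task in random_gen.choice(task_set, count, replace=False):
                    sampled.append((task, int(curr_seed)))
    return sampled


print(sample_tasks(TOY_CURRICULUM))
print(sample_tasks(TOY_CURRICULUM, task_bucket=["TaskD", "TaskE"]))

One subtlety this sketch shares with the patch: random_gen.choice consumes RNG state per bucket, so when earlier buckets are skipped by the task_bucket filter, a filtered run can draw a different subset from the target bucket than an unfiltered run with the same seeds. The new test below only asserts bucket membership, not draw-for-draw consistency, so it is unaffected.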
+""" + +import pytest +from browsergym.workarena import get_all_tasks_agents +from browsergym.workarena.tasks.compositional import ( + AGENT_CURRICULUM_L2, + AGENT_CURRICULUM_L3, + HUMAN_CURRICULUM_L2, + HUMAN_CURRICULUM_L3, + specialize_task_class_to_level, +) +from browsergym.workarena.tasks.compositional.base import CompositionalTask +from browsergym.workarena.tasks.compositional.mark_duplicate_problems import ( + BasicFilterProblemsAndMarkDuplicatesSmallTask, + PriorityFilterProblemsAndMarkDuplicatesSmallTask, +) +from browsergym.workarena.tasks.compositional.navigate_and_do_infeasible import ( + InfeasibleNavigateAndCreateUserWithReasonTask, +) + + +def get_tasks_from_curriculum(curriculum): + """Helper function to extract all unique tasks from a curriculum.""" + all_tasks = set() + for category, items in curriculum.items(): + for bucket in items["buckets"]: + for task in bucket: + all_tasks.add(task) + return all_tasks + + +def test_get_all_tasks_agents(): + """Test that get_all_tasks_agents returns the correct tasks from the curricula.""" + # Test L1 filter (atomic tasks) + tasks_with_seeds_l1 = get_all_tasks_agents(filter="l1") + assert len(tasks_with_seeds_l1) > 0 + for task, seed in tasks_with_seeds_l1: + assert not issubclass(task, CompositionalTask) + assert isinstance(seed, int) + + # Test L2 Human Curriculum + tasks_with_seeds_l2_human = get_all_tasks_agents(filter="l2", is_agent_curriculum=False) + expected_l2_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L2) + assert len(tasks_with_seeds_l2_human) > 0 + for task, seed in tasks_with_seeds_l2_human: + assert task in expected_l2_human_tasks + + # Test L3 Human Curriculum + tasks_with_seeds_l3_human = get_all_tasks_agents(filter="l3", is_agent_curriculum=False) + expected_l3_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L3) + assert len(tasks_with_seeds_l3_human) > 0 + for task, seed in tasks_with_seeds_l3_human: + assert task in expected_l3_human_tasks + + # Test category filtering + category = "planning_and_problem_solving" + tasks_with_seeds_cat = get_all_tasks_agents(filter=f"l3.{category}", is_agent_curriculum=True) + assert len(tasks_with_seeds_cat) > 0 + # Expected tasks from the specified category's buckets + expected_cat_tasks = set() + for bucket in AGENT_CURRICULUM_L3[category]["buckets"]: + expected_cat_tasks.update(bucket) + + returned_tasks = {task for task, seed in tasks_with_seeds_cat} + assert returned_tasks.issubset(expected_cat_tasks) + + # Check that tasks from other categories are not present + for other_category, items in AGENT_CURRICULUM_L3.items(): + if other_category != category: + for bucket in items["buckets"]: + for task in bucket: + assert task not in returned_tasks + + # Test task_bucket filtering + category = "planning_and_problem_solving" + # This bucket contains BasicFilterProblemsAndMarkDuplicatesSmallTask + bucket_to_test = AGENT_CURRICULUM_L3[category]["buckets"][0] + + tasks_with_seeds_bucket = get_all_tasks_agents( + filter=f"l3.{category}", is_agent_curriculum=True, task_bucket=bucket_to_test + ) + assert len(tasks_with_seeds_bucket) > 0 + + returned_tasks_from_bucket = {task for task, seed in tasks_with_seeds_bucket} + + # 1. All returned tasks are from the specified bucket + assert returned_tasks_from_bucket.issubset(set(bucket_to_test)) + + # 2. 
A specific task from the bucket is present + expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask + # Find the specialized task in the bucket that corresponds to the base task + expected_task_specialized = next( + task for task in bucket_to_test if expected_task_base in task.__mro__ + ) + assert expected_task_specialized in returned_tasks_from_bucket + + # A task from a different category is not present + unexpected_task = specialize_task_class_to_level( + InfeasibleNavigateAndCreateUserWithReasonTask, level=3 + ) + assert unexpected_task not in returned_tasks_from_bucket + + # Test invalid filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="invalid") + + # Test invalid category filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="l3.invalid_category")
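The MRO-based lookup in test_get_all_tasks_agents deserves a short note: the curriculum buckets store level-specialized task classes rather than the base classes, so the test locates the bucket entry whose __mro__ contains the base task. A sketch of that relationship, assuming (as the test itself does) that specialize_task_class_to_level returns a level-pinned subclass that keeps the base task class in its __mro__:

from browsergym.workarena.tasks.compositional import specialize_task_class_to_level
from browsergym.workarena.tasks.compositional.mark_duplicate_problems import (
    BasicFilterProblemsAndMarkDuplicatesSmallTask,
)

# Build the L3-specialized variant of the base task (same call the test uses)
specialized = specialize_task_class_to_level(
    BasicFilterProblemsAndMarkDuplicatesSmallTask, level=3
)

# The base class stays in the MRO of the specialized class, which is why the test
# matches bucket entries via `expected_task_base in task.__mro__` rather than by
# identity with the base class.
assert BasicFilterProblemsAndMarkDuplicatesSmallTask in specialized.__mro__
print(specialized)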