diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index f5f1902..fb86ca5 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -6,12 +6,18 @@ on:
       - main
   pull_request:
   schedule:
-    - cron: '59 23 * * SUN' # Runs at midnight on Sunday
+    - cron: '59 23 * * 1' # Mon-Sat: one scheduled E2E test category per day (see the per-job if: guards)
+    - cron: '59 23 * * 2'
+    - cron: '59 23 * * 3'
+    - cron: '59 23 * * 4'
+    - cron: '59 23 * * 5'
+    - cron: '59 23 * * 6'
 
 jobs:
   code-format:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
     defaults:
       run:
         shell: bash -l {0}
@@ -36,7 +42,8 @@ jobs:
       run: black . --check
 
   browsergym-workarena-fast:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -71,7 +78,8 @@
       run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests
 
   browsergym-workarena-slow:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Pinned runner image (was ubuntu-latest)
+    if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
 
     defaults:
       run:
@@ -105,9 +113,9 @@
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
         run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests
 
-  end-to-end-tests:
-    runs-on: ubuntu-latest
-    if: github.event_name == 'schedule'
+  end-to-end-tests-planning:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 1') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
     defaults:
       run:
         shell: bash -l {0}
@@ -131,4 +139,150 @@
           SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
           SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
           SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: planning_and_problem_solving
         run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-information-retrieval:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: information_retrieval
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-data-driven-decision-making:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: data_driven_decision_making_and_reasoning
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-sophisticated-memory:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: sophisticated_memory
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  end-to-end-tests-contextual-understanding:
+    runs-on: ubuntu-22.04
+    if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks')
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run E2E Tests
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_CATEGORY: contextual_understanding_infeasible_tasks
+        run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests
+
+  l1-atomic-weekly-test:
+    runs-on: ubuntu-22.04
+    if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6'
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+      - name: Pip install
+        working-directory: ./dev
+        run: pip install -r requirements.txt
+      - name: Pip list
+        run: pip list
+      - name: Install Playwright
+        run: playwright install --with-deps
+      - name: Run L1 Atomic Cheat Test
+        env:
+          SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
+          SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
+          SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
+          TEST_L1_ATOMIC: "true"
+        run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py
\ No newline at end of file
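The schedule entries and per-job if: guards above form a day-of-week routing table: each Mon-Sat cron fires exactly one end-to-end job, while pushes and pull requests run only the formatting and non-pricy test jobs. The guards also reference a workflow_dispatch input named test_category for manual runs; that trigger is assumed to be declared elsewhere under the workflow's on: block, since it is not visible in this hunk. Below is a minimal Python sketch of the routing; the cron strings, category names, and job names are copied from the workflow, while the helper itself is hypothetical.

SCHEDULE_TO_JOB = {
    "59 23 * * 1": "end-to-end-tests-planning",
    "59 23 * * 2": "end-to-end-tests-information-retrieval",
    "59 23 * * 3": "end-to-end-tests-data-driven-decision-making",
    "59 23 * * 4": "end-to-end-tests-sophisticated-memory",
    "59 23 * * 5": "end-to-end-tests-contextual-understanding",
    "59 23 * * 6": "l1-atomic-weekly-test",
}


def job_for_event(event_name: str, schedule: str = "", test_category: str = "none") -> str | None:
    """Mirror the workflow's if: guards for the scheduled and manual E2E jobs."""
    if event_name == "schedule":
        return SCHEDULE_TO_JOB.get(schedule)
    if event_name == "workflow_dispatch" and test_category != "none":
        # Each E2E job matches exactly one category name; "none" runs the regular CI jobs instead.
        return "end-to-end job for category: " + test_category
    return None


assert job_for_event("schedule", "59 23 * * 2") == "end-to-end-tests-information-retrieval"
assert job_for_event("push") is None  # pushes and PRs run only the non-E2E jobs

Because GitHub exposes github.event.schedule as the raw cron string and the guards compare it with string equality, each job's guard must repeat the schedule entry's text exactly; this is why the planning job matches '59 23 * * 1', the Monday entry.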
diff --git a/src/browsergym/workarena/__init__.py b/src/browsergym/workarena/__init__.py
index b8767c6..3abd02e 100644
--- a/src/browsergym/workarena/__init__.py
+++ b/src/browsergym/workarena/__init__.py
@@ -111,18 +111,19 @@ def get_task_category(task_name):
     return benchmark, TASK_CATEGORY_MAP.get(task_name, None)
 
 
-def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+def get_all_tasks_agents(
+    filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
+):
     OFFSET = 42
     all_task_tuples = []
     filter = filter.split(".")
+    rng = np.random.RandomState(meta_seed)
     if len(filter) > 2:
         raise Exception("Unsupported filter used.")
     if len(filter) == 1:
         level = filter[0]
         if level not in ["l1", "l2", "l3"]:
             raise Exception("Unsupported category of tasks.")
-        else:
-            rng = np.random.RandomState(meta_seed)
         if level == "l1":
             for task in ATOMIC_TASKS:
                 for seed in rng.randint(0, 1000, n_seed_l1):
@@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
         for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
             if filter_category and category != filter_category:
                 continue
+            # If a task_bucket is specified, skip categories that don't contain it
+            if task_bucket and task_bucket not in items["buckets"]:
+                continue
             for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
                 random_gen = np.random.RandomState(curr_seed)
-                for task_set, count in zip(items["buckets"], items["weights"]):
+                for i, task_set in enumerate(items["buckets"]):
+                    # If a task_bucket is specified, only draw tasks from that bucket
+                    if task_bucket and task_set != task_bucket:
+                        continue
+                    count = items["weights"][i]
                     tasks = random_gen.choice(task_set, count, replace=False)
                     for task in tasks:
                         all_task_tuples.append((task, int(curr_seed)))
diff --git a/tests/test_compositional.py b/tests/test_compositional.py
index 716a499..425aa33 100644
--- a/tests/test_compositional.py
+++ b/tests/test_compositional.py
@@ -4,6 +4,7 @@
 """
 
 import logging
+import os
 
 import pytest
 
@@ -12,135 +13,26 @@
 from playwright.sync_api import Page, TimeoutError
 
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents
-
-AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
-
-AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
-]
-
-AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3")
-
-AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS = [sampled_set[0] for sampled_set in AGENT_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in AGENT_L3_SAMPLED_SET
-]
+from browsergym.workarena import get_all_tasks_agents
+from browsergym.workarena.tasks.compositional.base import CompositionalTask
 
+# Combine all sampled task sets into a single list for parameterization
+AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=True)
+AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=True)
 HUMAN_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
-
-HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L2_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L2_SAMPLED_SET
-]
-
 HUMAN_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
-HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L3_SAMPLED_SET], [
-    sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET
-]
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint", ALL_COMPOSITIONAL_TASKS)
-@pytest.mark.parametrize("random_seed", range(1))
-@pytest.mark.parametrize("level", range(2, 4))
-@pytest.mark.pricy
-def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
-    task = task_entrypoint(seed=random_seed, level=level)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
-)
-@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_agent_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-
-    task.teardown()
-
-    assert done is True and reward == 1.0
-
-
-@retry(
-    stop=stop_after_attempt(5),
-    retry=retry_if_exception_type(TimeoutError),
-    reraise=True,
-    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
+all_tasks_to_test = (
+    AGENT_L2_SAMPLED_SET + AGENT_L3_SAMPLED_SET + HUMAN_L2_SAMPLED_SET + HUMAN_L3_SAMPLED_SET
 )
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
-    goal, info = task.setup(page=page)
-    chat_messages = []
-    for i in range(len(task)):
-        page.wait_for_timeout(1000)
-        task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
-        page.wait_for_timeout(1000)
-        reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
-        if i < len(task) - 1:
-            assert done is False and reward == 0.0
-    task.teardown()
-
-    assert done is True and reward == 1.0
+test_category = os.environ.get("TEST_CATEGORY")
+if test_category:
+    # If a category is specified, only run the L3 tasks from that category
+    tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True)
+else:
+    tasks_to_test = all_tasks_to_test
@@ -149,11 +41,14 @@ def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: P
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS))
-@pytest.mark.slow
-@pytest.mark.skip(reason="Tests are too slow")
-def test_cheat_compositional_sampled_human_set_l3(task_entrypoint, seed, page: Page):
-    task = task_entrypoint(seed=seed)
+@pytest.mark.parametrize("task_class, seed", tasks_to_test)
+@pytest.mark.pricy
+def test_cheat_compositional(task_class, seed, page: Page):
+    """
+    Test that the cheat method solves every sampled compositional task. Runs over the
+    sampled agent and human curricula, or a single L3 category if TEST_CATEGORY is set.
+    """
+    task = task_class(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     for i in range(len(task)):
diff --git a/tests/test_task_general.py b/tests/test_task_general.py
index b410003..ed31ee7 100644
--- a/tests/test_task_general.py
+++ b/tests/test_task_general.py
@@ -3,6 +3,7 @@
 """
 
+import os
 import json
 import logging
 import pickle
 
@@ -14,7 +15,17 @@
 from playwright.sync_api import Page, TimeoutError
 
 from tenacity import retry, stop_after_attempt, retry_if_exception_type
 
-from browsergym.workarena import ATOMIC_TASKS
+from browsergym.workarena import ATOMIC_TASKS, get_all_tasks_agents
+
+
+# Choose test parameters based on the TEST_L1_ATOMIC environment variable
+if os.environ.get("TEST_L1_ATOMIC"):
+    # For the weekly scheduled job, run all L1 tasks with their sampled seeds
+    L1_SET = get_all_tasks_agents(filter="l1")
+    PARAMS = [(item[0], item[1]) for item in L1_SET]
+else:
+    # For PRs, run each atomic task with a single seed to keep the job fast
+    PARAMS = [(task, 0) for task in ATOMIC_TASKS]
 
 
 @retry(
@@ -23,11 +34,10 @@
     reraise=True,
     before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
 )
-@pytest.mark.parametrize("task_entrypoint", ATOMIC_TASKS)
-@pytest.mark.parametrize("random_seed", range(1))
+@pytest.mark.parametrize("task_entrypoint, seed", PARAMS)
 @pytest.mark.slow
-def test_cheat(task_entrypoint, random_seed: int, page: Page):
-    task = task_entrypoint(seed=random_seed)
+def test_atomic_cheat(task_entrypoint, seed: int, page: Page):
+    task = task_entrypoint(seed=seed)
     goal, info = task.setup(page=page)
     chat_messages = []
     reward, done, message, info = task.validate(page, chat_messages)
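To make the new task_bucket logic in get_all_tasks_agents easier to follow, here is a minimal runnable sketch of the same bucket/weight sampling over a toy curriculum. The dict shape ("buckets", "weights", "num_seeds") mirrors the real curricula in browsergym.workarena.tasks.compositional, but the task names and numbers here are invented.

import numpy as np

TOY_CURRICULUM = {
    "planning_and_problem_solving": {
        "buckets": [["TaskA", "TaskB", "TaskC"], ["TaskD", "TaskE"]],
        "weights": [2, 1],  # how many tasks to draw from each bucket per sampled seed
        "num_seeds": 3,
    },
}


def sample_tasks(curriculum, meta_seed=42, task_bucket=None):
    rng = np.random.RandomState(meta_seed)
    sampled = []
    for items in curriculum.values():
        # Skip categories that don't contain the requested bucket
        if task_bucket and task_bucket not in items["buckets"]:
            continue
        for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
            random_gen = np.random.RandomState(curr_seed)
            for i, task_set in enumerate(items["buckets"]):
                # With a task_bucket filter, only draw from the matching bucket
                if task_bucket and task_set != task_bucket:
                    continue
                count = items["weights"][i]
                for task in random_gen.choice(task_set, count, replace=False):
                    sampled.append((task, int(curr_seed)))
    return sampled


print(sample_tasks(TOY_CURRICULUM))
print(sample_tasks(TOY_CURRICULUM, task_bucket=["TaskD", "TaskE"]))

One subtlety this sketch shares with the patch: random_gen.choice consumes RNG state per bucket, so when earlier buckets are skipped by the task_bucket filter, a filtered run can draw a different subset from the target bucket than an unfiltered run with the same seeds. The new test below only asserts bucket membership, not draw-for-draw consistency, so it is unaffected.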
+""" + +import pytest +from browsergym.workarena import get_all_tasks_agents +from browsergym.workarena.tasks.compositional import ( + AGENT_CURRICULUM_L2, + AGENT_CURRICULUM_L3, + HUMAN_CURRICULUM_L2, + HUMAN_CURRICULUM_L3, + specialize_task_class_to_level, +) +from browsergym.workarena.tasks.compositional.base import CompositionalTask +from browsergym.workarena.tasks.compositional.mark_duplicate_problems import ( + BasicFilterProblemsAndMarkDuplicatesSmallTask, + PriorityFilterProblemsAndMarkDuplicatesSmallTask, +) +from browsergym.workarena.tasks.compositional.navigate_and_do_infeasible import ( + InfeasibleNavigateAndCreateUserWithReasonTask, +) + + +def get_tasks_from_curriculum(curriculum): + """Helper function to extract all unique tasks from a curriculum.""" + all_tasks = set() + for category, items in curriculum.items(): + for bucket in items["buckets"]: + for task in bucket: + all_tasks.add(task) + return all_tasks + + +def test_get_all_tasks_agents(): + """Test that get_all_tasks_agents returns the correct tasks from the curricula.""" + # Test L1 filter (atomic tasks) + tasks_with_seeds_l1 = get_all_tasks_agents(filter="l1") + assert len(tasks_with_seeds_l1) > 0 + for task, seed in tasks_with_seeds_l1: + assert not issubclass(task, CompositionalTask) + assert isinstance(seed, int) + + # Test L2 Human Curriculum + tasks_with_seeds_l2_human = get_all_tasks_agents(filter="l2", is_agent_curriculum=False) + expected_l2_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L2) + assert len(tasks_with_seeds_l2_human) > 0 + for task, seed in tasks_with_seeds_l2_human: + assert task in expected_l2_human_tasks + + # Test L3 Human Curriculum + tasks_with_seeds_l3_human = get_all_tasks_agents(filter="l3", is_agent_curriculum=False) + expected_l3_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L3) + assert len(tasks_with_seeds_l3_human) > 0 + for task, seed in tasks_with_seeds_l3_human: + assert task in expected_l3_human_tasks + + # Test category filtering + category = "planning_and_problem_solving" + tasks_with_seeds_cat = get_all_tasks_agents(filter=f"l3.{category}", is_agent_curriculum=True) + assert len(tasks_with_seeds_cat) > 0 + # Expected tasks from the specified category's buckets + expected_cat_tasks = set() + for bucket in AGENT_CURRICULUM_L3[category]["buckets"]: + expected_cat_tasks.update(bucket) + + returned_tasks = {task for task, seed in tasks_with_seeds_cat} + assert returned_tasks.issubset(expected_cat_tasks) + + # Check that tasks from other categories are not present + for other_category, items in AGENT_CURRICULUM_L3.items(): + if other_category != category: + for bucket in items["buckets"]: + for task in bucket: + assert task not in returned_tasks + + # Test task_bucket filtering + category = "planning_and_problem_solving" + # This bucket contains BasicFilterProblemsAndMarkDuplicatesSmallTask + bucket_to_test = AGENT_CURRICULUM_L3[category]["buckets"][0] + + tasks_with_seeds_bucket = get_all_tasks_agents( + filter=f"l3.{category}", is_agent_curriculum=True, task_bucket=bucket_to_test + ) + assert len(tasks_with_seeds_bucket) > 0 + + returned_tasks_from_bucket = {task for task, seed in tasks_with_seeds_bucket} + + # 1. All returned tasks are from the specified bucket + assert returned_tasks_from_bucket.issubset(set(bucket_to_test)) + + # 2. 
A specific task from the bucket is present + expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask + # Find the specialized task in the bucket that corresponds to the base task + expected_task_specialized = next( + task for task in bucket_to_test if expected_task_base in task.__mro__ + ) + assert expected_task_specialized in returned_tasks_from_bucket + + # A task from a different category is not present + unexpected_task = specialize_task_class_to_level( + InfeasibleNavigateAndCreateUserWithReasonTask, level=3 + ) + assert unexpected_task not in returned_tasks_from_bucket + + # Test invalid filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="invalid") + + # Test invalid category filter + with pytest.raises(Exception): + get_all_tasks_agents(filter="l3.invalid_category")
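The MRO-based lookup in test_get_all_tasks_agents deserves a short note: the curriculum buckets store level-specialized task classes rather than the base classes, so the test locates the bucket entry whose __mro__ contains the base task. A sketch of that relationship, assuming (as the test itself does) that specialize_task_class_to_level returns a level-pinned subclass that keeps the base task class in its __mro__:

from browsergym.workarena.tasks.compositional import specialize_task_class_to_level
from browsergym.workarena.tasks.compositional.mark_duplicate_problems import (
    BasicFilterProblemsAndMarkDuplicatesSmallTask,
)

# Build the L3-specialized variant of the base task (same call the test uses)
specialized = specialize_task_class_to_level(
    BasicFilterProblemsAndMarkDuplicatesSmallTask, level=3
)

# The base class stays in the MRO of the specialized class, which is why the test
# matches bucket entries via `expected_task_base in task.__mro__` rather than by
# identity with the base class.
assert BasicFilterProblemsAndMarkDuplicatesSmallTask in specialized.__mro__
print(specialized)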