Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 161 additions & 7 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@ on:
- main
pull_request:
schedule:
- cron: '59 23 * * SUN' # Runs at 23:59 UTC every Sunday (one minute before midnight)
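- cron: '59 23 * * 1' # Temporary schedule for testing (Mondays, 23:59 UTC). NOTE(review): no job's `if` matches this cron string; end-to-end-tests-planning checks '34 20 * * *', so it cannot be started by any schedule listed here - confirm which value is intended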
- cron: '59 23 * * 2'
- cron: '59 23 * * 3'
- cron: '59 23 * * 4'
- cron: '59 23 * * 5'
- cron: '59 23 * * 6'

jobs:

code-format:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04 # Changed from ubuntu-latest
if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')
defaults:
run:
shell: bash -l {0}
Expand All @@ -36,7 +42,8 @@ jobs:
run: black . --check

browsergym-workarena-fast:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04 # Changed from ubuntu-latest
if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')

defaults:
run:
Expand Down Expand Up @@ -71,7 +78,8 @@ jobs:
run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v tests

browsergym-workarena-slow:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04 # Changed from ubuntu-latest
if: github.event_name != 'schedule' && (github.event_name != 'workflow_dispatch' || github.event.inputs.test_category == 'none')

defaults:
run:
Expand Down Expand Up @@ -105,9 +113,9 @@ jobs:
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v tests

end-to-end-tests:
runs-on: ubuntu-latest
if: github.event_name == 'schedule'
end-to-end-tests-planning:
runs-on: ubuntu-22.04
if: (github.event_name == 'schedule' && github.event.schedule == '34 20 * * *') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'planning_and_problem_solving')
defaults:
run:
shell: bash -l {0}
Expand All @@ -131,4 +139,150 @@ jobs:
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
TEST_CATEGORY: planning_and_problem_solving
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

# Scheduled / on-demand end-to-end run for the information_retrieval test category.
end-to-end-tests-information-retrieval:
runs-on: ubuntu-22.04
# Runs only when triggered by the '59 23 * * 2' cron entry (Tuesday 23:59; GitHub
# evaluates cron schedules in UTC) or by a manual workflow_dispatch whose
# test_category input is 'information_retrieval'.
if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 2') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'information_retrieval')
defaults:
run:
# Every `run` step uses a bash login shell (-l); {0} is the placeholder GitHub
# substitutes with the path of the generated step script.
shell: bash -l {0}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
python-version: '3.10'
# Lets setup-python cache pip dependencies between runs.
cache: 'pip'
- name: Pip install
# requirements.txt is resolved relative to ./dev.
working-directory: ./dev
run: pip install -r requirements.txt
# Prints the installed packages to the job log.
- name: Pip list
run: pip list
- name: Install Playwright
# --with-deps also installs the operating-system dependencies the browsers need.
run: playwright install --with-deps
- name: Run E2E Tests
env:
# Instance URL and credentials are injected from GitHub Actions secrets.
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
# NOTE(review): presumably read by the test suite to restrict the run to this
# category - the consumer is not visible in this workflow; confirm in tests.
TEST_CATEGORY: information_retrieval
# Selects tests marked 'pricy' and reports the 10 slowest (--durations=10).
# -n 5 and --slowmo 1800 are plugin options (presumably pytest-xdist workers and
# Playwright slow-motion in ms - confirm against the installed plugins).
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

# Scheduled / on-demand end-to-end run for the
# data_driven_decision_making_and_reasoning test category.
end-to-end-tests-data-driven-decision-making:
runs-on: ubuntu-22.04
# Runs only when triggered by the '59 23 * * 3' cron entry (Wednesday 23:59; GitHub
# evaluates cron schedules in UTC) or by a manual workflow_dispatch whose
# test_category input is 'data_driven_decision_making_and_reasoning'.
if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 3') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'data_driven_decision_making_and_reasoning')
defaults:
run:
# Every `run` step uses a bash login shell (-l); {0} is the placeholder GitHub
# substitutes with the path of the generated step script.
shell: bash -l {0}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
python-version: '3.10'
# Lets setup-python cache pip dependencies between runs.
cache: 'pip'
- name: Pip install
# requirements.txt is resolved relative to ./dev.
working-directory: ./dev
run: pip install -r requirements.txt
# Prints the installed packages to the job log.
- name: Pip list
run: pip list
- name: Install Playwright
# --with-deps also installs the operating-system dependencies the browsers need.
run: playwright install --with-deps
- name: Run E2E Tests
env:
# Instance URL and credentials are injected from GitHub Actions secrets.
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
# NOTE(review): presumably read by the test suite to restrict the run to this
# category - the consumer is not visible in this workflow; confirm in tests.
TEST_CATEGORY: data_driven_decision_making_and_reasoning
# Selects tests marked 'pricy' and reports the 10 slowest (--durations=10).
# -n 5 and --slowmo 1800 are plugin options (presumably pytest-xdist workers and
# Playwright slow-motion in ms - confirm against the installed plugins).
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

# Scheduled / on-demand end-to-end run for the sophisticated_memory test category.
end-to-end-tests-sophisticated-memory:
runs-on: ubuntu-22.04
# Runs only when triggered by the '59 23 * * 4' cron entry (Thursday 23:59; GitHub
# evaluates cron schedules in UTC) or by a manual workflow_dispatch whose
# test_category input is 'sophisticated_memory'.
if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 4') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'sophisticated_memory')
defaults:
run:
# Every `run` step uses a bash login shell (-l); {0} is the placeholder GitHub
# substitutes with the path of the generated step script.
shell: bash -l {0}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
python-version: '3.10'
# Lets setup-python cache pip dependencies between runs.
cache: 'pip'
- name: Pip install
# requirements.txt is resolved relative to ./dev.
working-directory: ./dev
run: pip install -r requirements.txt
# Prints the installed packages to the job log.
- name: Pip list
run: pip list
- name: Install Playwright
# --with-deps also installs the operating-system dependencies the browsers need.
run: playwright install --with-deps
- name: Run E2E Tests
env:
# Instance URL and credentials are injected from GitHub Actions secrets.
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
# NOTE(review): presumably read by the test suite to restrict the run to this
# category - the consumer is not visible in this workflow; confirm in tests.
TEST_CATEGORY: sophisticated_memory
# Selects tests marked 'pricy' and reports the 10 slowest (--durations=10).
# -n 5 and --slowmo 1800 are plugin options (presumably pytest-xdist workers and
# Playwright slow-motion in ms - confirm against the installed plugins).
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

# Scheduled / on-demand end-to-end run for the
# contextual_understanding_infeasible_tasks test category.
end-to-end-tests-contextual-understanding:
runs-on: ubuntu-22.04
# Runs only when triggered by the '59 23 * * 5' cron entry (Friday 23:59; GitHub
# evaluates cron schedules in UTC) or by a manual workflow_dispatch whose
# test_category input is 'contextual_understanding_infeasible_tasks'.
if: (github.event_name == 'schedule' && github.event.schedule == '59 23 * * 5') || (github.event_name == 'workflow_dispatch' && github.event.inputs.test_category == 'contextual_understanding_infeasible_tasks')
defaults:
run:
# Every `run` step uses a bash login shell (-l); {0} is the placeholder GitHub
# substitutes with the path of the generated step script.
shell: bash -l {0}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
python-version: '3.10'
# Lets setup-python cache pip dependencies between runs.
cache: 'pip'
- name: Pip install
# requirements.txt is resolved relative to ./dev.
working-directory: ./dev
run: pip install -r requirements.txt
# Prints the installed packages to the job log.
- name: Pip list
run: pip list
- name: Install Playwright
# --with-deps also installs the operating-system dependencies the browsers need.
run: playwright install --with-deps
- name: Run E2E Tests
env:
# Instance URL and credentials are injected from GitHub Actions secrets.
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
# NOTE(review): presumably read by the test suite to restrict the run to this
# category - the consumer is not visible in this workflow; confirm in tests.
TEST_CATEGORY: contextual_understanding_infeasible_tasks
# Selects tests marked 'pricy' and reports the 10 slowest (--durations=10).
# -n 5 and --slowmo 1800 are plugin options (presumably pytest-xdist workers and
# Playwright slow-motion in ms - confirm against the installed plugins).
run: pytest -n 5 --durations=10 -m 'pricy' --slowmo 1800 -v tests

# Scheduled run of the 'slow' tests in tests/test_task_general.py with
# TEST_L1_ATOMIC enabled.
l1-atomic-weekly-test:
runs-on: ubuntu-22.04
# Runs only when triggered by the '59 23 * * 6' cron entry (Saturday 23:59; GitHub
# evaluates cron schedules in UTC). Unlike the end-to-end jobs, this condition has
# no workflow_dispatch branch, so the job cannot be started manually.
if: github.event_name == 'schedule' && github.event.schedule == '59 23 * * 6'
defaults:
run:
# Every `run` step uses a bash login shell (-l); {0} is the placeholder GitHub
# substitutes with the path of the generated step script.
shell: bash -l {0}
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
# Quoted so YAML keeps the string "3.10" instead of parsing the float 3.1.
python-version: '3.10'
# Lets setup-python cache pip dependencies between runs.
cache: 'pip'
- name: Pip install
# requirements.txt is resolved relative to ./dev.
working-directory: ./dev
run: pip install -r requirements.txt
# Prints the installed packages to the job log.
- name: Pip list
run: pip list
- name: Install Playwright
# --with-deps also installs the operating-system dependencies the browsers need.
run: playwright install --with-deps
- name: Run L1 Atomic Cheat Test
env:
# Instance URL and credentials are injected from GitHub Actions secrets.
SNOW_INSTANCE_URL: ${{ secrets.SNOW_INSTANCE_URL }}
SNOW_INSTANCE_UNAME: ${{ secrets.SNOW_INSTANCE_UNAME }}
SNOW_INSTANCE_PWD: ${{ secrets.SNOW_INSTANCE_PWD }}
# Quoted so the value stays the string "true" rather than a YAML boolean.
# NOTE(review): presumably a flag the tests read to enable the L1 atomic
# checks - the consumer is not visible in this workflow; confirm in tests.
TEST_L1_ATOMIC: "true"
# Selects tests marked "slow" in a single test file and reports the 10 slowest.
# -n 5 and --slowmo 1000 are plugin options (presumably pytest-xdist workers and
# Playwright slow-motion in ms - confirm against the installed plugins).
run: pytest -n 5 --durations=10 -m "slow" --slowmo 1000 -v tests/test_task_general.py
16 changes: 12 additions & 4 deletions src/browsergym/workarena/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,19 @@ def get_task_category(task_name):
return benchmark, TASK_CATEGORY_MAP.get(task_name, None)


def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
def get_all_tasks_agents(
filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
):
OFFSET = 42
all_task_tuples = []
filter = filter.split(".")
rng = np.random.RandomState(meta_seed)
if len(filter) > 2:
raise Exception("Unsupported filter used.")
if len(filter) == 1:
level = filter[0]
if level not in ["l1", "l2", "l3"]:
raise Exception("Unsupported category of tasks.")
else:
rng = np.random.RandomState(meta_seed)
if level == "l1":
for task in ATOMIC_TASKS:
for seed in rng.randint(0, 1000, n_seed_l1):
Expand Down Expand Up @@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
if filter_category and category != filter_category:
continue
# If a task_bucket is specified, check if it exists in the current category
if task_bucket and task_bucket not in items["buckets"]:
continue
for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
random_gen = np.random.RandomState(curr_seed)
for task_set, count in zip(items["buckets"], items["weights"]):
for i, task_set in enumerate(items["buckets"]):
# if a task_bucket is specified, only select tasks from that bucket
if task_bucket and task_set != task_bucket:
continue
count = items["weights"][i]
tasks = random_gen.choice(task_set, count, replace=False)
for task in tasks:
all_task_tuples.append((task, int(curr_seed)))
Expand Down
Loading