ServiceNow · oriyor · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -35,6 +35,9 @@ jobs:
       - name: Build a binary wheel and a source tarball (browsergym-assistantbench)
         run: python3 -m build browsergym/assistantbench/ --outdir dist/
 
+      - name: Build a binary wheel and a source tarball (browsergym-gaia)
+        run: python3 -m build browsergym/gaia/ --outdir dist/
+
       - name: Build a binary wheel and a source tarball (browsergym-experiments)
         run: python3 -m build browsergym/experiments/ --outdir dist/
 

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -409,3 +409,43 @@ jobs:
           OPENAI_API_KEY: ""
         run: |
           pytest -n 5 --durations=10 -m 'not pricy' --slowmo 1000 -v tests/assistantbench
+
+
+  browsergym-gaia:  # CI job that runs the GAIA unit tests (tests/gaia)
+    runs-on: ubuntu-22.04
+
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step in this job uses a bash login shell
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '>=3.10'  # a version range, not a single pinned release
+          cache: 'pip' # caching pip dependencies
+
+      - name: Pip install
+        working-directory: ./dev  # requirements.txt below is resolved relative to ./dev
+        run: pip install -r requirements.txt
+
+      - name: Pip list
+        run: pip list  # prints the installed packages into the job log
+
+      - name: Install Playwright
+        run: playwright install chromium --with-deps  # Chromium only, plus its system dependencies
+
+      - name: Run browsergym-gaia Unit Tests
+        env:  # NOTE(review): the VWA_* entries look carried over from the VisualWebArena job; confirm tests/gaia reads them
+          VWA_CLASSIFIEDS: "${{ vars.VWA_CLASSIFIEDS }}"
+          VWA_CLASSIFIEDS_RESET_TOKEN: "${{ vars.VWA_CLASSIFIEDS_RESET_TOKEN }}"
+          VWA_SHOPPING: "${{ vars.VWA_SHOPPING }}"
+          VWA_REDDIT: "${{ vars.VWA_REDDIT }}"
+          VWA_WIKIPEDIA: "${{ vars.VWA_WIKIPEDIA }}"
+          VWA_HOMEPAGE: "${{ vars.VWA_HOMEPAGE }}"
+          OPENAI_API_KEY: ""  # empty string; presumably so tests cannot reach the OpenAI API - confirm
+        run: |
+          pytest -n 5 --durations=10 -m 'not pricy' --slowmo 1000 -v tests/gaia
diff --git a/.gitignore b/.gitignore
@@ -145,5 +145,6 @@ results/
 
 .vscode/launch.json
 
-# assistantbench
-tests/assistantbench/assistantbench-predictions-test.jsonl
+# hidden-test set predictions
+tests/assistantbench/assistantbench-predictions-test.jsonl
+tests/gaia/gaia-predictions-test.jsonl
diff --git a/Makefile b/Makefile
@@ -1,6 +1,6 @@
 install:
 	@echo "--- 🚀 Installing project dependencies ---"
-	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
+	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/gaia -e ./browsergym/
 	playwright install chromium --with-deps
 
 install-demo:

diff --git a/README.md b/README.md
@@ -17,6 +17,7 @@ BrowserGym includes the following benchmarks by default:
  - [VisualWebArena](https://jykoh.com/vwa)
  - [WorkArena++](https://github.com/ServiceNow/WorkArena)
  - [AssistantBench](https://github.com/oriyor/assistantbench)
+ - [GAIA](https://huggingface.co/gaia-benchmark)
 
 Designing new web benchmarks with BrowserGym is easy, and simply requires to inherit the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
 
@@ -32,6 +33,7 @@ pip install browsergym-webarena  # core + webarena
 pip install browsergym-visualwebarena  # core + visualwebarena
 pip install browsergym-workarena  # core + workarena
 pip install browsergym-assistantbench  # core + assistantbench
+pip install browsergym-gaia  # core + gaia
 ```
 
 Then setup playwright by running

diff --git a/browsergym/assistantbench/README.md b/browsergym/assistantbench/README.md
@@ -8,14 +8,14 @@ Please note that AssistantBench has a hidden test set, so test set predictions w
 
 ## Setting up
 
-- Install the package (this is still a wip)
+- Install the package
 ```
 pip install browsergym-assistantbench
 ```
 
 - Run inference, e.g., run the following commands for demo on a simple toy task
 ```
-python demo_agent/run_demo.py --task_name ab.imp.0
+python demo_agent/run_demo.py --task_name assistantbench.imp.0
 ```
 
 - Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
diff --git a/browsergym/assistantbench/src/browsergym/assistantbench/task.py b/browsergym/assistantbench/src/browsergym/assistantbench/task.py
@@ -6,7 +6,7 @@
 from browsergym.core.task import AbstractBrowserTask
 
 from .evaluation.evaluator import question_scorer
-from .utils import add_prediction_to_jsonl
+from browsergym.utils.utils import add_prediction_to_jsonl
 
 # Load dataset
 

diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py
@@ -50,7 +50,9 @@ def make_action_set(self):
         )
 
 
-BenchmarkBackend = Literal["miniwob", "webarena", "visualwebarena", "workarena", "assistantbench"]
+BenchmarkBackend = Literal[
+    "miniwob", "webarena", "visualwebarena", "workarena", "assistantbench", "gaia"
+]
 
 
 @dataclass
@@ -240,6 +242,13 @@ def subset_from_regexp(self, column, regexp):
         retry_with_force=True,
         demo_mode="off",
     ),
+    "gaia": HighLevelActionSetArgs(  # presumably the entry the "gaia" benchmark looks up by key - confirm
+        subsets=["chat", "bid", "tab", "nav"],
+        multiaction=False,
+        strict=False,
+        retry_with_force=True,
+        demo_mode="off",
+    ),
 }
 
 # all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()`
@@ -354,4 +363,19 @@ def subset_from_regexp(self, column, regexp):
         ),
         task_metadata=task_metadata("assistantbench"),
     ),
+    "gaia": lambda: Benchmark(  # wrapped in a lambda so the Benchmark is only built when the entry is called
+        name="gaia",
+        high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["gaia"],
+        is_multi_tab=True,
+        backends=["gaia"],
+        env_args_list=make_env_args_list_from_repeat_tasks(
+            task_list=task_list_from_metadata(  # presumably keeps tasks whose browsergym_split matches the pattern - confirm
+                metadata=task_metadata("gaia"), filter={"browsergym_split": "valid|test"}
+            ),
+            max_steps=15,
+            n_repeats=1,
+            seeds_rng=np.random.RandomState(42),  # RNG constructed from the fixed seed value 42
+        ),
+        task_metadata=task_metadata("gaia"),
+    ),
 }