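Fixed several evaluation-harness bugs: pinned Sphinx dependency versions, ran harness commands under /bin/bash, and forced serial Django test runs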

swe-bench · Apr 6, 2024 · 3430581 · 3430581
1 parent 39f5fb8
commit 3430581
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 16 deletions.
diff --git a/.gitignore b/.gitignore
@@ -172,3 +172,8 @@ analysis/**/scratch*
 analysis/benchmark/plots/
 analysis/evaluation/*.csv
 analysis/evaluation/*.pdf
+
+harness/logs/**
+harness/predictions/**
+harness/testbed/**
+**/miniconda.sh
diff --git a/harness/constants.py b/harness/constants.py
@@ -214,9 +214,15 @@
         ["4.5", "5.0", "5.1", "5.2", "5.3", "6.0", "6.2", "7.0", "7.1", "7.2"]
 }
 for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0"]:
-    MAP_VERSION_TO_INSTALL_SPHINX[k][
-        "pre_install"
-    ].append("sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py")
+    MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([
+        "sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py",
+        "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp==1.0.4/' setup.py",
+        "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp==1.0.2/' setup.py",
+        "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp==2.0.1/' setup.py",
+        "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml==1.1.5/' setup.py",
+        "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp==1.0.3/' setup.py",
+        "sed -i 's/alabaster>=0.7,<0.8/alabaster<0.7/' setup.py",
+    ])
 
 MAP_VERSION_TO_INSTALL_ASTROPY = {
     k: {"python": "3.9", "install": "pip install -e .[test]"}
@@ -418,7 +424,7 @@
 MAP_REPO_TO_TEST_FRAMEWORK = {
     "astropy/astropy": TEST_PYTEST,
     "dbt-labs/dbt-core": TEST_PYTEST,
-    "django/django": "./tests/runtests.py --verbosity 2",
+    "django/django": "./tests/runtests.py --verbosity 2 --parallel=1",
     "huggingface/transformers": TEST_PYTEST,
     "marshmallow-code/marshmallow": TEST_PYTEST,
     "matplotlib/matplotlib": TEST_PYTEST,

diff --git a/harness/context_manager.py b/harness/context_manager.py
@@ -339,7 +339,14 @@ def __enter__(self):
                     logger_testbed.info(
                         f"[Testbed] Installing pip packages for {env_name}; Command: {cmd}"
                     )
-                    self.exec(cmd, shell=True)
+                    # breakpoint()
+                    self.exec(
+                        cmd,
+                        shell=True,
+                        executable="/bin/bash",
+                        timeout=self.timeout,
+                        env=None
+                    )
 
         return self
 
@@ -523,6 +530,8 @@ def run_install_task(self, instance: dict) -> bool:
         # Get installation instructions by repo/version
         specifications = MAP_VERSION_TO_INSTALL[instance["repo"]][instance["version"]]
 
+        # breakpoint()
+
         # Run pre-install set up if provided
         if "pre_install" in specifications:
             for pre_install in specifications["pre_install"]:
@@ -531,7 +540,11 @@ def run_install_task(self, instance: dict) -> bool:
                     f"[{self.testbed_name}] [{instance[KEY_INSTANCE_ID]}] Running pre-install setup command: {cmd_pre_install}"
                 )
                 out_pre_install = self.exec(
-                    cmd_pre_install, timeout=self.timeout, shell=True
+                    cmd_pre_install, 
+                    timeout=self.timeout, 
+                    shell=True,
+                    executable="/bin/bash",
+                    env=None
                 )
                 with open(self.log_file, "a") as f:
                     f.write(f"Pre-installation Command: {cmd_pre_install}\n")
@@ -555,7 +568,8 @@ def run_install_task(self, instance: dict) -> bool:
         )
         try:
             # Run installation command
-            out_install = self.exec(cmd_install, timeout=self.timeout, shell=True)
+            # breakpoint()
+            out_install = self.exec(cmd_install, timeout=self.timeout, shell=True, executable="/bin/bash", env=None)
 
             # Write installation logs to log file
             with open(self.log_file, "a") as f:
@@ -664,10 +678,18 @@ def run_tests_task(self, instance: dict):
         try:
             # Run test command for task instance
             test_cmd = f"{self.cmd_activate} && {instance['test_cmd']}"
+            # test_cmd = test_cmd.replace("./tests/runtests.py ", "pip install -e . && ./tests/runtests.py --parallel=1 ") # Fix Django installs
             with open(self.log_file, "a") as f:
                 f.write(f"Test Script: {test_cmd};\n")
+            breakpoint()
             out_test = self.exec(
-                test_cmd, shell=True, timeout=self.timeout, check=False
+                test_cmd,
+                shell=True,
+                timeout=self.timeout,
+                check=False,
+                executable="/bin/bash",
+                text=True,
+                env=None
             )
 
             # Write test results to log file

diff --git a/harness/run_evaluation.sh b/harness/run_evaluation.sh
@@ -1,9 +1,13 @@
 #!/bin/bash
-python run_evaluation.py \
-    --predictions_path "<path to predictions (.json)>" \
-    --swe_bench_tasks "<path to `swe-bench.json`>" \
-    --log_dir "<path to folder>" \
-    --testbed "<path to folder>" \
-    --skip_existing \
-    --timeout 900 \
-    --verbose
+# python run_evaluation.py \
+#     --predictions_path "<path to predictions (.json)>" \
+#     --swe_bench_tasks "<path to `swe-bench.json`>" \
+#     --log_dir "<path to folder>" \
+#     --testbed "<path to folder>" \
+#     --skip_existing \
+#     --timeout 900 \
+#     --verbose
+
+# python run_evaluation.py --predictions_path=predictions/sweep-04-02__SWE-bench_unassisted__test.jsonl --log_dir=logs --swe_bench_tasks=test --testbed=testbed --num_processes=1 # i don't know if its swe-bench-test
+# python run_evaluation.py --predictions_path=predictions/ground_truth__SWE-bench_unassisted__test.jsonl --log_dir=logs --swe_bench_tasks=test --testbed=testbed --num_processes=1 # i don't know if its swe-bench-test
+python run_evaluation.py --predictions_path=predictions/ground_truth_subset__SWE-bench_unassisted__test.jsonl --log_dir=logs --swe_bench_tasks=test --testbed=testbed --num_processes=1 # i don't know if its swe-bench-test