
Comparing changes

base repository: swe-bench/SWE-bench (base: main)
head repository: sweepai/SWE-bench (compare: main)

These branches can't be merged automatically, but a pull request can still be created.

Commits on Mar 14, 2024

  1. Use the more portable `.` instead of `source`

    This avoids errors such as `/bin/sh: 1: source: not found`
    
    ```
    $ python harness/run_evaluation.py     --predictions_path example-predictions.json  --swe_bench_tasks example.json     --log_dir example-log/     --testbed example-testbed/
    Error: Command 'source /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpdx4urtao/miniconda3/bin/activate sqlfluff__sqlfluff__1.4 && echo 'activate successful' && pip install -r /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpfnim3stm/requirements.txt' returned non-zero exit status 127.
    Error stdout:
    Error stderr: /bin/sh: 1: source: not found
    
    Error traceback: Traceback (most recent call last):
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/context_manager.py", line 50, in __call__
        output = subprocess.run(cmd, **combined_args)
      File "/home/jgross/.local64/mambaforge/envs/swe-bench/lib/python3.9/subprocess.py", line 528, in run
        raise CalledProcessError(retcode, process.args,
    subprocess.CalledProcessError: Command 'source /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpdx4urtao/miniconda3/bin/activate sqlfluff__sqlfluff__1.4 && echo 'activate successful' && pip install -r /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpfnim3stm/requirements.txt' returned non-zero exit status 127.
    
    Traceback (most recent call last):
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/run_evaluation.py", line 186, in <module>
        main(**vars(args))
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/run_evaluation.py", line 162, in main
        eval_engine(args)
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/engine_evaluation.py", line 164, in main
        setup_testbed(data_groups[0])
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/engine_validation.py", line 89, in setup_testbed
        with TestbedContextManager(
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/context_manager.py", line 288, in __enter__
        self.exec(cmd, shell=True)
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/context_manager.py", line 58, in __call__
        raise e
      File "/home/jgross/Documents/GitHub/SWE-bench/harness/context_manager.py", line 50, in __call__
        output = subprocess.run(cmd, **combined_args)
      File "/home/jgross/.local64/mambaforge/envs/swe-bench/lib/python3.9/subprocess.py", line 528, in run
        raise CalledProcessError(retcode, process.args,
    subprocess.CalledProcessError: Command 'source /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpdx4urtao/miniconda3/bin/activate sqlfluff__sqlfluff__1.4 && echo 'activate successful' && pip install -r /home/jgross/Documents/GitHub/SWE-bench/example-testbed/fakemodel/sqlfluff__sqlfluff/1.4/tmpfnim3stm/requirements.txt' returned non-zero exit status 127.
    ```
    JasonGross authored and xingyaoww committed Mar 14, 2024 (d0b5d30); a short sketch of this `source` vs `.` failure follows the Mar 14 commits below.
  2. add docker

    xingyaoww committed Mar 14, 2024 (39d43ed)
  3. fix matplotlib

    xingyaoww committed Mar 14, 2024 (8653c7a)
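
For context on the first commit: `source` is a bash builtin, while `.` is its POSIX equivalent, and `subprocess.run(..., shell=True)` executes commands through `/bin/sh`, which on Debian/Ubuntu is dash and has no `source`. A minimal sketch of the failure mode (assuming `/bin/sh` is dash; sourcing `/dev/null` merely stands in for the conda activate script):

```python
import subprocess

# shell=True runs the command through /bin/sh. Under dash there is no `source`
# builtin, so the first command fails with "source: not found" and exit code 127.
print(subprocess.run("source /dev/null && echo ok", shell=True).returncode)  # 127 under dash

# The POSIX spelling `.` reads the same file in the current shell and works in any sh.
print(subprocess.run(". /dev/null && echo ok", shell=True).returncode)       # 0
```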

Commits on Mar 20, 2024

  1. Commit cbf19bd

Commits on Mar 22, 2024

  1. Merge pull request #1 from xingyaoww/main

    Merge some SWE-Bench fixes
    JustinLin610 authored Mar 22, 2024 (aa45fef)

Commits on Mar 25, 2024

  1. Commit 5005cfb
  2. Commit 47c30a9
  3. Cleanup

    libowen2121 committed Mar 25, 2024 (f1b4650)
  4. fix: typos

    huybery committed Mar 25, 2024 (21340c7)

Commits on Mar 27, 2024

  1. Commit 1bb85c6
  2. Commit caa5e65
  3. Commit 590a050
  4. Merge pull request #2 from libowen2121/feature-infer

    Add support for open source model inference
    JustinLin610 authored Mar 27, 2024 (39f5fb8)

Commits on Mar 31, 2024

  1. Commit 8a4d216

Commits on Apr 1, 2024

  1. Commit 163fecb
  2. Commit 9b8dde6
  3. fix astropy for v5+

    dorbanianas committed Apr 1, 2024 (302f81f)
  4. fix scikit-learn

    dorbanianas committed Apr 1, 2024 (35ab948)

Commits on Apr 6, 2024

  1. Fixed hella bugs

    kevinlu1248 committed Apr 6, 2024 (3430581)
  2. Merge remote-tracking branch 'dorbanianas-fork/ad/fix-devin-benchmark' into fix/sphinx-version-issues

    kevinlu1248 committed Apr 6, 2024 (5595053)

Commits on Apr 7, 2024

  1. Additional bug fixes

    kevinlu1248 committed Apr 7, 2024 (925baa8)
  2. Versioning fixes

    kevinlu1248 committed Apr 7, 2024 (1f7f7e7)
  3. Cleanup and fixes

    kevinlu1248 committed Apr 7, 2024 (290071d)
5 changes: 5 additions & 0 deletions .gitignore
```diff
@@ -172,3 +172,8 @@ analysis/**/scratch*
 analysis/benchmark/plots/
 analysis/evaluation/*.csv
 analysis/evaluation/*.pdf
+
+harness/logs/**
+harness/predictions/**
+harness/testbed/**
+**/miniconda.sh
```
35 changes: 35 additions & 0 deletions Dockerfile (new file)

```dockerfile
FROM ubuntu:20.04

# https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192
RUN apt-get update && \
    apt-get install -y bash gcc git jq wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN git config --global user.email "swebench@pnlp.org"
RUN git config --global user.name "swebench"

RUN apt update && apt install -y build-essential

# Create new user
RUN useradd -ms /bin/bash swe-bench
USER swe-bench
WORKDIR /home/swe-bench

# Setup Conda
ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}"
ARG PATH="/home/swe-bench/miniconda3/bin:${PATH}"
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh \
    && mkdir ~/.conda \
    && bash miniconda.sh -b \
    && rm -f miniconda.sh
RUN conda --version

# Setup SWE-Bench Env
COPY environment.yml .
RUN conda env create -f environment.yml

# Some missing packages
RUN pip install datasets python-dotenv gitpython

CMD ["/bin/bash"]
```
2 changes: 1 addition & 1 deletion collect/cleanup/remove_envs.py
```diff
@@ -71,7 +71,7 @@ def remove_environment(env_name, prefix):
     args = parser.parse_args()
 
     # Remove conda environments with a specific prefix
-    conda_source = "source " + os.path.join(args.conda_path, "etc/profile.d/conda.sh")
+    conda_source = ". " + os.path.join(args.conda_path, "etc/profile.d/conda.sh")
     check_env = conda_source + " && " + "conda env list"
     try:
         conda_envs = subprocess.run(check_env.split(" "), check=True, capture_output=True)
```
67 changes: 50 additions & 17 deletions harness/constants.py
```diff
@@ -1,26 +1,27 @@
+# Note: Need build-esential for sklearn to work properly
 MAP_VERSION_TO_INSTALL_SKLEARN = {
     k: {
         "python": "3.6",
-        "packages": "numpy scipy cython pytest pandas matplotlib",
+        "packages": "numpy scipy Cython==0.27 pytest pandas matplotlib",
         "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
     }
-    for k in ["0.20", "0.21", "0.22"]
+    for k in ["0.20"]
 }
 MAP_VERSION_TO_INSTALL_SKLEARN.update(
     {
         k: {
             "python": "3.7",
-            "packages": "numpy scipy cython pytest pandas matplotlib",
+            "packages": "numpy scipy Cython==0.29.33 pytest pandas matplotlib",
             "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
         }
-        for k in ["0.23", "0.24"]
+        for k in ["0.21", "0.22", "0.23", "0.24"]
     }
 )
 MAP_VERSION_TO_INSTALL_SKLEARN.update(
     {
         k: {
-            "python": "3.9",
-            "packages": "numpy scipy cython pytest pandas matplotlib joblib threadpoolctl",
+            "python": "3.8",
+            "packages": "numpy scipy Cython==0.29.33 pytest pandas matplotlib joblib threadpoolctl",
             "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
         }
         for k in ["1.0", "1.1", "1.2", "1.3", "1.4"]
@@ -165,6 +166,7 @@
         "python": "3.9",
         "packages": "environment.yml",
         "install": "python -m pip install -e .",
+        "pip_packages": "pytest",
     }
     for k in ["3.5", "3.6", "3.7"]
 }
@@ -174,6 +176,7 @@
         "python": "3.8",
         "packages": "requirements.txt",
         "install": "python -m pip install -e .",
+        "pip_packages": "pytest"
     }
     for k in ["3.1", "3.2", "3.3", "3.4"]
 }
@@ -184,6 +187,7 @@
         "python": "3.7",
         "packages": "requirements.txt",
         "install": "python -m pip install -e .",
+        "pip_packages": "pytest",
     }
     for k in ["3.0"]
 }
@@ -193,6 +197,7 @@
     k: {
         "python": "3.5",
         "install": "python setup.py build; python setup.py install",
+        "pip_packages": "pytest",
     }
     for k in ["2.0", "2.1", "2.2", "1.0", "1.1", "1.2", "1.3", "1.4", "1.5"]
 }
@@ -210,17 +215,46 @@
     ["4.5", "5.0", "5.1", "5.2", "5.3", "6.0", "6.2", "7.0", "7.1", "7.2"]
 }
 for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0"]:
-    MAP_VERSION_TO_INSTALL_SPHINX[k][
-        "pre_install"
-    ].append("sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py")
+    MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([
+        "sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py",
+        "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp==1.0.4/' setup.py",
+        "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp==1.0.2/' setup.py",
+        "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp==2.0.1/' setup.py",
+        "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml==1.1.5/' setup.py",
+        "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp==1.0.3/' setup.py",
+        "sed -i 's/alabaster>=0.7,<0.8/alabaster<0.7/' setup.py",
+    ])
 
 MAP_VERSION_TO_INSTALL_ASTROPY = {
-    k: {"python": "3.9", "install": "pip install -e .[test]"}
-    for k in
-    ["0.1", "0.2", "0.3", "0.4", "1.1", "1.2", "1.3", "3.0", "3.1", "3.2"] + \
-    ["4.1", "4.2", "4.3", "5.0", "5.1", "5.2"]
+    k: {
+        "python": "3.9",
+        "install": "python setup.py install",
+        "pip_packages": "pytest extension-helpers numpy==1.19.5",
+    }
+    for k in ["0.1", "0.2", "0.3", "0.4", "1.1", "1.2", "1.3", "3.0", "3.1", "3.2"]
 }
+
+MAP_VERSION_TO_INSTALL_ASTROPY.update(
+    {
+        k: {
+            "python": "3.9",
+            "install": "pip install -e .[test]",
+            "pip_packages": "pytest extension-helpers numpy==1.19.5",
+        }
+        for k in ["4.1", "4.2", "4.3", "5.0", "5.1", "5.2"]
+    }
+)
 
+MAP_VERSION_TO_INSTALL_SYMPY = {
+    k: {
+        "python": "3.7",
+        "packages": "mpmath flake8",
+        "pip_packages": "flake8-comprehensions",
+        "install": "pip install -e .",
+    }
+    for k in
+    ["0.7", "1.0", "1.1", "1.10", "1.11", "1.12", "1.2", "1.4", "1.5"]
+}
 MAP_VERSION_TO_INSTALL_SYMPY = {
     k: {
         "python": "3.9",
@@ -229,8 +263,7 @@
         "install": "pip install -e .",
     }
     for k in
-    ["0.7", "1.0", "1.1", "1.10", "1.11", "1.12", "1.2", "1.4", "1.5", "1.6"] + \
-    ["1.7", "1.8", "1.9"]
+    ["1.6", "1.7", "1.8", "1.9"]
 }
 MAP_VERSION_TO_INSTALL_SYMPY.update(
     {
@@ -414,7 +447,7 @@
 MAP_REPO_TO_TEST_FRAMEWORK = {
     "astropy/astropy": TEST_PYTEST,
     "dbt-labs/dbt-core": TEST_PYTEST,
-    "django/django": "./tests/runtests.py --verbosity 2",
+    "django/django": "./tests/runtests.py --verbosity 2 --parallel=1",
     "huggingface/transformers": TEST_PYTEST,
     "marshmallow-code/marshmallow": TEST_PYTEST,
     "matplotlib/matplotlib": TEST_PYTEST,
@@ -472,4 +505,4 @@
 
 # Constants - Miscellaneous
 NON_TEST_EXTS = [".json", ".png", "csv", ".txt", ".md", ".jpg", ".jpeg", ".pkl", ".yml", ".yaml", ".toml"]
-SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/"
+SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/"
```
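
To make the role of these pinned entries concrete, here is a rough standalone sketch (not the harness's own code) of how a spec dict of this shape can be turned into the conda-create and pip-install commands the harness runs; `build_setup_cmds` is a hypothetical helper name and the activation path is illustrative:

```python
def build_setup_cmds(env_name: str, path_activate: str, spec: dict) -> list:
    """Hypothetical helper mirroring the pattern in harness/context_manager.py:
    create the env, activate it with the POSIX `.`, then run pip installs.
    (The real harness handles "environment.yml"/"requirements.txt" values separately.)"""
    cmds = [f"conda create -n {env_name} python={spec['python']} -y"]
    activate = f". {path_activate} {env_name} && echo 'activate successful'"
    if spec.get("packages"):
        cmds.append(f"{activate} && pip install {spec['packages']}")
    if spec.get("pip_packages"):
        cmds.append(f"{activate} && pip install {spec['pip_packages']}")
    cmds.append(f"{activate} && {spec['install']}")
    return cmds

# Example: the scikit-learn "0.20" entry from the diff above.
spec = {
    "python": "3.6",
    "packages": "numpy scipy Cython==0.27 pytest pandas matplotlib",
    "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
}
for cmd in build_setup_cmds("sklearn__0.20", "~/miniconda3/bin/activate", spec):
    print(cmd)
```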
86 changes: 68 additions & 18 deletions harness/context_manager.py
```diff
@@ -1,4 +1,6 @@
 import logging, os, platform, subprocess
+import os.path
+import shutil
 
 from constants import (
     APPLY_PATCH_FAIL,
@@ -196,13 +198,17 @@ def __enter__(self):
                 cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-aarch64.sh"
             else:
                 raise ValueError("Unknown computer platform " + platform.system())
-            download_cmd = [
-                "wget",
-                cmd_line_install_link,
-                "-O",
-                miniconda_sh,
-            ]
-            self.exec(download_cmd)
+
+            temp_miniconda_sh = f"{os.getcwd()}/miniconda.sh"
+            if not os.path.exists(temp_miniconda_sh):
+                download_cmd = [
+                    "wget",
+                    cmd_line_install_link,
+                    "-O",
+                    temp_miniconda_sh,
+                ]
+                self.exec(download_cmd)
+            shutil.copy(temp_miniconda_sh, miniconda_sh)
 
             # Install Miniconda
             install_cmd = ["bash", miniconda_sh, "-b", "-u", "-p", self.path_conda]
@@ -284,7 +290,7 @@ def __enter__(self):
 
             # Install dependencies
             path_to_reqs = get_requirements(setup_ref_instance, self.testbed)
-            cmd = f"source {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}"
+            cmd = f". {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}"
            logger_testbed.info(
                 f"[Testbed] Installing dependencies for {env_name}; Command: {cmd}"
             )
@@ -321,19 +327,43 @@ def __enter__(self):
                 os.remove(path_to_reqs)
             else:
                 # Create environment + install dependencies
-                cmd = f"{exec_cmd} create -n {env_name} python={install['python']} {pkgs} -y"
+                cmd = f"{exec_cmd} create -n {env_name} python={install['python']} -y"
                 logger_testbed.info(
                     f"[Testbed] Creating environment {env_name}; Command: {cmd}"
                 )
                 self.exec(cmd.split(" "))
 
+                # Activate the environment and install the packages
+                activate_cmd = f". {path_activate} {env_name}"
+                full_cmd = f"{activate_cmd} && echo 'activate successful'"
+                if pkgs != "":
+                    install_cmd = f"pip install {pkgs}"
+                    full_cmd += f" && {install_cmd}"
+                logger_testbed.info(
+                    f"[Testbed] Installing environment dependencies {pkgs}; Command: {full_cmd}"
+                )
+                self.exec(
+                    full_cmd,
+                    shell=True,
+                    executable="/bin/bash",
+                    timeout=self.timeout,
+                    env=None
+                )
+
             # Install additional packages if specified
             if "pip_packages" in install:
-                cmd = f"source {path_activate} {env_name} && pip install {install['pip_packages']}"
+                cmd = f". {path_activate} {env_name} && pip install {install['pip_packages']}"
                 logger_testbed.info(
                     f"[Testbed] Installing pip packages for {env_name}; Command: {cmd}"
                 )
-                self.exec(cmd, shell=True)
+                # breakpoint()
+                self.exec(
+                    cmd,
+                    shell=True,
+                    executable="/bin/bash",
+                    timeout=self.timeout,
+                    env=None
+                )
 
         return self
 
@@ -439,10 +469,7 @@ def __init__(
         )
         self.log_file = os.path.join(log_dir, log_file_name)
 
-        self.cmd_activate = (
-            f"source {os.path.join(self.conda_path, 'bin', 'activate')} "
-            + f"{self.venv} && echo 'activate successful'"
-        )
+        self.cmd_activate = f". {os.path.join(self.conda_path, 'bin', 'activate')} {self.venv} && echo 'activate successful'"
         self.timeout = timeout
 
         shellenv = os.environ.copy()
@@ -520,6 +547,8 @@ def run_install_task(self, instance: dict) -> bool:
         # Get installation instructions by repo/version
         specifications = MAP_VERSION_TO_INSTALL[instance["repo"]][instance["version"]]
 
+        # breakpoint()
+
         # Run pre-install set up if provided
         if "pre_install" in specifications:
             for pre_install in specifications["pre_install"]:
@@ -528,7 +557,11 @@ def run_install_task(self, instance: dict) -> bool:
                     f"[{self.testbed_name}] [{instance[KEY_INSTANCE_ID]}] Running pre-install setup command: {cmd_pre_install}"
                 )
                 out_pre_install = self.exec(
-                    cmd_pre_install, timeout=self.timeout, shell=True
+                    cmd_pre_install,
+                    timeout=self.timeout,
+                    shell=True,
+                    executable="/bin/bash",
+                    env=None
                 )
                 with open(self.log_file, "a") as f:
                     f.write(f"Pre-installation Command: {cmd_pre_install}\n")
@@ -552,7 +585,8 @@ def run_install_task(self, instance: dict) -> bool:
         )
         try:
             # Run installation command
-            out_install = self.exec(cmd_install, timeout=self.timeout, shell=True)
+            breakpoint()
+            out_install = self.exec(cmd_install, timeout=self.timeout, shell=True, executable="/bin/bash", env=None)
 
             # Write installation logs to log file
             with open(self.log_file, "a") as f:
@@ -663,8 +697,24 @@ def run_tests_task(self, instance: dict):
             test_cmd = f"{self.cmd_activate} && {instance['test_cmd']}"
             with open(self.log_file, "a") as f:
                 f.write(f"Test Script: {test_cmd};\n")
+            print(self.exec(
+                "pwd",
+                shell=True,
+                timeout=self.timeout,
+                check=False,
+                executable="/bin/bash",
+                text=True,
+                env=None
+            ))
+            # breakpoint()
             out_test = self.exec(
-                test_cmd, shell=True, timeout=self.timeout, check=False
+                test_cmd,
+                shell=True,
+                timeout=self.timeout,
+                check=False,
+                executable="/bin/bash",
+                text=True,
+                env=None
             )
 
             # Write test results to log file
```
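
Many of the edits above thread `executable="/bin/bash"` (and `env=None`) through the `self.exec`/`subprocess.run` calls. A small sketch of the difference that makes, again assuming `/bin/sh` is dash:

```python
import subprocess

bash_only = "source /dev/null && echo activated"  # `source` is a bashism

# Default: shell=True uses /bin/sh, so this fails where sh is dash (exit code 127).
print(subprocess.run(bash_only, shell=True).returncode)

# Forcing bash, as the patched exec calls do, runs the same string successfully.
print(subprocess.run(bash_only, shell=True, executable="/bin/bash").returncode)  # 0
```

Passing `env=None` lets the child process inherit the parent's environment directly, presumably overriding the copied `shellenv` mapping these wrappers otherwise supply.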
1 change: 1 addition & 0 deletions harness/engine_validation.py
```diff
@@ -60,6 +60,7 @@ def verify_task_instances(data: dict):
             in SKIP_INSTANCES[task_instance["repo"]]
         ):
             continue
+        # breakpoint()
         if (
             not tcm.reset_task_env(task_instance)
             or not tcm.run_install_task(task_instance)
```
13 changes: 13 additions & 0 deletions harness/run_docker_interactive.sh (new file)

```bash
#!/bin/bash

DOCKER_IMAGE=xingyaoww/swe-bench
WORK_DIR=`pwd`

docker run \
    -it \
    --rm \
    --user $(id -u):$(id -g) \
    --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
    -v $WORK_DIR:/swe-bench \
    $DOCKER_IMAGE \
    bash -c "cd /swe-bench && bash"
```