Format files with ruff and add a pre-commit hook to enforce formatting + linting (#323)

* Run `ruff format`

* Run `ruff check --fix`

* Add ruff lint and format pre-commit config
kabirgh authored Feb 13, 2025
1 parent 5a5173e commit 2f621d5
Showing 55 changed files with 1,980 additions and 1,110 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version
+    rev: v0.9.6
+    hooks:
+      # Run the linter
+      - id: ruff
+        # Only fix newly changed lines
+        args: [ --fix, --diff ]
+      # Run the formatter
+      - id: ruff-format
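
Usage note, not part of the diff: with this file at the repository root, contributors typically run `pre-commit install` once to register the git hook, after which the linter and formatter run on every commit; `pre-commit run --all-files` applies them to the whole tree on demand. Both are standard pre-commit CLI commands.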
8 changes: 4 additions & 4 deletions docs/20240406_devin_validate/get_devin_preds.ipynb
@@ -55,7 +55,7 @@
" return {\n",
" \"model_name_or_path\": \"devin-20240406\",\n",
" \"instance_id\": inst_id,\n",
" \"model_patch\": pred\n",
" \"model_patch\": pred,\n",
" }"
]
},
@@ -78,9 +78,9 @@
],
"source": [
"predictions = []\n",
"for pred_file in \\\n",
" glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + \\\n",
" glob(\"devin-swebench-results/output_diffs/pass/*.txt\"):\n",
"for pred_file in glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + glob(\n",
" \"devin-swebench-results/output_diffs/pass/*.txt\"\n",
"):\n",
" predictions.append(convert_devin_txt_to_pred(pred_file))\n",
"len(predictions)"
]
8 changes: 4 additions & 4 deletions docs/20240415_eval_bug/check_harness.ipynb
@@ -24,7 +24,7 @@
"\n",
"# NOTE: We have not released the gold predictions, so this is just a placeholder and will not work\n",
"golds = [json.loads(x) for x in open(\"gold_preds.jsonl\")]\n",
"golds = {x['instance_id']: x for x in golds}"
"golds = {x[\"instance_id\"]: x for x in golds}"
]
},
{
@@ -61,11 +61,11 @@
"source": [
"check_harness = []\n",
"for repo, version in repo_version_pairs:\n",
" subset = [x for x in data if x['repo'] == repo and x['version'] == version]\n",
" subset = [x for x in data if x[\"repo\"] == repo and x[\"version\"] == version]\n",
" if len(subset) == 0:\n",
" continue\n",
" subset = sorted(subset, key=lambda x: x['created_at'], reverse=False)\n",
" inst_id = subset[-1]['instance_id']\n",
" subset = sorted(subset, key=lambda x: x[\"created_at\"], reverse=False)\n",
" inst_id = subset[-1][\"instance_id\"]\n",
" check_harness.append(golds[inst_id])\n",
"len(check_harness)"
]
70 changes: 36 additions & 34 deletions docs/20240415_eval_bug/sweep_conda_links.py
@@ -8,54 +8,56 @@
"""

conda_links = [
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh",
]

for conda_link in conda_links:
-    version = conda_link.split("/")[-1]\
-        .split("-", 1)[1]\
-        .rsplit("-", 2)[0]\
-        .replace(".", "_")\
-        .replace("-", "_")
+    version = (
+        conda_link.split("/")[-1]
+        .split("-", 1)[1]
+        .rsplit("-", 2)[0]
+        .replace(".", "_")
+        .replace("-", "_")
+    )
    os.makedirs(f"/n/fs/p-swe-bench/results/{version}/", exist_ok=True)

    cmd = (
-        "python evaluation.py "
-        "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl "
-        "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json "
-        f"--log_dir /n/fs/p-swe-bench/results/{version}/ "
-        f"--conda_link {conda_link} "
-        "--testbed /n/fs/p-swe-bench/testbed/ "
-        "--timeout 1200 "
-        "--verbose "
+        "python evaluation.py "
+        "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl "
+        "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json "
+        f"--log_dir /n/fs/p-swe-bench/results/{version}/ "
+        f"--conda_link {conda_link} "
+        "--testbed /n/fs/p-swe-bench/testbed/ "
+        "--timeout 1200 "
+        "--verbose "
    )

    # Run subprocess
    subprocess.run(cmd, shell=True)

    # Move results, scorecard to results/{version} log_dir
    subprocess.run(
-        f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json",
-        shell=True
+        f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json",
+        shell=True,
    )
    subprocess.run(
-        f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json",
-        shell=True
+        f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json",
+        shell=True,
    )

    # Clear testbed
-    subprocess.run(f"rm -rf /n/fs/p-swe-bench/testbed/*", shell=True)
+    subprocess.run("rm -rf /n/fs/p-swe-bench/testbed/*", shell=True)
118 changes: 73 additions & 45 deletions docs/20240415_eval_bug/update_swe_bench.ipynb
@@ -33,8 +33,8 @@
"metadata": {},
"outputs": [],
"source": [
"data = load_dataset('princeton-nlp/SWE-bench', split='test')\n",
"data_map = {x['instance_id']: x for x in data}"
"data = load_dataset(\"princeton-nlp/SWE-bench\", split=\"test\")\n",
"data_map = {x[\"instance_id\"]: x for x in data}"
]
},
{
@@ -106,39 +106,42 @@
" new_dataset = []\n",
" changed_f2p = []\n",
" changed_p2p = []\n",
" \n",
"\n",
" for d in tqdm(data):\n",
" log_path = os.path.join(log_folder, f\"{d['instance_id']}.log\")\n",
" if not os.path.exists(log_path):\n",
" ids_no_log.append((d['instance_id'], d['version']))\n",
" ids_no_log.append((d[\"instance_id\"], d[\"version\"]))\n",
" continue\n",
" status_map, applied = get_logs_eval(log_path)\n",
" f2p_old = json.loads(d['FAIL_TO_PASS'])\n",
" p2p_old = json.loads(d['PASS_TO_PASS'])\n",
" f2p_old = json.loads(d[\"FAIL_TO_PASS\"])\n",
" p2p_old = json.loads(d[\"PASS_TO_PASS\"])\n",
"\n",
" # NOTE: Change to `all` to enforce f2ps must all exist\n",
" tests_reproduced = any([ \n",
" f2p in status_map and status_map[f2p] == 'PASSED'\n",
" for f2p in f2p_old\n",
" ])\n",
" tests_reproduced = any(\n",
" [f2p in status_map and status_map[f2p] == \"PASSED\" for f2p in f2p_old]\n",
" )\n",
" if not tests_reproduced:\n",
" ids_reproduce_fail.append((d['instance_id'], d['version']))\n",
" ids_reproduce_fail.append((d[\"instance_id\"], d[\"version\"]))\n",
" continue\n",
"\n",
" f2p_new = [k for k, v in status_map.items() if v == 'PASSED' and k in f2p_old]\n",
" p2p_new = [k for k, v in status_map.items() if v == 'PASSED' and k not in f2p_old]\n",
" \n",
" f2p_new = [k for k, v in status_map.items() if v == \"PASSED\" and k in f2p_old]\n",
" p2p_new = [\n",
" k for k, v in status_map.items() if v == \"PASSED\" and k not in f2p_old\n",
" ]\n",
"\n",
" if sorted(f2p_old) != sorted(f2p_new):\n",
" changed_f2p.append((d['instance_id'], d['version']))\n",
" changed_f2p.append((d[\"instance_id\"], d[\"version\"]))\n",
" if sorted(p2p_old) != sorted(p2p_new):\n",
" changed_p2p.append((d['instance_id'], d['version']))\n",
" changed_p2p.append((d[\"instance_id\"], d[\"version\"]))\n",
"\n",
" new_dataset.append({\n",
" **d,\n",
" # NOTE: Comment out following line to maintain original tests\n",
" 'FAIL_TO_PASS': f2p_new,\n",
" 'PASS_TO_PASS': p2p_new,\n",
" })\n",
" new_dataset.append(\n",
" {\n",
" **d,\n",
" # NOTE: Comment out following line to maintain original tests\n",
" \"FAIL_TO_PASS\": f2p_new,\n",
" \"PASS_TO_PASS\": p2p_new,\n",
" }\n",
" )\n",
" return ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p"
]
},
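
A toy run of the reproduction check above, not from the commit; status_map mirrors what get_logs_eval returns, and the test names are invented.

status_map = {"test_a": "PASSED", "test_b": "FAILED"}
f2p_old = ["test_a", "test_c"]  # expected FAIL_TO_PASS tests
tests_reproduced = any(
    [f2p in status_map and status_map[f2p] == "PASSED" for f2p in f2p_old]
)
print(tests_reproduced)  # True; with all() instead of any() this would be False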
@@ -271,13 +274,15 @@
"# Loop through\n",
"map_folder_id_to_conda_id = {}\n",
"for conda_id in conda_ids:\n",
" temp = conda_id.replace('.', '_').replace('-', '_')\n",
" temp = conda_id.replace(\".\", \"_\").replace(\"-\", \"_\")\n",
" map_folder_id_to_conda_id[temp] = conda_id\n",
" conda_id = temp\n",
" print(conda_id)\n",
"\n",
" log_folder = f\"{folder}/{conda_id}\"\n",
" ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = survey_updated_logs(log_folder)\n",
" ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = (\n",
" survey_updated_logs(log_folder)\n",
" )\n",
" if len(ids_no_log) == 0 and len(ids_reproduce_fail) == 0 and len(new_dataset) == 0:\n",
" continue\n",
"\n",
@@ -286,13 +291,16 @@
" print(f\"- # Changed (F2P): {len(changed_f2p)}\")\n",
" print(f\"- # Changed (P2P): {len(changed_p2p)}\")\n",
" with open(f\"{folder_out}/results_{conda_id}.json\", \"w\") as f:\n",
" json.dump({\n",
" \"conda_id\": conda_id,\n",
" \"new_dataset\": new_dataset,\n",
" \"ids_reproduce_fail\": ids_reproduce_fail,\n",
" \"changed_f2p\": changed_f2p,\n",
" \"changed_p2p\": changed_p2p,\n",
" }, fp=f)"
" json.dump(\n",
" {\n",
" \"conda_id\": conda_id,\n",
" \"new_dataset\": new_dataset,\n",
" \"ids_reproduce_fail\": ids_reproduce_fail,\n",
" \"changed_f2p\": changed_f2p,\n",
" \"changed_p2p\": changed_p2p,\n",
" },\n",
" fp=f,\n",
" )"
]
},
{
@@ -325,10 +333,18 @@
" \"\"\"\n",
" max_count, conda_id_best = 0, None\n",
" for x in conda_ids:\n",
" results_path = f\"{folder_out}/results_{x.replace('-', '_').replace('.', '_')}.json\"\n",
" results_path = (\n",
" f\"{folder_out}/results_{x.replace('-', '_').replace('.', '_')}.json\"\n",
" )\n",
" data_adjusted = json.load(open(results_path))\n",
" conda_id = results_path.split(\"/\")[1][len(\"results_\"):-len(\".json\")]\n",
" count = sum([1 for x in data_adjusted['new_dataset'] if x['repo'] == repo and x['version'] == version])\n",
" conda_id = results_path.split(\"/\")[1][len(\"results_\") : -len(\".json\")]\n",
" count = sum(\n",
" [\n",
" 1\n",
" for x in data_adjusted[\"new_dataset\"]\n",
" if x[\"repo\"] == repo and x[\"version\"] == version\n",
" ]\n",
" )\n",
" if count > max_count:\n",
" max_count = count\n",
" conda_id_best = conda_id\n",
@@ -477,11 +493,13 @@
"source": [
"repo_version_to_conda_id = {}\n",
"total = 0\n",
"test_set_repo_version_pairs = set([(x['repo'], x['version']) for x in data_map.values()])\n",
"test_set_repo_version_pairs = set(\n",
" [(x[\"repo\"], x[\"version\"]) for x in data_map.values()]\n",
")\n",
"\n",
"# Loop through all repo/version combos\n",
"for repo, v in MAP_VERSION_TO_INSTALL.items():\n",
" if repo not in set(data['repo']):\n",
" if repo not in set(data[\"repo\"]):\n",
" # Do not proceed for repos that are not in test set\n",
" continue\n",
"\n",
@@ -543,12 +561,18 @@
"for repo, version_map in tqdm(repo_version_to_conda_id.items()):\n",
" for version, conda_id in version_map.items():\n",
" if conda_id != None:\n",
" conda_id = conda_id.replace('.', '_').replace('-', '_')\n",
" adjusted_data = json.load(open(f\"{folder_out}/results_{conda_id}.json\"))['new_dataset']\n",
" adjusted_data = [x for x in adjusted_data if x['repo'] == repo and x['version'] == version]\n",
" conda_id = conda_id.replace(\".\", \"_\").replace(\"-\", \"_\")\n",
" adjusted_data = json.load(open(f\"{folder_out}/results_{conda_id}.json\"))[\n",
" \"new_dataset\"\n",
" ]\n",
" adjusted_data = [\n",
" x\n",
" for x in adjusted_data\n",
" if x[\"repo\"] == repo and x[\"version\"] == version\n",
" ]\n",
" new_dataset_agg.extend(adjusted_data)\n",
"\n",
"new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x['instance_id'])\n",
"new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x[\"instance_id\"])\n",
"print(len(new_dataset_agg))"
]
},
@@ -576,11 +600,15 @@
"outputs": [],
"source": [
"# Set subraction between original test set and new dataset\n",
"failing_inst_ids = sorted([\n",
" x for x in list(\n",
" set([(x['instance_id'], x['version']) for x in data]) -\n",
" set([(x['instance_id'], x['version']) for x in new_dataset_agg])\n",
" )])"
"failing_inst_ids = sorted(\n",
" [\n",
" x\n",
" for x in list(\n",
" set([(x[\"instance_id\"], x[\"version\"]) for x in data])\n",
" - set([(x[\"instance_id\"], x[\"version\"]) for x in new_dataset_agg])\n",
" )\n",
" ]\n",
")"
]
},
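
The set subtraction above recovers the instances present in the original test set but missing from the aggregated new dataset; a toy run with invented ids, not from the commit:

old_pairs = {("astropy__astropy-1", "1.3"), ("django__django-2", "4.0")}
new_pairs = {("django__django-2", "4.0")}
print(sorted(old_pairs - new_pairs))  # [('astropy__astropy-1', '1.3')]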
{
