Format files with ruff and add a pre-commit hook to enforce formatting + linting (#323)

* Run `ruff format`

* Run `ruff check --fix`

* Add ruff lint and format pre-commit config
kabirgh authored Feb 13, 2025
1 parent 5a5173e commit 2f621d5
Showing 55 changed files with 1,980 additions and 1,110 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version
+    rev: v0.9.6
+    hooks:
+      # Run the linter
+      - id: ruff
+        # Only fix newly changed lines
+        args: [ --fix, --diff ]
+      # Run the formatter
+      - id: ruff-format
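
Usage note, not part of the diff: with this file at the repository root, contributors typically run `pre-commit install` once to register the git hook, after which the linter and formatter run on every commit; `pre-commit run --all-files` applies them to the whole tree on demand. Both are standard pre-commit CLI commands.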
8 changes: 4 additions & 4 deletions docs/20240406_devin_validate/get_devin_preds.ipynb
@@ -55,7 +55,7 @@
" return {\n",
" \"model_name_or_path\": \"devin-20240406\",\n",
" \"instance_id\": inst_id,\n",
" \"model_patch\": pred\n",
" \"model_patch\": pred,\n",
" }"
]
},
@@ -78,9 +78,9 @@
],
"source": [
"predictions = []\n",
"for pred_file in \\\n",
" glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + \\\n",
" glob(\"devin-swebench-results/output_diffs/pass/*.txt\"):\n",
"for pred_file in glob(\"devin-swebench-results/output_diffs/fail/*.txt\") + glob(\n",
" \"devin-swebench-results/output_diffs/pass/*.txt\"\n",
"):\n",
" predictions.append(convert_devin_txt_to_pred(pred_file))\n",
"len(predictions)"
]
8 changes: 4 additions & 4 deletions docs/20240415_eval_bug/check_harness.ipynb
@@ -24,7 +24,7 @@
"\n",
"# NOTE: We have not released the gold predictions, so this is just a placeholder and will not work\n",
"golds = [json.loads(x) for x in open(\"gold_preds.jsonl\")]\n",
"golds = {x['instance_id']: x for x in golds}"
"golds = {x[\"instance_id\"]: x for x in golds}"
]
},
{
@@ -61,11 +61,11 @@
"source": [
"check_harness = []\n",
"for repo, version in repo_version_pairs:\n",
" subset = [x for x in data if x['repo'] == repo and x['version'] == version]\n",
" subset = [x for x in data if x[\"repo\"] == repo and x[\"version\"] == version]\n",
" if len(subset) == 0:\n",
" continue\n",
" subset = sorted(subset, key=lambda x: x['created_at'], reverse=False)\n",
" inst_id = subset[-1]['instance_id']\n",
" subset = sorted(subset, key=lambda x: x[\"created_at\"], reverse=False)\n",
" inst_id = subset[-1][\"instance_id\"]\n",
" check_harness.append(golds[inst_id])\n",
"len(check_harness)"
]
70 changes: 36 additions & 34 deletions docs/20240415_eval_bug/sweep_conda_links.py
@@ -8,54 +8,56 @@
"""

conda_links = [
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh",
-    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.9.0-0-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.10.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-1-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh",
+    "https://repo.anaconda.com/miniconda/Miniconda3-py38_23.11.0-2-Linux-x86_64.sh",
]

for conda_link in conda_links:
-    version = conda_link.split("/")[-1]\
-        .split("-", 1)[1]\
-        .rsplit("-", 2)[0]\
-        .replace(".", "_")\
-        .replace("-", "_")
+    version = (
+        conda_link.split("/")[-1]
+        .split("-", 1)[1]
+        .rsplit("-", 2)[0]
+        .replace(".", "_")
+        .replace("-", "_")
+    )
    os.makedirs(f"/n/fs/p-swe-bench/results/{version}/", exist_ok=True)

    cmd = (
-        "python evaluation.py "
-        "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl "
-        "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json "
-        f"--log_dir /n/fs/p-swe-bench/results/{version}/ "
-        f"--conda_link {conda_link} "
-        "--testbed /n/fs/p-swe-bench/testbed/ "
-        "--timeout 1200 "
-        "--verbose "
+        "python evaluation.py "
+        "--predictions_path /n/fs/p-swe-bench/data/original/gold_preds.jsonl "
+        "--swe_bench_tasks /n/fs/p-swe-bench/data/original/swe-bench.json "
+        f"--log_dir /n/fs/p-swe-bench/results/{version}/ "
+        f"--conda_link {conda_link} "
+        "--testbed /n/fs/p-swe-bench/testbed/ "
+        "--timeout 1200 "
+        "--verbose "
    )

    # Run subprocess
    subprocess.run(cmd, shell=True)

    # Move results, scorecard to results/{version} log_dir
    subprocess.run(
-        f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json",
-        shell=True
+        f"mv /n/fs/p-swe-bench/data/original/results.json /n/fs/p-swe-bench/results/{version}/results.json",
+        shell=True,
    )
    subprocess.run(
-        f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json",
-        shell=True
+        f"mv /n/fs/p-swe-bench/data/original/scorecard.json /n/fs/p-swe-bench/results/{version}/scorecard.json",
+        shell=True,
    )

    # Clear testbed
-    subprocess.run(f"rm -rf /n/fs/p-swe-bench/testbed/*", shell=True)
+    subprocess.run("rm -rf /n/fs/p-swe-bench/testbed/*", shell=True)
118 changes: 73 additions & 45 deletions docs/20240415_eval_bug/update_swe_bench.ipynb
@@ -33,8 +33,8 @@
"metadata": {},
"outputs": [],
"source": [
"data = load_dataset('princeton-nlp/SWE-bench', split='test')\n",
"data_map = {x['instance_id']: x for x in data}"
"data = load_dataset(\"princeton-nlp/SWE-bench\", split=\"test\")\n",
"data_map = {x[\"instance_id\"]: x for x in data}"
]
},
{
@@ -106,39 +106,42 @@
" new_dataset = []\n",
" changed_f2p = []\n",
" changed_p2p = []\n",
" \n",
"\n",
" for d in tqdm(data):\n",
" log_path = os.path.join(log_folder, f\"{d['instance_id']}.log\")\n",
" if not os.path.exists(log_path):\n",
" ids_no_log.append((d['instance_id'], d['version']))\n",
" ids_no_log.append((d[\"instance_id\"], d[\"version\"]))\n",
" continue\n",
" status_map, applied = get_logs_eval(log_path)\n",
" f2p_old = json.loads(d['FAIL_TO_PASS'])\n",
" p2p_old = json.loads(d['PASS_TO_PASS'])\n",
" f2p_old = json.loads(d[\"FAIL_TO_PASS\"])\n",
" p2p_old = json.loads(d[\"PASS_TO_PASS\"])\n",
"\n",
" # NOTE: Change to `all` to enforce f2ps must all exist\n",
" tests_reproduced = any([ \n",
" f2p in status_map and status_map[f2p] == 'PASSED'\n",
" for f2p in f2p_old\n",
" ])\n",
" tests_reproduced = any(\n",
" [f2p in status_map and status_map[f2p] == \"PASSED\" for f2p in f2p_old]\n",
" )\n",
" if not tests_reproduced:\n",
" ids_reproduce_fail.append((d['instance_id'], d['version']))\n",
" ids_reproduce_fail.append((d[\"instance_id\"], d[\"version\"]))\n",
" continue\n",
"\n",
" f2p_new = [k for k, v in status_map.items() if v == 'PASSED' and k in f2p_old]\n",
" p2p_new = [k for k, v in status_map.items() if v == 'PASSED' and k not in f2p_old]\n",
" \n",
" f2p_new = [k for k, v in status_map.items() if v == \"PASSED\" and k in f2p_old]\n",
" p2p_new = [\n",
" k for k, v in status_map.items() if v == \"PASSED\" and k not in f2p_old\n",
" ]\n",
"\n",
" if sorted(f2p_old) != sorted(f2p_new):\n",
" changed_f2p.append((d['instance_id'], d['version']))\n",
" changed_f2p.append((d[\"instance_id\"], d[\"version\"]))\n",
" if sorted(p2p_old) != sorted(p2p_new):\n",
" changed_p2p.append((d['instance_id'], d['version']))\n",
" changed_p2p.append((d[\"instance_id\"], d[\"version\"]))\n",
"\n",
" new_dataset.append({\n",
" **d,\n",
" # NOTE: Comment out following line to maintain original tests\n",
" 'FAIL_TO_PASS': f2p_new,\n",
" 'PASS_TO_PASS': p2p_new,\n",
" })\n",
" new_dataset.append(\n",
" {\n",
" **d,\n",
" # NOTE: Comment out following line to maintain original tests\n",
" \"FAIL_TO_PASS\": f2p_new,\n",
" \"PASS_TO_PASS\": p2p_new,\n",
" }\n",
" )\n",
" return ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p"
]
},
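
A toy run of the reproduction check above, not from the commit; status_map mirrors what get_logs_eval returns, and the test names are invented.

status_map = {"test_a": "PASSED", "test_b": "FAILED"}
f2p_old = ["test_a", "test_c"]  # expected FAIL_TO_PASS tests
tests_reproduced = any(
    [f2p in status_map and status_map[f2p] == "PASSED" for f2p in f2p_old]
)
print(tests_reproduced)  # True; with all() instead of any() this would be False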
@@ -271,13 +274,15 @@
"# Loop through\n",
"map_folder_id_to_conda_id = {}\n",
"for conda_id in conda_ids:\n",
" temp = conda_id.replace('.', '_').replace('-', '_')\n",
" temp = conda_id.replace(\".\", \"_\").replace(\"-\", \"_\")\n",
" map_folder_id_to_conda_id[temp] = conda_id\n",
" conda_id = temp\n",
" print(conda_id)\n",
"\n",
" log_folder = f\"{folder}/{conda_id}\"\n",
" ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = survey_updated_logs(log_folder)\n",
" ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = (\n",
" survey_updated_logs(log_folder)\n",
" )\n",
" if len(ids_no_log) == 0 and len(ids_reproduce_fail) == 0 and len(new_dataset) == 0:\n",
" continue\n",
"\n",
@@ -286,13 +291,16 @@
" print(f\"- # Changed (F2P): {len(changed_f2p)}\")\n",
" print(f\"- # Changed (P2P): {len(changed_p2p)}\")\n",
" with open(f\"{folder_out}/results_{conda_id}.json\", \"w\") as f:\n",
" json.dump({\n",
" \"conda_id\": conda_id,\n",
" \"new_dataset\": new_dataset,\n",
" \"ids_reproduce_fail\": ids_reproduce_fail,\n",
" \"changed_f2p\": changed_f2p,\n",
" \"changed_p2p\": changed_p2p,\n",
" }, fp=f)"
" json.dump(\n",
" {\n",
" \"conda_id\": conda_id,\n",
" \"new_dataset\": new_dataset,\n",
" \"ids_reproduce_fail\": ids_reproduce_fail,\n",
" \"changed_f2p\": changed_f2p,\n",
" \"changed_p2p\": changed_p2p,\n",
" },\n",
" fp=f,\n",
" )"
]
},
{
@@ -325,10 +333,18 @@
" \"\"\"\n",
" max_count, conda_id_best = 0, None\n",
" for x in conda_ids:\n",
" results_path = f\"{folder_out}/results_{x.replace('-', '_').replace('.', '_')}.json\"\n",
" results_path = (\n",
" f\"{folder_out}/results_{x.replace('-', '_').replace('.', '_')}.json\"\n",
" )\n",
" data_adjusted = json.load(open(results_path))\n",
" conda_id = results_path.split(\"/\")[1][len(\"results_\"):-len(\".json\")]\n",
" count = sum([1 for x in data_adjusted['new_dataset'] if x['repo'] == repo and x['version'] == version])\n",
" conda_id = results_path.split(\"/\")[1][len(\"results_\") : -len(\".json\")]\n",
" count = sum(\n",
" [\n",
" 1\n",
" for x in data_adjusted[\"new_dataset\"]\n",
" if x[\"repo\"] == repo and x[\"version\"] == version\n",
" ]\n",
" )\n",
" if count > max_count:\n",
" max_count = count\n",
" conda_id_best = conda_id\n",
@@ -477,11 +493,13 @@
"source": [
"repo_version_to_conda_id = {}\n",
"total = 0\n",
"test_set_repo_version_pairs = set([(x['repo'], x['version']) for x in data_map.values()])\n",
"test_set_repo_version_pairs = set(\n",
" [(x[\"repo\"], x[\"version\"]) for x in data_map.values()]\n",
")\n",
"\n",
"# Loop through all repo/version combos\n",
"for repo, v in MAP_VERSION_TO_INSTALL.items():\n",
" if repo not in set(data['repo']):\n",
" if repo not in set(data[\"repo\"]):\n",
" # Do not proceed for repos that are not in test set\n",
" continue\n",
"\n",
@@ -543,12 +561,18 @@
"for repo, version_map in tqdm(repo_version_to_conda_id.items()):\n",
" for version, conda_id in version_map.items():\n",
" if conda_id != None:\n",
" conda_id = conda_id.replace('.', '_').replace('-', '_')\n",
" adjusted_data = json.load(open(f\"{folder_out}/results_{conda_id}.json\"))['new_dataset']\n",
" adjusted_data = [x for x in adjusted_data if x['repo'] == repo and x['version'] == version]\n",
" conda_id = conda_id.replace(\".\", \"_\").replace(\"-\", \"_\")\n",
" adjusted_data = json.load(open(f\"{folder_out}/results_{conda_id}.json\"))[\n",
" \"new_dataset\"\n",
" ]\n",
" adjusted_data = [\n",
" x\n",
" for x in adjusted_data\n",
" if x[\"repo\"] == repo and x[\"version\"] == version\n",
" ]\n",
" new_dataset_agg.extend(adjusted_data)\n",
"\n",
"new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x['instance_id'])\n",
"new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x[\"instance_id\"])\n",
"print(len(new_dataset_agg))"
]
},
@@ -576,11 +600,15 @@
"outputs": [],
"source": [
"# Set subraction between original test set and new dataset\n",
"failing_inst_ids = sorted([\n",
" x for x in list(\n",
" set([(x['instance_id'], x['version']) for x in data]) -\n",
" set([(x['instance_id'], x['version']) for x in new_dataset_agg])\n",
" )])"
"failing_inst_ids = sorted(\n",
" [\n",
" x\n",
" for x in list(\n",
" set([(x[\"instance_id\"], x[\"version\"]) for x in data])\n",
" - set([(x[\"instance_id\"], x[\"version\"]) for x in new_dataset_agg])\n",
" )\n",
" ]\n",
")"
]
},
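
The set subtraction above recovers the instances present in the original test set but missing from the aggregated new dataset; a toy run with invented ids, not from the commit:

old_pairs = {("astropy__astropy-1", "1.3"), ("django__django-2", "4.0")}
new_pairs = {("django__django-2", "4.0")}
print(sorted(old_pairs - new_pairs))  # [('astropy__astropy-1', '1.3')]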
{
