diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/dilute_solute_diffusion.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/dilute_solute_diffusion.ipynb index d1b52c173..3ce892c23 100644 --- a/mpcontribs-portal/notebooks/contribs.materialsproject.org/dilute_solute_diffusion.ipynb +++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/dilute_solute_diffusion.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "dated-confirmation", "metadata": {}, "outputs": [], @@ -12,13 +12,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "forty-florida", "metadata": {}, "outputs": [], "source": [ "name = \"dilute_solute_diffusion\"\n", - "client = Client()" + "client = Client(project=name)" ] }, { @@ -30,17 +30,17 @@ "source": [ "import os, json, requests, sys\n", "from pandas import read_excel, isnull, ExcelWriter, Series\n", - "from mpcontribs.io.core.recdict import RecursiveDict\n", - "from mpcontribs.io.core.utils import clean_value, nest_dict\n", - "from mpcontribs.io.archieml.mpfile import MPFile\n", - "from pymatgen.ext.matproj import MPRester\n", + "from mp_api.client import MPRester\n", + "from pathlib import Path\n", "\n", - "z = json.load(open(\"z.json\", \"r\"))\n", - "mpr = MPRester()\n", + "data_dir = Path(\"/Users/patrick/GoogleDriveLBNL/My Drive/MaterialsProject/gitrepos/mpcontribs-data/\")\n", + "zfile = data_dir / name / \"z.json\"\n", + "z = json.load(zfile.open())\n", + "mpr = MPRester(\"bmdNL4cV6Ei0CqhUAhK6JwFSZ6XMH0Gz\")\n", "fpath = f\"{name}.xlsx\"\n", + "download = False\n", "\n", "if download or not os.path.exists(fpath):\n", - "\n", " figshare_id = 1546772\n", " url = \"https://api.figshare.com/v2/articles/{}\".format(figshare_id)\n", " print(\"get figshare article {}\".format(figshare_id))\n", @@ -53,22 +53,46 @@ " for d in figshare[\"files\"]:\n", " if \"xlsx\" in d[\"name\"]:\n", " # Dict of DataFrames is returned, with keys representing sheets\n", - " df_dct = read_excel(d[\"download_url\"], sheet_name=None)\n", + " df_dct = read_excel(d[\"download_url\"], sheet_name=None, engine=\"openpyxl\")\n", " break\n", - " if df_dct is None:\n", - " print(\"no excel sheet found on figshare\")\n", - " return\n", - "\n", - " print(\"save excel to disk\")\n", - " writer = ExcelWriter(fpath)\n", - " for sheet, df in df_dct.items():\n", - " df.to_excel(writer, sheet)\n", - " writer.save()\n", - "\n", + " if df_dct is not None:\n", + " print(\"save excel to disk\")\n", + " with ExcelWriter(fpath) as writer:\n", + " for sheet, df in df_dct.items():\n", + " df.to_excel(writer, sheet)\n", + " else:\n", + " print(\"no excel sheet found on figshare\") \n", "else:\n", - " df_dct = read_excel(fpath, sheet_name=None)\n", + " df_dct = read_excel(fpath, sheet_name=None, engine=\"openpyxl\")\n", "\n", - "print(len(df_dct), \"sheets loaded.\")\n" + "print(len(df_dct), \"sheets loaded.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c919df10-44ee-4d3c-b003-e026ac56bfbf", + "metadata": {}, + "outputs": [], + "source": [ + "# function to search MP via its summary API endpoint\n", + "def search(formula=None, spacegroup_number=None, chemsys=None):\n", + " return mpr.summary.search(\n", + " formula=formula, chemsys=chemsys, spacegroup_number=spacegroup_number,\n", + " fields=[\"material_id\"]#, sort_fields=\"energy_above_hull\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "827b60ac-f149-4962-a7c0-7c3b829f266e", + "metadata": {}, + "outputs": [], + "source": [ + "host_info = df_dct[\"Host Information\"].set_index(\"Host element name\").dropna().drop(\"Unnamed: 0\", axis=1)\n", + "hosts = None\n", + "host_info" ] }, { @@ -78,13 +102,9 @@ "metadata": {}, "outputs": [], "source": [ + "contributions = []\n", "\n", - "print(\"looping hosts ...\")\n", - "host_info = df_dct[\"Host Information\"]\n", - "host_info.set_index(host_info.columns[0], inplace=True)\n", - "host_info.dropna(inplace=True)\n", - "\n", - "for idx, host in enumerate(host_info):\n", + "for idx, host in enumerate(host_info.columns):\n", " if hosts is not None:\n", " if isinstance(hosts, int) and idx + 1 > hosts:\n", " break\n", @@ -92,48 +112,46 @@ " continue\n", "\n", " print(\"get mp-id for {}\".format(host))\n", - " mpid = None\n", - " for doc in mpr.query(\n", - " criteria={\"pretty_formula\": host}, properties={\"task_id\": 1}\n", - " ):\n", - " if \"decomposes_to\" not in doc[\"sbxd\"][0]:\n", - " mpid = doc[\"task_id\"]\n", - " break\n", - " if mpid is None:\n", + " results = search(formula=host)\n", + " if not results:\n", " print(\"mp-id for {} not found\".format(host))\n", " continue\n", "\n", + " mpid = str(results[0].material_id)\n", + " contrib = {\"identifier\": mpid}\n", " print(\"add host info for {}\".format(mpid))\n", - " hdata = host_info[host].to_dict(into=RecursiveDict)\n", + " hdata = host_info[host].to_dict()\n", " for k in list(hdata.keys()):\n", " v = hdata.pop(k)\n", " ks = k.split()\n", " if ks[0] not in hdata:\n", - " hdata[ks[0]] = RecursiveDict()\n", + " hdata[ks[0]] = {}\n", " unit = ks[-1][1:-1] if ks[-1].startswith(\"[\") else \"\"\n", " subkey = \"_\".join(ks[1:-1] if unit else ks[1:]).split(\",\")[0]\n", " if subkey == \"lattice_constant\":\n", " unit = \"Å\"\n", " try:\n", - " hdata[ks[0]][subkey] = clean_value(v, unit.replace(\"angstrom\", \"Å\"))\n", + " unit = unit.replace(\"angstrom\", \"Å\")\n", + " hdata[ks[0]][subkey] = f\"{v} {unit}\" if unit else v\n", " except ValueError:\n", " hdata[ks[0]][subkey] = v\n", - " hdata[\"formula\"] = host\n", - " df = df_dct[\"{}-X\".format(host)]\n", - " rows = list(isnull(df).any(1).nonzero()[0])\n", + " contrib[\"formula\"] = host\n", + " df = df_dct[\"{}-X\".format(host)].drop(\"Unnamed: 0\", axis=1)\n", + " rows = list(isnull(df).any(axis=1).to_numpy().nonzero()[0])\n", " if rows:\n", " cells = df.iloc[rows].dropna(how=\"all\").dropna(axis=1)[df.columns[0]]\n", " note = cells.iloc[0].replace(\"following\", cells.iloc[1])[:-1]\n", " hdata[\"note\"] = note\n", - " df.drop(rows, inplace=True)\n", - " mpfile.add_hierarchical_data(nest_dict(hdata, [\"data\"]), identifier=mpid)\n", + " df = df.drop(rows)\n", "\n", + " contrib[\"data\"] = hdata\n", + " \n", " print(\"add table for D₀/Q data for {}\".format(mpid))\n", - " df.set_index(df[\"Solute element number\"], inplace=True)\n", - " df.drop(\"Solute element number\", axis=1, inplace=True)\n", + " df = df.set_index(df[\"Solute element number\"])\n", + " df = df.drop(\"Solute element number\", axis=1)\n", " df.columns = df.iloc[0]\n", " df.index.name = \"index\"\n", - " df.drop(\"Solute element name\", inplace=True)\n", + " df = df.drop(\"Solute element name\")\n", " df = df.T.reset_index()\n", " if str(host) == \"Fe\":\n", " df_D0_Q = df[\n", @@ -153,15 +171,23 @@ " ]\n", " else:\n", " df_D0_Q = df[[\"Solute element name\", \"Solute D0 [cm^2/s]\", \"Solute Q [eV]\"]]\n", + "\n", " df_D0_Q.columns = [\"Solute\", \"D₀ [cm²/s]\", \"Q [eV]\"]\n", " anums = [z[el] for el in df_D0_Q[\"Solute\"]]\n", " df_D0_Q.insert(0, \"Z\", Series(anums, index=df_D0_Q.index))\n", - " df_D0_Q.sort_values(\"Z\", inplace=True)\n", - " df_D0_Q.reset_index(drop=True, inplace=True)\n", - " mpfile.add_data_table(mpid, df_D0_Q, \"D₀_Q\")\n", + " df_D0_Q = df_D0_Q.sort_values(\"Z\")\n", + " df_D0_Q = df_D0_Q.reset_index(drop=True)\n", + " df_D0_Q.attrs = {\n", + " \"name\": \"D0_Q\",\n", + " \"title\": \"D₀/Q by Solute\",\n", + " \"labels\": {\n", + " \"value\": \"D₀/Q\",\n", + " #\"variable\": \"method\"\n", + " }\n", + " }\n", + " contrib[\"tables\"] = [df_D0_Q]\n", "\n", " if hdata[\"Host\"][\"crystal_structure\"] == \"BCC\":\n", - "\n", " print(\"add table for hop activation barriers for {} (BCC)\".format(mpid))\n", " columns_E = (\n", " [\"Hop activation barrier, E_{} [eV]\".format(i) for i in range(2, 5)]\n", @@ -177,7 +203,11 @@ " + [\"E``{} [eV]\".format(i) for i in [\"₃\", \"₄\"]]\n", " + [\"E{} [eV]\".format(i) for i in [\"₅\", \"₆\"]]\n", " )\n", - " mpfile.add_data_table(mpid, df_E, \"hop_activation_barriers\")\n", + " df_E.attrs = {\n", + " \"name\": \"hop_activation_barriers\",\n", + " \"title\": \"Hop Activation Barriers\",\n", + " }\n", + " contrib[\"tables\"].append(df_E)\n", "\n", " print(\"add table for hop attempt frequencies for {} (BCC)\".format(mpid))\n", " columns_v = (\n", @@ -194,7 +224,11 @@ " + [\"v``{} [THz]\".format(i) for i in [\"₃\", \"₄\"]]\n", " + [\"v{} [THz]\".format(i) for i in [\"₅\", \"₆\"]]\n", " )\n", - " mpfile.add_data_table(mpid, df_v, \"hop_attempt_frequencies\")\n", + " df_v.attrs = {\n", + " \"name\": \"hop_attempt_frequencies\",\n", + " \"title\": \"Hop Attempt Frequencies\",\n", + " }\n", + " contrib[\"tables\"].append(df_v)\n", "\n", " elif hdata[\"Host\"][\"crystal_structure\"] == \"FCC\":\n", "\n", @@ -206,7 +240,11 @@ " df_E.columns = [\"Solute\"] + [\n", " \"E{} [eV]\".format(i) for i in [\"₀\", \"₁\", \"₂\", \"₃\", \"₄\"]\n", " ]\n", - " mpfile.add_data_table(mpid, df_E, \"hop_activation_barriers\")\n", + " df_E.attrs = {\n", + " \"name\": \"hop_activation_barriers\",\n", + " \"title\": \"Hop Activation Barriers\",\n", + " }\n", + " contrib[\"tables\"].append(df_E)\n", "\n", " print(\"add table for hop attempt frequencies for {} (FCC)\".format(mpid))\n", " columns_v = [\n", @@ -216,7 +254,11 @@ " df_v.columns = [\"Solute\"] + [\n", " \"v{} [THz]\".format(i) for i in [\"₀\", \"₁\", \"₂\", \"₃\", \"₄\"]\n", " ]\n", - " mpfile.add_data_table(mpid, df_v, \"hop_attempt_frequencies\")\n", + " df_v.attrs = {\n", + " \"name\": \"hop_attempt_frequencies\",\n", + " \"title\": \"Hop Attempt Frequencies\",\n", + " }\n", + " contrib[\"tables\"].append(df_v)\n", "\n", " elif hdata[\"Host\"][\"crystal_structure\"] == \"HCP\":\n", "\n", @@ -242,7 +284,11 @@ " \"Eꪱ [eV]\",\n", " \"E`ꪱ [eV]\",\n", " ]\n", - " mpfile.add_data_table(mpid, df_E, \"hop_activation_barriers\")\n", + " df_E.attrs = {\n", + " \"name\": \"hop_activation_barriers\",\n", + " \"title\": \"Hop Activation Barriers\",\n", + " }\n", + " contrib[\"tables\"].append(df_E)\n", "\n", " print(\"add table for hop attempt frequencies for {} (HCP)\".format(mpid))\n", " columns_v = [\"Hop attempt frequency, v_a [THz]\"] + [\n", @@ -250,56 +296,96 @@ " ]\n", " df_v = df[[\"Solute element name\"] + columns_v]\n", " df_v.columns = [\"Solute\"] + [\"vₐ [THz]\"] + [\"vₓ [THz]\"]\n", - " mpfile.add_data_table(mpid, df_v, \"hop_attempt_frequencies\")\n", + " df_v.attrs = {\n", + " \"name\": \"hop_attempt_frequencies\",\n", + " \"title\": \"Hop Attempt Frequencies\",\n", + " }\n", + " contrib[\"tables\"].append(df_v)\n", "\n", - "print(\"DONE\")\n", + " contributions.append(contrib)\n", "\n", + "len(contributions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d978f45a-f160-4bd4-89b9-9d3eef7e6449", + "metadata": {}, + "outputs": [], + "source": [ + "from flatten_dict import flatten, unflatten\n", "\n", - "mpfile = MPFile()\n", - "mpfile.max_contribs = 15\n", - "run(mpfile)\n", - "print(mpfile)\n", - "\n", - "filename = f\"{project}.txt\"\n", - "mpfile.write_file(filename=filename)\n", - "mpfile = MPFile.from_file(filename)\n", - "print(len(mpfile.ids))\n", - "\n", - "table_names = [\"D₀_Q\", \"hop_activation_barriers\", \"hop_attempt_frequencies\"]\n", - "\n", - "for idx, (identifier, content) in enumerate(mpfile.document.items()):\n", - " # doc = {'identifier': identifier, 'project': project, 'content': {}}\n", - " # doc['content']['data'] = content['data']\n", - " # doc['collaborators'] = [{'name': 'Patrick Huck', 'email': 'phuck@lbl.gov'}]\n", - " # r = db.contributions.insert_one(doc)\n", - " # cid = r.inserted_id\n", - " # print(idx, ':', cid)\n", + "columns_map = {\n", + " \"Host.crystal_structure\": {\"name\": \"host.symmetry\", \"description\": \"host crystal structure\"},\n", + " \"Host.melting_temperature\": {\"name\": \"host.temperature|melt\", \"unit\": \"K\", \"description\": \"host melting temperature\"},\n", + " \"Host.vacancy_formation_energy\": {\"name\": \"host.energy|formation\", \"unit\": \"eV\", \"description\": \"host vacancy formation energy\"},\n", + " \"Host.lattice_constant\": {\"name\": \"host.lattice\", \"unit\": \"Å\", \"description\": \"host lattice constant\"},\n", + " \"Host.self-diffusion_correction_shift\": {\"name\": \"host.shift\", \"unit\": \"eV\", \"description\": \"host self diffusion correction shift\"},\n", + " \"note\": {\"name\": \"excluded\", \"description\": \"solutes were calculated but either did not converge or relaxed into the neighboring vacancy, making it ineligible for the analytical multi-frequency formalism\"},\n", + "}\n", + "columns = {col[\"name\"]: col.get(\"unit\") for col in columns_map.values()}\n", + "clean_contributions = []\n", "\n", - " # tids = []\n", - " # for name in table_names:\n", - " # table = mpfile.document[identifier][name]\n", - " # table.pop('@module')\n", - " # table.pop('@class')\n", - " # table['identifier'] = identifier\n", - " # table['project'] = project\n", - " # table['name'] = name\n", - " # table['cid'] = cid\n", - " # r = db.tables.insert_one(table)\n", - " # tids.append(r.inserted_id)\n", + "for contrib in contributions:\n", + " clean_contrib = {\"identifier\": contrib[\"identifier\"], \"formula\": contrib[\"formula\"], \"tables\": contrib[\"tables\"]}\n", + " data = {}\n", + " for k, v in flatten(contrib[\"data\"], reducer=\"dot\").items():\n", + " data[columns_map[k][\"name\"]] = v.replace(\"The\", \"\").replace(columns_map[k][\"description\"], \"\").replace(\n", + " \"solutes were calculated but either did not converge or relaxed into the neighboring vacancy, making the solute ineligible for the analytical multi-frequency formalism\", \"\"\n", + " ).strip()\n", "\n", - " # print(tids)\n", - " # query = {'identifier': identifier, 'project': project}\n", - " # r = db.contributions.update_one(query, {'$set': {'content.tables': tids}})\n", + " clean_contrib[\"data\"] = unflatten(data, splitter=\"dot\")\n", + " clean_contributions.append(clean_contrib)\n", "\n", - " name = table_names[0]\n", - " query = {\"identifier\": identifier, \"project\": project, \"name\": name}\n", - " print(query)\n", - " table = mpfile.document[identifier][name]\n", - " r = db.tables.update_one(\n", - " query, {\"$set\": {\"columns\": table[\"columns\"], \"data\": table[\"data\"]}}\n", - " )\n", - " print(r.matched_count, r.modified_count)\n" + "len(clean_contributions)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "017ad9b3-85ea-4571-9d2f-ada20e66e6b1", + "metadata": {}, + "outputs": [], + "source": [ + "# description = client.get_project(fields=[\"description\"]).get(\"description\")\n", + "# description += \" Diffusion values for Fe-X are given for the α-BCC phase, both paramagnetic and ferromagnetic. The paramagnetic D₀ and Q are given here, the full diffusivity can be obtained by: D|BCC(T) = D₀|para * exp[-Q|para*(1+αs²)/(kT)] where α=0.156 and s is the temperature dependent spontaneous magnetization of Fe relative to T=0K.\"\n", + "# description += \" NSF award No. 1148011, version 10.\"\n", + "# client.update_project({\"description\": description})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2e726cd-e11d-409f-acaf-d82cff3e9c52", + "metadata": {}, + "outputs": [], + "source": [ + "# other = unflatten({col[\"name\"]: col[\"description\"] for col in columns_map.values()}, splitter=\"dot\")\n", + "# #client.update_project({\"other\": {\"funding\": None, \"version\": None, \"abbreviations\": None, \"FeX\": None}})\n", + "# client.update_project({\"other\": other})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a55fde6b-1566-4d2a-9be1-ea4fc0f9e7f4", + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_contributions()\n", + "client.init_columns(columns)\n", + "client.submit_contributions(clean_contributions)\n", + "client.init_columns(columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bce3aa4-aabe-46d3-8ae4-2e1119077812", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {