pangeo-data · willirath · Oct 12, 2018 · Nov 5, 2018 · guillaumeeb · Oct 31, 2018
diff --git a/elastic_monte_carlo_estimate_of_pi.ipynb b/elastic_monte_carlo_estimate_of_pi.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Monte Carlo Estimate of $\\pi$\n",
+    "\n",
+    "<img src=\"http://dask.readthedocs.io/en/latest/_images/dask_horizontal.svg\" \n",
+    "     width=\"50%\" \n",
+    "     align=top\n",
+    "     alt=\"Dask logo\">\n",
+    "<img src=\"https://upload.wikimedia.org/wikipedia/commons/b/ba/Monte-Carlo01.gif\" \n",
+    "     width=\"30%\" \n",
+    "     align=top\n",
+    "     alt=\"PI monte-carlo estimate\">\n",
+    "     \n",
+    "Using [Dask's adaptivity](http://docs.dask.org/en/latest/setup/adaptive.html), we'll show that it is possible to scale the available resources to achieve almost identical wall times irrespective of the actual workload:\n",
+    "\n",
+    "- Estimating $\\pi$ from 16 GB of random data is done in 17 seconds using 3 workers (with 2 cores each).\n",
+    "- Estimating $\\pi$ from 512 GB of random data is done in 19 seconds using 142 workers (with 2 cores each).\n",
+    "- Estimating $\\pi$ from 1024 GB of random data is done in 21 seconds using 273 workers (with 2 cores each)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask_kubernetes import KubeCluster\n",
+    "cluster = KubeCluster(n_workers=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check Adaptive? for help on adapt's kwargs.\n",
+    "from dask.distributed import Adaptive"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.adapt(\n",
+    "    minimum=1,\n",
+    "    maximum=400,\n",
+    "    target_duration=\"20s\",  # possibly more realistic than the default \"5s\"\n",
+    "    wait_count=10,  # 10 seconds before killing an idle worker\n",
+    "    scale_factor=1.2,  # grow more gently than the default doubling\n",
+    ");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table style=\"border: 2px solid white;\">\n",
+       "<tr>\n",
+       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
+       "<h3>Client</h3>\n",
+       "<ul>\n",
+       "  <li><b>Scheduler: </b>tcp://10.23.27.5:37004\n",
+       "  <li><b>Dashboard: </b><a href='/user/willirath/proxy/8787/status' target='_blank'>/user/willirath/proxy/8787/status</a>\n",
+       "</ul>\n",
+       "</td>\n",
+       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
+       "<h3>Cluster</h3>\n",
+       "<ul>\n",
+       "  <li><b>Workers: </b>0</li>\n",
+       "  <li><b>Cores: </b>0</li>\n",
+       "  <li><b>Memory: </b>0 B</li>\n",
+       "</ul>\n",
+       "</td>\n",
+       "</tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "<Client: scheduler='tcp://10.23.27.5:37004' processes=0 cores=0>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dask.distributed import Client\n",
+    "c = Client(cluster)\n",
+    "c"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(Check the dashboard to see the cluster scale up and down!)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import dask.array as da\n",
+    "import numpy as np\n",
+    "from time import time\n",
+    "\n",
+    "def calc_pi_mc(size):\n",
+    "    \"\"\"Estimate pi by Monte Carlo sampling and print the result and timing.\n",
+    "\n",
+    "    Uniform random points are drawn in the unit square; the fraction that\n",
+    "    lies inside the unit quarter circle, multiplied by 4, approximates pi.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    size : int or float\n",
+    "        Total amount of random data to generate, in bytes. Every sample\n",
+    "        point consists of two 8-byte coordinates, i.e. 16 bytes.\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If ``size`` is too small to hold a single sample point.\n",
+    "    \"\"\"\n",
+    "    bytes_per_value = 8\n",
+    "    # Array dimensions must be integers; convert explicitly rather than\n",
+    "    # handing floats such as 62500000.0 to dask.\n",
+    "    num_points = int(size) // (2 * bytes_per_value)\n",
+    "    rows_per_chunk = int(0.25e9) // bytes_per_value\n",
+    "    if num_points < 1:\n",
+    "        raise ValueError(\"size must be at least 16 bytes (one sample point)\")\n",
+    "\n",
+    "    xy = da.random.uniform(0, 1, size=(num_points, 2), chunks=(rows_per_chunk, 2))\n",
+    "\n",
+    "    # A point lies inside the quarter circle when x**2 + y**2 < 1.\n",
+    "    in_circle = ((xy ** 2).sum(axis=-1) < 1)\n",
+    "    pi = 4 * in_circle.mean()\n",
+    "\n",
+    "    # Everything above only builds a lazy graph; time the actual computation.\n",
+    "    start = time()\n",
+    "    pi = pi.compute()\n",
+    "    end = time()\n",
+    "\n",
+    "    # The pod count is sampled right after the computation has finished.\n",
+    "    num_pods = len(cluster.pods())\n",
+    "\n",
+    "    print(\"Size of data:\", xy.nbytes / 1e9, \"GB\")\n",
+    "    print(\"Monte-Carlo pi:\", pi)\n",
+    "    print(\"Numpys pi:\", np.pi)\n",
+    "    print(\"Delta:\", abs(pi - np.pi))\n",
+    "    print(\"Duration: {:.2f} seconds with {} pods\".format(end-start, num_pods))\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of data: 1.0 GB\n",
+      "Monte-Carlo pi: 3.141738048\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 0.0001453944102070004\n",
+      "Duration: 4.68 seconds with 1 pods\n",
+      "\n",
+      "Size of data: 2.0 GB\n",
+      "Monte-Carlo pi: 3.1416384\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 4.574641020704817e-05\n",
+      "Duration: 5.31 seconds with 1 pods\n",
+      "\n",
+      "Size of data: 4.0 GB\n",
+      "Monte-Carlo pi: 3.141615792\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 2.3138410206957616e-05\n",
+      "Duration: 7.91 seconds with 2 pods\n",
+      "\n",
+      "Size of data: 8.0 GB\n",
+      "Monte-Carlo pi: 3.141654136\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 6.148241020698109e-05\n",
+      "Duration: 10.73 seconds with 3 pods\n",
+      "\n",
+      "Size of data: 16.0 GB\n",
+      "Monte-Carlo pi: 3.141506724\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 8.592958979303233e-05\n",
+      "Duration: 17.35 seconds with 3 pods\n",
+      "\n",
+      "Size of data: 32.0 GB\n",
+      "Monte-Carlo pi: 3.141638062\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 4.5408410207059546e-05\n",
+      "Duration: 12.77 seconds with 12 pods\n",
+      "\n",
+      "Size of data: 64.0 GB\n",
+      "Monte-Carlo pi: 3.141572989\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 1.9664589792967035e-05\n",
+      "Duration: 19.20 seconds with 15 pods\n",
+      "\n",
+      "Size of data: 128.0 GB\n",
+      "Monte-Carlo pi: 3.141593464\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 8.104102069417252e-07\n",
+      "Duration: 17.55 seconds with 36 pods\n",
+      "\n",
+      "Size of data: 256.0 GB\n",
+      "Monte-Carlo pi: 3.14161230525\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 1.9651660206676524e-05\n",
+      "Duration: 18.69 seconds with 68 pods\n",
+      "\n",
+      "Size of data: 512.0 GB\n",
+      "Monte-Carlo pi: 3.14158963425\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 3.019339793297604e-06\n",
+      "Duration: 18.71 seconds with 142 pods\n",
+      "\n",
+      "Size of data: 1024.0 GB\n",
+      "Monte-Carlo pi: 3.1415884875\n",
+      "Numpys pi: 3.141592653589793\n",
+      "Delta: 4.166089793145034e-06\n",
+      "Duration: 20.80 seconds with 273 pods\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from time import sleep\n",
+    "\n",
+    "# Double the amount of random data on every pass: 1e9 bytes, 2e9 bytes, ...\n",
+    "for size in (1e9 * 2 ** exponent for exponent in range(11)):\n",
+    "    calc_pi_mc(size)\n",
+    "    sleep(10)  # pause so the cluster gets some time to scale down again"
+   ]
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [default]",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}