From 0b3b64e4e7e4ce915ab2d417c0196dade2133583 Mon Sep 17 00:00:00 2001
From: raynardj <raynard@rasenn.com>
Date: Wed, 1 Sep 2021 11:15:29 +0800
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20preset=20tag?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                   |  20 +++
 docs/loader/README.md       |   4 +-
 langhuan/options.py         |   2 +-
 langhuan/tasks.py           |  18 ++-
 langhuan/templates/ner.html |  47 ++++---
 langhuan/utility.py         |  10 ++
 settings.ini                |   2 +-
 tests/loader.ipynb          |  92 +++++++++++++
 tests/ner_test.ipynb        | 248 ++++++++++++++++++++++++++++++++++--
 9 files changed, 408 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 77afe6f..a05fcb4 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,26 @@ app = ClassifyTask.from_df(
     cross_verify_num=2,)
 ```
 
+#### Preset the tagging
+You can set a column in the dataframe, e.g. one called ```guessed_tags```, to preset the tagging result.
+
+Each cell should hold a tagging result in the following format, e.g.
+```json
+{"tags":[
+    {"text": "Genomicare Bio Tech", "offset":32, "label":"company"},
+    {"text": "East China University of Politic Science & Law", "offset":96, "label":"company"}
+    ]}
+```
+
+Then you can run the app with the preset tag column:
+```python
+app = NERTask.from_df(
+    df, text_col="description",
+    options=["institution", "company", "name"],
+    preset_tag_col="guessed_tags")
+app.run("0.0.0.0", port=5000)
+```
+
 #### Order strategy
 The order of which text got tagged first is according to order_strategy.
 
diff --git a/docs/loader/README.md b/docs/loader/README.md
index 7f03256..ce4a357 100644
--- a/docs/loader/README.md
+++ b/docs/loader/README.md
@@ -54,7 +54,9 @@ len(train_ds), len(val_ds)
 
 
 ```python
-x, y = data_ds.one_batch(5)
+batch = data_ds.one_batch(5)
+x = batch['input_ids']
+y = batch['targets']
 ```
 
 
diff --git a/langhuan/options.py b/langhuan/options.py
index 1ec2be1..f28a521 100644
--- a/langhuan/options.py
+++ b/langhuan/options.py
@@ -33,7 +33,7 @@ def __init__(
             dict(options=self.option_vals, idx=self.df_idx))
 
         self.option_col = self.df["options"]
-        self.df.set_index("idx")
+        self.df = self.df.set_index("idx")
 
     def __len__(self): return len(self.df)
 
diff --git a/langhuan/tasks.py b/langhuan/tasks.py
index 4193f22..652feeb 100644
--- a/langhuan/tasks.py
+++ b/langhuan/tasks.py
@@ -27,6 +27,7 @@ def from_df(
         task_name: str = None,
         options: List[str] = None,
         load_history: bool = False,
+        preset_tag_col: str = None,
         save_frequency: int = 42,
         order_strategy: Union[str, Callable] = "forward_march",
         order_by_column: str = None,
@@ -42,6 +43,8 @@ def from_df(
             you don't even have to decide this now, you can input
             None and configure it on /admin page later
         - load_history: bool = False, load the saved history if True
+        - preset_tag_col: str = None, a column name
+            that contains preset tags
         - task_name: str, name of your task, if not provided
         - order_strategy: Union[str, Callable] = "forward_march",
             a function defining how progress move one by one. As a
@@ -90,7 +93,7 @@ def from_df(
         app.config['TEMPLATES_AUTO_RELOAD'] = True
         HyperFlask(app)
 
-        app.register(df, text_col, Options(df, options))
+        app.register(df, text_col, preset_tag_col, Options(df, options))
         app.create_progress(
             order_strategy,
             order_by_column,
@@ -219,9 +222,16 @@ def raw_data():
 
             rt = dict(idx=idx, index=index, text=text, options=list(options))
 
+            # Scenario 1, when we have tagged data in progress
             if user_id in self.progress.depth[index].keys():
                 rt.update({"record": self.progress.depth[index][user_id]})
 
+            # Scenario 2, when we have preset tag defined in dataframe
+            elif self.preset_tag_col is not None:
+                preset_tag = self.df.loc[idx, self.preset_tag_col]
+                if preset_tag is not None:
+                    rt.update({"record": preset_tag})
+
             return jsonify(rt)
 
         @self.route("/tagging", methods=["POST"])
@@ -357,10 +367,16 @@ def register(
         self,
         df: pd.DataFrame,
         text_col: Union[List[str], str],
+        preset_tag_col: str,
         options: List[str],
     ) -> None:
+        """
+        Register properties to the class
+        """
         self.df = df
         self.text_col = text_col
+        self.preset_tag_col = preset_tag_col
+
         self.options = options
         self.register_functions()
 
diff --git a/langhuan/templates/ner.html b/langhuan/templates/ner.html
index cf1d4fd..ec87786 100644
--- a/langhuan/templates/ner.html
+++ b/langhuan/templates/ner.html
@@ -4,7 +4,8 @@
 {{super()}}
 <h4 class="m-3 bg-warning p-2 text-white text-center rounded" title="Named Entity Recognition">
     <i class="fas fa-dove"></i>
-    NER task</h4>
+    NER task
+</h4>
 {%endblock%}
 
 {%block controls%}
@@ -54,7 +55,7 @@ <h5 class="mt-2 mb-2">
     const create_option_list = (options) => {
         var option_list = [];
         var option_to_data = {}
-        var color_ct=0;
+        var color_ct = 0;
         for (var i in options) {
             if (color_ct >= flavors.length) {
                 color_ct = 0
@@ -80,18 +81,24 @@ <h5 class="mt-2 mb-2">
             this.node.title = d.label;
             var node = this.node
 
+            /*
+            Click the selected to remove tag
+            */
             this.node.addEventListener("click", () => {
                 node.after(node.innerText)
                 node.remove()
             })
         }
 
-        get_node(){
+        get_node() {
             return this.node
         }
     }
 
     const calc_label = () => {
+        /*
+        From the DOM, get the tagged data
+        */
         var nodes = document.querySelector("#raw").childNodes
 
         var tags = [];
@@ -119,7 +126,6 @@ <h5 class="mt-2 mb-2">
         tagging(data)
     }
 
-
     document.querySelector("#next").addEventListener("click", next_btn)
     document.querySelector("#skip").addEventListener("click", skipping)
 
@@ -155,24 +161,27 @@ <h5 class="mt-2 mb-2">
     const get_history = async () => {
         var user_id = get_user_id()
         fetch(`/personal_history?user_id=${user_id}`)
-        .then(res=>res.json())
-        .then(data=>{
-            d3.select("#histories")
-            .selectAll(".history_entry")
-            .data(data)
-            .enter()
-            .append("div")
-            .attr("class",
-            "history_entry m-3 pt-1 pb-1 pl-3 pr-3 border border-primary rounded-pill border-3 border-top-0 border-bottom-0")
-            .text(d=>{return `${d.time.substring(9,17)} (${d.tags}x🏷)`})
-            .on("click", (e, data) => {
-                window.location = `/?index=${data.index}`
+            .then(res => res.json())
+            .then(data => {
+                d3.select("#histories")
+                    .selectAll(".history_entry")
+                    .data(data)
+                    .enter()
+                    .append("div")
+                    .attr("class",
+                        "history_entry m-3 pt-1 pb-1 pl-3 pr-3 border border-primary rounded-pill border-3 border-top-0 border-bottom-0")
+                    .text(d => { return `${d.time.substring(9, 17)} (${d.tags}x🏷)` })
+                    .on("click", (e, data) => {
+                        window.location = `/?index=${data.index}`
+                    })
             })
-        })
-        .catch(console.error)
+            .catch(console.error)
     }
 
     const visualize_options = (options) => {
+        /*
+        Visualize the options as buttons
+        */
         d3.select("#label_pool").selectAll("button")
             .data(options)
             .enter().append("button")
@@ -214,7 +223,7 @@ <h5 class="mt-2 mb-2">
         subs.push({ text: text.substring(ct, text.length) })
         // console.log(subs)
 
-        raw.innerHTML=""
+        raw.innerHTML = ""
 
         for (var i in subs) {
             var sub = subs[i];
diff --git a/langhuan/utility.py b/langhuan/utility.py
index 8a928fd..e8bf4a1 100644
--- a/langhuan/utility.py
+++ b/langhuan/utility.py
@@ -3,6 +3,7 @@
 from typing import Union
 import json
 from pathlib import Path
+import regex
 
 
 def now_str(): return datetime.now().strftime("%y%m%d_%H%M%S")
@@ -38,3 +39,12 @@ def arg_by_key(key: str) -> Union[str, int, float]:
 
 def get_root() -> Path:
     return Path(__file__).parent.absolute()
+
+
+def findall_word_position(text: str, word: str) -> list:
+    """
+    Return start offsets (into text) of each case-insensitive occurrence
+    of the literal string word; regex metacharacters are escaped.
+    """
+    found = regex.finditer(regex.escape(word), text, flags=regex.IGNORECASE)
+    return [m.start() for m in found]
diff --git a/settings.ini b/settings.ini
index d05de87..06a77cb 100644
--- a/settings.ini
+++ b/settings.ini
@@ -6,7 +6,7 @@ keywords = python pandas label data science
 author = xiaochen(ray) zhang
 author_email = b2ray2c@gmail.com
 branch = main
-version = 0.0.18
+version = 0.1.0
 min_python = 3.6
 audience = Developers
 language = English
diff --git a/tests/loader.ipynb b/tests/loader.ipynb
index 9dc5958..259f0f8 100644
--- a/tests/loader.ipynb
+++ b/tests/loader.ipynb
@@ -106,6 +106,98 @@
     "len(train_ds), len(val_ds)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'index': 0,\n",
+       "  'now': '21-02-01_15:39:20',\n",
+       "  'pandas': 0,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school',\n",
+       "    'offset': 122,\n",
+       "    'text': 'University of Maryland'},\n",
+       "   {'label': 'company', 'offset': 346, 'text': 'Bricklin'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 1,\n",
+       "  'now': '21-02-01_15:38:29',\n",
+       "  'pandas': 1,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school',\n",
+       "    'offset': 213,\n",
+       "    'text': 'University of Washington'},\n",
+       "   {'label': 'company', 'offset': 340, 'text': 'SI'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 2,\n",
+       "  'now': '21-02-01_15:39:03',\n",
+       "  'pandas': 2,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school', 'offset': 89, 'text': 'Purdue University'},\n",
+       "   {'label': 'company', 'offset': 107, 'text': 'Engineering Computer Network'},\n",
+       "   {'label': 'company',\n",
+       "    'offset': 1795,\n",
+       "    'text': 'Purdue Electrical Engineering'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 3,\n",
+       "  'now': '21-02-01_15:39:11',\n",
+       "  'pandas': 3,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'company',\n",
+       "    'offset': 73,\n",
+       "    'text': 'Harris Computer Systems Division'},\n",
+       "   {'label': 'company', 'offset': 645, 'text': 'Harris Corporation'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 4,\n",
+       "  'now': '21-02-01_15:40:03',\n",
+       "  'pandas': 4,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school',\n",
+       "    'offset': 102,\n",
+       "    'text': 'Smithsonian Astrophysical Observatory'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 5,\n",
+       "  'now': '21-02-01_15:40:38',\n",
+       "  'pandas': 5,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 6,\n",
+       "  'now': '21-02-01_15:40:55',\n",
+       "  'pandas': 6,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school',\n",
+       "    'offset': 151,\n",
+       "    'text': 'University of Chicago'}],\n",
+       "  'user_id': '4de71c07fa'},\n",
+       " {'index': 7,\n",
+       "  'now': '21-02-01_15:43:01',\n",
+       "  'pandas': 7,\n",
+       "  'remote_addr': '127.0.0.1',\n",
+       "  'tags': [{'label': 'school',\n",
+       "    'offset': 75,\n",
+       "    'text': 'New Mexico State University'},\n",
+       "   {'label': 'company', 'offset': 1936, 'text': 'IBM'},\n",
+       "   {'label': 'company', 'offset': 2151, 'text': 'IBM'},\n",
+       "   {'label': 'company', 'offset': 2238, 'text': 'Quadra'},\n",
+       "   {'label': 'company', 'offset': 2266, 'text': 'Apple'},\n",
+       "   {'label': 'company', 'offset': 2753, 'text': 'Quadra'},\n",
+       "   {'label': 'company', 'offset': 2790, 'text': 'Digital Review'}],\n",
+       "  'user_id': '4de71c07fa'}]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_ds.labels"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tests/ner_test.ipynb b/tests/ner_test.ipynb
index ba9b68d..0184a97 100644
--- a/tests/ner_test.ipynb
+++ b/tests/ner_test.ipynb
@@ -7,8 +7,11 @@
    "outputs": [],
    "source": [
     "from langhuan import NERTask\n",
+    "from langhuan.utility import findall_word_position\n",
     "from forgebox.imports import *\n",
-    "from sklearn.datasets import fetch_20newsgroups"
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from uuid import uuid4\n",
+    "import regex"
    ]
   },
   {
@@ -91,19 +94,238 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "app = NERTask.from_df(df, text_col=\"text\", options=[\"institution\", \"company\", \"name\"])"
+    "def word_finder(kw, label):\n",
+    "    def get_tags(text):\n",
+    "        tags = []\n",
+    "        for offset in findall_word_position(text, kw):\n",
+    "            tags.append(dict(\n",
+    "                text=text[offset:offset+len(kw)],\n",
+    "                label=label,\n",
+    "                offset=offset\n",
+    "            ))\n",
+    "        return dict(tags=tags, preset=\"preset\")\n",
+    "    return get_tags"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set extra unique id"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"special_id\"] = list(str(uuid4()) for uid in range(len(df)))\n",
+    "df = df.set_index(\"special_id\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preset some tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"guess\"] = df.text.apply(word_finder(\"university\",\"institution\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "      <th>guess</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>special_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>f7ec2c6f-046a-469f-a18a-d4a0540af07b</td>\n",
+       "      <td>From: lerxst@wam.umd.edu (where's my thing)\\nS...</td>\n",
+       "      <td>{'tags': [{'text': 'University', 'label': 'ins...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4c2eecd7-39f4-4d7c-b586-a7c7803a74c0</td>\n",
+       "      <td>From: guykuo@carson.u.washington.edu (Guy Kuo)...</td>\n",
+       "      <td>{'tags': [{'text': 'University', 'label': 'ins...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>a96d71fa-47d4-474a-b835-6078761865d7</td>\n",
+       "      <td>From: twillis@ec.ecn.purdue.edu (Thomas E Will...</td>\n",
+       "      <td>{'tags': [{'text': 'University', 'label': 'ins...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>acd720f6-d70b-457b-ac1c-fb175f91bd9e</td>\n",
+       "      <td>From: jgreen@amber (Joe Green)\\nSubject: Re: W...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>c34a56dc-a053-4cfe-8606-498087d49e79</td>\n",
+       "      <td>From: jcm@head-cfa.harvard.edu (Jonathan McDow...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>47d62545-3a21-402c-83a6-7a590981d06c</td>\n",
+       "      <td>From: jim.zisfein@factory.com (Jim Zisfein) \\n...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>829cde95-4d2d-4b85-ad15-0d062832b4dc</td>\n",
+       "      <td>From: ebodin@pearl.tufts.edu\\nSubject: Screen ...</td>\n",
+       "      <td>{'tags': [{'text': 'University', 'label': 'ins...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3b96eff1-8ba6-4d98-9c11-49e661230a20</td>\n",
+       "      <td>From: westes@netcom.com (Will Estes)\\nSubject:...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>952f8e26-e817-4bb2-ab84-243e43bcf094</td>\n",
+       "      <td>From: steve@hcrlgw (Steven Collins)\\nSubject: ...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>39ce49c8-ac3e-4310-9c8f-9f29f2a18f86</td>\n",
+       "      <td>From: gunning@cco.caltech.edu (Kevin J. Gunnin...</td>\n",
+       "      <td>{'tags': [], 'preset': 'preset'}</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>11314 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                   text  \\\n",
+       "special_id                                                                                \n",
+       "f7ec2c6f-046a-469f-a18a-d4a0540af07b  From: lerxst@wam.umd.edu (where's my thing)\\nS...   \n",
+       "4c2eecd7-39f4-4d7c-b586-a7c7803a74c0  From: guykuo@carson.u.washington.edu (Guy Kuo)...   \n",
+       "a96d71fa-47d4-474a-b835-6078761865d7  From: twillis@ec.ecn.purdue.edu (Thomas E Will...   \n",
+       "acd720f6-d70b-457b-ac1c-fb175f91bd9e  From: jgreen@amber (Joe Green)\\nSubject: Re: W...   \n",
+       "c34a56dc-a053-4cfe-8606-498087d49e79  From: jcm@head-cfa.harvard.edu (Jonathan McDow...   \n",
+       "...                                                                                 ...   \n",
+       "47d62545-3a21-402c-83a6-7a590981d06c  From: jim.zisfein@factory.com (Jim Zisfein) \\n...   \n",
+       "829cde95-4d2d-4b85-ad15-0d062832b4dc  From: ebodin@pearl.tufts.edu\\nSubject: Screen ...   \n",
+       "3b96eff1-8ba6-4d98-9c11-49e661230a20  From: westes@netcom.com (Will Estes)\\nSubject:...   \n",
+       "952f8e26-e817-4bb2-ab84-243e43bcf094  From: steve@hcrlgw (Steven Collins)\\nSubject: ...   \n",
+       "39ce49c8-ac3e-4310-9c8f-9f29f2a18f86  From: gunning@cco.caltech.edu (Kevin J. Gunnin...   \n",
+       "\n",
+       "                                                                                  guess  \n",
+       "special_id                                                                               \n",
+       "f7ec2c6f-046a-469f-a18a-d4a0540af07b  {'tags': [{'text': 'University', 'label': 'ins...  \n",
+       "4c2eecd7-39f4-4d7c-b586-a7c7803a74c0  {'tags': [{'text': 'University', 'label': 'ins...  \n",
+       "a96d71fa-47d4-474a-b835-6078761865d7  {'tags': [{'text': 'University', 'label': 'ins...  \n",
+       "acd720f6-d70b-457b-ac1c-fb175f91bd9e                   {'tags': [], 'preset': 'preset'}  \n",
+       "c34a56dc-a053-4cfe-8606-498087d49e79                   {'tags': [], 'preset': 'preset'}  \n",
+       "...                                                                                 ...  \n",
+       "47d62545-3a21-402c-83a6-7a590981d06c                   {'tags': [], 'preset': 'preset'}  \n",
+       "829cde95-4d2d-4b85-ad15-0d062832b4dc  {'tags': [{'text': 'University', 'label': 'ins...  \n",
+       "3b96eff1-8ba6-4d98-9c11-49e661230a20                   {'tags': [], 'preset': 'preset'}  \n",
+       "952f8e26-e817-4bb2-ab84-243e43bcf094                   {'tags': [], 'preset': 'preset'}  \n",
+       "39ce49c8-ac3e-4310-9c8f-9f29f2a18f86                   {'tags': [], 'preset': 'preset'}  \n",
+       "\n",
+       "[11314 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run service"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Service with preset tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "app = NERTask.from_df(df, text_col=\"text\", options=[\"institution\", \"company\", \"name\"], preset_tag_col=\"guess\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Service without preset tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "app = NERTask.from_df(df, text_col=\"text\", options=[\"institution\", \"company\", \"name\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " * Serving Flask app \"task_NER_210120_155940\" (lazy loading)\n",
+      " * Serving Flask app \"task_NER_210901_105527\" (lazy loading)\n",
       " * Environment: production\n",
       "   WARNING: This is a development server. Do not use it in a production deployment.\n",
       "   Use a production WSGI server instead.\n",
@@ -115,15 +337,17 @@
      "output_type": "stream",
      "text": [
       " * Running on http://0.0.0.0:5001/ (Press CTRL+C to quit)\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET / HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /static/js/jquery.min.js HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /static/js/bootstrap.min.js HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /static/js/popper.min.js HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /static/js/d3.js HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /static/css/bootstrap.css HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mPOST /data HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[37mGET /personal_history?user_id=2fe81aa184 HTTP/1.1\u001b[0m\" 200 -\n",
-      "127.0.0.1 - - [20/Jan/2021 15:59:55] \"\u001b[33mGET /favicon.ico HTTP/1.1\u001b[0m\" 404 -\n"
+      "127.0.0.1 - - [01/Sep/2021 10:55:38] \"\u001b[37mGET /admin HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:38] \"\u001b[37mPOST /get_options HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:38] \"\u001b[37mPOST /stats HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:42] \"\u001b[37mGET /?index=3 HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:43] \"\u001b[37mPOST /data HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:43] \"\u001b[37mGET /personal_history?user_id=2fe81aa184 HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:56] \"\u001b[37mPOST /tagging HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:56] \"\u001b[37mGET / HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:56] \"\u001b[37mPOST /data HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:55:56] \"\u001b[37mGET /personal_history?user_id=2fe81aa184 HTTP/1.1\u001b[0m\" 200 -\n",
+      "127.0.0.1 - - [01/Sep/2021 10:56:01] \"\u001b[37mPOST /data HTTP/1.1\u001b[0m\" 200 -\n"
      ]
     }
    ],