diff --git a/docs/pl_training.html b/docs/pl_training.html
index 95bd7f4..5b8198c 100644
--- a/docs/pl_training.html
+++ b/docs/pl_training.html
@@ -33,6 +33,13 @@
+
+{% endraw %}
+
+{% raw %}
+
 {% endraw %}
@@ -71,7 +78,7 @@

 Load model and tokenizer

-ner_model_from[source]
-ner_model_from(name:str, dataset:IOBES)
+ner_model_from[source]
+ner_model_from(name:str, dataset:IOBES)

 name: from_pretrained(name)

@@ -96,7 +103,7 @@

 ner_model_from

-ner_tokenizer_from[source]
-ner_tokenizer_from(name:str)
+ner_tokenizer_from[source]
+ner_tokenizer_from(name:str)
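
The two loaders above pair a token-classification model with its tokenizer. A minimal sketch, assuming `ds` is an already-built IOBES dataset and using a stand-in checkpoint tag:

    from forgebox.hf.train import ner_model_from, ner_tokenizer_from

    # "bert-base-cased" is a placeholder checkpoint tag; `ds` is assumed to be
    # an existing IOBES dataset (it supplies the entity label set).
    model = ner_model_from("bert-base-cased", ds)
    tokenizer = ner_tokenizer_from("bert-base-cased")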

@@ -133,7 +140,7 @@

 Lightning data module

-class NERDataModule[source]
-NERDataModule(*args:Any, **kwargs:Any) :: LightningDataModule
+class NERDataModule[source]
+NERDataModule(*args:Any, **kwargs:Any) :: LightningDataModule

 A DataModule standardizes the training, val, test splits, data preparation and transforms. The main advantage is consistent data splits, data preparation and transforms across models.

@@ -202,7 +209,7 @@

 class NERDataModule

-class NERModule[source]
-NERModule(model) :: LightningModule
+class NERModule[source]
+NERModule(model) :: LightningModule

 PyTorch Lightning module for training an NER model
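
A hedged sketch of where the two classes sit in a Lightning training run; NERDataModule's constructor arguments are not pinned down by the `(*args, **kwargs)` signature above, so those shown here are assumptions:

    import pytorch_lightning as pl

    # NERDataModule's real constructor arguments are not documented here,
    # so the ones below are assumptions for illustration only.
    dm = NERDataModule(ds, tokenizer, batch_size=32)
    module = NERModule(model)              # signature documented above
    trainer = pl.Trainer(max_epochs=3)     # standard Lightning training loop
    trainer.fit(module, datamodule=dm)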

@@ -240,7 +247,7 @@

 Enhance pipeline

-clean_ner_output[source]
-clean_ner_output(outputs)
+clean_ner_output[source]
+clean_ner_output(outputs)

 Cleaning output for NER task

@@ -258,47 +265,6 @@

 clean_ner_output

-predict_ner_table[source]
-predict_ner_table(pipeline_kw)
-
-{% endraw %}
-
-{% raw %}
-
-refined_ner_pipeline[source]
-refined_ner_pipeline(model, tokenizer, **pipeline_kw)
-
 {% endraw %}
@@ -313,8 +279,11 @@

 refined_ner_pipeline

-refined_ner_from_pretrained[source]
-refined_ner_from_pretrained(pretrained, **pipeline_kw)
+class NERInference[source]
+NERInference(model, tokenizer, name=None)
+
+NER Inference pipeline
+    ner = NERInference.from_pretrained('xxxx/xxxx')
+    ner.predict(['text1','text2'])
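
As documented above, the three removed pipeline helpers are replaced by a single NERInference class. A minimal usage sketch — the checkpoint tag is a placeholder, and the selected columns are the fields the prediction DataFrame carries:

    from torch import device
    from forgebox.hf.train import NERInference

    # any token-classification checkpoint works here; this tag is a placeholder
    ner = NERInference.from_pretrained("your-org/your-ner-model")
    df = ner.predict(
        ["First paragraph ...", "Second paragraph ..."],
        dev=device("cpu"),   # pass device("cuda") to run the forward pass on GPU
        batch_size=32,
    )
    # one row per decoded entity span; text_id maps each row back to its input
    df[["text_id", "word", "entity", "start", "end"]]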

diff --git a/forgebox/__init__.py b/forgebox/__init__.py
index 58ce5cd..ac15521 100644
--- a/forgebox/__init__.py
+++ b/forgebox/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.11"
+__version__ = "0.4.17"
diff --git a/forgebox/_nbdev.py b/forgebox/_nbdev.py
index 8041416..7eb9377 100644
--- a/forgebox/_nbdev.py
+++ b/forgebox/_nbdev.py
@@ -114,9 +114,7 @@
          "NERDataModule": "72_pl_training.ipynb",
          "NERModule": "72_pl_training.ipynb",
          "clean_ner_output": "72_pl_training.ipynb",
-         "predict_ner_table": "72_pl_training.ipynb",
-         "refined_ner_pipeline": "72_pl_training.ipynb",
-         "refined_ner_from_pretrained": "72_pl_training.ipynb",
+         "NERInference": "72_pl_training.ipynb",
          "CudaDevice": "CUDA_GPU_Management.ipynb",
          "CudaHandler": "CUDA_GPU_Management.ipynb",
          "MLMVisualizer": "bert_visualize.ipynb",
diff --git a/forgebox/hf/train.py b/forgebox/hf/train.py
index d307835..e797c5b 100644
--- a/forgebox/hf/train.py
+++ b/forgebox/hf/train.py
@@ -1,18 +1,29 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).

-__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
-           'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']
+__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output', 'NERInference']

 # Cell
 from .data import IOBES
 from ..imports import *
+from ..loop import chunkify
 import pytorch_lightning as pl
 from transformers import (
     AutoModelForTokenClassification,
     AutoTokenizer,
     pipeline
 )
-from typing import Callable
+from tqdm.notebook import tqdm
+from typing import Callable, List
+from torch import device
+
+# Cell
+try:
+    ishell = get_ipython()
+    IS_JUPYTER = True
+    from tqdm.notebook import tqdm
+except NameError:
+    IS_JUPYTER = False
+    from tqdm import tqdm

 # Cell
@@ -113,12 +124,12 @@ def clean_ner_output(self, outputs):
     last_idx = 0
     # make to sub group by position
     for output in outputs:
-        if output["index"]-1 == last_idx:
+        if output["start"] in [last_idx, last_idx-1]:
             current.append(output)
         else:
             results.append(current)
             current = [output, ]
-        last_idx = output["index"]
+        last_idx = output["end"]
     if len(current) > 0:
         results.append(current)
@@ -139,30 +150,177 @@ def clean_ner_output(self, outputs):
             word=new_str,
             start=min(starts),
             end=max(ends),
-            entity=c[0]['entity']
+            entity=c[0]['entity_group']
         ))
     return strings

-def predict_ner_table(pipeline_kw):
-    def predict_ner_table_(self, text: str) -> pd.DataFrame:
-        return pd.DataFrame(
-            self.clean_output(
-                self(text, **pipeline_kw)
-            )
-        )
-    return predict_ner_table_
-
-def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
-    if "aggregation_strategy" not in pipeline_kw:
-        pipeline_kw["aggregation_strategy"] = "first"
-
-    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
-
-    ner_pipeline.__class__.clean_output = clean_ner_output
-    ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)
-    return ner_pipeline
-
-def refined_ner_from_pretrained(pretrained, **pipeline_kw):
-    model = AutoModelForTokenClassification.from_pretrained(pretrained)
-    tokenizer = AutoTokenizer.from_pretrained(pretrained)
-    return refined_ner_pipeline(model, tokenizer, **pipeline_kw)
\ No newline at end of file
+# Cell
+class NERInference:
+    """
+    NER Inference pipeline
+    ner = NERInference.from_pretrained('xxxx/xxxx')
+    ner.predict(['text1','text2'])
+    """
+
+    def __init__(self, model, tokenizer, name=None):
+        super().__init__()
+        self.model = model.eval()
+        self.tokenizer = tokenizer
+        self.name = name if name else "NER model"
+
+    def __repr__(self):
+        return f"[NERInference on {self.name}]"
+
+    def to(self, device_str):
+        self.model = self.model.to(device(device_str))
+        return self
+
+    @classmethod
+    def from_pretrained(cls, tag):
+        """
+        Load from pretrained model and tokenizer
+        """
+        model = AutoModelForTokenClassification.from_pretrained(tag)
+        tokenizer = AutoTokenizer.from_pretrained(tag)
+        return cls(model=model, tokenizer=tokenizer, name=model.config._name_or_path)
+
+    def __call__(self, data, batch_size=32, dev=device("cpu")):
+        if type(data) == str:
+            return self.batch_predict([data, ])
+        else:
+            return self.predict(data, dev=dev, batch_size=batch_size)
+
+    def predict(
+        self,
+        texts: List[str],
+        dev=device("cpu"),
+        batch_size: int = 32,
+        progress_bar: bool = True
+    ) -> pd.DataFrame:
+        """
+        Predict a list of sentences / paragraphs
+        """
+        # move the model to the target device
+        self.model = self.model.to(dev)
+        iterator = list(enumerate(chunkify(texts, bs=batch_size)))
+        if progress_bar:
+            iterator = tqdm(iterator, leave=False)
+
+        # run through the iterator
+        all_dfs = []
+        for i, text_b in iterator:
+            # batch-by-batch prediction
+            batch_df = self.batch_predict(text_b)
+            if len(batch_df) > 0:
+                # calculate the row number
+                batch_df['text_id'] = batch_df.apply(
+                    lambda row: i*batch_size+row.batch_row_sn, axis=1)
+                all_dfs.append(batch_df)
+
+        # move the model back to CPU
+        self.model = self.model.to("cpu")
+        return pd.concat(all_dfs).reset_index(drop=True)
+
+    def tokenizing(self, texts):
+        inputs = self.tokenizer(
+            texts,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_attention_mask=True,
+            return_tensors='pt', truncation=True, return_offsets_mapping=True
+        ).to(self.model.device)
+        return inputs
+
+    def batch_predict(self, texts: List[str]) -> pd.DataFrame:
+        """
+        Predict a single batch of sentences
+        """
+        id2label = self.model.config.id2label
+        inputs = self.tokenizing(texts)
+
+        with torch.no_grad():
+            outputs = self.model(input_ids=inputs.input_ids,
+                                 attention_mask=inputs.attention_mask)
+        inputs = inputs.to(device('cpu'))
+
+        pred_idx = outputs.logits.argmax(-1).to(device("cpu"))
+        batch_size = pred_idx.size(0)
+        offsets = inputs.offset_mapping
+        results = []
+        for bi in range(batch_size):
+            text = texts[bi]
+            input_ids = inputs.input_ids[bi]
+            word_ids = inputs.word_ids(bi)
+            pred_ids = pred_idx[bi]
+            # initial values for the row
+            last_pos = 0
+            previous_has_positive = False
+            current_start = 0
+            current_index = 0
+            current_id = 0
+            line = []
+            for ti in range(1, len(input_ids)):
+                if input_ids[ti] == self.tokenizer.sep_token_id:
+                    break
+                # is the current token an appending sub-word?
+                if word_ids[ti] == last_pos:
+                    pass
+                # is the current token predicted negative (label 0)?
+                elif pred_ids[ti].item() == 0:
+                    # store the previous hanging prediction
+                    if previous_has_positive:
+                        start = current_start
+                        end = offsets[bi, ti, 0].item()
+                        line.append({
+                            "start": start, "end": end,
+                            "entity": id2label[current_id],
+                            "word": text[start:end],
+                            "index": current_index,
+                        })
+
+                    current_start = offsets[bi, ti, 0].item()
+                    previous_has_positive = False
+                    current_id = 0
+                    current_index = ti
+                # has a positive prediction index, other than zero
+                else:
+                    if previous_has_positive:
+                        # different entity than the previous token
+                        if current_id != pred_ids[ti].item():
+                            start = current_start
+                            end = offsets[bi, ti, 0].item()
+                            line.append({
+                                "start": start,
+                                "end": end,
+                                "entity": id2label[current_id],
+                                "word": text[start:end],
+                                "index": current_index,
+                            })
+                            current_start = offsets[bi, ti, 0].item()
+                    # this is the first positive prediction in a while
+                    else:
+                        current_start = offsets[bi, ti, 0].item()
+                    previous_has_positive = True
+                    current_index = ti
+                    current_id = pred_ids[ti].item()
+
+                last_pos = word_ids[ti]
+            if previous_has_positive:
+                start = current_start
+                end = offsets[bi, ti, 1].item()
+                line.append({
+                    "start": start,
+                    "end": end,
+                    "entity": id2label[current_id],
+                    "word": text[start:end],
+                    "index": current_index,
+                })
+
+            results.append(line)
+        all_dfs = []
+        for i, res in enumerate(results):
+            sub_df = pd.DataFrame(res)
+            sub_df["batch_row_sn"] = i
+            all_dfs.append(sub_df)
+        return pd.concat(all_dfs)
\ No newline at end of file
diff --git a/nbs/72_pl_training.ipynb b/nbs/72_pl_training.ipynb
index be82bc7..3ce4854 100644
--- a/nbs/72_pl_training.ipynb
+++ b/nbs/72_pl_training.ipynb
[Cell-level changes mirror forgebox/hf/train.py above: the new imports and tqdm
guard, the offset-based clean_ner_output, and the NERInference class replacing
the removed pipeline helpers; the remainder of the notebook diff is
execution-count churn plus one new empty trailing cell.]
"execution_count": 281, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "class NERInference:\n", + " \"\"\"\n", + " NER Inference pipeline\n", + " ner = NERInference.from_pretrained('xxxx/xxxx')\n", + " ner.predict(['text1','text2'])\n", + " \"\"\"\n", + "\n", + " def __init__(self, model, tokenizer, name=None):\n", + " super().__init__()\n", + " self.model = model.eval()\n", + " self.tokenizer = tokenizer\n", + " self.name = name if name else \"NER model\"\n", + "\n", + " def __repr__(self):\n", + " return f\"[NERInference on {self.name}]\"\n", "\n", - "def predict_ner_table(pipeline_kw):\n", - " def predict_ner_table_(self, text: str) -> pd.DataFrame:\n", - " return pd.DataFrame(\n", - " self.clean_output(\n", - " self(text, **pipeline_kw)\n", - " )\n", - " )\n", - " return predict_ner_table_\n", + " def to(self, device_str):\n", + " self.model = self.model.to(device(device_str))\n", + " return self\n", "\n", - "def refined_ner_pipeline(model, tokenizer, **pipeline_kw):\n", - " if \"aggregation_strategy\" not in pipeline_kw:\n", - " pipeline_kw[\"aggregation_strategy\"] = \"first\"\n", + " @classmethod\n", + " def from_pretrained(cls, tag):\n", + " \"\"\"\n", + " Load from pretrained model and tokenizer\n", + " \"\"\"\n", + " model = AutoModelForTokenClassification.from_pretrained(tag)\n", + " tokenizer = AutoTokenizer.from_pretrained(tag)\n", + " return cls(model=model, tokenizer=tokenizer, name=model.config._name_or_path)\n", + " \n", + " def __call__(self, data, batch_size=32, dev=device(\"cpu\")):\n", + " if type(data) == str:\n", + " return self.batch_predict([data,])\n", + " else:\n", + " return self.predict(data, dev=dev, batch_size=batch_size)\n", "\n", - " ner_pipeline = pipeline(\"ner\", model=model, tokenizer=tokenizer)\n", + " def predict(\n", + " self,\n", + " texts: List[str],\n", + " dev=device(\"cpu\"),\n", + " batch_size: int = 32,\n", + " progress_bar: bool = True\n", + " ) -> pd.DataFrame:\n", + " \"\"\"\n", + " Predict a list of sentences/ paragraphs\n", + " \"\"\"\n", + " # place the model into device\n", + " self.model = self.model.to(dev)\n", + " iterator = list(enumerate(chunkify(texts, bs=batch_size)))\n", + " if progress_bar:\n", + " iterator = tqdm(iterator, leave=False)\n", + "\n", + " # run through iterator\n", + " all_dfs = []\n", + " for i, text_b in iterator:\n", + " # by batch prediction\n", + " batch_df = self.batch_predict(text_b)\n", + " if len(batch_df) > 0:\n", + " # calculate the row number\n", + " batch_df['text_id'] = batch_df.apply(\n", + " lambda row: i*batch_size+row.batch_row_sn, axis=1)\n", + " all_dfs.append(batch_df)\n", + "\n", + " # place the model back to cpu\n", + " self.model = self.model.to(\"cpu\")\n", + " return pd.concat(all_dfs).reset_index(drop=True)\n", " \n", - " ner_pipeline.__class__.clean_output = clean_ner_output\n", - " ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)\n", - " return ner_pipeline\n", + " def tokenizing(self, texts):\n", + " inputs = self.tokenizer(\n", + " texts,\n", + " padding=\"max_length\",\n", + " max_length=self.tokenizer.model_max_length,\n", + " return_attention_mask=True,\n", + " return_tensors='pt', truncation=True, return_offsets_mapping=True\n", + " ).to(self.model.device)\n", + " return inputs\n", + "\n", + "\n", + " def batch_predict(self, texts:List[str])-> pd.DataFrame:\n", + " \"\"\"\n", + " Predict a single batch of sentences\n", + " \"\"\"\n", + " id2label = self.model.config.id2label\n", + " inputs = self.tokenizing(texts)\n", + "\n", + " with 
torch.no_grad():\n", + " outputs = self.model(input_ids=inputs.input_ids,\n", + " attention_mask=inputs.attention_mask)\n", + " inputs = inputs.to(device('cpu'))\n", + "\n", + " pred_idx = outputs.logits.argmax(-1).to(device(\"cpu\"))\n", + " batch_size = pred_idx.size(0)\n", + " offsets = inputs.offset_mapping\n", + " results = []\n", + " for bi in range(batch_size):\n", + " text = texts[bi]\n", + " input_ids = inputs.input_ids[bi]\n", + " word_ids = inputs.word_ids(bi)\n", + " pred_ids = pred_idx[bi]\n", + " # initial values for the row\n", + " last_pos = 0\n", + " previous_has_positive = False\n", + " current_start = 0\n", + " current_index = 0\n", + " current_id = 0\n", + " line = []\n", + " for ti in range(1, len(input_ids)):\n", + " if input_ids[ti] == self.tokenizer.sep_token_id:\n", + " break\n", + " # is the current token an appending sub-word?\n", + " if word_ids[ti] == last_pos:\n", + " pass\n", + " # is current token negative\n", + " elif pred_ids[ti].item() == 0:\n", + " # store the previous hanging prediction\n", + " if previous_has_positive:\n", + " start = current_start\n", + " end = offsets[bi, ti, 0].item()\n", + " line.append({\n", + " \"start\": start, \"end\": end,\n", + " \"entity\": id2label[current_id],\n", + " \"word\": text[start:end],\n", + " \"index\": current_index,\n", + " })\n", "\n", - "def refined_ner_from_pretrained(pretrained, **pipeline_kw):\n", - " model = AutoModelForTokenClassification.from_pretrained(pretrained)\n", - " tokenizer = AutoTokenizer.from_pretrained(pretrained)\n", - " return refined_ner_pipeline(model, tokenizer, **pipeline_kw)" + " current_start = offsets[bi, ti, 0].item()\n", + " previous_has_positive = False\n", + " current_id = 0\n", + " current_index = ti\n", + " # has positive prediction index, other than zero\n", + " else:\n", + " if previous_has_positive:\n", + " # different than the previous\n", + " if current_id != pred_ids[ti].item():\n", + " start = current_start\n", + " end = offsets[bi, ti, 0].item()\n", + " line.append({\n", + " \"start\": start,\n", + " \"end\": end,\n", + " \"entity\": id2label[current_id],\n", + " \"word\": text[start:end],\n", + " \"index\": current_index,\n", + " })\n", + " current_start = offsets[bi, ti, 0].item()\n", + " # this is the 1st postive predict for a while\n", + " else:\n", + " current_start = offsets[bi, ti, 0].item()\n", + " previous_has_positive = True\n", + " current_index = ti\n", + " current_id = pred_ids[ti].item()\n", + "\n", + " last_pos = word_ids[ti]\n", + " if previous_has_positive:\n", + " start = current_start\n", + " end = offsets[bi, ti, 1].item()\n", + " line.append({\n", + " \"start\": start,\n", + " \"end\": end,\n", + " \"entity\": id2label[current_id],\n", + " \"word\": text[start:end],\n", + " \"index\": current_index,\n", + " })\n", + "\n", + " results.append(line)\n", + " all_dfs = []\n", + " for i, res in enumerate(results):\n", + " sub_df = pd.DataFrame(res)\n", + " sub_df[\"batch_row_sn\"] = i\n", + " all_dfs.append(sub_df)\n", + " return pd.concat(all_dfs)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/settings.ini b/settings.ini index 79bf464..b416459 100644 --- a/settings.ini +++ b/settings.ini @@ -7,7 +7,7 @@ author = xiaochen(ray) zhang author_email = b2ray2c@gmail.com copyright = xiaochen(ray) zhang branch = master -version = 0.4.11 +version = 0.4.17 min_python = 3.6 audience = Developers language = English
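
For reference, the reworked grouping in clean_ner_output above walks character offsets rather than token indices. A small worked example, with hand-made dicts shaped like aggregated pipeline output (hence the entity_group key):

    outputs = [
        {"word": "Hong", "start": 0, "end": 4, "entity_group": "LOC"},
        # start == previous end -> appended to the current group
        {"word": "Kong", "start": 4, "end": 8, "entity_group": "LOC"},
        # start is past the previous end -> a new group begins
        {"word": "Kowloon", "start": 12, "end": 19, "entity_group": "LOC"},
    ]
    # grouping yields [["Hong", "Kong"], ["Kowloon"]]; each group is merged into
    # one span with start=min(starts), end=max(ends), and the first member's
    # entity_group as its label.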