diff --git a/docs/hf_transformer_data.html b/docs/hf_transformer_data.html
index 1d333a1..e1837ec 100644
--- a/docs/hf_transformer_data.html
+++ b/docs/hf_transformer_data.html
@@ -135,31 +135,6 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: the `class IOBES` docs. Removed: the doc entry for `clean_output(outputs)` ("Cleaning output for NER task"), together with its {% raw %}/{% endraw %} wrappers.]
diff --git a/docs/pl_training.html b/docs/pl_training.html
index 864d669..95bd7f4 100644
--- a/docs/pl_training.html
+++ b/docs/pl_training.html
@@ -71,7 +71,7 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: "Load model and tokenizer". Regenerated markup around the doc entry `ner_model_from(name:str, dataset:IOBES)` ("name: from_pretrain(name)"); the visible text appears unchanged.]
@@ -96,7 +96,7 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: `ner_model_from`. Regenerated markup around the doc entry `ner_tokenizer_from(name:str)`; the visible text appears unchanged.]
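For orientation, a minimal sketch of how the two loaders above are typically combined; the checkpoint name and the `ds` IOBES dataset are illustrative assumptions, not part of this diff:

    from forgebox.hf.train import ner_model_from, ner_tokenizer_from

    # ds is assumed to be an IOBES dataset built beforehand
    model = ner_model_from("bert-base-cased", ds)      # wraps from_pretrained(name)
    tokenizer = ner_tokenizer_from("bert-base-cased")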

@@ -133,7 +133,7 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: "Lightning data module". Regenerated markup around the doc entry `class NERDataModule(*args:Any, **kwargs:Any) :: LightningDataModule` ("A DataModule standardizes the training, val, test splits, data preparation and transforms. The main advantage is consistent data splits, data preparation and transforms across models."); the visible text appears unchanged.]
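The hunk above only regenerates markup, and `NERDataModule`'s body is not part of this diff; the following skeleton merely illustrates the `LightningDataModule` contract that the description refers to (all names are hypothetical):

    import pytorch_lightning as pl
    from torch.utils.data import DataLoader

    class ExampleDataModule(pl.LightningDataModule):
        # hypothetical skeleton, not NERDataModule's actual implementation
        def __init__(self, train_ds, val_ds, batch_size: int = 32):
            super().__init__()
            self.train_ds, self.val_ds = train_ds, val_ds
            self.batch_size = batch_size

        def train_dataloader(self):
            # one consistent train split for every model
            return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True)

        def val_dataloader(self):
            return DataLoader(self.val_ds, batch_size=self.batch_size)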

@@ -202,7 +202,7 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: `class NERDataModule`. Regenerated markup around the doc entry `class NERModule(model) :: LightningModule` ("PyTorch lightning module for training ner model"); the visible text appears unchanged.]
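A minimal sketch of driving `NERModule` with a Lightning trainer; the trainer settings and the `dm` data module are illustrative assumptions:

    import pytorch_lightning as pl

    module = NERModule(model)            # model: a token-classification model
    trainer = pl.Trainer(max_epochs=3)   # settings are illustrative only
    trainer.fit(module, datamodule=dm)   # dm: an NERDataModule instance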

@@ -220,6 +220,116 @@
[auto-generated HTML hunk; the markup was lost in extraction. Context: `class NERModule`. Added: a new "Enhance pipeline" section with doc entries for `clean_ner_output(outputs)` ("Cleaning output for NER task"), `predict_ner_table(pipeline_kw)`, `refined_ner_pipeline(model, tokenizer, **pipeline_kw)`, and `refined_ner_from_pretrained(pretrained, **pipeline_kw)`, each wrapped in its own {% raw %}/{% endraw %} block.]
diff --git a/forgebox/__init__.py b/forgebox/__init__.py
index 574c066..58ce5cd 100644
--- a/forgebox/__init__.py
+++ b/forgebox/__init__.py
@@ -1 +1 @@
-__version__ = "0.4.9"
+__version__ = "0.4.11"
diff --git a/forgebox/_nbdev.py b/forgebox/_nbdev.py
index 1566d36..8041416 100644
--- a/forgebox/_nbdev.py
+++ b/forgebox/_nbdev.py
@@ -109,11 +109,14 @@
          "convert_iob2_file_to_iobes": "70_hf_transformer_data.ipynb",
          "conbine_iobes_file": "70_hf_transformer_data.ipynb",
          "IOBES": "70_hf_transformer_data.ipynb",
-         "clean_output": "70_hf_transformer_data.ipynb",
          "ner_model_from": "72_pl_training.ipynb",
          "ner_tokenizer_from": "72_pl_training.ipynb",
          "NERDataModule": "72_pl_training.ipynb",
          "NERModule": "72_pl_training.ipynb",
+         "clean_ner_output": "72_pl_training.ipynb",
+         "predict_ner_table": "72_pl_training.ipynb",
+         "refined_ner_pipeline": "72_pl_training.ipynb",
+         "refined_ner_from_pretrained": "72_pl_training.ipynb",
          "CudaDevice": "CUDA_GPU_Management.ipynb",
          "CudaHandler": "CUDA_GPU_Management.ipynb",
          "MLMVisualizer": "bert_visualize.ipynb",
diff --git a/forgebox/hf/data.py b/forgebox/hf/data.py
index 3287221..dfe8367 100644
--- a/forgebox/hf/data.py
+++ b/forgebox/hf/data.py
@@ -1,6 +1,6 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/70_hf_transformer_data.ipynb (unless otherwise specified).
 
-__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES', 'clean_output']
+__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES']
 
 # Cell
 from ..imports import *
@@ -214,44 +214,4 @@ def set_hfconfig(self, config):
         """
         config.num_labels = len(self.cates)
         config.id2label = {i: label for i, label in enumerate(self.cates.i2c)}
-        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}
-
-
-def clean_output(outputs):
-    """
-    Cleaning output for NER task
-    """
-    results = []
-    current = []
-    last_idx = 0
-    # make to sub group by position
-    for output in outputs:
-        if output["index"]-1 == last_idx:
-            current.append(output)
-        else:
-            results.append(current)
-            current = [output, ]
-            last_idx = output["index"]
-    if len(current) > 0:
-        results.append(current)
-
-    # from tokens to string
-    strings = []
-    for c in results:
-        tokens = []
-        starts = []
-        ends = []
-        for o in c:
-            tokens.append(o['word'])
-            starts.append(o['start'])
-            ends.append(o['end'])
-
-        new_str = tokenizer.convert_tokens_to_string(tokens)
-        if new_str != '':
-            strings.append(dict(
-                word=new_str,
-                start=min(starts),
-                end=max(ends),
-                entity=c[0]['entity']
-            ))
-    return strings
\ No newline at end of file
+        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}
\ No newline at end of file
diff --git a/forgebox/hf/train.py b/forgebox/hf/train.py
index 0cae4f6..d307835 100644
--- a/forgebox/hf/train.py
+++ b/forgebox/hf/train.py
@@ -1,12 +1,18 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified).
 
-__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule']
+__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule', 'clean_ner_output',
+           'predict_ner_table', 'refined_ner_pipeline', 'refined_ner_from_pretrained']
 
 # Cell
 from .data import IOBES
 from ..imports import *
 import pytorch_lightning as pl
-from transformers import AutoModelForTokenClassification, AutoTokenizer
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    pipeline
+)
+from typing import Callable
 
 # Cell
 def ner_model_from(name:str, dataset:IOBES):
@@ -95,4 +101,68 @@ def configure_optimizers(self):
             {'params': self.model.classifier.parameters(), 'lr': 1e-3},
         ]
         optimizer = torch.optim.Adam(param_groups, lr=1e-3)
-        return optimizer
\ No newline at end of file
+        return optimizer
+
+# Cell
+def clean_ner_output(self, outputs):
+    """
+    Cleaning output for NER task
+    """
+    results = []
+    current = []
+    last_idx = 0
+    # group tokens whose positions are consecutive
+    for output in outputs:
+        if output["index"] - 1 == last_idx:
+            current.append(output)
+        else:
+            results.append(current)
+            current = [output, ]
+        last_idx = output["index"]
+    if len(current) > 0:
+        results.append(current)
+
+    # from tokens to string
+    strings = []
+    for c in results:
+        tokens = []
+        starts = []
+        ends = []
+        for o in c:
+            tokens.append(o['word'])
+            starts.append(o['start'])
+            ends.append(o['end'])
+
+        new_str = self.tokenizer.convert_tokens_to_string(tokens)
+        if new_str != '':
+            strings.append(dict(
+                word=new_str,
+                start=min(starts),
+                end=max(ends),
+                entity=c[0]['entity']
+            ))
+    return strings
+
+def predict_ner_table(pipeline_kw):
+    def predict_ner_table_(self, text: str) -> pd.DataFrame:
+        return pd.DataFrame(
+            self.clean_output(
+                self(text, **pipeline_kw)
+            )
+        )
+    return predict_ner_table_
+
+def refined_ner_pipeline(model, tokenizer, **pipeline_kw):
+    if "aggregation_strategy" not in pipeline_kw:
+        pipeline_kw["aggregation_strategy"] = "first"
+
+    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
+
+    ner_pipeline.__class__.clean_output = clean_ner_output
+    ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)
+    return ner_pipeline
+
+def refined_ner_from_pretrained(pretrained, **pipeline_kw):
+    model = AutoModelForTokenClassification.from_pretrained(pretrained)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained)
+    return refined_ner_pipeline(model, tokenizer, **pipeline_kw)
\ No newline at end of file
diff --git a/nbs/70_hf_transformer_data.ipynb b/nbs/70_hf_transformer_data.ipynb
index fac2f7d..ac7b569 100644
--- a/nbs/70_hf_transformer_data.ipynb
+++ b/nbs/70_hf_transformer_data.ipynb
@@ -85,7 +85,7 @@
    },
    {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,47 +262,7 @@
     "        \"\"\"\n",
     "        config.num_labels = len(self.cates)\n",
     "        config.id2label = {i: label for i, label in enumerate(self.cates.i2c)}\n",
-    "        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}\n",
-    "\n",
-    "\n",
-    "def clean_output(outputs):\n",
-    "    \"\"\"\n",
-    "    Cleaning output for NER task\n",
-    "    \"\"\"\n",
-    "    results = []\n",
-    "    current = []\n",
-    "    last_idx = 0\n",
-    "    # make to sub group by position\n",
-    "    for output in outputs:\n",
-    "        if output[\"index\"]-1 == last_idx:\n",
-    "            current.append(output)\n",
-    "        else:\n",
-    "            results.append(current)\n",
-    "            current = [output, ]\n",
-    "            last_idx = output[\"index\"]\n",
-    "    if len(current) > 0:\n",
-    "        results.append(current)\n",
-    "\n",
-    "    # from tokens to string\n",
-    "    strings = []\n",
-    "    for c in results:\n",
-    "        tokens = []\n",
-    "        starts = []\n",
-    "        ends = []\n",
-    "        for o in c:\n",
-    "            tokens.append(o['word'])\n",
-    "            starts.append(o['start'])\n",
-    "            ends.append(o['end'])\n",
-    "\n",
-    "        new_str = tokenizer.convert_tokens_to_string(tokens)\n",
-    "        if new_str != '':\n",
-    "            strings.append(dict(\n",
-    "                word=new_str,\n",
-    "                start=min(starts),\n",
-    "                end=max(ends),\n",
-    "                entity=c[0]['entity']\n",
-    "            ))\n",
-    "    return strings"
+    "        config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}"
    ]
   },
   {
diff --git a/nbs/72_pl_training.ipynb b/nbs/72_pl_training.ipynb
index 82a4985..be82bc7 100644
--- a/nbs/72_pl_training.ipynb
+++ b/nbs/72_pl_training.ipynb
@@ -27,7 +27,12 @@
     "from forgebox.hf.data import IOBES\n",
     "from forgebox.imports import *\n",
     "import pytorch_lightning as pl\n",
-    "from transformers import AutoModelForTokenClassification, AutoTokenizer"
+    "from transformers import (\n",
+    "    AutoModelForTokenClassification,\n",
+    "    AutoTokenizer,\n",
+    "    pipeline\n",
+    ")\n",
+    "from typing import Callable"
    ]
   },
   {
@@ -165,12 +170,83 @@
     "        return optimizer"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Enhance pipeline"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# export\n",
+    "def clean_ner_output(self, outputs):\n",
+    "    \"\"\"\n",
+    "    Cleaning output for NER task\n",
+    "    \"\"\"\n",
+    "    results = []\n",
+    "    current = []\n",
+    "    last_idx = 0\n",
+    "    # group tokens whose positions are consecutive\n",
+    "    for output in outputs:\n",
+    "        if output[\"index\"] - 1 == last_idx:\n",
+    "            current.append(output)\n",
+    "        else:\n",
+    "            results.append(current)\n",
+    "            current = [output, ]\n",
+    "        last_idx = output[\"index\"]\n",
+    "    if len(current) > 0:\n",
+    "        results.append(current)\n",
+    "\n",
+    "    # from tokens to string\n",
+    "    strings = []\n",
+    "    for c in results:\n",
+    "        tokens = []\n",
+    "        starts = []\n",
+    "        ends = []\n",
+    "        for o in c:\n",
+    "            tokens.append(o['word'])\n",
+    "            starts.append(o['start'])\n",
+    "            ends.append(o['end'])\n",
+    "\n",
+    "        new_str = self.tokenizer.convert_tokens_to_string(tokens)\n",
+    "        if new_str != '':\n",
+    "            strings.append(dict(\n",
+    "                word=new_str,\n",
+    "                start=min(starts),\n",
+    "                end=max(ends),\n",
+    "                entity=c[0]['entity']\n",
+    "            ))\n",
+    "    return strings\n",
+    "\n",
+    "def predict_ner_table(pipeline_kw):\n",
+    "    def predict_ner_table_(self, text: str) -> pd.DataFrame:\n",
+    "        return pd.DataFrame(\n",
+    "            self.clean_output(\n",
+    "                self(text, **pipeline_kw)\n",
+    "            )\n",
+    "        )\n",
+    "    return predict_ner_table_\n",
+    "\n",
+    "def refined_ner_pipeline(model, tokenizer, **pipeline_kw):\n",
+    "    if \"aggregation_strategy\" not in pipeline_kw:\n",
+    "        pipeline_kw[\"aggregation_strategy\"] = \"first\"\n",
+    "\n",
+    "    ner_pipeline = pipeline(\"ner\", model=model, tokenizer=tokenizer)\n",
+    "    \n",
+    "    ner_pipeline.__class__.clean_output = clean_ner_output\n",
+    "    ner_pipeline.__class__.predict_table = predict_ner_table(pipeline_kw)\n",
+    "    return ner_pipeline\n",
+    "\n",
+    "def refined_ner_from_pretrained(pretrained, **pipeline_kw):\n",
+    "    model = AutoModelForTokenClassification.from_pretrained(pretrained)\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(pretrained)\n",
+    "    return refined_ner_pipeline(model, tokenizer, **pipeline_kw)"
+   ]
   }
  ],
 "metadata": {
diff --git a/settings.ini b/settings.ini
index bcdd2b7..79bf464 100644
--- a/settings.ini
+++ b/settings.ini
@@ -7,7 +7,7 @@ author = xiaochen(ray) zhang
 author_email = b2ray2c@gmail.com
 copyright = xiaochen(ray) zhang
 branch = master
-version = 0.4.9
+version = 0.4.11
 min_python = 3.6
 audience = Developers
 language = English
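Because `clean_ner_output` is monkey-patched onto the pipeline class and only reaches into `self.tokenizer`, its grouping logic can be exercised with toy stand-ins; every value below is fabricated for illustration:

    from types import SimpleNamespace
    from forgebox.hf.train import clean_ner_output

    # a whitespace-joining "tokenizer" and a bare namespace standing in for the pipeline
    dummy = SimpleNamespace(tokenizer=SimpleNamespace(convert_tokens_to_string=" ".join))
    raw = [
        {"index": 1, "word": "Tim",    "start": 0,  "end": 3,  "entity": "B-PER"},
        {"index": 2, "word": "Cook",   "start": 4,  "end": 8,  "entity": "I-PER"},
        {"index": 5, "word": "Berlin", "start": 17, "end": 23, "entity": "B-LOC"},
    ]
    print(clean_ner_output(dummy, raw))
    # tokens at positions 1-2 merge into one span; position 5 starts a new one:
    # [{'word': 'Tim Cook', 'start': 0, 'end': 8, 'entity': 'B-PER'},
    #  {'word': 'Berlin', 'start': 17, 'end': 23, 'entity': 'B-LOC'}]

Patching `__class__` rather than the instance means every pipeline object of that class gains `clean_output` and `predict_table`, which is worth keeping in mind when several differently configured pipelines coexist.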