diff --git a/docs/_data/sidebars/home_sidebar.yml b/docs/_data/sidebars/home_sidebar.yml index 449d4f7..d50c1ff 100644 --- a/docs/_data/sidebars/home_sidebar.yml +++ b/docs/_data/sidebars/home_sidebar.yml @@ -78,6 +78,12 @@ entries: - output: web,pdf title: Lightning Callbacks url: thunder_callbacks.html + - output: web,pdf + title: Data parts for hf transformers + url: hf_transformer_data.html + - output: web,pdf + title: PyTorch Lightning training + url: pl_training.html - output: web,pdf title: CUDA GPU Management url: CUDA_GPU_Management.html diff --git a/docs/hf_transformer_data.html b/docs/hf_transformer_data.html new file mode 100644 index 0000000..1d333a1 --- /dev/null +++ b/docs/hf_transformer_data.html @@ -0,0 +1,462 @@ +--- + +title: Data parts for hf transformers + + +keywords: fastai +sidebar: home_sidebar + + + +nb_path: "nbs/70_hf_transformer_data.ipynb" +--- + +
Process IOBES files
convert_iob2_file_to_iobes[source]
convert_iob2_file_to_iobes(file_path, result_path)
Convert IOB2 file to IOBES
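A rough illustration of the two tagging schemes (the rows below are invented for illustration, not taken from the repository's data): IOB2 marks the first token of an entity with B- and every following token with I-, while IOBES additionally marks the last token of a multi-token entity with E- and a single-token entity with S-.

IOB2                          IOBES
T            B-cell_type      T            B-cell_type
lymphocytes  I-cell_type      lymphocytes  E-cell_type
IL-2         B-protein        IL-2         S-protein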
conbine_iobes_file[source]
conbine_iobes_file(file_paths:List[Path], new_file_path:Path)
Combine multiple IOBES files into one IOBES file
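A minimal usage sketch; the file paths are placeholders, not files shipped with this repository.

from pathlib import Path
from forgebox.hf.data import conbine_iobes_file

# merge several annotation files into a single file for training
conbine_iobes_file(
    file_paths=[Path("train_part1.iobes"), Path("train_part2.iobes")],
    new_file_path=Path("train_all.iobes"),
)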
Dataset
class IOBES[source]
IOBES(*args, **kwds) :: Dataset
Load iobes file for NER training task
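A sketch of building a training and a validation dataset that share one label category set, which is how the constructor's category argument is meant to be used; the file paths are placeholders, and the checkpoint name is the one used further down this page.

from transformers import AutoTokenizer
from forgebox.hf.data import IOBES

tokenizer = AutoTokenizer.from_pretrained("raynardj/roberta-pubmed", add_prefix_space=True)
train_ds = IOBES("train.iobes", tokenizer, max_len=128)
# reuse the label categories discovered on the training split
valid_ds = IOBES("valid.iobes", tokenizer, max_len=128, category=train_ds.cates)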
clean_output[source]
clean_output(outputs)
Cleaning output for NER task
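clean_output works on the per-token prediction dicts that a HuggingFace token-classification pipeline returns. A hedged illustration of that shape; the values are invented, and the joining step assumes a tokenizer is available to merge sub-word tokens back into strings.

outputs = [
    {"index": 1, "word": "recept", "start": 120, "end": 126, "entity": "protein"},
    {"index": 2, "word": "ors", "start": 126, "end": 129, "entity": "protein"},
]
# consecutive indices are grouped into one span, so the cleaned result would be
# roughly [{"word": "receptors", "start": 120, "end": 129, "entity": "protein"}]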
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("raynardj/roberta-pubmed", add_prefix_space=True)
dataset = IOBES("/Users/xiaochen.zhang/data/valid.iobes", tokenizer)
for w,l in zip(*dataset[2]):
+    print(f"{w}-{l}")
in-O
+blood-O
+;-O
+content-O
+of-O
+cAMP-O
+was-O
+also-O
+decreased-O
+in-O
+lymphocytes-O
+by-O
+33-O
+%-O
+.-O
+At-O
+the-O
+same-O
+time-O
+,-O
+total-O
+content-O
+of-O
+T-cell_type
+lymphocytes-cell_type
+was-O
+decreased-O
+1.5-fold-O
+in-O
+peripheric-O
+blood-O
+.-O
+Treatment-O
+with-O
+I-hydroxyvitamin-O
+D3-O
+(-O
+1-1.5-O
+mg-O
+daily-O
+,-O
+within-O
+4-O
+weeks-O
+)-O
+led-O
+to-O
+normalization-O
+of-O
+total-O
+and-O
+ionized-O
+form-O
+of-O
+Ca2+-O
+and-O
+of-O
+25-O
+(-O
+OH-O
+)-O
+D-O
+,-O
+but-O
+did-O
+not-O
+affect-O
+the-O
+PTH-O
+content-O
+in-O
+blood-O
+.-O
+Concentration-O
+of-O
+the-O
+receptors-protein
+to-O
+1.25-O
+(-O
+OH-O
+)-O
+2D3-O
+was-O
+elevated-O
+up-O
+to-O
+39.7-O
+fmole/mg-O
+after-O
+I-O
+week-O
+of-O
+the-O
+treatment-O
+,-O
+whereas-O
+it-O
+was-O
+decreased-O
+to-O
+the-O
+initial-O
+level-O
+24.8-O
+fmole/mg-O
+within-O
+4-O
+weeks-O
+;-O
+simultaneous-O
+alteration-O
+in-O
dataset.one_batch()
{'input_ids': tensor([[   19,  3741,  2603,  ...,  1417,  2617, 11576],
+        [ 4590,  2156,   255,  ...,   405,  1182,  6608],
+        [ 6214, 25683,  3809,  ...,    11,     5,  8151],
+        ...,
+        [13998, 25326,  2413,  ...,     5,  2199,    21],
+        [11299,   705, 24811,  ...,   134,  1589,  2032],
+        [ 5804,   924,    14,  ...,   366,  1168,     9]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
+        [1, 1, 1,  ..., 1, 1, 1],
+        [1, 1, 1,  ..., 1, 1, 1],
+        ...,
+        [1, 1, 1,  ..., 1, 1, 1],
+        [1, 1, 1,  ..., 1, 1, 1],
+        [1, 1, 1,  ..., 1, 1, 1]]), 'offset_mapping': tensor([[[ 1,  4],
+         [ 1,  2],
+         [ 2,  5],
+         ...,
+         [ 3,  5],
+         [ 5,  8],
+         [ 1,  6]],
+
+        [[ 1,  5],
+         [ 1,  1],
+         [ 1,  1],
+         ...,
+         [ 5,  7],
+         [ 7,  9],
+         [ 9, 14]],
+
+        [[ 1,  5],
+         [ 5,  8],
+         [ 8, 10],
+         ...,
+         [ 1,  2],
+         [ 1,  3],
+         [ 1, 10]],
+
+        ...,
+
+        [[ 1,  5],
+         [ 5,  8],
+         [ 8, 10],
+         ...,
+         [ 1,  3],
+         [ 1,  7],
+         [ 1,  3]],
+
+        [[ 1,  5],
+         [ 5,  6],
+         [ 6, 10],
+         ...,
+         [ 2,  3],
+         [ 1,  1],
+         [ 1,  2]],
+
+        [[ 1,  7],
+         [ 1,  5],
+         [ 1,  4],
+         ...,
+         [ 3,  5],
+         [ 5,  7],
+         [ 1,  2]]]), 'labels': tensor([[0, 1, 1,  ..., 0, 0, 0],
+        [2, 0, 2,  ..., 0, 0, 0],
+        [0, 0, 0,  ..., 0, 0, 0],
+        ...,
+        [1, 1, 1,  ..., 0, 0, 0],
+        [0, 0, 0,  ..., 2, 0, 2],
+        [0, 0, 0,  ..., 0, 0, 0]])}
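To see how tokens, label ids and the original words line up, the dataset can be rebuilt with return_string=True and inspected through visualize_batch. A debugging sketch, keeping the docstring's caveat in mind that the extra string fields cannot be moved to CUDA:

debug_ds = IOBES("/Users/xiaochen.zhang/data/valid.iobes", tokenizer, return_string=True)
batch = debug_ds.one_batch(batch_size=4)
# each row: (token, label id, text label, original word, offset mapping)
debug_ds.visualize_batch(batch, row_idx=0)[:5]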
diff --git a/docs/pl_training.html b/docs/pl_training.html new file mode 100644 index 0000000..864d669 --- /dev/null +++ b/docs/pl_training.html @@ -0,0 +1,228 @@ +--- + +title: PyTorch Lightning training + + +keywords: fastai +sidebar: home_sidebar + +summary: "on huggingface transformers" +description: "on huggingface transformers" +nb_path: "nbs/72_pl_training.ipynb" +--- + +
# !pip install pytorch-lightning==1.3.8
+# !pip install tensorflow==2.2.0
Load model and tokenizer
ner_model_from[source]
ner_model_from(name:str, dataset:IOBES)
name: the checkpoint name passed to AutoModelForTokenClassification.from_pretrained(name)
ner_tokenizer_from[source]
ner_tokenizer_from(name:str)
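A loading sketch: the tokenizer and dataset are built first so that ner_model_from can size the token-classification head from the dataset's categories and write them back into the model config via set_hfconfig. The file path is a placeholder; the checkpoint name is the one used on the data page.

from forgebox.hf.data import IOBES
from forgebox.hf.train import ner_model_from, ner_tokenizer_from

tokenizer = ner_tokenizer_from("raynardj/roberta-pubmed")
train_ds = IOBES("train.iobes", tokenizer)
model = ner_model_from("raynardj/roberta-pubmed", train_ds)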
Lightning data module
class NERDataModule[source]
NERDataModule(*args:Any, **kwargs:Any) :: LightningDataModule
A DataModule standardizes the training, val, test splits, data preparation and transforms. The main advantage is consistent data splits, data preparation and transforms across models.
Example::
class MyDataModule(LightningDataModule):
+    def __init__(self):
+        super().__init__()
+    def prepare_data(self):
+        # download, split, etc...
+        # only called on 1 GPU/TPU in distributed
+    def setup(self):
+        # make assignments here (val/train/test split)
+        # called on every process in DDP
+    def train_dataloader(self):
+        train_split = Dataset(...)
+        return DataLoader(train_split)
+    def val_dataloader(self):
+        val_split = Dataset(...)
+        return DataLoader(val_split)
+    def test_dataloader(self):
+        test_split = Dataset(...)
+        return DataLoader(test_split)
+    def teardown(self):
+        # clean up after fit or test
+        # called on every process in DDP
A DataModule implements 6 key methods:
  • prepare_data (things to do on 1 GPU/TPU, not on every GPU/TPU in distributed mode).
  • setup (things to do on every accelerator in distributed mode).
  • train_dataloader: the training dataloader.
  • val_dataloader: the val dataloader(s).
  • test_dataloader: the test dataloader(s).
  • teardown (things to do on every accelerator in distributed mode when finished).
This allows you to share a full dataset without explaining how to download, split, transform and process the data.
class NERModule[source]
NERModule(model) :: LightningModule
PyTorch lightning module for training ner model
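A minimal end-to-end sketch wiring the pieces above into a pytorch-lightning run; the Trainer arguments are illustrative and not taken from the repository (pytorch-lightning 1.3.8 is the version pinned earlier on this page). Note that NERModule.configure_optimizers assumes a RoBERTa-style model, since it builds its parameter groups from model.roberta and model.classifier.

import pytorch_lightning as pl

data_module = NERDataModule(train_ds, valid_ds, batch_size=32)  # datasets from the IOBES sketches above
module = NERModule(model)                                       # model from ner_model_from above
trainer = pl.Trainer(max_epochs=3, gpus=1)
trainer.fit(module, datamodule=data_module)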
+ + diff --git a/docs/sidebar.json b/docs/sidebar.json index 9e2b594..cff08b1 100644 --- a/docs/sidebar.json +++ b/docs/sidebar.json @@ -24,6 +24,8 @@ "Categorical Transformation for DL": "category.html", "Cosine ": "cosine_search.html", "Lightning Callbacks": "thunder_callbacks.html", + "Data parts for hf transformers": "hf_transformer_data.html", + "Pytorch Lighting training": "pl_training.html", "CUDA GPU Management": "CUDA_GPU_Management.html", "Bert Visualize": "bert_visualize.html", "NLP data": "bilstm-based-search-on-netflix-data.html", diff --git a/forgebox/__init__.py b/forgebox/__init__.py index a34b2f6..574c066 100644 --- a/forgebox/__init__.py +++ b/forgebox/__init__.py @@ -1 +1 @@ -__version__ = "0.4.7" +__version__ = "0.4.9" diff --git a/forgebox/_nbdev.py b/forgebox/_nbdev.py index e8c03ee..1566d36 100644 --- a/forgebox/_nbdev.py +++ b/forgebox/_nbdev.py @@ -106,6 +106,14 @@ "UnfreezeScheduler": "61_thunder_callbacks.ipynb", "nn.Module.unfreeze": "61_thunder_callbacks.ipynb", "nn.Module.freeze": "61_thunder_callbacks.ipynb", + "convert_iob2_file_to_iobes": "70_hf_transformer_data.ipynb", + "conbine_iobes_file": "70_hf_transformer_data.ipynb", + "IOBES": "70_hf_transformer_data.ipynb", + "clean_output": "70_hf_transformer_data.ipynb", + "ner_model_from": "72_pl_training.ipynb", + "ner_tokenizer_from": "72_pl_training.ipynb", + "NERDataModule": "72_pl_training.ipynb", + "NERModule": "72_pl_training.ipynb", "CudaDevice": "CUDA_GPU_Management.ipynb", "CudaHandler": "CUDA_GPU_Management.ipynb", "MLMVisualizer": "bert_visualize.ipynb", @@ -164,6 +172,8 @@ "category.py", "cosine.py", "thunder/callbacks.py", + "hf/data.py", + "hf/train.py", "ftorch/cuda.py", "bert_visualize.py", "data/nlp.py", diff --git a/forgebox/hf/__init__.py b/forgebox/hf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/forgebox/hf/data.py b/forgebox/hf/data.py new file mode 100644 index 0000000..3287221 --- /dev/null +++ b/forgebox/hf/data.py @@ -0,0 +1,257 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/70_hf_transformer_data.ipynb (unless otherwise specified). + +__all__ = ['convert_iob2_file_to_iobes', 'conbine_iobes_file', 'IOBES', 'clean_output'] + +# Cell +from ..imports import * +from ..category import Category +from typing import List, Dict, Callable, Any, Tuple + +# Cell +def convert_iob2_file_to_iobes(file_path, result_path): + """ + Convert IOB2 file to IOBES + """ + with open(file_path, 'r') as f: + lines = f.readlines() + with open(result_path, 'w') as f: + for line in lines: + line = line.strip() + if line == '': + f.write('\n') + continue + line = line.split() + if line[-1] == 'O': + f.write(' '.join(line) + '\n') + else: + f.write(' '.join(line[:-1]) + ' ' + line[-1] + '\n') + + +def conbine_iobes_file( + file_paths: List[Path], + new_file_path: Path +): + """ + Conbine from multiple IOBES files + into IOBES files + """ + with open(new_file_path, 'w') as new_file: + for file_path in file_paths: + with open(file_path, 'r') as file: + for line in file: + new_file.write(line) + +# Cell +class IOBES(Dataset): + """ + Load iobes file for NER training task + """ + + def __init__( + self, + file_path, + tokenizer, + max_len=128, + save_buffer: int = 15, + category: Category = None, + return_string: bool = False, + use_frag: bool = False, + ): + """ + file_path, + tokenizer, + max_len=128, + save_buffer: int = 15, + category: Category = None, + label categories, if set to None, will be figured out + automatically. 
+ You can set this to None for train dataset, but for valid + dataset: + valid_ds = IOBES(...,category=train_ds.cates) + return_string: bool = False, do we return original string + for tokenizer output, this option is good for debuging + but the data won't pass into cuda if choose so + use_frag: bool = False, do we use prepend like 'I-','B-' + """ + self.file_path = file_path + self.max_len = max_len + self.pairs = [] + self.list_of_words = [] + self.list_of_labels = [] + self.tokenizer = tokenizer + self.cates = category + self.return_string = return_string + self.use_frag = use_frag + self.load_data(save_buffer) + + def load_data(self, save_buffer: int = 15): + """ + Load file in to object structure + """ + with open(self.file_path, 'r') as f: + for line in f: + line = line.strip() + if line: + splited = line.split() + if len(splited) != 2: + continue + word, label = splited + # do we use 'I-', 'B-' etc + if self.use_frag is False: + if "-" in label: + label = label.split('-')[1] + self.pairs.append([word, label]) + + self.pairs = np.array(self.pairs) + + if self.cates is None: + labels_df = pd.DataFrame({"label": self.pairs[:, 1]}) + self.cates = Category(list(labels_df.vc("label").index)) + + self.batching_words(save_buffer) + + def batching_words(self, save_buffer: int = 15): + """ + batching self.words into self.list_of_words + by self.max_len -15 + """ + for i in range(0, len(self.pairs), self.max_len-save_buffer): + chunk_slice = slice(i, i+self.max_len-save_buffer) + self.list_of_words.append(self.pairs[chunk_slice, 0]) + self.list_of_labels.append(self.pairs[chunk_slice, 1]) + + def __len__(self) -> int: + return len(self.list_of_words) + + def __getitem__(self, idx: int) -> Tuple[List[str]]: + return list(self.list_of_words[idx]), list(self.list_of_labels[idx]) + + def __repr__(self): + return f"""NER dataset using IOBES annotation + {len(self)} sentences, + Labels: + {list(self.cates.i2c)} + """ + + def collate_fn(self, data): + """ + data: list of tuple + """ + words, text_labels = zip(*data) + + inputs = self.tokenizer( + list(words), + return_tensors='pt', + padding=True, + truncation=True, + max_length=self.max_len, + is_split_into_words=True, + return_offsets_mapping=True, + add_special_tokens=False, + ) + return self.align_offsets(inputs, text_labels, words) + + def align_offsets( + self, + inputs, + text_labels: List[List[str]], + words: List[List[str]] + ): + """ + inputs: output if tokenizer + text_labels: labels in form of list of list of strings + words: words in form of list of list of strings + """ + labels = torch.zeros_like(inputs.input_ids).long() + labels -= 100 + text_lables_array = np.empty(labels.shape, dtype=object) + words_array = np.empty(labels.shape, dtype=object) + max_len = inputs.input_ids.shape[1] + + for row_id, input_ids in enumerate(inputs.input_ids): + word_pos = inputs.word_ids(row_id) + for idx, pos in enumerate(word_pos): + if pos is None: + continue + if pos <= max_len: + labels[row_id, idx] = self.cates.c2i[text_labels[row_id][pos]] + if self.return_string: + text_lables_array[row_id, + idx] = text_labels[row_id][pos] + words_array[row_id, idx] = words[row_id][pos] + + inputs['labels'] = labels + if self.return_string: + inputs['text_labels'] = text_lables_array.tolist() + inputs['word'] = words_array.tolist() + return inputs + + def dataloader(self, batch_size: int = 32, shuffle: bool = True): + """ + Create dataloader + """ + return DataLoader( + self, + batch_size=batch_size, + shuffle=shuffle, + collate_fn=self.collate_fn, + ) + + def 
one_batch(self, batch_size: int = 32, shuffle: bool = True): + return next(iter(self.dataloader(batch_size, shuffle))) + + def visualize_batch(self, batch, row_idx=0): + return list(zip(self.tokenizer.convert_ids_to_tokens(batch.input_ids[row_idx]), + batch.labels[row_idx].numpy(), + batch.text_labels[row_idx], + batch.word[row_idx], + batch.offset_mapping[row_idx].numpy(), + )) + + def set_hfconfig(self, config): + """ + set the category information to huggingface config + """ + config.num_labels = len(self.cates) + config.id2label = {i: label for i, label in enumerate(self.cates.i2c)} + config.label2id = {label: i for i, label in enumerate(self.cates.i2c)} + + +def clean_output(outputs): + """ + Cleaning output for NER task + """ + results = [] + current = [] + last_idx = 0 + # make to sub group by position + for output in outputs: + if output["index"]-1 == last_idx: + current.append(output) + else: + results.append(current) + current = [output, ] + last_idx = output["index"] + if len(current) > 0: + results.append(current) + + # from tokens to string + strings = [] + for c in results: + tokens = [] + starts = [] + ends = [] + for o in c: + tokens.append(o['word']) + starts.append(o['start']) + ends.append(o['end']) + + new_str = tokenizer.convert_tokens_to_string(tokens) + if new_str != '': + strings.append(dict( + word=new_str, + start=min(starts), + end=max(ends), + entity=c[0]['entity'] + )) + return strings \ No newline at end of file diff --git a/forgebox/hf/train.py b/forgebox/hf/train.py new file mode 100644 index 0000000..0cae4f6 --- /dev/null +++ b/forgebox/hf/train.py @@ -0,0 +1,98 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/72_pl_training.ipynb (unless otherwise specified). + +__all__ = ['ner_model_from', 'ner_tokenizer_from', 'NERDataModule', 'NERModule'] + +# Cell +from .data import IOBES +from ..imports import * +import pytorch_lightning as pl +from transformers import AutoModelForTokenClassification, AutoTokenizer + +# Cell + +# ner model and tokenizer +def ner_model_from( + name:str, dataset: IOBES +): + """ + name: from_pretrain(name) + """ + model = AutoModelForTokenClassification.from_pretrained( + name, + num_labels=len(dataset.cates), + ) + dataset.set_hfconfig(model.config) + return model + +def ner_tokenizer_from( + name: str +): + return AutoTokenizer.from_pretrained( + name, add_prefix_space=True) + +# Cell + +# ner data module +class NERDataModule(pl.LightningDataModule): + def __init__(self, train_ds, val_ds, batch_size=32): + super().__init__() + self.train_ds = train_ds + self.val_ds = val_ds + self.batch_size = batch_size + + def train_dataloader(self): + return self.train_ds.dataloader(batch_size=self.batch_size, shuffle=True) + + def val_dataloader(self): + return self.val_ds.dataloader(batch_size=self.batch_size*2, shuffle=False) + +# Cell + +# ner module +class NERModule(pl.LightningModule): + """ + PyTorch lightning module for training ner model + """ + def __init__( + self, model, + ): + """ + model: huggingface transformer model for ner + """ + super().__init__() + self.model = model + + def forward(self, batch): + return self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + def training_step(self, batch, batch_idx): + outputs = self(batch) + loss = outputs.loss + self.log("loss", loss) + self.log("acc", self.calcualte_acc(outputs, batch.labels)) + return loss + + def validation_step(self, batch, batch_idx): + outputs = self(batch) + loss = outputs.loss + self.log("val_loss", loss) 
+ self.log("val_acc", self.calcualte_acc(outputs, batch.labels)) + return loss + + def calcualte_acc(self, outputs, labels): + pred_idx = outputs.logits.argmax(-1) + mask = torch.ones_like(pred_idx) + mask[labels==-100]=False + return (pred_idx[mask]==labels[mask]).float().mean() + + def configure_optimizers(self): + # discriminative learning rate + param_groups = [ + {'params': self.model.roberta.parameters(), 'lr': 5e-6}, + {'params': self.model.classifier.parameters(), 'lr': 1e-3}, + ] + optimizer = torch.optim.Adam(param_groups, lr=1e-3) + return optimizer \ No newline at end of file diff --git a/nbs/70_hf_transformer_data.ipynb b/nbs/70_hf_transformer_data.ipynb new file mode 100644 index 0000000..fac2f7d --- /dev/null +++ b/nbs/70_hf_transformer_data.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data parts for hf transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# default_exp hf.data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "from forgebox.imports import *\n", + "from forgebox.category import Category\n", + "from typing import List, Dict, Callable, Any, Tuple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process IOBES files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "def convert_iob2_file_to_iobes(file_path, result_path):\n", + " \"\"\"\n", + " Convert IOB2 file to IOBES\n", + " \"\"\"\n", + " with open(file_path, 'r') as f:\n", + " lines = f.readlines()\n", + " with open(result_path, 'w') as f:\n", + " for line in lines:\n", + " line = line.strip()\n", + " if line == '':\n", + " f.write('\\n')\n", + " continue\n", + " line = line.split()\n", + " if line[-1] == 'O':\n", + " f.write(' '.join(line) + '\\n')\n", + " else:\n", + " f.write(' '.join(line[:-1]) + ' ' + line[-1] + '\\n')\n", + "\n", + "\n", + "def conbine_iobes_file(\n", + " file_paths: List[Path],\n", + " new_file_path: Path\n", + "):\n", + " \"\"\"\n", + " Conbine from multiple IOBES files\n", + " into IOBES files\n", + " \"\"\"\n", + " with open(new_file_path, 'w') as new_file:\n", + " for file_path in file_paths:\n", + " with open(file_path, 'r') as file:\n", + " for line in file:\n", + " new_file.write(line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "class IOBES(Dataset):\n", + " \"\"\"\n", + " Load iobes file for NER training task\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " file_path,\n", + " tokenizer,\n", + " max_len=128,\n", + " save_buffer: int = 15,\n", + " category: Category = None,\n", + " return_string: bool = False,\n", + " use_frag: bool = False,\n", + " ):\n", + " \"\"\"\n", + " file_path,\n", + " tokenizer,\n", + " max_len=128,\n", + " save_buffer: int = 15,\n", + " category: Category = None,\n", + " label categories, if set to None, will be figured out\n", + " automatically.\n", + " You can set this to None for train dataset, but for valid\n", + " dataset:\n", + " valid_ds = IOBES(...,category=train_ds.cates)\n", + " return_string: bool = False, do we return original string\n", + " for tokenizer output, this option is good for debuging\n", + " but the data won't pass into 
cuda if choose so\n", + " use_frag: bool = False, do we use prepend like 'I-','B-'\n", + " \"\"\"\n", + " self.file_path = file_path\n", + " self.max_len = max_len\n", + " self.pairs = []\n", + " self.list_of_words = []\n", + " self.list_of_labels = []\n", + " self.tokenizer = tokenizer\n", + " self.cates = category\n", + " self.return_string = return_string\n", + " self.use_frag = use_frag\n", + " self.load_data(save_buffer)\n", + "\n", + " def load_data(self, save_buffer: int = 15):\n", + " \"\"\"\n", + " Load file in to object structure\n", + " \"\"\"\n", + " with open(self.file_path, 'r') as f:\n", + " for line in f:\n", + " line = line.strip()\n", + " if line:\n", + " splited = line.split()\n", + " if len(splited) != 2:\n", + " continue\n", + " word, label = splited\n", + " # do we use 'I-', 'B-' etc\n", + " if self.use_frag is False:\n", + " if \"-\" in label:\n", + " label = label.split('-')[1]\n", + " self.pairs.append([word, label])\n", + "\n", + " self.pairs = np.array(self.pairs)\n", + "\n", + " if self.cates is None:\n", + " labels_df = pd.DataFrame({\"label\": self.pairs[:, 1]})\n", + " self.cates = Category(list(labels_df.vc(\"label\").index))\n", + "\n", + " self.batching_words(save_buffer)\n", + "\n", + " def batching_words(self, save_buffer: int = 15):\n", + " \"\"\"\n", + " batching self.words into self.list_of_words\n", + " by self.max_len -15\n", + " \"\"\"\n", + " for i in range(0, len(self.pairs), self.max_len-save_buffer):\n", + " chunk_slice = slice(i, i+self.max_len-save_buffer)\n", + " self.list_of_words.append(self.pairs[chunk_slice, 0])\n", + " self.list_of_labels.append(self.pairs[chunk_slice, 1])\n", + "\n", + " def __len__(self) -> int:\n", + " return len(self.list_of_words)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[List[str]]:\n", + " return list(self.list_of_words[idx]), list(self.list_of_labels[idx])\n", + "\n", + " def __repr__(self):\n", + " return f\"\"\"NER dataset using IOBES annotation\n", + " {len(self)} sentences,\n", + " Labels:\n", + " {list(self.cates.i2c)}\n", + " \"\"\"\n", + "\n", + " def collate_fn(self, data):\n", + " \"\"\"\n", + " data: list of tuple\n", + " \"\"\"\n", + " words, text_labels = zip(*data)\n", + "\n", + " inputs = self.tokenizer(\n", + " list(words),\n", + " return_tensors='pt',\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=self.max_len,\n", + " is_split_into_words=True,\n", + " return_offsets_mapping=True,\n", + " add_special_tokens=False,\n", + " )\n", + " return self.align_offsets(inputs, text_labels, words)\n", + "\n", + " def align_offsets(\n", + " self,\n", + " inputs,\n", + " text_labels: List[List[str]],\n", + " words: List[List[str]]\n", + " ):\n", + " \"\"\"\n", + " inputs: output if tokenizer\n", + " text_labels: labels in form of list of list of strings\n", + " words: words in form of list of list of strings\n", + " \"\"\"\n", + " labels = torch.zeros_like(inputs.input_ids).long()\n", + " labels -= 100\n", + " text_lables_array = np.empty(labels.shape, dtype=object)\n", + " words_array = np.empty(labels.shape, dtype=object)\n", + " max_len = inputs.input_ids.shape[1]\n", + "\n", + " for row_id, input_ids in enumerate(inputs.input_ids):\n", + " word_pos = inputs.word_ids(row_id)\n", + " for idx, pos in enumerate(word_pos):\n", + " if pos is None:\n", + " continue\n", + " if pos <= max_len:\n", + " labels[row_id, idx] = self.cates.c2i[text_labels[row_id][pos]]\n", + " if self.return_string:\n", + " text_lables_array[row_id,\n", + " idx] = text_labels[row_id][pos]\n", + " 
words_array[row_id, idx] = words[row_id][pos]\n", + "\n", + " inputs['labels'] = labels\n", + " if self.return_string:\n", + " inputs['text_labels'] = text_lables_array.tolist()\n", + " inputs['word'] = words_array.tolist()\n", + " return inputs\n", + "\n", + " def dataloader(self, batch_size: int = 32, shuffle: bool = True):\n", + " \"\"\"\n", + " Create dataloader\n", + " \"\"\"\n", + " return DataLoader(\n", + " self,\n", + " batch_size=batch_size,\n", + " shuffle=shuffle,\n", + " collate_fn=self.collate_fn,\n", + " )\n", + "\n", + " def one_batch(self, batch_size: int = 32, shuffle: bool = True):\n", + " return next(iter(self.dataloader(batch_size, shuffle)))\n", + "\n", + " def visualize_batch(self, batch, row_idx=0):\n", + " return list(zip(self.tokenizer.convert_ids_to_tokens(batch.input_ids[row_idx]),\n", + " batch.labels[row_idx].numpy(),\n", + " batch.text_labels[row_idx],\n", + " batch.word[row_idx],\n", + " batch.offset_mapping[row_idx].numpy(),\n", + " ))\n", + "\n", + " def set_hfconfig(self, config):\n", + " \"\"\"\n", + " set the category information to huggingface config\n", + " \"\"\"\n", + " config.num_labels = len(self.cates)\n", + " config.id2label = {i: label for i, label in enumerate(self.cates.i2c)}\n", + " config.label2id = {label: i for i, label in enumerate(self.cates.i2c)}\n", + "\n", + "\n", + "def clean_output(outputs):\n", + " \"\"\"\n", + " Cleaning output for NER task\n", + " \"\"\"\n", + " results = []\n", + " current = []\n", + " last_idx = 0\n", + " # make to sub group by position\n", + " for output in outputs:\n", + " if output[\"index\"]-1 == last_idx:\n", + " current.append(output)\n", + " else:\n", + " results.append(current)\n", + " current = [output, ]\n", + " last_idx = output[\"index\"]\n", + " if len(current) > 0:\n", + " results.append(current)\n", + "\n", + " # from tokens to string\n", + " strings = []\n", + " for c in results:\n", + " tokens = []\n", + " starts = []\n", + " ends = []\n", + " for o in c:\n", + " tokens.append(o['word'])\n", + " starts.append(o['start'])\n", + " ends.append(o['end'])\n", + "\n", + " new_str = tokenizer.convert_tokens_to_string(tokens)\n", + " if new_str != '':\n", + " strings.append(dict(\n", + " word=new_str,\n", + " start=min(starts),\n", + " end=max(ends),\n", + " entity=c[0]['entity']\n", + " ))\n", + " return strings" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"raynardj/roberta-pubmed\", add_prefix_space=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = IOBES(\"/Users/xiaochen.zhang/data/valid.iobes\", tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "in-O\n", + "blood-O\n", + ";-O\n", + "content-O\n", + "of-O\n", + "cAMP-O\n", + "was-O\n", + "also-O\n", + "decreased-O\n", + "in-O\n", + "lymphocytes-O\n", + "by-O\n", + "33-O\n", + "%-O\n", + ".-O\n", + "At-O\n", + "the-O\n", + "same-O\n", + "time-O\n", + ",-O\n", + "total-O\n", + "content-O\n", + "of-O\n", + "T-cell_type\n", + "lymphocytes-cell_type\n", + "was-O\n", + "decreased-O\n", + "1.5-fold-O\n", + "in-O\n", + "peripheric-O\n", + "blood-O\n", + ".-O\n", + "Treatment-O\n", + "with-O\n", + 
"I-hydroxyvitamin-O\n", + "D3-O\n", + "(-O\n", + "1-1.5-O\n", + "mg-O\n", + "daily-O\n", + ",-O\n", + "within-O\n", + "4-O\n", + "weeks-O\n", + ")-O\n", + "led-O\n", + "to-O\n", + "normalization-O\n", + "of-O\n", + "total-O\n", + "and-O\n", + "ionized-O\n", + "form-O\n", + "of-O\n", + "Ca2+-O\n", + "and-O\n", + "of-O\n", + "25-O\n", + "(-O\n", + "OH-O\n", + ")-O\n", + "D-O\n", + ",-O\n", + "but-O\n", + "did-O\n", + "not-O\n", + "affect-O\n", + "the-O\n", + "PTH-O\n", + "content-O\n", + "in-O\n", + "blood-O\n", + ".-O\n", + "Concentration-O\n", + "of-O\n", + "the-O\n", + "receptors-protein\n", + "to-O\n", + "1.25-O\n", + "(-O\n", + "OH-O\n", + ")-O\n", + "2D3-O\n", + "was-O\n", + "elevated-O\n", + "up-O\n", + "to-O\n", + "39.7-O\n", + "fmole/mg-O\n", + "after-O\n", + "I-O\n", + "week-O\n", + "of-O\n", + "the-O\n", + "treatment-O\n", + ",-O\n", + "whereas-O\n", + "it-O\n", + "was-O\n", + "decreased-O\n", + "to-O\n", + "the-O\n", + "initial-O\n", + "level-O\n", + "24.8-O\n", + "fmole/mg-O\n", + "within-O\n", + "4-O\n", + "weeks-O\n", + ";-O\n", + "simultaneous-O\n", + "alteration-O\n", + "in-O\n" + ] + } + ], + "source": [ + "for w,l in zip(*dataset[2]):\n", + " print(f\"{w}-{l}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[ 19, 3741, 2603, ..., 1417, 2617, 11576],\n", + " [ 4590, 2156, 255, ..., 405, 1182, 6608],\n", + " [ 6214, 25683, 3809, ..., 11, 5, 8151],\n", + " ...,\n", + " [13998, 25326, 2413, ..., 5, 2199, 21],\n", + " [11299, 705, 24811, ..., 134, 1589, 2032],\n", + " [ 5804, 924, 14, ..., 366, 1168, 9]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1],\n", + " [1, 1, 1, ..., 1, 1, 1],\n", + " [1, 1, 1, ..., 1, 1, 1],\n", + " ...,\n", + " [1, 1, 1, ..., 1, 1, 1],\n", + " [1, 1, 1, ..., 1, 1, 1],\n", + " [1, 1, 1, ..., 1, 1, 1]]), 'offset_mapping': tensor([[[ 1, 4],\n", + " [ 1, 2],\n", + " [ 2, 5],\n", + " ...,\n", + " [ 3, 5],\n", + " [ 5, 8],\n", + " [ 1, 6]],\n", + "\n", + " [[ 1, 5],\n", + " [ 1, 1],\n", + " [ 1, 1],\n", + " ...,\n", + " [ 5, 7],\n", + " [ 7, 9],\n", + " [ 9, 14]],\n", + "\n", + " [[ 1, 5],\n", + " [ 5, 8],\n", + " [ 8, 10],\n", + " ...,\n", + " [ 1, 2],\n", + " [ 1, 3],\n", + " [ 1, 10]],\n", + "\n", + " ...,\n", + "\n", + " [[ 1, 5],\n", + " [ 5, 8],\n", + " [ 8, 10],\n", + " ...,\n", + " [ 1, 3],\n", + " [ 1, 7],\n", + " [ 1, 3]],\n", + "\n", + " [[ 1, 5],\n", + " [ 5, 6],\n", + " [ 6, 10],\n", + " ...,\n", + " [ 2, 3],\n", + " [ 1, 1],\n", + " [ 1, 2]],\n", + "\n", + " [[ 1, 7],\n", + " [ 1, 5],\n", + " [ 1, 4],\n", + " ...,\n", + " [ 3, 5],\n", + " [ 5, 7],\n", + " [ 1, 2]]]), 'labels': tensor([[0, 1, 1, ..., 0, 0, 0],\n", + " [2, 0, 2, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [1, 1, 1, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 2, 0, 2],\n", + " [0, 0, 0, ..., 0, 0, 0]])}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.one_batch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + 
"nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/72_pl_training.ipynb b/nbs/72_pl_training.ipynb new file mode 100644 index 0000000..82a4985 --- /dev/null +++ b/nbs/72_pl_training.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pytorch Lighting training\n", + "> on huggingface transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# default_exp hf.train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "from forgebox.hf.data import IOBES\n", + "from forgebox.imports import *\n", + "import pytorch_lightning as pl\n", + "from transformers import AutoModelForTokenClassification, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install transformers==4.9.1\n", + "# !pip install pytorch-lightning==1.3.8\n", + "# !pip install tensorflow==2.2.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load model and tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "\n", + "# ner model and tokenizer\n", + "def ner_model_from(\n", + " name:str, dataset: IOBES\n", + "):\n", + " \"\"\"\n", + " name: from_pretrain(name)\n", + " \"\"\"\n", + " model = AutoModelForTokenClassification.from_pretrained(\n", + " name,\n", + " num_labels=len(dataset.cates),\n", + " )\n", + " dataset.set_hfconfig(model.config)\n", + " return model\n", + "\n", + "def ner_tokenizer_from(\n", + " name: str\n", + "):\n", + " return AutoTokenizer.from_pretrained(\n", + " name, add_prefix_space=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lightning data module" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "\n", + "# ner data module\n", + "class NERDataModule(pl.LightningDataModule):\n", + " def __init__(self, train_ds, val_ds, batch_size=32):\n", + " super().__init__()\n", + " self.train_ds = train_ds\n", + " self.val_ds = val_ds\n", + " self.batch_size = batch_size\n", + "\n", + " def train_dataloader(self):\n", + " return self.train_ds.dataloader(batch_size=self.batch_size, shuffle=True)\n", + "\n", + " def val_dataloader(self):\n", + " return self.val_ds.dataloader(batch_size=self.batch_size*2, shuffle=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# export\n", + "\n", + "# ner module\n", + "class NERModule(pl.LightningModule):\n", + " \"\"\"\n", + " PyTorch lightning module for training ner model\n", + " \"\"\"\n", + " def __init__(\n", + " self, model,\n", + " ):\n", + " \"\"\"\n", + " model: huggingface transformer model for ner\n", + " \"\"\"\n", + " super().__init__()\n", + " self.model = model\n", + "\n", + " def forward(self, batch):\n", + " return self.model(\n", + " input_ids=batch['input_ids'],\n", + " attention_mask=batch['attention_mask'],\n", + " labels=batch['labels'])\n", + " \n", + " def training_step(self, batch, batch_idx):\n", + " outputs = 
self(batch)\n", + " loss = outputs.loss\n", + " self.log(\"loss\", loss)\n", + " self.log(\"acc\", self.calcualte_acc(outputs, batch.labels))\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " outputs = self(batch)\n", + " loss = outputs.loss\n", + " self.log(\"val_loss\", loss)\n", + " self.log(\"val_acc\", self.calcualte_acc(outputs, batch.labels))\n", + " return loss\n", + " \n", + " def calcualte_acc(self, outputs, labels):\n", + " pred_idx = outputs.logits.argmax(-1)\n", + " mask = torch.ones_like(pred_idx)\n", + " mask[labels==-100]=False\n", + " return (pred_idx[mask]==labels[mask]).float().mean()\n", + " \n", + " def configure_optimizers(self):\n", + " # discriminative learning rate\n", + " param_groups = [\n", + " {'params': self.model.roberta.parameters(), 'lr': 5e-6},\n", + " {'params': self.model.classifier.parameters(), 'lr': 1e-3},\n", + " ]\n", + " optimizer = torch.optim.Adam(param_groups, lr=1e-3)\n", + " return optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/settings.ini b/settings.ini index d6fe5ad..bcdd2b7 100644 --- a/settings.ini +++ b/settings.ini @@ -7,7 +7,7 @@ author = xiaochen(ray) zhang author_email = b2ray2c@gmail.com copyright = xiaochen(ray) zhang branch = master -version = 0.4.7 +version = 0.4.9 min_python = 3.6 audience = Developers language = English