From 5dfaa4ab9372e0b8ff2673acfd1bd5446a0bd935 Mon Sep 17 00:00:00 2001 From: Weitang Liu <1436496575@qq.com> Date: Sat, 4 Mar 2023 15:34:35 +0800 Subject: [PATCH] update version_0.8.1 update --- .DS_Store | Bin 6148 -> 10244 bytes LICENSE | 0 README.md | 15 +- .../classification => docs}/__init__.py | 0 docs/apex_install | 4 + examples/README.md | 52 - examples/ccks_kfold_split.py | 48 + examples/cner_kfold_split.py | 57 + examples/task_pretrain_ccks.py | 329 +++++ ...uence_labeling_cner_beam_search_softmax.py | 220 +++ examples/task_sequence_labeling_cner_crf.py | 201 --- ...k_sequence_labeling_cner_global_pointer.py | 222 --- .../task_sequence_labeling_cner_softmax.py | 199 --- examples/task_sequence_labeling_cner_span.py | 228 --- ...nce_labeling_resume_beam_search_softmax.py | 215 +++ .../task_sequence_labeling_resume_biaffine.py | 234 ++++ examples/task_sequence_labeling_resume_crf.py | 209 +++ ...sequence_labeling_resume_global_pointer.py | 242 ++++ .../task_sequence_labeling_resume_span.py | 236 ++++ ..._labeling_resume_token_mask_aug_softmax.py | 221 +++ ...uence_labeling_resume_token_mdp_softmax.py | 218 +++ ..._sequence_labeling_resume_token_softmax.py | 201 +++ examples/task_text_classification_cola.py | 150 +- .../task_text_classification_cola_adan.py | 213 +++ examples/task_text_classification_wsc.py | 203 +++ examples/task_text_classify_fewshot_pet.py | 237 ++++ .../task_text_classify_fewshot_ptuning.py | 240 ++++ examples/task_text_classify_tnews.py | 156 +++ examples/task_text_match_afqmc.py | 156 +++ examples/task_text_match_cmnli.py | 164 +++ examples/task_text_match_csl.py | 157 +++ examples/task_text_match_ocnli.py | 162 +++ examples/task_text_similarity_ccks2021.py | 187 +++ examples/task_text_similarity_lcqmc.py | 162 +++ pyproject.toml | 20 + requirements.txt | 7 +- scripts/run_task_fewshot_pet.sh | 35 + scripts/run_task_fewshot_ptuning.sh | 35 + scripts/run_task_pretrain_ccks.sh | 35 + ...uence_labeling_cner_beam_search_softmax.sh | 39 + ...nce_labeling_resume_beam_search_softmax.sh | 38 + ...task_sequence_labeling_resume_biaffine.sh} | 27 +- ... 
run_task_sequence_labeling_resume_crf.sh} | 20 +- ...sequence_labeling_resume_global_pointer.sh | 32 + ...run_task_sequence_labeling_resume_span.sh} | 28 +- ...ence_labeling_resume_token_mdp_softmax.sh} | 22 +- scripts/run_task_text_classification_cola.sh | 13 +- .../run_task_text_classification_cola_adan.sh | 42 + .../run_task_text_classification_cola_awp.sh | 44 + .../run_task_text_classification_cola_fgm.sh | 12 +- .../run_task_text_classification_cola_pgd.sh | 37 + ...run_task_text_classification_cola_rdrop.sh | 40 + scripts/run_task_text_classification_wsc.sh | 41 + scripts/run_task_text_match_afqmc.sh | 40 + scripts/run_task_text_match_cmnli.sh | 40 + scripts/run_task_text_match_csl.sh | 40 + scripts/run_task_text_match_ocnli.sh | 39 + scripts/run_task_text_similarity_ccks2021.sh | 37 + scripts/run_task_text_similarity_lcqmc.sh | 35 + setup.py | 21 +- src/.DS_Store | Bin 0 -> 6148 bytes {torchblocks/models => src}/__init__.py | 0 src/torchblocks/__init__.py | 5 + .../torchblocks}/callback/__init__.py | 6 +- .../torchblocks/callback/attacks}/__init__.py | 0 .../callback/attacks/attack_base.py | 18 + src/torchblocks/callback/attacks/awp.py | 83 ++ src/torchblocks/callback/attacks/fgm.py | 26 + src/torchblocks/callback/attacks/pgd.py | 49 + .../torchblocks}/callback/early_stopping.py | 4 +- src/torchblocks/callback/ema.py | 34 + .../torchblocks}/callback/file_writer.py | 16 +- .../torchblocks}/callback/model_checkpoint.py | 22 +- .../torchblocks}/callback/progressbar.py | 8 +- src/torchblocks/callback/swa.py | 29 + src/torchblocks/core/__init__.py | 2 + src/torchblocks/core/application.py | 13 + src/torchblocks/core/train_base.py | 890 ++++++++++++ .../torchblocks}/data/Vocabulary.py | 2 +- src/torchblocks/data/__init__.py | 4 + src/torchblocks/data/dataset_builder.py | 129 ++ .../torchblocks}/data/embedding.py | 0 src/torchblocks/data/ngram.py | 12 + src/torchblocks/data/samplers.py | 93 ++ src/torchblocks/data/splits/__init__.py | 2 + src/torchblocks/data/splits/ml_stratifiers.py | 369 +++++ src/torchblocks/data/splits/seq_splits.py | 74 + src/torchblocks/data/token_text_mapping.py | 88 ++ .../torchblocks}/losses/__init__.py | 4 +- .../losses/aslsinglelabel_loss.py | 0 .../torchblocks}/losses/asymmetric_loss.py | 0 .../torchblocks}/losses/cross_entropy.py | 22 +- .../torchblocks}/losses/focal_loss.py | 14 +- .../torchblocks}/losses/hard_mining.py | 0 .../torchblocks}/losses/kl_divergence.py | 14 + .../torchblocks}/losses/label_smoothing.py | 0 src/torchblocks/losses/poly_loss.py | 116 ++ .../torchblocks}/losses/span_loss.py | 0 .../torchblocks}/losses/symmetric_loss.py | 5 +- .../torchblocks}/losses/triplet_loss.py | 0 .../torchblocks}/metrics/__init__.py | 1 - .../torchblocks}/metrics/base.py | 2 +- .../metrics/classification}/__init__.py | 0 .../metrics/classification/accuracy.py | 56 + src/torchblocks/metrics/classification/auc.py | 51 + .../metrics/classification/f1_score.py | 52 + .../classification/matthews_corrcoef.py | 51 + .../metrics/sequence_labeling/__init__.py | 0 .../precision_recall_fscore.py | 12 +- .../metrics/sequence_labeling/seqTag_score.py | 10 +- .../metrics/sequence_labeling/util.py | 0 src/torchblocks/models/__init__.py | 2 + .../torchblocks}/models/configuration_base.py | 4 +- .../torchblocks}/models/model_base.py | 9 +- src/torchblocks/models/nezha/__init__.py | 2 + .../models/nezha/configuration_nezha.py | 124 ++ .../models/nezha/modeling_nezha.py | 1236 +++++++++++++++++ src/torchblocks/models/utils.py | 103 ++ src/torchblocks/modules/__init__.py | 6 + 
.../torchblocks/modules}/activations.py | 0 .../torchblocks/modules}/attentions.py | 2 +- src/torchblocks/modules/biaffine.py | 42 + .../torchblocks/modules}/capsule.py | 0 .../torchblocks/modules}/conv.py | 49 +- .../layers => src/torchblocks/modules}/crf.py | 1 - src/torchblocks/modules/dropouts.py | 112 ++ .../torchblocks/modules}/embeddings.py | 0 src/torchblocks/modules/gate.py | 46 + src/torchblocks/modules/global_pointer.py | 77 + .../torchblocks/modules}/layer_norm.py | 31 +- .../torchblocks/modules}/linears.py | 4 + .../torchblocks/modules}/mixout.py | 0 .../layers => src/torchblocks/modules}/mlp.py | 0 .../torchblocks/modules}/pooling.py | 38 +- .../torchblocks/modules}/position.py | 34 + src/torchblocks/modules/rnn.py | 66 + .../torchblocks/modules}/utils.py | 2 + .../torchblocks}/optims/__init__.py | 1 + .../torchblocks}/optims/adabelief.py | 0 .../torchblocks}/optims/adabound.py | 0 .../torchblocks}/optims/adafactor.py | 2 +- src/torchblocks/optims/adai.py | 115 ++ src/torchblocks/optims/adaiw.py | 117 ++ .../torchblocks}/optims/adamod.py | 0 .../torchblocks}/optims/adamp.py | 0 .../torchblocks}/optims/adamw.py | 4 +- src/torchblocks/optims/adan.py | 154 ++ .../torchblocks}/optims/adax.py | 0 .../torchblocks}/optims/lamb.py | 0 .../torchblocks}/optims/lars.py | 0 .../torchblocks}/optims/lookahead.py | 0 .../torchblocks}/optims/lr_scheduler.py | 19 +- .../torchblocks}/optims/nadam.py | 0 .../torchblocks}/optims/novograd.py | 0 .../torchblocks}/optims/planradam.py | 0 .../torchblocks}/optims/priorwd.py | 0 .../torchblocks}/optims/radam.py | 0 .../torchblocks}/optims/ralamb.py | 0 .../torchblocks}/optims/ralars.py | 0 .../torchblocks}/optims/ranger_adabelief.py | 8 +- .../torchblocks}/optims/sgdp.py | 0 .../torchblocks}/optims/sgdw.py | 0 .../torchblocks}/optims/shampoo.py | 0 src/torchblocks/tasks/__init__.py | 1 + src/torchblocks/tasks/sequence_tags.py | 296 ++++ src/torchblocks/utils/__init__.py | 13 + src/torchblocks/utils/chinese_utils.py | 32 + src/torchblocks/utils/ckpt_utils.py | 57 + src/torchblocks/utils/common_utils.py | 80 ++ .../torchblocks}/utils/device.py | 47 +- src/torchblocks/utils/import_utils.py | 23 + src/torchblocks/utils/io_utils.py | 113 ++ .../torchblocks}/utils/logger.py | 23 +- .../torchblocks}/utils/meter.py | 6 +- src/torchblocks/utils/npy_utils.py | 11 + src/torchblocks/utils/options.py | 300 ++++ .../torchblocks}/utils/seed.py | 9 +- .../torchblocks/utils/tensor_utils.py | 74 +- src/torchblocks/utils/visual_utils.py | 55 + src/torchblocks/version.py | 1 + tests/__init__.py | 1 + torchblocks/__init__.py | 1 - torchblocks/callback/adversarial/awp.py | 64 - torchblocks/callback/adversarial/fgm.py | 41 - torchblocks/callback/adversarial/pgd.py | 49 - torchblocks/callback/ema.py | 51 - torchblocks/callback/swa.py | 93 -- torchblocks/core/__init__.py | 3 - torchblocks/core/classification_trainer.py | 13 - torchblocks/core/sequence_labeling_trainer.py | 14 - torchblocks/core/trainer_base.py | 705 ---------- torchblocks/core/utils.py | 3 - torchblocks/data/__init__.py | 4 - torchblocks/data/dataset.py | 118 -- torchblocks/data/process_base.py | 5 - torchblocks/layers/char.py | 80 -- torchblocks/layers/dropouts.py | 48 - torchblocks/layers/gate.py | 29 - .../metrics/classification/accuracy.py | 44 - torchblocks/metrics/classification/auc.py | 30 - .../metrics/classification/f1_score.py | 40 - .../classification/matthews_corrcoef.py | 26 - .../metrics/sequence_labeling/scheme.py | 95 -- torchblocks/models/utils.py | 64 - 
torchblocks/tasks/sequence_classification.py | 97 -- torchblocks/tasks/sequence_labeling_crf.py | 68 - .../tasks/sequence_labeling_global_pointer.py | 91 -- .../tasks/sequence_labeling_softmax.py | 69 - torchblocks/tasks/sequence_labeling_span.py | 138 -- torchblocks/tasks/siamese_classification.py | 48 - torchblocks/utils/__init__.py | 8 - torchblocks/utils/common.py | 3 - torchblocks/utils/options.py | 230 --- torchblocks/utils/paths.py | 149 -- torchblocks/utils/versions.py | 120 -- torchblocks/version.py | 13 - 216 files changed, 11125 insertions(+), 3826 deletions(-) mode change 100755 => 100644 LICENSE mode change 100755 => 100644 README.md rename {torchblocks/metrics/classification => docs}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 docs/apex_install delete mode 100755 examples/README.md create mode 100644 examples/ccks_kfold_split.py create mode 100644 examples/cner_kfold_split.py create mode 100644 examples/task_pretrain_ccks.py create mode 100644 examples/task_sequence_labeling_cner_beam_search_softmax.py delete mode 100755 examples/task_sequence_labeling_cner_crf.py delete mode 100755 examples/task_sequence_labeling_cner_global_pointer.py delete mode 100755 examples/task_sequence_labeling_cner_softmax.py delete mode 100755 examples/task_sequence_labeling_cner_span.py create mode 100644 examples/task_sequence_labeling_resume_beam_search_softmax.py create mode 100644 examples/task_sequence_labeling_resume_biaffine.py create mode 100644 examples/task_sequence_labeling_resume_crf.py create mode 100644 examples/task_sequence_labeling_resume_global_pointer.py create mode 100644 examples/task_sequence_labeling_resume_span.py create mode 100644 examples/task_sequence_labeling_resume_token_mask_aug_softmax.py create mode 100644 examples/task_sequence_labeling_resume_token_mdp_softmax.py create mode 100644 examples/task_sequence_labeling_resume_token_softmax.py mode change 100755 => 100644 examples/task_text_classification_cola.py create mode 100644 examples/task_text_classification_cola_adan.py create mode 100644 examples/task_text_classification_wsc.py create mode 100644 examples/task_text_classify_fewshot_pet.py create mode 100644 examples/task_text_classify_fewshot_ptuning.py create mode 100644 examples/task_text_classify_tnews.py create mode 100644 examples/task_text_match_afqmc.py create mode 100644 examples/task_text_match_cmnli.py create mode 100644 examples/task_text_match_csl.py create mode 100644 examples/task_text_match_ocnli.py create mode 100644 examples/task_text_similarity_ccks2021.py create mode 100644 examples/task_text_similarity_lcqmc.py create mode 100644 pyproject.toml mode change 100755 => 100644 requirements.txt create mode 100644 scripts/run_task_fewshot_pet.sh create mode 100644 scripts/run_task_fewshot_ptuning.sh create mode 100644 scripts/run_task_pretrain_ccks.sh create mode 100644 scripts/run_task_sequence_labeling_cner_beam_search_softmax.sh create mode 100644 scripts/run_task_sequence_labeling_resume_beam_search_softmax.sh rename scripts/{run_task_sequence_labeling_cner_global_pointer.sh => run_task_sequence_labeling_resume_biaffine.sh} (59%) mode change 100755 => 100644 rename scripts/{run_task_sequence_labeling_cner_crf.sh => run_task_sequence_labeling_resume_crf.sh} (71%) mode change 100755 => 100644 create mode 100644 scripts/run_task_sequence_labeling_resume_global_pointer.sh rename scripts/{run_task_sequence_labeling_cner_span.sh => run_task_sequence_labeling_resume_span.sh} (65%) mode change 100755 => 100644 rename 
scripts/{run_task_sequence_labeling_cner_softmax.sh => run_task_sequence_labeling_resume_token_mdp_softmax.sh} (70%) mode change 100755 => 100644 mode change 100755 => 100644 scripts/run_task_text_classification_cola.sh create mode 100644 scripts/run_task_text_classification_cola_adan.sh create mode 100644 scripts/run_task_text_classification_cola_awp.sh mode change 100755 => 100644 scripts/run_task_text_classification_cola_fgm.sh create mode 100644 scripts/run_task_text_classification_cola_pgd.sh create mode 100644 scripts/run_task_text_classification_cola_rdrop.sh create mode 100644 scripts/run_task_text_classification_wsc.sh create mode 100644 scripts/run_task_text_match_afqmc.sh create mode 100644 scripts/run_task_text_match_cmnli.sh create mode 100644 scripts/run_task_text_match_csl.sh create mode 100644 scripts/run_task_text_match_ocnli.sh create mode 100644 scripts/run_task_text_similarity_ccks2021.sh create mode 100644 scripts/run_task_text_similarity_lcqmc.sh mode change 100755 => 100644 setup.py create mode 100644 src/.DS_Store rename {torchblocks/models => src}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/__init__.py rename {torchblocks => src/torchblocks}/callback/__init__.py (83%) mode change 100755 => 100644 rename {torchblocks/callback/adversarial => src/torchblocks/callback/attacks}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/callback/attacks/attack_base.py create mode 100644 src/torchblocks/callback/attacks/awp.py create mode 100644 src/torchblocks/callback/attacks/fgm.py create mode 100644 src/torchblocks/callback/attacks/pgd.py rename {torchblocks => src/torchblocks}/callback/early_stopping.py (95%) mode change 100755 => 100644 create mode 100644 src/torchblocks/callback/ema.py rename {torchblocks => src/torchblocks}/callback/file_writer.py (86%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/callback/model_checkpoint.py (86%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/callback/progressbar.py (91%) mode change 100755 => 100644 create mode 100644 src/torchblocks/callback/swa.py create mode 100644 src/torchblocks/core/__init__.py create mode 100644 src/torchblocks/core/application.py create mode 100644 src/torchblocks/core/train_base.py rename {torchblocks => src/torchblocks}/data/Vocabulary.py (96%) mode change 100755 => 100644 create mode 100644 src/torchblocks/data/__init__.py create mode 100644 src/torchblocks/data/dataset_builder.py rename {torchblocks => src/torchblocks}/data/embedding.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/data/ngram.py create mode 100644 src/torchblocks/data/samplers.py create mode 100644 src/torchblocks/data/splits/__init__.py create mode 100644 src/torchblocks/data/splits/ml_stratifiers.py create mode 100644 src/torchblocks/data/splits/seq_splits.py create mode 100644 src/torchblocks/data/token_text_mapping.py rename {torchblocks => src/torchblocks}/losses/__init__.py (75%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/aslsinglelabel_loss.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/asymmetric_loss.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/cross_entropy.py (56%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/focal_loss.py (88%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/hard_mining.py (100%) mode change 100755 => 
100644 rename {torchblocks => src/torchblocks}/losses/kl_divergence.py (64%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/label_smoothing.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/losses/poly_loss.py rename {torchblocks => src/torchblocks}/losses/span_loss.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/symmetric_loss.py (94%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/losses/triplet_loss.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/metrics/__init__.py (88%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/metrics/base.py (95%) mode change 100755 => 100644 rename {torchblocks/tasks => src/torchblocks/metrics/classification}/__init__.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/metrics/classification/accuracy.py create mode 100644 src/torchblocks/metrics/classification/auc.py create mode 100644 src/torchblocks/metrics/classification/f1_score.py create mode 100644 src/torchblocks/metrics/classification/matthews_corrcoef.py create mode 100644 src/torchblocks/metrics/sequence_labeling/__init__.py rename {torchblocks => src/torchblocks}/metrics/sequence_labeling/precision_recall_fscore.py (94%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/metrics/sequence_labeling/seqTag_score.py (71%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/metrics/sequence_labeling/util.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/models/__init__.py rename {torchblocks => src/torchblocks}/models/configuration_base.py (96%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/models/model_base.py (94%) mode change 100755 => 100644 create mode 100644 src/torchblocks/models/nezha/__init__.py create mode 100644 src/torchblocks/models/nezha/configuration_nezha.py create mode 100644 src/torchblocks/models/nezha/modeling_nezha.py create mode 100644 src/torchblocks/models/utils.py create mode 100644 src/torchblocks/modules/__init__.py rename {torchblocks/layers => src/torchblocks/modules}/activations.py (100%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/attentions.py (96%) mode change 100755 => 100644 create mode 100644 src/torchblocks/modules/biaffine.py rename {torchblocks/layers => src/torchblocks/modules}/capsule.py (100%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/conv.py (54%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/crf.py (99%) mode change 100755 => 100644 create mode 100644 src/torchblocks/modules/dropouts.py rename {torchblocks/layers => src/torchblocks/modules}/embeddings.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/modules/gate.py create mode 100644 src/torchblocks/modules/global_pointer.py rename {torchblocks/layers => src/torchblocks/modules}/layer_norm.py (59%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/linears.py (96%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/mixout.py (100%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/mlp.py (100%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/pooling.py (72%) mode change 100755 => 100644 rename {torchblocks/layers => src/torchblocks/modules}/position.py (50%) mode change 
100755 => 100644 create mode 100644 src/torchblocks/modules/rnn.py rename {torchblocks/layers => src/torchblocks/modules}/utils.py (79%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/__init__.py (91%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/adabelief.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/adabound.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/adafactor.py (97%) mode change 100755 => 100644 create mode 100644 src/torchblocks/optims/adai.py create mode 100644 src/torchblocks/optims/adaiw.py rename {torchblocks => src/torchblocks}/optims/adamod.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/adamp.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/adamw.py (95%) mode change 100755 => 100644 create mode 100644 src/torchblocks/optims/adan.py rename {torchblocks => src/torchblocks}/optims/adax.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/lamb.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/lars.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/lookahead.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/lr_scheduler.py (91%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/nadam.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/novograd.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/planradam.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/priorwd.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/radam.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/ralamb.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/ralars.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/ranger_adabelief.py (97%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/sgdp.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/sgdw.py (100%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/optims/shampoo.py (100%) mode change 100755 => 100644 create mode 100644 src/torchblocks/tasks/__init__.py create mode 100644 src/torchblocks/tasks/sequence_tags.py create mode 100644 src/torchblocks/utils/__init__.py create mode 100644 src/torchblocks/utils/chinese_utils.py create mode 100644 src/torchblocks/utils/ckpt_utils.py create mode 100644 src/torchblocks/utils/common_utils.py rename {torchblocks => src/torchblocks}/utils/device.py (52%) mode change 100755 => 100644 create mode 100644 src/torchblocks/utils/import_utils.py create mode 100644 src/torchblocks/utils/io_utils.py rename {torchblocks => src/torchblocks}/utils/logger.py (73%) mode change 100755 => 100644 rename {torchblocks => src/torchblocks}/utils/meter.py (69%) mode change 100755 => 100644 create mode 100644 src/torchblocks/utils/npy_utils.py create mode 100644 src/torchblocks/utils/options.py rename {torchblocks => src/torchblocks}/utils/seed.py (83%) mode change 100755 => 100644 rename torchblocks/utils/tensor.py => src/torchblocks/utils/tensor_utils.py (66%) mode change 100755 => 100644 create mode 100644 src/torchblocks/utils/visual_utils.py 
create mode 100644 src/torchblocks/version.py create mode 100644 tests/__init__.py delete mode 100755 torchblocks/__init__.py delete mode 100644 torchblocks/callback/adversarial/awp.py delete mode 100755 torchblocks/callback/adversarial/fgm.py delete mode 100755 torchblocks/callback/adversarial/pgd.py delete mode 100755 torchblocks/callback/ema.py delete mode 100755 torchblocks/callback/swa.py delete mode 100755 torchblocks/core/__init__.py delete mode 100755 torchblocks/core/classification_trainer.py delete mode 100755 torchblocks/core/sequence_labeling_trainer.py delete mode 100755 torchblocks/core/trainer_base.py delete mode 100755 torchblocks/core/utils.py delete mode 100755 torchblocks/data/__init__.py delete mode 100755 torchblocks/data/dataset.py delete mode 100755 torchblocks/data/process_base.py delete mode 100755 torchblocks/layers/char.py delete mode 100755 torchblocks/layers/dropouts.py delete mode 100755 torchblocks/layers/gate.py delete mode 100755 torchblocks/metrics/classification/accuracy.py delete mode 100755 torchblocks/metrics/classification/auc.py delete mode 100755 torchblocks/metrics/classification/f1_score.py delete mode 100755 torchblocks/metrics/classification/matthews_corrcoef.py delete mode 100755 torchblocks/metrics/sequence_labeling/scheme.py delete mode 100755 torchblocks/models/utils.py delete mode 100755 torchblocks/tasks/sequence_classification.py delete mode 100755 torchblocks/tasks/sequence_labeling_crf.py delete mode 100755 torchblocks/tasks/sequence_labeling_global_pointer.py delete mode 100755 torchblocks/tasks/sequence_labeling_softmax.py delete mode 100755 torchblocks/tasks/sequence_labeling_span.py delete mode 100755 torchblocks/tasks/siamese_classification.py delete mode 100755 torchblocks/utils/__init__.py delete mode 100755 torchblocks/utils/common.py delete mode 100755 torchblocks/utils/options.py delete mode 100755 torchblocks/utils/paths.py delete mode 100755 torchblocks/utils/versions.py delete mode 100755 torchblocks/version.py diff --git a/.DS_Store b/.DS_Store index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..f0080dc3a8ce99ee3743f8184ee30f7d28c43a6a 100644 GIT binary patch literal 10244 zcmeHM&2Jk;6n~SX#HJr{+N56sX_W{GsZnC*gTSGx>m+Ier3rBeiQ3}st~buMn;o;e z&PPp9?nvAz5=X9x8%Hi2K;l>-ap51}zzK;H{APCTtjDQjp#pBTBkjDmGw;pEZ{ECl zvr9y*P%Gz%G$Kmj5j1lO*)d7fi>EF1L?t7j4EjVqSrjZf+%%jv>7*b*AVDBOAVDBO zAVHuT5Wtx&QtD(#lRQBnK_Ee(iGc7A8XiIO5j_m4bREd#6#%pk#WI1%JwWPc5zR;R zFr-oyZHBi87OAr6iors4)N4Wd+{ZuBM0z9$%J2y~+&BR^>{qirFq`D$)?D|#1kU&dNwMm0O-g?H zA1!)wf(k$>EZZKIb(^YK1T3=x)Yl47ZSpTFU~Ln%&7H^{u%8<4;MYv1=@4E%dFu3; zfz069bAv;J!^7tidERZ=jevzEV%_91bkaaiqGWv+ov0*5SFmm0 zhKt&Qv-yl3(^vE@eNVs8Z}bQ2W9QijyTYc~Rd$WtWbd&RX0k12vj?GmPs1e&uOCay z$4lE{E4!qhZqXvm(E{B@{|l{qM4lsYXVMRsQ-jN^%KW$;)q3i*NO#$OdWNS?%QU<4 z)KN!xbn3L!z+jXmR?KA(OYgy%2FP`sc`o=emEdGKs0Bx>7zO(jZN`nFgBD&?L+N1Z zU1&U=jk;k_9Z16wVXRCbgoFd-~d4!jW)HlaV#WvHAjl=!_iuEtm>oV-i&4j`Jw21p=XD8d@ zJ`v0#JvG?>0+kU8ik+MQjv}OQLtO^OaYq!BjA7Ib_n~JN8po!nz$D>KvgZ$h-f+m8 z`2N4}^#A|Q-^7v;CkP}6bc_J&n=j56Al|tAshZ+jyN2gJ9+8CHVMwJ4G9e<$<>z?3 rh|lrzCX2GHs1pf%KB9*qMG4Y>{xcxC|0nnVF8%7w5KPGg_gAhnpCgF}!R mBnTAa1`@77th%xAJM(0I6-7qI$)+kstROQPHplbKVFm!!rw@hz diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 0d398ba..60bd61c --- a/README.md +++ b/README.md @@ -4,12 +4,18 @@ A PyTorch-based toolkit for natural language processing +![python](https://img.shields.io/badge/-Python_3.7_%7C_3.8_%7C_3.9_%7C_3.10-blue?logo=python&logoColor=white) 
+![pytorch](https://img.shields.io/badge/PyTorch_1.10+-ee4c2c?logo=pytorch&logoColor=white) +![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray) +![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray) ### Requirements -- torch>=1.6.0 -- transformers>=4.1.1 -- torchmetrics>=0.6.0 +- torch>=1.10.0 +- tokenizers>=0.7.0 +- transformers>=4.10.0 +- torchmetrics>=0.11.3 + TorchBlocks requires Python 3.7+. We recommend installing TorchBlocks in a Linux or OSX environment. @@ -22,6 +28,7 @@ git clone https://github.com/lonePatient/TorchBlocks.git cd TorchBlocks python setup.py install ``` +⚠️**Note:** This project is still under development and some of its interfaces are subject to change. ### Tutorials @@ -30,5 +37,7 @@ python setup.py install * Tutorial 3 (sequence labeling): [task_sequence_labeling_ner_crf.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_ner_crf.py) * Tutorial 4 (sentence similarity): [task_sentence_similarity_lcqmc.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sentence_similarity_lcqmc.py) * Tutorial 5 (triple similarity): [task_triple_similarity_epidemic.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_triple_similarity_epidemic.py) +* Tutorial 6 (sequence labeling): [task_sequence_labeling_resume_beam_search_softmax.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_beam_search_softmax.py) +* Tutorial 7 (sequence labeling): [task_sequence_labeling_resume_global_pointer.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_global_pointer.py) * Example scripts for each task: [TorchBlocks/examples/](https://github.com/lonePatient/TorchBlocks/tree/master/examples) diff --git a/torchblocks/metrics/classification/__init__.py b/docs/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/metrics/classification/__init__.py rename to docs/__init__.py diff --git a/docs/apex_install b/docs/apex_install new file mode 100644 index 0000000..2eb226c --- /dev/null +++ b/docs/apex_install @@ -0,0 +1,4 @@ +$ git clone https://github.com/NVIDIA/apex +$ sed -i "s/or (bare_metal_minor != torch_binary_minor)//g" apex/setup.py +$ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" apex/ +$ rm -rf apex \ No newline at end of file diff --git a/examples/README.md b/examples/README.md deleted file mode 100755 index 476287b..0000000 --- a/examples/README.md +++ /dev/null @@ -1,52 +0,0 @@ -## Examples - -17. run_task_text_classification_ema_cola.sh -> 文本分类任务,增加Exponential Moving Average - -16. task_relation_classification_semeval.py -> 关系分类任务,主要参考论文: Enriching Pre-trained Language Model with Entity Information for Relation Classification - -15. task_sequence_labeling_ner_span.py -> 命名实体识别任务,采用MRC模式进行,主要参考论文: A Unified MRC Framework for Named Entity Recognition (实现方式有点不一样) - -14. task_text_classification_alum_cola.py -> 文本分类任务,增加alum对抗,主要参考论文:Adversarial Training for Large Neural Language Models - -13. task_text_classification_freelb_cola.py -> 文本分类任务,增加FreeLB对抗,主要参考论文: FreeLB: Enhanced Adversarial Training for Language Understanding - -12. task_text_classification_sda_cola.py -> 文本分类任务,增加self-distillation,主要参考论文: Improving BERT Fine-Tuning via Self-Ensemble and Self-Distillation - -11. task_text_classification_fgm_cola.py -> 文本分类任务,增加FGM对抗 - -10.
task_attribute_value_extract_crf.py -> 实体属性提取任务,主要参考论文: Scaling Up Open Tagging from Tens to Thousands: Comprehension Empowered Attribute Value Extraction from Product Title - -9. task_text_classification_lookahead_cola.py -> 文本分类任务,增加Lookahead优化器,主要参考论文:Lookahead Optimizer: k steps forward, 1 step back - -8. task_text_classification_mdp_cola.py -> 文本分类任务,增加multisample-dropout,主要参考论文:Multi-Sample Dropout for Accelerated Training and Better Generalization - -7. task_multilabel_classification_toxic.py -> 多标签分类任务,数据来源与Kaagle中的Toxic评论比赛 - -6. task_triple_similarity_epidemic.py -> Triple文本相似任务,主要采用Triple Network形式 - -5. task_sentence_similarity_lcqmc.py -> 句子相似任务,形式为[CLS]sentence_A[SEP]sentence_B[SEP] - -4. task_sequence_labeling_ner_crf.py -> 命名实体识别任务,使用CRF解码器 - -3. task_sequence_labeling_layer_lr_ner_crf.py -> 命名实体识别任务,调整CRF层学习率大小 - -2. task_siamese_similarity_afqmc.py -> 句子相似任务,采用siamese network结构 - -1. task_text_classification_cola.py -> 文本分类任务,主要形式为[CLS]sentence[SEP] diff --git a/examples/ccks_kfold_split.py b/examples/ccks_kfold_split.py new file mode 100644 index 0000000..c591f2d --- /dev/null +++ b/examples/ccks_kfold_split.py @@ -0,0 +1,48 @@ +from torchblocks.utils import json_to_text +from sklearn.model_selection import StratifiedKFold + + +def get_data(data_path, datatype): + data = [] + if datatype == 'train': + with open(data_path) as f: + for i in f: + dict_txt = eval(i) + if dict_txt['query'] == '': + continue + for j in dict_txt['candidate']: + if j['text'] == '': + continue + data.append({'query': dict_txt['query'], 'candidate': j['text'], 'label': j['label']}) + else: + with open(data_path) as f: + for i in f: + dict_txt = eval(i) + for j in dict_txt['candidate']: + data.append({'text_id': dict_txt['text_id'], 'query': dict_txt['query'], 'candidate': j['text']}) + return data + + +def generate_data(train_data, random_state=42): + X = range(len(train_data)) + y = [x['label'] for x in train_data] + skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state) + for fold, (train_index, dev_index) in enumerate(skf.split(X, y)): + tmp_train_df = [train_data[index] for index in train_index] + tmp_dev_df = [train_data[index] for index in dev_index] + json_to_text(f'../dataset/ccks2021/ccks2021_train_seed{random_state}_fold{fold}.json', tmp_train_df) + json_to_text(f'../dataset/ccks2021/ccks2021_dev_seed{random_state}_fold{fold}.json', tmp_dev_df) + + +if __name__ == '__main__': + seed = 42 + train_path1 = '../dataset/ccks2021/round1_train.txt' + train_path2 = '../dataset/ccks2021/round2_train.txt' + train_data1 = get_data(train_path1, 'train') + train_data2 = get_data(train_path2, 'train') + train_data = train_data1 + train_data.extend(train_data2) + generate_data(train_data, 42) + generate_data(train_data, 24) + generate_data(train_data, 33) + print('...............kf finish...........') diff --git a/examples/cner_kfold_split.py b/examples/cner_kfold_split.py new file mode 100644 index 0000000..4cfbeec --- /dev/null +++ b/examples/cner_kfold_split.py @@ -0,0 +1,57 @@ +from torchblocks.utils import json_to_text +from torchblocks.tasks import get_spans_from_bio_tags +from torchblocks.data.splits import split_ner_stratified_kfold + +''' +采用多标签方式进行划分数据 +''' + +train_file = '../dataset/cner/train.char.bmes' +dev_file = '../dataset/cner/dev.char.bmes' +folds = 5 +sentences = [] +lines = [] +for input_file in [train_file, dev_file]: + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + 
if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + if 'M-' in label: + label = label.replace('M-', 'I-') + elif 'E-' in label: + label = label.replace('E-', 'I-') + elif 'S-' in label: # 去除S标签,主要方便后面做实验 + label = "O" + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + +for i, (words, labels) in enumerate(lines): + spans = get_spans_from_bio_tags(labels, id2label=None) + new_spans = [] + for span in spans: + tag, start, end = span + new_spans.append([tag, start, end + 1, "".join(words[start:(end + 1)])]) + sentence = {'id': i, 'text': words, 'entities': new_spans, 'bio_seq': labels} + sentences.append(sentence) + +entities_list = [x['entities'] for x in sentences] +all_indices = split_ner_stratified_kfold(entities_list, num_folds=5) +for fold, (train_indices, val_indices) in enumerate(all_indices): + print("The number of train examples: ",len(train_indices)) + print("The number of dev examples: ", len(val_indices)) + train_data = [sentences[i] for i in train_indices] + dev_data = [sentences[i] for i in val_indices] + json_to_text(f'../dataset/cner/cner_train_fold{fold}.json', train_data) + json_to_text(f'../dataset/cner/cner_dev_fold{fold}.json', dev_data) + diff --git a/examples/task_pretrain_ccks.py b/examples/task_pretrain_ccks.py new file mode 100644 index 0000000..fb56c11 --- /dev/null +++ b/examples/task_pretrain_ccks.py @@ -0,0 +1,329 @@ +import json +import torch +import numpy as np +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel +from transformers.models.bert.modeling_bert import BertOnlyMLMHead + +''' +针对CCKS2021任务预训练 +''' + + +class BertForMaskedLM(BertPreTrainedModel, Application): + # _keys_to_ignore_on_load_unexpected = [r"pooler"] + # _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(outputs.view(-1, self.config.vocab_size), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert( + inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids'] + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + masked_lm_loss = None + labels = inputs.get("labels", None) + if labels is not None: + masked_lm_loss = self.compute_loss(prediction_scores, labels) + return {"loss": masked_lm_loss, "logits": prediction_scores} + + +class CCKSDataset(DatasetBaseBuilder): + + def __init__(self, opts, file_name, data_type, process_piplines, tokenizer, mlm_probability, max_seq_len, **kwargs): + super().__init__(opts, file_name, data_type, process_piplines, **kwargs) + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.mlm_probability = mlm_probability + 
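+        # special token ids ([CLS], [SEP]) collected below are never selected as n-gram mask candidates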
self.special_token_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id} + + @staticmethod + def get_labels(): + ''' + 在预训练中添加标签信息 + Returns: + ''' + return {'不匹配': 0, '部分匹配': 1, '完全匹配': 2, -1: -1} + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + for line in f: + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{line['text_id']}" + if line['query'] == '': + continue + text_a = line['query'] + for c in line['candidate']: + if c['text'] == '': + continue + text_b = c['text'] + label = c['label'] if data_type != 'test' else -1 + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + # 默认都是0进行padding + def pad_and_truncate(self, input_ids_list, token_type_ids_list, + attention_mask_list, max_seq_len): + input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) + token_type_ids = torch.zeros_like(input_ids) + attention_mask = torch.zeros_like(input_ids) + for i in range(len(input_ids_list)): + seq_len = len(input_ids_list[i]) + if seq_len <= max_seq_len: + input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) + token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) + attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) + else: + input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], + dtype=torch.long) + token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) + attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) + return input_ids, token_type_ids, attention_mask + + def _ngram_mask(self, input_ids, max_seq_len): + cand_indexes = [] + for (i, id_) in enumerate(input_ids): + if id_ in self.special_token_ids: + continue + cand_indexes.append([i]) + num_to_predict = max(1, int(round(len(input_ids) * self.mlm_probability))) + if len(input_ids) <= 32: + max_ngram = 2 + else: + max_ngram = 3 + ngrams = np.arange(1, max_ngram + 1, dtype=np.int64) + pvals = 1. 
/ np.arange(1, max_ngram + 1) + pvals /= pvals.sum(keepdims=True) + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + np.random.shuffle(ngram_indexes) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(covered_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + while len(covered_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + if len(covered_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_ids))] + mask_labels += [0] * (max_seq_len - len(mask_labels)) + return torch.tensor(mask_labels[:max_seq_len]) + + def ngram_mask(self, input_ids_list, max_seq_len): + mask_labels = [] + for i, input_ids in enumerate(input_ids_list): + mask_label = self._ngram_mask(input_ids, max_seq_len) + mask_labels.append(mask_label) + return torch.stack(mask_labels, dim=0) + + def mask_tokens(self, inputs, mask_labels): + + labels = inputs.clone() + probability_matrix = mask_labels + bs = inputs.shape[0] + # word struct prediction + for i in range(bs): + tmp = [] + tmp_pro = [] + tmp_pro.extend([1] * 3) + now_input = inputs[i] + now_probability_matrix = probability_matrix[i] + now_probability_matrix = now_probability_matrix.cpu().numpy().tolist() + now_input = now_input.cpu().numpy().tolist() + for j in range(len(now_input)): + if now_input[j] == self.tokenizer.sep_token_id: + sep_index = j + # we don't choose cls_ids, sep_ids, pad_ids + choose_range = now_input[1:sep_index - 2] + if len(choose_range) == 0: + choose_range = now_input[1:5] + rd_token = np.random.choice(choose_range) + token_idx = now_input.index(rd_token) + tmp.extend(now_input[token_idx:token_idx + 3]) + np.random.shuffle(tmp) + now_input[token_idx:token_idx + 3] = tmp + now_probability_matrix[token_idx:token_idx + 3] = tmp_pro + now_input = torch.tensor(now_input) + now_probability_matrix = torch.tensor(now_probability_matrix) + inputs[i] = now_input + probability_matrix[i] = now_probability_matrix + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + masked_indices = probability_matrix.bool() + labels[~masked_indices] = -100 + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + indices_random = torch.bernoulli( + torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + return inputs, labels + + def build_data_collator(self, features): + 
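+        # Collates a batch for MLM pretraining: pads/truncates to the longest sequence in the batch
+        # (capped at max_seq_len), samples n-gram spans to mask, then mask_tokens applies the standard
+        # 80/10/10 token replacement plus a 3-token word-structure shuffle to build the MLM labels.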
input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*features)) + cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) + max_seq_len = min(cur_max_seq_len, self.max_seq_len) + input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, + token_type_ids_list, + attention_mask_list, + max_seq_len) + batch_mask = self.ngram_mask(input_ids_list, max_seq_len) + input_ids, mlm_labels = self.mask_tokens(input_ids, batch_mask) + data_dict = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'token_type_ids': token_type_ids, + 'labels': mlm_labels + } + return data_dict + + +class ProcessEncodeText: + """ Encode the text of a single task example; the encoding is appended to the original example """ + + def __init__(self, label2id, tokenizer, max_seq_len): + self.label2id = label2id + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + + def __call__(self, example): + label = self.label2id.get(example["label"]) + encoding = self.tokenizer.encode_plus(text=example["text_a"], text_pair=example['text_b'], + add_special_tokens=True, + max_length=self.max_seq_len, + truncation=True, + truncation_strategy='longest_first') + if label != -1: + # append the label information, i.e. [CLS]TEXT[SEP]LABEL[SEP] + encoding['input_ids'] = encoding['input_ids'] + [label + 1] + [102] + encoding['attention_mask'] = encoding['attention_mask'] + [1] + [1] + encoding['token_type_ids'] = encoding['token_type_ids'] + [0] + [0] + return encoding['input_ids'], encoding['token_type_ids'], encoding['attention_mask'] + + +class PretrainTrainer(TrainBaseBuilder): + pass + + +def load_data(opts, file_name, data_type, tokenizer, mlm_probability, max_seq_len): + process_piplines = [ProcessEncodeText(CCKSDataset.label2id(), tokenizer, max_seq_len)] + return CCKSDataset(opts, file_name, data_type, process_piplines, tokenizer, mlm_probability, max_seq_len) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForMaskedLM, BertTokenizer) +} + + +def main(): + parser = Argparser().build_parser() + group = parser.add_argument_group(title="pretrain", description="") + group.add_argument("--mlm_probability", type=float, default=0.4) + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, + opts.train_input_file, + "train", + tokenizer, + opts.mlm_probability, + opts.train_max_seq_length) + dev_dataset = load_data(opts, + opts.eval_input_file, + "dev", + tokenizer, + opts.mlm_probability, + opts.eval_max_seq_length) + test_dataset = load_data(opts, + opts.test_input_file, + "test", + tokenizer, + opts.mlm_probability, + opts.eval_max_seq_length) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path) + config.update( + { + "layer_norm_eps": 1e-7 + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing trainer") + trainer = PretrainTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass", num_classes=config.vocab_size), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, + dev_data=None, +
state_to_save={'vocab': tokenizer}, + train_with_add_datasets=[dev_dataset, test_dataset]) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_cner_beam_search_softmax.py b/examples/task_sequence_labeling_cner_beam_search_softmax.py new file mode 100644 index 0000000..6463236 --- /dev/null +++ b/examples/task_sequence_labeling_cner_beam_search_softmax.py @@ -0,0 +1,220 @@ +import json +import torch +import torch.nn as nn +from transformers import ( + BertConfig, + BertTokenizerFast, + BertPreTrainedModel, + BertModel +) +from torch.nn import CrossEntropyLoss +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.logger import Logger +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.tasks.sequence_tags import generate_bio_tags_from_spans +from torchblocks.tasks.sequence_tags import ner_beam_search_decode +from torchblocks.utils import concat_tensors_with_padding, tensor_to_numpy +from torchblocks.tasks.sequence_tags import get_spans_from_subword_bio_tags +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore + + +class BertForTokenClassification(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, logits, labels, attention_mask): + loss_fct = CrossEntropyLoss() + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + return {"loss": loss, "logits": logits, "attention_mask": attention_mask} + + +class SequenceLabelingTrainer(TrainBaseBuilder): + keys_to_ignore_on_gpu = ['offset_mapping', 'text'] + + def process_batch_outputs(self, tensor_dict): + texts = tensor_dict['text'] # 原始文本 + # beam_search搜索 + preds, pred_probs = ner_beam_search_decode( + concat_tensors_with_padding(tensor_dict['logits'], padding_shape=(0, 0, 0, 1), + padding_value=0).float().log_softmax(dim=-1), + self.opts.id2label, + self.opts.decode_beam_size, + ) + labels = concat_tensors_with_padding(tensor_dict['labels'], padding_shape=(0, 1), padding_value=0) + offset_mappings = concat_tensors_with_padding(tensor_dict['offset_mapping'], padding_shape=(0, 0,0,1), + padding_value=0) + preds, pred_probs = tensor_to_numpy(preds), tensor_to_numpy(pred_probs) + labels, offset_mappings = tensor_to_numpy(labels), tensor_to_numpy(offset_mappings) + # Collect the NER entities for predictions and labels to calculate the F1 score. 
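+            # offset_mapping entries with end == 0 mark special tokens and padding; valid_mask drops
+            # them so that only real subword positions are used for the BIO span extraction below.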
+ pred_entities_list, label_entities_list = [], [] + for text, pred, pred_prob, label, offset_mapping in zip( + texts, preds, pred_probs, labels, offset_mappings + ): + valid_mask = offset_mapping[..., 1] > 0 + pred, pred_prob = pred[valid_mask], pred_prob[valid_mask] + label, offset_mapping = label[valid_mask], offset_mapping[valid_mask] + # Extract the NER entities from BIO-naming tags. Note that the + # low-confidence or too-short entities will be dropped. + pred_entities, pred_entity_probs = get_spans_from_subword_bio_tags( + [self.opts.id2label[x] for x in pred], offset_mapping, pred_prob + ) + pred_entities = [ + (entity, a, b) + for (entity, a, b), prob in zip(pred_entities, pred_entity_probs) + ] + pred_entities_list.append(pred_entities) + # Of course, we will extract the entities for labels. + label_entities, _ = get_spans_from_subword_bio_tags( + [self.opts.id2label[x] for x in label], offset_mapping + ) + label_entities_list.append(label_entities) + + return {"preds": pred_entities_list, "target": label_entities_list} + + +class CnerDataset(DatasetBaseBuilder): + keys_to_ignore_on_collate_batch = ['text'] + keys_to_dynamical_truncate_on_padding_batch = [ + 'input_ids', 'attention_mask', 'token_type_ids', 'labels', 'offset_mapping' + ] + + @staticmethod + def get_labels(): + labels = ['CONT', 'EDU', 'LOC', 'NAME', 'ORG', 'PRO', 'RACE', 'TITLE'] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + for line in f: + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{line['id']}" + text = line['text'] + entities = line['entities'] + examples.append(dict(guid=guid, text=text, entities=entities)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + text = example['text'] + entities = example['entities'] + ## 处理空格以及异常符号 + new_tokens = [] + for i, word in enumerate(text): + tokenizer_word = self.tokenizer.tokenize(word) + if len(tokenizer_word) == 0: + new_tokens.append("^") + else: + new_tokens.append(word) + new_text = "".join(new_tokens) + encoding = self.tokenizer(new_text, + truncation=True, + padding="max_length", + return_tensors='pt', + max_length=self.max_sequence_length, + return_offsets_mapping=True) + encoding = {k: v.squeeze(0) for k, v in encoding.items()} + outputs = dict(**encoding, text="".join(text)) # 保持原有的text,后续解码使用 + # 将[['PER', 0,1]]转化为['B-AGE', 'I-AGE', 'I-AGE', 'I-AGE',..........] 
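+        # i.e. convert character-level entity spans such as ['PER', 0, 1] into a subword-aligned BIO
+        # tag sequence ('B-PER', 'I-PER', ..., 'O') using the tokenizer's offset_mapping.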
+ bio_seq_tags = generate_bio_tags_from_spans(entities, encoding['offset_mapping']) + outputs["label_ids"] = torch.tensor([self.label2id[x] for x in bio_seq_tags]) + return outputs + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + CnerDataset.label2id(), tokenizer, max_sequence_length), + ] + return CnerDataset(opts, file_name, data_type, process_piplines, **kwargs) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertForTokenClassification, BertTokenizerFast), +} + + +def main(): + parser = Argparser().build_parser() + group = parser.add_argument_group(title="add", description="") + group.add_argument("--decode_beam_size", type=int, default=2) + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(CnerDataset.label2id()) + opts.label2id = CnerDataset.label2id() + opts.id2label = CnerDataset.id2label() + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, + num_labels=opts.num_labels, + label2id=opts.label2id, + id2label=opts.id2label) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in CnerDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, model=model, metrics=metrics, logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_cner_crf.py b/examples/task_sequence_labeling_cner_crf.py deleted file mode 100755 index 65f5e05..0000000 --- a/examples/task_sequence_labeling_cner_crf.py +++ /dev/null @@ -1,201 +0,0 @@ -import os -import torch -from typing import * -from itertools import product -from transformers import ( - BertConfig, - BertTokenizer, -) -from torchblocks.data.dataset import DatasetBase -from torchblocks.data.process_base import ProcessBase -from torchblocks.tasks.sequence_labeling_crf import BertCrfForSeqLabel -from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore -from torchblocks.utils.options import Argparser -from torchblocks.utils.logger import Logger -from torchblocks.core import SequenceLabelingTrainer -from torchblocks.utils.device import prepare_device -from torchblocks.utils.paths import check_dir -from torchblocks.utils.paths import find_all_checkpoints -from torchblocks.utils.seed import seed_everything - - -class CnerDataset(DatasetBase): - keys_to_truncate_on_dynamic_batch = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'labels' - ] - - def __init__(self, - data_name, - data_dir, - data_type, - process_piplines: 
List[Callable], - **kwargs): - super().__init__(data_name, data_dir, data_type, process_piplines, **kwargs) - - @classmethod - def get_labels(self) -> List[str]: - labels = ["X", 'B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', 'B-RACE', 'B-TITLE', - 'I-CONT', 'I-EDU', 'I-LOC', 'I-NAME', 'I-ORG', 'I-PRO', 'I-RACE', 'I-TITLE', - 'O', 'S-NAME', 'S-ORG', 'S-RACE','[START]','[END]'] - return labels - - def read_data(self, input_file: str) -> Any: - lines = [] - with open(input_file, 'r') as f: - words = [] - labels = [] - for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - if words: - lines.append([words, labels]) - words = [] - labels = [] - else: - splits = line.split(" ") - words.append(splits[0]) - if len(splits) > 1: - labels.append(splits[-1].replace("\n", "")) - else: - # Examples could have no label for mode = "test" - labels.append("O") - if words: - lines.append([words, labels]) - return lines - - def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, Any]]: - examples = [] - for (i, line) in enumerate(data): - guid = f"{data_type}-{i}" - tokens = line[0] - labels = [] - for x in line[1]: - if 'M-' in x: - labels.append(x.replace('M-', 'I-')) - elif 'E-' in x: - labels.append(x.replace('E-', 'I-')) - else: - labels.append(x) - examples.append(dict(guid=guid, tokens=tokens, labels=labels)) - return examples - - -class ProcessExample2Feature(ProcessBase): - - def __init__(self, label2id, tokenizer, max_sequence_length): - super().__init__() - self.label2id = label2id - self.tokenizer = tokenizer - self.max_sequence_length = max_sequence_length - - def __call__(self, example): - tokens = example['tokens'] - labels = example['labels'] - - inputs = self.tokenizer( - tokens, - padding="max_length", - truncation="longest_first", - max_length=self.max_sequence_length, - return_overflowing_tokens=True, - is_split_into_words=True, - return_tensors='pt', - ) - overflowing_tokens = inputs.pop("overflowing_tokens") - num_truncated_tokens = inputs.pop("num_truncated_tokens") - inputs = {k: v.squeeze(0) for k, v in inputs.items()} - - if labels is None: - inputs['label_ids'] = None - return inputs - - truncate_len = len(tokens) - overflowing_tokens.size(-1) - labels = ['O'] + labels[: truncate_len] + ['O'] - labels = labels + ['O'] * (self.max_sequence_length - truncate_len - 2) - label_ids = [self.label2id[label] for label in labels] - inputs['label_ids'] = torch.tensor(label_ids) - return inputs - - -def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): - process_piplines = [ - ProcessExample2Feature( - CnerDataset.label2id(), tokenizer, max_sequence_length), - ] - return CnerDataset(data_name, data_dir, data_type, process_piplines, **kwargs) - - -MODEL_CLASSES = { - "bert": (BertConfig, BertCrfForSeqLabel, BertTokenizer), -} - -def main(): - opts = Argparser().get_training_arguments() - logger = Logger(opts=opts) - # device - logger.info("initializing device") - opts.device, opts.device_num = prepare_device(opts.device_id) - seed_everything(opts.seed) - config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] - # data processor - logger.info("initializing data processor") - tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) - train_dataset = load_data(opts.train_input_file, opts.data_dir, "train", tokenizer, opts.train_max_seq_length) - dev_dataset = load_data(opts.eval_input_file, opts.data_dir, "dev", tokenizer, 
opts.eval_max_seq_length) - test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) - opts.num_labels = train_dataset.num_labels - opts.label2id = CnerDataset.label2id() - opts.id2label = CnerDataset.id2label() - # model - logger.info("initializing model and config") - config = config_class.from_pretrained(opts.pretrained_model_path, - num_labels=opts.num_labels, - label2id=opts.label2id, - id2label=opts.id2label) - model = model_class.from_pretrained(opts.pretrained_model_path, config=config) - model.to(opts.device) - - # trainer - logger.info("initializing traniner") - labels = {label.split('-')[1] for label in CnerDataset.get_labels() if '-' in label} - metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIOS')] - trainer = SequenceLabelingTrainer(opts=opts, - model=model, - metrics=metrics, - logger=logger) - # do train - if opts.do_train: - trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) - - if opts.do_eval: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - if opts.eval_all_checkpoints: - checkpoints = find_all_checkpoints(checkpoint_dir=opts.output_dir) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.evaluate(dev_data=dev_dataset, save_result=True, save_dir=prefix) - - if opts.do_predict: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.predict(test_data=test_dataset, save_result=True, save_dir=prefix) - - -if __name__ == "__main__": - main() diff --git a/examples/task_sequence_labeling_cner_global_pointer.py b/examples/task_sequence_labeling_cner_global_pointer.py deleted file mode 100755 index 1318565..0000000 --- a/examples/task_sequence_labeling_cner_global_pointer.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -from typing import * -import torch -from torchblocks.data.dataset import DatasetBase -from torchblocks.data.process_base import ProcessBase -from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore -from torchblocks.metrics.sequence_labeling.scheme import get_scheme -from torchblocks.utils.options import Argparser -from torchblocks.utils.logger import Logger -from torchblocks.core import SequenceLabelingTrainer -from torchblocks.utils.device import prepare_device -from torchblocks.utils.paths import check_dir -from torchblocks.utils.paths import find_all_checkpoints -from torchblocks.utils.seed import seed_everything -from torchblocks.tasks.sequence_labeling_global_pointer import BertGlobalPointerForSeqLabel -from transformers import BertTokenizer, BertConfig - -class CnerDataset(DatasetBase): - keys_to_truncate_on_dynamic_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] - - def __init__(self, - data_name, - data_dir, - data_type, - 
process_piplines: List[Callable], - **kwargs): - super().__init__(data_name, data_dir, data_type, process_piplines, **kwargs) - - @classmethod - def get_labels(self) -> List[str]: - return ['CONT', 'NAME', 'PRO', 'ORG', 'RACE', 'TITLE', 'LOC', 'EDU'] - - def read_data(self, input_file: str) -> Any: - lines = [] - with open(input_file, 'r') as f: - words = [] - labels = [] - for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - if words: - lines.append([words, labels]) - words = [] - labels = [] - else: - splits = line.split(" ") - words.append(splits[0]) - if len(splits) > 1: - labels.append(splits[-1].replace("\n", "")) - else: - # Examples could have no label for mode = "test" - labels.append("O") - if words: - lines.append([words, labels]) - return lines - - def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, Any]]: - examples = [] - for (i, line) in enumerate(data): - guid = f"{data_type}-{i}" - tokens = line[0] - labels = [] - for x in line[1]: - if 'M-' in x: - labels.append(x.replace('M-', 'I-')) - elif 'E-' in x: - labels.append(x.replace('E-', 'I-')) - else: - labels.append(x) - examples.append(dict(guid=guid, tokens=tokens, labels=labels)) - return examples - - def collate_fn(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: - batch = {} - first = features[0] - max_input_length = first['input_ids'].size(0) - if self.collate_dynamic: - max_input_length = max([torch.sum(f["attention_mask"]) for f in features]) - if "labels" in first and first["labels"] is not None: - batch["labels"] = torch.stack([f["labels"] for f in features]) - for k, v in first.items(): - if k != "labels" and v is not None and not isinstance(v, str): - bv = torch.stack([f[k] for f in features]) if isinstance(v, torch.Tensor) \ - else torch.tensor([f[k] for f in features]) - batch[k] = bv - if self.collate_dynamic: - for k in self.keys_to_truncate_on_dynamic_batch: - if k in batch: - if k == 'labels': - batch[k] = batch[k][..., :max_input_length, :max_input_length] - elif batch[k].dim() >= 2: - batch[k] = batch[k][:, : max_input_length] - return batch - - -class ProcessExample2Feature(ProcessBase): - - def __init__(self, label2id, tokenizer, max_sequence_length): - super().__init__() - self.label2id = label2id - self.tokenizer = tokenizer - self.max_sequence_length = max_sequence_length - - def __call__(self, example): - tokens = example['tokens'] - labels = example['labels'] - num_labels = len(self.label2id) - inputs = self.tokenizer(tokens, - padding="max_length", - truncation="longest_first", - max_length=self.max_sequence_length, - return_overflowing_tokens=True, - is_split_into_words=True, - return_tensors='pt', - ) - inputs.pop("overflowing_tokens") - inputs.pop("num_truncated_tokens") - inputs = {k: v.squeeze(0) for k, v in inputs.items()} - if labels is None: - inputs['labels'] = None - return inputs - global_labels = torch.zeros((num_labels, self.max_sequence_length, self.max_sequence_length), dtype=torch.long) - entities = get_scheme('BIOS')(labels) # 左闭右闭 - for label, start, end in entities: - start += 1 # [CLS] - end += 1 - label_id = self.label2id[label] - if start List[str]: - labels = ["X", 'B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', 'B-RACE', 'B-TITLE', - 'I-CONT', 'I-EDU', 'I-LOC', 'I-NAME', 'I-ORG', 'I-PRO', 'I-RACE', 'I-TITLE', - 'O', 'S-NAME', 'S-ORG', 'S-RACE'] - return labels - - def read_data(self, input_file: str) -> Any: - lines = [] - with open(input_file, 'r') as f: - words = [] - labels = [] 
- for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - if words: - lines.append([words, labels]) - words = [] - labels = [] - else: - splits = line.split(" ") - words.append(splits[0]) - if len(splits) > 1: - labels.append(splits[-1].replace("\n", "")) - else: - # Examples could have no label for mode = "test" - labels.append("O") - if words: - lines.append([words, labels]) - return lines - - def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, Any]]: - examples = [] - for (i, line) in enumerate(data): - guid = f"{data_type}-{i}" - tokens = line[0] - labels = [] - for x in line[1]: - if 'M-' in x: - labels.append(x.replace('M-', 'I-')) - elif 'E-' in x: - labels.append(x.replace('E-', 'I-')) - else: - labels.append(x) - examples.append(dict(guid=guid, tokens=tokens, labels=labels)) - return examples - - -class ProcessExample2Feature(ProcessBase): - - def __init__(self, label2id, tokenizer, max_sequence_length): - super().__init__() - self.label2id = label2id - self.tokenizer = tokenizer - self.max_sequence_length = max_sequence_length - - def __call__(self, example): - tokens = example['tokens'] - labels = example['labels'] - - inputs = self.tokenizer( - tokens, - padding="max_length", - truncation="longest_first", - max_length=self.max_sequence_length, - return_overflowing_tokens=True, - is_split_into_words=True, - return_tensors='pt', - ) - overflowing_tokens = inputs.pop("overflowing_tokens") - num_truncated_tokens = inputs.pop("num_truncated_tokens") - inputs = {k: v.squeeze(0) for k, v in inputs.items()} - - if labels is None: - inputs['label_ids'] = None - return inputs - - truncate_len = len(tokens) - overflowing_tokens.size(-1) - labels = ['O'] + labels[: truncate_len] + ['O'] - labels = labels + ['O'] * (self.max_sequence_length - truncate_len - 2) - label_ids = [self.label2id[label] for label in labels] - inputs['label_ids'] = torch.tensor(label_ids) - return inputs - - -def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): - process_piplines = [ - ProcessExample2Feature( - CnerDataset.label2id(), tokenizer, max_sequence_length), - ] - return CnerDataset(data_name, data_dir, data_type, process_piplines, **kwargs) - - -MODEL_CLASSES = { - "bert": (BertConfig, BertSoftmaxForSeqLabel, BertTokenizer), -} - - -def main(): - opts = Argparser().get_training_arguments() - logger = Logger(opts=opts) - # device - logger.info("initializing device") - opts.device, opts.device_num = prepare_device(opts.device_id) - seed_everything(opts.seed) - config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] - # data processor - logger.info("initializing data processor") - tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) - train_dataset = load_data(opts.train_input_file, opts.data_dir, "train", tokenizer, opts.train_max_seq_length) - dev_dataset = load_data(opts.eval_input_file, opts.data_dir, "dev", tokenizer, opts.eval_max_seq_length) - test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) - opts.num_labels = train_dataset.num_labels - opts.label2id = CnerDataset.label2id() - opts.id2label = CnerDataset.id2label() - - # model - logger.info("initializing model and config") - config = config_class.from_pretrained(opts.pretrained_model_path, - num_labels=opts.num_labels, label2id=opts.label2id, id2label=opts.id2label) - model = model_class.from_pretrained(opts.pretrained_model_path, 
config=config) - model.to(opts.device) - - # trainer - logger.info("initializing traniner") - labels = {label.split('-')[1] for label in CnerDataset.get_labels() if '-' in label} - metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIOS')] - trainer = SequenceLabelingTrainer(opts=opts, - model=model, - metrics=metrics, - logger=logger) - # do train - if opts.do_train: - trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) - if opts.do_eval: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - if opts.eval_all_checkpoints: - checkpoints = find_all_checkpoints(checkpoint_dir=opts.output_dir) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.evaluate(dev_data=dev_dataset, save_result=True, save_dir=prefix) - - if opts.do_predict: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.predict(test_data=test_dataset, save_result=True, save_dir=prefix) - - -if __name__ == "__main__": - main() diff --git a/examples/task_sequence_labeling_cner_span.py b/examples/task_sequence_labeling_cner_span.py deleted file mode 100755 index 6ccc3aa..0000000 --- a/examples/task_sequence_labeling_cner_span.py +++ /dev/null @@ -1,228 +0,0 @@ -import os -from typing import * -import torch -from transformers import BertConfig, BertTokenizer -from torchblocks.data.dataset import DatasetBase -from torchblocks.data.process_base import ProcessBase -from torchblocks.tasks.sequence_labeling_span import BertSpanForSeqLabel -from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore -from torchblocks.metrics.sequence_labeling.scheme import get_scheme -from torchblocks.utils.options import Argparser -from torchblocks.utils.logger import Logger -from torchblocks.core import SequenceLabelingTrainer -from torchblocks.utils.device import prepare_device -from torchblocks.utils.paths import check_dir -from torchblocks.utils.paths import find_all_checkpoints -from torchblocks.utils.seed import seed_everything - - -class CnerDataset(DatasetBase): - keys_to_truncate_on_dynamic_batch = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions' - ] - - def __init__(self, - data_name, - data_dir, - data_type, - process_piplines, - **kwargs): - super().__init__(data_name, data_dir, data_type, process_piplines, **kwargs) - - @classmethod - def get_labels(self) -> List[str]: - return ["O", "CONT", "ORG", "LOC", 'EDU', 'NAME', 'PRO', 'RACE', 'TITLE'] - - def read_data(self, input_file: str) -> Any: - lines = [] - with open(input_file, 'r') as f: - words = [] - labels = [] - for line in f: - if line.startswith("-DOCSTART-") or line == "" or line == "\n": - if words: - lines.append([words, labels]) - words = [] - labels = [] - else: - splits = line.split(" ") - 
words.append(splits[0]) - if len(splits) > 1: - labels.append(splits[-1].replace("\n", "")) - else: - # Examples could have no label for mode = "test" - labels.append("O") - if words: - lines.append([words, labels]) - return lines - - def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, Any]]: - examples = [] - for (i, line) in enumerate(data): - guid = f"{data_type}-{i}" - tokens = line[0] - labels = [] - for x in line[1]: - if 'M-' in x: - labels.append(x.replace('M-', 'I-')) - elif 'E-' in x: - labels.append(x.replace('E-', 'I-')) - else: - labels.append(x) - examples.append(dict(guid=guid, tokens=tokens, labels=labels)) - return examples - - def collate_fn(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: - batch = {} - first = features[0] - max_input_length = first['input_ids'].size(0) - if self.collate_dynamic: - max_input_length = max([torch.sum(f["attention_mask"]) for f in features]) - if "start_positions" in first and first["start_positions"] is not None: - batch["start_positions"] = torch.stack([f["start_positions"] for f in features]) - if "end_positions" in first and first["end_positions"] is not None: - batch["end_positions"] = torch.stack([f["end_positions"] for f in features]) - for k, v in first.items(): - if k not in ("start_positions", "end_positions") and v is not None and not isinstance(v, str): - bv = torch.stack([f[k] for f in features]) if isinstance(v, torch.Tensor) \ - else torch.tensor([f[k] for f in features]) - batch[k] = bv - if self.collate_dynamic: - for k in self.keys_to_truncate_on_dynamic_batch: - if batch[k].dim() >= 2: batch[k] = batch[k][:, : max_input_length] - return batch - - -class ProcessExample2Feature(ProcessBase): - - def __init__(self, label2id, tokenizer, max_sequence_length): - super().__init__() - self.label2id = label2id - self.tokenizer = tokenizer - self.max_sequence_length = max_sequence_length - - def __call__(self, example): - tokens = example['tokens'] - labels = example['labels'] - - inputs = self.tokenizer( - tokens, - padding="max_length", - truncation="longest_first", - max_length=self.max_sequence_length, - return_overflowing_tokens=True, - is_split_into_words=True, - return_tensors='pt', - ) - overflowing_tokens = inputs.pop("overflowing_tokens") - num_truncated_tokens = inputs.pop("num_truncated_tokens") - inputs = {k: v.squeeze(0) for k, v in inputs.items()} - if labels is None: - inputs['start_positions'] = None - inputs['end_positions'] = None - return inputs - start_positions = [self.label2id["O"]] * self.max_sequence_length - end_positions = [self.label2id["O"]] * self.max_sequence_length - entities = get_scheme('BIOS')(labels) # 左闭右闭 - for label, start, end in entities: - start += 1 - end += 1 # [CLS] - label_id = self.label2id[label] - if start < self.max_sequence_length and end < self.max_sequence_length: - start_positions[start] = label_id - end_positions[end] = label_id - inputs['start_positions'] = torch.tensor(start_positions) - inputs['end_positions'] = torch.tensor(end_positions) - return inputs - - -def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): - process_piplines = [ - ProcessExample2Feature( - CnerDataset.label2id(), tokenizer, max_sequence_length), - ] - return CnerDataset(data_name, data_dir, data_type, process_piplines, max_examples=None, **kwargs) - - -MODEL_CLASSES = { - "bert": (BertConfig, BertSpanForSeqLabel, BertTokenizer), -} - - -def main(): - parser = Argparser.get_training_parser() - group = 
parser.add_argument_group(title="start/end Thresh", description="start/end Thresh") - group.add_argument("--start_thresh", type=float, default=0.0) - group.add_argument("--end_thresh", type=float, default=0.0) - opts = parser.parse_args_from_parser(parser) - logger = Logger(opts=opts) - # device - logger.info("initializing device") - opts.device, opts.device_num = prepare_device(opts.device_id) - seed_everything(opts.seed) - config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] - - # data processor - logger.info("initializing data processor") - tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) - train_dataset = load_data(opts.train_input_file, opts.data_dir, "train", tokenizer, opts.train_max_seq_length) - dev_dataset = load_data(opts.eval_input_file, opts.data_dir, "dev", tokenizer, opts.eval_max_seq_length) - test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) - opts.num_labels = train_dataset.num_labels - opts.label2id = CnerDataset.label2id() - opts.id2label = CnerDataset.id2label() - - # model - logger.info("initializing model and config") - config, unused_kwargs = config_class.from_pretrained( - opts.pretrained_model_path, return_unused_kwargs=True, - num_labels=opts.num_labels, id2label=opts.id2label, label2id=opts.label2id, - start_thresh=opts.start_thresh, end_thresh=opts.end_thresh) - # FIXED: 默认`from_dict`中,只有config中有键才能设置值,这里强制设置 - for key, value in unused_kwargs.items(): setattr(config, key, value) - model = model_class.from_pretrained(opts.pretrained_model_path, config=config) - model.to(opts.device) - # trainer - logger.info("initializing traniner") - labels = [label for label in CnerDataset.get_labels() if label != 'O'] - metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIOS')] - trainer = SequenceLabelingTrainer(opts=opts, - model=model, - metrics=metrics, - logger=logger, - ) - if opts.do_train: - trainer.train(train_data=train_dataset, dev_data=dev_dataset) - - if opts.do_eval: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - if opts.eval_all_checkpoints: - checkpoints = find_all_checkpoints(checkpoint_dir=opts.output_dir) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.evaluate(dev_data=dev_dataset, save_result=True, save_dir=prefix) - - if opts.do_predict: - checkpoints = [] - if opts.checkpoint_predict_code is not None: - checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) - check_dir(checkpoint) - checkpoints.append(checkpoint) - logger.info("Evaluate the following checkpoints: %s", checkpoints) - for checkpoint in checkpoints: - prefix = checkpoint.split("/")[-1] - model = model_class.from_pretrained(checkpoint, config=config) - model.to(opts.device) - trainer.model = model - trainer.predict(test_data=test_dataset, save_result=True, save_dir=prefix) - -if __name__ == "__main__": - main() diff --git a/examples/task_sequence_labeling_resume_beam_search_softmax.py b/examples/task_sequence_labeling_resume_beam_search_softmax.py new file mode 100644 index 0000000..5d0f335 --- /dev/null +++ 
b/examples/task_sequence_labeling_resume_beam_search_softmax.py @@ -0,0 +1,215 @@ +import torch +import torch.nn as nn +from transformers import ( + BertConfig, + BertTokenizer, + BertPreTrainedModel, + BertModel +) +from torch.nn import CrossEntropyLoss +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.logger import Logger +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.tasks import ner_beam_search_decode +from torchblocks.utils import concat_tensors_with_padding, tensor_to_numpy +from torchblocks.tasks import get_spans_from_bio_tags +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore + + +class BertForTokenClassification(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, logits, labels, attention_mask): + loss_fct = CrossEntropyLoss() + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + return {"loss": loss, "logits": logits, "attention_mask": attention_mask} + + +class SequenceLabelingTrainer(TrainBaseBuilder): + def process_batch_outputs(self, tensor_dict): + # beam_search搜索 + preds, pred_probs = ner_beam_search_decode( + concat_tensors_with_padding(tensor_dict['logits'], padding_shape=(0, 0, 0, 1), + padding_value=0).float().log_softmax(dim=-1), + self.opts.id2label, + self.opts.decode_beam_size, + ) + labels = concat_tensors_with_padding(tensor_dict['labels'], padding_shape=(0, 1), padding_value=0) + attention_masks = concat_tensors_with_padding(tensor_dict['attention_mask'], padding_shape=(0, 1), + padding_value=0) + preds, pred_probs = tensor_to_numpy(preds), tensor_to_numpy(pred_probs) + labels = tensor_to_numpy(labels) + input_lens = tensor_to_numpy(attention_masks.sum(1)) + # Collect the NER entities for predictions and labels to calculate the F1 score. + pred_entities_list, label_entities_list = [], [] + for pred, input_len, label in zip(preds, input_lens, labels): + # Extract the NER entities from BIO-naming tags. Note that the + pred_entities = get_spans_from_bio_tags([self.opts.id2label[x] for x in pred[:input_len]]) + pred_entities_list.append(pred_entities) + # Of course, we will extract the entities for labels. 
+ label_entities = get_spans_from_bio_tags([self.opts.id2label[x] for x in label[:input_len]]) + label_entities_list.append(label_entities) + return {"preds": pred_entities_list, "target": label_entities_list} + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + labels = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO', ] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + tokens = line[0] + labels = line[1] if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=tokens, labels=labels)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + tokens = example['tokens'] + labels = example['labels'] + encoder_txt = self.tokenizer(tokens, + truncation=True, + padding="max_length", + return_tensors='pt', + return_overflowing_tokens=True, + is_split_into_words=True, + max_length=self.max_sequence_length) + encoder_txt = {k: v.squeeze(0) for k, v in encoder_txt.items()} + input_ids = encoder_txt["input_ids"] + token_type_ids = encoder_txt["token_type_ids"] + attention_mask = encoder_txt["attention_mask"] + overflowing_tokens = encoder_txt["overflowing_tokens"] + label_ids = None + if labels is not None: + truncate_len = len(tokens) - overflowing_tokens.size(-1) + labels = ['O'] + labels[: truncate_len] + ['O'] + labels = labels + ['O'] * (self.max_sequence_length - truncate_len - 2) + label_ids = [self.label2id[label] for label in labels] + label_ids = torch.tensor(label_ids) + inputs = { + "input_ids": input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + 'label_ids': label_ids + } + return inputs + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertForTokenClassification, BertTokenizer), +} + + +def main(): + parser = Argparser().build_parser() + group = parser.add_argument_group(title="beam search", description="bs") + group.add_argument("--decode_beam_size", type=int, default=2) + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = 
tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in ResumeDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, model=model, metrics=metrics, logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_biaffine.py b/examples/task_sequence_labeling_resume_biaffine.py new file mode 100644 index 0000000..45f113a --- /dev/null +++ b/examples/task_sequence_labeling_resume_biaffine.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import numpy as np +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from transformers import BertTokenizerFast, BertConfig +from transformers import BertPreTrainedModel, BertModel +from torchblocks.utils import tensor_to_numpy +from torchblocks.modules.biaffine import Biaffine +from torchblocks.tasks.sequence_tags import get_spans_from_bio_tags +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore + + +class BertBiaffineForSeqLabel(BertPreTrainedModel, Application): + + def __init__(self, config): + super(BertBiaffineForSeqLabel, self).__init__(config) + self.num_labels = config.num_labels + self.hidden_size = config.hidden_size + self.biaffine_bias = config.biaffine_bias + self.biaffine_ffnn_size = config.biaffine_ffnn_size + self.bert = BertModel(config) + self.active = nn.ELU() + self.start_mlp = nn.Linear(self.hidden_size, self.biaffine_ffnn_size) + self.end_mlp = nn.Linear(self.hidden_size, self.biaffine_ffnn_size) + self.biaffine = Biaffine(self.biaffine_ffnn_size, self.num_labels, + bias=(self.biaffine_bias, self.biaffine_bias)) + self.dropout = nn.Dropout(0.1) + self.init_weights() + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + start_feat = self.active(self.start_mlp(sequence_output)) + end_feat = self.active(self.end_mlp(sequence_output)) + logits = self.biaffine(start_feat, end_feat) + loss = None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + return {"loss": loss, 
"logits": logits, 'attention_mask': attention_mask} + + def compute_loss(self, logits, labels, mask): + label_mask = torch.triu(mask.unsqueeze(-1).expand_as(labels).clone()) + loss = nn.functional.cross_entropy( + logits.reshape(-1, self.num_labels), + labels.masked_fill(~label_mask.bool(), -100).reshape(-1), + ) + return loss + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + return ["O", "CONT", "ORG", "LOC", 'EDU', 'NAME', 'PRO', 'RACE', 'TITLE'] + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type): + examples = [] + for i, (words, labels) in enumerate(data): + spans = get_spans_from_bio_tags(labels, id2label=None) + new_spans = [] + for span in spans: + tag, start, end = span + new_spans.append([tag, start, end + 1, "".join(words[start:(end + 1)])]) + guid = f"{data_type}-{i}" + entities = new_spans if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=words, entities=entities)) + return examples + + def process_collator(self, batch, max_input_length): + # 动态padding + if self.dynamical_padding: + for k in self.keys_to_dynamical_truncate_on_padding_batch: + if k in batch: + if k in ['labels']: + batch[k] = batch[k][:, :max_input_length, :max_input_length] + elif batch[k].ndim == 2: + batch[k] = batch[k][:, : max_input_length] + return batch + + +class ProcessExample2Feature: + def __init__(self, label2id, tokenizer, max_sequence_length): + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + tokens = example['tokens'] + entities = example['entities'] + encoder_txt = self.tokenizer( + tokens, + padding="max_length", + truncation=True, + return_overflowing_tokens=True, + is_split_into_words=True, + max_length=self.max_sequence_length, + return_tensors='pt', + ) + encoder_txt = {k: v.squeeze(0) for k, v in encoder_txt.items()} + labels = torch.zeros((self.max_sequence_length, self.max_sequence_length), dtype=torch.long) + for e_type, start_idx, end_idx, *_ in entities: + if start_idx > self.max_sequence_length - 1 or end_idx > self.max_sequence_length - 1: + continue + labels[start_idx + 1, end_idx + 1] = self.label2id[e_type] + inputs = { + "input_ids": encoder_txt["input_ids"], + 'token_type_ids': encoder_txt["token_type_ids"], + 'attention_mask': encoder_txt["attention_mask"], + 'labels': labels + } + return inputs + + +class SequenceLabelingTrainer(TrainBaseBuilder): + keys_to_ignore_on_save_checkpoint = ['optimizer'] # checkpoint中不存储的模块,比如'optimizer' + + def process_batch_outputs(self, outputs): + """ + :param span_scores: (b, t, t, c) + :param mask: (b, t) + :return: + """ + preds = [] + targets = [] + for logits, labels, attention_mask in zip(outputs['logits'], outputs['labels'], outputs['attention_mask']): + predict_labels = torch.argmax(logits, -1) + input_lens = torch.sum(attention_mask, dim=-1) + mask = torch.tril(torch.ones_like(predict_labels), diagonal=-1) + predict_labels = 
predict_labels - mask * 1e12 + y_pred = tensor_to_numpy(predict_labels) + y_true = tensor_to_numpy(labels) + pred = [] + target = [] + for b, start, end in zip(*np.where(y_pred > 0)): + if start > input_lens[b] or end > input_lens[b]: + continue + pred.append((self.opts.id2label[int(y_pred[b, start, end])], (b, start), (b, end))) + for b, start, end in zip(*np.where(y_true > 0)): + target.append((self.opts.id2label[int(y_true[b, start, end])], (b, start), (b, end))) + preds.append(pred) + targets.append(target) + return {"preds": preds, "target": targets} + + +def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(data_name, data_dir, data_type, process_piplines, **kwargs) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertBiaffineForSeqLabel, BertTokenizerFast), +} + + +def main(): + parser = Argparser.build_parser() + group = parser.add_argument_group(title="Biaffine", description="Biaffine") + group.add_argument('--biaffine_ffnn_size', default=512, type=int) + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.get_labels()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.biaffine_ffnn_size = opts.biaffine_ffnn_size + config.biaffine_bias = True + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + metrics = [SequenceLabelingScore(ResumeDataset.get_labels(), average='micro')] + trainer = SequenceLabelingTrainer(opts=opts, + model=model, + metrics=metrics, + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_crf.py b/examples/task_sequence_labeling_resume_crf.py new file mode 100644 index 0000000..4336654 --- /dev/null +++ b/examples/task_sequence_labeling_resume_crf.py @@ -0,0 +1,209 @@ +import torch +import torch.nn as nn +from transformers import ( + BertConfig, + BertTokenizer, + BertPreTrainedModel, + BertModel +) +from torchblocks.modules.crf import CRF +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.utils import tensor_to_list +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore + + 
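For readers skimming the new Resume examples, the label space they all share is the BIO tag set returned by ResumeDataset.get_labels() below. A minimal sketch of the mapping, assuming that DatasetBaseBuilder's label2id() / id2label() helpers (not shown in this diff) simply enumerate get_labels() in order:

    # Sketch only: mirrors get_labels() in the ResumeDataset classes of this patch.
    entity_types = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO']
    bio_labels = ["O"] + [f"B-{x}" for x in entity_types] + [f"I-{x}" for x in entity_types]
    label2id = {label: i for i, label in enumerate(bio_labels)}  # {'O': 0, 'B-NAME': 1, ...}
    id2label = {i: label for label, i in label2id.items()}
    # e.g. a sequence tagged ['B-NAME', 'I-NAME', 'O'] is encoded as [1, 9, 0]
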
+class BertCrfForSeqLabel(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.crf = CRF(num_tags=config.num_labels, batch_first=True) + self.init_weights() + + def compute_loss(self, logits, labels, attention_mask): + loss = -1 * self.crf(emissions=logits, tags=labels, mask=attention_mask) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + loss = None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + return {"loss": loss, "logits": logits} + + def decode(self, logits, mask): + decode_labels = self.crf.decode(logits, mask).squeeze(0) # (batch_size, seq_length) + return decode_labels + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = [ 'input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + labels = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO', ] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type, **kwargs): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + tokens = line[0] + labels = line[1] if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=tokens, labels=labels)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + tokens = example['tokens'] + labels = example['labels'] + inputs = self.tokenizer( + tokens, + padding="max_length", + truncation="longest_first", + max_length=self.max_sequence_length, + return_overflowing_tokens=True, + is_split_into_words=True, + return_tensors='pt', + ) + inputs.pop("num_truncated_tokens") + overflowing_tokens = inputs.pop("overflowing_tokens") + inputs = {k: v.squeeze(0) for k, v in inputs.items()} + if labels is None: + inputs['label_ids'] = None + return inputs + truncate_len = len(tokens) - overflowing_tokens.size(-1) + labels = ['O'] + labels[: truncate_len] + ['O'] + labels = labels + ['O'] * (self.max_sequence_length - truncate_len - 2) + label_ids = [self.label2id[label] for label in labels] + inputs['label_ids'] = torch.tensor(label_ids) + return inputs + + +class SequenceLabelingTrainer(TrainBaseBuilder): + + def process_batch_outputs(self, outputs): + preds, targets = [], [] + for logits, 
attention_mask, labels in zip(outputs['logits'], outputs['attention_mask'], outputs['labels']): + tags = self.model.decode(logits.to(self.opts.device), attention_mask.to(self.opts.device)) + out_label_ids = tensor_to_list(labels) + input_lens = tensor_to_list(torch.sum(attention_mask, dim=-1)) + tags = tensor_to_list(tags) + for i, label in enumerate(out_label_ids): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(label): + if j == 0: + continue + elif j == input_lens[i] - 1: + preds.append(temp_2) + targets.append(temp_1) + break + else: + temp_1.append(self.opts.id2label[out_label_ids[i][j]]) + temp_2.append(self.opts.id2label[tags[i][j]]) + return {"preds": preds, "target": targets} + + +def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(data_name, data_dir, data_type, process_piplines, **kwargs) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertCrfForSeqLabel, BertTokenizer), +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, + num_labels=opts.num_labels, + label2id=opts.label2id, + id2label=opts.id2label) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in ResumeDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts,model=model,metrics=metrics,logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_global_pointer.py b/examples/task_sequence_labeling_resume_global_pointer.py new file mode 100644 index 0000000..0043500 --- /dev/null +++ b/examples/task_sequence_labeling_resume_global_pointer.py @@ -0,0 +1,242 @@ +import torch +import numpy as np +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.utils import tensor_to_numpy +from transformers import ( + BertConfig, + BertTokenizer, + BertPreTrainedModel, + BertModel +) +from torchblocks.utils import concat_tensors_with_padding +from torchblocks.metrics.base import Metric +from 
torchblocks.modules.global_pointer import GlobalPointer +from torchblocks.tasks import get_spans_from_bio_tags + + +class BertGlobalPointerForSeqLabel(BertPreTrainedModel, Application): + def __init__(self, config): + super(BertGlobalPointerForSeqLabel, self).__init__(config) + self.num_labels = config.num_labels + self.inner_dim = config.inner_dim + self.use_rope = config.use_rope + self.hidden_size = config.hidden_size + self.bert = BertModel(config) + self.global_pointer = GlobalPointer(self.num_labels, self.inner_dim, self.hidden_size, self.use_rope) + self.dropout = torch.nn.Dropout(0.1) + self.init_weights() + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids) + sequence_output = outputs[0] + # import pdb + # pdb.set_trace() + sequence_output = self.dropout(sequence_output) + logits = self.global_pointer(sequence_output, mask=attention_mask) + loss = None + if labels is not None: + loss = self.global_pointer.compute_loss(logits, labels) + # loss = self.global_pointer.compute_loss(labels,logits) + return {"loss": loss, "logits": logits} + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + return ["O", "CONT", "ORG", "LOC", 'EDU', 'NAME', 'PRO', 'RACE', 'TITLE'] + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type): + examples = [] + for i, (words, labels) in enumerate(data): + spans = get_spans_from_bio_tags(labels, id2label=None) + new_spans = [] + for span in spans: + tag, start, end = span + new_spans.append([tag, start, end, "".join(words[start:(end + 1)])]) # 左闭右闭 + guid = f"{data_type}-{i}" + entities = new_spans if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=words, entities=entities)) + return examples + + def process_collator(self, batch, max_input_length): + # 动态padding + if self.dynamical_padding: + for k in self.keys_to_dynamical_truncate_on_padding_batch: + if k in batch: + if k in ['labels']: + batch[k] = batch[k][:, :, :max_input_length, :max_input_length] + elif batch[k].dim() >= 2: + batch[k] = batch[k][:, : max_input_length] + return batch + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + tokens = example['tokens'] + entities = example['entities'] + encoder_txt = self.tokenizer( + tokens, + padding="max_length", + truncation=True, + return_overflowing_tokens=True, + is_split_into_words=True, + max_length=self.max_sequence_length, + return_tensors='pt', + ) + encoder_txt = {k: v.squeeze(0) for k, v in encoder_txt.items()} + input_ids = encoder_txt["input_ids"] + token_type_ids = encoder_txt["token_type_ids"] + attention_mask = 
encoder_txt["attention_mask"] + labels = torch.zeros((len(self.label2id), self.max_sequence_length, self.max_sequence_length), + dtype=torch.int) + for label, start, end, _ in entities: + if start > self.max_sequence_length - 1 or end > self.max_sequence_length - 1: + continue + labels[self.label2id[label], start + 1, end + 1] = 1 + inputs = { + "input_ids": input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask, + 'labels': labels + } + return inputs + + +class GobalPointerMetric(Metric): + def __init__(self): + super().__init__() + self.reset() + + def reset(self): + self.preds = [] + self.target = [] + + def update(self, preds, target): + self.preds.extend(preds) + self.target.extend(target) + + def value(self): + X, Y, Z = 1e-10, 1e-10, 1e-10 + R = set(self.preds) + T = set(self.target) + X += len(R & T) + Y += len(R) + Z += len(T) + f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z + return {'f1': f1, "precision": precision, "recall": recall} + + def name(self): + return 'gp' + + +class SequenceLabelingTrainer(TrainBaseBuilder): + keys_to_ignore_on_checkpoint_save = ['optimizer'] # checkpoint中不存储的模块,比如'optimizer' + + def process_batch_outputs(self, outputs): + pred_entities = [] + true_entities = [] + labels = concat_tensors_with_padding(outputs['logits'], padding_index=-1, padding_shape=[0, 1, 0, 1], + padding_value=0) + logits = concat_tensors_with_padding(outputs['labels'], padding_index=-1, padding_shape=[0, 1, 0, 1], + padding_value=0) + y_pred = tensor_to_numpy(logits) + y_true = tensor_to_numpy(labels) + for b, l, start, end in zip(*np.where(y_pred > 0)): + pred_entities.append((b, l, start, end)) + for b, l, start, end in zip(*np.where(y_true > 0)): + true_entities.append((b, l, start, end)) + return {"preds": pred_entities, "target": true_entities} + + +def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(data_name, data_dir, data_type, process_piplines, **kwargs) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertGlobalPointerForSeqLabel, BertTokenizer), +} + + +def main(): + parser = Argparser.build_parser() + group = parser.add_argument_group(title="global pointer", description="Global pointer") + group.add_argument('--use_rope', action='store_true') + group.add_argument('--inner_dim', default=64, type=int, help='The dim of Positional embedding') + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(ResumeDataset.get_labels()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.use_rope = opts.use_rope + config.inner_dim = opts.inner_dim + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) 
+ # trainer + logger.info("initializing traniner") + trainer = SequenceLabelingTrainer(opts=opts, + model=model, + metrics=[GobalPointerMetric()], + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_span.py b/examples/task_sequence_labeling_resume_span.py new file mode 100644 index 0000000..1bb93a6 --- /dev/null +++ b/examples/task_sequence_labeling_resume_span.py @@ -0,0 +1,236 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchblocks.losses.span_loss import SpanLoss +from transformers import BertPreTrainedModel, BertModel +from transformers import BertConfig, BertTokenizer +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.utils import tensor_to_list +from torchblocks.modules import PoolerStartLogits, PoolerEndLogits +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore +from torchblocks.tasks import get_spans_from_bio_tags + + +class BertSpanForSeqLabel(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.start_fc = PoolerStartLogits(config.hidden_size, config.num_labels) + self.end_fc = PoolerEndLogits(config.hidden_size + config.num_labels, config.num_labels) + self.init_weights() + + def compute_loss(self, start_logits, end_logits, start_positions, end_positions, attention_mask): + loss_fct = SpanLoss() + loss = loss_fct(preds=(start_logits, end_logits), + target=(start_positions, end_positions), + masks=attention_mask) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + start_positions = inputs.get('start_positions', None) + end_positions = inputs.get("end_positions", None) + outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + start_logits = self.start_fc(sequence_output) + if self.training: + batch_size = input_ids.size(0) + seq_len = input_ids.size(1) + label_logits = torch.zeros([batch_size, seq_len, self.config.num_labels]) + label_logits = label_logits.to(input_ids.device) + label_logits.scatter_(2, start_positions.unsqueeze(2), 1) + else: + label_logits = F.softmax(start_logits, -1) + end_logits = self.end_fc(sequence_output, label_logits) + loss = None + if start_positions is not None and end_positions is not None: + loss = self.compute_loss(start_logits, end_logits, start_positions, end_positions, attention_mask) + return {"loss": loss, "start_logits": start_logits, "end_logits": end_logits} + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_ignore_on_collate_batch = ['entities'] + keys_to_dynamical_truncate_on_padding_batch = [ + 'input_ids', + 'attention_mask', + 'token_type_ids', + 'start_positions', + 'end_positions' + ] + + @staticmethod + def get_labels(): + return ["O", "CONT", "ORG", "LOC", 'EDU', 'NAME', 'PRO', 'RACE', 'TITLE'] + + def read_data(self, input_file): + 
lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type): + examples = [] + for i, (words, labels) in enumerate(data): + spans = get_spans_from_bio_tags(labels, id2label=None) + new_spans = [] + for span in spans: + tag, start, end = span + new_spans.append([tag, start, end, "".join(words[start:(end + 1)])]) # 左闭右闭 + guid = f"{data_type}-{i}" + entities = new_spans if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=words, entities=entities)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + tokens = example['tokens'] + entities = example['entities'] + inputs = self.tokenizer( + tokens, + padding="max_length", + truncation="longest_first", + max_length=self.max_sequence_length, + return_overflowing_tokens=True, + is_split_into_words=True, + return_tensors='pt', + ) + inputs.pop("overflowing_tokens") + inputs.pop("num_truncated_tokens") + inputs = {k: v.squeeze(0) for k, v in inputs.items()} + if entities is None: + inputs['start_positions'] = None + inputs['end_positions'] = None + return inputs + start_positions = [self.label2id["O"]] * self.max_sequence_length + end_positions = [self.label2id["O"]] * self.max_sequence_length + for label, start, end, *_ in entities: + start += 1 + end += 1 # [CLS] + label_id = self.label2id[label] + if start < self.max_sequence_length and end < self.max_sequence_length: + start_positions[start] = label_id + end_positions[end] = label_id + inputs['start_positions'] = torch.tensor(start_positions) + inputs['end_positions'] = torch.tensor(end_positions) + inputs['entities'] = entities + return inputs + + +def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(data_name, data_dir, data_type, process_piplines, **kwargs) + + +class SequenceLabelingTrainer(TrainBaseBuilder): + keys_to_ignore_on_gpu = ["entities"] # batch数据中不转换为GPU的变量名 + keys_to_ignore_on_save_checkpoint = ['optimizer'] # checkpoint中不存储的模块,比如'optimizer' + + def process_batch_outputs(self, outputs): + preds = [] + target = [] + start_logits = outputs['start_logits'] + end_logits = outputs['end_logits'] + attention_mask = outputs['attention_mask'] + entities = outputs['entities'] # batch列表数据 + for s_logit, e_logit, mask in zip(start_logits, end_logits, attention_mask): + input_lens = tensor_to_list(torch.sum(mask, dim=-1)) + start_preds = tensor_to_list(torch.argmax(s_logit, -1)) + end_preds = tensor_to_list(torch.argmax(e_logit, -1)) + for s_pred, e_pred, le in zip(start_preds, end_preds, input_lens): + s_pred = s_pred[:le][1:-1] + e_pred = e_pred[:le][1:-1] + p_ent = [] + for i, s_l in enumerate(s_pred): + if s_l == 0: + continue + for j, e_l in enumerate(e_pred[i:]): + if s_l == e_l: + p_ent.append((self.opts.id2label[s_l], i, i + j)) + break + preds.append(p_ent) + for bd in 
entities: + for b in bd: + target.append([(x[0], x[1], x[2]) for x in b]) + return {"preds": preds, "target": target} + + +MODEL_CLASSES = { + "bert": (BertConfig, BertSpanForSeqLabel, BertTokenizer), +} + +def main(): + opts = Argparser.build_arguments() + logger = Logger(opts=opts) + + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + # FIXED: 默认`from_dict`中,只有config中有键才能设置值,这里强制设置 + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + labels = [label for label in ResumeDataset.get_labels() if label != 'O'] + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, + model=model, + metrics=metrics, + logger=logger, + ) + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_token_mask_aug_softmax.py b/examples/task_sequence_labeling_resume_token_mask_aug_softmax.py new file mode 100644 index 0000000..2909b0f --- /dev/null +++ b/examples/task_sequence_labeling_resume_token_mask_aug_softmax.py @@ -0,0 +1,221 @@ +import torch +import torch.nn as nn +import numpy as np +from transformers import ( + BertConfig, + BertTokenizer, + BertPreTrainedModel, + BertModel +) +from torch.nn import CrossEntropyLoss +from torchblocks.data import DatasetBaseBuilder +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from torchblocks.core import TrainBaseBuilder, Application + + +class BertSoftmaxForSeqLabel(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, logits, labels, attention_mask): + loss_fct = CrossEntropyLoss() + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.config.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + 
attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + loss, groundtruths, predictions = None, None, None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + if not self.training: + groundtruths = self.decode(labels, attention_mask, is_logits=False) + if not self.training: # 训练时无需解码 + predictions = self.decode(logits, attention_mask, is_logits=True) + return { + "loss": loss, + "logits": logits, + "predictions": predictions, + "groundtruths": groundtruths + } + + def decode(self, logits, mask, is_logits=False): + decode_ids = logits + if is_logits: + decode_ids = torch.argmax(logits, -1) # (batch_size, seq_length) + decode_labels = [] + for ids, mask in zip(decode_ids, mask): + decode_label = [self.config.id2label[id.item()] for id, m in zip(ids, mask) if m > 0][1:-1] # [CLS], [SEP] + decode_labels.append(decode_label) + return decode_labels + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + labels = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO', ] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type, **kwargs): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + tokens = line[0] + labels = line[1] if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=tokens, labels=labels)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length, mask_aug_prob=.15): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + self.mask_aug_prob = mask_aug_prob + self.mask_token = tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + def __call__(self, example): + guid = example['guid'] + tokens = example['tokens'] + labels = example['labels'] + encoding = self.tokenizer(tokens, + padding="max_length", + truncation="longest_first", + max_length=self.max_sequence_length, + return_overflowing_tokens=True, + is_split_into_words=True, + return_tensors='pt', + ) + overflowing_tokens = encoding.pop("overflowing_tokens") + num_truncated_tokens = encoding.pop("num_truncated_tokens") + outputs = {k: v.squeeze(0) for k, v in encoding.items()} + truncate_len = len(tokens) - overflowing_tokens.size(-1) + padd_len = self.max_sequence_length - truncate_len - 2 + labels = ['O'] + labels[: truncate_len] + ['O'] + labels = labels + ['O'] * padd_len + outputs["label_ids"] = torch.tensor([self.label2id[x] for x in labels]) + if 'train' in guid: # 只对训练数据集进行mask增强 + ix = torch.rand(size=(len(outputs['input_ids']),)) < self.mask_aug_prob + outputs['input_ids'][ix & outputs['attention_mask'].bool()] = self.mask_token + return outputs + + +class 
SequenceLabelingTrainer(TrainBaseBuilder): + # batch动态增强 + def process_batch_inputs(self, inputs): + ids = inputs['input_ids'] + mask = inputs['attention_mask'] + labels = inputs['labels'] + if np.random.uniform() < 0.5: + cut = 0.25 + perm = torch.randperm(ids.shape[0]).cuda() + rand_len = int(ids.shape[1] * cut) + start = np.random.randint(ids.shape[1] - int(ids.shape[1] * cut)) + inputs['input_ids'][:, start:start + rand_len] = ids[perm, start:start + rand_len] + inputs['attention_mask'][:, start:start + rand_len] = mask[perm, start:start + rand_len] + inputs['labels'][:, start:start + rand_len] = labels[perm, start:start + rand_len] + return inputs + + def process_batch_outputs(self, batches): + return {"preds": batches['predictions'], "target": batches['groundtruths']} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(opts, file_name, data_type, process_piplines, **kwargs) + + +MODEL_CLASSES = { + "bert": (BertConfig, BertSoftmaxForSeqLabel, BertTokenizer), +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, + num_labels=opts.num_labels, + label2id=opts.label2id, + id2label=opts.id2label) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in ResumeDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, model=model, metrics=metrics, logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_token_mdp_softmax.py b/examples/task_sequence_labeling_resume_token_mdp_softmax.py new file mode 100644 index 0000000..b6c1b19 --- /dev/null +++ b/examples/task_sequence_labeling_resume_token_mdp_softmax.py @@ -0,0 +1,218 @@ +import torch +from typing import * +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torchblocks.data import DatasetBaseBuilder +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from transformers 
import BertPreTrainedModel, BertModel, BertConfig, BertTokenizer + + +class BertSoftmaxForSeqLabel(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.dropout1 = nn.Dropout(0.1) + self.dropout2 = nn.Dropout(0.2) + self.dropout3 = nn.Dropout(0.3) + self.dropout4 = nn.Dropout(0.4) + self.dropout5 = nn.Dropout(0.5) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, target, attention_mask): + loss_fct = CrossEntropyLoss() + active_loss = attention_mask.view(-1) == 1 + active_logits = outputs.view(-1, self.num_labels)[active_loss] + active_labels = target.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + logits1 = self.classifier(self.dropout1(sequence_output)) + logits2 = self.classifier(self.dropout2(sequence_output)) + logits3 = self.classifier(self.dropout3(sequence_output)) + logits4 = self.classifier(self.dropout4(sequence_output)) + logits5 = self.classifier(self.dropout5(sequence_output)) + logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5 + logits = torch.softmax(logits, dim=-1) + loss, groundtruths, predictions = None, None, None + if labels is not None: + loss1 = self.compute_loss(logits1, labels, attention_mask) + loss2 = self.compute_loss(logits2, labels, attention_mask) + loss3 = self.compute_loss(logits3, labels, attention_mask) + loss4 = self.compute_loss(logits4, labels, attention_mask) + loss5 = self.compute_loss(logits5, labels, attention_mask) + loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5 + if not self.training: + groundtruths = self.decode(labels, attention_mask, is_logits=False) + if not self.training: # 训练时无需解码 + predictions = self.decode(logits, attention_mask, is_logits=True) + return { + "loss": loss, + "logits": logits, + "predictions": predictions, + "groundtruths": groundtruths + } + + def decode(self, logits, mask, is_logits=False) -> List[List[List[str]]]: + decode_ids = logits + if is_logits: + decode_ids = torch.argmax(logits, -1) # (batch_size, seq_length) + decode_labels = [] + for ids, mask in zip(decode_ids, mask): + decode_label = [self.config.id2label[id.item()] for id, m in zip(ids, mask) if m > 0][1:-1] # [CLS], [SEP] + decode_labels.append(decode_label) + return decode_labels + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + labels = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO', ] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + labels.append(label) + else: + 
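+ # rows without a tag column default to the outside label "O"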
labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type, **kwargs): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + tokens = line[0] + labels = line[1] if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=tokens, labels=labels)) + return examples + + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + text = example['tokens'] + labels = example['labels'] + encoding = self.tokenizer(text, + padding="max_length", + truncation="longest_first", + max_length=self.max_sequence_length, + return_overflowing_tokens=True, + is_split_into_words=True, + return_tensors='pt', + ) + + overflowing_tokens = encoding.pop("overflowing_tokens") + num_truncated_tokens = encoding.pop("num_truncated_tokens") + outputs = {k: v.squeeze(0) for k, v in encoding.items()} + truncate_len = len(text) - overflowing_tokens.size(-1) + padd_len = self.max_sequence_length - truncate_len - 2 + labels = ['O'] + labels[: truncate_len] + ['O'] + labels = labels + ['O'] * padd_len + outputs["label_ids"] = torch.tensor([self.label2id[x] for x in labels]) + return outputs + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(opts, file_name, data_type, process_piplines, **kwargs) + + +class SequenceLabelingTrainer(TrainBaseBuilder): + + def process_batch_outputs(self, batches): + preds = [] + target = [] + for x,y in zip(batches['predictions'],batches['groundtruths']): + preds.extend(x) + target.extend(y) + return {"preds": preds, "target": target} + + +MODEL_CLASSES = { + "bert": (BertConfig, BertSoftmaxForSeqLabel, BertTokenizer), +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, + num_labels=opts.num_labels, + label2id=opts.label2id, + id2label=opts.id2label) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in ResumeDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, model=model, metrics=metrics, logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, 
dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_sequence_labeling_resume_token_softmax.py b/examples/task_sequence_labeling_resume_token_softmax.py new file mode 100644 index 0000000..a28a225 --- /dev/null +++ b/examples/task_sequence_labeling_resume_token_softmax.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn +from transformers import ( + BertConfig, + BertTokenizer, +) +from torch.nn import CrossEntropyLoss +from torchblocks.data import DatasetBaseBuilder +from torchblocks.metrics.sequence_labeling.seqTag_score import SequenceLabelingScore +from torchblocks.utils.options import Argparser +from torchblocks.utils.logger import Logger +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.utils.device import build_device +from torchblocks.utils import seed_everything +from transformers import BertPreTrainedModel, BertModel + + +class BertSoftmaxForSeqLabel(BertPreTrainedModel, Application): + + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, logits, labels, attention_mask): + loss_fct = CrossEntropyLoss() + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.config.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels", None) + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + loss, groundtruths, predictions = None, None, None + if labels is not None: + loss = self.compute_loss(logits, labels, attention_mask) + if not self.training: + groundtruths = self.decode(labels, attention_mask, is_logits=False) + if not self.training: # 训练时无需解码 + predictions = self.decode(logits, attention_mask, is_logits=True) + return { + "loss": loss, + "logits": logits, + "predictions": predictions, + "groundtruths": groundtruths + } + + def decode(self, logits, mask, is_logits=False): + decode_ids = logits + if is_logits: + decode_ids = torch.argmax(logits, -1) # (batch_size, seq_length) + decode_labels = [] + for ids, mask in zip(decode_ids, mask): + decode_label = [self.config.id2label[id.item()] for id, m in zip(ids, mask) if m > 0][1:-1] # [CLS], [SEP] + decode_labels.append(decode_label) + return decode_labels + + +class ResumeDataset(DatasetBaseBuilder): + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + labels = ['NAME', 'ORG', 'TITLE', 'RACE', 'EDU', 'CONT', 'LOC', 'PRO', ] + Bio_labels = ["O"] + [f"B-{x}" for x in labels] + [f"I-{x}" for x in labels] + return Bio_labels + + def read_data(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words, labels = [], [] + for line in f: + if line == "" or line == "\n": + if words: + lines.append([words, labels]) + words, labels = [], [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + label = splits[-1].replace("\n", "") + 
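+ # keep the tag parsed from the last space-separated column of this "token TAG" row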
labels.append(label) + else: + labels.append("O") + if words: + lines.append([words, labels]) + return lines + + def build_examples(self, data, data_type, **kwargs): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + tokens = line[0] + labels = line[1] if data_type != 'test' else None + examples.append(dict(guid=guid, tokens=tokens, labels=labels)) + return examples + +class ProcessExample2Feature: + + def __init__(self, label2id, tokenizer, max_sequence_length): + super().__init__() + self.label2id = label2id + self.tokenizer = tokenizer + self.max_sequence_length = max_sequence_length + + def __call__(self, example): + text = example['tokens'] + labels = example['labels'] + encoding = self.tokenizer(text, + padding="max_length", + truncation="longest_first", + max_length=self.max_sequence_length, + return_overflowing_tokens=True, + is_split_into_words=True, + return_tensors='pt', + ) + + overflowing_tokens = encoding.pop("overflowing_tokens") + num_truncated_tokens = encoding.pop("num_truncated_tokens") + outputs = {k: v.squeeze(0) for k, v in encoding.items()} + truncate_len = len(text) - overflowing_tokens.size(-1) + padd_len = self.max_sequence_length - truncate_len - 2 + labels = ['O'] + labels[: truncate_len] + ['O'] + labels = labels + ['O'] * padd_len + outputs["label_ids"] = torch.tensor([self.label2id[x] for x in labels]) + return outputs + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length, **kwargs): + process_piplines = [ + ProcessExample2Feature( + ResumeDataset.label2id(), tokenizer, max_sequence_length), + ] + return ResumeDataset(opts, file_name, data_type, process_piplines, **kwargs) + + +class SequenceLabelingTrainer(TrainBaseBuilder): + + def process_batch_outputs(self, batches): + return {"preds": batches['predictions'], "target": batches['groundtruths']} + + +MODEL_CLASSES = { + "bert": (BertConfig, BertSoftmaxForSeqLabel, BertTokenizer), +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + opts.num_labels = len(ResumeDataset.label2id()) + opts.label2id = ResumeDataset.label2id() + opts.id2label = ResumeDataset.id2label() + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, + num_labels=opts.num_labels, + label2id=opts.label2id, + id2label=opts.id2label) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + + # trainer + logger.info("initializing traniner") + labels = {label.split('-')[1] for label in ResumeDataset.get_labels() if '-' in label} + metrics = [SequenceLabelingScore(labels=labels, average='micro', schema='BIO')] + trainer = SequenceLabelingTrainer(opts=opts, model=model, metrics=metrics, logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == 
"__main__": + main() diff --git a/examples/task_text_classification_cola.py b/examples/task_text_classification_cola.py old mode 100755 new mode 100644 index 67e0e8f..362f226 --- a/examples/task_text_classification_cola.py +++ b/examples/task_text_classification_cola.py @@ -1,42 +1,63 @@ import os import csv -from typing import List, Dict, Callable, Any -from torchblocks.core import TextClassifierTrainer -from torchblocks.data.dataset import DatasetBase -from torchblocks.utils.seed import seed_everything +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.utils import seed_everything from torchblocks.utils.options import Argparser -from torchblocks.utils.device import prepare_device +from torchblocks.utils.device import build_device from torchblocks.utils.logger import Logger -from torchblocks.utils.paths import check_dir -from torchblocks.data.process_base import ProcessBase -from torchblocks.utils.paths import find_all_checkpoints +from torchblocks.utils import check_dir +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import find_all_checkpoints from torchblocks.metrics.classification.matthews_corrcoef import MattewsCorrcoef -from transformers import BertForSequenceClassification, BertConfig, BertTokenizer - -MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) -} - - -class ColaDataset(DatasetBase): - - def __init__(self, - data_name, - data_dir, - data_type, - process_piplines: List[Callable], - **kwargs): - super().__init__(data_name, data_dir, data_type, process_piplines, **kwargs) - - @classmethod - def get_labels(self) -> List[str]: +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +# 定义数据集加载 +class ColaDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): return ["0", "1"] - def read_data(self, input_file: str) -> Any: + def read_data(self, input_file): with open(input_file, "r", encoding="utf-8-sig") as f: return list(csv.reader(f, delimiter="\t")) - def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, Any]]: + def build_examples(self, data, data_type): test_mode = data_type == "test" if test_mode: data = data[1:] @@ -50,11 +71,13 @@ def create_examples(self, data: Any, data_type: str, **kwargs) -> List[Dict[str, return examples -class ProcessEncodeText(ProcessBase): +# 数据的处理 +class ProcessEncodeText: """ 
编码单句任务文本,在原有example上追加 """ - def __init__(self, tokenizer, tokenizer_params, return_input_length=False): + def __init__(self, tokenizer, label2id, tokenizer_params, return_input_length=False): self.tokenizer = tokenizer + self.label2id = label2id self.tokenizer_params = tokenizer_params self.return_input_length = return_input_length @@ -63,67 +86,74 @@ def __call__(self, example): inputs = {k: v.squeeze() for k, v in inputs.items()} if self.return_input_length: inputs["input_length"] = inputs["attention_mask"].sum().item() - example = dict(example, **inputs) - return example + inputs["label"] = self.label2id.get(example["label"], None) + return inputs -class ProcessEncodeLabel(ProcessBase): - """ 编码单标签文本标签 """ +# 定义任务的训练模块 +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' - def __init__(self, label2id): - self.label2id = label2id - - def __call__(self, example): - example["label"] = self.label2id.get(example["label"], None) - return example + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} -def load_data(data_name, data_dir, data_type, tokenizer, max_sequence_length): - process_piplines = [ProcessEncodeText(tokenizer, +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, ColaDataset.label2id(), tokenizer_params={ "padding": "max_length", "truncation": "longest_first", "max_length": max_sequence_length, "return_tensors": "pt", - }), - ProcessEncodeLabel(ColaDataset.label2id()) + }) ] - return ColaDataset(data_name=data_name, - data_dir=data_dir, - data_type=data_type, - process_piplines=process_piplines - ) + return ColaDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} def main(): - opts = Argparser().get_training_arguments() + opts = Argparser().build_arguments() logger = Logger(opts=opts) # device logger.info("initializing device") - opts.device, opts.device_num = prepare_device(opts.device_id) + opts.device, opts.device_num = build_device(opts.device_id) seed_everything(opts.seed) config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] # data processor logger.info("initializing data processor") tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) - train_dataset = load_data(opts.train_input_file, opts.data_dir, "train", tokenizer, opts.train_max_seq_length) - dev_dataset = load_data(opts.eval_input_file, opts.data_dir, "dev", tokenizer, opts.eval_max_seq_length) - opts.num_labels = train_dataset.num_labels + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(ColaDataset.label2id()) + # model logger.info("initializing model and config") config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.output_hidden_states = False + config.hidden_dropout_prob = 0. + config.attention_probs_dropout_prob = 0. 
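+ # zeroing these probabilities turns off dropout inside BERT; since the
+ # classification head falls back to hidden_dropout_prob when
+ # config.classifier_dropout is unset, its dropout is disabled as well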
model = model_class.from_pretrained(opts.pretrained_model_path, config=config) model.to(opts.device) # trainer logger.info("initializing traniner") trainer = TextClassifierTrainer(opts=opts, model=model, - metrics=[MattewsCorrcoef(num_classes=opts.num_labels)], + metrics=[MattewsCorrcoef(task="multiclass", num_classes=opts.num_labels)], logger=logger ) # do train if opts.do_train: trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + # do eval if opts.do_eval: checkpoints = [] if opts.checkpoint_predict_code is not None: @@ -131,15 +161,15 @@ def main(): check_dir(checkpoint) checkpoints.append(checkpoint) if opts.eval_all_checkpoints: - checkpoints = find_all_checkpoints(checkpoint_dir=opts.output_dir) + checkpoints = find_all_checkpoints(ckpt_dir=opts.output_dir) logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: prefix = checkpoint.split("/")[-1] model = model_class.from_pretrained(checkpoint, config=config) model.to(opts.device) trainer.model = model - trainer.evaluate(dev_data=dev_dataset, save_result=True, save_dir=prefix) - + trainer.evaluate(model=model, dev_data=dev_dataset, save_result=True, save_dir=prefix) + # do predict if opts.do_predict: test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) checkpoints = [] @@ -153,7 +183,7 @@ def main(): model = model_class.from_pretrained(checkpoint, config=config) model.to(opts.device) trainer.model = model - trainer.predict(test_data=test_dataset, save_result=True, save_dir=prefix) + trainer.predict(model=model, test_data=test_dataset, save_result=True, save_dir=prefix) if __name__ == "__main__": diff --git a/examples/task_text_classification_cola_adan.py b/examples/task_text_classification_cola_adan.py new file mode 100644 index 0000000..56faa80 --- /dev/null +++ b/examples/task_text_classification_cola_adan.py @@ -0,0 +1,213 @@ +import os +import csv +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.optims.adan import Adan +from torchblocks.utils.logger import Logger +from torchblocks.utils import check_dir +from torchblocks.utils import find_all_checkpoints +from torchblocks.optims.lr_scheduler import get_polynomial_decay_schedule_with_warmup +from torchblocks.metrics.classification.matthews_corrcoef import MattewsCorrcoef +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = 
outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +# 定义数据集加载 +class ColaDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["0", "1"] + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + return list(csv.reader(f, delimiter="\t")) + + def build_examples(self, data, data_type): + test_mode = data_type == "test" + if test_mode: + data = data[1:] + text_index = 1 if test_mode else 3 + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text = line[text_index] + label = None if test_mode else line[1] + examples.append(dict(guid=guid, text=text, label=label)) + return examples + + +# 数据的处理 +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, tokenizer, label2id, tokenizer_params, return_input_length=False): + self.tokenizer = tokenizer + self.label2id = label2id + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + inputs = self.tokenizer(example["text"], **self.tokenizer_params) + inputs = {k: v.squeeze() for k, v in inputs.items()} + if self.return_input_length: + inputs["input_length"] = inputs["attention_mask"].sum().item() + inputs["label"] = self.label2id.get(example["label"], None) + return inputs + + +# 定义任务的训练模块 +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + def build_optimizer(self, model): + self.logger.info("The custom optimizer is `Adan` optimizer") + optimizer_grouped_parameters = self.build_model_param_optimizer(model) + optimizer = Adan(params=optimizer_grouped_parameters, + lr=self.opts.learning_rate, + eps=self.opts.adan_epsilon, + betas=(self.opts.adan_beta1, self.opts.adan_beta2, self.opts.adan_beta3), + weight_decay=self.opts.weight_decay) + return optimizer + + def build_lr_scheduler(self): + ''' + the learning rate scheduler. 
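+ Polynomial decay with warmup steps taken from build_warmup_steps, driving the Adan optimizer built above.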
+ ''' + warmup_steps = self.build_warmup_steps() + scheduler = get_polynomial_decay_schedule_with_warmup(optimizer=self.optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=self.num_update_training_steps) + return scheduler + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, ColaDataset.label2id(), + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return ColaDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + parser = Argparser.build_parser() + group = parser.add_argument_group(title="Adan optimizer", description="Adan pointer") + group.add_argument("--adan_beta1", default=0.9, type=float, help="Beta1 for optimizer") + group.add_argument("--adan_beta2", default=0.999, type=float, help='Beta2 for optimizer') + group.add_argument("--adan_beta3", default=0.999, type=float, help='Beta2 for optimizer') + group.add_argument('--adan_epsilon', default=1e-08, type=float) + opts = parser.build_args_from_parser(parser) + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(ColaDataset.label2id()) + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=[MattewsCorrcoef(task="multiclass", num_classes=opts.num_labels)], + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + # do eval + if opts.do_eval: + checkpoints = [] + if opts.checkpoint_predict_code is not None: + checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) + check_dir(checkpoint) + checkpoints.append(checkpoint) + if opts.eval_all_checkpoints: + checkpoints = find_all_checkpoints(ckpt_dir=opts.output_dir) + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + prefix = checkpoint.split("/")[-1] + model = model_class.from_pretrained(checkpoint, config=config) + model.to(opts.device) + trainer.evaluate(model=model, dev_data=dev_dataset, save_result=True, save_dir=prefix) + # do predict + if opts.do_predict: + test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) + 
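+ # prediction only runs on the checkpoint directory named by opts.checkpoint_predict_code under output_dir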
checkpoints = [] + if opts.checkpoint_predict_code is not None: + checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) + check_dir(checkpoint) + checkpoints.append(checkpoint) + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + prefix = checkpoint.split("/")[-1] + model = model_class.from_pretrained(checkpoint, config=config) + model.to(opts.device) + trainer.predict(model=model, test_data=test_dataset, save_result=True, save_dir=prefix) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_classification_wsc.py b/examples/task_text_classification_wsc.py new file mode 100644 index 0000000..3f4dd71 --- /dev/null +++ b/examples/task_text_classification_wsc.py @@ -0,0 +1,203 @@ +import os +import json +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.utils import check_dir +from torchblocks.utils import find_all_checkpoints +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +# 定义数据集加载 +class WscDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["true", "false"] + + def read_data(self, input_file: str): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text = line['text'] + span = line['target'] + label = line['label'] if data_type != 'test' else None + examples.append(dict(guid=guid, text=text, span=span, label=label)) + return examples + + +# 数据的处理 +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, tokenizer, label2id, tokenizer_params, return_input_length=False): + self.tokenizer = tokenizer + self.label2id = label2id + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + text_a = example['text'] + text_a_list = list(text_a) + target = example['span'] + query = 
target['span1_text'] + query_idx = target['span1_index'] + pronoun = target['span2_text'] + pronoun_idx = target['span2_index'] + assert text_a[pronoun_idx: (pronoun_idx + len(pronoun))] == pronoun, "pronoun: {}".format(pronoun) + assert text_a[query_idx: (query_idx + len(query))] == query, "query: {}".format(query) + if pronoun_idx > query_idx: + text_a_list.insert(query_idx, "_") + text_a_list.insert(query_idx + len(query) + 1, "_") + text_a_list.insert(pronoun_idx + 2, "[") + text_a_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") + else: + text_a_list.insert(pronoun_idx, "[") + text_a_list.insert(pronoun_idx + len(pronoun) + 1, "]") + text_a_list.insert(query_idx + 2, "_") + text_a_list.insert(query_idx + len(query) + 2 + 1, "_") + text_a = "".join(text_a_list) + inputs = self.tokenizer(text_a, **self.tokenizer_params) + inputs = {k: v.squeeze() for k, v in inputs.items()} + inputs["label"] = self.label2id.get(example["label"], None) + return inputs + + +# 定义任务的训练模块 +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, WscDataset.label2id(), + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return WscDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'macbert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=[Accuracy(task="multiclass",num_classes=opts.num_labels)], + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + # do eval + if opts.do_eval: + checkpoints = [] + if opts.checkpoint_predict_code is not None: + checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) + check_dir(checkpoint) + checkpoints.append(checkpoint) + if opts.eval_all_checkpoints: + checkpoints = find_all_checkpoints(ckpt_dir=opts.output_dir) + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + prefix = 
checkpoint.split("/")[-1] + model = model_class.from_pretrained(checkpoint, config=config) + model.to(opts.device) + trainer.evaluate(model=model, dev_data=dev_dataset, save_result=True, save_dir=prefix) + # do predict + if opts.do_predict: + test_dataset = load_data(opts.test_input_file, opts.data_dir, "test", tokenizer, opts.test_max_seq_length) + checkpoints = [] + if opts.checkpoint_predict_code is not None: + checkpoint = os.path.join(opts.output_dir, opts.checkpoint_predict_code) + check_dir(checkpoint) + checkpoints.append(checkpoint) + logger.info("Evaluate the following checkpoints: %s", checkpoints) + for checkpoint in checkpoints: + prefix = checkpoint.split("/")[-1] + model = model_class.from_pretrained(checkpoint, config=config) + model.to(opts.device) + trainer.predict(model=model, test_data=test_dataset, save_result=True, save_dir=prefix) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_classify_fewshot_pet.py b/examples/task_text_classify_fewshot_pet.py new file mode 100644 index 0000000..9caefae --- /dev/null +++ b/examples/task_text_classify_fewshot_pet.py @@ -0,0 +1,237 @@ +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel +from transformers.models.bert.modeling_bert import BertOnlyMLMHead + + +class BertForMaskedLM(BertPreTrainedModel, Application): + # _keys_to_ignore_on_load_unexpected = [r"pooler"] + # _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + self.init_weights() + + def compute_loss(self, outputs, labels, **kwargs): + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(outputs.view(-1, self.config.vocab_size), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert( + inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids'] + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + masked_lm_loss = None + labels = inputs.get("labels", None) + if labels is not None: + masked_lm_loss = self.compute_loss(prediction_scores, labels) + return {"loss": masked_lm_loss, "logits": prediction_scores} + + +# 定义数据集加载 +class FewShotPETDataset(DatasetBaseBuilder): + # collecate + keys_to_ignore_on_collate_batch = ['raw_label', 'mask_span_indices'] + # 动态batch处理过程中需要进行按照batch最长长度进行截取的keys + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels'] + + @staticmethod + def get_labels(): + return ["0", "1"] + + @staticmethod + def get_label_desc(): + return ["否", "能"] + + def read_data(self, input_file): + with open(input_file, 'r') as f: + data_rows = f.readlines() + return data_rows + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + lines = line.strip("\n").split("\t") + guid = f"{data_type}-{i}" + text_a = lines[1] + text_b = lines[2] + label = lines[3] if data_type != 'test' else None + 
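+ # rows are tab-separated with an unused leading column: columns 1 and 2 hold
+ # the sentence pair, column 3 the "0"/"1" label (omitted at test time)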
examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +# 数据的处理 +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, tokenizer, max_sequence_length, label2desc): + self.tokenizer = tokenizer + self.label2desc = label2desc + self.max_sequence_length = max_sequence_length + self.vocab_size = len(self.tokenizer.vocab) + self.pad_idx = self.tokenizer.pad_token_id + self.label_length = len(tokenizer.tokenize(list(label2desc.values())[0])) # label的长度对应mask个数 + + def truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def __call__(self, example): + # pattern: sent1,label,用,sent2,概括。 + text_a = example['text_a'] + text_b = example['text_b'] + raw_label = example['label'] + num_extra_tokens = self.label_length + 3 # num_extra_tokens:能/否 用 概 括。 + tokens_a = self.tokenizer.tokenize(text_a) + max_seq_length = self.max_sequence_length - num_extra_tokens + tokens_b = self.tokenizer.tokenize(text_b) + self.truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 2) # cls sep + + # 构建pattern: sent1,label,用,sent2,概括。 + tokens = [self.tokenizer.cls_token] # cls + tokens += tokens_a # text_a + label_position = len(tokens) + tokens += [self.tokenizer.mask_token] * self.label_length # [MASK]插入 + tokens += self.tokenizer.tokenize("用") # + tokens += tokens_b # text_b + tokens += self.tokenizer.tokenize("概括") + tokens += [self.tokenizer.sep_token] # sep + # 转化 + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + length = len(input_ids) + attention_mask = [1] * length + token_type_ids = [0] * length + # token padding + padding_length = self.max_sequence_length - length + input_ids += [self.pad_idx] * padding_length + attention_mask += [self.pad_idx] * padding_length + token_type_ids += [0] * padding_length + mask_span_indices = [] + for i in range(self.label_length): + mask_span_indices.append([label_position + i]) + mask_labels = None + if raw_label is not None: + label_desc = self.label2desc[raw_label] + label_desc_tokens = self.tokenizer.tokenize(label_desc) + label_tokens_ids = self.tokenizer.convert_tokens_to_ids(label_desc_tokens) + mask_labels = [-100] * self.max_sequence_length + for i in range(self.label_length): + mask_labels[label_position + i] = label_tokens_ids[i] + return { + 'input_ids': torch.tensor(input_ids), + 'attention_mask': torch.tensor(attention_mask), + 'token_type_ids': torch.tensor(token_type_ids), + 'label_ids': torch.tensor(mask_labels), + 'raw_label': int(raw_label), + 'mask_span_indices': mask_span_indices + } + + +# 定义任务的训练模块 +class FewShotPETTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + keys_to_ignore_on_gpu = ['raw_label', 'mask_span_indices'] # batch数据中不转换为GPU的变量名 + keys_to_ignore_on_save_result = ['input_ids', 'token_type_ids'] # eval和predict结果不存储的变量 + keys_to_ignore_on_save_checkpoint = ["optimizer"] # checkpoint中不存储的模块,比如'optimizer' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + batch_num = len(batches['logits']) + desc2ids = self.opts.desc2ids + label2desc = self.opts.label2desc + desc2label = {value: key for key, value in label2desc.items()} + target = [] + preds = [] + # 处理desc到真实label的映射 + for b in range(batch_num): + logits = batches['logits'][b].float().log_softmax(dim=-1) + mask_span_indices = 
batches['mask_span_indices'][b] + raw_labels = batches['raw_label'][b] + for i in range(logits.shape[0]): + y_logits = logits[i] + indices = mask_span_indices[i] + target.append(raw_labels[i]) + pred_label_probs = [] + # 计算预测标签prob + for key, value in desc2ids.items(): + pred_prob = 0. + # subword采用相加方式 + for l_ids, span_indices in zip(value, indices): + span_idx = span_indices[0] + pred_prob += y_logits[span_idx, l_ids] + pred_label_probs.append([key, pred_prob]) + pred_label = sorted(pred_label_probs, key=lambda x: x[1], reverse=True)[0][0] + preds.append(int(desc2label[pred_label])) + return {'preds': torch.tensor(preds), 'target': torch.tensor(target)} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, max_sequence_length, opts.label2desc)] + return FewShotPETDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForMaskedLM, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + labels = FewShotPETDataset.get_labels() + label_desc = FewShotPETDataset.get_label_desc() + label2desc = dict(zip(labels, label_desc)) # 原始标签与desc的对应关系 + desc2ids = {key: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(key)) for key in label_desc} + opts.desc2ids = desc2ids # 每一个label desc对应的tokenizer的ids,主要用于eval和test过程 + opts.label2desc = label2desc + opts.num_labels = len(labels) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = FewShotPETTrainer(opts=opts, model=model, + metrics=[Accuracy(task="multiclass", num_classes=opts.num_labels)], logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}, + convert_output_cuda_to_cpu=True) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_classify_fewshot_ptuning.py b/examples/task_text_classify_fewshot_ptuning.py new file mode 100644 index 0000000..3341296 --- /dev/null +++ b/examples/task_text_classify_fewshot_ptuning.py @@ -0,0 +1,240 @@ +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel +from transformers.models.bert.modeling_bert import BertOnlyMLMHead + + +class BertForMaskedLM(BertPreTrainedModel, Application): + 
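+ # the MLM head has no pooler, so silence from_pretrained warnings about the
+ # unexpected pooler weights and the missing decoder bias / position ids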
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+        self.init_weights()
+
+    def compute_loss(self, outputs, labels, **kwargs):
+        loss_fct = nn.CrossEntropyLoss()  # -100 index = padding token
+        loss = loss_fct(outputs.view(-1, self.config.vocab_size), labels.view(-1))
+        return loss
+
+    def forward(self, inputs):
+        outputs = self.bert(
+            inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
+            token_type_ids=inputs['token_type_ids']
+        )
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+        masked_lm_loss = None
+        labels = inputs.get("labels", None)
+        if labels is not None:
+            masked_lm_loss = self.compute_loss(prediction_scores, labels)
+        return {"loss": masked_lm_loss, "logits": prediction_scores}
+
+
+# Dataset loading
+class FewShotPtuningDataset(DatasetBaseBuilder):
+    # keys skipped when collating a batch
+    keys_to_ignore_on_collate_batch = ['raw_label', 'mask_span_indices']
+    # keys truncated to the longest length in the batch during dynamic batching
+    keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']
+
+    @staticmethod
+    def get_labels():
+        return ["0", "1"]
+
+    @staticmethod
+    def get_label_desc():
+        return ["否", "能"]
+
+    def read_data(self, input_file):
+        with open(input_file, 'r') as f:
+            data_rows = f.readlines()
+        return data_rows
+
+    def build_examples(self, data, data_type):
+        examples = []
+        for (i, line) in enumerate(data):
+            lines = line.strip("\n").split("\t")
+            guid = f"{data_type}-{i}"
+            text_a = lines[1]
+            text_b = lines[2]
+            label = lines[3] if data_type != 'test' else None
+            examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+# Data processing
+class ProcessEncodeText:
+    """ Encode the paired task text and append the encodings to the original example """
+
+    def __init__(self, tokenizer, max_sequence_length, label2desc):
+        self.tokenizer = tokenizer
+        self.label2desc = label2desc
+        self.max_sequence_length = max_sequence_length
+        self.vocab_size = len(self.tokenizer.vocab)
+        self.pad_idx = self.tokenizer.pad_token_id
+        self.label_length = len(tokenizer.tokenize(list(label2desc.values())[0]))  # the label length determines the number of [MASK] tokens
+
+    def truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+        while True:
+            total_length = len(tokens_a) + len(tokens_b)
+            if total_length <= max_length:
+                break
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+
+    def __call__(self, example):
+        # pattern: sent1 [unused0] label [unused1] sent2
+        text_a = example['text_a']
+        text_b = example['text_b']
+        raw_label = example['label']
+        num_extra_tokens = self.label_length + 2  # label_length [MASK]s plus [unused0]/[unused1]
+        tokens_a = self.tokenizer.tokenize(text_a)
+        max_seq_length = self.max_sequence_length - num_extra_tokens
+        tokens_b = self.tokenizer.tokenize(text_b)
+        self.truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 2)  # cls sep
+
+        # build the pattern: [CLS] sent1 [unused0] [MASK]*label_length [unused1] sent2 [SEP]
+        tokens = [self.tokenizer.cls_token]  # cls
+        tokens += tokens_a  # text_a
+        tokens += ['[unused0]']
+        label_position = len(tokens)
+        tokens += [self.tokenizer.mask_token] * self.label_length  # insert the [MASK] tokens
+        tokens += ['[unused1]']
+        tokens += tokens_b  # text_b
+        tokens += [self.tokenizer.sep_token]  # sep
+        # convert tokens to ids
+        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+        length = len(input_ids)
+        attention_mask = [1] * length
+        token_type_ids = [0] * length
+        # token padding
+        padding_length = self.max_sequence_length - length
+        input_ids += [self.pad_idx] * padding_length
+        attention_mask += [self.pad_idx] * padding_length
+        token_type_ids += [0] * padding_length
+        mask_span_indices = []
+        for i in range(self.label_length):
+            mask_span_indices.append([label_position + i])
+        mask_labels = None
+        if raw_label is not None:
+            label_desc = self.label2desc[raw_label]
+            label_desc_tokens = self.tokenizer.tokenize(label_desc)
+            label_tokens_ids = self.tokenizer.convert_tokens_to_ids(label_desc_tokens)
+            mask_labels = [-100] * self.max_sequence_length
+            for i in range(self.label_length):
+                mask_labels[label_position + i] = label_tokens_ids[i]
+        return {
+            'input_ids': torch.tensor(input_ids),
+            'attention_mask': torch.tensor(attention_mask),
+            'token_type_ids': torch.tensor(token_type_ids),
+            'label_ids': torch.tensor(mask_labels),
+            'raw_label': int(raw_label),
+            'mask_span_indices': mask_span_indices
+        }
+
+
+# Task-specific training module
+class FewShotPtuningTrainer(TrainBaseBuilder):
+    '''
+    Text classification
+    '''
+    keys_to_ignore_on_gpu = ['raw_label', 'mask_span_indices']  # batch keys that are not moved to the GPU, e.g. 'input_length'
+    keys_to_ignore_on_save_result = ['input_ids', 'token_type_ids']  # keys excluded from saved eval/predict results
+    keys_to_ignore_on_save_checkpoint = ["optimizer"]  # modules not stored in the checkpoint, e.g. 'optimizer'
+
+    # bridges the model outputs and the metric inputs
+    def process_batch_outputs(self, batches, dim=0):
+        batch_num = len(batches['logits'])
+        desc2ids = self.opts.desc2ids
+        label2desc = self.opts.label2desc
+        desc2label = {value: key for key, value in label2desc.items()}
+        target = []
+        preds = []
+        # map label descriptions back to the original labels
+        for b in range(batch_num):
+            logits = batches['logits'][b].float().log_softmax(dim=-1)
+            mask_span_indices = batches['mask_span_indices'][b]
+            raw_labels = batches['raw_label'][b]
+            for i in range(logits.shape[0]):
+                y_logits = logits[i]
+                indices = mask_span_indices[i]
+                target.append(raw_labels[i])
+                pred_label_probs = []
+                # score each candidate label description
+                for key, value in desc2ids.items():
+                    pred_prob = 0.
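+                    # value holds the token ids of one label description (e.g. "能") and
+                    # indices the [MASK] positions it occupies; the loop below accumulates
+                    # the log-probabilities of those ids, and the best-scoring description
+                    # is later mapped back to its original label via desc2label.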
+ # subword采用相加方式 + for l_ids, span_indices in zip(value, indices): + span_idx = span_indices[0] + pred_prob += y_logits[span_idx, l_ids] + pred_label_probs.append([key, pred_prob]) + pred_label = sorted(pred_label_probs, key=lambda x: x[1], reverse=True)[0][0] + preds.append(int(desc2label[pred_label])) + return {'preds': torch.tensor(preds), 'target': torch.tensor(target)} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, max_sequence_length, opts.label2desc)] + return FewShotPtuningDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForMaskedLM, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + labels = FewShotPtuningDataset.get_labels() + label_desc = FewShotPtuningDataset.get_label_desc() + label2desc = dict(zip(labels, label_desc)) # 原始标签与desc的对应关系 + desc2ids = {key: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(key)) for key in label_desc} + opts.desc2ids = desc2ids # 每一个label desc对应的tokenizer的ids,主要用于eval和test过程 + opts.label2desc = label2desc + opts.num_labels = len(labels) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + metric = Accuracy(task="multiclass", num_classes=opts.num_labels) + trainer = FewShotPtuningTrainer(opts=opts, model=model, metrics=[metric], logger=logger) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_classify_tnews.py b/examples/task_text_classify_tnews.py new file mode 100644 index 0000000..14df6aa --- /dev/null +++ b/examples/task_text_classify_tnews.py @@ -0,0 +1,156 @@ +import json +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.matthews_corrcoef import MattewsCorrcoef +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = 
nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +# 定义数据集加载 +class TnewsDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + labels = [] + for i in range(17): + if i == 5 or i == 11: + continue + labels.append(str(100 + i)) + return labels + + def read_data(self, input_file: str): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text = line['sentence'] + label = line['label'] if data_type != 'test' else None + examples.append(dict(guid=guid, text=text, label=label)) + return examples + +# 数据的处理 +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, tokenizer, label2id, tokenizer_params, return_input_length=False): + self.tokenizer = tokenizer + self.label2id = label2id + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + inputs = self.tokenizer(example["text"], **self.tokenizer_params) + inputs = {k: v.squeeze() for k, v in inputs.items()} + if example['label'] is not None: + inputs["label"] = self.label2id[example["label"]] + return inputs + + +# 定义任务的训练模块 +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, TnewsDataset.label2id(), + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return TnewsDataset(opts, file_name, data_type, process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + + # model + logger.info("initializing model and config") + config = 
config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.output_hidden_states = False + config.hidden_dropout_prob = 0. + config.attention_probs_dropout_prob = 0. + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=[MattewsCorrcoef(task="multiclass",num_classes=opts.num_labels)], + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_match_afqmc.py b/examples/task_text_match_afqmc.py new file mode 100644 index 0000000..0c85373 --- /dev/null +++ b/examples/task_text_match_afqmc.py @@ -0,0 +1,156 @@ +import json +import torch +import torch.nn as nn +from typing import Any +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +class AfqmcDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["0", "1"] + + def read_data(self, input_file: str) -> Any: + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = line['sentence1'] + text_b = line['sentence2'] + label = line['label'] if data_type != 'test' else None + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, label2id, tokenizer, tokenizer_params, return_input_length=False): + self.label2id = label2id + self.tokenizer = tokenizer + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + encoding = self.tokenizer.encode_plus(text=example["text_a"], + 
text_pair=example['text_b'], + **self.tokenizer_params) + encoding = {k: v.squeeze() for k, v in encoding.items()} + encoding["label"] = self.label2id.get(example["label"], None) + return encoding + + +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(AfqmcDataset.label2id(), + tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return AfqmcDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.update( + { + "output_hidden_states": True, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-7, + "add_pooling_layer": False + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass",num_classes=opts.num_labels), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + +if __name__ == "__main__": + main() diff --git a/examples/task_text_match_cmnli.py b/examples/task_text_match_cmnli.py new file mode 100644 index 0000000..8e33bd7 --- /dev/null +++ b/examples/task_text_match_cmnli.py @@ -0,0 +1,164 @@ +import json +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else 
config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +class CmnliDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["contradiction", "entailment", "neutral"] + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = line['sentence1'] + text_b = line['sentence2'] + if data_type == 'test': + label = None + else: + if line['label'] == '-': + continue + label = line['label'] + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, label2id, tokenizer, tokenizer_params, return_input_length=False): + self.label2id = label2id + self.tokenizer = tokenizer + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + encoding = self.tokenizer.encode_plus(text=example["text_a"], + text_pair=example['text_b'], + **self.tokenizer_params) + encoding = {k: v.squeeze() for k, v in encoding.items()} + if example['label'] is not None: + encoding["label"] = self.label2id[example["label"]] + return encoding + + +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(CmnliDataset.label2id(), + tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return CmnliDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, 
opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.update( + { + "output_hidden_states": False, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-7, + "add_pooling_layer": False + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass",num_classes=opts.num_labels), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_match_csl.py b/examples/task_text_match_csl.py new file mode 100644 index 0000000..1956e1c --- /dev/null +++ b/examples/task_text_match_csl.py @@ -0,0 +1,157 @@ +import json +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +class CslDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["0", "1"] + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = " ".join(line['keyword']) + text_b = line['abst'] + label = line['label'] if data_type != 'test' else None + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, label2id, tokenizer, tokenizer_params, return_input_length=False): + self.label2id = label2id + self.tokenizer = tokenizer + self.tokenizer_params = 
tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + encoding = self.tokenizer.encode_plus(text=example["text_a"], + text_pair=example['text_b'], + **self.tokenizer_params) + encoding = {k: v.squeeze() for k, v in encoding.items()} + encoding["label"] = self.label2id.get(example["label"], None) + return encoding + + +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(CslDataset.label2id(), + tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return CslDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.update( + { + "output_hidden_states": False, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-7, + "add_pooling_layer": False + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass",num_classes=opts.num_labels), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_match_ocnli.py b/examples/task_text_match_ocnli.py new file mode 100644 index 0000000..843801b --- /dev/null +++ b/examples/task_text_match_ocnli.py @@ -0,0 +1,162 @@ +import json +import torch +import torch.nn as nn +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.num_labels = 
config.num_labels + self.config = config + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def compute_loss(self, outputs, labels): + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(outputs.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + outputs = self.bert(inputs['input_ids'], + attention_mask=inputs['attention_mask'], + token_type_ids=inputs['token_type_ids']) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +class OcnliDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["contradiction", "entailment", "neutral"] + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = line['sentence1'] + text_b = line['sentence2'] + if data_type == 'test': + label = None + else: + if line['label'] == '-': + continue + label = line['label'] + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, label2id, tokenizer, tokenizer_params, return_input_length=False): + self.label2id = label2id + self.tokenizer = tokenizer + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + encoding = self.tokenizer.encode_plus(text=example["text_a"], + text_pair=example['text_b'], + **self.tokenizer_params) + encoding = {k: v.squeeze() for k, v in encoding.items()} + if example['label'] is not None: + encoding["label"] = self.label2id[example["label"]] + return encoding + + +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(OcnliDataset.label2id(), + tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return OcnliDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, 
do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + config.update( + { + "output_hidden_states": True, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-7, + "add_pooling_layer": False + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, model=model, + metrics=Accuracy(task="multiclass", num_classes=opts.num_labels), logger=logger) + + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_similarity_ccks2021.py b/examples/task_text_similarity_ccks2021.py new file mode 100644 index 0000000..396c62e --- /dev/null +++ b/examples/task_text_similarity_ccks2021.py @@ -0,0 +1,187 @@ +import json +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.initializer_range = config.initializer_range + self.num_labels = config.num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(p=0.2) + self.high_dropout = nn.Dropout(p=0.5) + n_weights = config.num_hidden_layers + 1 + weights_init = torch.zeros(n_weights).float() + weights_init.data[:-1] = -3 + self.layer_weights = torch.nn.Parameter(weights_init) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + self._init_weights(self.classifier) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def compute_loss(self, logits, labels): + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + input_ids =inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + labels = inputs.get("labels",None) + outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + hidden_layers = outputs[2] + cls_outputs = torch.stack( + [self.dropout(layer[:, 0, :]) for layer in hidden_layers], dim=2 + ) + cls_output = 
(torch.softmax(self.layer_weights, dim=0) * cls_outputs).sum(-1) + logits = torch.mean( + torch.stack( + [self.classifier(self.high_dropout(cls_output)) for _ in range(5)], + dim=0, + ), + dim=0, + ) + loss = self.compute_loss(logits, labels) + outputs = {"loss": loss, "logits": logits} + return outputs + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +class CCKSDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return {'不匹配': 0, '部分匹配': 1, '完全匹配': 2} + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + lines = [] + for line in f.readlines(): + lines.append(json.loads(line)) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = line['query'] + text_b = line['candidate'] + label = line['label'] if data_type != 'test' else None + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, label2id, tokenizer, tokenizer_params, return_input_length=False): + self.label2id = label2id + self.tokenizer = tokenizer + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + encoding = self.tokenizer.encode_plus(text=example["text_a"], + text_pair=example['text_b'], + **self.tokenizer_params) + encoding = {k: v.squeeze() for k, v in encoding.items()} + encoding["label"] = self.label2id.get(example["label"], None) + return encoding + + +# 定义任务的训练模块 +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(CCKSDataset.label2id(), + tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }) + ] + return CCKSDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.label2id()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path) + config.update( + { + "output_hidden_states": True, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-7, + "add_pooling_layer": False, + "num_labels": opts.num_labels, + } + ) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + 
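+    # "output_hidden_states": True in config.update above is required: the forward
+    # pass reads outputs[2] and pools a softmax-weighted mix of each layer's [CLS]
+    # vector, then averages the classifier over five high-dropout samples.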
logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass",num_classes=opts.num_labels), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/examples/task_text_similarity_lcqmc.py b/examples/task_text_similarity_lcqmc.py new file mode 100644 index 0000000..93b0a3d --- /dev/null +++ b/examples/task_text_similarity_lcqmc.py @@ -0,0 +1,162 @@ +import torch +import csv +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torchblocks.core import TrainBaseBuilder, Application +from torchblocks.data import DatasetBaseBuilder +from torchblocks.utils import seed_everything +from torchblocks.utils.options import Argparser +from torchblocks.utils.device import build_device +from torchblocks.utils.logger import Logger +from torchblocks.metrics.classification.accuracy import Accuracy +from transformers import BertPreTrainedModel, BertConfig, BertTokenizer, BertModel + + +class BertForSequenceClassification(BertPreTrainedModel, Application): + def __init__(self, config): + super().__init__(config) + self.initializer_range = config.initializer_range + self.num_labels = config.num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(p=0.2) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + self.init_weights() + + def compute_loss(self, logits, labels): + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + + def forward(self, inputs): + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + token_type_ids = inputs['token_type_ids'] + outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + loss = None + labels = inputs.get("labels", None) + if labels is not None: + loss = self.compute_loss(logits, labels) + return {"loss": loss, "logits": logits} + + +class LcqmcDataset(DatasetBaseBuilder): + + @staticmethod + def get_labels(): + return ["0", "1"] + + def read_data(self, input_file): + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=None) + lines = [] + for line in reader: + lines.append(line) + return lines + + def build_examples(self, data, data_type): + examples = [] + for (i, line) in enumerate(data): + guid = f"{data_type}-{i}" + text_a = line[0] + text_b = line[1] + label = str(int(line[2])) if data_type != 'test' else None + examples.append(dict(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ProcessEncodeText: + """ 编码单句任务文本,在原有example上追加 """ + + def __init__(self, tokenizer,tokenizer_params, return_input_length=False): + self.tokenizer = tokenizer + self.tokenizer_params = tokenizer_params + self.return_input_length = return_input_length + + def __call__(self, example): + inputs = self.tokenizer.encode_plus(text=example["text_a"], text_pair=example['text_b'], + **self.tokenizer_params) + inputs = {k: v.squeeze() for k, v in inputs.items()} + if example['label'] is not None: + inputs["label"] = example["label"] + return inputs + + +class ProcessEncodeLabel: + """ 编码单标签文本标签 """ + + def __init__(self, label2id): + self.label2id = label2id + + def __call__(self, 
example): + example["label"] = self.label2id.get(example["label"], None) + return example + + +def load_data(opts, file_name, data_type, tokenizer, max_sequence_length): + process_piplines = [ProcessEncodeText(tokenizer, + tokenizer_params={ + "padding": "max_length", + "truncation": "longest_first", + "max_length": max_sequence_length, + "return_tensors": "pt", + }), + ProcessEncodeLabel(LcqmcDataset.label2id()) + ] + return LcqmcDataset(opts, file_name, data_type=data_type, process_piplines=process_piplines) + + +class TextClassifierTrainer(TrainBaseBuilder): + ''' + 文本分类 + ''' + + # 跟model的输出、metric的输入相关 + def process_batch_outputs(self, batches, dim=0): + preds = torch.cat([batch for batch in batches['logits']], dim=dim) + target = torch.cat([batch for batch in batches['labels']], dim=dim) + return {"preds": preds, "target": target} + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer) +} + + +def main(): + opts = Argparser().build_arguments() + logger = Logger(opts=opts) + # device + logger.info("initializing device") + opts.device, opts.device_num = build_device(opts.device_id) + seed_everything(opts.seed) + config_class, model_class, tokenizer_class = MODEL_CLASSES[opts.model_type] + # data processor + logger.info("initializing data processor") + tokenizer = tokenizer_class.from_pretrained(opts.pretrained_model_path, do_lower_case=opts.do_lower_case) + train_dataset = load_data(opts, opts.train_input_file, "train", tokenizer, opts.train_max_seq_length) + dev_dataset = load_data(opts, opts.eval_input_file, "dev", tokenizer, opts.eval_max_seq_length) + opts.num_labels = len(train_dataset.get_labels()) + # model + logger.info("initializing model and config") + config = config_class.from_pretrained(opts.pretrained_model_path, num_labels=opts.num_labels) + model = model_class.from_pretrained(opts.pretrained_model_path, config=config) + model.to(opts.device) + # trainer + logger.info("initializing traniner") + trainer = TextClassifierTrainer(opts=opts, + model=model, + metrics=Accuracy(task="multiclass",num_classes=opts.num_labels), + logger=logger + ) + # do train + if opts.do_train: + trainer.train(train_data=train_dataset, dev_data=dev_dataset, state_to_save={'vocab': tokenizer}) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0809b57 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[tool.black] +line-length = 100 +include = '\.pyi?$' +extend-exclude = ''' +( + \docs + | \tests +) +''' +skip-string-normalization = true # Avoid black replace all single quotes to the double + +[tool.isort] +profile = "black" +src_paths = ["torchblocks", "scripts", "tests"] +skip_gitignore = true +known_first_party = ["torchblocks"] + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 index d39911b..8129899 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,10 @@ tokenizers >= 0.7.0 -transformers >= 4.1.1 +transformers>=4.10.0 sacremoses sentencepiece scipy scikit-learn -torchmetrics>=0.6.0 \ No newline at end of file +matplotlib +zhconv +torchmetrics>=0.11.3 +torch>=1.10.0,<1.13.0 \ No newline at end of file diff --git a/scripts/run_task_fewshot_pet.sh b/scripts/run_task_fewshot_pet.sh new file mode 100644 index 0000000..c6577bb --- /dev/null +++ b/scripts/run_task_fewshot_pet.sh @@ -0,0 +1,35 @@ +CURRENT_DIR=`pwd` +export 
MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=fewshot +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_classify_fewshot_pet.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --checkpoint_mode=max \ + --experiment_name='ver_001' \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=fewshot_train.tsv \ + --eval_input_file=fewshot_dev.tsv \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=256 \ + --eval_max_seq_length=256 \ + --per_gpu_train_batch_size=8 \ + --per_gpu_eval_batch_size=16 \ + --learning_rate=3e-5 \ + --num_train_epochs=10 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=cosine \ + --scheduler_on=batch \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_fewshot_ptuning.sh b/scripts/run_task_fewshot_ptuning.sh new file mode 100644 index 0000000..ec96bd9 --- /dev/null +++ b/scripts/run_task_fewshot_ptuning.sh @@ -0,0 +1,35 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=fewshot +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_classify_fewshot_ptuning.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --checkpoint_mode=max \ + --experiment_name='ver_001' \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=fewshot_train.tsv \ + --eval_input_file=fewshot_dev.tsv \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=256 \ + --eval_max_seq_length=256 \ + --per_gpu_train_batch_size=8 \ + --per_gpu_eval_batch_size=16 \ + --learning_rate=3e-5 \ + --num_train_epochs=10 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=cosine \ + --scheduler_on=batch \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_pretrain_ccks.sh b/scripts/run_task_pretrain_ccks.sh new file mode 100644 index 0000000..44fceed --- /dev/null +++ b/scripts/run_task_pretrain_ccks.sh @@ -0,0 +1,35 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/macbert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=ccks2021 +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_pretrain_ccks.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --checkpoint_mode=min \ + --experiment_name='ver_001' \ + --checkpoint_monitor=train_loss \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=round1_train.txt \ + --eval_input_file=round2_train.txt \ + --test_input_file=Xeon3NLP_round1_test_20210524.txt \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --mlm_probability=0.15 \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=16 \ + --learning_rate=1e-4 \ + --num_train_epochs=20 \ + --gradient_accumulation_steps=8 \ + --warmup_rate=0.1 \ + 
--scheduler_type=cosine \ + --logging_steps=1000 \ + --save_steps=4000 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_cner_beam_search_softmax.sh b/scripts/run_task_sequence_labeling_cner_beam_search_softmax.sh new file mode 100644 index 0000000..f05cae7 --- /dev/null +++ b/scripts/run_task_sequence_labeling_cner_beam_search_softmax.sh @@ -0,0 +1,39 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=cner +export MODEL_TYPE=bert + +#-----------training----------------- + +python examples/task_sequence_labeling_cner_beam_search_softmax.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_save_best \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_f1_micro \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=cner_train_fold0.json \ + --eval_input_file=cner_dev_fold0.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --test_max_seq_length=512 \ + --per_gpu_train_batch_size=24 \ + --per_gpu_eval_batch_size=24 \ + --per_gpu_test_batch_size=24 \ + --learning_rate=3e-5 \ + --other_learning_rate=1e-3 \ + --num_train_epochs=15 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --decode_beam_size=3 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_resume_beam_search_softmax.sh b/scripts/run_task_sequence_labeling_resume_beam_search_softmax.sh new file mode 100644 index 0000000..1ab33cf --- /dev/null +++ b/scripts/run_task_sequence_labeling_resume_beam_search_softmax.sh @@ -0,0 +1,38 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=resume +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_sequence_labeling_resume_beam_search_softmax.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_save_best \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_f1_micro \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=200 \ + --eval_max_seq_length=512 \ + --test_max_seq_length=512 \ + --per_gpu_train_batch_size=24 \ + --per_gpu_eval_batch_size=24 \ + --per_gpu_test_batch_size=24 \ + --learning_rate=3e-5 \ + --other_learning_rate=1e-3 \ + --num_train_epochs=15 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --decode_beam_size=3 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_cner_global_pointer.sh b/scripts/run_task_sequence_labeling_resume_biaffine.sh old mode 100755 new mode 100644 similarity index 59% rename from scripts/run_task_sequence_labeling_cner_global_pointer.sh rename to scripts/run_task_sequence_labeling_resume_biaffine.sh index 6773c7f..11bcbcb --- a/scripts/run_task_sequence_labeling_cner_global_pointer.sh +++ b/scripts/run_task_sequence_labeling_resume_biaffine.sh @@ -2,31 +2,30 
@@ CURRENT_DIR=`pwd` export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn export DATA_DIR=$CURRENT_DIR/dataset export OUTPUR_DIR=$CURRENT_DIR/outputs -export TASK_NAME=cner +export TASK_NAME=resume export MODEL_TYPE=bert -python task_sequence_labeling_cner_global_pointer.py \ +python examples/task_sequence_labeling_resume_biaffine.py \ --task_name=$TASK_NAME \ --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ --model_type=$MODEL_TYPE \ --data_dir=dataset/$TASK_NAME/ \ - --do_train --do_eval \ - --use_rope \ - --pe_dim=64 \ - --evaluate_during_training \ - --experiment_code=pointer_v0 \ - --train_input_file=train.char.bmes \ - --eval_input_file=test.char.bmes \ - --test_input_file=test.char.bmes \ - --train_max_seq_length=128 \ + --do_train \ + --biaffine_ffnn_size=150 \ + --experiment_name=ver_001 \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_f1_micro \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ + --train_max_seq_length=200 \ --eval_max_seq_length=512 \ --test_max_seq_length=512 \ - --per_gpu_train_batch_size=24 \ + --per_gpu_train_batch_size=16 \ --per_gpu_eval_batch_size=24 \ --per_gpu_test_batch_size=24 \ --pretrained_model_path=$MODEL_DIR \ --learning_rate=3e-5 \ --other_learning_rate=1e-3 \ - --num_train_epochs=4 \ - --checkpoint_monitor=eval_f1_micro \ + --warmup_rate=0.1 \ + --num_train_epochs=10 \ --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_cner_crf.sh b/scripts/run_task_sequence_labeling_resume_crf.sh old mode 100755 new mode 100644 similarity index 71% rename from scripts/run_task_sequence_labeling_cner_crf.sh rename to scripts/run_task_sequence_labeling_resume_crf.sh index 615712c..0dd218b --- a/scripts/run_task_sequence_labeling_cner_crf.sh +++ b/scripts/run_task_sequence_labeling_resume_crf.sh @@ -2,35 +2,35 @@ CURRENT_DIR=`pwd` export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn export DATA_DIR=$CURRENT_DIR/dataset export OUTPUR_DIR=$CURRENT_DIR/outputs -export TASK_NAME=cner +export TASK_NAME=resume export MODEL_TYPE=bert #-----------training----------------- -python task_sequence_labeling_cner_crf.py \ +python examples/task_sequence_labeling_resume_crf.py \ --model_type=$MODEL_TYPE \ --pretrained_model_path=$MODEL_DIR \ --task_name=$TASK_NAME \ --do_train \ --do_lower_case \ --device_id='0' \ - --experiment_code='V0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ --checkpoint_monitor=eval_f1_micro \ --data_dir=$DATA_DIR/$TASK_NAME/ \ - --train_input_file=train.char.bmes \ - --eval_input_file=dev.char.bmes \ - --test_input_file=test.char.bmes \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ - --train_max_seq_length=128 \ + --train_max_seq_length=200 \ --eval_max_seq_length=512 \ --test_max_seq_length=512 \ --per_gpu_train_batch_size=24 \ --per_gpu_eval_batch_size=24 \ --per_gpu_test_batch_size=24 \ - --learning_rate=3e-5 \ + --learning_rate=2e-5 \ --other_learning_rate=1e-3 \ - --num_train_epochs=4 \ + --num_train_epochs=20 \ --gradient_accumulation_steps=1 \ - --warmup_proportion=0.1 \ + --warmup_rate=0.1 \ --logging_steps=-1 \ --save_steps=-1 \ --seed=42 diff --git a/scripts/run_task_sequence_labeling_resume_global_pointer.sh b/scripts/run_task_sequence_labeling_resume_global_pointer.sh new file mode 100644 index 0000000..b027d29 --- /dev/null +++ b/scripts/run_task_sequence_labeling_resume_global_pointer.sh @@ -0,0 +1,32 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export 
DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=resume +export MODEL_TYPE=bert + +python examples/task_sequence_labeling_resume_global_pointer.py \ + --task_name=$TASK_NAME \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --model_type=$MODEL_TYPE \ + --data_dir=dataset/$TASK_NAME/ \ + --do_train \ + --use_rope \ + --inner_dim=64 \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_f1 \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ + --train_max_seq_length=200 \ + --eval_max_seq_length=512 \ + --test_max_seq_length=512 \ + --per_gpu_train_batch_size=16 \ + --per_gpu_eval_batch_size=24 \ + --per_gpu_test_batch_size=24 \ + --pretrained_model_path=$MODEL_DIR \ + --learning_rate=3e-5 \ + --other_learning_rate=1e-3 \ + --num_train_epochs=10 \ + --warmup_rate=0.1 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_cner_span.sh b/scripts/run_task_sequence_labeling_resume_span.sh old mode 100755 new mode 100644 similarity index 65% rename from scripts/run_task_sequence_labeling_cner_span.sh rename to scripts/run_task_sequence_labeling_resume_span.sh index 9b209db..9fa6874 --- a/scripts/run_task_sequence_labeling_cner_span.sh +++ b/scripts/run_task_sequence_labeling_resume_span.sh @@ -2,35 +2,33 @@ CURRENT_DIR=`pwd` export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn export DATA_DIR=$CURRENT_DIR/dataset export OUTPUR_DIR=$CURRENT_DIR/outputs -export TASK_NAME=cner +export TASK_NAME=resume export MODEL_TYPE=bert -python task_sequence_labeling_cner_span.py \ +python examples/task_sequence_labeling_resume_span.py \ --task_name=$TASK_NAME \ + --pretrained_model_path=$MODEL_DIR \ --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ --model_type=$MODEL_TYPE \ --data_dir=dataset/$TASK_NAME/ \ --do_train \ - --do_eval \ --do_lower_case \ - --evaluate_during_training \ - --experiment_code=span_v0 \ - --train_input_file=train.char.bmes \ - --eval_input_file=dev.char.bmes \ - --test_input_file=test.char.bmes \ - --train_max_seq_length=128 \ + --experiment_name=ver_001 \ + --checkpoint_save_best \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_f1_micro \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ + --train_max_seq_length=200 \ --eval_max_seq_length=512 \ --test_max_seq_length=512 \ --per_gpu_train_batch_size=24 \ --per_gpu_eval_batch_size=24 \ --per_gpu_test_batch_size=24 \ - --pretrained_model_path=$MODEL_DIR \ - --learning_rate=2e-5 \ - --num_train_epochs=10 \ - --checkpoint_mode=max \ - --checkpoint_monitor=eval_f1_micro \ + --learning_rate=3e-5 \ + --num_train_epochs=6 \ --gradient_accumulation_steps=1 \ - --warmup_proportion=0.1 \ + --warmup_rate=0.1 \ --logging_steps=-1 \ --save_steps=-1 \ --seed=42 \ No newline at end of file diff --git a/scripts/run_task_sequence_labeling_cner_softmax.sh b/scripts/run_task_sequence_labeling_resume_token_mdp_softmax.sh old mode 100755 new mode 100644 similarity index 70% rename from scripts/run_task_sequence_labeling_cner_softmax.sh rename to scripts/run_task_sequence_labeling_resume_token_mdp_softmax.sh index d97be4b..5f7b0f2 --- a/scripts/run_task_sequence_labeling_cner_softmax.sh +++ b/scripts/run_task_sequence_labeling_resume_token_mdp_softmax.sh @@ -2,37 +2,35 @@ CURRENT_DIR=`pwd` export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn export DATA_DIR=$CURRENT_DIR/dataset export OUTPUR_DIR=$CURRENT_DIR/outputs -export TASK_NAME=cner +export TASK_NAME=resume export MODEL_TYPE=bert 
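The resume NER scripts above pass both --learning_rate (applied to the pretrained encoder) and --other_learning_rate=1e-3 (applied to the freshly initialized decoding head such as the CRF, biaffine, span, or global-pointer layer). As a minimal sketch of what that split amounts to, assuming a model that exposes its encoder as model.bert and using torch.optim.AdamW rather than the optimizer shipped in this patch:

from torch.optim import AdamW

def build_two_lr_optimizer(model, encoder_lr=3e-5, head_lr=1e-3, weight_decay=0.01):
    # Pretrained encoder parameters get the small fine-tuning learning rate;
    # everything else (the randomly initialized task head) gets the larger one.
    encoder_param_ids = {id(p) for p in model.bert.parameters()}  # `bert` attribute is an assumption
    encoder_params = [p for p in model.parameters() if id(p) in encoder_param_ids]
    head_params = [p for p in model.parameters() if id(p) not in encoder_param_ids]
    return AdamW([
        {"params": encoder_params, "lr": encoder_lr, "weight_decay": weight_decay},
        {"params": head_params, "lr": head_lr, "weight_decay": weight_decay},
    ])

The trainer added later in this patch (TrainBaseBuilder.build_model_param_optimizer) performs the same grouping, keyed on opts.base_model_name, and additionally exempts bias and LayerNorm weights from weight decay.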
#-----------training----------------- -python task_sequence_labeling_cner_softmax.py \ +python examples/task_sequence_labeling_resume_token_mdp_softmax.py \ --model_type=$MODEL_TYPE \ --pretrained_model_path=$MODEL_DIR \ --task_name=$TASK_NAME \ --do_train \ --do_lower_case \ --device_id='0' \ - --experiment_code='V0' \ + --experiment_name='ver_001' \ + --checkpoint_save_best \ + --checkpoint_mode=max \ --checkpoint_monitor=eval_f1_micro \ --data_dir=$DATA_DIR/$TASK_NAME/ \ - --train_input_file=train.char.bmes \ - --eval_input_file=dev.char.bmes \ - --test_input_file=test.char.bmes \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ - --train_max_seq_length=128 \ + --train_max_seq_length=200 \ --eval_max_seq_length=512 \ --test_max_seq_length=512 \ --per_gpu_train_batch_size=24 \ --per_gpu_eval_batch_size=24 \ --per_gpu_test_batch_size=24 \ --learning_rate=3e-5 \ - --num_train_epochs=10 \ + --num_train_epochs=6 \ --gradient_accumulation_steps=1 \ - --warmup_proportion=0.1 \ + --warmup_rate=0.1 \ --logging_steps=-1 \ --save_steps=-1 \ --seed=42 - - - diff --git a/scripts/run_task_text_classification_cola.sh b/scripts/run_task_text_classification_cola.sh old mode 100755 new mode 100644 index 5a38874..9831e9a --- a/scripts/run_task_text_classification_cola.sh +++ b/scripts/run_task_text_classification_cola.sh @@ -6,16 +6,17 @@ export TASK_NAME=cola export MODEL_TYPE=bert #-----------training----------------- -python task_text_classification_cola.py \ +python examples/task_text_classification_cola.py \ --model_type=$MODEL_TYPE \ --pretrained_model_path=$MODEL_DIR \ --task_name=$TASK_NAME \ --do_train \ --do_lower_case \ - --fp16 \ --device_id='0' \ - --experiment_code='V0' \ + --experiment_name='ver_001' \ --checkpoint_mode=max \ + --checkpoint_save_best \ + --scheduler_type=linear \ --checkpoint_monitor=eval_mcc \ --data_dir=$DATA_DIR/$TASK_NAME/ \ --train_input_file=train.tsv \ @@ -25,10 +26,10 @@ python task_text_classification_cola.py \ --eval_max_seq_length=128 \ --per_gpu_train_batch_size=32 \ --per_gpu_eval_batch_size=32 \ - --learning_rate=2e-5 \ - --num_train_epochs=3 \ + --learning_rate=3e-5 \ + --num_train_epochs=4 \ --gradient_accumulation_steps=1 \ - --warmup_proportion=0.1 \ + --warmup_rate=0.1 \ --logging_steps=-1 \ --save_steps=-1 \ --seed=42 diff --git a/scripts/run_task_text_classification_cola_adan.sh b/scripts/run_task_text_classification_cola_adan.sh new file mode 100644 index 0000000..70ab1c9 --- /dev/null +++ b/scripts/run_task_text_classification_cola_adan.sh @@ -0,0 +1,42 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-en +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=cola +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_classification_cola_adan.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --scheduler_type=linear \ + --checkpoint_monitor=eval_mcc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.tsv \ + --eval_input_file=dev.tsv \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=16 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=10 \ + --gradient_accumulation_steps=1 \ + 
--warmup_rate=0.1 \
+ --logging_steps=-1 \
+ --save_steps=-1 \
+ --weight_decay=0.01 \
+ --adan_beta1=0.98 \
+ --adan_beta2=0.99 \
+ --adan_beta3=0.99 \
+ --adan_epsilon=1e-08 \
+ --seed=42
+
+
+
diff --git a/scripts/run_task_text_classification_cola_awp.sh b/scripts/run_task_text_classification_cola_awp.sh
new file mode 100644
index 0000000..a691efa
--- /dev/null
+++ b/scripts/run_task_text_classification_cola_awp.sh
@@ -0,0 +1,44 @@
+CURRENT_DIR=`pwd`
+export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-en
+export DATA_DIR=$CURRENT_DIR/dataset
+export OUTPUR_DIR=$CURRENT_DIR/outputs
+export TASK_NAME=cola
+export MODEL_TYPE=bert
+
+#-----------training-----------------
+python examples/task_text_classification_cola.py \
+ --model_type=$MODEL_TYPE \
+ --pretrained_model_path=$MODEL_DIR \
+ --task_name=$TASK_NAME \
+ --do_train \
+ --device_id='0' \
+ --do_lower_case \
+ --experiment_name='fgm_awp_ver_001' \
+ --do_fgm \
+ --fgm_epsilon=0.1 \
+ --fgm_name=word_embeddings \
+ --do_awp \
+ --awp_name=weight \
+ --awp_epsilon=0.001 \
+ --awp_alpha=0.00001 \
+ --awp_number=1 \
+ --awp_start_epoch=2 \
+ --checkpoint_mode=max \
+ --checkpoint_monitor=eval_mcc \
+ --data_dir=$DATA_DIR/$TASK_NAME/ \
+ --train_input_file=train.tsv \
+ --eval_input_file=dev.tsv \
+ --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \
+ --train_max_seq_length=128 \
+ --eval_max_seq_length=128 \
+ --per_gpu_train_batch_size=32 \
+ --per_gpu_eval_batch_size=32 \
+ --learning_rate=3e-5 \
+ --num_train_epochs=4 \
+ --scheduler_type=cosine \
+ --max_grad_norm=1.0 \
+ --gradient_accumulation_steps=1 \
+ --warmup_rate=0.1 \
+ --logging_steps=-1 \
+ --save_steps=-1 \
+ --seed=42
diff --git a/scripts/run_task_text_classification_cola_fgm.sh b/scripts/run_task_text_classification_cola_fgm.sh
old mode 100755
new mode 100644
index 1aa8093..b562c3e
--- a/scripts/run_task_text_classification_cola_fgm.sh
+++ b/scripts/run_task_text_classification_cola_fgm.sh
@@ -6,18 +6,18 @@ export TASK_NAME=cola
 export MODEL_TYPE=bert
 
 #-----------training-----------------
-python task_text_classification_cola.py \
+python examples/task_text_classification_cola.py \
 --model_type=$MODEL_TYPE \
 --pretrained_model_path=$MODEL_DIR \
 --task_name=$TASK_NAME \
 --do_train \
 --device_id='0' \
 --do_lower_case \
- --experiment_code='V1' \
- --adv_enable \
- --adv_type='fgm' \
- --adv_epsilon=1.0 \
+ --experiment_name='fgm_ver_001' \
+ --do_fgm \
+ --fgm_epsilon=1.0 \
 --scheduler_type=linear \
+ --checkpoint_mode=max \
 --checkpoint_monitor=eval_mcc \
 --data_dir=$DATA_DIR/$TASK_NAME/ \
 --train_input_file=train.tsv \
@@ -30,7 +30,7 @@ python task_text_classification_cola.py \
 --learning_rate=3e-5 \
 --num_train_epochs=4 \
 --gradient_accumulation_steps=1 \
- --warmup_proportion=0.1 \
+ --warmup_rate=0.1 \
 --logging_steps=-1 \
 --save_steps=-1 \
 --seed=42
\ No newline at end of file
diff --git a/scripts/run_task_text_classification_cola_pgd.sh b/scripts/run_task_text_classification_cola_pgd.sh
new file mode 100644
index 0000000..1f27e37
--- /dev/null
+++ b/scripts/run_task_text_classification_cola_pgd.sh
@@ -0,0 +1,37 @@
+CURRENT_DIR=`pwd`
+export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-en
+export DATA_DIR=$CURRENT_DIR/dataset
+export OUTPUR_DIR=$CURRENT_DIR/outputs
+export TASK_NAME=cola
+export MODEL_TYPE=bert
+
+#-----------training-----------------
+python examples/task_text_classification_cola.py \
+ --model_type=$MODEL_TYPE \
+ --pretrained_model_path=$MODEL_DIR \
+ --task_name=$TASK_NAME \
+ --do_train \
+ --device_id='0' \
+ --do_lower_case \
+
--experiment_name='pgd_ver_001' \ + --do_pgd \ + --pgd_number=3 \ + --pgd_epsilon=0.01 \ + --pgd_alpha=0.2 \ + --scheduler_type=linear \ + --checkpoint_monitor=eval_mcc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.tsv \ + --eval_input_file=dev.tsv \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=4 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 \ No newline at end of file diff --git a/scripts/run_task_text_classification_cola_rdrop.sh b/scripts/run_task_text_classification_cola_rdrop.sh new file mode 100644 index 0000000..354a308 --- /dev/null +++ b/scripts/run_task_text_classification_cola_rdrop.sh @@ -0,0 +1,40 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-en +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=cola +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_classification_cola.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='rdrop_ver_001' \ + --checkpoint_mode=max \ + --scheduler_type=linear \ + --do_rdrop \ + --rdrop_weight=0.2 \ + --rdrop_start_epoch=2 \ + --checkpoint_monitor=eval_mcc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.tsv \ + --eval_input_file=dev.tsv \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=4 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + diff --git a/scripts/run_task_text_classification_wsc.sh b/scripts/run_task_text_classification_wsc.sh new file mode 100644 index 0000000..8eaabed --- /dev/null +++ b/scripts/run_task_text_classification_wsc.sh @@ -0,0 +1,41 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/macbert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=cluewsc +export MODEL_TYPE=macbert + +#-----------training----------------- +python examples/task_text_classification_wsc.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_0001' \ + --checkpoint_mode=max \ + --checkpoint_save_best \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.json \ + --eval_input_file=val.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=50 \ + --scheduler_type=linear \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + + + + diff --git a/scripts/run_task_text_match_afqmc.sh b/scripts/run_task_text_match_afqmc.sh new file mode 100644 index 0000000..e79a47f --- /dev/null +++ b/scripts/run_task_text_match_afqmc.sh @@ -0,0 +1,40 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/macbert-base-cn 
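The R-Drop script above enables --do_rdrop with --rdrop_weight=0.2 and --rdrop_start_epoch=2; the corresponding loss combination lives further down in this patch in TrainBaseBuilder.train_rdrop_forward. A minimal sketch of that weighting, with the patch's BKL loss stood in for by a symmetric KL built from torch.nn.functional:

import torch.nn.functional as F

def rdrop_combined_loss(logits_1, logits_2, ce_loss_1, ce_loss_2, rdrop_weight=0.2):
    # Two forward passes of the same batch differ because dropout is active;
    # penalize the divergence between their predictive distributions.
    log_p = F.log_softmax(logits_1, dim=-1)
    log_q = F.log_softmax(logits_2, dim=-1)
    sym_kl = 0.5 * (
        F.kl_div(log_p, log_q, reduction="batchmean", log_target=True)
        + F.kl_div(log_q, log_p, reduction="batchmean", log_target=True)
    )
    ce_weight = (1.0 - rdrop_weight) / 2.0
    return ce_weight * ce_loss_1 + ce_weight * ce_loss_2 + rdrop_weight * sym_kl

Epochs before --rdrop_start_epoch fall back to the plain forward pass and cross-entropy loss, which is why the script pairs the flag with rdrop_start_epoch=2.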
+export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=afqmc +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_match_afqmc.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.json \ + --eval_input_file=dev.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=256 \ + --eval_max_seq_length=256 \ + --per_gpu_train_batch_size=16 \ + --per_gpu_eval_batch_size=8 \ + --learning_rate=2e-5 \ + --num_train_epochs=5 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=linear \ + --weight_decay=0.01 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + + + diff --git a/scripts/run_task_text_match_cmnli.sh b/scripts/run_task_text_match_cmnli.sh new file mode 100644 index 0000000..1fedc28 --- /dev/null +++ b/scripts/run_task_text_match_cmnli.sh @@ -0,0 +1,40 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=cmnli +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_match_cmnli.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.json \ + --eval_input_file=dev.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=16 \ + --per_gpu_eval_batch_size=16 \ + --learning_rate=3e-5 \ + --num_train_epochs=5 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=linear \ + --weight_decay=0.01 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + + + diff --git a/scripts/run_task_text_match_csl.sh b/scripts/run_task_text_match_csl.sh new file mode 100644 index 0000000..1bdbba0 --- /dev/null +++ b/scripts/run_task_text_match_csl.sh @@ -0,0 +1,40 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=csl +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_match_csl.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.json \ + --eval_input_file=dev.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=256 \ + --eval_max_seq_length=256 \ + --per_gpu_train_batch_size=16 \ + --per_gpu_eval_batch_size=16 \ + --learning_rate=1e-5 \ + --num_train_epochs=5 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=linear \ + --weight_decay=0.01 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + + + diff --git a/scripts/run_task_text_match_ocnli.sh b/scripts/run_task_text_match_ocnli.sh new file mode 100644 index 
0000000..ba4aff4 --- /dev/null +++ b/scripts/run_task_text_match_ocnli.sh @@ -0,0 +1,39 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/macbert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=ocnli +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_match_ocnli.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.json \ + --eval_input_file=dev.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=128 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=5 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=linear \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + + + + diff --git a/scripts/run_task_text_similarity_ccks2021.sh b/scripts/run_task_text_similarity_ccks2021.sh new file mode 100644 index 0000000..53e0ca1 --- /dev/null +++ b/scripts/run_task_text_similarity_ccks2021.sh @@ -0,0 +1,37 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=ccks2021 +export MODEL_TYPE=bert + +python examples/task_text_similarity_ccks2021.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=ccks2021_train_seed42_fold0.json \ + --eval_input_file=ccks2021_dev_seed42_fold0.json \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=100 \ + --eval_max_seq_length=100 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=3e-5 \ + --num_train_epochs=3 \ + --gradient_accumulation_steps=1 \ + --warmup_rate=0.1 \ + --scheduler_type=linear \ + --scheduler_on=batch \ + --weight_decay=0.01 \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 + + diff --git a/scripts/run_task_text_similarity_lcqmc.sh b/scripts/run_task_text_similarity_lcqmc.sh new file mode 100644 index 0000000..e0eb22e --- /dev/null +++ b/scripts/run_task_text_similarity_lcqmc.sh @@ -0,0 +1,35 @@ +CURRENT_DIR=`pwd` +export MODEL_DIR=$CURRENT_DIR/pretrained_models/bert-base-cn +export DATA_DIR=$CURRENT_DIR/dataset +export OUTPUR_DIR=$CURRENT_DIR/outputs +export TASK_NAME=lcqmc +export MODEL_TYPE=bert + +#-----------training----------------- +python examples/task_text_similarity_lcqmc.py \ + --model_type=$MODEL_TYPE \ + --pretrained_model_path=$MODEL_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_lower_case \ + --device_id='0' \ + --experiment_name='ver_001' \ + --checkpoint_mode=max \ + --checkpoint_monitor=eval_acc \ + --data_dir=$DATA_DIR/$TASK_NAME/ \ + --train_input_file=train.txt \ + --eval_input_file=dev.txt \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --train_max_seq_length=50 \ + --eval_max_seq_length=50 \ + --per_gpu_train_batch_size=32 \ + --per_gpu_eval_batch_size=32 \ + --learning_rate=5e-5 \ + --num_train_epochs=3 \ + --gradient_accumulation_steps=1 \ + 
--warmup_rate=0.1 \ + --scheduler_type=linear \ + --scheduler_on=batch \ + --logging_steps=-1 \ + --save_steps=-1 \ + --seed=42 \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 89368af..72caf3c --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def readme(): return content def find_version(): - version_file = 'torchblocks/version.py' + version_file = 'src/torchblocks/version.py' with open(version_file, 'r') as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] @@ -26,21 +26,12 @@ def get_requirements(filename='requirements.txt'): description="A PyTorch-based toolkit for natural language processing", long_description=readme(), long_description_content_type="text/markdown", - keywords=["NLP", "Deep Learning", "Transformers", "PyTorch"], - license="MIT", + keywords=["NLP", "Deep Learning", "Transformers", "PyTorch",'Natural Language Processing'], + license="MIT License", + platforms='Linux', url="https://github.com/lonePatient/TorchBlocks", - packages=find_packages("torchblocks"), + packages=find_packages(where='src'), + package_dir={'': 'src'}, install_requires=get_requirements(), python_requires=">=3.7.0", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], ) diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..68a2e7352bef5df01b71f2e427c64f25a719e705 GIT binary patch literal 6148 zcmeHK%}T>S5T3176GZGmP;a?-E7FSMNr+Vs-h_xARBF-|8;se~q?TGMc@BLD-@_O1 zadc*PE7mG_QB-Ea>^HkRUzU6u_6Gn&rBg2h6ac_NC(PMcd|=d1K4v-7l1CKk9U-Jp z#b*UtW7*vB8x^3rn^AKXLJV)`ueTS;S`f(qGwAo&LSK1Sh7qw+kkE!2G$DWp;~HE* z@{aZJHCE6~T4t=rpcRElS}Hw>+(dqIYTB}G%bvAQy{7DXsh@U|ir+q_+PkUjfISS*t>{etat|#g}p6*ZsRSQe~{(7lYbjs^% zgQC+fudQNbV`FtNu&ssV^5)*zW$(IwGq}A|I)K1`U)h|&A>3o|iAq;qJC0<0ju}Jt zkj2OfumY^W_bXsl9&6_NhvE@f0aoCTDM0IkgHGrg%rvT{0}FKpK%`@&GK}f4NF2wY zYcSJ@J!nEx5j9nqD~8Z?Y}(9fA{nI|0s!TtN<(UuM`mZ9dD 0: + attack_start = False + if epoch >= self.start_epoch: + attack_start = True + elif self.start_step > 0: + attack_start = False + if step >= self.start_step: + attack_start = True + elif self.start_score > 0: + attack_start = False + monitor_op = self.mode_dict[self.score_mode] + if current_score is None: + attack_start = False + else: + if monitor_op(current_score, self.start_score): + attack_start = True + return attack_start diff --git a/src/torchblocks/callback/attacks/fgm.py b/src/torchblocks/callback/attacks/fgm.py new file mode 100644 index 0000000..8568914 --- /dev/null +++ b/src/torchblocks/callback/attacks/fgm.py @@ -0,0 +1,26 @@ +import torch +from .attack_base import AttackBaseBuilder + +class FGM(AttackBaseBuilder): + def __init__(self, model, attack_name, epsilon=1.0): + super(FGM, self).__init__() + self.model = model + self.attack_name = attack_name + self.epsilon = epsilon + self.backup = {} + + def attack(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and self.attack_name in name: + self.backup[name] = param.data.clone() + norm = torch.norm(param.grad) + if norm != 0: + r_at = self.epsilon * param.grad / norm + param.data.add_(r_at) + + def restore(self): + for name, param in 
self.model.named_parameters(): + if param.requires_grad and self.attack_name in name: + assert name in self.backup + param.data = self.backup[name] + self.backup = {} diff --git a/src/torchblocks/callback/attacks/pgd.py b/src/torchblocks/callback/attacks/pgd.py new file mode 100644 index 0000000..51d3388 --- /dev/null +++ b/src/torchblocks/callback/attacks/pgd.py @@ -0,0 +1,49 @@ +import torch +from .attack_base import AttackBaseBuilder + + +class PGD(AttackBaseBuilder): + def __init__(self, model, attack_name, epsilon=1., alpha=0.3): + super(PGD, self).__init__() + self.model = model + self.attack_name = attack_name + self.epsilon = epsilon + self.alpha = alpha + self.attack_backup = {} + self.grad_backup = {} + + def attack(self, is_first_attack=False): + for name, param in self.model.named_parameters(): + if param.requires_grad and self.attack_name in name: + if is_first_attack: + self.attack_backup[name] = param.data.clone() + norm = torch.norm(param.grad) + if norm != 0 and not torch.isnan(norm): + r_at = self.alpha * param.grad / norm + param.data.add_(r_at) + param.data = self.project(name, param.data, self.epsilon) + + def restore(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and self.attack_name in name: + assert name in self.attack_backup + param.data = self.attack_backup[name] + self.attack_backup = {} + + def project(self, param_name, param_data, epsilon): + r = param_data - self.attack_backup[param_name] + if torch.norm(r) > epsilon: + r = epsilon * r / torch.norm(r) + return self.attack_backup[param_name] + r + + def backup_grad(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + if 'encoder' in name or self.attack_name in name: + self.grad_backup[name] = param.grad.clone() + + def restore_grad(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + if 'encoder' in name or self.attack_name in name: + param.grad = self.grad_backup[name] diff --git a/torchblocks/callback/early_stopping.py b/src/torchblocks/callback/early_stopping.py old mode 100755 new mode 100644 similarity index 95% rename from torchblocks/callback/early_stopping.py rename to src/torchblocks/callback/early_stopping.py index 8523b68..452db97 --- a/torchblocks/callback/early_stopping.py +++ b/src/torchblocks/callback/early_stopping.py @@ -2,7 +2,7 @@ import numpy as np import logging -logger = logging.getLogger(__name__) +logger = logging.getLogger() class EarlyStopping(object): @@ -77,7 +77,7 @@ def step(self, current): if self.monitor_op(current, self.best_score): msg = ( f" Metric {self.monitor} improved from {self.best_score:.4f} to {current:.4f}" - f" New best score: {current:.3f}" + f" --- New best score ---: {current:.3f}" ) self.best_score = current self.wait_count = 0 diff --git a/src/torchblocks/callback/ema.py b/src/torchblocks/callback/ema.py new file mode 100644 index 0000000..6a429f1 --- /dev/null +++ b/src/torchblocks/callback/ema.py @@ -0,0 +1,34 @@ +class EMA: + """ Model Exponential Moving Average""" + + def __init__(self, model, decay): + self.model = model + self.decay = decay + self.shadow = {} + self.backup = {} + + def register(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + self.shadow[name] = param.data.clone() + + def update(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name] + self.shadow[name] = 
new_average.clone() + + def apply_shadow(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + self.backup[name] = param.data + param.data = self.shadow[name] + + def restore(self): + for name, param in self.model.named_parameters(): + if param.requires_grad: + assert name in self.backup + param.data = self.backup[name] + self.backup = {} \ No newline at end of file diff --git a/torchblocks/callback/file_writer.py b/src/torchblocks/callback/file_writer.py old mode 100755 new mode 100644 similarity index 86% rename from torchblocks/callback/file_writer.py rename to src/torchblocks/callback/file_writer.py index cdfc23b..81a6a06 --- a/torchblocks/callback/file_writer.py +++ b/src/torchblocks/callback/file_writer.py @@ -2,8 +2,7 @@ import matplotlib.pyplot as plt import matplotlib.ticker as mtick from collections import defaultdict -from torchblocks.utils.paths import save_json, create_dir - +from ..utils.io_utils import save_json, build_dir plt.switch_backend('agg') # 防止ssh上绘图问题 FILE_NAME = 'training_info.json' @@ -14,7 +13,7 @@ class FileWriter: def __init__(self, log_dir): self.log_dir = log_dir self.scale_dicts = defaultdict(list) - create_dir(self.log_dir) + build_dir(self.log_dir) def add_scalar(self, tag, scalar_value, global_step=None): if global_step is not None: @@ -28,9 +27,6 @@ def save(self, plot=True): if plot: self.plot() - def close(self): - pass - def plot(self): keys = list(self.scale_dicts.keys()) for key in keys: @@ -40,13 +36,14 @@ def plot(self): values = sorted(values, key=lambda x: x['step']) x = [i['step'] for i in values] - y = [i[key] for i in values] + y = [v[key] for v in values] plt.style.use("ggplot") fig = plt.figure(figsize=(15, 5), facecolor='w') ax = fig.add_subplot(111) if "eval_" in name: - y = [round(float(x), 2) for x in y] + x = [str(v) for v in x] + y = [round(float(v), 4) for v in y] ax.plot(x, y, label=name) if key == 'train_lr': # 科学计数法显示 @@ -57,3 +54,6 @@ def plot(self): plt.title(f"Training {name} [Step {x[-1]}]") plt.savefig(png_file) plt.close() + + def close(self): + pass diff --git a/torchblocks/callback/model_checkpoint.py b/src/torchblocks/callback/model_checkpoint.py old mode 100755 new mode 100644 similarity index 86% rename from torchblocks/callback/model_checkpoint.py rename to src/torchblocks/callback/model_checkpoint.py index f953c8b..f6b4737 --- a/torchblocks/callback/model_checkpoint.py +++ b/src/torchblocks/callback/model_checkpoint.py @@ -2,10 +2,10 @@ import torch import logging import numpy as np -from torchblocks.utils.paths import save_model -from torchblocks.utils.paths import json_to_text +from ..utils.ckpt_utils import save_model +from ..utils.io_utils import json_to_text -logger = logging.getLogger(__name__) +tmp_logger = logging.getLogger() CHECKPOINT_DIR_PREFIX = 'checkpoint' WEIGHTS_NAME = 'pytorch_model.bin' @@ -31,12 +31,14 @@ def __init__(self, monitor='eval_loss', verbose=True, save_best=False, + logger=None, keys_to_ignore_on_save=[] ): self.ckpt_dir = ckpt_dir self.monitor = monitor self.verbose = verbose self.save_best = save_best + self.logger = tmp_logger if logger is None else logger self.keys_to_ignore_on_save = keys_to_ignore_on_save if mode not in self.mode_dict: @@ -52,17 +54,17 @@ def init_save_dir(self): if self.save_best: self.save_ckpt_dir = os.path.join(self.ckpt_dir, f"{prefix}-best") else: - self.save_ckpt_dir = os.path.join(self.ckpt_dir, prefix + '-{:.4f}-step-{}') + self.save_ckpt_dir = os.path.join(self.ckpt_dir, prefix + 
'-{:.2f}-step-{}') def step(self, state, current=None): - if current is not None and not isinstance(current, torch.Tensor): + if current is not None and not isinstance(current, torch.Tensor): current = torch.tensor(current) state['monitor'] = self.monitor state['score'] = current state['save_dir'] = self.save_ckpt_dir global_step = state['global_step'] is_saving = False - if current is None: # evaluate_during_training = False + if current is None: # evaluate_during_training = False is_saving = True else: if not self.save_best: @@ -70,10 +72,10 @@ def step(self, state, current=None): state['save_dir'] = self.save_ckpt_dir.format(state['score'], global_step) if self.monitor_op(current, self.best_score): # best msg = ( - f" Steps {global_step}: Metric {self.monitor} improved from {self.best_score:.4f} to {state['score']:.4f}" - f". New best score: {state['score']:.4f}" + f"Steps {global_step}: Metric {self.monitor} improved from {self.best_score:.4f} to {state['score']:.4f}" + f". --- New best score ---: {state['score']:.4f}" ) - logger.info(msg) + self.logger.info(msg) self.best_score = current state['best_score'] = self.best_score is_saving = True @@ -95,7 +97,7 @@ def save_checkpoint(self, state): def _save_model(self, state): assert 'model' in state, "state['model'] does not exist." if self.verbose: - logger.info("Saving model checkpoint to %s", state['save_dir']) + self.logger.info("Saving model checkpoint to %s", state['save_dir']) model = state['model'] if hasattr(model, 'save'): model.save(state['save_dir']) diff --git a/torchblocks/callback/progressbar.py b/src/torchblocks/callback/progressbar.py old mode 100755 new mode 100644 similarity index 91% rename from torchblocks/callback/progressbar.py rename to src/torchblocks/callback/progressbar.py index ab8b986..811472c --- a/torchblocks/callback/progressbar.py +++ b/src/torchblocks/callback/progressbar.py @@ -8,7 +8,7 @@ class ProgressBar(object): Example: >>> pbar = ProgressBar(n_total=30,desc='Training') >>> step = 2 - >>> pbar(step=step,info={'loss':20}) + >>> pbar.step(step=step,info={'loss':20}) ''' def __init__(self, @@ -71,12 +71,12 @@ def epoch(self, current_epoch): self.file.write(f"Epoch: {current_epoch}/{int(self.num_epochs)}") self.file.write("\n") - def step(self, step, info={}): + def step(self, step, info=None): now = time.time() - current = step + 1 + current = step if step >0 else 1 bar = self._bar(current) show_bar = f"\r{bar}" + self._time_info(now, current) - if len(info) != 0: + if info is not None: show_bar = f'{show_bar} ' + " [" + "-".join( [f' {key}={value:4f} ' for key, value in info.items()]) + "]" if current >= self.n_total: diff --git a/src/torchblocks/callback/swa.py b/src/torchblocks/callback/swa.py new file mode 100644 index 0000000..e04ab30 --- /dev/null +++ b/src/torchblocks/callback/swa.py @@ -0,0 +1,29 @@ +import os +import copy +import torch +from ..utils.ckpt_utils import find_all_checkpoints + +def SWA(model, model_dir, swa_start=1): + """ + swa 滑动平均模型,一般在训练平稳阶段再使用 SWA + """ + model_path_list = find_all_checkpoints(model_dir) + assert 1 <= swa_start < len(model_path_list) - 1, \ + f'Using swa, swa start should smaller than {len(model_path_list) - 1} and bigger than 0' + swa_model = copy.deepcopy(model) + swa_n = 0. + with torch.no_grad(): + for _ckpt in model_path_list[swa_start:]: + print(_ckpt) + model.load_state_dict(torch.load(_ckpt, map_location=torch.device('cpu'))) + tmp_para_dict = dict(model.named_parameters()) + alpha = 1. / (swa_n + 1.) 
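The EMA callback introduced above (src/torchblocks/callback/ema.py) keeps a decayed shadow copy of every trainable weight. A minimal, self-contained sketch of the register/update/apply_shadow/restore cycle that the trainer drives later in this patch; the toy model, data, and loop here are purely illustrative, and the import assumes callback/__init__.py re-exports EMA the way train_base.py uses it:

import torch
import torch.nn as nn
from torchblocks.callback import EMA  # assumed re-export of the class added in this patch

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
ema = EMA(model, decay=0.999)
ema.register()                       # snapshot the initial weights
for _ in range(10):                  # toy training loop on random data
    x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))
    loss = nn.functional.cross_entropy(model(x), y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    ema.update()                     # shadow = decay * shadow + (1 - decay) * param
ema.apply_shadow()                   # evaluate with the averaged weights
# ... run evaluation here ...
ema.restore()                        # switch back to the raw training weights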
+ for name, para in swa_model.named_parameters(): + para.copy_(tmp_para_dict[name].data.clone() * alpha + para.data.clone() * (1. - alpha)) + swa_n += 1 + swa_model_dir = os.path.join(model_dir, f'checkpoint-swa') + if not os.path.exists(swa_model_dir): + os.mkdir(swa_model_dir) + swa_model_path = os.path.join(swa_model_dir, 'pytorch_model.bin') + torch.save(swa_model.state_dict(), swa_model_path) + return swa_model diff --git a/src/torchblocks/core/__init__.py b/src/torchblocks/core/__init__.py new file mode 100644 index 0000000..2db3ec7 --- /dev/null +++ b/src/torchblocks/core/__init__.py @@ -0,0 +1,2 @@ +from .train_base import * +from .application import Application \ No newline at end of file diff --git a/src/torchblocks/core/application.py b/src/torchblocks/core/application.py new file mode 100644 index 0000000..58a56e0 --- /dev/null +++ b/src/torchblocks/core/application.py @@ -0,0 +1,13 @@ +import torch.nn as nn + + +class Application(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, inputs): + raise NotImplementedError('Method [Application.forward] should be implemented.') + + def compute_loss(self, **kwargs): + raise NotImplementedError('Method [Application.compute_loss] should be implemented.') diff --git a/src/torchblocks/core/train_base.py b/src/torchblocks/core/train_base.py new file mode 100644 index 0000000..17f7b21 --- /dev/null +++ b/src/torchblocks/core/train_base.py @@ -0,0 +1,890 @@ +import os +import gc +import math +import copy +import warnings +import torch +import torch.nn as nn +from argparse import Namespace +from packaging import version +from torch.utils.data import Dataset, DataLoader, ConcatDataset +from torch.utils.data.sampler import RandomSampler, SequentialSampler +from ..losses.kl_divergence import BKL +from ..callback import ModelCheckpoint, EarlyStopping, ProgressBar, EMA, SWA +from ..callback.model_checkpoint import (WEIGHTS_NAME, + TRAINER_STATE_NAME, + OPTIMIZER_NAME, + SCHEDULER_NAME, + SCALER_NAME) +from ..callback.attacks import FGM, PGD, AWP +from ..callback.file_writer import FileWriter +from ..optims.adamw import AdamW +from ..optims.lr_scheduler import get_lr_scheduler +from ..utils.common_utils import (check_object_type, + convert_to_list, + has_key, + check_object_keys) +from ..utils.logger import Logger +from ..utils.meter import AverageMeter +from ..utils.import_utils import is_apex_available +from ..utils.io_utils import (to_json_string, + save_pickle, + json_to_text, + save_json, + is_file) +from ..utils.ckpt_utils import load_model +from ..utils.seed import seed_everything +from ..utils.tensor_utils import convert_tensor_list_to_dict, convert_cuda_to_cpu + +warnings.filterwarnings('ignore') +if version.parse(torch.__version__) >= version.parse("1.10"): + torch.set_warn_always(False) + +_is_native_amp_available = False +if is_apex_available(): + from apex import amp + +if version.parse(torch.__version__) >= version.parse("1.6"): + _is_native_amp_available = True + from torch.cuda.amp import autocast, GradScaler + +try: + from torch.utils.tensorboard import SummaryWriter + + _has_tensorboard = True +except ImportError: + try: + from tensorboardX import SummaryWriter + + _has_tensorboard = True + except ImportError: + _has_tensorboard = False + + +class TrainBaseBuilder: + """Base class for iterative trainer.""" + # Variable names that are not converted to GPU in batch data,For example 'input_length’ + keys_to_ignore_on_gpu = [] + # Variables that are not stored in the eval and predict process + 
keys_to_ignore_on_save_result = ['input_ids', 'token_type_ids'] + # Variables that are not stored in the checkpoint. For example 'optimizer' + keys_to_ignore_on_save_checkpoint = [] + mode_dict = {'min': torch.lt, 'max': torch.gt} + + def __init__(self, + opts, + model, + metrics, + logger, + **kwargs): + ''' + Training master function + Args: + opts: options + model + metrics + logger + **kwargs: + ''' + self.opts = opts + self.model = model + self.logger = logger + self.metrics = metrics + self.global_step = 0 + self._init_ema() + self._init_swa() + self._init_attack() + self._init_optimizer() + self._init_early_stopping() + self._init_model_checkpoint() + self.metrics = convert_to_list(self.metrics) + self.device_num = getattr(opts, 'device_num', 0) + self.device = getattr(opts, 'device', torch.device("cpu")) + self.prefix = "_".join([opts.task_name, opts.model_type, opts.experiment_name]) + self.build_log_writer() + self.build_mixed_precision() + check_object_type(object=self.model, check_type=nn.Module, name='model') + check_object_type(object=self.opts, check_type=Namespace, name='self.opts') + check_object_type(object=self.logger, check_type=Logger, name='self.logger') + check_object_type(object=self.metrics, check_type=list, name='metric') + for key, value in kwargs.items(): + setattr(self, key, value) + + def _init_ema(self): + # EMA + if self.opts.do_ema: + self.logger.info('Using EMA training.....') + self.model_ema = EMA(model=self.model.module if hasattr(self.model, 'module') else self.model, + decay=self.opts.ema_decay) + self.model_ema.register() + + def _init_swa(self): + # SWA + if self.opts.do_swa: + self.logger.info('Using SWA training.....') + self.model_swa = copy.deepcopy(self.model) + + def _init_attack(self): + # Adversarial training + msg = f"Adversarial training. FGM: {self.opts.do_fgm} PGD: {self.opts.do_pgd} AWP: {self.opts.do_awp}" + self.logger.info(msg) + self.attack_models = self.build_attack_model() + + def _init_optimizer(self): + # optimizer + self.optimizer = self.build_optimizer(self.model) + + def _init_model_checkpoint(self): + # checkpoint + self.model_checkpoint = ModelCheckpoint( + logger=self.logger, + mode=self.opts.checkpoint_mode, + ckpt_dir=self.opts.output_dir, + monitor=self.opts.checkpoint_monitor, + verbose=self.opts.checkpoint_verbose, + save_best=self.opts.checkpoint_save_best, + keys_to_ignore_on_save=self.keys_to_ignore_on_save_checkpoint + ) + + def _init_early_stopping(self): + # earlystopping + self.early_stopping = None + if self.opts.earlystopping_patience > 0: + msg = f"`EarlyStopping patience` is {self.opts.earlystopping_patience},using early stopping." 
+ self.logger.info(msg) + self.early_stopping = EarlyStopping( + mode=self.opts.earlystopping_mode, + patience=self.opts.earlystopping_patience, + monitor=self.opts.earlystopping_monitor, + save_state_path=self.opts.earlystopping_save_state_path, + load_state_path=self.opts.earlystopping_load_state_path + ) + + def build_mixed_precision(self): + # Mixed precision setup + self.use_apex = False + self.use_amp = False + self.fp16_backend = None + if self.opts.do_fp16: + if self.opts.fp16_backend == "auto": + self.fp16_backend = "amp" if _is_native_amp_available else "apex" + else: + self.fp16_backend = self.opts.fp16_backend + self.logger.info(f"Using {self.fp16_backend} fp16 backend") + if self.fp16_backend == "amp": + self.use_amp = True + self.scaler = GradScaler() + else: + if not is_apex_available(): + msg = ("Using FP16 with APEX but APEX is not installed, " + "please refer to https://www.github.com/nvidia/apex.") + raise ImportError(msg) + self.use_apex = True + + # TODO:If there are multiple Adversarial learning methods, consider the order of methods + def build_attack_model(self): + attack_models = {} + if self.opts.do_fgm: + attack_model = FGM(self.model, self.opts.fgm_name, self.opts.fgm_epsilon) + attack_models['fgm'] = attack_model + if self.opts.do_pgd: + attack_model = PGD(self.model, self.opts.pgd_name, self.opts.pgd_epsilon, self.opts.pgd_alpha) + attack_models['pgd'] = attack_model + if self.opts.do_awp: + attack_model = AWP(self.model, self.opts.awp_name, self.opts.awp_epsilon, self.opts.awp_alpha, + self.opts.awp_start_epoch, self.opts.awp_start_step, self.opts.awp_start_score, + self.opts.awp_score_mode) + attack_models['awp'] = attack_model + return attack_models + + def build_record_tracker(self, **kwargs): + self.record_tracker = {} + self.record_tracker['result'] = {} + for key, value in kwargs.items(): + if key not in self.record_tracker: + self.record_tracker[key] = value + + def build_record_meter(self, key, value=None, n=1): + if key not in self.record_tracker: + self.record_tracker[key] = AverageMeter() + if value is not None: + self.record_tracker[key].update(value, n=n) + else: + self.record_tracker[key].update(value, n=n) + + def build_log_writer(self): + # tensorboard + if _has_tensorboard and self.opts.log_writer == 'tensorboard': + msg = f'Initializing summary writer for tensorboard with log_dir={self.opts.output_dir}' + self.logger.info(msg) + exp_dir = os.path.join(self.opts.output_dir, f'{self.prefix}_tb_logs') + self.writer = SummaryWriter(log_dir=exp_dir, comment='Training logs') + self.writer.add_text("train_arguments", to_json_string(self.opts.__dict__)) + elif self.opts.log_writer == 'file': + exp_dir = os.path.join(self.opts.output_dir, f'{self.prefix}_file_logs') + self.writer = FileWriter(log_dir=exp_dir) + else: + # TODO: Add WB + pass + + def reset_metrics(self): + ''' + The `metric` class must contain the `reset` function + Returns: + ''' + for metric in self.metrics: + if not hasattr(metric, 'reset'): + msg = "module 'metric' has no attribute 'reset'" + return ValueError(msg) + metric.reset() + + def _param_optimizer(self, params, learning_rate, no_decay, weight_decay): + _params = [ + {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay, + 'lr': learning_rate}, + {'params': [p for n, p in params if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0, + 'lr': learning_rate}, + ] + return _params + + def build_model_param_optimizer(self, model): + ''' + If you need to assign different 
learning rates to different models, + In the `transformer` module,specify `base_model_name`, the default is `base_model_name=`base_model`. + For base_model use learning_rate, for the rest use other_learning_rate + ''' + no_decay = ["bias", 'LayerNorm.weight'] + optimizer_grouped_parameters = [] + if ( + hasattr(model, self.opts.base_model_name) + and self.opts.other_learning_rate > 0.0 + ): + msg = ( + f"The initial learning rate for {self.opts.base_model_name} model params : {self.opts.learning_rate} ," + f"and other model params is : {self.opts.other_learning_rate}" + ) + self.logger.info(msg) + base_model = getattr(model, self.opts.base_model_name) + base_model_param = list(base_model.named_parameters()) + base_model_param_ids = [id(p) for n, p in base_model_param] + other_model_param = [(n, p) for n, p in model.named_parameters() if + id(p) not in base_model_param_ids] + optimizer_grouped_parameters.extend( + self._param_optimizer(base_model_param, self.opts.learning_rate, no_decay, + self.opts.weight_decay)) + optimizer_grouped_parameters.extend( + self._param_optimizer(other_model_param, self.opts.other_learning_rate, no_decay, + self.opts.weight_decay)) + else: + all_model_param = list(model.named_parameters()) + optimizer_grouped_parameters.extend( + self._param_optimizer(all_model_param, self.opts.learning_rate, no_decay, self.opts.weight_decay)) + return optimizer_grouped_parameters + + def build_optimizer(self, model): + ''' + Setup the optimizer. + ''' + self.logger.info("The custom optimizer is None, using default `AdamW` optimizer") + optimizer_grouped_parameters = self.build_model_param_optimizer(model) + optimizer = AdamW(params=optimizer_grouped_parameters, + lr=self.opts.learning_rate, + eps=self.opts.adam_epsilon, + betas=(self.opts.adam_beta1, self.opts.adam_beta2), + weight_decay=self.opts.weight_decay) + return optimizer + + def build_warmup_steps(self): + """ + Get number of steps used for a linear warmup. + """ + if self.opts.warmup_rate < 0 or self.opts.warmup_rate > 1: + raise ValueError("warmup_rate must lie in range [0,1]") + elif self.opts.warmup_rate > 0 and self.opts.warmup_steps > 0: + msg = ("Both warmup_rate and warmup_steps given, " + "warmup_steps will override any effect of warmup_rate during training") + self.logger.info(msg) + warmup_steps = ( + self.opts.warmup_steps if self.opts.warmup_steps > 0 else math.ceil( + self.num_update_training_steps * self.opts.warmup_rate) + ) + return warmup_steps + + def build_lr_scheduler(self): + ''' + the learning rate scheduler. 
+ ''' + scheduler_function = get_lr_scheduler(self.opts.scheduler_type) + warmup_steps = self.build_warmup_steps() + scheduler = scheduler_function(optimizer=self.optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=self.num_update_training_steps, + num_cycles=self.opts.num_cycles) + return scheduler + + def build_train_dataloader(self, train_data): + ''' + Load train dataset + ''' + if isinstance(train_data, DataLoader): + data_loader = train_data + elif isinstance(train_data, Dataset): + batch_size = self.opts.per_gpu_train_batch_size * max(1, self.device_num) + sampler = RandomSampler(train_data) + if hasattr(train_data, 'build_train_sampler'): + sampler = train_data.build_train_sampler + collate_fn = train_data.build_data_collator + if hasattr(train_data, "build_train_collator"): + collate_fn = train_data.build_train_collator + data_loader = DataLoader(train_data, + sampler=sampler, + batch_size=batch_size, + collate_fn=collate_fn, + pin_memory=self.opts.pin_memory, + drop_last=self.opts.train_drop_last, + num_workers=self.opts.num_workers) + else: + raise TypeError("train_data type `{}` not support".format(type(train_data))) + return data_loader + + def build_eval_dataloader(self, dev_data): + ''' + Load eval dataset + ''' + if isinstance(dev_data, DataLoader): + data_loader = dev_data + elif isinstance(dev_data, Dataset): + batch_size = self.opts.per_gpu_eval_batch_size * max(1, self.device_num) + sampler = SequentialSampler(dev_data) + if hasattr(dev_data, 'build_eval_sampler'): + sampler = dev_data.build_eval_sampler + collate_fn = dev_data.build_data_collator + if hasattr(dev_data, "build_eval_collator"): + collate_fn = dev_data.build_eval_collator + data_loader = DataLoader(dev_data, + sampler=sampler, + batch_size=batch_size, + collate_fn=collate_fn, + drop_last=self.opts.eval_drop_last, + pin_memory=self.opts.pin_memory, + num_workers=self.opts.num_workers) + else: + raise TypeError("dev_data type `{}` not support".format(type(dev_data))) + return data_loader + + def build_test_dataloader(self, test_data): + ''' + Load test dataset + ''' + if isinstance(test_data, DataLoader): + data_loader = test_data + elif isinstance(test_data, Dataset): + batch_size = self.opts.per_gpu_test_batch_size * max(1, self.device_num) + sampler = SequentialSampler(test_data) + if hasattr(test_data, 'build_test_sampler'): + sampler = test_data.build_test_sampler + collate_fn = test_data.build_data_collator + if hasattr(test_data, "build_test_collator"): + collate_fn = test_data.build_test_collator + data_loader = DataLoader(test_data, + sampler=sampler, + batch_size=batch_size, + collate_fn=collate_fn, + drop_last=self.opts.test_drop_last, + pin_memory=self.opts.pin_memory, + num_workers=self.opts.num_workers) + else: + raise TypeError("test_data type `{}` not support".format(type(test_data))) + return data_loader + + def build_batch_inputs(self, batch): + ''' + Sent all model inputs to the appropriate device (GPU on CPU) + rreturn: + The inputs are in a dictionary format + keys_to_ignore_on_gpu: Variables stored in the cpu + ''' + outputs = {} + for key, value in batch.items(): + if (key not in self.keys_to_ignore_on_gpu) and value is not None: + outputs[key] = value.to(self.device) + else: + outputs[key] = value + return outputs + + def process_batch_inputs(self, batch): + ''' + dynamic processing of batches + Args: + batch: + Returns: + ''' + return batch + + def build_eval_and_save_steps(self): + if self.opts.logging_strategy == 'epoch' or self.opts.logging_steps <= 0: + 
self.opts.logging_steps = self.num_update_steps_per_epoch + if self.opts.save_steps <= 0: + self.opts.save_steps = self.num_update_steps_per_epoch + + def build_model_warp(self): + # Mixed precision training with apex (torch < 1.6) + if self.use_apex: + self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.opts.fp16_opt_level) + # Multi-gpu training (should be after apex fp16 initialization) + if self.device_num > 1: + self.model = nn.DataParallel(self.model) + + def name_to_metric(self, metric_name): + check_object_keys(object=self.record_tracker['result'], key=metric_name, msg='Metric Result') + return self.record_tracker['result'][metric_name] + + def running_scheduler_on_batch(self): + # Update learning rate schedule + if self.scheduler: + if self.opts.scheduler_on == 'batch': + if self.opts.scheduler_metric is None: + self.scheduler.step() + else: + step_metric = self.name_to_metric(self.opts.scheduler_metric) + self.scheduler.step(step_metric) + + def running_scheduler_on_epoch(self): + # Update learning rate schedule + if self.scheduler is not None: + if self.opts.scheduler_on == 'epoch': + if self.opts.scheduler_metric is None: + self.scheduler.step() + else: + step_metric = self.name_to_metric(self.opts.scheduler_metric) + self.scheduler.step(step_metric) + + def build_state_object(self, **kwargs): + ''' + save state object + ''' + states = { + 'opts': self.opts, + 'optimizer': self.optimizer, + 'global_step': self.global_step, + 'model': self.model.module if hasattr(self.model, "module") else self.model + } + if self.scheduler is not None: + states['scheduler'] = self.scheduler + if self.use_amp: + states['scaler'] = self.scaler + for key, value in kwargs.items(): + if key not in states: + states[key] = value + return states + + def resume_from_checkpoint(self, resume_path=None): + ''' + Check if continuing training from a checkpoint + ''' + if resume_path is not None: + optimizer_path = os.path.join(resume_path, OPTIMIZER_NAME) + scheduler_path = os.path.join(resume_path, SCHEDULER_NAME) + state_path = os.path.join(resume_path, TRAINER_STATE_NAME) + model_path = os.path.join(resume_path, WEIGHTS_NAME) + scaler_path = os.path.join(resume_path, SCALER_NAME) + if is_file(optimizer_path): + self.optimizer.load_state_dict(torch.load(optimizer_path, map_location=self.device)) + if is_file(scheduler_path): + self.scheduler.load_state_dict(torch.load(scheduler_path)) + if is_file(state_path): + state = torch.load(state_path) + if self.model_checkpoint and hasattr(state, 'best_score'): + self.model_checkpoint.best = state['best_score'] + del state + gc.collect() + if is_file(model_path): + if self.use_amp and is_file(scaler_path): + self.scaler.load_state_dict(torch.load(scaler_path)) + load_model(self.model, model_path, device=self.device) + + def train_rdrop_forward(self, inputs, epoch): + # rdrop training forward + rdrop_fct = BKL() + outputs = self.train_common_forward(inputs) + if epoch >= self.opts.rdrop_start_epoch: + outputs_2 = self.train_common_forward(inputs) + rdrop_loss = rdrop_fct(outputs['logits'], outputs_2['logits']) + loss_weight = (1 - self.opts.rdrop_weight) / 2 + loss = loss_weight * outputs['loss'] + loss_weight * outputs_2['loss'] + self.opts.rdrop_weight * rdrop_loss + outputs['loss'] = loss + return outputs + + def train_common_forward(self, inputs): + ''' + common training forward + ''' + self.model.train() + if self.use_amp: + with autocast(): + outputs = self.model(inputs) + else: + outputs = self.model(inputs) + 
check_object_type(object=outputs, check_type=dict, name='outputs') + if self.device_num > 1: outputs['loss'] = outputs['loss'].mean() + return outputs + + def train_forward(self, inputs, epoch): + # main training forward + if self.opts.do_rdrop: + return self.train_rdrop_forward(inputs, epoch) + else: + return self.train_common_forward(inputs) + + def train_backward(self, loss, factor): + ''' + Training backward + ''' + loss = loss * factor + if self.use_amp: + self.scaler.scale(loss).backward() + elif self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + def train_update(self): + ''' + Training update + ''' + if self.opts.max_grad_norm is not None and self.opts.max_grad_norm > 0: + if self.use_amp: + # before gradient clipping the optimizer parameters must be unscaled. + self.scaler.unscale_(self.optimizer) + torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer) if self.use_apex else self.model.parameters(), + self.opts.max_grad_norm) + if self.use_amp: + self.scaler.step(self.optimizer) + self.scaler.update() + else: + self.optimizer.step() + if self.opts.do_ema: + self.model_ema.update() + self.optimizer.zero_grad() # Reset gradients to zero + self.running_scheduler_on_batch() + + def train_attack(self, inputs, epoch, step, factor): + if len(self.attack_models) >= 1: + for key, attack_model in self.attack_models.items(): + attack_model.backup_grad() + if key == 'fgm': + attack_model.attack() + adv_outputs = self.train_forward(inputs, epoch) + self.train_backward(adv_outputs['loss'], factor=factor) + attack_model.restore() + elif key == 'pgd': + for t in range(self.opts.pgd_number): + attack_model.attack(is_first_attack=(t == 0)) + if t != self.opts.pgd_number - 1: + self.optimizer.zero_grad() + else: + attack_model.restore_grad() + adv_outputs = self.train_forward(inputs, epoch) + self.train_backward(adv_outputs['loss'], factor=factor) + attack_model.restore() + elif key == 'awp': + current_score = self.record_tracker['result'].get(self.opts.awp_score_monitor, None) + kwargs = {'epoch': epoch, 'step': step, 'score': current_score} + if attack_model.is_attack(**kwargs): + for t in range(self.opts.awp_number): + attack_model.attack() + adv_outputs = self.train_forward(inputs, epoch) + self.optimizer.zero_grad() # 清空梯度 + self.train_backward(adv_outputs['loss'], factor=factor) + attack_model.restore() + + def train_step(self, batch, epoch, step, factor): + batch_inputs = self.build_batch_inputs(batch) + batch_inputs = self.process_batch_inputs(batch_inputs) + outputs = self.train_forward(batch_inputs, epoch) + self.train_backward(outputs['loss'], factor=factor) + self.train_attack(batch_inputs, epoch, step, factor) + return outputs + + def print_training_summary(self, num_examples): + ''' + print training parameters information + ''' + options = list(self.opts.__dict__.items()) + options.append(['num_examples', num_examples]) + options.append(['total_optimization_steps', self.num_update_training_steps]) + options.append(['total_number_of_parameters', sum(p.numel() for p in self.model.parameters())]) + options = sorted(options, key=lambda x: x[0]) + msg = '\n\n' + '=' * 10 + ' Training Start ' + '=' * 10 + '\n' + for k, v in options: + msg += f' - {k}: {v}\n' + self.logger.info(msg) + + def _zero_grad(self): + self.optimizer.zero_grad() + self.model.zero_grad() + + def is_update(self, step): + should_update, should_logging, should_save = False, False, False + loss_factor = 1.0 + # Normal conditions + if 
step % self.opts.gradient_accumulation_steps == 0: + should_update = True + self.global_step += 1 + should_logging = self.global_step % self.opts.logging_steps == 0 + should_save = self.global_step % self.opts.save_steps == 0 + loss_factor = 1.0 / self.opts.gradient_accumulation_steps + # Each epoch save the last model, mainly for the gradient_accumulation_steps>1 case + elif step == self.steps_in_epoch and self.opts.gradient_accumulation_steps > 1: + should_update = True + self.global_step += 1 + loss_factor = 1.0 / self.remaind_in_epoch + should_logging, should_save = True, True + else: + pass + return should_update, should_logging, should_save, loss_factor + + # TODO distributed training + def train(self, train_data, dev_data=None, resume_path=None, state_to_save=None, train_with_add_datasets=None, + convert_output_cuda_to_cpu=True): + ''' + train function + Args: + train_data: + dev_data: + resume_path: + state_to_save: Additional Variables need to be saved, for example {'vocab':tokenizer} + train_with_add_dataset:Adding additional datasets, such as pseudo-labeled data + convert_output_cuda_to_cpu: Convert cuda storage to cpu, mainly used in eval or predict phase to avoid OOM, default is True. + Returns: + ''' + if ( + train_with_add_datasets is not None + and isinstance(train_data, Dataset) + ): + train_with_add_datasets = convert_to_list(train_with_add_datasets) + msg = ("If dataset is not None, the dataset is added to the training data. " + f"The size of data : from {len(train_data)} to {sum([len(x) for x in train_with_add_datasets])}." + ) + self.logger.info(msg) + for dset in train_with_add_datasets: + train_data = ConcatDataset([train_data, dset]) + train_data.build_data_collator = dset.build_data_collator + train_dataloader = self.build_train_dataloader(train_data) + self.steps_in_epoch = len(train_dataloader) + self.round_in_epoch = self.steps_in_epoch // self.opts.gradient_accumulation_steps + self.remaind_in_epoch = self.steps_in_epoch % self.opts.gradient_accumulation_steps + self.num_update_steps_per_epoch = max(1, + self.round_in_epoch + 1 if self.remaind_in_epoch > 0 else self.round_in_epoch) + self.num_update_training_steps = self.num_update_steps_per_epoch * self.opts.num_train_epochs + self.scheduler = self.build_lr_scheduler() + self.resume_from_checkpoint(resume_path=resume_path) + self.build_model_warp() + self.reset_metrics() + self.build_eval_and_save_steps() + self.build_record_tracker() + self.print_training_summary(len(train_data)) + self._zero_grad() + seed_everything(self.opts.seed, verbose=False) # Added here for reproductibility (even between python 2 and 3) + pbar = ProgressBar(n_total=self.num_update_steps_per_epoch, desc='Training', + num_epochs=self.opts.num_train_epochs) + for epoch in range(1, int(self.opts.num_train_epochs) + 1): + if self.opts.epoch_seed: + seed_everything(self.opts.seed + epoch) # To turn off or not, do experiment + pbar.epoch(current_epoch=epoch) + gc.collect() + for step, batch in enumerate(train_dataloader): + step += 1 + should_update, should_logging, should_save, loss_factor = self.is_update(step) + outputs = self.train_step(batch, epoch, step, loss_factor) + msg = {'loss': outputs['loss'].item(), "lr": self.optimizer.param_groups[0]['lr']} + step_round = step // self.opts.gradient_accumulation_steps + if step_round == self.round_in_epoch: + if step % self.opts.gradient_accumulation_steps > 0: + step_round = step_round + 1 + pbar.step(step=step_round, info=msg) + if should_update: + self.train_update() + 
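+ # Note on the loss scaling above (illustrative numbers, not taken from any config):
+ # is_update() returns loss_factor = 1 / gradient_accumulation_steps on the batch that
+ # completes an accumulation group, 1 / remaind_in_epoch on the trailing partial group
+ # at the end of an epoch, and 1.0 for the in-between batches. For example, assuming
+ # steps_in_epoch = 10 and gradient_accumulation_steps = 4: round_in_epoch = 2,
+ # remaind_in_epoch = 2, the optimizer steps at batches 4, 8 and 10 with loss factors
+ # 1/4, 1/4 and 1/2, and num_update_steps_per_epoch = 3.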
self.build_record_meter('train_loss_meter', outputs['loss'].item(), 1) + self.writer.add_scalar('loss/train_loss', outputs['loss'].item(), self.global_step) + if hasattr(self.scheduler, 'get_lr'): + self.writer.add_scalar('learningRate/train_lr', self.scheduler.get_lr()[0], self.global_step) + if self.global_step > 0: + if should_logging: + train_loss = self.record_tracker['train_loss_meter'].avg + self.build_record_tracker() + if dev_data is not None: + # Before each eval, you need to reset the metric + self.reset_metrics() + eval_outputs = self.evaluate(self.model, dev_data, + convert_output_cuda_to_cpu=convert_output_cuda_to_cpu) + eval_outputs = self.process_batch_outputs(eval_outputs) + self.update_metrics(eval_outputs) + if self.opts.do_ema: + self.model_ema.apply_shadow() + self.reset_metrics() + eval_ema_outputs = self.evaluate(self.model, dev_data, postfix='ema', + convert_output_cuda_to_cpu=convert_output_cuda_to_cpu) + eval_ema_outputs = self.process_batch_outputs(eval_ema_outputs) + self.update_metrics(eval_ema_outputs, postfix='ema') + self.model_ema.restore() + self.record_tracker['result']['train_loss'] = train_loss + self.print_evaluate_result() + if hasattr(self.writer, 'save'): + self.writer.save() + if should_save: + # model checkpoint + if self.model_checkpoint: + state = self.build_state_object(**state_to_save) + step_metric_score = self.name_to_metric(self.model_checkpoint.monitor) + self.model_checkpoint.step( + state=state, + current=step_metric_score + ) + self.running_scheduler_on_epoch() + # early_stopping + if self.early_stopping: + step_metric_score = self.name_to_metric(self.early_stopping.monitor) + self.early_stopping.step(current=step_metric_score) + if self.early_stopping.stop_training: + break + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if self.opts.do_swa: + self.model_swa = SWA(self.model_swa, self.opts.output_dir, swa_start=self.opts.swa_start) + self.reset_metrics() + eval_swa_outputs = self.evaluate(self.model_swa, dev_data, postfix='swa', + convert_output_cuda_to_cpu=convert_output_cuda_to_cpu) + eval_swa_outputs = self.process_batch_outputs(eval_swa_outputs) + self.update_metrics(eval_swa_outputs, postfix='swa') + if self.writer: + self.writer.close() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + + def build_postfix(self, postfix): + postfix = '' if postfix in [None, ''] else postfix + "_" + return postfix + + def evaluate(self, model, dev_data, data_type='eval', save_dir=None, save_result=False, file_name=None, + postfix=None, + convert_output_cuda_to_cpu=True): + batches = [] + postfix = self.build_postfix(postfix) + eval_dataloader = self.build_eval_dataloader(dev_data) + pbar = ProgressBar(n_total=len(eval_dataloader), desc='Evaluating') + for step, batch in enumerate(eval_dataloader): + batch = self.predict_forward(model, batch, convert_output_cuda_to_cpu=convert_output_cuda_to_cpu) + if batch.get("loss", None): + self.build_record_meter(f'eval_{postfix}loss_meter', batch['loss'], 1) + batches.append(batch) + pbar.step(step + 1) + # 将list形式转化为dict形式 + predict_outputs = convert_tensor_list_to_dict(batches) + if save_result: + self.save_predict_result(predict_outputs, file_name, postfix, data_type, save_dir) + if has_key(self.record_tracker, f'eval_{postfix}loss_meter'): + self.record_tracker['result'][f'eval_{postfix}loss'] = self.record_tracker[f'eval_{postfix}loss_meter'].avg + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return predict_outputs + + def predict(self, model, 
test_data, save_result=True, file_name=None, save_dir=None, postfix=None, data_type='test', + convert_output_cuda_to_cpu=True): + + batches = [] + test_dataloader = self.build_test_dataloader(test_data) + pbar = ProgressBar(n_total=len(test_dataloader), desc='Predicting') + for step, batch in enumerate(test_dataloader): + batch = self.predict_forward(model, batch, convert_output_cuda_to_cpu=convert_output_cuda_to_cpu) + batches.append(batch) + pbar.step(step + 1) + predict_outputs = convert_tensor_list_to_dict(batches) + if save_result: + postfix = self.build_postfix(postfix) + self.save_predict_result(predict_outputs, file_name, postfix, data_type, save_dir) + return predict_outputs + + def predict_forward(self, model, batch, convert_output_cuda_to_cpu=True): + batch_inputs = self.build_batch_inputs(batch) + model.eval() + with torch.no_grad(): + batch_outputs = model(batch_inputs) + if batch_outputs.get('loss', None): + batch_outputs['loss'] = batch_outputs['loss'].mean().detach().item() + if convert_output_cuda_to_cpu: + batch_outputs = convert_cuda_to_cpu(batch_outputs) + batch_inputs = batch + batch_outputs = {key: value for key, value in dict(batch_inputs, **batch_outputs).items() if + key not in self.keys_to_ignore_on_save_result} + return batch_outputs + + def print_evaluate_result(self): + ''' + 打印evaluate的结果 + ''' + if len(self.record_tracker['result']) == 0: + self.logger.warning(f"Evaluating results of {self.opts.task_name} are empty") + if self.opts.logging_steps < self.num_update_steps_per_epoch: print(" ") + msg = f"Result: | {self.global_step}/{self.num_update_training_steps} steps " + for key, value in self.record_tracker['result'].items(): + if isinstance(value, (int, float)): + name = "_".join(key.split("_")[1:]) if "_" in key else key + self.writer.add_scalar(f"{name}/{key}", value, int(self.global_step)) + value = str(round(value, 5)) + msg += f"| {key}: {value} " + self.logger.info(msg) + + def update_metrics(self, outputs, postfix=None): + postfix = self.build_postfix(postfix) + for metric in self.metrics: + metric.update(preds=outputs['preds'], target=outputs['target']) + value = metric.value() + if isinstance(value, float): + self.record_tracker['result'][f'eval_{postfix}{metric.name()}'] = value + elif isinstance(value, dict): + self.record_tracker['result'].update({f"eval_{postfix}{k}": v for k, v in value.items()}) + elif value is None: + self.logger.info(f"The value of {metric.name()} is None") + else: + msg = "The type of metric value: expected one of (float, dict,None)" + raise ValueError(msg) + + def save_predict_result(self, data, file_name, postfix, data_type, save_dir=None): + ''' + 保存预测信息 + ''' + if save_dir is not None: + if not os.path.isdir(save_dir): + save_dir = os.path.join(self.opts.output_dir, save_dir) + else: + save_dir = self.opts.output_dir + if file_name is None: + file_name = self.prefix + postfix + data_type + "_results.pkl" + file_path = os.path.join(save_dir, file_name) + if ".pkl" in file_path: + save_pickle(file_path=file_path, data=data) + elif ".json" in file_path: + if isinstance(data, list): + json_to_text(file_path=file_path, data=data) + elif isinstance(data, dict): + save_json(data=data, file_path=file_path) + else: + pass + else: + raise ValueError("file type: expected one of (.pkl, .json)") + + def process_batch_outputs(self, *args, **kwargs): + ''' + 对eval或者predict结果进行处理,适配metric计算 + Args: + *args: + **kwargs: + Returns: + ''' + raise NotImplementedError('Method [TrainBaseBuilder.process_batch_outputs] should be 
implemented.') diff --git a/torchblocks/data/Vocabulary.py b/src/torchblocks/data/Vocabulary.py old mode 100755 new mode 100644 similarity index 96% rename from torchblocks/data/Vocabulary.py rename to src/torchblocks/data/Vocabulary.py index 980bf2c..6412780 --- a/torchblocks/data/Vocabulary.py +++ b/src/torchblocks/data/Vocabulary.py @@ -2,7 +2,7 @@ import logging from collections import Counter, OrderedDict -logger = logging.getLogger(__name__) +logger = logging.getLogger() VOCAB_NAME = "vocab.txt" diff --git a/src/torchblocks/data/__init__.py b/src/torchblocks/data/__init__.py new file mode 100644 index 0000000..6b88e94 --- /dev/null +++ b/src/torchblocks/data/__init__.py @@ -0,0 +1,3 @@ +from .embedding import * +from .Vocabulary import * +from .dataset_builder import * \ No newline at end of file diff --git a/src/torchblocks/data/dataset_builder.py b/src/torchblocks/data/dataset_builder.py new file mode 100644 index 0000000..93bfca9 --- /dev/null +++ b/src/torchblocks/data/dataset_builder.py @@ -0,0 +1,129 @@ +import os +import torch +import logging +from tqdm import tqdm +from torch.utils.data import Dataset +from ..utils.io_utils import check_file, is_file +from ..utils.common_utils import convert_to_list + +logger = logging.getLogger() + + +class DatasetBaseBuilder(Dataset): + # Keys that the collate_fn leaves untouched and merges directly as Python lists + keys_to_ignore_on_collate_batch = [] + # Keys that need to be truncated to the longest length in the batch during dynamic batching + keys_to_dynamical_truncate_on_padding_batch = ['input_ids', 'attention_mask', 'token_type_ids'] + + def __init__(self, opts, file_name, data_type, process_piplines, cached_feature_file=None): + super().__init__() + self.data_type = data_type + self.data_dir = opts.data_dir + self.max_examples = opts.max_examples + self.use_data_cache = opts.use_data_cache + self.dynamical_padding = opts.dynamical_padding + self.cached_feature_file = cached_feature_file + self.process_piplines = convert_to_list(process_piplines) + if not is_file(file_name): + file_name = os.path.join(self.data_dir, file_name) + check_file(file_name) + self.examples = self.build_examples(self.read_data(file_name), data_type) + if self.max_examples is not None: + logger.info(f'[Debug]: use {self.max_examples} examples. 
') + self.examples = self.examples[: self.max_examples] + if self.use_data_cache: + self.build_feature_cache(opts) + + def build_feature_cache(self, opts): + if not is_file(self.cached_feature_file): + if self.cached_feature_file is None: + prefix = f'{opts.task_name}_{opts.model_type}_{self.data_type}_{opts.experiment_name}' + self.cached_feature_file = prefix + '_feature.cache' + self.cached_feature_file = os.path.join(self.data_dir, self.cached_feature_file) + if not opts.overwrite_data_cache: + logger.info(f"Loading features from cached file: {self.cached_feature_file}") + self.features = torch.load(self.cached_feature_file) + else: + logger.info(f"Creating features from dataset file: {self.data_dir}") + self.features = [ + self.process_example(example) for example in + tqdm(self.examples, total=len(self.examples), desc="Converting examples to features......")] + logger.info(f"Saving features to cached file: {self.cached_feature_file}") + torch.save(self.features, self.cached_feature_file) + + def process_example(self, example): + for proc in self.process_piplines: + if proc is None: continue + example = proc(example) + return example + + def process_collator(self, batch, max_input_length): + # 动态padding + if self.dynamical_padding: + for k in self.keys_to_dynamical_truncate_on_padding_batch: + if k not in batch: continue + if batch[k].dim() >= 2: batch[k] = batch[k][:, : max_input_length] + return batch + + def build_data_collator(self, features): + batch = {} + first = features[0] + max_input_length = first['input_ids'].size(0) + if self.dynamical_padding: + max_input_length = max([torch.sum(f["attention_mask"]) for f in features]) + if "label" in first and first["label"] is not None: + label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"] + dtype = torch.long if isinstance(label, int) else torch.float + batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], torch.Tensor): + batch["labels"] = torch.stack([f["label_ids"] for f in features]) + else: + dtype = torch.long if type(first["label_ids"][0]) is int else torch.float + batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. 
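+ # How the remaining keys are collated (a summary of the loop below, with hypothetical shapes):
+ # keys listed in keys_to_ignore_on_collate_batch are kept as plain Python lists, tensor values
+ # are stacked with torch.stack, and everything else is wrapped in torch.tensor. process_collator()
+ # then trims the keys in keys_to_dynamical_truncate_on_padding_batch to max_input_length, e.g.
+ # two features padded to length 128 with attention_mask sums of 17 and 23 collate to input_ids
+ # of shape [2, 23] when dynamical_padding is enabled.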
+ for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None: + if k in self.keys_to_ignore_on_collate_batch: + bv = [f[k] for f in features] + else: + bv = torch.stack([f[k] for f in features]) if isinstance(v, torch.Tensor) else torch.tensor( + [f[k] for f in features]) + batch[k] = bv + batch = self.process_collator(batch, max_input_length) + return batch + + def __getitem__(self, index): + if self.use_data_cache: + feature = self.features[index] + else: + feature = self.process_example(self.examples[index]) + return feature + + def __len__(self): + return len(self.examples) + + @classmethod + def label2id(cls): + labels = cls.get_labels() + if not isinstance(labels, dict): + return {label: i for i, label in enumerate(labels)} + return labels + + @classmethod + def id2label(cls): + labels = cls.get_labels() + if isinstance(labels, dict): + return {value: key for key, value in labels.items()} + return {i: label for i, label in enumerate(labels)} + + @staticmethod + def get_labels(): + raise NotImplementedError('Method [DatasetBaseBuilder.get_labels] should be implemented.') + + def read_data(self, input_file): + raise NotImplementedError('Method [DatasetBaseBuilder.read_data] should be implemented.') + + def build_examples(self, data, data_type): + raise NotImplementedError('Method [DatasetBaseBuilder.build_examples] should be implemented.') diff --git a/torchblocks/data/embedding.py b/src/torchblocks/data/embedding.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/data/embedding.py rename to src/torchblocks/data/embedding.py diff --git a/src/torchblocks/data/ngram.py b/src/torchblocks/data/ngram.py new file mode 100644 index 0000000..2c015c8 --- /dev/null +++ b/src/torchblocks/data/ngram.py @@ -0,0 +1,12 @@ +def build_ngrams(input, minn=3, maxn=3, start='<', end='>'): + input = start + input + end + len_ = len(input) + ngrams = [] + for ngram in reversed(range(minn, maxn + 1)): + for i in range(0, len_ - ngram + 1): + ngrams.append(input[i:i + ngram]) + return ngrams + +if __name__ == "__main__": + input = '人工智能大赛' + print(build_ngrams(input)) \ No newline at end of file diff --git a/src/torchblocks/data/samplers.py b/src/torchblocks/data/samplers.py new file mode 100644 index 0000000..247ae87 --- /dev/null +++ b/src/torchblocks/data/samplers.py @@ -0,0 +1,93 @@ +import math +from torch.utils.data import BatchSampler, SubsetRandomSampler, Sampler + + +class SortedSampler(Sampler): + """ Samples elements sequentially, always in the same order. + + Args: + data (iterable): Iterable data. + sort_key (callable): Specifies a function of one argument that is used to extract a + numerical comparison key from each list element. + + Example: + >>> list(SortedSampler(range(10), sort_key=lambda i: -i)) + [9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + """ + + def __init__(self, data, sort_key): + super().__init__(data) + self.data = data + self.sort_key = sort_key + zip_ = [(i, self.sort_key(row)) for i, row in enumerate(self.data)] + zip_ = sorted(zip_, key=lambda r: r[1]) + self.sorted_indexes = [item[0] for item in zip_] + + def __iter__(self): + return iter(self.sorted_indexes) + + def __len__(self): + return len(self.data) + + +class BucketBatchSampler(BatchSampler): + """ `BucketBatchSampler` toggles between `sampler` batches and sorted batches. + + Typically, the `sampler` will be a `RandomSampler` allowing the user to toggle between + random batches and sorted batches. A larger `bucket_size_multiplier` is more sorted and vice + versa. 
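+ (In this implementation each bucket of ``batch_size * bucket_size_multiplier`` indices is
+ sorted with ``sort_key`` and split into mini-batches whose order is then shuffled, so
+ examples of similar length tend to land in the same batch.)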
+ + Background: + ``BucketBatchSampler`` is similar to a ``BucketIterator`` found in popular libraries like + ``AllenNLP`` and ``torchtext``. A ``BucketIterator`` pools together examples with a similar + size length to reduce the padding required for each batch while maintaining some noise + through bucketing. + + **AllenNLP Implementation:** + https://github.com/allenai/allennlp/blob/master/allennlp/data/iterators/bucket_iterator.py + + **torchtext Implementation:** + https://github.com/pytorch/text/blob/master/torchtext/data/iterator.py#L225 + + Args: + sampler (torch.data.utils.sampler.Sampler): + batch_size (int): Size of mini-batch. + drop_last (bool): If `True` the sampler will drop the last batch if its size would be less + than `batch_size`. + sort_key (callable, optional): Callable to specify a comparison key for sorting. + bucket_size_multiplier (int, optional): Buckets are of size + `batch_size * bucket_size_multiplier`. + example: + train_sampler = RandomSampler(train_dataset) + bucket_sampler = BucketBatchSampler(train_sampler, batch_size=config['batch_size'], + drop_last=False, sort_key=lambda x: len(train_dataset[x][0]), # 以 input_id 长度作为排序的指标 + bucket_size_multiplier=config['bucket_multiplier']) + train_dataloader = DataLoader(dataset=train_dataset, batch_sampler=bucket_sampler, + num_workers=4, collate_fn=collate_fn) + """ + + def __init__(self, + sampler, + batch_size, + drop_last, + sort_key, + bucket_size_multiplier=100): + super().__init__(sampler, batch_size, drop_last) + self.sort_key = sort_key + self.bucket_sampler = BatchSampler(sampler, + min(batch_size * bucket_size_multiplier, len(sampler)), + False) + + def __iter__(self): + for bucket in self.bucket_sampler: + sorted_sampler = SortedSampler(bucket, self.sort_key) + for batch in SubsetRandomSampler( + list(BatchSampler(sorted_sampler, self.batch_size, self.drop_last))): + yield [bucket[i] for i in batch] + + def __len__(self): + if self.drop_last: + return len(self.sampler) // self.batch_size + else: + return math.ceil(len(self.sampler) / self.batch_size) diff --git a/src/torchblocks/data/splits/__init__.py b/src/torchblocks/data/splits/__init__.py new file mode 100644 index 0000000..4015d04 --- /dev/null +++ b/src/torchblocks/data/splits/__init__.py @@ -0,0 +1,2 @@ +from .ml_stratifiers import * +from .seq_splits import * \ No newline at end of file diff --git a/src/torchblocks/data/splits/ml_stratifiers.py b/src/torchblocks/data/splits/ml_stratifiers.py new file mode 100644 index 0000000..6492ddd --- /dev/null +++ b/src/torchblocks/data/splits/ml_stratifiers.py @@ -0,0 +1,369 @@ +# Author: Trent J. Bradberry +# License: BSD 3 clause + +import numpy as np + +from sklearn.utils import check_random_state +from sklearn.utils.validation import _num_samples, check_array +from sklearn.utils.multiclass import type_of_target + +from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \ + BaseShuffleSplit, _validate_shuffle_split + + +def IterativeStratification(labels, r, random_state): + """This function implements the Iterative Stratification algorithm described + in the following paper: + Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of + Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. + (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD + 2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin, + Heidelberg. 
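+
+ As used by the callers in this module: ``labels`` is a binary indicator matrix of shape
+ (n_samples, n_labels); ``r`` holds one desired proportion per fold (the callers pass values
+ that sum to 1); ``random_state`` is a numpy RandomState instance used to break ties. The
+ function returns an array of length n_samples giving the fold index assigned to each sample.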
+ """ + + n_samples = labels.shape[0] + test_folds = np.zeros(n_samples, dtype=int) + + # Calculate the desired number of examples at each subset + c_folds = r * n_samples + + # Calculate the desired number of examples of each label at each subset + c_folds_labels = np.outer(r, labels.sum(axis=0)) + + labels_not_processed_mask = np.ones(n_samples, dtype=bool) + + while np.any(labels_not_processed_mask): + # Find the label with the fewest (but at least one) remaining examples, + # breaking ties randomly + num_labels = labels[labels_not_processed_mask].sum(axis=0) + + # Handle case where only all-zero labels are left by distributing + # across all folds as evenly as possible (not in original algorithm but + # mentioned in the text). (By handling this case separately, some + # code redundancy is introduced; however, this approach allows for + # decreased execution time when there are a relatively large number + # of all-zero labels.) + if num_labels.sum() == 0: + sample_idxs = np.where(labels_not_processed_mask)[0] + + for sample_idx in sample_idxs: + fold_idx = np.where(c_folds == c_folds.max())[0] + + if fold_idx.shape[0] > 1: + fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])] + + test_folds[sample_idx] = fold_idx + c_folds[fold_idx] -= 1 + + break + + label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0] + if label_idx.shape[0] > 1: + label_idx = label_idx[random_state.choice(label_idx.shape[0])] + + sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0] + + for sample_idx in sample_idxs: + # Find the subset(s) with the largest number of desired examples + # for this label, breaking ties by considering the largest number + # of desired examples, breaking further ties randomly + label_folds = c_folds_labels[:, label_idx] + fold_idx = np.where(label_folds == label_folds.max())[0] + + if fold_idx.shape[0] > 1: + temp_fold_idx = np.where(c_folds[fold_idx] == + c_folds[fold_idx].max())[0] + fold_idx = fold_idx[temp_fold_idx] + + if temp_fold_idx.shape[0] > 1: + fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])] + + test_folds[sample_idx] = fold_idx + labels_not_processed_mask[sample_idx] = False + + # Update desired number of examples + c_folds_labels[fold_idx, labels[sample_idx]] -= 1 + c_folds[fold_idx] -= 1 + + return test_folds + + +class MultilabelStratifiedKFold(_BaseKFold): + """Multilabel stratified K-Folds cross-validator + Provides train/test indices to split multilabel data into train/test sets. + This cross-validation object is a variation of KFold that returns + stratified folds for multilabel data. The folds are made by preserving + the percentage of samples for each label. + Parameters + ---------- + n_splits : int, default=3 + Number of folds. Must be at least 2. + shuffle : boolean, optional + Whether to shuffle each stratification of the data before splitting + into batches. + random_state : int, RandomState instance or None, optional, default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Unlike StratifiedKFold that only uses random_state + when ``shuffle`` == True, this multilabel implementation + always uses the random_state since the iterative stratification + algorithm breaks ties randomly. 
+ Examples + -------- + >>> from from src.torchblocks import MultilabelStratifiedKFold + >>> import numpy as np + >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) + >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) + >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0) + >>> mskf.get_n_splits(X, y) + 2 + >>> print(mskf) # doctest: +NORMALIZE_WHITESPACE + MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False) + >>> for train_index, test_index in mskf.split(X, y): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [0 3 4 6] TEST: [1 2 5 7] + TRAIN: [1 2 5 7] TEST: [0 3 4 6] + Notes + ----- + Train and test sizes may be slightly different in each fold. + See also + -------- + RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold + n times. + """ + + def __init__(self, n_splits=3, *, shuffle=False, random_state=None): + super(MultilabelStratifiedKFold, self).__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _make_test_folds(self, X, y): + y = np.asarray(y, dtype=bool) + type_of_target_y = type_of_target(y) + + if type_of_target_y != 'multilabel-indicator': + raise ValueError( + 'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(type_of_target_y)) + + num_samples = y.shape[0] + + rng = check_random_state(self.random_state) + indices = np.arange(num_samples) + + if self.shuffle: + rng.shuffle(indices) + y = y[indices] + + r = np.asarray([1 / self.n_splits] * self.n_splits) + + test_folds = IterativeStratification(labels=y, r=r, random_state=rng) + + return test_folds[np.argsort(indices)] + + def _iter_test_masks(self, X=None, y=None, groups=None): + test_folds = self._make_test_folds(X, y) + for i in range(self.n_splits): + yield test_folds == i + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + y : array-like, shape (n_samples, n_labels) + The target variable for supervised learning problems. + Multilabel stratification is done based on the y labels. + groups : object + Always ignored, exists for compatibility. + Returns + ------- + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. + """ + y = check_array(y, ensure_2d=False, dtype=None) + return super(MultilabelStratifiedKFold, self).split(X, y, groups) + + +class RepeatedMultilabelStratifiedKFold(_RepeatedSplits): + """Repeated Multilabel Stratified K-Fold cross validator. + Repeats Mulilabel Stratified K-Fold n times with different randomization + in each repetition. + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. 
+ random_state : None, int or RandomState, default=None + Random state to be used to generate random state for each + repetition as well as randomly breaking ties within the iterative + stratification algorithm. + Examples + -------- + >>> from from src.torchblocks import RepeatedMultilabelStratifiedKFold + >>> import numpy as np + >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) + >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) + >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2, + ... random_state=0) + >>> for train_index, test_index in rmskf.split(X, y): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... + TRAIN: [0 3 4 6] TEST: [1 2 5 7] + TRAIN: [1 2 5 7] TEST: [0 3 4 6] + TRAIN: [0 1 4 5] TEST: [2 3 6 7] + TRAIN: [2 3 6 7] TEST: [0 1 4 5] + See also + -------- + RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold + n times. + """ + + def __init__(self, n_splits=5, *, n_repeats=10, random_state=None): + super(RepeatedMultilabelStratifiedKFold, self).__init__( + MultilabelStratifiedKFold, n_repeats=n_repeats, random_state=random_state, + n_splits=n_splits) + + +class MultilabelStratifiedShuffleSplit(BaseShuffleSplit): + """Multilabel Stratified ShuffleSplit cross-validator + Provides train/test indices to split data into train/test sets. + This cross-validation object is a merge of MultilabelStratifiedKFold and + ShuffleSplit, which returns stratified randomized folds for multilabel + data. The folds are made by preserving the percentage of each label. + Note: like the ShuffleSplit strategy, multilabel stratified random splits + do not guarantee that all folds will be different, although this is + still very likely for sizeable dataset. + Parameters + ---------- + n_splits : int, default 10 + Number of re-shuffling & splitting iterations. + test_size : float, int, None, optional + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. By default, the value is set to 0.1. + The default will change in version 0.21. It will remain 0.1 only + if ``train_size`` is unspecified, otherwise it will complement + the specified ``train_size``. + train_size : float, int, or None, default is None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Unlike StratifiedShuffleSplit that only uses + random_state when ``shuffle`` == True, this multilabel implementation + always uses the random_state since the iterative stratification + algorithm breaks ties randomly. 
+ Examples + -------- + >>> from src.torchblocks import MultilabelStratifiedShuffleSplit + >>> import numpy as np + >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]]) + >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]]) + >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5, + ... random_state=0) + >>> msss.get_n_splits(X, y) + 3 + >>> print(mss) # doctest: +ELLIPSIS + MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5, + train_size=None) + >>> for train_index, test_index in msss.split(X, y): + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [1 2 5 7] TEST: [0 3 4 6] + TRAIN: [2 3 6 7] TEST: [0 1 4 5] + TRAIN: [1 2 5 6] TEST: [0 3 4 7] + Notes + ----- + Train and test sizes may be slightly different from desired due to the + preference of stratification over perfectly sized folds. + """ + + def __init__(self, n_splits=10, *, test_size="default", train_size=None, + random_state=None): + super(MultilabelStratifiedShuffleSplit, self).__init__( + n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state) + + def _iter_indices(self, X, y, groups=None): + n_samples = _num_samples(X) + y = check_array(y, ensure_2d=False, dtype=None) + y = np.asarray(y, dtype=bool) + type_of_target_y = type_of_target(y) + + if type_of_target_y != 'multilabel-indicator': + raise ValueError( + 'Supported target type is: multilabel-indicator. Got {!r} instead.'.format( + type_of_target_y)) + + n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, + self.train_size) + + n_samples = y.shape[0] + rng = check_random_state(self.random_state) + y_orig = y.copy() + + r = np.array([n_train, n_test]) / (n_train + n_test) + + for _ in range(self.n_splits): + indices = np.arange(n_samples) + rng.shuffle(indices) + y = y_orig[indices] + + test_folds = IterativeStratification(labels=y, r=r, random_state=rng) + + test_idx = test_folds[np.argsort(indices)] == 1 + test = np.where(test_idx)[0] + train = np.where(~test_idx)[0] + + yield train, test + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where n_samples is the number of samples + and n_features is the number of features. + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + y : array-like, shape (n_samples, n_labels) + The target variable for supervised learning problems. + Multilabel stratification is done based on the y labels. + groups : object + Always ignored, exists for compatibility. + Returns + ------- + train : ndarray + The training set indices for that split. + test : ndarray + The testing set indices for that split. + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. 
+ """ + y = check_array(y, ensure_2d=False, dtype=None) + return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups) diff --git a/src/torchblocks/data/splits/seq_splits.py b/src/torchblocks/data/splits/seq_splits.py new file mode 100644 index 0000000..ac81bde --- /dev/null +++ b/src/torchblocks/data/splits/seq_splits.py @@ -0,0 +1,74 @@ +from collections import Counter +from .ml_stratifiers import MultilabelStratifiedKFold + + +def split_ner_stratified_kfold(entities_list, num_folds, shuffle=True, random_state=42): + """Split to the NER entity-level stratified k-folds. + Args: + entities_list: The list of list of entities. + num_folds: The number of folds to split. + fold_index: The index of current fold. + Returns: + A tuple of indices for train and validation. + """ + # Collect the entity types and sort them for deterministics. + entity_types = sorted({y for x in entities_list for y, *_ in x}) + # Count the entity appearances and transform to vectors for the multilabel k-fold. + entity_counts = [Counter(y for y, *_ in x) for x in entities_list] + entity_labels = [[cnt[x] for x in entity_types] for cnt in entity_counts] + kfold = MultilabelStratifiedKFold(num_folds, shuffle=shuffle, random_state=random_state) + return list(kfold.split(entities_list, entity_labels)) + + +if __name__ == "__main__": + sentences = [{'id': '0', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + }, + {'id': '1', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + }, + {'id': '2', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + }, + {'id': '3', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + }, + {'id': '4', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + } + ] + entities_list = [x['entities'] for x in sentences] + data_indices = split_ner_stratified_kfold(entities_list, num_folds=5) + ''' + output: + [(array([0, 1, 2, 3]), array([4])), + (array([0, 1, 2, 4]), array([3])), + (array([1, 2, 3, 4]), array([0])), + (array([0, 1, 3, 4]), array([2])), + (array([0, 2, 3, 4]), array([1]))] + ''' diff --git a/src/torchblocks/data/token_text_mapping.py b/src/torchblocks/data/token_text_mapping.py new file mode 100644 index 0000000..ed3ecb0 --- /dev/null +++ b/src/torchblocks/data/token_text_mapping.py @@ -0,0 +1,88 @@ +import unicodedata + + +class TokenTextMapping: + def __init__(self): + pass + + def _is_control(self, ch): + """控制类字符判断 + """ + return unicodedata.category(ch) in ('Cc', 'Cf') + + def lowercase_and_normalize(self, text): + """转小写,并进行简单的标准化 + """ + text = text.lower() + text = unicodedata.normalize('NFD', text) + text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn']) + return text + + def stem(self, token): + """获取token的“词干”(如果是##开头,则自动去掉##) + """ + if token[:2] == '##': + return token[2:] + else: + 
return token + + def _is_special(self, ch): + """判断是不是有特殊含义的符号 + """ + special = ['[CLS]', '[SEP]', '[PAD]'] + # special = ['[CLS]', '[SEP]', '[PAD]', '[UNK]'] + if ch in special: + return True + else: + return False + + def __call__(self, text, tokens): + """给出原始的text和tokenize后的tokens的映射关系""" + text = text.lower() + normalized_text, char_mapping = '', [] + for i, ch in enumerate(text): + ch = self.lowercase_and_normalize(ch) + ch = ''.join([ + c for c in ch + if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c)) + ]) + normalized_text += ch + char_mapping.extend([i] * len(ch)) + text, token_mapping, offset = normalized_text, [], 0 + for token in tokens: + if self._is_special(token): + token_mapping.append([]) + elif token == '[unused1]' or token == '[UNK]': + start = offset + end = offset + 1 + token_mapping.append(char_mapping[start:end]) + offset = end + else: + token = self.stem(token) + start = text[offset:].index(token) + offset + end = start + len(token) + token_mapping.append(char_mapping[start:end]) + offset = end + return token_mapping + + +if __name__ == "__main__": + tokens = ['[CLS]', '大', '于', 'book', '##es', '岁', '的', '与', '临', '时', '居', '住', '[SEP]'] + text = '大于bookes岁的与临时居住' + TokenTextMapping()(text, tokens) + ''' + result: + [[], + [0], + [1], + [2, 3, 4, 5], + [6, 7], + [8], + [9], + [10], + [11], + [12], + [13], + [14], + []] + ''' diff --git a/torchblocks/losses/__init__.py b/src/torchblocks/losses/__init__.py old mode 100755 new mode 100644 similarity index 75% rename from torchblocks/losses/__init__.py rename to src/torchblocks/losses/__init__.py index 1737a1f..0bf6aa4 --- a/torchblocks/losses/__init__.py +++ b/src/torchblocks/losses/__init__.py @@ -5,5 +5,5 @@ from .label_smoothing import LabelSmoothingCE from .hard_mining import HardMining from .cross_entropy import MultiLabelCategoricalCrossEntropy -from .cross_entropy import SoftTargetCrossEntropy -from .span_loss import SpanLoss \ No newline at end of file +from .span_loss import SpanLoss +from .poly_loss import Poly1FocalLoss,Poly1CrossEntropyLoss \ No newline at end of file diff --git a/torchblocks/losses/aslsinglelabel_loss.py b/src/torchblocks/losses/aslsinglelabel_loss.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/aslsinglelabel_loss.py rename to src/torchblocks/losses/aslsinglelabel_loss.py diff --git a/torchblocks/losses/asymmetric_loss.py b/src/torchblocks/losses/asymmetric_loss.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/asymmetric_loss.py rename to src/torchblocks/losses/asymmetric_loss.py diff --git a/torchblocks/losses/cross_entropy.py b/src/torchblocks/losses/cross_entropy.py old mode 100755 new mode 100644 similarity index 56% rename from torchblocks/losses/cross_entropy.py rename to src/torchblocks/losses/cross_entropy.py index 1a0b76f..2a8de28 --- a/torchblocks/losses/cross_entropy.py +++ b/src/torchblocks/losses/cross_entropy.py @@ -3,20 +3,28 @@ import torch.nn.functional as F -class SoftTargetCrossEntropy(nn.Module): +class SoftCrossEntropyLoss(nn.Module): + def __init__(self, weights=None, ignore_index=-100): + super(SoftCrossEntropyLoss, self).__init__() + self.weights = weights + self.ignore_index = ignore_index - def __init__(self): - super(SoftTargetCrossEntropy, self).__init__() - - def forward(self, preds, target): - loss = torch.sum(-target * F.log_softmax(preds, dim=-1), dim=-1) - return loss.mean() + def forward(self, input, target): + if self.weights is None: + self.weights = 
torch.ones(input.shape[-1]) + self.weights = self.weights.to(input.device) + mask = (target != self.ignore_index).any(axis=-1) + p = F.log_softmax(input[mask], -1, dtype=input.dtype) + w_labels = self.weights * target[mask] + loss = -(w_labels * p).sum() / (w_labels).sum() + return loss class MultiLabelCategoricalCrossEntropy(nn.Module): """ https://kexue.fm/archives/7359 """ + def __init__(self): super(MultiLabelCategoricalCrossEntropy, self).__init__() diff --git a/torchblocks/losses/focal_loss.py b/src/torchblocks/losses/focal_loss.py old mode 100755 new mode 100644 similarity index 88% rename from torchblocks/losses/focal_loss.py rename to src/torchblocks/losses/focal_loss.py index a36738d..f00374b --- a/torchblocks/losses/focal_loss.py +++ b/src/torchblocks/losses/focal_loss.py @@ -50,17 +50,13 @@ def forward(self, preds, target): loss = loss.mean() elif self.reduction == 'sum': loss = loss.sum() - elif self.reduction == 'none': - pass return loss class FocalCosineLoss(nn.Module): """Implementation Focal cosine loss. - [Data-Efficient Deep Learning Method for Image Classification Using Data Augmentation, Focal Cosine Loss, and Ensemble](https://arxiv.org/abs/2007.07805). - Source : """ @@ -73,19 +69,17 @@ def __init__(self, alpha=1, gamma=2, xent=0.1, reduction="mean"): self.xent = xent self.reduction = reduction - def forward(self, logits, target): + def forward(self, preds, target): """Forward Method.""" cosine_loss = F.cosine_embedding_loss( - logits, - torch.nn.functional.one_hot(target, num_classes=logits.size(-1)), + preds, + torch.nn.functional.one_hot(target, num_classes=preds.size(-1)), torch.tensor([1], device=target.device), reduction=self.reduction, ) - - cent_loss = F.cross_entropy(F.normalize(logits), target, reduction="none") + cent_loss = F.cross_entropy(F.normalize(preds), target, reduction="none") pt = torch.exp(-cent_loss) focal_loss = self.alpha * (1 - pt) ** self.gamma * cent_loss - if self.reduction == "mean": focal_loss = torch.mean(focal_loss) return cosine_loss + self.xent * focal_loss diff --git a/torchblocks/losses/hard_mining.py b/src/torchblocks/losses/hard_mining.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/hard_mining.py rename to src/torchblocks/losses/hard_mining.py diff --git a/torchblocks/losses/kl_divergence.py b/src/torchblocks/losses/kl_divergence.py old mode 100755 new mode 100644 similarity index 64% rename from torchblocks/losses/kl_divergence.py rename to src/torchblocks/losses/kl_divergence.py index e9b080e..dcb300e --- a/torchblocks/losses/kl_divergence.py +++ b/src/torchblocks/losses/kl_divergence.py @@ -15,6 +15,20 @@ def forward(self, preds, target): return loss +class BKL(nn.Module): + def __init__(self, reduction='mean'): + super(BKL, self).__init__() + self.reduction = reduction + + def forward(self, preds, target): + preds = preds.float() + target = target.float() + loss1 = F.kl_div(F.log_softmax(preds, dim=-1), F.softmax(target, dim=-1), reduction=self.reduction) + loss2 = F.kl_div(F.log_softmax(target, dim=-1), F.softmax(preds, dim=-1), reduction=self.reduction) + loss = (loss1.mean() + loss2.mean()) / 2 + return loss + + class SKL(nn.Module): def __init__(self, epsilon=1e-8): super(SKL, self).__init__() diff --git a/torchblocks/losses/label_smoothing.py b/src/torchblocks/losses/label_smoothing.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/label_smoothing.py rename to src/torchblocks/losses/label_smoothing.py diff --git 
a/src/torchblocks/losses/poly_loss.py b/src/torchblocks/losses/poly_loss.py new file mode 100644 index 0000000..bbab76e --- /dev/null +++ b/src/torchblocks/losses/poly_loss.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class Poly1CrossEntropyLoss(nn.Module): + def __init__(self, + num_classes: int, + epsilon: float = 1.0, + alpha=0.0, + reduction: str = "none", + weight: Tensor = None): + """ + Create instance of Poly1CrossEntropyLoss + :param num_classes: + :param epsilon: + :param reduction: one of none|sum|mean, apply reduction to final loss tensor + :param weight: manual rescaling weight for each class, passed to Cross-Entropy loss + """ + super(Poly1CrossEntropyLoss, self).__init__() + self.num_classes = num_classes + self.epsilon = epsilon + self.reduction = reduction + self.weight = weight + self.alpha = alpha + + def forward(self, preds, target): + """ + Forward pass + :param preds: tensor of shape [N, num_classes] + :param target: tensor of shape [N] + :return: poly cross-entropy loss + """ + labels_onehot = F.one_hot(target, num_classes=self.num_classes).to(device=preds.device, + dtype=preds.dtype) + pt = torch.sum(labels_onehot * F.softmax(preds, dim=-1), dim=-1) + CE = F.cross_entropy(input=preds, + target=target, + reduction='none', + weight=self.weight) + poly2 = CE + self.epsilon * (1 - pt) + self.alpha * (1 - pt) * (1 - pt) + if self.reduction == "mean": + poly2 = poly2.mean() + elif self.reduction == "sum": + poly2 = poly2.sum() + return poly2 + + +class Poly1FocalLoss(nn.Module): + def __init__(self, + num_classes: int, + epsilon: float = 1.0, + alpha: float = 0.25, + gamma: float = 2.0, + reduction: str = "none", + weight: Tensor = None, + pos_weight: Tensor = None, + label_is_onehot: bool = False): + """ + Create instance of Poly1FocalLoss + :param num_classes: number of classes + :param epsilon: poly loss epsilon + :param alpha: focal loss alpha + :param gamma: focal loss gamma + :param reduction: one of none|sum|mean, apply reduction to final loss tensor + :param weight: manual rescaling weight for each class, passed to binary Cross-Entropy loss + :param label_is_onehot: set to True if labels are one-hot encoded + """ + super(Poly1FocalLoss, self).__init__() + self.num_classes = num_classes + self.epsilon = epsilon + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + self.weight = weight + self.pos_weight = pos_weight + self.label_is_onehot = label_is_onehot + + def forward(self, preds, target): + """ + Forward pass + :param preds: output of neural netwrok of shape [N, num_classes] or [N, num_classes, ...] + :param target: ground truth tensor of shape [N] or [N, ...] with class ids if label_is_onehot was set to False, otherwise + one-hot encoded tensor of same shape as logits + :return: poly focal loss + """ + # focal loss implementation taken from + # https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/focal_loss.py + p = torch.sigmoid(preds) + if not self.label_is_onehot: + # if labels are of shape [N] + # convert to one-hot tensor of shape [N, num_classes] + if target.ndim == 1: + target = F.one_hot(target, num_classes=self.num_classes) + # if labels are of shape [N, ...] e.g. segmentation task + # convert to one-hot tensor of shape [N, num_classes, ...] 
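+ # (F.one_hot appends the class dimension last, so the branch below adds a dummy dim,
+ # one-hot encodes to [N, 1, ..., num_classes], moves the class dim to position 1 with
+ # transpose(1, -1), and removes the trailing dummy dim with squeeze_(-1).)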
+ else: + target = F.one_hot(target.unsqueeze(1), self.num_classes).transpose(1, -1).squeeze_(-1) + target = target.to(device=preds.device, dtype=target.dtype) + ce_loss = F.binary_cross_entropy_with_logits(input=preds, + target=target, + reduction="none", + weight=self.weight, + pos_weight=self.pos_weight) + pt = target * p + (1 - target) * (1 - p) + FL = ce_loss * ((1 - pt) ** self.gamma) + if self.alpha >= 0: + alpha_t = self.alpha * target + (1 - self.alpha) * (1 - target) + FL = alpha_t * FL + poly1 = FL + self.epsilon * torch.pow(1 - pt, self.gamma + 1) + if self.reduction == "mean": + poly1 = poly1.mean() + elif self.reduction == "sum": + poly1 = poly1.sum() + return poly1 diff --git a/torchblocks/losses/span_loss.py b/src/torchblocks/losses/span_loss.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/span_loss.py rename to src/torchblocks/losses/span_loss.py diff --git a/torchblocks/losses/symmetric_loss.py b/src/torchblocks/losses/symmetric_loss.py old mode 100755 new mode 100644 similarity index 94% rename from torchblocks/losses/symmetric_loss.py rename to src/torchblocks/losses/symmetric_loss.py index eb7fce3..2b5f507 --- a/torchblocks/losses/symmetric_loss.py +++ b/src/torchblocks/losses/symmetric_loss.py @@ -1,6 +1,7 @@ -import torch.nn.functional as F import torch import torch.nn as nn +import torch.nn.functional as F + class SymmetricCE(nn.Module): @@ -9,7 +10,7 @@ class SymmetricCE(nn.Module): Paper: https://arxiv.org/abs/1908.06112 """ - def __init__(self, num_classes, alpha: float = 1.0, beta: float = 1.0): + def __init__(self, num_classes, alpha=1.0, beta=1.0): """Constructor method for symmetric CE. Args: diff --git a/torchblocks/losses/triplet_loss.py b/src/torchblocks/losses/triplet_loss.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/losses/triplet_loss.py rename to src/torchblocks/losses/triplet_loss.py diff --git a/torchblocks/metrics/__init__.py b/src/torchblocks/metrics/__init__.py old mode 100755 new mode 100644 similarity index 88% rename from torchblocks/metrics/__init__.py rename to src/torchblocks/metrics/__init__.py index 64cdae3..585371c --- a/torchblocks/metrics/__init__.py +++ b/src/torchblocks/metrics/__init__.py @@ -1,6 +1,5 @@ from .base import Metric -from .sequence_labeling.scheme import get_scheme from .sequence_labeling.precision_recall_fscore import precision_recall_fscore_support from .sequence_labeling.seqTag_score import SequenceLabelingScore diff --git a/torchblocks/metrics/base.py b/src/torchblocks/metrics/base.py old mode 100755 new mode 100644 similarity index 95% rename from torchblocks/metrics/base.py rename to src/torchblocks/metrics/base.py index 51bcd6f..c44ceff --- a/torchblocks/metrics/base.py +++ b/src/torchblocks/metrics/base.py @@ -1,6 +1,7 @@ class Metric: """Store the average and current value for a set of metrics. 
""" + def update(self, preds, target): raise NotImplementedError @@ -12,4 +13,3 @@ def name(self): def reset(self): pass - diff --git a/torchblocks/tasks/__init__.py b/src/torchblocks/metrics/classification/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/tasks/__init__.py rename to src/torchblocks/metrics/classification/__init__.py diff --git a/src/torchblocks/metrics/classification/accuracy.py b/src/torchblocks/metrics/classification/accuracy.py new file mode 100644 index 0000000..ce60fb9 --- /dev/null +++ b/src/torchblocks/metrics/classification/accuracy.py @@ -0,0 +1,56 @@ +from typing import Optional +from typing_extensions import Literal +from ..base import Metric +from torchmetrics.classification.accuracy import Accuracy as _Accuracy + + +class Accuracy(Metric): + ''' + Computes accuracy. Works with binary, multiclass, and multilabel data. + Accepts logits from a model output or integer class values in prediction. + Works with multi-dimensional preds and target. + Args: + threshold: + Threshold value for binary or multi-label logits. default: 0.5 + ''' + + def __init__(self, task: Literal["binary", "multiclass", "multilabel"], + threshold: float = 0.5, + num_classes: Optional[int] = None, + num_labels: Optional[int] = None, + average: Optional[Literal["micro", "macro", "weighted", "none"]] = "micro", + multidim_average: Literal["global", "samplewise"] = "global", + top_k: Optional[int] = 1, + ignore_index: Optional[int] = None, + validate_args: bool = True): + self.task = task + self.threshold = threshold + self.num_classes = num_classes + self.num_labels = num_labels + self.average = average + self.multidim_average = multidim_average + self.top_k = top_k + self.ignore_index = ignore_index + self.validate_args = validate_args + self.reset() + + def reset(self): + self.method = _Accuracy(task=self.task, + threshold=self.threshold, + num_classes=self.num_classes, + num_labels=self.num_labels, + average=self.average, + multidim_average=self.multidim_average, + top_k=self.top_k, + ignore_index=self.ignore_index, + validate_args=self.validate_args) + + def update(self, preds, target): + self.method.update(preds, target) + + def value(self): + score = self.method.compute() + return score.item() + + def name(self): + return 'acc' diff --git a/src/torchblocks/metrics/classification/auc.py b/src/torchblocks/metrics/classification/auc.py new file mode 100644 index 0000000..1474bb9 --- /dev/null +++ b/src/torchblocks/metrics/classification/auc.py @@ -0,0 +1,51 @@ +from ..base import Metric +from typing import List, Optional, Union +from torch import Tensor +from typing_extensions import Literal +from torchmetrics.classification.auroc import AUROC as _AUROC + + +class AUC(Metric): + ''' + Area Under Curve + ''' + + def __init__(self, + task: Literal["binary", "multiclass", "multilabel"], + thresholds: Optional[Union[int, List[float], Tensor]] = None, + num_classes: Optional[int] = None, + num_labels: Optional[int] = None, + average: Optional[Literal["macro", "weighted", "none"]] = "macro", + max_fpr: Optional[float] = None, + ignore_index: Optional[int] = None, + validate_args: bool = True, + ): + self.task = task, + self.thresholds = thresholds, + self.num_classes = num_classes, + self.num_labels = num_labels, + self.average = average, + self.max_fpr = max_fpr, + self.ignore_index = ignore_index, + self.validate_args = validate_args, + self.reset() + + def reset(self): + self.method = _AUROC(task=self.task, + thresholds=self.thresholds, + 
num_classes=self.num_classes, + num_labels=self.num_labels, + average=self.average, + max_fpr=self.max_fpr, + ignore_index=self.ignore_index, + validate_args=self.validate_args) + + def update(self, preds, target): + self.method.update(preds, target) + + def value(self): + score = self.method.compute() + return score.item() + + def name(self): + return 'auc' diff --git a/src/torchblocks/metrics/classification/f1_score.py b/src/torchblocks/metrics/classification/f1_score.py new file mode 100644 index 0000000..8febde0 --- /dev/null +++ b/src/torchblocks/metrics/classification/f1_score.py @@ -0,0 +1,52 @@ +from ..base import Metric +from typing import Optional +from typing_extensions import Literal +from torchmetrics.classification.f_beta import F1Score as _F1Score + + +class F1Score(Metric): + ''' + F1 Score + ''' + + def __init__(self, task: Literal["binary", "multiclass", "multilabel"], + threshold: float = 0.5, + num_classes: Optional[int] = None, + num_labels: Optional[int] = None, + average: Optional[Literal["micro", "macro", "weighted", "none"]] = "micro", + multidim_average: Optional[Literal["global", "samplewise"]] = "global", + top_k: Optional[int] = 1, + ignore_index: Optional[int] = None, + validate_args: bool = True, + ): + self.task = task + self.threshold = threshold + self.num_classes = num_classes + self.num_labels = num_labels + self.average = average + self.multidim_average = multidim_average + self.top_k = top_k + self.ignore_index = ignore_index + self.validate_args = validate_args + self.reset() + + def reset(self): + self.method = _F1Score(task=self.task, + threshold=self.threshold, + num_classes=self.num_classes, + num_labels=self.num_labels, + average=self.average, + multidim_average=self.multidim_average, + top_k=self.top_k, + ignore_index=self.ignore_index, + validate_args=self.validate_args, ) + + def update(self, preds, target): + self.method.update(preds, target) + + def value(self): + score = self.method.compute() + return score.item() + + def name(self): + return 'f1' diff --git a/src/torchblocks/metrics/classification/matthews_corrcoef.py b/src/torchblocks/metrics/classification/matthews_corrcoef.py new file mode 100644 index 0000000..3c97ded --- /dev/null +++ b/src/torchblocks/metrics/classification/matthews_corrcoef.py @@ -0,0 +1,51 @@ +import torchmetrics +from ..base import Metric +from packaging import version +from typing import Optional +from typing_extensions import Literal + +if version.parse(torchmetrics.__version__) >= version.parse("0.11.3"): + from torchmetrics.classification.matthews_corrcoef import MatthewsCorrCoef as _MatthewsCorrcoef +else: + msg = ("The torchmetrics package version needs to be greater than 0.11.3, please update") + raise ImportError(msg) + + +class MattewsCorrcoef(Metric): + ''' + Matthews Correlation Coefficient + ''' + + def __init__(self, task: Literal["binary", "multiclass", "multilabel"] = None, + threshold: float = 0.5, + num_classes: Optional[int] = None, + num_labels: Optional[int] = None, + ignore_index: Optional[int] = None, + validate_args: bool = True + ): + self.task = task + self.threshold = threshold + self.num_classes = num_classes + self.num_labels = num_labels + self.ignore_index = ignore_index + self.validate_args = validate_args + self.reset() + + def reset(self): + self.method = _MatthewsCorrcoef(task=self.task, + threshold=self.threshold, + num_classes=self.num_classes, + num_labels=self.num_labels, + ignore_index=self.ignore_index, + validate_args=self.validate_args + ) + + def value(self): + score = 
self.method.compute() + return score.item() + + def update(self, preds, target): # type: ignore + self.method.update(preds, target) + + def name(self): + return 'mcc' diff --git a/src/torchblocks/metrics/sequence_labeling/__init__.py b/src/torchblocks/metrics/sequence_labeling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/torchblocks/metrics/sequence_labeling/precision_recall_fscore.py b/src/torchblocks/metrics/sequence_labeling/precision_recall_fscore.py old mode 100755 new mode 100644 similarity index 94% rename from torchblocks/metrics/sequence_labeling/precision_recall_fscore.py rename to src/torchblocks/metrics/sequence_labeling/precision_recall_fscore.py index d1bbaf3..cb85969 --- a/torchblocks/metrics/sequence_labeling/precision_recall_fscore.py +++ b/src/torchblocks/metrics/sequence_labeling/precision_recall_fscore.py @@ -1,8 +1,8 @@ import numpy as np from typing import * from collections import defaultdict -from torchblocks.metrics.sequence_labeling.scheme import get_scheme -from torchblocks.metrics.sequence_labeling.util import _prf_divide,_warn_prf,check_consistent_length +from ...tasks.sequence_tags import get_scheme +from ..sequence_labeling.util import _prf_divide, _warn_prf, check_consistent_length PER_CLASS_SCORES = Tuple[List[float], List[float], List[float], List[int]] AVERAGE_SCORES = Tuple[float, float, float, int] @@ -154,7 +154,7 @@ def precision_recall_fscore_support(y_true: Union[List[List[str]], List[List[Tup support : int (if average is not None) or array of int, shape = [n_unique_labels] The number of occurrences of each label in ``y_true``. Examples: - >>> from torchblocks.metrics.sequence_labeling.precision_recall_fscore import precision_recall_fscore_support + >>> from src.torchblocks import precision_recall_fscore_support >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] >>> precision_recall_fscore_support(y_true, y_pred, average='macro') @@ -178,6 +178,7 @@ def precision_recall_fscore_support(y_true: Union[List[List[str]], List[List[Tup def extract_tp_actual_correct(y_true, y_pred, suffix, *args): entities_true = defaultdict(set) entities_pred = defaultdict(set) + # [['O', 'O', 'O'], ['B-PER', 'I-PER', 'O']] if len(y_pred[0]) > 0 and isinstance(y_pred[0][0], str): check_consistent_length(y_true, y_pred) get_entities = get_scheme(scheme_type=schema) @@ -187,10 +188,11 @@ def extract_tp_actual_correct(y_true, y_pred, suffix, *args): for type_name, start, end in get_entities(y_p): entities_pred[type_name].add((i, start, end)) else: + # [('TITLE',35, 41),('TITLE',35, 41),('TITLE',35, 41)] for i, (y_t, y_p) in enumerate(zip(y_true, y_pred)): - for start, end, type_name in y_t: + for type_name, start, end,*_ in y_t: entities_true[type_name].add((i, start, end)) - for start, end, type_name in y_p: + for type_name, start, end,*_ in y_p: entities_pred[type_name].add((i, start, end)) if labels is not None: entities_true = {k: v for k, v in entities_true.items() if k in labels} diff --git a/torchblocks/metrics/sequence_labeling/seqTag_score.py b/src/torchblocks/metrics/sequence_labeling/seqTag_score.py old mode 100755 new mode 100644 similarity index 71% rename from torchblocks/metrics/sequence_labeling/seqTag_score.py rename to src/torchblocks/metrics/sequence_labeling/seqTag_score.py index 50c870a..1233f95 --- a/torchblocks/metrics/sequence_labeling/seqTag_score.py +++ 
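
For orientation, a minimal sketch of calling precision_recall_fscore_support with the tuple-style entities handled by the change above, where each entity is (type_name, start, end); the import path assumes the package is installed as torchblocks, and the labels and offsets are purely illustrative:

from torchblocks.metrics.sequence_labeling.precision_recall_fscore import precision_recall_fscore_support

# One list of (type_name, start, end) tuples per example; extra tuple items are ignored via *_
y_true = [[('TITLE', 35, 41), ('PER', 0, 3)], [('ORG', 5, 9)]]
y_pred = [[('TITLE', 35, 41)], [('ORG', 5, 9), ('PER', 1, 2)]]

precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, average='micro')
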
b/src/torchblocks/metrics/sequence_labeling/seqTag_score.py @@ -1,6 +1,6 @@ import pandas as pd -from torchblocks.metrics.base import Metric -from torchblocks.metrics.sequence_labeling.precision_recall_fscore import precision_recall_fscore_support +from ..base import Metric +from .precision_recall_fscore import precision_recall_fscore_support class SequenceLabelingScore(Metric): @@ -25,8 +25,10 @@ def value(self): values.append([label, p, r, f, s]) df = pd.DataFrame(values, columns=columns) f1 = df[df['label'] == self.average]['f1'].item() + recall = df[df['label'] == self.average]['recall'].item() + precision = df[df['label'] == self.average]['precision'].item() return { - "df": df, f"f1_{self.average}": f1, # for monitor + "df": df, f"f1_{self.average}": f1, 'precision': precision, 'recall': recall # for monitor } def name(self): @@ -34,4 +36,4 @@ def name(self): def reset(self): self.preds = [] - self.target = [] \ No newline at end of file + self.target = [] diff --git a/torchblocks/metrics/sequence_labeling/util.py b/src/torchblocks/metrics/sequence_labeling/util.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/metrics/sequence_labeling/util.py rename to src/torchblocks/metrics/sequence_labeling/util.py diff --git a/src/torchblocks/models/__init__.py b/src/torchblocks/models/__init__.py new file mode 100644 index 0000000..d9a312e --- /dev/null +++ b/src/torchblocks/models/__init__.py @@ -0,0 +1,2 @@ +from .nezha.configuration_nezha import * +from .nezha.modeling_nezha import * \ No newline at end of file diff --git a/torchblocks/models/configuration_base.py b/src/torchblocks/models/configuration_base.py old mode 100755 new mode 100644 similarity index 96% rename from torchblocks/models/configuration_base.py rename to src/torchblocks/models/configuration_base.py index 37232d7..018273b --- a/torchblocks/models/configuration_base.py +++ b/src/torchblocks/models/configuration_base.py @@ -1,14 +1,14 @@ """ Configuration base class and utils.""" +import os import copy import json import logging -import os logger = logging.getLogger(__name__) CONFIG_NAME = 'config.json' -class TrainConfig(object): +class TrainConfigBuilder(object): def __init__(self, **kwargs): self.model_type = kwargs.pop("model_type", '') # Additional attributes without default values diff --git a/torchblocks/models/model_base.py b/src/torchblocks/models/model_base.py old mode 100755 new mode 100644 similarity index 94% rename from torchblocks/models/model_base.py rename to src/torchblocks/models/model_base.py index fd3e278..9efa733 --- a/torchblocks/models/model_base.py +++ b/src/torchblocks/models/model_base.py @@ -3,7 +3,7 @@ import torch from torch import nn import torch.nn.init as init -from .configuration_base import TrainConfig +from .configuration_base import TrainConfigBuilder logger = logging.getLogger(__name__) WEIGHTS_NAME = 'pytorch_model.bin' @@ -18,10 +18,11 @@ 'uniform': init.uniform_ } -class TrainModel(nn.Module): + +class TrainModelBuilder(nn.Module): def __init__(self, config, *inputs, **kwargs): super().__init__() - if not isinstance(config, TrainConfig): + if not isinstance(config, TrainConfigBuilder): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `TrainConfig`. 
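
All of the metric classes in this patch, from the classification wrappers through SequenceLabelingScore above, follow the same update()/value()/name()/reset() protocol, so they can be swapped behind one evaluation loop. A hedged sketch using the Accuracy wrapper (the import path and tensors are illustrative; torchmetrics >= 0.11.3 assumed):

import torch
from torchblocks.metrics.classification.accuracy import Accuracy

metric = Accuracy(task="binary")
eval_batches = [(torch.tensor([0.9, 0.2, 0.7]), torch.tensor([1, 0, 0]))]  # (probabilities, labels)

for preds, target in eval_batches:
    metric.update(preds, target)        # accumulate statistics batch by batch
print(metric.name(), metric.value())    # a single float via compute().item()
metric.reset()                          # rebuilds the underlying torchmetrics object
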
" "To create a model from a train model use " @@ -58,7 +59,7 @@ def from_pretrained(cls, model_path, *model_args, **kwargs): config = kwargs.pop("config", None) state_dict = kwargs.pop("state_dict", None) # Load config if we don't provide a configuration - if not isinstance(config, TrainConfig): + if not isinstance(config, TrainConfigBuilder): config_path = config if config is not None else model_path config, model_kwargs = cls.config_class.from_pretrained( config_path, diff --git a/src/torchblocks/models/nezha/__init__.py b/src/torchblocks/models/nezha/__init__.py new file mode 100644 index 0000000..21c6d67 --- /dev/null +++ b/src/torchblocks/models/nezha/__init__.py @@ -0,0 +1,2 @@ +from .modeling_nezha import * +from .configuration_nezha import * \ No newline at end of file diff --git a/src/torchblocks/models/nezha/configuration_nezha.py b/src/torchblocks/models/nezha/configuration_nezha.py new file mode 100644 index 0000000..d92991d --- /dev/null +++ b/src/torchblocks/models/nezha/configuration_nezha.py @@ -0,0 +1,124 @@ +from transformers import PretrainedConfig + +NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class NeZhaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. + It is used to instantiate an ALBERT model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ALBERT `xxlarge `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30000): + Vocabulary size of the ALBERT model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. + embedding_size (:obj:`int`, optional, defaults to 128): + Dimensionality of vocabulary embeddings. + hidden_size (:obj:`int`, optional, defaults to 4096): + Dimensionality of the encoder modules and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden modules in the Transformer encoder. + num_hidden_groups (:obj:`int`, optional, defaults to 1): + Number of groups for the hidden modules, parameters in the same group are shared. + num_attention_heads (:obj:`int`, optional, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 16384): + The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + inner_group_num (:obj:`int`, optional, defaults to 1): + The number of inner repetition of attention and ffn. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout probability for all fully connected modules in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. 
Typically set this to something + large (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization modules. + classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for attached classifiers. + + Example:: + + from transformers import AlbertConfig, AlbertModel + # Initializing an ALBERT-xxlarge style configuration + albert_xxlarge_configuration = AlbertConfig() + + # Initializing an ALBERT-base style configuration + albert_base_configuration = AlbertConfig( + hidden_size=768, + num_attention_heads=12, + intermediate_size=3072, + ) + + # Initializing a model from the ALBERT-base style configuration + model = AlbertModel(albert_xxlarge_configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. + """ + + pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "nezha" + + def __init__( + self, + vocab_size=30000, + hidden_size=768, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=12, + intermediate_size=3072, + inner_group_num=1, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + max_relative_position=64, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + classifier_dropout_prob=0.1, + use_relative_position=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_hidden_groups = num_hidden_groups + self.num_attention_heads = num_attention_heads + self.inner_group_num = inner_group_num + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.max_relative_position = max_relative_position + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_relative_position = use_relative_position + self.classifier_dropout_prob = classifier_dropout_prob diff --git a/src/torchblocks/models/nezha/modeling_nezha.py b/src/torchblocks/models/nezha/modeling_nezha.py new file mode 100644 index 0000000..714b1bf --- /dev/null +++ b/src/torchblocks/models/nezha/modeling_nezha.py @@ -0,0 +1,1236 @@ +import math +import os +import logging +import torch + +from torch import nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss, MSELoss +base_dir = os.path.dirname(__file__) +print(base_dir) +import sys +sys.path.append(os.path.join(base_dir, '/')) +from configuration_nezha import NeZhaConfig +from transformers.file_utils import add_start_docstrings +from transformers.modeling_utils import PreTrainedModel, prune_linear_layer +from transformers.models.bert.modeling_bert 
import ( + BertOutput, + BertPooler, + BertSelfOutput, + BertIntermediate, + BertOnlyMLMHead, + BertOnlyNSPHead, + BertPreTrainingHeads, + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) + +logger = logging.getLogger(__name__) + +_CONFIG_FOR_DOC = "NeZhaConfig" +_TOKENIZER_FOR_DOC = "NeZhaTokenizer" + +NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] +NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} + + +def add_start_docstrings_to_model_forward(*docstr): + def docstring_decorator(fn): + class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) + intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within this function, one should call the + :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post + processing steps while the latter silently ignores them. + """ + fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + # logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", + "global_step", "good_steps", "loss_scale", 'bad_steps'] + for n in name + ): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + 
pointer.data = torch.from_numpy(array) + return model + + +class NeZhaEmbeddings(nn.Module): + """ + Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super().__init__() + self.use_relative_position = config.use_relative_position + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + device = input_ids.device if input_ids is not None else inputs_embeds.device + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_embeds + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +def relative_position_encoding(depth, max_length=512, max_relative_position=127): + vocab_size = max_relative_position * 2 + 1 + range_vec = torch.arange(max_length) + range_mat = range_vec.repeat(max_length).view(max_length, max_length) + distance_mat = range_mat - torch.t(range_mat) + distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) + final_mat = distance_mat_clipped + max_relative_position + + embeddings_table = torch.zeros(vocab_size, depth) + position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) + embeddings_table[:, 0::2] = torch.sin(position * div_term) + embeddings_table[:, 1::2] = torch.cos(position * div_term) + embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) + + flat_relative_positions_matrix = final_mat.view(-1) + one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, + num_classes=vocab_size).float() + positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) + my_shape = list(final_mat.size()) + my_shape.append(depth) + positions_encoding = positions_encoding.view(my_shape) + return positions_encoding + + +class NeZhaSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + 
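+        # Illustrative note (an assumption-free shape check, not part of the original file):
+        # relative_position_encoding above returns one sinusoidal embedding per
+        # (query, key) position pair, with distances clipped to +/- max_relative_position.
+        #
+        #   table = relative_position_encoding(depth=64, max_length=512, max_relative_position=64)
+        #   table.shape  ->  torch.Size([512, 512, 64])   # depth = hidden_size / num_attention_heads for NeZha-base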
self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, + depth=self.attention_head_size, + max_relative_position=config.max_relative_position) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + if encoder_hidden_states is not None: + mixed_key_layer = self.key(encoder_hidden_states) + mixed_value_layer = self.value(encoder_hidden_states) + attention_mask = encoder_attention_mask + else: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() + + relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) + query_layer_t = query_layer.permute(2, 0, 1, 3) + + query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, + self.attention_head_size) + key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) + key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, + num_attention_heads, from_seq_length) + key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) + attention_scores = attention_scores + key_position_scores_r_t + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all modules in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) + attention_probs_t = attention_probs.permute(2, 0, 1, 3) + attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, + to_seq_length) + value_position_scores = torch.matmul(attentions_probs_r, relations_values) + value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, + num_attention_heads, self.attention_head_size) + value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) + context_layer = context_layer + value_position_scores_r_t + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class NeZhaAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = NeZhaSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear modules + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + self_outputs = self.self( + hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class NeZhaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = NeZhaAttention(config) + self.is_decoder = config.is_decoder + if self.is_decoder: + self.crossattention = NeZhaAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we 
output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + outputs + return outputs + + +class NeZhaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = True + self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask + ) + hidden_states = layer_outputs[0] + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class NeZhaPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + config_class = NeZhaConfig + pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_nezha + base_model_prefix = "bert" + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class NeZhaModel(NeZhaPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention modules, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`; an + :obj:`encoder_hidden_states` is expected as an input to the forward pass. + + .. 
_`Attention is all you need`: + https://arxiv.org/abs/1706.03762 + + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + self.embeddings = NeZhaEmbeddings(config) + self.encoder = NeZhaEncoder(config) + self.pooler = BertPooler(config) + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during pre-training. + + This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
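
Before the generic BERT example below, a hedged NeZha-specific sketch; the checkpoint directory is an assumption (any local folder holding a converted NeZha config.json and weights), and the WordPiece tokenizer choice mirrors common NeZha releases:

import torch
from transformers import BertTokenizer
from torchblocks.models.nezha import NeZhaModel   # assumed install path

tokenizer = BertTokenizer.from_pretrained("./nezha-cn-base")   # assumed local checkpoint directory
model = NeZhaModel.from_pretrained("./nezha-cn-base")

input_ids = torch.tensor(tokenizer.encode("今天天气不错", add_special_tokens=True)).unsqueeze(0)
sequence_output, pooled_output = model(input_ids)[:2]
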
+ + Examples:: + + from transformers import BertModel, BertTokenizer + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, self.device + ) + + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and + a `next sentence prediction (classification)` head. 
""", + BERT_START_DOCSTRING, +) +class NeZhaForPreTraining(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bert = NeZhaModel(config) + self.cls = BertPreTrainingHeads(config) + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + ): + r""" + masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False + continuation before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ + + Examples:: + + from transformers import BertTokenizer, BertForPreTraining + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + # add hidden states and attention if they are here + outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] + + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + outputs = (total_loss,) + outputs + + return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) +class NeZhaForMaskedLM(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bert = NeZhaModel(config) + self.cls = BertOnlyMLMHead(config) + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + ): + r""" + masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the left-to-right language modeling loss (next word prediction). + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): + Next token prediction loss. 
+ prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import BertTokenizer, BertForMaskedLM + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMaskedLM.from_pretrained('bert-base-uncased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + + loss, prediction_scores = outputs[:2] + + """ + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + # Although this may seem awkward, BertForMaskedLM supports two scenarios: + # 1. If a tensor that contains the indices of masked labels is provided, + # the cross-entropy is the MLM cross-entropy that measures the likelihood + # of predictions for masked words. + # 2. If `lm_labels` is provided we are in a causal scenario where we + # try to predict the next token for each input in the decoder. 
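+        # Note: only `labels` feeds the loss computation below; `masked_lm_labels`
+        # is assigned here but never read afterwards.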
+ masked_lm_labels = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + outputs = (loss,) + outputs + return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # if model is does not use a causal mask then add a dummy token + if self.config.is_decoder is False: + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat( + [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 + ) + + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) +class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bert = NeZhaModel(config) + self.cls = BertOnlyNSPHead(config) + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + next_sentence_label=None, + ): + r""" + next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): + Next sequence prediction (classification) loss. + seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import BertTokenizer, BertForNextSentencePrediction + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + + seq_relationship_scores = outputs[0] + + """ + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + pooled_output = outputs[1] + seq_relationship_score = self.cls(pooled_output) + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + outputs = (next_sentence_loss,) + outputs + + return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, +) +class NeZhaForSequenceClassification(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = NeZhaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
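
The sequence-classification head above is added without a usage docstring; a hedged fine-tuning-style sketch (checkpoint path, tokenizer, and num_labels are assumptions):

import torch
from transformers import BertTokenizer
from torchblocks.models.nezha import NeZhaForSequenceClassification  # assumed install path

tokenizer = BertTokenizer.from_pretrained("./nezha-cn-base")
model = NeZhaForSequenceClassification.from_pretrained("./nezha-cn-base", num_labels=2)

input_ids = torch.tensor([tokenizer.encode("这部电影真好看", add_special_tokens=True)])
labels = torch.tensor([1])
loss, logits = model(input_ids, labels=labels)[:2]
loss.backward()
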
""", + BERT_START_DOCSTRING, +) +class NeZhaForMultipleChoice(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bert = NeZhaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + labels=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ + Examples:: + + from transformers import BertTokenizer, BertForMultipleChoice + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + + input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + + loss, classification_scores = outputs[:2] + + """ + num_choices = input_ids.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, +) +class NeZhaForTokenClassification(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = NeZhaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + position_ids=None, + inputs_embeds=None, + labels=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import BertTokenizer, BertForTokenClassification + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForTokenClassification.from_pretrained('bert-base-uncased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + + loss, scores = outputs[:2] + + """ + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + modules on top of the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, +) +class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bert = NeZhaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + head_mask=None, + inputs_embeds=None, + position_ids=None, + start_positions=None, + end_positions=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import BertTokenizer, BertForQuestionAnswering + import torch + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') + + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + encoding = tokenizer.encode_plus(question, text) + input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] + start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) + + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) + + assert answer == "a nice puppet" + + """ + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/src/torchblocks/models/utils.py b/src/torchblocks/models/utils.py new file mode 100644 index 0000000..bce7a1d --- /dev/null +++ 
b/src/torchblocks/models/utils.py
@@ -0,0 +1,103 @@
+import torch
+import torch.nn as nn
+
+
+def open_all_layers(model):
+    r"""Open all layers of the model for training.
+
+    Examples::
+        >>> open_all_layers(model)
+    """
+    model.train()
+    for p in model.parameters():
+        p.requires_grad = True
+
+
+def freeze_topK(n, model):
+    """Freeze the first n parameter tensors of the model.
+    * **n** - starting from the input side, freeze all parameter tensors up to the n-th one inclusively
+    """
+    layers = list(model.parameters())
+    # Freeze the first n parameter tensors
+    for param in layers[:n]:
+        param.requires_grad = False
+    for param in layers[n:]:
+        param.requires_grad = True
+
+
+def freeze(model):
+    for param in model.parameters():
+        param.requires_grad = False
+
+
+def unfreeze(model):
+    for param in model.parameters():
+        param.requires_grad = True
+
+
+def reinit_last_layers(model, num_layers):
+    """Re-initialize the last ``num_layers`` transformer encoder layers.
+
+    Args:
+        model: The target transformer model.
+        num_layers: The number of encoder layers to be re-initialized.
+    """
+    if num_layers > 0:
+        base_model = getattr(model, model.base_model_prefix)
+        base_model.encoder.layer[-num_layers:].apply(model._init_weights)
+
+
+def get_parameter_groups(module):
+    """Get parameter groups for transformer training.
+
+    It is well-known that excluding layer-norm and bias parameters from weight-decay
+    leads to better performance when training transformer-based models. To achieve that,
+    this function creates separate parameter groups: one for parameters that receive
+    weight-decay and one for parameters that do not.
+
+    Args:
+        module: The target module to get the parameters from.
+
+    Returns:
+        A list of two parameter groups.
+    """
+    do_decay = [p for p in module.parameters() if p.ndim >= 2]
+    no_decay = [p for p in module.parameters() if p.ndim < 2]
+    return [{"params": do_decay}, {"params": no_decay, "weight_decay": 0.0}]
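+
+# Typical use (an illustrative sketch, not prescribed by this module): pass the two groups
+# straight to an optimizer so that weight matrices receive weight decay while biases and
+# layer-norm weights keep weight_decay=0.0, e.g.
+#     optimizer = AdamW(get_parameter_groups(model), lr=2e-5, weight_decay=0.01)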
+
+
+def open_specified_layers(model, open_layers):
+    r"""Open the specified modules of the model for training while keeping
+    all other modules frozen.
+
+    Args:
+        model (nn.Module): neural net model.
+        open_layers (str or list): names of the child modules to open for training.
+
+    Examples::
+        >>> # Only model.classifier will be updated.
+        >>> open_layers = 'classifier'
+        >>> open_specified_layers(model, open_layers)
+        >>> # Only model.fc and model.classifier will be updated.
+        >>> open_layers = ['fc', 'classifier']
+        >>> open_specified_layers(model, open_layers)
+    """
+    if isinstance(model, nn.DataParallel):
+        model = model.module
+
+    if isinstance(open_layers, str):
+        open_layers = [open_layers]
+
+    for layer in open_layers:
+        assert hasattr(
+            model, layer
+        ), '"{}" is not an attribute of the model, please provide the correct name'.format(
+            layer
+        )
+    for name, module in model.named_children():
+        if name in open_layers:
+            module.train()
+            unfreeze(module)
+        else:
+            module.eval()
+            freeze(module)
diff --git a/src/torchblocks/modules/__init__.py b/src/torchblocks/modules/__init__.py
new file mode 100644
index 0000000..25dc9d3
--- /dev/null
+++ b/src/torchblocks/modules/__init__.py
@@ -0,0 +1,5 @@
+from .activations import *
+from .biaffine import *
+from .conv import *
+from .linears import *
+from .crf import *
\ No newline at end of file
diff --git a/torchblocks/layers/activations.py b/src/torchblocks/modules/activations.py
old mode 100755
new mode 100644
similarity index 100%
rename from torchblocks/layers/activations.py
rename to src/torchblocks/modules/activations.py
diff --git a/torchblocks/layers/attentions.py b/src/torchblocks/modules/attentions.py
old mode 100755
new mode 100644
similarity index 96%
rename from torchblocks/layers/attentions.py
rename to src/torchblocks/modules/attentions.py
index 090f18d..8c443e4
--- a/torchblocks/layers/attentions.py
+++ b/src/torchblocks/modules/attentions.py
@@ -38,7 +38,7 @@ def forward(self,q,k,v,mask = None):
             # mask 1:表示真实的 ,0时padding
             mask = mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             mask = (1.0 - mask) * -10000.0
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            # Apply the attention mask is (precomputed for all modules in BertModel forward() function)
             attention_scores = attention_scores + mask
         # Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores) diff --git a/src/torchblocks/modules/biaffine.py b/src/torchblocks/modules/biaffine.py new file mode 100644 index 0000000..06a1a3d --- /dev/null +++ b/src/torchblocks/modules/biaffine.py @@ -0,0 +1,42 @@ +import torch + + +class Biaffine(torch.nn.Module): + """Biaffine Attention""" + + def __init__(self, in_features: int, out_features: int, bias=(True, True)): + super().__init__() + self.in_features = in_features # mlp_arc_size / mlp_label_size + self.out_features = out_features # 1 / rel_size + self.bias = bias + # arc: mlp_size + # label: mlp_size + 1 + self.linear_input_size = in_features + bias[0] + # arc: mlp_size * 1 + # label: (mlp_size + 1) * rel_size + self.linear_output_size = out_features * (in_features + bias[1]) + self.linear = torch.nn.Linear( + in_features=self.linear_input_size, out_features=self.linear_output_size, bias=False + ) + + def forward(self, input1, input2): # noqa: D102 + batch_size, len1, dim1 = input1.size() + batch_size, len2, dim2 = input2.size() + if self.bias[0]: + ones = input1.data.new_ones(batch_size, len1, 1) + input1 = torch.cat((input1, ones), dim=-1) + # dim1 += 1 + if self.bias[1]: + ones = input2.data.new_ones(batch_size, len2, 1) + input2 = torch.cat((input2, ones), dim=-1) + # dim2 += 1 + # (bz, len1, dim1+1) -> (bz, len1, linear_output_size) + affine = self.linear(input1) + # (bz, len1 * self.out_features, dim2) + affine = affine.reshape(batch_size, len1 * self.out_features, -1) + # (bz, len1 * out_features, dim2) * (bz, dim2, len2) + # -> (bz, len1 * out_features, len2) -> (bz, len2, len1 * out_features) + biaffine = torch.bmm(affine, input2.transpose(1, 2)).transpose(1, 2).contiguous() + # (bz, len2, len1, out_features) # out_features: 1 or rel_size + biaffine = biaffine.reshape((batch_size, len2, len1, -1)).squeeze(-1) + return biaffine diff --git a/torchblocks/layers/capsule.py b/src/torchblocks/modules/capsule.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/layers/capsule.py rename to src/torchblocks/modules/capsule.py diff --git a/torchblocks/layers/conv.py b/src/torchblocks/modules/conv.py old mode 100755 new mode 100644 similarity index 54% rename from torchblocks/layers/conv.py rename to src/torchblocks/modules/conv.py index ceaf107..a558ae7 --- a/torchblocks/layers/conv.py +++ b/src/torchblocks/modules/conv.py @@ -2,6 +2,7 @@ import torch.nn as nn import torch.nn.functional as F + class Conv1D(nn.Module): def __init__(self, in_channels, out_channels): """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) @@ -20,6 +21,7 @@ def forward(self, x): x = x.view(*size_out) return x + class MaskedConv1d(nn.Conv1d): def __init__(self, in_channels, out_channels, kernel_size, dilation=1, @@ -52,6 +54,7 @@ def forward(self, inputs): return output * mask + class DilateConvLayer(nn.Module): def __init__(self, input_size, channels, dilation, dropout=0.1): super(DilateConvLayer, self).__init__() @@ -74,4 +77,48 @@ def forward(self, x): outputs.append(x) outputs = torch.cat(outputs, dim=1) outputs = outputs.permute(0, 2, 3, 1).contiguous() - return outputs \ No newline at end of file + return outputs + + +class ConvolutionCharEncoder(nn.Module): + r""" + char级别的卷积编码器. + """ + + def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5)): + r""" + :param int char_emb_size: char级别embedding的维度. Default: 50 + :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. + :param tuple feature_maps: 一个由int组成的tuple. 
tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter. + :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核. + :param initial_method: 初始化参数的方式, 默认为`xavier normal` + """ + super(ConvolutionCharEncoder, self).__init__() + self.convs = nn.ModuleList([ + nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, + padding=(0, kernels[i] // 2)) + for i in range(len(kernels))]) + + def forward(self, x): + r""" + :param torch.Tensor x: ``[batch_size * sent_length, word_length, char_emb_size]`` 输入字符的embedding + :return: torch.Tensor : 卷积计算的结果, 维度为[batch_size * sent_length, sum(feature_maps), 1] + """ + x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) + # [batch_size*sent_length, channel, width, height] + x = x.transpose(2, 3) + # [batch_size*sent_length, channel, height, width] + return self._convolute(x).unsqueeze(2) + + def _convolute(self, x): + feats = [] + for conv in self.convs: + y = conv(x) + # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1] + y = torch.squeeze(y, 2) + # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1] + y = torch.tanh(y) + y, __ = torch.max(y, 2) + # [batch_size*sent_length, feature_maps[i]] + feats.append(y) + return torch.cat(feats, 1) # [batch_size*sent_length, sum(feature_maps)] diff --git a/torchblocks/layers/crf.py b/src/torchblocks/modules/crf.py old mode 100755 new mode 100644 similarity index 99% rename from torchblocks/layers/crf.py rename to src/torchblocks/modules/crf.py index 6148d22..0b954e3 --- a/torchblocks/layers/crf.py +++ b/src/torchblocks/modules/crf.py @@ -1,6 +1,5 @@ import torch import torch.nn as nn -from typing import List, Optional class CRF(nn.Module): """Conditional random field. diff --git a/src/torchblocks/modules/dropouts.py b/src/torchblocks/modules/dropouts.py new file mode 100644 index 0000000..6fec1b0 --- /dev/null +++ b/src/torchblocks/modules/dropouts.py @@ -0,0 +1,112 @@ +import torch +import torch.nn as nn +from itertools import repeat + +class SpatialDropout(nn.Module): + """ + 对字级别的向量进行丢弃 + """ + def __init__(self, drop_prob): + super(SpatialDropout, self).__init__() + self.drop_prob = drop_prob + @staticmethod + def _make_noise(input): + return input.new().resize_(input.size(0), *repeat(1, input.dim() - 2), input.size(2)) + def forward(self, inputs): + output = inputs.clone() + if not self.training or self.drop_prob == 0: + return inputs + else: + noise = self._make_noise(inputs) + if self.drop_prob == 1: + noise.fill_(0) + else: + noise.bernoulli_(1 - self.drop_prob).div_(1 - self.drop_prob) + noise = noise.expand_as(inputs) + output.mul_(noise) + return output + + +class MultiSampleDropout(nn.Module): + ''' + # multisample dropout (wut): https://arxiv.org/abs/1905.09788 + ''' + + def __init__(self, hidden_size, num_labels, K=5, dropout_rate=0.5): + super().__init__() + self.K = K + self.dropout = nn.Dropout(dropout_rate) + self.classifier = nn.Linear(hidden_size, num_labels) + + def forward(self, input): + logits = torch.stack([self.classifier(self.dropout(input)) for _ in range(self.K)], dim=0) + logits = torch.mean(logits, dim=0) + return logits + + +class TimestepDropout(torch.nn.Dropout): + r""" + 传入参数的shape为 ``(batch_size, num_timesteps, embedding_dim)`` + 使用同一个shape为 ``(batch_size, embedding_dim)`` 的mask在每个timestamp上做dropout。 + """ + + def forward(self, x): + dropout_mask = x.new_ones(x.shape[0], x.shape[-1]) + torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True) + dropout_mask = 
dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim] + if self.inplace: + x *= dropout_mask + return + else: + return x * dropout_mask + +class LockedDropout(torch.nn.Module): + """ + Implementation of locked (or variational) dropout. Randomly drops out entire parameters in embedding space. + """ + + def __init__(self, dropout_rate=0.5, batch_first=True, inplace=False): + super(LockedDropout, self).__init__() + self.dropout_rate = dropout_rate + self.batch_first = batch_first + self.inplace = inplace + + def forward(self, x): + if not self.training or not self.dropout_rate: + return x + + if not self.batch_first: + m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - self.dropout_rate) + else: + m = x.data.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout_rate) + + mask = torch.autograd.Variable(m, requires_grad=False) / (1 - self.dropout_rate) + mask = mask.expand_as(x) + return mask * x + + def extra_repr(self): + inplace_str = ", inplace" if self.inplace else "" + return "p={}{}".format(self.dropout_rate, inplace_str) + +class WordDropout(torch.nn.Module): + """ + Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space. + """ + + def __init__(self, dropout_rate=0.05, inplace=False): + super(WordDropout, self).__init__() + self.dropout_rate = dropout_rate + self.inplace = inplace + + def forward(self, x): + if not self.training or not self.dropout_rate: + return x + + m = x.data.new(x.size(0), x.size(1), 1).bernoulli_(1 - self.dropout_rate) + + mask = torch.autograd.Variable(m, requires_grad=False) + return mask * x + + def extra_repr(self): + inplace_str = ", inplace" if self.inplace else "" + return "p={}{}".format(self.dropout_rate, inplace_str) \ No newline at end of file diff --git a/torchblocks/layers/embeddings.py b/src/torchblocks/modules/embeddings.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/layers/embeddings.py rename to src/torchblocks/modules/embeddings.py diff --git a/src/torchblocks/modules/gate.py b/src/torchblocks/modules/gate.py new file mode 100644 index 0000000..b8f0951 --- /dev/null +++ b/src/torchblocks/modules/gate.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Gate(nn.Module): + """Gate Unit + g = sigmoid(Wx) + x = g * x + """ + def __init__(self, input_size, dropout_rate=0.): + super(Gate, self).__init__() + self.linear = nn.Linear(input_size, input_size, bias=False) + self.dropout_rate = dropout_rate + + def forward(self, x): + """ + Args: + x: batch * len * dim + x_mask: batch * len (1 for padding, 0 for true) + Output: + res: batch * len * dim + """ + if self.dropout_rate: + x = F.dropout(x, p=self.dropout_rate, training=self.training) + x_proj = self.linear(x) + gate = torch.sigmoid(x) + return x_proj * gate + +class GatedFeedForward(nn.Module): + """ Feed Forward Layer with Gated Linear Unit. 
+ https://arxiv.org/abs/2002.05202 + """ + def __init__(self, hidden_size, feedforward_size, has_bias=True): + super(GatedFeedForward, self).__init__() + self.linear_gate = nn.Linear(hidden_size, feedforward_size, bias=has_bias) + self.linear_1 = nn.Linear(hidden_size, feedforward_size, bias=has_bias) + self.linear_2 = nn.Linear(feedforward_size, hidden_size, bias=has_bias) + self.act = nn.ELU() + + def forward(self, x): + gate = self.act(self.linear_gate(x)) + inter_linear = self.linear_1(x) + inter = gate * inter_linear + output = self.linear_2(inter) + return output \ No newline at end of file diff --git a/src/torchblocks/modules/global_pointer.py b/src/torchblocks/modules/global_pointer.py new file mode 100644 index 0000000..419b513 --- /dev/null +++ b/src/torchblocks/modules/global_pointer.py @@ -0,0 +1,77 @@ +import copy +import torch +import torch.nn as nn +from .position import SinusoidalPositionEmbedding + + +class GlobalPointer(nn.Module): + """全局指针模块 + 将序列的每个(start, end)作为整体来进行判断 + """ + + def __init__(self, head_size, inner_dim, hidden_size, rope=True): + super(GlobalPointer, self).__init__() + self.rope = rope + self.inner_dim = inner_dim + self.head_size = head_size # num_label + self.dense = nn.Linear(hidden_size, self.head_size * self.inner_dim * 2) + self.pos_emd = SinusoidalPositionEmbedding(self.inner_dim, 'zero') + + def multilabel_categorical_crossentropy(self, targets, entity_score): + """Multi-label cross entropy loss. + """ + entity_score = (1 - 2 * targets) * entity_score # -1 -> pos classes, 1 -> neg classes + entity_score_neg = entity_score - targets * 1e12 # mask the pred outputs of pos classes + entity_score_pos = ( + entity_score - (1 - targets) * 1e12 + ) # mask the pred outputs of neg classes + zeros = torch.zeros_like(entity_score[..., :1]) + entity_score_neg = torch.cat([entity_score_neg, zeros], dim=-1) + entity_score_pos = torch.cat([entity_score_pos, zeros], dim=-1) + neg_loss = torch.logsumexp(entity_score_neg, dim=-1) + pos_loss = torch.logsumexp(entity_score_pos, dim=-1) + return (neg_loss + pos_loss).mean() + + def compute_loss(self, entity_score, targets) -> torch.Tensor: + """ + targets : (batch_size, num_classes, seq_len, seq_len) + entity_score : (batch_size, num_classes, seq_len, seq_len) + """ + batch_size, num_classes = entity_score.shape[:2] + targets = targets.reshape(batch_size * num_classes, -1) + entity_score = entity_score.reshape(batch_size * num_classes, -1) + loss = self.multilabel_categorical_crossentropy(targets, entity_score) + return loss + + def add_position_embedding(self, input_embed, cos_pos, sin_pos): + tran_embed = torch.stack([-input_embed[..., 1::2], input_embed[..., ::2]], 4) + tran_embed = torch.reshape(tran_embed, input_embed.shape) + output_embed = input_embed * cos_pos + tran_embed * sin_pos + return output_embed + + def forward(self, sequence_output, mask=None): + batch_size = sequence_output.size()[0] + seq_len = sequence_output.size()[1] + outputs = self.dense(sequence_output) + outputs = torch.split(outputs, self.inner_dim * 2, dim=-1) + # 按照-1这个维度去分,每块包含x个小块 + outputs = torch.stack(outputs, dim=-2) + # 沿着一个新维度对输入张量序列进行连接。 序列中所有的张量都应该为相同形状 + qw, kw = outputs[..., :self.inner_dim], outputs[..., self.inner_dim:] + # 分出qw和kw + # RoPE编码 + if self.rope: + pos_emb = self.pos_emd(outputs) + cos_pos = pos_emb[..., None, 1::2].repeat_interleave(2, dim=-1) + sin_pos = pos_emb[..., None, ::2].repeat_interleave(2, dim=-1) + qw = self.add_position_embedding(qw, cos_pos, sin_pos) + kw = self.add_position_embedding(kw, cos_pos, 
sin_pos) + # 计算内积 + logits = torch.einsum('bmhd , bnhd -> bhmn', qw, kw) + # padding mask + pad_mask = mask.unsqueeze(1).unsqueeze(1).expand(batch_size, self.head_size, seq_len, seq_len) + logits = logits * pad_mask - (1 - pad_mask) * 1e12 + # 排除下三角 + mask = torch.tril(torch.ones_like(logits), -1) + logits = logits - mask * 1e12 + return logits / self.inner_dim ** 0.5 diff --git a/torchblocks/layers/layer_norm.py b/src/torchblocks/modules/layer_norm.py old mode 100755 new mode 100644 similarity index 59% rename from torchblocks/layers/layer_norm.py rename to src/torchblocks/modules/layer_norm.py index 1f479b8..9a12577 --- a/torchblocks/layers/layer_norm.py +++ b/src/torchblocks/modules/layer_norm.py @@ -1,18 +1,20 @@ import torch from torch import nn +try: + from apex.normalization import FusedLayerNorm +except ModuleNotFoundError: + from torch.nn import LayerNorm as FusedLayerNorm + + class ConditionalLayerNorm(nn.Module): - def __init__(self, - normalized_shape, - cond_shape, - eps=1e-12): + def __init__(self, normalized_shape, cond_shape, eps=1e-12): super().__init__() self.eps = eps self.weight = nn.Parameter(torch.Tensor(normalized_shape)) self.bias = nn.Parameter(torch.Tensor(normalized_shape)) self.weight_dense = nn.Linear(cond_shape, normalized_shape, bias=False) self.bias_dense = nn.Linear(cond_shape, normalized_shape, bias=False) - self.reset_weight_and_bias() def reset_weight_and_bias(self): @@ -35,4 +37,21 @@ def forward(self, inputs, cond=None): std = torch.sqrt(variance + self.eps) # (b, s, 1) outputs = outputs / std # (b, s, h) outputs = outputs * weight + bias - return outputs \ No newline at end of file + return outputs + + +def replace_with_fused_layernorm(module): + """Replace the normal (PyTorch-vanilla) layer-norms to apex fused layer-norms. + Args: + module: The target module to be replaced. + """ + for submodule in module.modules(): + for name, layer in submodule.named_children(): + if not isinstance(layer, nn.LayerNorm): + continue + # Create new fused layer-norm and copy the original parameters. + new_layer = FusedLayerNorm(layer.normalized_shape, layer.eps) + new_layer.weight = layer.weight + new_layer.bias = layer.bias + # Replace the layer-norm to the new one. 
+ setattr(submodule, name, new_layer) diff --git a/torchblocks/layers/linears.py b/src/torchblocks/modules/linears.py old mode 100755 new mode 100644 similarity index 96% rename from torchblocks/layers/linears.py rename to src/torchblocks/modules/linears.py index 7abdebc..ab620cd --- a/torchblocks/layers/linears.py +++ b/src/torchblocks/modules/linears.py @@ -1,7 +1,9 @@ +import math import torch import torch.nn as nn import torch.nn.functional as F + class FeedForwardNetwork(nn.Module): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0): super(FeedForwardNetwork, self).__init__() @@ -14,6 +16,7 @@ def forward(self, x): x_proj = self.linear2(x_proj) return x_proj + class PoolerStartLogits(nn.Module): def __init__(self, hidden_size, num_classes): super(PoolerStartLogits, self).__init__() @@ -28,6 +31,7 @@ def forward(self, hidden_states, p_mask=None): x = x * (1 - p_mask) - 1e30 * p_mask return x + class PoolerEndLogits(nn.Module): def __init__(self, hidden_size, num_classes): super(PoolerEndLogits, self).__init__() diff --git a/torchblocks/layers/mixout.py b/src/torchblocks/modules/mixout.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/layers/mixout.py rename to src/torchblocks/modules/mixout.py diff --git a/torchblocks/layers/mlp.py b/src/torchblocks/modules/mlp.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/layers/mlp.py rename to src/torchblocks/modules/mlp.py diff --git a/torchblocks/layers/pooling.py b/src/torchblocks/modules/pooling.py old mode 100755 new mode 100644 similarity index 72% rename from torchblocks/layers/pooling.py rename to src/torchblocks/modules/pooling.py index 74a8a7a..a62b2d5 --- a/torchblocks/layers/pooling.py +++ b/src/torchblocks/modules/pooling.py @@ -1,13 +1,14 @@ import torch import torch.nn as nn + class MaxPool(nn.Module): r""" Max-pooling模块。 """ + def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False): r""" - :param stride: 窗口移动大小,默认为kernel_size :param padding: padding的内容,默认为0 :param dilation: 控制窗口内元素移动距离的大小 @@ -131,3 +132,38 @@ def forward(self, tensor, mask, dim=1): masks = mask.view(mask.size(0), mask.size(1), -1).float() return torch.sum(tensor * masks.float(), dim=dim) / torch.sum(masks.float(), dim=1) + +class Pooling(nn.Module): + def __init__(self, hidden_size, pooling_mode='cls', last_layers=None): + super(Pooling, self).__init__() + assert pooling_mode in ['mean', 'max', 'cls', 'mean_sqrt'] + self.hidden_size = hidden_size + self.last_layers = last_layers + self.pooling_mode = pooling_mode + self.pooling_output_dimension = hidden_size if last_layers is None else hidden_size * last_layers + + def forward(self, features, attention_mask): + sequence_outputs = features['last_hidden_state'] + cls_outputs = features['pooler_output'] + hidden_outputs = features['hidden_states'] + if self.last_layers is not None: + sequence_outputs = torch.cat([hidden_outputs[-i] for i in range(1, self.last_layers + 1)], dim=-1) + if self.pooling_mode == 'cls': + vectors = cls_outputs + if self.pooling_mode == 'max': + input_mask_expanded = attention_mask.unsqueeze(-1).expand(sequence_outputs.size()).float() + sequence_outputs[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + vectors = torch.max(sequence_outputs, 1)[0] + if self.pooling_mode in ['mean', 'mean_sqrt']: + input_mask_expanded = attention_mask.unsqueeze(-1).expand(sequence_outputs.size()).float() + sum_embeddings = torch.sum(sequence_outputs * 
input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = torch.clamp(sum_mask, min=1e-9) + if self.pooling_mode == 'mean': + vectors = sum_embeddings / sum_mask + if self.pooling_mode == 'mean_sqrt': + vectors = sum_embeddings / torch.sqrt(sum_mask) + return vectors + + def get_pooling_output_dimension(self): + return self.pooling_output_dimension diff --git a/torchblocks/layers/position.py b/src/torchblocks/modules/position.py old mode 100755 new mode 100644 similarity index 50% rename from torchblocks/layers/position.py rename to src/torchblocks/modules/position.py index 55e135f..7dfd323 --- a/torchblocks/layers/position.py +++ b/src/torchblocks/modules/position.py @@ -32,3 +32,37 @@ def forward(self, batch_size, seq_length): output: [batch size,sequence length, embed dim] """ return self.pe[:seq_length, :].repeat((batch_size, *([1] * len(self.pe.shape)))) + + +class SinusoidalPositionEmbedding(nn.Module): + """Sin-Cos Embedding. + ref: https://spaces.ac.cn/archives/8265 + """ + + def __init__(self, output_dim: int, merge_mode: str = 'add', custom_position_ids: bool = False): + super(SinusoidalPositionEmbedding, self).__init__() + self.output_dim = output_dim + self.merge_mode = merge_mode + self.custom_position_ids = custom_position_ids + + def forward(self, inputs): # noqa + if self.custom_position_ids: + seq_len = inputs.shape[1] + inputs, position_ids = inputs + position_ids = position_ids.type(torch.float) + else: + input_shape = inputs.shape + seq_len = input_shape[1] + position_ids = torch.arange(seq_len).type(torch.float)[None] + indices = torch.arange(self.output_dim // 2).type(torch.float) + indices = torch.pow(10000.0, -2 * indices / self.output_dim) + embeddings = torch.einsum('bn,d->bnd', position_ids, indices) + embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1) + embeddings = torch.reshape(embeddings, (-1, seq_len, self.output_dim)) + embeddings = embeddings.to(inputs.device) + if self.merge_mode == 'add': + return inputs + embeddings + elif self.merge_mode == 'mul': + return inputs * (embeddings + 1.0) + elif self.merge_mode == 'zero': + return embeddings diff --git a/src/torchblocks/modules/rnn.py b/src/torchblocks/modules/rnn.py new file mode 100644 index 0000000..96c52dd --- /dev/null +++ b/src/torchblocks/modules/rnn.py @@ -0,0 +1,66 @@ +import torch +import copy +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualLSTM(nn.Module): + def __init__(self, d_model, rnn='GRU', rnn_dropout_rate=0.2, dropout_rate=0.2): + super(ResidualLSTM, self).__init__() + self.downsample = nn.Linear(d_model, d_model // 2) + if rnn == 'GRU': + self.LSTM = nn.GRU(d_model // 2, d_model // 2, num_layers=2, bidirectional=False, dropout=rnn_dropout_rate) + else: + self.LSTM = nn.LSTM(d_model // 2, d_model // 2, num_layers=2, bidirectional=False, dropout=rnn_dropout_rate) + self.dropout1 = nn.Dropout(dropout_rate) + self.norm1 = nn.LayerNorm(d_model // 2) + self.linear1 = nn.Linear(d_model // 2, d_model * 4) + self.linear2 = nn.Linear(d_model * 4, d_model) + self.dropout2 = nn.Dropout(dropout_rate) + self.norm2 = nn.LayerNorm(d_model) + + def forward(self, x): + res = copy.deepcopy(x) + x = self.downsample(x) + x, _ = self.LSTM(x) + x = self.dropout1(x) + x = self.norm1(x) + x = F.relu(self.linear1(x)) + x = self.linear2(x) + x = self.dropout2(x) + x = res + x + return self.norm2(x) + +class LSTMCharEncoder(nn.Module): + r""" + char级别基于LSTM的encoder. 
+ """ + + def __init__(self, char_emb_size=50, hidden_size=None): + r""" + :param int char_emb_size: char级别embedding的维度. Default: 50 + 例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. + :param int hidden_size: LSTM隐层的大小, 默认为char的embedding维度 + :param initial_method: 初始化参数的方式, 默认为`xavier normal` + """ + super(LSTMCharEncoder, self).__init__() + self.hidden_size = char_emb_size if hidden_size is None else hidden_size + self.lstm = nn.LSTM(input_size=char_emb_size, + hidden_size=self.hidden_size, + num_layers=1, + bias=True, + batch_first=True) + + def forward(self, x): + r""" + :param torch.Tensor x: ``[ n_batch*n_word, word_length, char_emb_size]`` 输入字符的embedding + :return: torch.Tensor : [ n_batch*n_word, char_emb_size]经过LSTM编码的结果 + """ + batch_size = x.shape[0] + h0 = torch.empty(1, batch_size, self.hidden_size) + h0 = nn.init.orthogonal_(h0) + c0 = torch.empty(1, batch_size, self.hidden_size) + c0 = nn.init.orthogonal_(c0) + + _, hidden = self.lstm(x, (h0, c0)) + return hidden[0].squeeze().unsqueeze(2) \ No newline at end of file diff --git a/torchblocks/layers/utils.py b/src/torchblocks/modules/utils.py old mode 100755 new mode 100644 similarity index 79% rename from torchblocks/layers/utils.py rename to src/torchblocks/modules/utils.py index 779cd77..c10781a --- a/torchblocks/layers/utils.py +++ b/src/torchblocks/modules/utils.py @@ -1,5 +1,7 @@ import copy from torch.nn import ModuleList + def get_clones(module, N): + "Produce N identical modules." return ModuleList([copy.deepcopy(module) for _ in range(N)]) diff --git a/torchblocks/optims/__init__.py b/src/torchblocks/optims/__init__.py old mode 100755 new mode 100644 similarity index 91% rename from torchblocks/optims/__init__.py rename to src/torchblocks/optims/__init__.py index 58db018..9d696c3 --- a/torchblocks/optims/__init__.py +++ b/src/torchblocks/optims/__init__.py @@ -16,6 +16,7 @@ from .planradam import * from .sgdw import * from .sgdp import * +from .adan import * from .lr_scheduler import * diff --git a/torchblocks/optims/adabelief.py b/src/torchblocks/optims/adabelief.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/adabelief.py rename to src/torchblocks/optims/adabelief.py diff --git a/torchblocks/optims/adabound.py b/src/torchblocks/optims/adabound.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/adabound.py rename to src/torchblocks/optims/adabound.py diff --git a/torchblocks/optims/adafactor.py b/src/torchblocks/optims/adafactor.py old mode 100755 new mode 100644 similarity index 97% rename from torchblocks/optims/adafactor.py rename to src/torchblocks/optims/adafactor.py index d65e8f3..31adc9a --- a/torchblocks/optims/adafactor.py +++ b/src/torchblocks/optims/adafactor.py @@ -1,7 +1,7 @@ import math import torch from torch.optim.optimizer import Optimizer -from torchblocks.utils.versions import require_version +from src.torchblocks import require_version class Adafactor(Optimizer): """ diff --git a/src/torchblocks/optims/adai.py b/src/torchblocks/optims/adai.py new file mode 100644 index 0000000..da5b8dd --- /dev/null +++ b/src/torchblocks/optims/adai.py @@ -0,0 +1,115 @@ +import torch +from torch.optim.optimizer import Optimizer, required + +class Adai(Optimizer): + r"""Implements Adaptive Inertia Estimation (Adai) algorithm. + It is proposed in the ICML2022 paper + `Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum`. 
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float): learning rate + betas (Tuple[float, float], optional): beta0 and beta2 (default: (0.1, 0.99)) + eps (float, optional): the inertia bound (default: 1e-03) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + + """ + + def __init__(self, params, lr=required, betas=(0.1, 0.99), eps=1e-03, + weight_decay=0): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0]: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + super(Adai, self).__init__(params, defaults) + + + def __setstate__(self, state): + super(Adai, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + param_size = 0 + exp_avg_sq_hat_sum = 0. + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + param_size += p.numel() + grad = p.grad.data + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + # Cumulative products of beta1 + state['beta1_prod'] = torch.ones_like(p.data, memory_format=torch.preserve_format) + + state['step'] += 1 + + exp_avg_sq = state['exp_avg_sq'] + beta0, beta2 = group['betas'] + + bias_correction2 = 1 - beta2 ** state['step'] + + if group['weight_decay'] != 0: + grad.add_(p.data, alpha=group['weight_decay']) + + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + exp_avg_sq_hat_sum += exp_avg_sq.sum() / bias_correction2 + + # Calculate the mean of all elements in exp_avg_sq_hat + exp_avg_sq_hat_mean = exp_avg_sq_hat_sum / param_size + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + + state = self.state[p] + + exp_avg = state['exp_avg'] + exp_avg_sq = state['exp_avg_sq'] + beta1_prod = state['beta1_prod'] + beta0, beta2 = group['betas'] + + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg_sq_hat = exp_avg_sq / bias_correction2 + beta1 = (1. 
- (exp_avg_sq_hat / exp_avg_sq_hat_mean).mul(beta0)).clamp(0., 1 - group['eps']) + + beta1_prod.mul_(beta1) + bias_correction1 = 1 - beta1_prod + + exp_avg.mul_(beta1).addcmul_(1 - beta1, grad) + exp_avg_hat = exp_avg / bias_correction1 + + p.data.add_(exp_avg_hat, alpha=-group['lr']) + + return loss diff --git a/src/torchblocks/optims/adaiw.py b/src/torchblocks/optims/adaiw.py new file mode 100644 index 0000000..2dbaf07 --- /dev/null +++ b/src/torchblocks/optims/adaiw.py @@ -0,0 +1,117 @@ + +import torch +from torch.optim.optimizer import Optimizer, required + + +class AdaiW(Optimizer): + r"""Implements Adai with decoupled weight decay (AdaiW). + It is proposed in the ICML2022 paper + `Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum` + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): beta0 and beta2 (default: (0.1, 0.99)) + eps (float, optional): the inertia bound (default: 1e-03) + weight_decay (float, optional): weight decay (default: 0) + + """ + + def __init__(self, params, lr=required, betas=(0.1, 0.99), eps=1e-03, + weight_decay=0): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0]: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + super(AdaiW, self).__init__(params, defaults) + + + def __setstate__(self, state): + super(AdaiW, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + param_size = 0 + exp_avg_sq_hat_sum = 0. 
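+        # First pass: accumulate the bias-corrected second-moment statistics of every
+        # parameter so that their global mean (exp_avg_sq_hat_mean) can be computed below;
+        # the second pass then turns that mean into a per-element inertia coefficient beta1.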
+ for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + param_size += p.numel() + grad = p.grad.data + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + # Cumulative products of beta1 + state['beta1_prod'] = torch.ones_like(p.data, memory_format=torch.preserve_format) + + exp_avg_sq = state['exp_avg_sq'] + beta0, beta2 = group['betas'] + + state['step'] += 1 + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + exp_avg_sq_hat = exp_avg_sq / bias_correction2 + + exp_avg_sq_hat_sum += exp_avg_sq_hat.sum() + + # Calculate the mean of all elements in exp_avg_sq_hat + exp_avg_sq_hat_mean = exp_avg_sq_hat_sum / param_size + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + + # Perform stable/decoupled weight decay + if group['weight_decay'] !=0: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + + state = self.state[p] + + exp_avg = state['exp_avg'] + exp_avg_sq = state['exp_avg_sq'] + beta0, beta2 = group['betas'] + beta1_prod = state['beta1_prod'] + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg_sq_hat = exp_avg_sq / bias_correction2 + + beta1 = (1. - (exp_avg_sq_hat / exp_avg_sq_hat_mean).mul(beta0)).clamp(0., 1 - group['eps']) + + beta1_prod.mul_(beta1) + bias_correction1 = 1 - beta1_prod + + exp_avg.mul_(beta1).addcmul_(1 - beta1, grad) + exp_avg_hat = exp_avg.div(bias_correction1) + + p.data.add_(exp_avg_hat, alpha=-group['lr']) + + return loss diff --git a/torchblocks/optims/adamod.py b/src/torchblocks/optims/adamod.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/adamod.py rename to src/torchblocks/optims/adamod.py diff --git a/torchblocks/optims/adamp.py b/src/torchblocks/optims/adamp.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/adamp.py rename to src/torchblocks/optims/adamp.py diff --git a/torchblocks/optims/adamw.py b/src/torchblocks/optims/adamw.py old mode 100755 new mode 100644 similarity index 95% rename from torchblocks/optims/adamw.py rename to src/torchblocks/optims/adamw.py index 3fe6ae3..7eb9fdf --- a/torchblocks/optims/adamw.py +++ b/src/torchblocks/optims/adamw.py @@ -3,7 +3,7 @@ from torch import nn from torch.optim.optimizer import Optimizer from typing import Callable, Iterable, Optional, Tuple, Union -from torchblocks.utils.versions import require_version +# from torchblocks.utils.versions import require_version class AdamW(Optimizer): """ @@ -34,7 +34,7 @@ def __init__( weight_decay: float = 0.0, correct_bias: bool = True, ): - require_version("torch>=1.5.0") # add_ with alpha + # require_version("torch>=1.5.0") # add_ with alpha if lr < 0.0: raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") if not 0.0 <= betas[0] < 1.0: diff --git a/src/torchblocks/optims/adan.py b/src/torchblocks/optims/adan.py new file mode 100644 index 0000000..247dc67 --- /dev/null +++ b/src/torchblocks/optims/adan.py @@ -0,0 +1,154 @@ +# Copyright 2022 Garena Online Private Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import torch +from torch.optim.optimizer import Optimizer + + +class Adan(Optimizer): + """ + Implements a pytorch variant of Adan + + Adan was proposed in + Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022. + https://arxiv.org/abs/2208.06677 + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float, flot], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.98, 0.92, 0.99)) + (0.98, 0.99, 0.99) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip + global grad norm (default: 0.0 no clip) + no_prox (bool): how to perform the decoupled weight decay (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8, + weight_decay=0.0, max_grad_norm=0.0, no_prox=False): + if not 0.0 <= max_grad_norm: + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= betas[2] < 1.0: + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, no_prox=no_prox) + super(Adan, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adan, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('no_prox', False) + + @torch.no_grad() + def restart_opt(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + if p.requires_grad: + state = self.state[p] + # State initialization + + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + # Exponential moving average of gradient difference + state['exp_avg_diff'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self): + """ + Performs a single optimization step. 
+ """ + if self.defaults['max_grad_norm'] > 0: + device = self.param_groups[0]['params'][0].device + global_grad_norm = torch.zeros(1, device=device) + + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + for group in self.param_groups: + + for p in group['params']: + if p.grad is not None: + grad = p.grad + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + + clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + group['eps']), max=1.0) + else: + clip_global_grad_norm = 1.0 + + for group in self.param_groups: + beta1, beta2, beta3 = group['betas'] + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + bias_correction1 = 1.0 - beta1 ** group['step'] + + bias_correction2 = 1.0 - beta2 ** group['step'] + + bias_correction3 = 1.0 - beta3 ** group['step'] + + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + state['exp_avg_diff'] = torch.zeros_like(p) + + grad = p.grad.mul_(clip_global_grad_norm) + if 'pre_grad' not in state or group['step'] == 1: + state['pre_grad'] = grad + + copy_grad = grad.clone() + + exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff'] + diff = grad - state['pre_grad'] + + update = grad + beta2 * diff + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # m_t + exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2) # diff_t + exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3) # n_t + + denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps']) + update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom) + + if group['no_prox']: + p.data.mul_(1 - group['lr'] * group['weight_decay']) + p.add_(update, alpha=-group['lr']) + else: + p.add_(update, alpha=-group['lr']) + p.data.div_(1 + group['lr'] * group['weight_decay']) + + state['pre_grad'] = copy_grad diff --git a/torchblocks/optims/adax.py b/src/torchblocks/optims/adax.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/adax.py rename to src/torchblocks/optims/adax.py diff --git a/torchblocks/optims/lamb.py b/src/torchblocks/optims/lamb.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/lamb.py rename to src/torchblocks/optims/lamb.py diff --git a/torchblocks/optims/lars.py b/src/torchblocks/optims/lars.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/lars.py rename to src/torchblocks/optims/lars.py diff --git a/torchblocks/optims/lookahead.py b/src/torchblocks/optims/lookahead.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/lookahead.py rename to src/torchblocks/optims/lookahead.py diff --git a/torchblocks/optims/lr_scheduler.py b/src/torchblocks/optims/lr_scheduler.py old mode 100755 new mode 100644 similarity index 91% rename from torchblocks/optims/lr_scheduler.py rename to src/torchblocks/optims/lr_scheduler.py index 5db54c2..d75dd60 --- a/torchblocks/optims/lr_scheduler.py +++ b/src/torchblocks/optims/lr_scheduler.py @@ -3,11 +3,12 @@ import logging from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR +from torch.optim.lr_scheduler import 
CosineAnnealingLR logger = logging.getLogger(__name__) -def get_constant_schedule(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): +def get_constant_schedule(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, **kwargs): """ Create a schedule with a constant learning rate, using the learning rate set in optimizer. @@ -23,7 +24,7 @@ def get_constant_schedule(optimizer, num_warmup_steps, num_training_steps, last_ return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) -def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): +def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, **kwargs): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and the initial lr set in the optimizer. @@ -48,7 +49,7 @@ def lr_lambda(current_step: int): return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) -def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): +def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1, **kwargs): """ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. @@ -79,7 +80,7 @@ def lr_lambda(current_step: int): def get_cosine_schedule_with_warmup( optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, - last_epoch: int = -1 + last_epoch: int = -1, **kwargs ): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the @@ -113,7 +114,8 @@ def lr_lambda(current_step): def get_cosine_with_hard_restarts_schedule_with_warmup( - optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1, + **kwargs ): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the @@ -148,7 +150,7 @@ def lr_lambda(current_step): def get_polynomial_decay_schedule_with_warmup( - optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1 + optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1, **kwargs ): """ Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the @@ -197,10 +199,15 @@ def lr_lambda(current_step: int): return LambdaLR(optimizer, lr_lambda, last_epoch) +def get_cosine_with_one_cycle(optimizer, num_training_steps, num_warmup_steps, min_lr=1e-7, **kwargs): + return CosineAnnealingLR(optimizer, T_max=num_training_steps, eta_min=min_lr) + + TYPE_TO_SCHEDULER = { 'constant': get_constant_schedule, 'linear': get_linear_schedule_with_warmup, 'cosine': get_cosine_schedule_with_warmup, + 'one_cycle_cosine': get_cosine_with_one_cycle, 'polynomial': get_polynomial_decay_schedule_with_warmup, 'constant_with_warmup': get_constant_schedule_with_warmup, 'cosine_with_restarts': get_cosine_with_hard_restarts_schedule_with_warmup diff --git a/torchblocks/optims/nadam.py b/src/torchblocks/optims/nadam.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/nadam.py rename to src/torchblocks/optims/nadam.py 
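The optimizer and scheduler additions above compose as follows; a minimal sketch, not taken from the patch, of driving the new Adan optimizer with the newly registered 'one_cycle_cosine' scheduler entry, assuming the package is importable as `torchblocks` after installation:

import torch
from torchblocks.optims.adan import Adan
from torchblocks.optims.lr_scheduler import TYPE_TO_SCHEDULER

model = torch.nn.Linear(768, 2)
# betas and the other defaults are taken from the Adan implementation above;
# a positive max_grad_norm additionally clips the global gradient norm inside step().
optimizer = Adan(model.parameters(), lr=1e-3, betas=(0.98, 0.92, 0.99),
                 weight_decay=0.01, max_grad_norm=1.0)
# The scheduler factories all accept **kwargs, so they can be looked up and called uniformly by name.
scheduler = TYPE_TO_SCHEDULER['one_cycle_cosine'](optimizer,
                                                  num_training_steps=1000,
                                                  num_warmup_steps=0)
for _ in range(1000):
    loss = model(torch.randn(8, 768)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()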
diff --git a/torchblocks/optims/novograd.py b/src/torchblocks/optims/novograd.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/novograd.py rename to src/torchblocks/optims/novograd.py diff --git a/torchblocks/optims/planradam.py b/src/torchblocks/optims/planradam.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/planradam.py rename to src/torchblocks/optims/planradam.py diff --git a/torchblocks/optims/priorwd.py b/src/torchblocks/optims/priorwd.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/priorwd.py rename to src/torchblocks/optims/priorwd.py diff --git a/torchblocks/optims/radam.py b/src/torchblocks/optims/radam.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/radam.py rename to src/torchblocks/optims/radam.py diff --git a/torchblocks/optims/ralamb.py b/src/torchblocks/optims/ralamb.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/ralamb.py rename to src/torchblocks/optims/ralamb.py diff --git a/torchblocks/optims/ralars.py b/src/torchblocks/optims/ralars.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/ralars.py rename to src/torchblocks/optims/ralars.py diff --git a/torchblocks/optims/ranger_adabelief.py b/src/torchblocks/optims/ranger_adabelief.py old mode 100755 new mode 100644 similarity index 97% rename from torchblocks/optims/ranger_adabelief.py rename to src/torchblocks/optims/ranger_adabelief.py index 6cf0c95..006fbd3 --- a/torchblocks/optims/ranger_adabelief.py +++ b/src/torchblocks/optims/ranger_adabelief.py @@ -11,7 +11,7 @@ # Lookahead paper --> MZhang,G Hinton https://arxiv.org/abs/1907.08610 # summary of changes: -# 9/4/20 - updated addcmul_ signature to avoid warning. Integrates latest changes from GC developer (he did the work for this), and verified on performance on private datasets. +# 9/4/20 - updated addcmul_ signature to avoid warning. Integrates latest changes from GC developer (he did the work for this), and verified on performance on private dataset. # 4/11/20 - add gradient centralization option. Set new testing benchmark for accuracy with it, toggle with use_gc flag at init. # full code integration with all updates at param level instead of group, moves slow weights into state dict (from generic weights), # supports group learning rates (thanks @SHolderbach), fixes sporadic load from saved model issues. @@ -40,7 +40,7 @@ class RangerAdaBelief(Optimizer): def __init__(self, params, lr=1e-3, # lr alpha=0.5, k=6, N_sma_threshhold=5, # Ranger options betas=(.95, 0.999), eps=1e-5, weight_decay=0, # Adam options - # Gradient centralization on or off, applied to conv layers only or conv + fc layers + # Gradient centralization on or off, applied to conv modules only or conv + fc modules use_gc=True, gc_conv_only=False, gc_loc=True, adabelief=True, weight_decouple=True, ): @@ -57,7 +57,7 @@ def __init__(self, params, lr=1e-3, # lr # parameter comments: # beta1 (momentum) of .95 seems to work better than .90... # N_sma_threshold of 5 seems better in testing than 4. - # In both cases, worth testing on your datasets (.90 vs .95, 4 vs 5) to make sure which works best for you. + # In both cases, worth testing on your dataset (.90 vs .95, 4 vs 5) to make sure which works best for you. 
# prep defaults and init torch.optim base defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas, @@ -148,7 +148,7 @@ def step(self, closure=None): exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] - # GC operation for Conv layers and FC layers + # GC operation for Conv modules and FC modules # if grad.dim() > self.gc_gradient_threshold: # grad.add_(-grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)) if self.gc_loc: diff --git a/torchblocks/optims/sgdp.py b/src/torchblocks/optims/sgdp.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/sgdp.py rename to src/torchblocks/optims/sgdp.py diff --git a/torchblocks/optims/sgdw.py b/src/torchblocks/optims/sgdw.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/sgdw.py rename to src/torchblocks/optims/sgdw.py diff --git a/torchblocks/optims/shampoo.py b/src/torchblocks/optims/shampoo.py old mode 100755 new mode 100644 similarity index 100% rename from torchblocks/optims/shampoo.py rename to src/torchblocks/optims/shampoo.py diff --git a/src/torchblocks/tasks/__init__.py b/src/torchblocks/tasks/__init__.py new file mode 100644 index 0000000..1190ef6 --- /dev/null +++ b/src/torchblocks/tasks/__init__.py @@ -0,0 +1 @@ +from .sequence_tags import * \ No newline at end of file diff --git a/src/torchblocks/tasks/sequence_tags.py b/src/torchblocks/tasks/sequence_tags.py new file mode 100644 index 0000000..d96d363 --- /dev/null +++ b/src/torchblocks/tasks/sequence_tags.py @@ -0,0 +1,296 @@ +import torch +import numpy as np + + +def get_spans_from_bios_tags(tags, id2label=None): + """Gets entities from sequence. + note: BIOS + Args: + tags (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + >>> tags = ['B-PER', 'I-PER', 'O', 'S-LOC'] + >>> get_spans_from_bios_tags(tags) + # output: [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(tags): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("S-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[2] = indx + chunk[0] = tag.split('-')[1] + chunks.append(chunk) + chunk = (-1, -1, -1) + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif tag.startswith('I-') and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(tags) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_spans_from_biob_tags(seq, id2label=None): + """Gets entities from sequence. + note: BIO + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). 
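# --- Illustrative usage sketch (outside the patch) ---
# get_spans_from_bios_tags, defined above, also accepts integer tag ids when an
# id2label mapping is supplied; the label set below is a made-up example and the
# import path is an assumption.
from torchblocks.tasks import get_spans_from_bios_tags

id2label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'S-LOC'}
tag_ids = [1, 2, 0, 3]
print(get_spans_from_bios_tags(tag_ids, id2label=id2label))
# -> [['PER', 0, 1], ['LOC', 3, 3]]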
+ Example: + >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC'] + >>> get_spans_from_biob_tags(seq) + #output + [['PER', 0, 1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + elif tag.startswith('I-') and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_spans_from_bio_tags(tags, id2label=None): + """Gets entities from sequence. + Args: + tags (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + >>> tags = ['B-PER', 'I-PER', 'O', 'B-LOC'] + >>> get_spans_from_bio_tags(tags) + # output [['PER', 0,1] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(tags): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif tag.startswith('I-') and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(tags) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def generate_bio_tags_from_spans(entities, offset_mapping): + """Generate NER-tags (with BIO naming) for subword tokens from the entities. + Args: + entities: The list of entities which consist of an entity name with its offset + mappings. + offset_mapping: The list of offsets which are positions of the tokens. + Returns: + A list of NER-tags encoded from the given entity informations. + """ + ner_tags = ["O" for _ in offset_mapping] # [):左闭右开 + for entity_tag, entity_start, entity_end, *_ in sorted(entities, key=lambda x: x[1]): + current_ner_tag = f"B-{entity_tag}" + for i, (token_start, token_end) in enumerate(offset_mapping): + if min(entity_end, token_end) - max(entity_start, token_start) > 0: + ner_tags[i] = current_ner_tag + current_ner_tag = f"I-{entity_tag}" + return ner_tags + + +def build_ner_bio_conditional_masks(id2label): + """Build a NER-conditional mask matrix which implies the relations between + before-tag and after-tag. + + According to the rule of BIO-naming system, it is impossible that `I-Dog` cannot be + appeard after `B-Dog` or `I-Dog` tags. This function creates the calculable + relation-based conditional matrix to prevent from generating wrong tags. + Args: + id2label: A dictionary which maps class indices to their label names. + Returns: + A conditional mask tensor. + """ + conditional_masks = torch.zeros(len(id2label), len(id2label)) + for i, before in id2label.items(): + for j, after in id2label.items(): + if after == "O" or after.startswith("B-") or after == f"I-{before[2:]}": + conditional_masks[i, j] = 1.0 + return conditional_masks + + +def ner_beam_search_decode(log_probs, id2label, beam_size=2): + """Decode NER-tags from the predicted log-probabilities using beam-search. + + This function decodes the predictions using beam-search algorithm. 
Because all tags + are predicted simultaneously while the tags have dependencies of their previous + tags, the greedy algorithm cannot decode the tags properly. With beam-search, it is + possible to prevent the below situation: + + >>> sorted = probs[t].sort(dim=-1) + >>> print("\t".join([f"{id2label[i]} {p}" for p, i in zip()])) + I-Dog 0.54 B-Cat 0.44 ... + >>> sorted = probs[t + 1].sort(dim=-1) + >>> print("\t".join([f"{id2label[i]} {p}" for p, i in zip()])) + I-Cat 0.99 I-Dog 0.01 ... + + The above shows that if the locally-highest tags are selected, then `I-Dog, I-Dog` + will be generated even the confidence of the second tag `I-Dog` is significantly + lower than `I-Cat`. It is more natural that `B-Cat, I-Cat` is generated rather than + `I-Dog, I-Dog`. The beam-search for NER-tagging task can solve this problem. + Args: + log_probs: The log-probabilities of the token predictions. + id2label: A dictionary which maps class indices to their label names. + beam_size: The number of candidates for each search step. Default is `2`. + + Returns: + A tuple of beam-searched indices and their probability tensors. + """ + # Create the log-probability mask for the invalid predictions. + log_prob_masks = -10000.0 * (1 - build_ner_bio_conditional_masks(id2label)) + log_prob_masks = log_prob_masks.to(log_probs.device) + beam_search_shape = (log_probs.size(0), beam_size, log_probs.size(1)) + searched_tokens = log_probs.new_zeros(beam_search_shape, dtype=torch.long) + searched_log_probs = log_probs.new_zeros(beam_search_shape) + searched_scores = log_probs.new_zeros(log_probs.size(0), beam_size) + searched_scores[:, 1:] = -10000.0 + + for i in range(log_probs.size(1)): + # Calculate the accumulated score (log-probabilities) with excluding invalid + # next-tag predictions. + scores = searched_scores.unsqueeze(2) + scores = scores + log_probs[:, i, :].unsqueeze(1) + scores = scores + (log_prob_masks[searched_tokens[:, :, i - 1]] if i > 0 else 0) + # Select the top-k (beam-search size) predictions. + best_scores, best_indices = scores.flatten(1).topk(beam_size) + best_tokens = best_indices % scores.size(2) + best_log_probs = log_probs[:, i, :].gather(dim=1, index=best_tokens) + # best_buckets = best_indices.div(scores.size(2), rounding_mode="floor") # pytorch>=1.10.0+ + best_buckets = best_indices.floor_divide(scores.size(2)) # pytorch<1.10.0+ + best_buckets = best_buckets.unsqueeze(2).expand(-1, -1, log_probs.size(1)) + + # Gather the best buckets and their log-probabilities. + searched_tokens = searched_tokens.gather(dim=1, index=best_buckets) + searched_log_probs = searched_log_probs.gather(dim=1, index=best_buckets) + + # Update the predictions by inserting to the corresponding timestep. + searched_scores = best_scores + searched_tokens[:, :, i] = best_tokens + searched_log_probs[:, :, i] = best_log_probs + + # Return the best beam-searched sequence and its probabilities. + return searched_tokens[:, 0, :], searched_log_probs[:, 0, :].exp() + + +def get_spans_from_subword_bio_tags(ner_tags, offset_mapping, probs=None): + """Extract the entities from NER-tagged subword tokens. + This function detects the entities from BIO NER-tags and collects them with + averaging their confidences (prediction probabilities). Using the averaged + probabilities, you can filter the low-confidence entities. + Args: + ner_tags: The list of subword-token-level NER-tags. + offset_mapping: The list of offsets which are positions of the tokens. + probs: An optional prediction probabilities of the subword tokens. 
Default is + `None`. + Returns: + A tuple of collected NER entities with their averaged entity confidencs + (prediction probabilities). + """ + probs = probs if probs is not None else np.zeros(offset_mapping.shape[0]) + entities, gathered_probs, entity, i = [], [], None, None + for j, ner_tag in enumerate(ner_tags): + if entity is not None and ner_tag != f"I-{entity}": + entities.append((entity, offset_mapping[i][0], offset_mapping[j - 1][1])) + gathered_probs.append(probs[i:j].mean()) + entity = None + if ner_tag.startswith("B-"): + entity, i = ner_tag[2:], j + # Because BIO-naming does not ensure the end of the entities (i.e. E-tag), we cannot + # automatically detect the end of the last entity in the above loop. + if entity is not None: + entities.append((entity, offset_mapping[i][0], offset_mapping[-1][1])) + gathered_probs.append(probs[i:].mean()) + return entities, gathered_probs + + +TYPE_TO_SCHEME = { + "BIO": get_spans_from_bio_tags, + "BIOS": get_spans_from_bios_tags, + 'BIOB': get_spans_from_biob_tags, +} + + +def get_scheme(scheme_type): + if scheme_type not in TYPE_TO_SCHEME: + msg = ("There were expected keys in the `TYPE_TO_SCHEME`: " + f"{', '.join(list(TYPE_TO_SCHEME.keys()))}, " + f"but get {scheme_type}." + ) + raise TypeError(msg) + scheme_function = TYPE_TO_SCHEME[scheme_type] + return scheme_function + + +if __name__ == "__main__": + sentence = {'id': '0', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + } + entities = sentence['entities'] + # BertTokenizerFast:output,return_offsets_mapping=True + # 需要注意:[CLS][SEP]特殊符号 + offset_mapping = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), + (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 20), + (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), + (29, 30), (30, 31), (31, 32), (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), (37, 38), + (38, 39), (39, 40), (40, 41), (41, 42), (42, 43), (43, 44), (44, 45)] + print(generate_bio_tags_from_spans(entities, offset_mapping)) + ''' + outputs:['B-AGE', 'I-AGE', 'I-AGE', 'I-AGE', 'I-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EDU', 'I-EDU', 'O', 'B-TAG', 'I-TAG', 'I-TAG', 'I-TAG', 'I-TAG', 'I-TAG'] + ''' diff --git a/src/torchblocks/utils/__init__.py b/src/torchblocks/utils/__init__.py new file mode 100644 index 0000000..d7c66fc --- /dev/null +++ b/src/torchblocks/utils/__init__.py @@ -0,0 +1,13 @@ +from .device import * +from .tensor_utils import * +from .io_utils import * +from .common_utils import * +from .logger import * +from .meter import * +from .seed import * +from .ckpt_utils import * +from .chinese_utils import * +from .import_utils import * +from .npy_utils import * +from .options import * +from .visual_utils import * diff --git a/src/torchblocks/utils/chinese_utils.py b/src/torchblocks/utils/chinese_utils.py new file mode 100644 index 0000000..0de752b --- /dev/null +++ b/src/torchblocks/utils/chinese_utils.py @@ -0,0 +1,32 @@ + +def is_chinese_char(word: str): + chinese_punctuations = { + ',', '。', ';', ':' + '!', '?', '《', '》', '‘', '’', '“', '”', '(', ')', '【', '】' + } + return len(word) == 1 \ + and ('\u4e00' <= word <= '\u9fa5' or word in 
chinese_punctuations) + +def remove_space_between_chinese_chars(decoded_str: str): + old_word_list = decoded_str.split(' ') + new_word_list = [] + start = -1 + for i, word in enumerate(old_word_list): + if is_chinese_char(word): + if start == -1: + start = i + else: + if start != -1: + new_word_list.append(''.join(old_word_list[start:i])) + start = -1 + new_word_list.append(word) + if start != -1: + new_word_list.append(''.join(old_word_list[start:])) + return ' '.join(new_word_list).strip() + + +def rebuild_chinese_str(string: str): + # add space for each chinese char + return ' '.join(''.join([ + f' {char} ' if is_chinese_char(char) else char for char in string + ]).split()) diff --git a/src/torchblocks/utils/ckpt_utils.py b/src/torchblocks/utils/ckpt_utils.py new file mode 100644 index 0000000..5920e30 --- /dev/null +++ b/src/torchblocks/utils/ckpt_utils.py @@ -0,0 +1,57 @@ +import os +import torch +import glob +import logging +import torch.nn as nn +from .io_utils import check_file, build_dir + +logger = logging.getLogger() + + +def save_model(model, file_path): + if isinstance(model, nn.DataParallel): + model = model.module + state_dict = model.state_dict() + torch.save(state_dict, file_path) + + +def load_model(model, file_path, device=None): + check_file(file_path) + logger.info(f"loading model from {str(file_path)} .") + state_dict = torch.load(file_path, map_location="cpu" if device is None else device) + if isinstance(model, nn.DataParallel) or hasattr(model, "module"): + model.module.load_state_dict(state_dict, strict=False) + else: + model.load_state_dict(state_dict, strict=False) + + +def save_jit_model(model, example_inputs, save_dir, dir_name=None): + model.eval() + with torch.no_grad(): + traced_model = torch.jit.trace(model, example_inputs=example_inputs, strict=False) + if dir_name is None: + save_dir = os.path.join(save_dir, 'save_model_jit_traced') + else: + save_dir = os.path.join(save_dir, dir_name) + build_dir(save_dir) + torch.jit.save(traced_model, os.path.join(save_dir, 'pytorch_model.ts')) + return save_dir + + +def find_all_checkpoints(ckpt_dir, + ckpt_prefix='checkpoint', + ckpt_postfix='-step-', + ckpt_name='pytorch_model.bin', + ckpt_custom_names=None): + ckpt_list = list( + os.path.dirname(c) for c in sorted(glob.glob(ckpt_dir + "/**/" + ckpt_name, recursive=True)) + ) + ckpt_list = [x for x in ckpt_list if ckpt_prefix in x and ckpt_postfix in x] + if len(ckpt_list) == 0: + raise ValueError(f"No checkpoint found at : '{ckpt_dir}'") + if ckpt_custom_names is not None: + if not isinstance(ckpt_custom_names, list): + ckpt_custom_names = [ckpt_custom_names] + ckpt_list = [x for x in ckpt_list if x.split('/')[-1] in ckpt_custom_names] + logger.info(f"Successfully get checkpoints:\n{ckpt_list}.") + return ckpt_list diff --git a/src/torchblocks/utils/common_utils.py b/src/torchblocks/utils/common_utils.py new file mode 100644 index 0000000..40ff1b9 --- /dev/null +++ b/src/torchblocks/utils/common_utils.py @@ -0,0 +1,80 @@ +import datetime + + +def convert_to_list(obj): + """ + Converts to list if given object is not a list. + """ + if not isinstance(obj, list): + obj = [obj] + return obj + + +def check_object_keys(object, key, msg): + ''' + object包含key,否则报错 + Args: + object: + key: + msg: + Returns: + ''' + if key not in object: + msg = (f"There were expected keys in the {msg}: " + f"{', '.join(list(object.keys()))}, " + f"but get {key}." 
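# --- Illustrative usage sketch (outside the patch) ---
# Checkpoint save/load round trip with the ckpt_utils helpers above; the output
# directory, the toy model, and the import paths are placeholders.
import torch.nn as nn
from torchblocks.utils.io_utils import build_dir            # assumed import path
from torchblocks.utils.ckpt_utils import save_model, load_model

model = nn.Linear(10, 2)
build_dir('outputs')
save_model(model, 'outputs/pytorch_model.bin')
load_model(model, 'outputs/pytorch_model.bin')  # loads onto CPU with strict=False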
+ ) + raise ValueError(msg) + + +def check_object_type(object, check_type, name, prefix=None): + ''' + Check that `object` is an instance of `check_type`; raise an error otherwise. + Args: + object: + check_type: + name: + prefix: + Returns: + ''' + if not isinstance(object, check_type): + msg = f"The type of {name} must be {check_type}, but got {type(object)}." + if prefix is not None: + msg += f' And {prefix}' + raise TypeError(msg) + + +def build_datetime_str(): + """Create a string indicating current time + Returns: + str: current time string + """ + datetime_dt = datetime.datetime.today() + datetime_str = datetime_dt.strftime('%y%m%d%H%M%S') + return datetime_str + + +def has_key(_dict, key): + if isinstance(_dict, dict) and key in _dict: + return True + else: + return False + + +def has_keys(_dict, *keys): + """Check whether a nested dict has a key + Args: + _dict (Dict): a nested dict like object + *keys (str): flattened key list + Returns: + bool: whether _dict has keys + """ + if not _dict or not keys: + return False + sub_dict = _dict + for key in keys: + if isinstance(sub_dict, dict) and key in sub_dict: + sub_dict = sub_dict[key] + else: + return False + return True
diff --git a/torchblocks/utils/device.py b/src/torchblocks/utils/device.py old mode 100755 new mode 100644 similarity index 52% rename from torchblocks/utils/device.py rename to src/torchblocks/utils/device.py index ace686d..50b58eb --- a/torchblocks/utils/device.py +++ b/src/torchblocks/utils/device.py @@ -1,11 +1,16 @@ import torch import logging +from .common_utils import check_object_type -logger = logging.getLogger(__name__) +logger = logging.getLogger() -def prepare_device(device_id): - """ +def get_all_available_gpus(): + return torch.cuda.device_count() + + +def build_device(device_id): + """build torch device setup GPU device if available, move model into configured device # 如果输入的是一个list,则默认使用list[0]作为controller Example: @@ -13,13 +18,20 @@ def prepare_device(device_id): device_id = '0': cuda:0 device_id = '0,1' : cuda:0 and cuda:1 """ - if not isinstance(device_id, str): - msg = 'device_id should be a str,e.g. multi-gpu:"0,1,.." or single-gpu:"0" or cpu:"cpu"' - raise TypeError(msg) + if isinstance(device_id, int): device_id = str(device_id) + if '.' in device_id: device_id = device_id.replace('.', ',') + check_object_type(device_id, check_type=str, name='device_id') machine_device_num = get_all_available_gpus() - if machine_device_num == 0 or device_id == 'cpu': + device_type = 'cuda' + if ( + len(device_id) == 0 + or machine_device_num == 0 + or device_id == 'cpu' + or len(device_id.strip()) == 0 + ): + device_type = 'cpu' + if device_type == 'cpu': device_num = 0 - device = torch.device('cpu') msg = "Warning: There\'s no GPU available on this machine, training will be performed on CPU." logger.warning(msg) else: @@ -27,16 +39,29 @@ device_ids = [int(x) for x in device_id.split(",")] device_num = len(device_ids) device_type = f"cuda:{device_ids[0]}" - device = torch.device(device_type) if device_num > machine_device_num: msg = (f"The number of GPU\'s configured to use is {device_num}, " f"but only {machine_device_num} are available on this machine."
) logger.warning(msg) device_num = machine_device_num + device = torch.device(device_type) logger.info("Finally, device: %s, n_gpu: %s", device, device_num) return device, device_num -def get_all_available_gpus(): - return torch.cuda.device_count() +if __name__ == "__main__": + device_id = '' + device_id0 = ' ' + device_id1 = '0' + device_id2 = '0,1' + device_id3 = 'cpu' + device_id4 = '0,1,2,3' + device_id5 = 0 + print(build_device(device_id)) + print(build_device(device_id0)) + print(build_device(device_id1)) + print(build_device(device_id2)) + print(build_device(device_id3)) + print(build_device(device_id4)) + print(build_device(device_id5)) diff --git a/src/torchblocks/utils/import_utils.py b/src/torchblocks/utils/import_utils.py new file mode 100644 index 0000000..2bd7c6f --- /dev/null +++ b/src/torchblocks/utils/import_utils.py @@ -0,0 +1,23 @@ +import os +import sys +import importlib.util +from pathlib import Path +from importlib import import_module + +def is_apex_available(): + return importlib.util.find_spec("apex") is not None + +def import_modules_from_file(py_file: str): + """ Import module from a certrain file + Args: + py_file: path to a python file to be imported + Return: + """ + dirname, basefile = os.path.split(py_file) + if dirname == '': + dirname = Path.cwd() + module_name = os.path.splitext(basefile)[0] + sys.path.insert(0, dirname) + mod = import_module(module_name) + sys.path.pop(0) + return module_name, mod diff --git a/src/torchblocks/utils/io_utils.py b/src/torchblocks/utils/io_utils.py new file mode 100644 index 0000000..d453fe0 --- /dev/null +++ b/src/torchblocks/utils/io_utils.py @@ -0,0 +1,113 @@ +import os +import torch +import json +import pickle +import logging +import numpy as np +import yaml + +logger = logging.getLogger() + + +def is_file(file_path): + if os.path.isfile(file_path): + return True + return False + + +def is_dir(file_path): + if os.path.isdir(file_path): + return True + return False + + +def check_file(file_path): + if not os.path.exists(file_path): + raise ValueError(f"File is not found here: {file_path}") + return True + + +def check_dir(dir_path): + if not os.path.isdir(dir_path): + raise ValueError(f"Directory is not found here: {dir_path}") + return True + + +def find_all_files(dir_path): + dir_path = os.path.expanduser(dir_path) + files = [os.path.join(dir_path, fname) for fname in os.listdir(dir_path)] + logger.info(f"The number of files: {len(files)} , Direcory:{dir_path}") + return files + + +def build_dir(dir_path, exist_ok=True): + if os.path.isdir(dir_path) and os.path.exists(dir_path): + logger.info(f"Directory {dir_path} exist. 
") + os.makedirs(dir_path, exist_ok=exist_ok) + + +def save_pickle(data, file_path): + with open(str(file_path), 'wb') as f: + pickle.dump(data, f) + + +def load_pickle(file_path): + with open(str(file_path), 'rb') as f: + data = pickle.load(f) + return data + + +def save_numpy(data, file_path): + np.save(str(file_path), data) + + +def load_numpy(file_path): + np.load(str(file_path)) + + +def save_json(data, file_path): + with open(str(file_path), 'w') as f: + json.dump(data, f) + + +def load_json(file_path): + with open(str(file_path), 'r') as f: + data = json.load(f) + return data + + +def to_json_string(data): + """Serializes this instance to a JSON string.""" + return json.dumps(data, indent=2, sort_keys=True, cls=_Encoder) + + +class _Encoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, torch.device): + return str(obj) + else: + return super(_Encoder, self).default(obj) + + +def json_to_text(file_path, data): + with open(str(file_path), 'w') as fw: + for line in data: + line = json.dumps(line, ensure_ascii=False) + fw.write(line + '\n') + + +def dict_to_text(file_path, data): + with open(str(file_path), 'w') as fw: + for key in sorted(data.keys()): + fw.write("{} = {}\n".format(key, str(data[key]))) + + +def load_yaml(file_path): + with open(file_path, 'r') as f: + data = yaml.load(f) + return data + + +def save_yaml(data, file_path): + with open(file_path, 'w') as fw: + yaml.dump(data, fw) diff --git a/torchblocks/utils/logger.py b/src/torchblocks/utils/logger.py old mode 100755 new mode 100644 similarity index 73% rename from torchblocks/utils/logger.py rename to src/torchblocks/utils/logger.py index 73f4b93..8f48252 --- a/torchblocks/utils/logger.py +++ b/src/torchblocks/utils/logger.py @@ -1,12 +1,15 @@ import os -import time import logging +from .common_utils import build_datetime_str + +msg_format = "[%(asctime)s %(levelname)s] %(message)s" +date_format = '%Y-%m-%d %H:%M:%S' +logger_name = __name__.split('.')[0] class Logger: ''' Base class for experiment loggers. 
- 日志模块 ''' def __init__(self, opts, log_file_level=logging.NOTSET): @@ -20,10 +23,8 @@ def __init__(self, opts, log_file_level=logging.NOTSET): def setup_logger(self): log_file_path = self.setup_log_path() - fmt = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' - dmt = '%Y-%m-%d %H:%M:%S' - log_format = logging.Formatter(fmt=fmt, datefmt=dmt) - self.logger = logging.getLogger() + log_format = logging.Formatter(fmt=msg_format, datefmt=date_format) + self.logger = logging.getLogger(logger_name) self.logger.setLevel(logging.INFO) console_handler = logging.StreamHandler() console_handler.setFormatter(log_format) @@ -33,15 +34,14 @@ def setup_logger(self): file_handler.setLevel(self.log_file_level) self.logger.addHandler(file_handler) - def setup_time(self): - local_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - return local_time + def log_line(self): + self.info("-" * 100) def setup_log_path(self): - log_time = self.setup_time() + log_time = build_datetime_str() log_prefix = self.setup_prefix() log_file_name = f"{self.opts.task_name}-{self.opts.model_type}-" \ - f"{self.opts.experiment_code}-{log_prefix}-{log_time}.log" + f"{self.opts.experiment_name}-{log_prefix}-{log_time}.log" log_file_path = os.path.join(self.opts.output_dir, log_file_name) return log_file_path @@ -54,4 +54,3 @@ def setup_prefix(self): return 'predict' else: return '' -
diff --git a/torchblocks/utils/meter.py b/src/torchblocks/utils/meter.py old mode 100755 new mode 100644 similarity index 69% rename from torchblocks/utils/meter.py rename to src/torchblocks/utils/meter.py index e08399c..4c5bc96 --- a/torchblocks/utils/meter.py +++ b/src/torchblocks/utils/meter.py @@ -2,7 +2,6 @@ class AverageMeter(object): ''' computes and stores the average and current value ''' - def __init__(self): self.reset() @@ -17,4 +16,7 @@ def update(self, val, n=1): self.val = val self.sum += val * n self.count += n - self.avg = self.sum / self.count \ No newline at end of file + self.avg = self.sum / self.count + + def __repr__(self): + return f"AverageMeter(val={self.val}, avg={self.avg}, sum={self.sum}, count={self.count})"
diff --git a/src/torchblocks/utils/npy_utils.py b/src/torchblocks/utils/npy_utils.py new file mode 100644 index 0000000..311446f --- /dev/null +++ b/src/torchblocks/utils/npy_utils.py @@ -0,0 +1,11 @@ +import numpy as np + + +def softmax(x): + assert len(x.shape) == 2 + s = np.max(x, axis=1) + s = s[:, np.newaxis] # necessary step to do broadcasting + e_x = np.exp(x - s) + div = np.sum(e_x, axis=1) + div = div[:, np.newaxis] # ditto + return e_x / div
diff --git a/src/torchblocks/utils/options.py b/src/torchblocks/utils/options.py new file mode 100644 index 0000000..92726cd --- /dev/null +++ b/src/torchblocks/utils/options.py @@ -0,0 +1,300 @@ +import os +import sys +import json +import argparse +from pathlib import Path +from .io_utils import check_file, load_json, build_dir, load_yaml +from .import_utils import import_modules_from_file + + +class Argparser(argparse.ArgumentParser): + def __init__(self, **kwargs): + super(Argparser, self).__init__(**kwargs) + + @classmethod + def build_parser(cls, description='Arguments'): + parser = cls(description=description, add_help=True) + parser.arguments_required() + parser.arguments_common() + parser.arguments_input_file() + parser.arguments_dataset() + parser.arguments_dataloader() + parser.arguments_pretrained() + parser.arguments_ema() + parser.arguments_swa() + parser.arguments_rdrop() + parser.arguments_attack() +
parser.arguments_optimimzer_and_scheduler() + parser.arguments_mixed_precision() + parser.arguments_model_checkpoint() + parser.arguments_earlystopping() + return parser + + @classmethod + def build_args_from_parser(cls, parser): + args = parser.parse_args() + parser.build_experiment_dir(args) + parser.save_args_to_json(args) + return args + + @classmethod + def build_arguments(cls): + parser = cls.build_parser() + args = cls.build_args_from_parser(parser) + return args + + @classmethod + def build_args_from_file(cls, file_name): + if isinstance(file_name, Path): + file_name = str(file_name) + check_file(file_name) + fileExtname = os.path.splitext(file_name)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + if fileExtname in ['.py']: + module_nanme, mod = import_modules_from_file(file_name) + opts_dict = {} + for name, value in mod.__dict__.items(): + opts_dict[name] = value + # delete imported module + del sys.modules[module_nanme] + elif fileExtname in ['.json']: + opts_dict = load_json(file_name) + else: + opts_dict = load_yaml(file_name) + return argparse.Namespace(**opts_dict) + + def arguments_required(self): + group = self.add_argument_group(title="required arguments", description="required arguments") + group.add_argument('-t', "--task_name", default=None, type=str, required=True, + help="The name of the task to train. ") + group.add_argument('-o', "--output_dir", default=None, type=str, required=True, + help="directory to save experiment logs and checkpoints.") + group.add_argument('-m', "--model_type", default=None, type=str, required=True, + help="The name of the model to train.") + group.add_argument('-d', "--data_dir", default=None, type=str, required=True, + help="The input data dir. Should contain the training files for task.") + + def arguments_common(self): + group = self.add_argument_group(title="common arguments", description="common arguments") + group.add_argument( + "--seed", type=int, default=42, help="random seed for initialization") + group.add_argument( + "--epoch_seed", action="store_true", help="Whether to seed+ every epoch.") + group.add_argument( + "--do_train", action="store_true", help="Whether to run training.") + group.add_argument( + "--do_eval", action="store_true", help="Whether to run eval.") + group.add_argument( + "--do_predict", action="store_true", help="Whether to run predict.") + group.add_argument( + "--device_id", type=str, default='0', + help='cuda device string. Multi-gpu:"0,1,.." or single-gpu:"0" or cpu:"cpu"') + group.add_argument( + '-c', '--config_path', type=str, default=None, help="configuration YAML file") + group.add_argument( + '-ss', "--save_steps", type=int, default=-1, + help="Save checkpoint every X updates steps. ``-1`` means that a epoch") + group.add_argument( + '-ls', "--logging_steps", type=int, default=-1, + help="Log every X updates steps.``-1`` means that a epoch, not used if logging_strategy is `epoch`") + group.add_argument( + '-lsy', '--logging_strategy', default=None, type=str, choices=[None, 'epoch']) + group.add_argument( + '-exp_name', '--experiment_name', type=str, default='v0', help='experiment name') + group.add_argument( + '--log_writer', default='file', choices=['file', 'tensorboard']) + group.add_argument( + '--local_rank', type=str, default='0') + group.add_argument( + '-f', '--force', default=None, help='overwrite the output directory if it exists.' 
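# --- Illustrative usage sketch (outside the patch) ---
# In a training script the usual entry point is `args = Argparser.build_arguments()`,
# driven by CLI flags such as:
#   python task_example.py -t cner -o outputs/ -m bert -d dataset/cner --do_train
# build_args_from_file can also read options from .py/.json/.yaml/.yml files; the
# throwaway config written below and the import path are assumptions.
import json
from torchblocks.utils.options import Argparser

with open('example_options.json', 'w') as f:
    json.dump({'task_name': 'cner', 'model_type': 'bert', 'output_dir': 'outputs/'}, f)
opts = Argparser.build_args_from_file('example_options.json')
print(opts.task_name, opts.model_type)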
+ ) + + def arguments_input_file(self): + group = self.add_argument_group(title="input file arguments", description="input file arguments") + group.add_argument('-train_file', "--train_input_file", default=None, type=str, + help="The name of train input file") + group.add_argument('-eval_file', "--eval_input_file", default=None, type=str, + help="The name of eval input file") + group.add_argument('-test_file', "--test_input_file", default=None, type=str, + help="The name of test input file") + group.add_argument('-label_file', "--label_file_path", default=None, type=str, + help="The name of label input file") + + def arguments_dataset(self): + group = self.add_argument_group(title="dataset arguments", description="dataset arguments") + group.add_argument('-train_len', "--train_max_seq_length", default=128, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + group.add_argument('-eval_len', "--eval_max_seq_length", default=512, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + group.add_argument('-test_len', "--test_max_seq_length", default=512, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + group.add_argument('-train_bs', "--per_gpu_train_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for training.") + group.add_argument('-eval_bs', "--per_gpu_eval_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for evaluation.") + group.add_argument('-test_bs', "--per_gpu_test_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for test evaluation.") + group.add_argument("--overwrite_data_cache", action='store_true', + help="Whether to overwrite the cached training and evaluation feature sets") + group.add_argument("--use_data_cache", action='store_true', + help='Whether to load the cached training feature sets') + group.add_argument('--cached_features_file', default=None, type=str, help='custom cached feature file') + group.add_argument('--max_examples', default=None, type=int, help='debug') + group.add_argument('--dynamical_padding',default=True,type=bool, + help='If dynamical_padding is False, uniform length sequences in batches. 
Default True') + + def arguments_dataloader(self): + group = self.add_argument_group(title="dataloader arguments", description="dataloader arguments") + group.add_argument('--pin_memory', default=False, action='store_true', + help='Use pin memory option in data loader') + group.add_argument('-train_dl', "--train_drop_last", default=False, action='store_true') + group.add_argument('-eval_dl', "--eval_drop_last", default=False, action='store_true') + group.add_argument('-test_dl', "--test_drop_last", default=False, action='store_true') + group.add_argument('--num_workers', default=0, type=int, help='Number of data workers') + group.add_argument("--persistent_workers", default=False, action="store_true") + + def arguments_pretrained(self): + group = self.add_argument_group(title="pretrained arguments", description="pretrained arguments") + group.add_argument("--pretrained_model_path", default=None, type=str, + help="Path to pre-trained model selected in the list") + group.add_argument("--pretrained_config_path", default=None, type=str, + help="Pretrained config path if not the same as model_name") + group.add_argument("--pretrained_tokenizer_path", default=None, type=str, + help="Pretrained tokenizer path if not the same as model_name") + group.add_argument("--do_lower_case", action="store_true", + help="Set this flag if you are using an uncased model.") + + def arguments_optimimzer_and_scheduler(self): + group = self.add_argument_group(title='optimizer and scheduler', description='Optimizer related arguments') + group.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay for optimizer.") + group.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm. common: 1,10,100,1000") + group.add_argument("--adam_beta1", default=0.9, type=float, help="Beta1 for optimizer") + group.add_argument("--adam_beta2", default=0.999, type=float, help='Beta2 for optimizer') + group.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for optimizer. common: 1e-6,1e-7,5e-7,1e-8") + group.add_argument('--num_cycles', default=0.5, type=float, + help='The number of waves in the cosine schedule,common:0.5、1') + group.add_argument('--min_lr', default=1e-7, type=float, help='Minimum learning rate. 
common: 1e-7,1e-8') + group.add_argument("--learning_rate", default=3e-5, type=float, help="Learning rate.") + group.add_argument("--other_learning_rate", default=0.0, type=float, help='other learning rate') + group.add_argument("--base_model_name", default='base_model', type=str, help='The main body of the model.') + group.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs") + group.add_argument("--gradient_accumulation_steps", type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", ) + group.add_argument("--warmup_rate", default=0.0, type=float, + help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training.") + group.add_argument("--warmup_steps", default=0, type=int, help='Linear warmup over warmup_steps.') + group.add_argument("--scheduler_type", default='linear', type=str, + choices=["linear", 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', + 'constant_with_warmup', 'one_cycle_cosine'], + help='The scheduler type to use.') + group.add_argument('--scheduler_on', default='batch', type=str, choices=['batch', 'epoch'], + help='scheduler to start') + group.add_argument("--scheduler_metric", default=None, type=str) + + def arguments_ema(self): + group = self.add_argument_group(title='EMA', description='Exponential moving average arguments') + group.add_argument('--do_ema', action='store_true', help='Exponential moving average') + group.add_argument('--swa_start', type=int, default=-1, help='EMA start') + + def arguments_swa(self): + group = self.add_argument_group(title='SWA', description='swa arguments') + group.add_argument('--do_swa', action='store_true', help='SWA') + group.add_argument('--ema_decay', type=float, default=0.9999, help='EMA decay') + + def arguments_rdrop(self): + group = self.add_argument_group(title='Rdrop', description='Rdrop arguments') + group.add_argument("--do_rdrop", action="store_true", help="Whether to run rdrop training.") + group.add_argument('--rdrop_weight', type=float, default=0.0, help="The weight of rdrop loss") + group.add_argument('--rdrop_start_epoch', type=int, default=1, help='the epoch to start do rdrop') + + def arguments_attack(self): + group = self.add_argument_group(title='Adversarial training', description='Adversarial training arguments') + group.add_argument('--do_fgm', action='store_true', help='Adversarial training') + group.add_argument('--fgm_name', default='word_embeddings', type=str, + help='name for attacks layer,`FGM` use word_embeddings') + group.add_argument('--fgm_epsilon', default=1.0, type=float, help='attack epsilon,such as 1e-2,1e-3') + + group.add_argument('--do_pgd', action='store_true', help='Adversarial training') + group.add_argument('--pgd_name', default='word_embeddings', type=str, + help='name for attacks layer,`PGD` use word_embeddings') + group.add_argument('--pgd_epsilon', default=1.0, type=float, help='attack epsilon,such as 1e-2,1e-3') + group.add_argument('--pgd_number', default=1, type=int, help='the number of attack') + group.add_argument('--pgd_alpha', default=0.3, type=float, help='attack alpha (lr),such as 1e-4,5e-4,1e-5') + + group.add_argument('--do_awp', action='store_true', help='Adversarial training') + group.add_argument('--awp_number', default=1, type=int, help='the number of attack') + group.add_argument('--awp_name', default='weight', type=str, help='name for attacks layer, `AWP` use weight') + group.add_argument('--awp_epsilon', default=1.0, 
type=float, help='attack epsilon,such as 1e-2,1e-3') + group.add_argument('--awp_alpha', default=0.3, type=float, help='attack alpha (lr),such as 1e-4,5e-4,1e-5') + group.add_argument('--awp_start_step', default=-1, type=int, + help='the step to start attack,``-1`` means that no limits') + group.add_argument('--awp_start_epoch', default=-1, type=int, + help='the epoch to start attack,``-1`` means that no limits') + group.add_argument('--awp_start_score', default=-1, type=float, + help='the score to start accack,``-1`` means that no limits') + group.add_argument('--awp_score_mode', default='min', help='attack score mode') + group.add_argument('--awp_score_monitor', default='eval_loss', help='attack score monitor') + + def arguments_mixed_precision(self): + group = self.add_argument_group(title="mixed precision arguments", description="mixed precision arguments") + group.add_argument("--do_fp16", action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) + group.add_argument("--fp16_opt_level", type=str, default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", ) + group.add_argument('--fp16_backend', default='apex', type=str, choices=['apex', 'amp', 'auto'], + help="The backend to be used for mixed precision.") + group.add_argument("--fp16_full_eval", action='store_true', + help="Whether to use full 16-bit precision evaluation instead of 32-bit") + + def arguments_model_checkpoint(self): + group = self.add_argument_group(title='model checkpoint', description='model checkpoint arguments') + group.add_argument('-ckpt_d', "--checkpoint_mode", default='min', type=str, help='model checkpoint mode') + group.add_argument('-ckpt_m', "--checkpoint_monitor", default='eval_loss', type=str, + help='model checkpoint monitor') + group.add_argument('-ckpt_s', "--checkpoint_save_best", action='store_true', help='Whether to save best model') + group.add_argument('-ckpt_v', "--checkpoint_verbose", default=1, type=int, + help='whether to print checkpoint info') + group.add_argument("-ckpt_p", "--checkpoint_predict_code", type=str, default=None, + help='The version of checkpoint to predict') + group.add_argument('--eval_all_checkpoints', action="store_true", help="Evaluate all checkpoints starting", ) + + def arguments_earlystopping(self): + group = self.add_argument_group(title='early stopping', description='early stopping arguments') + group.add_argument("--earlystopping_patience", default=-1, type=int, + help='Interval (number of epochs) between checkpoints,``-1`` means that no earlystopping') + group.add_argument("--earlystopping_mode", default='min', type=str, help='early stopping mode') + group.add_argument("--earlystopping_monitor", default='eval_loss', type=str, help='early stopping monitor') + group.add_argument("--earlystopping_verbose", default=1, type=int, help='whether to print earlystopping info') + group.add_argument('--earlystopping_save_state_path', default=None, type=str) + group.add_argument('--earlystopping_load_state_path', default=None, type=str) + + def save_args_to_json(self, args): + if args.do_train: + save_arguments_file_name = f"{args.task_name}_{args.model_type}_{args.experiment_name}_options.json" + save_arguments_file_path = os.path.join(args.output_dir, save_arguments_file_name) + if os.path.exists(save_arguments_file_path): + print(f"[Warning]File {save_arguments_file_path} exist,Overwrite arguments file") + with 
open(str(save_arguments_file_path), 'w') as f: + json.dump(vars(args), f, ensure_ascii=False, indent=4) + + def print_args(self, args): + print('**********************************') + print('************ Arguments ***********') + print('**********************************') + args_list = sorted(args.__dict__.items(), key=lambda x: x[0]) + msg = '' + for k, v in args_list: + msg += f' - {k}: {v}\n' + print(msg) + + def build_experiment_dir(self, args): + _name = f'{args.task_name}_{args.model_type}_{args.experiment_name}' + args.output_dir = os.path.join(args.output_dir, _name) + build_dir(args.output_dir, exist_ok=True) + diff --git a/torchblocks/utils/seed.py b/src/torchblocks/utils/seed.py old mode 100755 new mode 100644 similarity index 83% rename from torchblocks/utils/seed.py rename to src/torchblocks/utils/seed.py index af28ee9..3456ea6 --- a/torchblocks/utils/seed.py +++ b/src/torchblocks/utils/seed.py @@ -1,10 +1,10 @@ import os import random -import numpy as np import torch import logging +import numpy as np -logger = logging.getLogger(__name__) +logger = logging.getLogger() def select_seed_randomly(min_seed_value=0, max_seed_value=1024): @@ -13,7 +13,7 @@ def select_seed_randomly(min_seed_value=0, max_seed_value=1024): return int(seed) -def seed_everything(seed=None,verbose=True): +def seed_everything(seed=None, verbose=True): ''' init random seed for random functions in numpy, torch, cuda and cudnn Args: @@ -21,8 +21,7 @@ def seed_everything(seed=None,verbose=True): reproducibility (bool): Whether to require reproducibility ''' if seed is None: seed = select_seed_randomly() - if verbose: - logger.info(f"Global seed set to {seed}") + if verbose: logger.info(f"Global seed set to {seed}") random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) diff --git a/torchblocks/utils/tensor.py b/src/torchblocks/utils/tensor_utils.py old mode 100755 new mode 100644 similarity index 66% rename from torchblocks/utils/tensor.py rename to src/torchblocks/utils/tensor_utils.py index 9fbd129..97b0941 --- a/torchblocks/utils/tensor.py +++ b/src/torchblocks/utils/tensor_utils.py @@ -1,27 +1,35 @@ import torch import numbers +import logging import numpy as np import torch.nn.functional as F -from torchblocks.utils.common import check_object_type +from .common_utils import check_object_type -''' -常用tensor操作 -''' -def convert_to_tensor(obj): +logger = logging.getLogger() + + +def convert_to_tensor(obj, device=torch.device("cpu")): """ Converts to Tensor if given object is not a Tensor. """ if not isinstance(obj, torch.Tensor): - obj = torch.Tensor(obj) + obj = torch.Tensor(obj).to(device) return obj + def numpy_to_tensor(array, device=torch.device("cpu")): + """ + Converts to Tensor if given object is a array. + """ if not isinstance(array, np.ndarray): raise ValueError("array type: expected one of (np.ndarray,)") return torch.from_numpy(array).to(device) def number_to_tensor(number, device=torch.device("cpu")): + """ + Converts to Tensor if given object is a number. 
+ """ if not isinstance(number, numbers.Number): raise ValueError("number type: expected one of (numbers.Number,)") return torch.tensor([number], device=device) @@ -138,7 +146,7 @@ def to_categorical(tensor, argmax_dim=1): return torch.argmax(tensor, dim=argmax_dim) -def get_dropout_mask(drop_p, tensor): +def build_dropout_mask(drop_p, tensor): r""" 根据tensor的形状,生成一个mask :param drop_p: float, 以多大的概率置为0。 @@ -151,7 +159,8 @@ def get_dropout_mask(drop_p, tensor): def select_topk(tensor, topk=1, dim=1): - """Convert a probability tensor to binary by selecting top-k highest entries. + """ + Convert a probability tensor to binary by selecting top-k highest entries. """ check_object_type(object=tensor, check_type=torch.Tensor, name='tensor') zeros = torch.zeros_like(tensor) @@ -160,3 +169,52 @@ def select_topk(tensor, topk=1, dim=1): else: topk_tensor = zeros.scatter(dim, tensor.topk(k=topk, dim=dim).indices, 1.0) return topk_tensor.int() + + +def concat_tensors_with_padding(tensor_list, padding_shape, padding_index=1, padding_value=0): + """Concatenate the list of tensors to be a single tensor with paddings. + + Args: + tensor_list: The list of tensors which have different lengths. They should have + the shape of `(batch_size, seq_len, dim)` or `(batch_size, seq_len)`. + padding: The padding value for the tensors. If the tensor is shorter than other + tensors, than it will be padded with this value. Default is `0`. + + Returns: + A concatenated single tnesor. + """ + max_length = max(x.size(padding_index) for x in tensor_list) + padded_tensor_list = [] + for tensor in tensor_list: + padding_length = max_length - tensor.size(padding_index) + padding_size = [x * padding_length for x in padding_shape] + padded_tensor_list.append(F.pad(tensor, padding_size, value=padding_value)) + return torch.cat(padded_tensor_list) + + +def convert_tensor_list_to_dict(tensor_list): + ''' + 将tensor列表转化dict形式 + Args: + tensor_list: + Returns: + ''' + tensor_dict = {} + first = tensor_list[0] + for k, v in first.items(): + bv = [f[k] for f in tensor_list] + tensor_dict[k] = bv + return tensor_dict + +def convert_cuda_to_cpu(inputs): + outputs = {} + for key, value in inputs.items(): + if isinstance(value, torch.Tensor): + outputs[key] = tensor_to_cpu(value) + elif isinstance(value, (list, tuple)): + outputs[key] = [tensor_to_cpu(x) if isinstance(x, torch.Tensor) else x for x in value] + elif isinstance(value, dict): + outputs[key] = {x: tensor_to_cpu(y) if isinstance(y, torch.Tensor) else y for x, y in value.items()} + else: + outputs[key] = value + return outputs \ No newline at end of file diff --git a/src/torchblocks/utils/visual_utils.py b/src/torchblocks/utils/visual_utils.py new file mode 100644 index 0000000..981e248 --- /dev/null +++ b/src/torchblocks/utils/visual_utils.py @@ -0,0 +1,55 @@ +import matplotlib.pylab as plt +from itertools import cycle + +plt.style.use("ggplot") +color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"] +color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"]) + + +def text_to_color(text, color): + """ + Returns highlighted text for the string + """ + return f"\033[4{color};30m{text}\033[m" + + +def get_colored_sentence(sentence, label_to_color): + colored_sentence = [] + spans = sentence['entities'] + text = sentence['text'] + spans = sorted(spans, key=lambda x: (x[0], x[1])) + span_num = len(spans) + b_end = 0 + for i in range(span_num): + tag, start, end, fragment = spans[i] + if start > b_end: + O_fragment = text[b_end:start] + color = label_to_color['None'] 
+ colored_text = text_to_color(O_fragment, color) + colored_sentence.append(colored_text) + color = label_to_color[tag] + colored_text = text_to_color(fragment, color) + colored_sentence.append(colored_text) + b_end = end + return " ".join(colored_sentence) + + +if __name__ == "__main__": + sentence = {'id': '0', + 'text': '大于三十岁的与临时居住地是松陵村东区473号楼的初婚东乡族同网吧的学历为高中的外来务工人员', + 'entities': [['AGE', 0, 5, '大于三十岁'], + ['EDU', 36, 38, '高中'], + ['TAG', 39, 45, '外来务工人员'], + ['PER', 13, 23, '松陵村东区473号楼']], + 'intent': 'KBQA' + } + label_to_color = { + "AGE": 1, # 1 red + "EDU": 2, # 2 green + "TAG": 3, # 3 yellow + "PER": 4, # 4 blue + "None": 9, # default + } + # jupyter notebook + # FLAT NER + print(get_colored_sentence(sentence, label_to_color)) diff --git a/src/torchblocks/version.py b/src/torchblocks/version.py new file mode 100644 index 0000000..ef72cc0 --- /dev/null +++ b/src/torchblocks/version.py @@ -0,0 +1 @@ +__version__ = '0.8.1' diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e2fd963 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +#pass \ No newline at end of file diff --git a/torchblocks/__init__.py b/torchblocks/__init__.py deleted file mode 100755 index 792d600..0000000 --- a/torchblocks/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git a/torchblocks/callback/adversarial/awp.py b/torchblocks/callback/adversarial/awp.py deleted file mode 100644 index 8530c8a..0000000 --- a/torchblocks/callback/adversarial/awp.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch - -class AWP(object): - """ [Adversarial weight perturbation helps robust generalization](https://arxiv.org/abs/2004.05884) - """ - def __init__( - self, - model, - emb_name="weight", - epsilon=0.001, - alpha=1.0, - ): - self.model = model - self.emb_name = emb_name - self.epsilon = epsilon - self.alpha = alpha - self.param_backup = {} - self.param_backup_eps = {} - self.grad_backup = {} - - def attack(self, is_first_attack=False): - if self.alpha == 0: return - e = 1e-6 - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None and self.emb_name in name: - # save - if is_first_attack: - self.param_backup[name] = param.data.clone() - grad_eps = self.epsilon * param.abs().detach() - self.param_backup_eps[name] = ( - self.param_backup[name] - grad_eps, - self.param_backup[name] + grad_eps, - ) - # attack - norm1 = torch.norm(param.grad) - norm2 = torch.norm(param.data.detach()) - if norm1 != 0 and not torch.isnan(norm1): - r_at = self.alpha * param.grad / (norm1 + e) * (norm2 + e) - param.data.add_(r_at) - param.data = torch.min( - torch.max( - param.data, - self.param_backup_eps[name][0] - ), - self.param_backup_eps[name][1] - ) - - def restore(self): - for name, param in self.model.named_parameters(): - if name in self.param_backup: - param.data = self.param_backup[name] - self.param_backup = {} - self.param_backup_eps = {} - - def backup_grad(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None: - self.grad_backup[name] = param.grad.clone() - - def restore_grad(self): - for name, param in self.model.named_parameters(): - if name in self.grad_backup: - param.grad = self.grad_backup[name] - self.grad_backup = {} diff --git a/torchblocks/callback/adversarial/fgm.py b/torchblocks/callback/adversarial/fgm.py deleted file mode 100755 index 32478a2..0000000 --- a/torchblocks/callback/adversarial/fgm.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch - - -class FGM(object): - def __init__(self, 
model, emb_name, epsilon=1.0): - # emb_name这个参数要换成你模型中embedding的参数名 - self.model = model - self.epsilon = epsilon - self.emb_name = emb_name - self.emb_backup = {} - self.grad_backup = {} - - def attack(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and self.emb_name in name: - self.emb_backup[name] = param.data.clone() - norm = torch.norm(param.grad) - if norm != 0 and not torch.isnan(norm): - r_at = self.epsilon * param.grad / norm - param.data.add_(r_at) - - def restore(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and self.emb_name in name: - assert name in self.emb_backup - param.data = self.emb_backup[name] - self.emb_backup = {} - - def backup_grad(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None: - self.grad_backup[name] = param.grad.clone() - - def restore_grad(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None: - if self.emb_name in name: - param.grad = self.grad_backup[name] - else: - param.grad += self.grad_backup[name] - self.grad_backup = {} diff --git a/torchblocks/callback/adversarial/pgd.py b/torchblocks/callback/adversarial/pgd.py deleted file mode 100755 index b9c9b80..0000000 --- a/torchblocks/callback/adversarial/pgd.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch - -class PGD(object): - def __init__(self, model, emb_name, epsilon=1., alpha=0.3): - # emb_name这个参数要换成你模型中embedding的参数名 - self.model = model - self.emb_name = emb_name - self.epsilon = epsilon - self.alpha = alpha - self.emb_backup = {} - self.grad_backup = {} - - def attack(self, is_first_attack=False): - for name, param in self.model.named_parameters(): - if param.requires_grad and self.emb_name in name: - if is_first_attack: - self.emb_backup[name] = param.data.clone() - norm = torch.norm(param.grad) - if norm != 0: - r_at = self.alpha * param.grad / norm - param.data.add_(r_at) - param.data = self.project(name, param.data, self.epsilon) - - def restore(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and self.emb_name in name: - assert name in self.emb_backup - param.data = self.emb_backup[name] - self.emb_backup = {} - - def project(self, param_name, param_data, epsilon): - r = param_data - self.emb_backup[param_name] - if torch.norm(r) > epsilon: - r = epsilon * r / torch.norm(r) - return self.emb_backup[param_name] + r - - def backup_grad(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None: - self.grad_backup[name] = param.grad.clone() - - def restore_grad(self): - for name, param in self.model.named_parameters(): - if param.requires_grad and param.grad is not None: - if self.emb_name in name: - param.grad = self.grad_backup[name] - else: - param.grad += self.grad_backup[name] - self.grad_backup = {} diff --git a/torchblocks/callback/ema.py b/torchblocks/callback/ema.py deleted file mode 100755 index b2d04eb..0000000 --- a/torchblocks/callback/ema.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -import torch.nn as nn -from copy import deepcopy - - -class EMA(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). 
- - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - - A smoothed version of the weights is necessary for some training schemes to perform well. - E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use - RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA - smoothing of weights to match results. Pay attention to the decay constant you are using - relative to your update count per epoch. - - To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but - disable validation of the EMA weights. Validation will have to be done manually in a separate - process, or after the training stops converging. - - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. - """ - - def __init__(self, model, decay=0.9999, device=None): - super(EMA, self).__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def _update(self, model, update_fn): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def update(self, model): - self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) - - def set(self, model): - self._update(model, update_fn=lambda e, m: m) diff --git a/torchblocks/callback/swa.py b/torchblocks/callback/swa.py deleted file mode 100755 index 6c29a16..0000000 --- a/torchblocks/callback/swa.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import copy -import torch -import logging -from torchblocks.utils.paths import create_dir, save_model -from torchblocks.utils.paths import find_all_checkpoints - -logger = logging.getLogger(__name__) - -#TODO 待优化 -class SWA: - ''' - checkpoint_dir:模型目录 - monitor:排序对象,按照step还是metric - sort_mode:如果monitor=metric,则需要制定排序,最大还是最小 - k_best_checkpoints:多少个模型 - ''' - monitor_list = ['metric', 'step'] - mode_list = ['max', 'min'] - - def __init__(self, checkpoint_dir, - monitor='step', - sort_mode='max', - device='cpu', - k_best_checkpoints=0, - checkpont_weights=None, - checkpoint_dir_prefix='checkpoint', - checkpoint_name='pytorch_model.bin'): - self.checkpoint_dir = checkpoint_dir - self.checkpoint_name = checkpoint_name - self.sort_mode = sort_mode - self.monitor = monitor - self.k_best_checkpoints = k_best_checkpoints - self.checkpoint_dir_prefix = checkpoint_dir_prefix - self.device = torch.device(device) - self.weights = self.init_checkpint_weight(checkpont_weights) - - if sort_mode not in self.mode_list: - raise ValueError(f"mode: expected one of {', '.join(self.mode_list)}") - - if monitor not in self.monitor_list: - raise ValueError(f"monitor: expected one of {', '.join(self.monitor_list)}") - - def init_checkpint_weight(self, weights): - if not isinstance(weights, list): - weights = [1. / (n + 1.) 
for n in range(self.k_best_checkpoints)] - return weights - - def get_model_path_list(self): - try: - model_lists = find_all_checkpoints(checkpoint_dir=self.checkpoint_dir, - checkpoint_prefix=self.checkpoint_dir_prefix, - checkpoint_name=self.checkpoint_name) - if self.monitor == 'step': - model_lists = sorted(model_lists, - key=lambda x: int(x.split("/")[-2].split("-")[-1])) - elif self.monitor == 'metric': - is_reverse = False - if self.sort_mode == 'min': is_reverse = True - model_lists = sorted(model_lists, - key=lambda x: float(x.split("/")[-2].split("-")[-1][2]), - reverse=is_reverse) - model_lists = model_lists[-self.k_best_checkpoints:] - logger.info(f"Averaging checkpoints: {[f.split('/')[-2] for f in model_lists]}") - return model_lists - except Exception as e: - logger.info("Error in `swa.get_model_path_list") - print(e) - - def step(self, model): - """ - swa 滑动平均模型,一般在训练平稳阶段再使用 SWA - """ - model_path_list = self.get_model_path_list() - swa_model = copy.deepcopy(model) - with torch.no_grad(): - for indx, _ckpt in enumerate(model_path_list): - logger.info(f'Load model from {_ckpt}') - model.load_state_dict(torch.load(_ckpt, map_location=self.device)) - tmp_para_dict = dict(model.named_parameters()) - alpha = self.weights[indx] - if indx == 0: - for name, para in swa_model.named_parameters(): - para.copy_(tmp_para_dict[name].data.clone() * alpha) - else: - for name, para in swa_model.named_parameters(): - para.copy_(tmp_para_dict[name].data.clone() * alpha + para.data.clone() * (1. - alpha)) - swa_model_dir = os.path.join(self.checkpoint_dir, f'checkpoint-swa-{self.sort_mode}-{self.k_best_checkpoints}') - create_dir(swa_model_dir) - logger.info(f'Save swa model in: {swa_model_dir}') - swa_model_path = os.path.join(swa_model_dir, self.checkpoint_name) - save_model(swa_model.state_dict(), swa_model_path) - return swa_model diff --git a/torchblocks/core/__init__.py b/torchblocks/core/__init__.py deleted file mode 100755 index 9c8eef5..0000000 --- a/torchblocks/core/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .trainer_base import * -from .classification_trainer import * -from .sequence_labeling_trainer import * \ No newline at end of file diff --git a/torchblocks/core/classification_trainer.py b/torchblocks/core/classification_trainer.py deleted file mode 100755 index 50eff39..0000000 --- a/torchblocks/core/classification_trainer.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch -from torchblocks.core import TrainerBase - - -class TextClassifierTrainer(TrainerBase): - ''' - 文本分类 - ''' - def build_batch_concat(self, all_batch_list, dim=0): - preds = torch.cat([batch['logits'] for batch in all_batch_list], dim=dim) - target = torch.cat([batch['labels'] for batch in all_batch_list], dim=dim) - return {"preds": preds, "target": target} - diff --git a/torchblocks/core/sequence_labeling_trainer.py b/torchblocks/core/sequence_labeling_trainer.py deleted file mode 100755 index 7cd5020..0000000 --- a/torchblocks/core/sequence_labeling_trainer.py +++ /dev/null @@ -1,14 +0,0 @@ -from torchblocks.core import TrainerBase - - -class SequenceLabelingTrainer(TrainerBase): - - def build_batch_concat(self, all_batch_list, dim=0): - preds = [] - target = [] - for batch in all_batch_list: - preds.extend(batch['predictions']) - target.extend(batch['groundtruths']) - return {"preds":preds, "target":target} - - diff --git a/torchblocks/core/trainer_base.py b/torchblocks/core/trainer_base.py deleted file mode 100755 index ebe0f4c..0000000 --- a/torchblocks/core/trainer_base.py +++ /dev/null @@ -1,705 +0,0 
@@ -import os -import sys -import math -import torch -import warnings -import pandas as pd -import torch.nn as nn -from argparse import Namespace -from packaging import version - -from torch.utils.data import Dataset, DataLoader -from torch.utils.data.sampler import RandomSampler, SequentialSampler - -from torchblocks.optims.adamw import AdamW -from torchblocks.utils.logger import Logger -from torchblocks.callback.adversarial import FGM, PGD, AWP -from torchblocks.core.utils import is_apex_available -from torchblocks.optims.lr_scheduler import get_lr_scheduler -from torchblocks.utils.common import check_object_type -from torchblocks.utils.seed import seed_everything -from torchblocks.utils.meter import AverageMeter -from torchblocks.callback import ModelCheckpoint, EarlyStopping, ProgressBar, EMA -from torchblocks.utils.paths import to_json_string, save_pickle, json_to_text, load_model, is_file -from torchblocks.callback.model_checkpoint import (WEIGHTS_NAME, - TRAINER_STATE_NAME, - OPTIMIZER_NAME, - SCHEDULER_NAME, - SCALER_NAME) - -if not sys.warnoptions: - warnings.simplefilter("ignore") - -_is_native_amp_available = False -if is_apex_available(): - from apex import amp - -if version.parse(torch.__version__) >= version.parse("1.6"): - _is_native_amp_available = True - from torch.cuda.amp import autocast - -try: - from torch.utils.tensorboard import SummaryWriter - - _has_tensorboard = True -except ImportError: - try: - from tensorboardX import SummaryWriter - - _has_tensorboard = True - except ImportError: - _has_tensorboard = False - -if not _has_tensorboard: - from torchblocks.callback.file_writer import FileWriter - - -class TrainerBase: - """Base class for iterative trainer.""" - keys_to_ignore_on_gpu = [] # batch不存放在gpu中的变量,比如'input_length’ - keys_to_ignore_on_result_save = ['input_ids', 'token_type_ids'] # eval和predict结果不存储的变量 - keys_to_ignore_on_checkpoint_save = [] # checkpoint中不存储的模块,比如'optimizer' - - def __init__(self, - opts, - model, - metrics, - logger, - optimizer=None, - scheduler=None, - adv_model=None, - model_checkpoint=None, - early_stopping=None, - **kwargs): - self.opts = opts - self.model = model - self.metrics = metrics - self.logger = logger - self.scheduler = scheduler - self.global_step = 0 - self.device_num = getattr(opts, 'device_num', 0) - self.warmup_steps = getattr(opts, 'warmup_steps', 0) - self.num_train_epochs = getattr(opts, "num_train_epochs", 3) - self.device = getattr(opts, 'device', torch.device("cpu")) - self.max_grad_norm = getattr(opts, 'max_grad_norm', 0.0) - self.warmup_proportion = getattr(opts, 'warmup_proportion', 0.1) - self.gradient_accumulation_steps = getattr(opts, "gradient_accumulation_steps", 1) - self.prefix = "_".join([opts.model_type, opts.task_name, opts.experiment_code]) - for key, value in kwargs.items(): - setattr(self, key, value) - self.build_writer() - self.build_mixed_precision() - if not isinstance(self.metrics, list): - self.metrics = [self.metrics] - check_object_type(object=self.metrics, check_type=list, name='metric') - check_object_type(object=self.model, check_type=nn.Module, name='model') - check_object_type(object=self.opts, check_type=Namespace, name='self.opts') - check_object_type(object=self.logger, check_type=Logger, name='self.logger') - # EMA - if opts.ema_enable: - self.logger.info('Using EMA') - self.model_ema = EMA(model=self.model, - decay=opts.ema_decay, - device='cpu' if opts.model_ema_force_cpu else None) - # Adversarial training - if opts.adv_enable: - msg = f"Using Adversarial training and type: 
{opts.adv_type}" - self.logger.info(msg) - self.adv_model = adv_model - if adv_model is None: - self.adv_model = self.build_adv_model() - # optimizer - self.optimizer = optimizer - if self.optimizer is None: - self.optimizer = self.build_optimizer(model) - # checkpoint - self.model_checkpoint = model_checkpoint - if model_checkpoint is None: - self.model_checkpoint = ModelCheckpoint( - mode=opts.checkpoint_mode, - monitor=opts.checkpoint_monitor, - ckpt_dir=opts.output_dir, - verbose=opts.checkpoint_verbose, - save_best=opts.checkpoint_save_best, - keys_to_ignore_on_save=self.keys_to_ignore_on_checkpoint_save - ) - # earlystopping - self.early_stopping = early_stopping - if early_stopping is None and opts.earlystopping_patience > 0: - self.early_stopping = EarlyStopping( - mode=opts.earlystopping_mode, - patience=opts.earlystopping_patience, - monitor=opts.earlystopping_monitor, - save_state_path=opts.earlystopping_save_state_path, - load_state_path=opts.earlystopping_load_state_path - ) - - def build_mixed_precision(self): - # Mixed precision setup - self.use_apex = False - self.use_amp = False - self.fp16_backend = None - if self.opts.fp16: - if self.opts.fp16_backend == "auto": - self.fp16_backend = "amp" if _is_native_amp_available else "apex" - else: - self.fp16_backend = self.opts.fp16_backend - self.logger.info(f"Using {self.fp16_backend} fp16 backend") - if self.fp16_backend == "amp": - self.use_amp = True - self.scaler = torch.cuda.amp.GradScaler() - else: - if not is_apex_available(): - msg = ("Using FP16 with APEX but APEX is not installed, " - "please refer to https://www.github.com/nvidia/apex.") - raise ImportError(msg) - self.use_apex = True - - def build_adv_model(self): - adv_model = None - if self.opts.adv_type == 'fgm': - adv_model = FGM(self.model, - emb_name=self.opts.adv_name, - epsilon=self.opts.adv_epsilon) - elif self.opts.adv_type == 'pgd': - adv_model = PGD(self.model, - emb_name=self.opts.adv_name, - epsilon=self.opts.adv_epsilon, - alpha=self.opts.adv_alpha) - elif self.opts.adv_type == "awp": - adv_model = AWP(self.model, - emb_name=self.opts.adv_name, - epsilon=self.opts.adv_epsilon, - alpha=self.opts.adv_alpha) - return adv_model - - def build_record_tracker(self, **kwargs): - ''' - build record object - ''' - self.records = {} - self.records['result'] = {} - self.records['loss_meter'] = AverageMeter() - for key, value in kwargs.items(): - if key not in self.records: - self.records[key] = value - - def reset_metrics(self): - for metric in self.metrics: - if hasattr(metric, 'reset'): - metric.reset() - - def _param_optimizer(self, params, learning_rate, no_decay, weight_decay): - _params = [ - {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay, - 'lr': learning_rate}, - {'params': [p for n, p in params if any(nd in n for nd in no_decay)], - 'weight_decay': 0.0, - 'lr': learning_rate}, - ] - return _params - - def build_model_param_optimizer(self, model): - ''' - 若需要对不同模型赋予不同学习率,则指定`base_model_name`, - 在`transformer`模块中,默认为`base_model_name=`base_model`. 
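# Sketch of the native-AMP path that build_mixed_precision above enables
# (fp16_backend == "amp"): scale the loss for backward, unscale before gradient
# clipping, then let the scaler decide whether the optimizer step runs.
# `model`, `optimizer`, `batch` are placeholders and a CUDA device is assumed.
import torch

scaler = torch.cuda.amp.GradScaler()

def amp_training_step(model, optimizer, batch, max_grad_norm=1.0):
    with torch.cuda.amp.autocast():
        loss = model(**batch)["loss"]       # forward in mixed precision
    scaler.scale(loss).backward()           # scaled backward avoids fp16 underflow
    scaler.unscale_(optimizer)              # gradients back in fp32 before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    scaler.step(optimizer)                  # skipped automatically on inf/nan gradients
    scaler.update()                         # adapt the loss scale for the next step
    optimizer.zero_grad()
    return loss.detach()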
- 对于base_model使用learning_rate, - 其余统一使用other_learning_rate - ''' - no_decay = ["bias", 'LayerNorm.weight'] - optimizer_grouped_parameters = [] - if hasattr(model, self.opts.base_model_name) and self.opts.other_learning_rate != 0.0: - msg = (f"The initial learning rate for model params : {self.opts.learning_rate} ," - f"and {self.opts.other_learning_rate}" - ) - self.logger.info(msg) - base_model = getattr(model, self.opts.base_model_name) - base_model_param = list(base_model.named_parameters()) - base_model_param_ids = [id(p) for n, p in base_model_param] - other_model_param = [(n, p) for n, p in model.named_parameters() if - id(p) not in base_model_param_ids] - optimizer_grouped_parameters.extend( - self._param_optimizer(base_model_param, self.opts.learning_rate, no_decay, self.opts.weight_decay)) - optimizer_grouped_parameters.extend( - self._param_optimizer(other_model_param, self.opts.other_learning_rate, no_decay, - self.opts.weight_decay)) - else: - all_model_param = list(model.named_parameters()) - optimizer_grouped_parameters.extend( - self._param_optimizer(all_model_param, self.opts.learning_rate, no_decay, self.opts.weight_decay)) - return optimizer_grouped_parameters - - def build_optimizer(self, model): - ''' - Setup the optimizer. - ''' - optimizer_grouped_parameters = self.build_model_param_optimizer(model) - optimizer = AdamW(params=optimizer_grouped_parameters, - lr=self.opts.learning_rate, - eps=self.opts.adam_epsilon, - betas=(self.opts.adam_beta1, self.opts.adam_beta2), - weight_decay=self.opts.weight_decay) - return optimizer - - def build_warmup_steps(self, num_training_steps): - """ - Get number of steps used for a linear warmup. - """ - if self.warmup_proportion < 0 or self.warmup_proportion > 1: - raise ValueError("warmup_proportion must lie in range [0,1]") - elif self.warmup_proportion > 0 and self.warmup_steps > 0: - msg = ("Both warmup_ratio and warmup_steps given, " - "warmup_steps will override any effect of warmup_ratio during training") - self.logger.info(msg) - warmup_steps = ( - self.warmup_steps if self.warmup_steps > 0 else math.ceil( - num_training_steps * self.warmup_proportion) - ) - return warmup_steps - - def build_lr_scheduler(self, num_training_steps): - ''' - the learning rate scheduler. 
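# Sketch of what build_model_param_optimizer / build_warmup_steps above amount
# to: a lower learning rate for the pretrained encoder, a higher one for the
# task head, no weight decay on bias/LayerNorm, and a warmup length taken from
# an explicit step count or a proportion of total steps. `model.bert` and all
# numeric values are illustrative assumptions.
import math
from torch.optim.lr_scheduler import LambdaLR

def build_param_groups(model, lr=2e-5, other_lr=1e-3, weight_decay=0.01):
    no_decay = ("bias", "LayerNorm.weight")
    base_ids = {id(p) for p in model.bert.parameters()}     # the "base_model_name" submodule
    groups = []
    for tier_lr, in_base in ((lr, True), (other_lr, False)):
        named = [(n, p) for n, p in model.named_parameters() if (id(p) in base_ids) == in_base]
        groups.append({"params": [p for n, p in named if not any(nd in n for nd in no_decay)],
                       "lr": tier_lr, "weight_decay": weight_decay})
        groups.append({"params": [p for n, p in named if any(nd in n for nd in no_decay)],
                       "lr": tier_lr, "weight_decay": 0.0})
    return groups

def linear_warmup_schedule(optimizer, num_training_steps, warmup_steps=0, warmup_proportion=0.1):
    warmup = warmup_steps if warmup_steps > 0 else math.ceil(num_training_steps * warmup_proportion)
    def lr_lambda(step):
        if step < warmup:
            return step / max(1, warmup)                     # linear warmup from 0
        return max(0.0, (num_training_steps - step) / max(1, num_training_steps - warmup))
    return LambdaLR(optimizer, lr_lambda)

# from torch.optim import AdamW
# optimizer = AdamW(build_param_groups(model), lr=2e-5)
# scheduler = linear_warmup_schedule(optimizer, num_training_steps=10_000)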
- ''' - scheduler_function = get_lr_scheduler(self.opts.scheduler_type) - warmup_steps = self.build_warmup_steps(num_training_steps) - scheduler = scheduler_function(optimizer=self.optimizer, - num_warmup_steps=warmup_steps, - num_training_steps=num_training_steps) - return scheduler - - def build_train_dataloader(self, train_data): - ''' - Load train datasets - ''' - if isinstance(train_data, DataLoader): - return train_data - elif isinstance(train_data, Dataset): - batch_size = self.opts.per_gpu_train_batch_size * max(1, self.device_num) - sampler = RandomSampler(train_data) if not hasattr(train_data, 'sampler') else train_data.sampler - collate_fn = train_data.collate_fn if hasattr(train_data, 'collate_fn') else None - data_loader = DataLoader(train_data, - sampler=sampler, - batch_size=batch_size, - collate_fn=collate_fn, - drop_last=self.opts.drop_last, - num_workers=self.opts.num_workers) - return data_loader - else: - raise TypeError("train_data type{} not support".format(type(train_data))) - - def build_eval_dataloader(self, dev_data): - ''' - Load eval datasets - ''' - if isinstance(dev_data, DataLoader): - return dev_data - elif isinstance(dev_data, Dataset): - batch_size = self.opts.per_gpu_eval_batch_size * max(1, self.device_num) - sampler = SequentialSampler(dev_data) if not hasattr(dev_data, 'sampler') else dev_data.sampler - collate_fn = dev_data.collate_fn if hasattr(dev_data, 'collate_fn') else None - data_loader = DataLoader(dev_data, - sampler=sampler, - batch_size=batch_size, - collate_fn=collate_fn, - num_workers=self.opts.num_workers) - return data_loader - else: - raise TypeError("dev_data type{} not support".format(type(dev_data))) - - def build_test_dataloader(self, test_data): - ''' - Load test datasets - ''' - if isinstance(test_data, DataLoader): - return test_data - elif isinstance(test_data, Dataset): - batch_size = self.opts.per_gpu_test_batch_size * max(1, self.device_num) - sampler = SequentialSampler(test_data) if not hasattr(test_data, 'sampler') else test_data.sampler - collate_fn = test_data.collate_fn if hasattr(test_data, 'collate_fn') else None - data_loader = DataLoader(test_data, - sampler=sampler, - batch_size=batch_size, - collate_fn=collate_fn, - num_workers=self.opts.num_workers) - return data_loader - else: - raise TypeError("test_data type{} not support".format(type(test_data))) - - def build_batch_inputs(self, batch): - ''' - Sent all model inputs to the appropriate device (GPU on CPU) - rreturn: - The inputs are in a dictionary format - ''' - inputs = {key: ( - value.to(self.device) if ( - (key not in self.keys_to_ignore_on_gpu) and (value is not None) - ) else value - ) for key, value in batch.items()} - return inputs - - def check_nan(self, loss): - if torch.isnan(loss): - raise ValueError('Training loss is nan') - if isinstance(loss, torch.Tensor) and torch.isnan(loss): - import pdb - pdb.set_trace() - - def build_writer(self): - # tensorboard - if _has_tensorboard: - msg = f'Initializing summary writer for tensorboard with log_dir={self.opts.output_dir}' - self.logger.info(msg) - exp_dir = os.path.join(self.opts.output_dir, f'{self.prefix}_tb_logs') - self.writer = SummaryWriter(log_dir=exp_dir, comment='Training logs') - self.writer.add_text("train_arguments", to_json_string(self.opts.__dict__)) - else: - exp_dir = os.path.join(self.opts.output_dir, f'{self.prefix}_file_logs') - self.writer = FileWriter(log_dir=exp_dir) - - def build_model_warp(self): - # Mixed precision training with apex (torch < 1.6) - if self.use_apex: - 
self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.opts.fp16_opt_level) - # Multi-gpu training (should be after apex fp16 initialization) - if self.device_num > 1: - self.model = nn.DataParallel(self.model) - - def train_forward(self, batch): - ''' - Training forward - ''' - self.model.train() - inputs = self.build_batch_inputs(batch) - if self.use_amp: - with autocast(): - outputs = self.model(**inputs) - else: - outputs = self.model(**inputs) - check_object_type(object=outputs, check_type=dict, name='outputs') - if self.device_num > 1: outputs['loss'] = outputs['loss'].mean() - return outputs - - def train_backward(self, loss): - ''' - Training backward - ''' - self.check_nan(loss) - if self.gradient_accumulation_steps > 1: - loss = loss / self.gradient_accumulation_steps - if self.use_amp: - self.scaler.scale(loss).backward() - elif self.use_apex: - with amp.scale_loss(loss, self.optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - def train_update(self): - if self.use_amp: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if self.max_grad_norm is not None and self.max_grad_norm > 0: - torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else self.model.parameters(), - self.max_grad_norm) - optimizer_was_run = True - if self.use_amp: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() - if optimizer_was_run: self.scheduler.step() # Update learning rate schedule - self.model.zero_grad() # Reset gradients to zero - self.global_step += 1 - - def train_adv(self, batch): - self.adv_model.backup_grad() - for t in range(self.opts.adv_number): - if self.opts.adv_type == 'fgm': - self.adv_model.attack() - elif self.opts.adv_type == 'pgd': - self.adv_model.attack(is_first_attack=(t == 0)) - elif self.opts.adv_type == 'awp': - self.adv_model.attack(is_first_attack=(t == 0)) - adv_outputs = self.train_forward(batch) - adv_loss = adv_outputs['loss'] - self.train_backward(adv_loss) - self.adv_model.restore_grad() - self.adv_model.restore() - - def train_step(self, step, batch): - outputs = self.train_forward(batch) - loss = outputs['loss'] - self.train_backward(loss) - should_save = False - should_logging = False - if self.opts.adv_enable and step >= self.opts.adv_start_steps: - self.train_adv(batch) - if (step + 1) % self.gradient_accumulation_steps == 0 or ( - self.steps_in_epoch <= self.gradient_accumulation_steps - and (step + 1) == self.steps_in_epoch - ): - self.train_update() - should_logging = self.global_step % self.opts.logging_steps == 0 - should_save = self.global_step % self.opts.save_steps == 0 - self.records['loss_meter'].update(loss.item(), n=1) - self.writer.add_scalar('loss/train_loss', loss.item(), self.global_step) - if hasattr(self.scheduler, 'get_lr'): - self.writer.add_scalar('learningRate/train_lr', self.scheduler.get_lr()[0], self.global_step) - return outputs, should_logging, should_save - else: - return None, should_logging, should_save - - # TODO 多机分布式训练 - def train(self, train_data, dev_data=None, resume_path=None, start_epoch=1, state_to_save=dict()): - train_dataloader = self.build_train_dataloader(train_data) - num_training_steps = len(train_dataloader) // self.gradient_accumulation_steps * self.num_train_epochs - self.steps_in_epoch = len(train_dataloader) - if self.scheduler is 
None: - self.scheduler = self.build_lr_scheduler(num_training_steps) - self.resume_from_checkpoint(resume_path=resume_path) - self.build_model_warp() - self.print_summary(len(train_data), num_training_steps) - self.optimizer.zero_grad() - seed_everything(self.opts.seed, verbose=False) # Added here for reproductibility (even between python 2 and 3) - if self.opts.logging_steps < 0: - self.opts.logging_steps = len(train_dataloader) // self.gradient_accumulation_steps - self.opts.logging_steps = max(1, self.opts.logging_steps) - if self.opts.save_steps < 0: - self.opts.save_steps = len(train_dataloader) // self.gradient_accumulation_steps - self.opts.save_steps = max(1, self.opts.save_steps) - self.build_record_tracker() - self.reset_metrics() - pbar = ProgressBar(n_total=len(train_dataloader), desc='Training', num_epochs=self.num_train_epochs) - for epoch in range(start_epoch, int(self.num_train_epochs) + 1): - pbar.epoch(current_epoch=epoch) - for step, batch in enumerate(train_dataloader): - outputs, should_logging, should_save = self.train_step(step, batch) - if outputs is not None: - if self.opts.ema_enable: - self.model_ema.update(self.model) - pbar.step(step, {'loss': outputs['loss'].item()}) - if (self.opts.logging_steps > 0 and self.global_step > 0) and \ - should_logging and self.opts.evaluate_during_training: - self.evaluate(dev_data) - if self.opts.ema_enable and self.model_ema is not None: - self.evaluate(dev_data, prefix_metric='ema') - if hasattr(self.writer, 'save'): - self.writer.save() - if (self.opts.save_steps > 0 and self.global_step > 0) and should_save: - # model checkpoint - if self.model_checkpoint: - state = self.build_state_object(**state_to_save) - if self.opts.evaluate_during_training: - if self.model_checkpoint.monitor not in self.records['result']: - msg = ("There were expected keys in the eval result: " - f"{', '.join(list(self.records['result'].keys()))}, " - f"but get {self.model_checkpoint.monitor}." - ) - raise TypeError(msg) - self.model_checkpoint.step( - state=state, - current=self.records['result'][self.model_checkpoint.monitor] - ) - else: - self.model_checkpoint.step( - state=state, - current=None - ) - - # early_stopping - if self.early_stopping: - if self.early_stopping.monitor not in self.records['result']: - msg = ("There were expected keys in the eval result: " - f"{', '.join(list(self.records['result'].keys()))}, " - f"but get {self.early_stopping.monitor}." 
- ) - raise TypeError(msg) - self.early_stopping.step( - current=self.records['result'][self.early_stopping.monitor]) - if self.early_stopping.stop_training: - break - if torch.cuda.is_available(): - torch.cuda.empty_cache() - if self.writer: - self.writer.close() - - def build_state_object(self, **kwargs): - ''' - save state object - ''' - states = { - 'model': self.model.module if hasattr(self.model, "module") else self.model, - 'opts': self.opts, - 'optimizer': self.optimizer, - 'global_step': self.global_step, - } - if self.scheduler is not None: - states['scheduler'] = self.scheduler - if self.use_amp: - states['scaler'] = self.scaler - for key, value in kwargs.items(): - if key not in states: - states[key] = value - return states - - def resume_from_checkpoint(self, resume_path=None): - ''' - Check if continuing training from a checkpoint - ''' - if resume_path is not None: - optimizer_path = os.path.join(resume_path, OPTIMIZER_NAME) - scheduler_path = os.path.join(resume_path, SCHEDULER_NAME) - state_path = os.path.join(resume_path, TRAINER_STATE_NAME) - model_path = os.path.join(resume_path, WEIGHTS_NAME) - scaler_path = os.path.join(resume_path, SCALER_NAME) - if is_file(optimizer_path): - self.optimizer.load_state_dict(torch.load(optimizer_path, map_location=self.device)) - if is_file(scheduler_path): - self.scheduler.load_state_dict(torch.load(scheduler_path)) - if is_file(state_path): - state = torch.load(state_path) - if self.model_checkpoint and hasattr(state, 'best_score'): - self.model_checkpoint.best = state['best_score'] - del state - if is_file(model_path): - if self.use_amp and is_file(scaler_path): - self.scaler.load_state_dict(torch.load(scaler_path)) - load_model(self.model, model_path, device=self.device) - - def print_summary(self, examples, t_total): - ''' - print training parameters information - ''' - # self.logger.info("Training/evaluation parameters %s", self.opts) - self.logger.info("***** Running training %s *****", self.opts.task_name) - self.logger.info(" Options = %s", self.opts) - self.logger.info(" Model type = %s", self.opts.model_type) - self.logger.info(" Num examples = %d", examples) - self.logger.info(" Num Epochs = %d", self.num_train_epochs) - self.logger.info(" Instantaneous batch size per GPU = %d", self.opts.per_gpu_train_batch_size) - self.logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - self.opts.per_gpu_train_batch_size * self.device_num * self.gradient_accumulation_steps) - self.logger.info(" Gradient Accumulation steps = %d", self.gradient_accumulation_steps) - self.logger.info(" Total optimization steps = %d", t_total) - self.logger.info(" Total Number of Parameters: %d" % sum(p.numel() for p in self.model.parameters())) - # Calculating total number of trainable params - self.logger.info(" Total Number of Trainable Parameters: %d " % sum( - p.numel() for p in self.model.parameters() if p.requires_grad)) - - def print_evaluate_result(self): - ''' - 打印evaluation结果, - ''' - if len(self.records['result']) == 0: - self.logger.warning("eval result record is empty") - self.logger.info("***** Evaluating results of %s *****", self.opts.task_name) - self.logger.info(" global step = %s", self.global_step) - print_result = [] - for key, value in self.records['result'].items(): - if isinstance(value, (int, float)): - print_result.insert(0, [key, value]) - elif isinstance(value, pd.DataFrame): - print_result.append([key, value]) - else: - print_result.append([key, value]) - for key, value in print_result: - if isinstance(value, pd.DataFrame): - self.logger.info(f" %s : \n %s", key, str(round(value, 5))) - else: - self.logger.info(f" %s = %s", key, str(round(value, 5))) - name = "_".join(key.split("_")[1:]) if "_" in key else key - self.writer.add_scalar(f"{name}/{key}", value, int(self.global_step / self.opts.logging_steps)) - - def save_predict_result(self, data, file_name, save_dir=None): - ''' - 保存预测信息 - ''' - if save_dir is None: - save_dir = self.opts.output_dir - elif not os.path.isdir(save_dir): - save_dir = os.path.join(self.opts.output_dir, save_dir) - file_path = os.path.join(save_dir, file_name) - if ".pkl" in file_path: - save_pickle(file_path=file_path, data=data) - elif ".json" in file_path: - json_to_text(file_path=file_path, data=data) - else: - raise ValueError("file type: expected one of (.pkl, .json)") - - def evaluate(self, dev_data, prefix_metric=None, save_dir=None, save_result=False, file_name=None): - ''' - Evaluate the model on a validation set - ''' - all_batch_list = [] - eval_dataloader = self.build_eval_dataloader(dev_data) - self.build_record_tracker() - self.reset_metrics() - pbar = ProgressBar(n_total=len(eval_dataloader), desc='Evaluating') - for step, batch in enumerate(eval_dataloader): - batch = self.predict_forward(batch) - if 'loss' in batch and batch['loss'] is not None: - self.records['loss_meter'].update(batch['loss'], n=1) - all_batch_list.append(batch) - pbar.step(step) - self.records['result']['eval_loss'] = self.records['loss_meter'].avg - self.update_metrics(all_batch_list, prefix_metric) - self.print_evaluate_result() - if save_result: - if file_name is None: file_name = f"dev_eval_results.pkl" - self.save_predict_result(data=all_batch_list, file_name=file_name, save_dir=save_dir) - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - def update_metrics(self, all_batch_list, prefix): - eval_data = self.build_batch_concat(all_batch_list, dim=0) - prefix = '' if prefix is None else prefix + "_" - for metric in self.metrics: - metric.update(preds=eval_data['preds'], target=eval_data['target']) - value = metric.value() - if isinstance(value, float): - self.records['result'][f'{prefix}eval_{metric.name()}'] = value - elif isinstance(value, dict): - self.records['result'].update({f"{prefix}eval_{k}": v for k, v in value.items()}) - elif value is None: - self.logger.info(f"{metric.name()} 
value is None") - else: - msg = "metric value type: expected one of (float, dict,None)" - raise ValueError(msg) - - def predict(self, test_data, save_result=True, file_name=None, save_dir=None): - ''' - test数据集预测 - ''' - all_batch_list = [] - test_dataloader = self.build_test_dataloader(test_data) - pbar = ProgressBar(n_total=len(test_dataloader), desc='Predicting') - for step, batch in enumerate(test_dataloader): - batch = self.predict_forward(batch) - all_batch_list.append(batch) - pbar.step(step) - if save_result: - if file_name is None: file_name = f"test_predict_results.pkl" - self.save_predict_result(data=all_batch_list, file_name=file_name, save_dir=save_dir) - - def predict_forward(self, batch): - self.model.eval() - inputs = self.build_batch_inputs(batch) - with torch.no_grad(): - outputs = self.model(**inputs) - if 'loss' in outputs and outputs['loss'] is not None: - outputs['loss'] = outputs['loss'].mean().detach().item() - outputs = {key: value.detach().cpu() if isinstance(value, torch.Tensor) else value for key, value in - outputs.items()} - batch = {key: value for key, value in dict(batch, **outputs).items() if - key not in self.keys_to_ignore_on_result_save} - return batch - - def build_batch_concat(self, all_batch_list): - raise NotImplementedError('Method [build_batch_concat] should be implemented.') diff --git a/torchblocks/core/utils.py b/torchblocks/core/utils.py deleted file mode 100755 index 6fab86a..0000000 --- a/torchblocks/core/utils.py +++ /dev/null @@ -1,3 +0,0 @@ -import importlib.util -def is_apex_available(): - return importlib.util.find_spec("apex") is not None diff --git a/torchblocks/data/__init__.py b/torchblocks/data/__init__.py deleted file mode 100755 index 23aa91a..0000000 --- a/torchblocks/data/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .embedding import * -from .Vocabulary import * -from .dataset import * -from .process_base import * \ No newline at end of file diff --git a/torchblocks/data/dataset.py b/torchblocks/data/dataset.py deleted file mode 100755 index a9dd009..0000000 --- a/torchblocks/data/dataset.py +++ /dev/null @@ -1,118 +0,0 @@ -import os -import torch -import logging -from tqdm import tqdm -from typing import List, Dict, Callable, Any -from torchblocks.utils.paths import check_file, is_file - -logger = logging.getLogger(__name__) - - -class DatasetBase(torch.utils.data.Dataset): - keys_to_truncate_on_dynamic_batch = ['input_ids', 'attention_mask', 'token_type_ids'] - - def __init__(self, - data_name, - data_dir, - data_type, - process_piplines: List[Callable], - max_examples: int = None, - use_cache: bool = False, - collate_dynamic: bool = True, - cached_features_file: str = None, - overwrite_cache: bool = False, - **kwargs) -> None: - super().__init__() - self.data_dir = data_dir - file_path = data_name - if not is_file(data_name): file_path = os.path.join(data_dir, data_name) - check_file(file_path) - self.examples = self.create_examples(self.read_data(file_path), data_type) - if max_examples is not None: self.examples = self.examples[: max_examples] - self.process_piplines = process_piplines if isinstance(process_piplines, list) else [process_piplines] - self.num_examples = len(self.examples) - self.num_labels = len(self.get_labels()) - self.use_cache = use_cache - self.collate_dynamic = collate_dynamic - self.cached_features_file = cached_features_file - self.overwrite_cache = overwrite_cache - if self.use_cache: - if cached_features_file is None: - cached_features_file = f'{data_type}.cache' - cached_features_file = 
os.path.join(self.data_dir, cached_features_file) - self.create_features_cache(cached_features_file) - - def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: - if self.use_cache: - feature = self.features[index] - else: - feature = self.process_example(self.examples[index]) - return feature - - def __len__(self): - return self.num_examples - - def create_features_cache(self, cached_features_file): - if is_file(cached_features_file) and not self.overwrite_cache: - logger.info(f"Loading features from cached file {cached_features_file}") - self.features = torch.load(self.cached_features_file) - else: - logger.info(f"Creating features from dataset file at {self.data_dir}") - self.features = [ - self.process_example(example) for example in - tqdm(self.examples, total=self.num_examples, desc="Converting examples to features...")] - logger.info(f"Saving features to cached file {cached_features_file}") - torch.save(self.features, cached_features_file) - - @classmethod - def get_labels(self) -> List[str]: - raise NotImplementedError('Method [get_labels] should be implemented.') - - @classmethod - def label2id(cls): - return {label: i for i, label in enumerate(cls.get_labels())} - - @classmethod - def id2label(cls): - return {i: label for i, label in enumerate(cls.get_labels())} - - def read_data(self, input_file: str) -> Any: - raise NotImplementedError('Method [read_data] should be implemented.') - - def create_examples(self, data: Any, set_type: str, **kwargs) -> List[Dict[str, Any]]: - raise NotImplementedError('Method [create_examples] should be implemented.') - - def process_example(self, example: Dict[str, Any]) -> Dict[str, torch.Tensor]: - for proc in self.process_piplines: - if proc is None: continue - example = proc(example) - return example - - def collate_fn(self, features: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: - batch = {} - first = features[0] - max_input_length = first['input_ids'].size(0) - if self.collate_dynamic: - max_input_length = max([torch.sum(f["attention_mask"]) for f in features]) - if "label" in first and first["label"] is not None: - label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"] - dtype = torch.long if isinstance(label, int) else torch.float - batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype) - elif "label_ids" in first and first["label_ids"] is not None: - if isinstance(first["label_ids"], torch.Tensor): - batch["labels"] = torch.stack([f["label_ids"] for f in features]) - else: - dtype = torch.long if type(first["label_ids"][0]) is int else torch.float - batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) - # Handling of all other possible keys. - # Again, we will use the first element to figure out which key/values are not None for this model. 
- for k, v in first.items(): - if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): - bv = torch.stack([f[k] for f in features]) if isinstance(v, torch.Tensor) else torch.tensor( - [f[k] for f in features]) - batch[k] = bv - if self.collate_dynamic: - for k in self.keys_to_truncate_on_dynamic_batch: - if k not in batch: continue - if batch[k].dim() >= 2: batch[k] = batch[k][:, : max_input_length] - return batch diff --git a/torchblocks/data/process_base.py b/torchblocks/data/process_base.py deleted file mode 100755 index 7bab8c5..0000000 --- a/torchblocks/data/process_base.py +++ /dev/null @@ -1,5 +0,0 @@ -class ProcessBase(object): - """ 用于处理单个example """ - - def __call__(self, example): - raise NotImplementedError('Method [__call__] should be implemented.') diff --git a/torchblocks/layers/char.py b/torchblocks/layers/char.py deleted file mode 100755 index c50281f..0000000 --- a/torchblocks/layers/char.py +++ /dev/null @@ -1,80 +0,0 @@ -import torch -import torch.nn as nn - - -class ConvolutionCharEncoder(nn.Module): - r""" - char级别的卷积编码器. - """ - def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5)): - r""" - :param int char_emb_size: char级别embedding的维度. Default: 50 - :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. - :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter. - :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核. - :param initial_method: 初始化参数的方式, 默认为`xavier normal` - """ - super(ConvolutionCharEncoder, self).__init__() - self.convs = nn.ModuleList([ - nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, - padding=(0, kernels[i] // 2)) - for i in range(len(kernels))]) - - def forward(self, x): - r""" - :param torch.Tensor x: ``[batch_size * sent_length, word_length, char_emb_size]`` 输入字符的embedding - :return: torch.Tensor : 卷积计算的结果, 维度为[batch_size * sent_length, sum(feature_maps), 1] - """ - x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) - # [batch_size*sent_length, channel, width, height] - x = x.transpose(2, 3) - # [batch_size*sent_length, channel, height, width] - return self._convolute(x).unsqueeze(2) - - def _convolute(self, x): - feats = [] - for conv in self.convs: - y = conv(x) - # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1] - y = torch.squeeze(y, 2) - # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1] - y = torch.tanh(y) - y, __ = torch.max(y, 2) - # [batch_size*sent_length, feature_maps[i]] - feats.append(y) - return torch.cat(feats, 1) # [batch_size*sent_length, sum(feature_maps)] - - -class LSTMCharEncoder(nn.Module): - r""" - char级别基于LSTM的encoder. - """ - - def __init__(self, char_emb_size=50, hidden_size=None): - r""" - :param int char_emb_size: char级别embedding的维度. Default: 50 - 例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. 
- :param int hidden_size: LSTM隐层的大小, 默认为char的embedding维度 - :param initial_method: 初始化参数的方式, 默认为`xavier normal` - """ - super(LSTMCharEncoder, self).__init__() - self.hidden_size = char_emb_size if hidden_size is None else hidden_size - self.lstm = nn.LSTM(input_size=char_emb_size, - hidden_size=self.hidden_size, - num_layers=1, - bias=True, - batch_first=True) - - def forward(self, x): - r""" - :param torch.Tensor x: ``[ n_batch*n_word, word_length, char_emb_size]`` 输入字符的embedding - :return: torch.Tensor : [ n_batch*n_word, char_emb_size]经过LSTM编码的结果 - """ - batch_size = x.shape[0] - h0 = torch.empty(1, batch_size, self.hidden_size) - h0 = nn.init.orthogonal_(h0) - c0 = torch.empty(1, batch_size, self.hidden_size) - c0 = nn.init.orthogonal_(c0) - - _, hidden = self.lstm(x, (h0, c0)) - return hidden[0].squeeze().unsqueeze(2) diff --git a/torchblocks/layers/dropouts.py b/torchblocks/layers/dropouts.py deleted file mode 100755 index 0efeeff..0000000 --- a/torchblocks/layers/dropouts.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -import torch.nn as nn - - -class SpatialDropout(nn.Dropout2d): - def __init__(self, p=0.5): - super(SpatialDropout, self).__init__(p=p) - - def forward(self, input): - input = input.unsqueeze(2) # (N, T, 1, K) - input = input.permute(0, 3, 2, 1) # (N, K, 1, T) - input = super(SpatialDropout, self).forward(input) # (N, K, 1, T), some features are masked - input = input.permute(0, 3, 2, 1) # (N, T, 1, K) - return input.squeeze(2) # (N, T, K) - - -class MultiSampleDropout(nn.Module): - ''' - # multisample dropout (wut): https://arxiv.org/abs/1905.09788 - ''' - - def __init__(self, hidden_size, num_labels, K=5, p=0.5): - super().__init__() - self.K = K - self.dropout = nn.Dropout(p) - self.classifier = nn.Linear(hidden_size, num_labels) - - def forward(self, input): - logits = torch.stack([self.classifier(self.dropout(input)) for _ in range(self.K)], dim=0) - logits = torch.mean(logits, dim=0) - return logits - - -class TimestepDropout(torch.nn.Dropout): - r""" - 传入参数的shape为 ``(batch_size, num_timesteps, embedding_dim)`` - 使用同一个shape为 ``(batch_size, embedding_dim)`` 的mask在每个timestamp上做dropout。 - """ - - def forward(self, x): - dropout_mask = x.new_ones(x.shape[0], x.shape[-1]) - torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True) - dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim] - if self.inplace: - x *= dropout_mask - return - else: - return x * dropout_mask \ No newline at end of file diff --git a/torchblocks/layers/gate.py b/torchblocks/layers/gate.py deleted file mode 100755 index 6d4ae74..0000000 --- a/torchblocks/layers/gate.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Gate(nn.Module): - """Gate Unit - g = sigmoid(Wx) - x = g * x - """ - - def __init__(self, input_size, dropout_rate=0.): - super(Gate, self).__init__() - self.linear = nn.Linear(input_size, input_size, bias=False) - self.dropout_rate = dropout_rate - - def forward(self, x): - """ - Args: - x: batch * len * dim - x_mask: batch * len (1 for padding, 0 for true) - Output: - res: batch * len * dim - """ - if self.dropout_rate: - x = F.dropout(x, p=self.dropout_rate, training=self.training) - x_proj = self.linear(x) - gate = torch.sigmoid(x) - return x_proj * gate diff --git a/torchblocks/metrics/classification/accuracy.py b/torchblocks/metrics/classification/accuracy.py deleted file mode 100755 index d1f8f32..0000000 --- a/torchblocks/metrics/classification/accuracy.py +++ /dev/null 
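# Shape-level sketch of the dropout variants above, with made-up sizes:
# SpatialDropout zeroes whole embedding channels across every timestep, and
# MultiSampleDropout averages the logits of K independently dropped copies of
# the same pooled vector.
import torch
import torch.nn as nn

batch, seq_len, hidden, num_labels, K = 4, 16, 32, 3, 5

token_states = torch.randn(batch, seq_len, hidden)
spatial = nn.Dropout2d(p=0.5)                                 # the op SpatialDropout wraps
dropped = spatial(token_states.unsqueeze(2).permute(0, 3, 2, 1))
dropped = dropped.permute(0, 3, 2, 1).squeeze(2)              # back to (batch, seq_len, hidden)

pooled = token_states[:, 0]                                   # CLS-style pooled vector
classifier = nn.Linear(hidden, num_labels)
dropout = nn.Dropout(0.5)
logits = torch.stack([classifier(dropout(pooled)) for _ in range(K)], dim=0).mean(dim=0)

print(dropped.shape, logits.shape)                            # (4, 16, 32) and (4, 3)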
@@ -1,44 +0,0 @@ -import torch -import logging -from torchblocks.metrics.base import Metric -from torchmetrics.classification.accuracy import Accuracy as _Accuracy - -logger = logging.getLogger(__name__) - - -class Accuracy(_Accuracy, Metric): - ''' - Computes accuracy. Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - Args: - threshold: - Threshold value for binary or multi-label logits. default: 0.5 - ''' - - def __init__(self, threshold=0.5, - num_classes=None, - average="micro", - ignore_index=None, - top_k=None, - multiclass=None): - super(Accuracy, self).__init__(threshold=threshold, - num_classes=num_classes, - average=average, - ignore_index=ignore_index, - top_k=top_k, - multiclass=multiclass) - self.reset() - - def reset(self): - self.tp = torch.zeros([], dtype=torch.long) - self.fp = torch.zeros([], dtype=torch.long) - self.tn = torch.zeros([], dtype=torch.long) - self.fn = torch.zeros([], dtype=torch.long) - - def value(self): - score = self.compute() - return score.item() - - def name(self): - return 'acc' diff --git a/torchblocks/metrics/classification/auc.py b/torchblocks/metrics/classification/auc.py deleted file mode 100755 index ef0cb8f..0000000 --- a/torchblocks/metrics/classification/auc.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging -from torchblocks.metrics.base import Metric -from torchmetrics.classification.auroc import AUROC as _AUROC - -logger = logging.getLogger(__name__) - - -class AUC(_AUROC, Metric): - ''' - Area Under Curve - ''' - - def __init__(self, num_classes=None, - pos_label=None, - average="macro"): - super(AUC, self).__init__(num_classes=num_classes, - pos_label=pos_label, - average=average) - self.reset() - - def reset(self): - self.add_state("preds", default=[], dist_reduce_fx="cat") - self.add_state("target", default=[], dist_reduce_fx="cat") - - def value(self): - score = self.compute() - return score.item() - - def name(self): - return 'auc' diff --git a/torchblocks/metrics/classification/f1_score.py b/torchblocks/metrics/classification/f1_score.py deleted file mode 100755 index 34b05b0..0000000 --- a/torchblocks/metrics/classification/f1_score.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch -import logging -from torchblocks.metrics.base import Metric -from torchmetrics.classification.f_beta import F1 as _F1 - -logger = logging.getLogger(__name__) - - -class F1Score(_F1, Metric): - ''' - F1 Score - ''' - - def __init__(self, num_classes=None, - threshold=0.5, - average="micro", - ignore_index=None, - top_k=None, - multiclass=None): - super(F1Score, self).__init__(top_k=top_k, - average=average, - num_classes=num_classes, - threshold=threshold, - ignore_index=ignore_index, - multiclass=multiclass) - - self.reset() - - def reset(self): - default = lambda: [] - reduce_fn = None - for s in ("tp", "fp", "tn", "fn"): - self.add_state(s, default=default(), dist_reduce_fx=reduce_fn) - - def value(self): - score = self.compute() - return score.item() - - def name(self): - return 'f1' diff --git a/torchblocks/metrics/classification/matthews_corrcoef.py b/torchblocks/metrics/classification/matthews_corrcoef.py deleted file mode 100755 index 8b289b6..0000000 --- a/torchblocks/metrics/classification/matthews_corrcoef.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch -import logging -from torchblocks.metrics.base import Metric -from torchmetrics.classification.matthews_corrcoef import MatthewsCorrcoef as _MatthewsCorrcoef - -logger 
= logging.getLogger(__name__) - - -class MattewsCorrcoef(_MatthewsCorrcoef, Metric): - ''' - Matthews Correlation Coefficient - ''' - - def __init__(self, num_classes, threshold=0.5): - super(MattewsCorrcoef, self).__init__(num_classes=num_classes, threshold=threshold) - - def reset(self): - default = torch.zeros(self.num_classes, self.num_classes) - self.add_state("confmat", default=default, dist_reduce_fx="sum") - - def value(self): - score = self.compute() - return score.item() - - def name(self): - return 'mcc' diff --git a/torchblocks/metrics/sequence_labeling/scheme.py b/torchblocks/metrics/sequence_labeling/scheme.py deleted file mode 100755 index 8c92483..0000000 --- a/torchblocks/metrics/sequence_labeling/scheme.py +++ /dev/null @@ -1,95 +0,0 @@ -def get_spans_bios(tags, id2label=None): - """Gets entities from sequence. - note: BIOS - Args: - tags (list): sequence of labels. - Returns: - list: list of (chunk_type, chunk_start, chunk_end). - Example: - >>> tags = ['B-PER', 'I-PER', 'O', 'S-LOC'] - >>> get_spans_bios(tags) - # output: [['PER', 0,1], ['LOC', 3, 3]] - """ - chunks = [] - chunk = [-1, -1, -1] - for indx, tag in enumerate(tags): - if not isinstance(tag, str): - tag = id2label[tag] - if tag.startswith("S-"): - if chunk[2] != -1: - chunks.append(chunk) - chunk = [-1, -1, -1] - chunk[1] = indx - chunk[2] = indx - chunk[0] = tag.split('-')[1] - chunks.append(chunk) - chunk = (-1, -1, -1) - if tag.startswith("B-"): - if chunk[2] != -1: - chunks.append(chunk) - chunk = [-1, -1, -1] - chunk[1] = indx - chunk[0] = tag.split('-')[1] - elif tag.startswith('I-') and chunk[1] != -1: - _type = tag.split('-')[1] - if _type == chunk[0]: - chunk[2] = indx - if indx == len(tags) - 1: - chunks.append(chunk) - else: - if chunk[2] != -1: - chunks.append(chunk) - chunk = [-1, -1, -1] - return chunks - - -def get_spans_bio(tags, id2label=None): - """Gets entities from sequence. - Args: - tags (list): sequence of labels. - Returns: - list: list of (chunk_type, chunk_start, chunk_end). - Example: - >>> tags = ['B-PER', 'I-PER', 'O', 'B-LOC'] - >>> get_spans_bio(tags) - # output [['PER', 0,1], ['LOC', 3, 3]] - """ - chunks = [] - chunk = [-1, -1, -1] - for indx, tag in enumerate(tags): - if not isinstance(tag, str): - tag = id2label[tag] - if tag.startswith("B-"): - if chunk[2] != -1: - chunks.append(chunk) - chunk = [-1, -1, -1] - chunk[1] = indx - chunk[0] = tag.split('-')[1] - elif tag.startswith('I-') and chunk[1] != -1: - _type = tag.split('-')[1] - if _type == chunk[0]: - chunk[2] = indx - if indx == len(tags) - 1: - chunks.append(chunk) - else: - if chunk[2] != -1: - chunks.append(chunk) - chunk = [-1, -1, -1] - return chunks - - -TYPE_TO_SCHEME = { - "BIO": get_spans_bio, - "BIOS": get_spans_bios, -} - - -def get_scheme(scheme_type): - if scheme_type not in TYPE_TO_SCHEME: - msg = ("There were expected keys in the `TYPE_TO_SCHEME`: " - f"{', '.join(list(TYPE_TO_SCHEME.keys()))}, " - f"but get {scheme_type}." - ) - raise TypeError(msg) - scheme_function = TYPE_TO_SCHEME[scheme_type] - return scheme_function diff --git a/torchblocks/models/utils.py b/torchblocks/models/utils.py deleted file mode 100755 index ea36b36..0000000 --- a/torchblocks/models/utils.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch -import torch.nn as nn - - -def open_all_layers(model): - r"""Open all layers in model for training. 
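# Sketch tying the BIO decoding above back to raw text, assuming get_spans_bio
# from scheme.py (shown above) is in scope; the sentence and tags are made-up
# illustrations. Each span is [label, start, end] with inclusive character indices.
text = "张三住在北京"
tags = ["B-PER", "I-PER", "O", "O", "B-LOC", "I-LOC"]

entities = [{"label": label, "start": start, "end": end, "text": text[start:end + 1]}
            for label, start, end in get_spans_bio(tags)]

print(entities)
# [{'label': 'PER', 'start': 0, 'end': 1, 'text': '张三'},
#  {'label': 'LOC', 'start': 4, 'end': 5, 'text': '北京'}]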
- - Examples:: - >>> open_all_layers(model) - """ - model.train() - for p in model.parameters(): - p.requires_grad = True - - -def freeze_to(n, model): - """Freeze first n layers of model - * **n** - Starting from initial layer, freeze all layers up to nth layer inclusively - """ - layers = list(model.parameters()) - # Freeze up to n layers - for param in layers[:n]: - param.requires_grad = False - for param in layers[n:]: - param.requires_grad = True - - -def open_specified_layers(model, open_layers): - r"""Open specified layers in model for training while keeping - other layers frozen. - - Args: - model (nn.Module): neural net model. - open_layers (str or list): layers open for training. - - Examples:: - >>> # Only model.classifier will be updated. - >>> open_layers = 'classifier' - >>> open_specified_layers(model, open_layers) - >>> # Only model.fc and model.classifier will be updated. - >>> open_layers = ['fc', 'classifier'] - >>> open_specified_layers(model, open_layers) - """ - if isinstance(model, nn.DataParallel): - model = model.module - - if isinstance(open_layers, str): - open_layers = [open_layers] - - for layer in open_layers: - assert hasattr( - model, layer - ), '"{}" is not an attribute of the model, please provide the correct name'.format( - layer - ) - for name, module in model.named_children(): - if name in open_layers: - module.train() - for p in module.parameters(): - p.requires_grad = True - else: - module.eval() - for p in module.parameters(): - p.requires_grad = False diff --git a/torchblocks/tasks/sequence_classification.py b/torchblocks/tasks/sequence_classification.py deleted file mode 100755 index 1af424b..0000000 --- a/torchblocks/tasks/sequence_classification.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers import BertPreTrainedModel, BertModel -from transformers.modeling_outputs import SequenceClassifierOutput - - -class Pooling(nn.Module): - def __init__(self, hidden_size, pooling_mode='cls', last_layers=None): - super(Pooling, self).__init__() - assert pooling_mode in ['mean', 'max', 'cls', 'mean_sqrt'] - self.hidden_size = hidden_size - self.last_layers = last_layers - self.pooling_mode = pooling_mode - self.pooling_output_dimension = hidden_size if last_layers is None else hidden_size * last_layers - - def forward(self, features, attention_mask): - sequence_outputs = features['last_hidden_state'] - cls_outputs = features['pooler_output'] - hidden_outputs = features['hidden_states'] - if self.last_layers is not None: - sequence_outputs = torch.cat([hidden_outputs[-i] for i in range(1, self.last_layers + 1)], dim=-1) - if self.pooling_mode == 'cls': - vectors = cls_outputs - if self.pooling_mode == 'max': - input_mask_expanded = attention_mask.unsqueeze(-1).expand(sequence_outputs.size()).float() - sequence_outputs[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value - vectors = torch.max(sequence_outputs, 1)[0] - if self.pooling_mode in ['mean', 'mean_sqrt']: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(sequence_outputs.size()).float() - sum_embeddings = torch.sum(sequence_outputs * input_mask_expanded, 1) - sum_mask = input_mask_expanded.sum(1) - sum_mask = torch.clamp(sum_mask, min=1e-9) - if self.pooling_mode == 'mean': - vectors = sum_embeddings / sum_mask - if self.pooling_mode == 'mean_sqrt': - vectors = sum_embeddings / torch.sqrt(sum_mask) - return vectors - - def get_pooling_output_dimension(self): - 
return self.pooling_output_dimension - - -class BertForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.dropout = nn.Dropout(classifier_dropout) - self.pooling = Pooling(hidden_size=config.hidden_size, - pooling_mode=config.pooling_mode, - last_layers=config.last_layers) - pooling_output_dimension = self.pooling.get_pooling_output_dimension() - self.classifier = nn.Linear(pooling_output_dimension, config.num_labels) - self.init_weights() - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=True, - return_dict=None, - ): - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - pooled_output = self.pooling(outputs, attention_mask) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/torchblocks/tasks/sequence_labeling_crf.py b/torchblocks/tasks/sequence_labeling_crf.py deleted file mode 100755 index e88f3e8..0000000 --- a/torchblocks/tasks/sequence_labeling_crf.py +++ /dev/null @@ -1,68 +0,0 @@ -import torch -import torch.nn as nn -from typing import * -from transformers.file_utils import ModelOutput -from torchblocks.layers.crf import CRF -from dataclasses import dataclass -from transformers import BertPreTrainedModel, BertModel - - -@dataclass -class SequenceLabelingOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - predictions: List[List[List[str]]] = None - groundtruths: List[List[List[str]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class BertCrfForSeqLabel(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.crf = CRF(num_tags=config.num_labels, batch_first=True) - self.init_weights() - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None - ): - outputs = self.bert( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - loss, groundtruths, predictions = None, None, None - if labels is not None: - loss = -1 * self.crf(emissions=logits, tags=labels, mask=attention_mask) - if not self.training: - groundtruths = self.decode(labels, attention_mask, is_logits=False) - if not self.training: # 训练时无需解码 - predictions = self.decode(logits, attention_mask) - return 
SequenceLabelingOutput( - loss=loss, - logits=logits, - predictions=predictions, - groundtruths=groundtruths, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def decode(self, logits_or_labels, mask, is_logits=True) -> List[List[List[str]]]: - decode_ids = logits_or_labels - if is_logits: - decode_ids = self.crf.decode(logits_or_labels, mask).squeeze(0) # (batch_size, seq_length) - decode_labels = [] - for ids, mask in zip(decode_ids, mask): - decode_label = [self.config.id2label[id.item()] for id, m in zip(ids, mask) if m > 0][1:-1] # [CLS], [SEP] - decode_labels.append(decode_label) - return decode_labels diff --git a/torchblocks/tasks/sequence_labeling_global_pointer.py b/torchblocks/tasks/sequence_labeling_global_pointer.py deleted file mode 100755 index d7c131f..0000000 --- a/torchblocks/tasks/sequence_labeling_global_pointer.py +++ /dev/null @@ -1,91 +0,0 @@ -from typing import * -from dataclasses import dataclass -import torch -import torch.nn as nn -from transformers import BertPreTrainedModel,BertModel -from transformers.file_utils import ModelOutput -from torchblocks.layers.position import PositionalEncoding -from torchblocks.losses.cross_entropy import MultiLabelCategoricalCrossEntropy - - -@dataclass -class GlobalPointOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - predictions: List[List[List[str]]] = None - groundtruths: List[List[List[str]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class BertGlobalPointerForSeqLabel(BertPreTrainedModel): # config.pe_dim=64 - def __init__(self, config): - super().__init__(config) - self.config = config - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.pe = PositionalEncoding(d_model=config.pe_dim, max_len=config.max_seq_length) - self.linear = nn.Linear(config.hidden_size, config.num_labels * config.pe_dim * 2) - self.init_weights() - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - outputs = self.bert( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids - ) - sequence_output = outputs[0] - batch_size, seq_length = input_ids.shape - sequence_output = self.dropout(sequence_output) - sequence_output = self.linear(sequence_output) - sequence_output = torch.split(sequence_output, self.config.pe_dim * 2, dim=-1) - # (batch_size, seq_len, num_labels, pe_dim * 2) - sequence_output = torch.stack(sequence_output, dim=-2) - # query, key: (batch_size, seq_len,num_labels, pe_dim) - query, key = sequence_output[..., :self.config.pe_dim], sequence_output[..., self.config.pe_dim:] - if self.config.use_rope: - pos_emb = self.pe(batch_size, seq_length) - cos_pos = pos_emb[..., None, 1::2].repeat_interleave(2, dim=-1) - sin_pos = pos_emb[..., None, ::2].repeat_interleave(2, dim=-1) - qw2 = torch.stack([-query[..., 1::2], query[..., ::2]], -1) - qw2 = qw2.reshape(query.shape) - query = query * cos_pos + qw2 * sin_pos - kw2 = torch.stack([-key[..., 1::2], key[..., ::2]], -1) - kw2 = kw2.reshape(key.shape) - key = key * cos_pos + kw2 * sin_pos - logits = torch.einsum('bmhd,bnhd->bhmn', query, key) # logits: (batch_size, ent_type_size, seq_len, seq_len) - # 构建mask - extended_attention_mask = attention_mask[:, None, None, :] * torch.triu(torch.ones_like(logits)) - extended_attention_mask = (1.0 - extended_attention_mask) * -1e12 - logits += extended_attention_mask - logits /= 
self.config.pe_dim ** 0.5 - loss, groundtruths, predictions = None, None, None - if labels is not None: - loss_fct = MultiLabelCategoricalCrossEntropy() - loss = loss_fct(preds=logits.reshape(batch_size * self.config.num_labels, -1), - target=labels.reshape(batch_size * self.config.num_labels, -1)) - if not self.training: # no need to decode at training time - groundtruths = self.decode(logits=labels) - if not self.training: # no need to decode at training time - predictions = self.decode(logits=logits) - return GlobalPointOutput(loss=loss, - logits=logits, - predictions=predictions, - groundtruths=groundtruths) - - def decode(self, logits): - all_entity_list = [] - batch_size = logits.size(0) - for bs in range(batch_size): - entity_list = [] - _logits = logits[bs].float() - _logits[:,[0,-1]] -= torch.tensor(float("inf")) - _logits[:,:,[0,-1]] -= torch.tensor(float("inf")) - for label_id, start_idx, end_idx in zip(*torch.where(_logits > self.config.decode_thresh)): - label_id, start_idx, end_idx = label_id.item(), start_idx.item(), end_idx.item() - label = self.config.id2label[label_id] - entity_list.append([start_idx - 1, end_idx - 1, label]) - all_entity_list.append(entity_list) - return all_entity_list diff --git a/torchblocks/tasks/sequence_labeling_softmax.py b/torchblocks/tasks/sequence_labeling_softmax.py deleted file mode 100755 index 401f29d..0000000 --- a/torchblocks/tasks/sequence_labeling_softmax.py +++ /dev/null @@ -1,69 +0,0 @@ -import torch -import torch.nn as nn -from typing import * -from torch.nn import CrossEntropyLoss -from transformers.file_utils import ModelOutput -from dataclasses import dataclass -from transformers import BertPreTrainedModel, BertModel - -@dataclass -class SequenceLabelingOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - predictions: List[List[List[str]]] = None - groundtruths: List[List[List[str]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - -class BertSoftmaxForSeqLabel(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None - ): - outputs = self.bert( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - loss, groundtruths, predictions = None, None, None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=0) - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.config.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - if not self.training: - groundtruths = self.decode(labels, attention_mask, is_logits=False) - if not self.training: # no need to decode at training time - predictions = self.decode(logits, attention_mask, is_logits=True) - return SequenceLabelingOutput( - loss=loss, - logits=logits, - predictions=predictions, - groundtruths=groundtruths, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def decode(self, logits, mask, is_logits=False) -> List[List[List[str]]]: - decode_ids = logits - if is_logits: - decode_ids = torch.argmax(logits, -1) # (batch_size, 
seq_length) - decode_labels = [] - for ids, mask in zip(decode_ids, mask): - decode_label = [self.config.id2label[id.item()] for id, m in zip(ids, mask) if m > 0][1:-1] # [CLS], [SEP] - decode_labels.append(decode_label) - return decode_labels diff --git a/torchblocks/tasks/sequence_labeling_span.py b/torchblocks/tasks/sequence_labeling_span.py deleted file mode 100755 index a4c51e7..0000000 --- a/torchblocks/tasks/sequence_labeling_span.py +++ /dev/null @@ -1,138 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from typing import * -from dataclasses import dataclass -from collections import defaultdict -from torchblocks.losses.span_loss import SpanLoss -from transformers import BertPreTrainedModel, BertModel -from transformers.file_utils import ModelOutput -from torchblocks.layers.linears import PoolerStartLogits, PoolerEndLogits - - -@dataclass -class SpanOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - start_logits: torch.FloatTensor = None - end_logits: torch.FloatTensor = None - predictions: List[List[Tuple[int, int, int]]] = None - groundtruths: List[List[Tuple[int, int, int]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class BertSpanForSeqLabel(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.start_fc = PoolerStartLogits(config.hidden_size, config.num_labels) - self.end_fc = PoolerEndLogits(config.hidden_size + config.num_labels, config.num_labels) - self.init_weights() - - def forward(self, - input_ids, - attention_mask=None, - token_type_ids=None, - start_positions=None, - end_positions=None, - ): - outputs = self.bert( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - start_logits = self.start_fc(sequence_output) - if self.training: - batch_size = input_ids.size(0) - seq_len = input_ids.size(1) - label_logits = torch.zeros([batch_size, seq_len, self.config.num_labels]) - label_logits = label_logits.to(input_ids.device) - label_logits.scatter_(2, start_positions.unsqueeze(2), 1) - else: - label_logits = F.softmax(start_logits, -1) - end_logits = self.end_fc(sequence_output, label_logits) - loss, predictions, groundtruths = None, None, None - if start_positions is not None and end_positions is not None: - loss_fct = SpanLoss() - loss = loss_fct(preds=(start_logits, end_logits), - target=(start_positions, end_positions), - masks=attention_mask) - if not self.training: # no need to decode at training time - groundtruths = self.decode( - start_positions, end_positions, attention_mask, is_logits=False - ) - if not self.training: # no need to decode at training time - predictions = self.decode( - start_logits, end_logits, attention_mask, - start_thresh=getattr(self.config, "start_thresh", 0.0), - end_thresh=getattr(self.config, "end_thresh", 0.0), - ) - return SpanOutput( - loss=loss, - start_logits=start_logits, - end_logits=end_logits, - predictions=predictions, - groundtruths=groundtruths, - hidden_states=None, - attentions=None - ) - - def decode(self, start_logits_or_labels, end_logits_or_labels, sequence_mask, - start_thresh=0.0, end_thresh=0.0, is_logits=True, **kwargs): - """ - Params: - start_logits_or_labels: tensor(batch_size, sequence_length, num_labels) - end_logits_or_labels: tensor(batch_size, sequence_length, num_labels) - sequence_mask: 
tensor(batch_size, sequence_length) - Returns: - predictions: List[List[Tuple[int, int, int]]] - """ - other_id = self.config.label2id["O"] - id2label = self.config.id2label - max_span_length = kwargs.get("max_span_length", float("inf")) - if is_logits: # reuse this decode path for logits as well as labels - # TODO: first use the probability to decide whether the token belongs to an entity - start_probs = start_logits_or_labels.softmax(dim=-1) # (batch_size, sequence_length, num_labels) - other_probs = start_probs[..., other_id] # (batch_size, sequence_length) - other_probs = torch.where(other_probs < start_thresh, - torch.zeros_like(other_probs), other_probs) - start_probs[..., other_id] = other_probs - start_probs, start_labels = start_probs.max(dim=-1) - - end_probs = end_logits_or_labels.softmax(dim=-1) # (batch_size, sequence_length, num_labels) - other_probs = end_probs[..., other_id] # (batch_size, sequence_length) - other_probs = torch.where(other_probs < end_thresh, - torch.zeros_like(other_probs), other_probs) - end_probs[..., other_id] = other_probs - end_probs, end_labels = end_probs.max(dim=-1) - - else: - start_labels, end_labels = start_logits_or_labels, end_logits_or_labels - decode_labels = [] - batch_size = sequence_mask.size(0) - for i in range(batch_size): - decode_labels.append([]) - label_start_map = defaultdict(list) # keep a start marker per label type to handle overlapping entities, e.g.: - # start: [0, 0, 1, 0, 2, 0, 0, 0] - # end: [0, 0, 0, 0, 0, 2, 1, 0] - for pos, (s, e, m) in enumerate(zip(start_labels[i], end_labels[i], sequence_mask[i])): - s, e, m = s.item(), e.item(), m.item() - if m == 0: break - if s != other_id: - label_start_map[s].append(pos) # two purposes: - # 1. mark that a span of type s has started, via label_start_map[s]; - # 2. if another start of type s occurs inside an s span, append its position as well - if e != other_id: # inside a span of type e (also covers single-token entities) - for start in label_start_map[e]: - start, end, label = start - 1, pos, id2label[e] # offset for [CLS] - if end - start < max_span_length: - decode_labels[-1].append((start, end, label)) # an end position closes the span, so record it - label_start_map[e] = list() - # TODO: strict matching strategy, i.e. every start must be matched by an end - for k, v in label_start_map.items(): - if v is not None: - pass - return decode_labels diff --git a/torchblocks/tasks/siamese_classification.py b/torchblocks/tasks/siamese_classification.py deleted file mode 100755 index 35b746f..0000000 --- a/torchblocks/tasks/siamese_classification.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from typing import * -import torch.nn as nn -from dataclasses import dataclass -from torch.nn import CrossEntropyLoss -from transformers import BertPreTrainedModel, BertModel -from transformers.file_utils import ModelOutput - - -@dataclass -class SiameseClassificatioOutput(ModelOutput): - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - - -class BertForSiameseClassification(BertPreTrainedModel): - - def __init__(self, config): - super(BertForSiameseClassification, self).__init__(config) - self.num_labels = config.num_labels - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.seq_relationship = nn.Linear(config.hidden_size * 3, config.num_labels) - self.init_weights() - - def forward(self, input_ids_a, input_ids_b, - token_type_ids_a=None, - token_type_ids_b=None, - attention_mask_a=None, - attention_mask_b=None, - labels=None): - outputs_a = self.bert(input_ids_a, - token_type_ids=token_type_ids_a, - attention_mask=attention_mask_a) - outputs_b = self.bert(input_ids_b, - token_type_ids=token_type_ids_b, - attention_mask=attention_mask_b) - pooled_output = torch.cat([outputs_a[1], outputs_b[1], torch.abs(outputs_a[1] - outputs_b[1])], dim=1) - pooled_output = 
self.dropout(pooled_output) - logits = self.seq_relationship(pooled_output) - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return SiameseClassificatioOutput( - loss=loss, - logits=logits, - ) diff --git a/torchblocks/utils/__init__.py b/torchblocks/utils/__init__.py deleted file mode 100755 index a4c0e0c..0000000 --- a/torchblocks/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .device import * -from .tensor import * -from .paths import * -from .options import * -from .common import * -from .logger import * -from .meter import * -from .seed import * \ No newline at end of file diff --git a/torchblocks/utils/common.py b/torchblocks/utils/common.py deleted file mode 100755 index f8507c2..0000000 --- a/torchblocks/utils/common.py +++ /dev/null @@ -1,3 +0,0 @@ -def check_object_type(object, check_type, name): - if not isinstance(object, check_type): - raise TypeError(f"The type of {name} must be {check_type}, but got {type(object)}.") diff --git a/torchblocks/utils/options.py b/torchblocks/utils/options.py deleted file mode 100755 index 5b1c066..0000000 --- a/torchblocks/utils/options.py +++ /dev/null @@ -1,230 +0,0 @@ -import os -import json -import argparse -from pathlib import Path -from torchblocks.utils.paths import save_json, check_file - - -class Argparser(argparse.ArgumentParser): - def __init__(self, **kwargs): - super(Argparser, self).__init__(**kwargs) - - @classmethod - def get_training_parser(cls, description='Arguments'): - parser = cls(description=description, add_help=True) - parser.arguments_required() - parser.arguments_common() - parser.arguments_input_file() - parser.arguments_dataset() - parser.arguments_dataloader() - parser.arguments_pretrained() - parser.arguments_ema() - parser.arguments_adv() - parser.arguments_optimimzer() - parser.arguments_lr_scheduler() - parser.arguments_apex() - parser.arguments_checkpoint() - parser.arguments_earlystopping() - return parser - - @classmethod - def parse_args_from_parser(cls, parser): - args = parser.parse_args() - parser.make_experiment_dir(args) - parser.save_args_to_json(args) - parser.print_args(args) - return args - - @classmethod - def parse_args_from_json(cls, json_file): - check_file(json_file) - data = json.loads(Path(json_file).read_text()) - return argparse.Namespace(**data) - - @classmethod - def get_training_arguments(cls): - parser = cls.get_training_parser() - args = cls.parse_args_from_parser(parser) - return args - - def get_val_argments(self): - args = Argparser.get_training_arguments() - return args - - def get_predict_arguments(self): - args = Argparser.get_training_arguments() - return args - - def arguments_required(self): - group = self.add_argument_group(title="required arguments", description="required arguments") - group.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train. ") - group.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") - group.add_argument("--model_type", default=None, type=str, required=True, - help="The name of the model to train.") - group.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. 
Should contain the training files for task.") - - def arguments_common(self): - group = self.add_argument_group(title="common arguments", description="common arguments") - group.add_argument("--seed", type=int, default=42, help="random seed for initialization") - group.add_argument("--do_train", action="store_true", help="Whether to run training.") - group.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - group.add_argument("--do_predict", action="store_true", help="Whether to run predict on the test set.") - group.add_argument("--device_id", type=str, default='0', - help='multi-gpu:"0,1,.." or single-gpu:"0" or cpu:"cpu"') - group.add_argument("--evaluate_during_training", action="store_true", - help="Whether to run evaluation during training at each logging step.", ) - group.add_argument('--load_arguments_file', type=str, default=None, help="load args from arguments file") - group.add_argument("--save_steps", type=int, default=-1, - help="Save checkpoint every X updates steps. ``-1`` means that a epoch") - group.add_argument("--logging_steps", type=int, default=-1, - help="Log every X updates steps.``-1`` means that a epoch") - group.add_argument('--experiment_code', type=str, default='v0', help='experiment code') - - def arguments_input_file(self): - group = self.add_argument_group(title="input file arguments", description="input file arguments") - group.add_argument("--train_input_file", default=None, type=str, help="The name of train input file") - group.add_argument("--eval_input_file", default=None, type=str, help="The name of eval input file") - group.add_argument("--test_input_file", default=None, type=str, help="The name of test input file") - - def arguments_dataset(self): - group = self.add_argument_group(title="datasets arguments", description="datasets arguments") - group.add_argument("--train_max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - group.add_argument("--eval_max_seq_length", default=512, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - group.add_argument("--test_max_seq_length", default=512, type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - group.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - group.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - group.add_argument("--per_gpu_test_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for test evaluation.") - group.add_argument("--overwrite_data_cache", action='store_true', - help="Whether to overwrite the cached training and evaluation feature sets") - group.add_argument("--use_data_cache", action='store_true', - help='Whether to load the cached training feature sets') - - def arguments_dataloader(self): - group = self.add_argument_group(title="dataloader arguments", description="dataloader arguments") - group.add_argument('--pin_memory', default=False, action='store_true', - help='Use pin memory option in data loader') - group.add_argument("--drop_last", default=False, action='store_true') - group.add_argument('--num_workers', default=0, type=int, help='Number of data workers') - group.add_argument("--persistent_workers", default=False, action="store_true", help="") - - def arguments_pretrained(self): - group = self.add_argument_group(title="pretrained arguments", description="pretrained arguments") - group.add_argument("--pretrained_model_path", default=None, type=str, - help="Path to pre-trained model or shortcut name selected in the list") - group.add_argument("--pretrained_config_name", default=None, type=str, - help="Pretrained config name or path if not the same as model_name") - group.add_argument("--pretrained_tokenizer_name", default=None, type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - group.add_argument("--do_lower_case", action="store_true", - help="Set this flag if you are using an uncased model.") - group.add_argument("--pretrained_cache_dir", default=None, type=str, - help="Where do you want to store the pre-trained models downloaded from s3", ) - - def arguments_ema(self): - group = self.add_argument_group(title='EMA', description='Exponential moving average arguments') - group.add_argument('--ema_enable', action='store_true', help='Exponential moving average') - group.add_argument('--ema_decay', type=float, default=0.9999, help='EMA decay') - group.add_argument("--model_ema_force_cpu", action='store_true') - - def arguments_adv(self): - group = self.add_argument_group(title='Adversarial training', description='Adversarial training arguments') - group.add_argument('--adv_enable', action='store_true', help='Adversarial training') - group.add_argument('--adv_start_steps', default=0, type=int, help='the step to start attack') - group.add_argument('--adv_type', default='fgm', type=str, choices=['fgm', 'pgd', 'awp']) - group.add_argument('--adv_epsilon', type=float, default=1.0, help='adv epsilon') - group.add_argument('--adv_name', type=str, default='word_embeddings', - help='name for adversarial layer') - group.add_argument('--adv_number', default=1, type=int, help='the number of attack') - group.add_argument('--adv_alpha', default=0.3, type=float, help='adv alpha') - - def arguments_optimimzer(self): - group = self.add_argument_group(title='optimizer', description='Optimizer related arguments') - group.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.") - group.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - 
group.add_argument("--adam_beta1", default=0.9, type=float, help="Beta1 for AdamW optimizer") - group.add_argument("--adam_beta2", default=0.999, type=float, help='Beta2 for AdamW optimizer') - group.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - - def arguments_lr_scheduler(self): - group = self.add_argument_group(title="lr scheduler arguments", description="LR scheduler arguments") - group.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - group.add_argument("--other_learning_rate", default=0.0, type=float) - group.add_argument("--base_model_name", default='base_model', type=str, help='The main body of the model.') - group.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs") - group.add_argument("--gradient_accumulation_steps", type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", ) - group.add_argument("--warmup_proportion", default=0.1, type=float, - help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training.") - group.add_argument("--warmup_steps", default=0, type=int, - help='Linear warmup over warmup_steps.') - group.add_argument("--scheduler_type", default='linear', type=str, - choices=["linear", 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', - 'constant_with_warmup'], - help='The scheduler type to use.') - - def arguments_apex(self): - group = self.add_argument_group(title="apex arguments", description="apex arguments") - group.add_argument("--fp16", action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) - group.add_argument("--fp16_opt_level", type=str, default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html", ) - group.add_argument('--fp16_backend', default='auto', type=str, choices=["auto", "amp", "apex"], - help="The backend to be used for mixed precision.") - group.add_argument("--fp16_full_eval", action='store_true', - help="Whether to use full 16-bit precision evaluation instead of 32-bit") - - def arguments_checkpoint(self): - group = self.add_argument_group(title='model checkpoint', description='model checkpoint arguments') - group.add_argument("--checkpoint_mode", default='min', type=str, help='model checkpoint mode') - group.add_argument("--checkpoint_monitor", default='eval_loss', type=str, help='model checkpoint monitor') - group.add_argument("--checkpoint_save_best", action='store_true', help='Whether to save best model') - group.add_argument("--checkpoint_verbose", default=1, type=int, help='whether to print checkpoint info') - group.add_argument("--checkpoint_predict_code", type=str, default=None, - help='The version of checkpoint to predict') - group.add_argument('--eval_all_checkpoints', action="store_true", help="Evaluate all checkpoints starting", ) - - def arguments_earlystopping(self): - group = self.add_argument_group(title='early stopping', description='early stopping arguments') - group.add_argument("--earlystopping_patience", default=-1, type=int, - help='Interval (number of epochs) between checkpoints') - group.add_argument("--earlystopping_mode", default='min', type=str, help='early stopping mode') - group.add_argument("--earlystopping_monitor", default='eval_loss', type=str, help='early stopping monitor') - group.add_argument("--earlystopping_verbose", default=1, type=int, help='whether to print earlystopping info') - group.add_argument('--earlystopping_save_state_path', default=None, type=str) - group.add_argument('--earlystopping_load_state_path', default=None, type=str) - - def save_args_to_json(self, args): - if args.do_train: - save_arguments_file_name = f"{args.task_name}_{args.model_type}_{args.experiment_code}_opts.json" - save_arguments_file_path = os.path.join(args.output_dir, save_arguments_file_name) - if os.path.exists(save_arguments_file_path): - print(f"[Warning]File {save_arguments_file_path} exist,Overwrite arguments file") - with open(str(save_arguments_file_path), 'w') as f: - json.dump(vars(args), f, ensure_ascii=False, indent=4) - - def print_args(self, args): - print('**********************************') - print('************ Arguments ***********') - print('**********************************') - args_list = sorted(args.__dict__.items(),key=lambda x:x[0]) - msg = '' - for k,v in args_list: - msg += f' {k}: {v}\n' - print(msg) - - def make_experiment_dir(self, args): - args.output_dir = os.path.join(args.output_dir, f'{args.task_name}_{args.model_type}_{args.experiment_code}') - os.makedirs(args.output_dir, exist_ok=True) \ No newline at end of file diff --git a/torchblocks/utils/paths.py b/torchblocks/utils/paths.py deleted file mode 100755 index 7df7d40..0000000 --- a/torchblocks/utils/paths.py +++ /dev/null @@ -1,149 +0,0 @@ -import os -import torch -import json -import pickle -import logging -import glob -import numpy as np -import torch.nn as nn - -logger = logging.getLogger(__name__) - - -def check_file(file_path): - if not os.path.isfile(file_path): - raise ValueError(f"File is not found here: {file_path}") - return True - - -def is_file(file_path): - if os.path.isfile(file_path): - return True - return False - - -def check_dir(dir_path): - if not os.path.isdir(dir_path): - raise 
ValueError(f"Directory is not found here: {dir_path}") - return True - - -def find_all_files(dir_path): - dir_path = os.path.expanduser(dir_path) - files = [os.path.join(dir_path, fname) for fname in os.listdir(dir_path)] - logger.info(f"The number of files: {len(files)} , Direcory:{dir_path}") - return - - -def create_dir(dir_path): - if not os.path.isdir(dir_path): - os.makedirs(dir_path) - logger.info(f"Directory {dir_path} do not exist; creating...") - - -def save_pickle(data, file_path): - with open(str(file_path), 'wb') as f: - pickle.dump(data, f) - - -def load_pickle(file_path): - with open(str(file_path), 'rb') as f: - data = pickle.load(f) - return data - - -def save_numpy(data, file_path): - np.save(str(file_path), data) - - -def load_numpy(file_path): - np.load(str(file_path)) - - -def save_json(data, file_path): - with open(str(file_path), 'w') as f: - json.dump(data, f) - - -def load_json(file_path): - with open(str(file_path), 'r') as f: - data = json.load(f) - return data - - -class _Encoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, torch.device): - return str(obj) - else: - return super(_Encoder, self).default(obj) - - -def to_json_string(data): - """Serializes this instance to a JSON string.""" - return json.dumps(data, indent=2, sort_keys=True, cls=_Encoder) - - -def json_to_text(file_path, data): - with open(str(file_path), 'w') as fw: - for line in data: - line = json.dumps(line, ensure_ascii=False) - fw.write(line + '\n') - - -def dict_to_text(file_path, data): - with open(str(file_path), 'w') as fw: - for key in sorted(data.keys()): - fw.write("{} = {}\n".format(key, str(data[key]))) - - -def save_model(model, file_path): - if isinstance(model, nn.DataParallel): - model = model.module - state_dict = model.state_dict() - torch.save(state_dict, file_path) - - -def load_model(model, file_path, device=None): - if check_file(file_path): - print(f"loading model from {str(file_path)} .") - state_dict = torch.load(file_path, map_location="cpu" if device is None else device) - if isinstance(model, nn.DataParallel) or hasattr(model, "module"): - model.module.load_state_dict(state_dict, strict=False) - else: - model.load_state_dict(state_dict, strict=False) - - -def save_jit_model(model, example_inputs, save_dir, dir_name=None): - model.eval() - with torch.no_grad(): - traced_model = torch.jit.trace(model, example_inputs=example_inputs, strict=False) - if dir_name is None: - save_dir = os.path.join(save_dir, 'save_model_jit_traced') - else: - save_dir = os.path.join(save_dir, dir_name) - os.makedirs(save_dir, exist_ok=True) - torch.jit.save(traced_model, os.path.join(save_dir, 'pytorch_model.ts')) - return save_dir - - -def find_all_checkpoints(checkpoint_dir, - checkpoint_prefix='checkpoint', - checkpoint_name='pytorch_model.bin', - checkpoint_custom_names=None): - ''' - 获取模型保存路径下所有checkpoint模型路径,其中 - checkpoint_custom_names:表示自定义checkpoint列表 - ''' - checkpoints = list( - os.path.dirname(c) for c in sorted(glob.glob(checkpoint_dir + "/**/" + checkpoint_name, recursive=True)) - ) - checkpoints = [x for x in checkpoints if checkpoint_prefix in x] - if len(checkpoints) == 0: - raise ValueError("No checkpoint found at : '{}'".format(checkpoint_dir)) - if checkpoint_custom_names is not None: - if not isinstance(checkpoint_custom_names, list): - checkpoint_custom_names = [checkpoint_custom_names] - checkpoints = [x for x in checkpoints if x.split('/')[-1] in checkpoint_custom_names] - logger.info(f"Successfully get checkpoints:{checkpoints}.") - return 
checkpoints diff --git a/torchblocks/utils/versions.py b/torchblocks/utils/versions.py deleted file mode 100755 index cb2fbdb..0000000 --- a/torchblocks/utils/versions.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utilities for working with package versions -""" - -import operator -import re -import sys -from typing import Optional - -from packaging import version - - -# The package importlib_metadata is in a different place, depending on the python version. -if sys.version_info < (3, 8): - import importlib_metadata -else: - import importlib.metadata as importlib_metadata - - -ops = { - "<": operator.lt, - "<=": operator.le, - "==": operator.eq, - "!=": operator.ne, - ">=": operator.ge, - ">": operator.gt, -} - - -def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): - if got_ver is None: - raise ValueError("got_ver is None") - if want_ver is None: - raise ValueError("want_ver is None") - if not ops[op](version.parse(got_ver), version.parse(want_ver)): - raise ImportError( - f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" - ) - - -def require_version(requirement: str, hint: Optional[str] = None) -> None: - """ - Perform a runtime check of the dependency versions, using the exact same syntax used by pip. - - The installed module version comes from the `site-packages` dir via `importlib_metadata`. 
- - Args: - requirement (:obj:`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" - hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met - - Example:: - - require_version("pandas>1.1.2") - require_version("numpy>1.18.5", "this is important to have for whatever reason") - - """ - - hint = f"\n{hint}" if hint is not None else "" - - # non-versioned check - if re.match(r"^[\w_\-\d]+$", requirement): - pkg, op, want_ver = requirement, None, None - else: - match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) - if not match: - raise ValueError( - f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" - ) - pkg, want_full = match[0] - want_range = want_full.split(",") # there could be multiple requirements - wanted = {} - for w in want_range: - match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) - if not match: - raise ValueError( - f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" - ) - op, want_ver = match[0] - wanted[op] = want_ver - if op not in ops: - raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") - - # special case - if pkg == "python": - got_ver = ".".join([str(x) for x in sys.version_info[:3]]) - for op, want_ver in wanted.items(): - _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) - return - - # check if any version is installed - try: - got_ver = importlib_metadata.version(pkg) - except importlib_metadata.PackageNotFoundError: - raise importlib_metadata.PackageNotFoundError( - f"The '{requirement}' distribution was not found and is required by this application. {hint}" - ) - - # check that the right version is installed if version number or a range was provided - if want_ver is not None: - for op, want_ver in wanted.items(): - _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) - - -def require_version_core(requirement): - """require_version wrapper which emits a core-specific hint on failure""" - hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master" - return require_version(requirement, hint) diff --git a/torchblocks/version.py b/torchblocks/version.py deleted file mode 100755 index ba00bb9..0000000 --- a/torchblocks/version.py +++ /dev/null @@ -1,13 +0,0 @@ -__version__ = '0.7.0' -def parse_version_info(version_str): - version_info = [] - for x in version_str.split('.'): - if x.isdigit(): - version_info.append(int(x)) - elif x.find('rc') != -1: - patch_version = x.split('rc') - version_info.append(int(patch_version[0])) - version_info.append(f'rc{patch_version[1]}') - return tuple(version_info) - -version_info = parse_version_info(__version__)
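Usage note: the removed version helpers behave as in the following doctest-style sketch (assuming parse_version_info from torchblocks/version.py is importable; the '0.8.1rc2' string is a hypothetical pre-release tag used only for illustration):

>>> parse_version_info('0.7.0')
(0, 7, 0)
>>> parse_version_info('0.8.1rc2')  # hypothetical pre-release tag
(0, 8, 1, 'rc2')

With __version__ = '0.7.0' as defined above, version_info evaluates to (0, 7, 0). Likewise, require_version("numpy>=1.18.5") returns None when the installed package satisfies the specifier and raises ImportError (or PackageNotFoundError if the package is missing) otherwise, as its docstring describes.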