diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 399f93c10e..652897a658 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -193,8 +193,10 @@ def get_args(): group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'NullTokenizer'], + 'GPTSentencePieceTokenizer', 'NullTokenizer', 'HFTokenizer'], help='What type of tokenizer to use.') + group.add_argument('--seq-length', type=int, default=1024, + help='The length of the packed inputs.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.') group.add_argument('--vocab-file', type=str, default=None,