From 803205a8d74e9cc0e18ee309e6b97d216654e551 Mon Sep 17 00:00:00 2001
From: Risto Luukkonen <risto.m.luukkonen@utu.fi>
Date: Tue, 28 Sep 2021 12:59:22 +0300
Subject: [PATCH] update create_gpt_tokenizer.py

---
 gpt-fin/tokenization/README.md               |   8 ++
 gpt-fin/tokenization/create_gpt_tokenizer.py | 101 +++++++++++--------
 2 files changed, 65 insertions(+), 44 deletions(-)
 create mode 100644 gpt-fin/tokenization/README.md

diff --git a/gpt-fin/tokenization/README.md b/gpt-fin/tokenization/README.md
new file mode 100644
index 0000000..91f5801
--- /dev/null
+++ b/gpt-fin/tokenization/README.md
@@ -0,0 +1,8 @@
+**Example**
+```
+sinteractive
+
+module load pytorch
+
+python3 create_gpt_tokenizer.py --data 1-percent-sample.txt --output_dir tokenizer --vocab_size 50257 --save_vocab_only True
+```
diff --git a/gpt-fin/tokenization/create_gpt_tokenizer.py b/gpt-fin/tokenization/create_gpt_tokenizer.py
index 8888138..77d0c19 100644
--- a/gpt-fin/tokenization/create_gpt_tokenizer.py
+++ b/gpt-fin/tokenization/create_gpt_tokenizer.py
@@ -1,50 +1,63 @@
-# Byte Level BPE (BBPE) tokenizers from Transformers and Tokenizers (Hugging Face libraries)
+import os
+from tokenizers.models import BPE
+from tokenizers import Tokenizer
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+from tokenizers.normalizers import NFKC, Sequence
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.trainers import BpeTrainer
+import argparse
+
+class BPE_token(object):
+    def __init__(self):
+        self.tokenizer = Tokenizer(BPE())
+        self.tokenizer.normalizer = Sequence([
+            NFKC()
+        ])
+        self.tokenizer.pre_tokenizer = ByteLevel()
+        self.tokenizer.decoder = ByteLevelDecoder()
+
+    def bpe_train(self, vocab_size, paths):
+        trainer = BpeTrainer(vocab_size=vocab_size, show_progress=True, initial_alphabet=ByteLevel.alphabet(), special_tokens=["<|endoftext|>"])
+        self.tokenizer.train(paths, trainer)
+
+    def save_tokenizer(self, location, vocab_only=True):
+        if not os.path.exists(location):
+            os.makedirs(location)
+        if vocab_only:
+            self.tokenizer.model.save(location)
+        else:
+            self.tokenizer.save(os.path.join(location, "tokenizer.json"))
-from transformers import GPT2TokenizerFast
-from pathlib import Path
-import argparse
-import os
 def main():
-    parser.add_argument('--data', type=str,help='path to data dir')
-    parser.add_argument('--output_dir',type=str, help='output dir for tokenizer')
-    parser.add_argument('--vocab_size',type=int, default=50257, help='tokenizer vocab size')
-
-    # Train a Byte Level BPE (BBPE) tokenizer on data
-    args =parser.parse_args()
-    print(args)
-    # Get GPT2 tokenizer_en vocab size
-    vocab_size = args.vocab_size
-    print("Create tokenizer with vocab size",vocab_size)
-
-    # ByteLevelBPETokenizer Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
-
-    from tokenizers import ByteLevelBPETokenizer
-    bbpe_tok_fin = ByteLevelBPETokenizer()
-
-    if not os.path.isdir(args.output_dir):
-        os.mkdir(args.output_dir)
-
-    # Get list of paths to corpus files
-    paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
-
-    if len(paths)>100:
-        print(f"Warning: file count is {len(paths} and processing may take for a very long time")
-
-    print("files:",len(paths))
-    # Customize training with <|endoftext|> special GPT2 token
-    bbpe_tok_fin.train(files=paths,
-                       vocab_size=vocab_size,
-                       min_frequency=2,
-                       special_tokens=["<|endoftext|>"])
-
-    # Get sequence length max of 1024
-    bbpe_tok_fin.enable_truncation(max_length=1024)
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data', type=str)
+    parser.add_argument('--output_dir', type=str, required=True)
+    parser.add_argument('--vocab_size', type=int, default=50257)
+    parser.add_argument('--save_vocab_only', type=lambda x: str(x).lower() == 'true', default=True,
+                        help="If true, saves merges.txt and vocab.json; else saves a single tokenizer.json that can be loaded with Tokenizer.from_file() but is harder to use with the AutoTokenizer API")
+    # TODO: see how Tokenizer.from_file() can be used with transformers.AutoTokenizer()
+
+    args = parser.parse_args()
+
+    from pathlib import Path
+
+    if os.path.isdir(args.data):
+        paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
+        if len(paths)>200:
+            print(f"WARNING: file count is {len(paths)}, training may take a while...")
+    elif args.data.split('.')[-1]!='txt':
+        print("Input data must be plain text with a .txt suffix")
+        raise SystemExit(1)
+    else:
+        paths = [args.data]
-    # save tokenizer
-    bbpe_tok_fin.save_model(args.output_dir)
-
+    tokenizer = BPE_token()
+    # train the tokenizer model
+    tokenizer.bpe_train(args.vocab_size, paths)
+    # save the trained tokenizer to the specified output directory
+    tokenizer.save_tokenizer(args.output_dir, vocab_only=args.save_vocab_only)
+
 if __name__=='__main__':
-    main()
+    main()
-- 
GitLab
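
The `# TODO` left in the new script asks how the saved tokenizer can be reused through the `transformers` API. The sketch below is not part of the patch; it assumes the README example was run as shown, so that `tokenizer/vocab.json` and `tokenizer/merges.txt` exist (the `--save_vocab_only True` path), and the file paths are illustrative only.

```
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast

# Wrap the vocab.json / merges.txt pair written by --save_vocab_only True.
# Paths follow the README example and are assumptions, not part of the patch.
tok = GPT2TokenizerFast(vocab_file="tokenizer/vocab.json",
                        merges_file="tokenizer/merges.txt")
print(tok.tokenize("Tämä on testi."))

# If the script was run with --save_vocab_only False, the single tokenizer.json
# can be wrapped directly instead:
# tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer/tokenizer.json")
```

Wrapping the files this way also gives `save_pretrained()` support, and a directory written by `save_pretrained()` can then be picked up with `AutoTokenizer.from_pretrained()`, which is roughly what the TODO is after.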