Commit 803205a8 authored by Risto Luukkonen

update creat_gpt_tokenizer.py

parent fc82462c
**Example**
```
sinteractive
module load pytorch
python3 creat_gpt_tokenizer.py --data 1-percent-sample.txt --output_dir tokenizer --vocab_size 50257 --save_vocab_only True
```
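As a side note, here is a minimal sketch (not part of this commit) of how the resulting files could be loaded back for tokenization. It assumes the run above wrote `vocab.json` and `merges.txt` into `tokenizer/`; the sample sentence is made up:

```python
from transformers import GPT2TokenizerFast

# Assumes the training run above produced tokenizer/vocab.json and tokenizer/merges.txt
tok = GPT2TokenizerFast(vocab_file="tokenizer/vocab.json",
                        merges_file="tokenizer/merges.txt")
tok.add_special_tokens({"eos_token": "<|endoftext|>"})

ids = tok.encode("Tämä on testilause.<|endoftext|>")  # hypothetical sample sentence
print(ids)
print(tok.decode(ids))
```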
The previous version of creat_gpt_tokenizer.py trained the tokenizer with the high-level `ByteLevelBPETokenizer`:

```python
from transformers import GPT2TokenizerFast
from pathlib import Path
import argparse
import os


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='path to data dir')
    parser.add_argument('--output_dir', type=str, help='output dir for tokenizer')
    parser.add_argument('--vocab_size', type=int, default=50257, help='tokenizer vocab size')

    # Train a Byte Level BPE (BBPE) tokenizer on data
    args = parser.parse_args()
    print(args)

    # Get GPT-2 tokenizer vocab size
    vocab_size = args.vocab_size
    print("Create tokenizer with vocab size", vocab_size)

    # ByteLevelBPETokenizer represents the byte-level BPE introduced by OpenAI with GPT-2
    from tokenizers import ByteLevelBPETokenizer
    bbpe_tok_fin = ByteLevelBPETokenizer()

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # Get list of paths to corpus files
    paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
    if len(paths) > 100:
        print(f"Warning: file count is {len(paths)} and processing may take a very long time")
    print("files:", len(paths))

    # Customize training with the <|endoftext|> special GPT-2 token
    bbpe_tok_fin.train(files=paths,
                       vocab_size=vocab_size,
                       min_frequency=2,
                       special_tokens=["<|endoftext|>"])

    # Cap sequence length at 1024
    bbpe_tok_fin.enable_truncation(max_length=1024)

    # save tokenizer (writes vocab.json and merges.txt)
    bbpe_tok_fin.save_model(args.output_dir)


if __name__ == '__main__':
    main()
```

The updated version builds the byte-level BPE tokenizer from the lower-level `tokenizers` components (BPE model, NFKC normalizer, byte-level pre-tokenizer and decoder) and wraps training and saving in a small `BPE_token` class:

```python
# Byte Level BPE (BBPE) tokenizer from Tokenizers (Hugging Face library)
from pathlib import Path
import argparse
import os
import sys

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer


class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, vocab_size, paths):
        trainer = BpeTrainer(vocab_size=vocab_size, show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=["<|endoftext|>"])
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, vocab_only=True):
        if not os.path.exists(location):
            os.makedirs(location)
        if vocab_only:
            # writes vocab.json and merges.txt into the output directory
            self.tokenizer.model.save(location)
        else:
            # writes a single <location>.json file
            self.tokenizer.save(location + ".json")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='path to a .txt file or to a directory of .txt files')
    parser.add_argument('--output_dir', type=str, required=True, help='output dir for tokenizer')
    parser.add_argument('--vocab_size', type=int, default=50257, help='tokenizer vocab size')
    # boolean parsed from 'True'/'False' on the command line
    parser.add_argument('--save_vocab_only', type=lambda s: s.lower() == 'true', default=True,
                        help="If true, saves merges.txt and vocab.json; "
                             "else saves a single file that can be loaded with Tokenizer.from_file() "
                             "but isn't as easy to use with the AutoTokenizer API")
    # TODO: see how Tokenizer.from_file() can be used with transformers.AutoTokenizer()
    args = parser.parse_args()

    if os.path.isdir(args.data):
        paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
        if len(paths) > 200:
            print(f"WARNING: file count is {len(paths)}, training may take a while...")
    elif args.data.split('.')[-1] != 'txt':
        print("data format needed is plain text with a .txt suffix")
        sys.exit(1)
    else:
        paths = [args.data]

    tokenizer = BPE_token()
    # train the tokenizer model
    tokenizer.bpe_train(args.vocab_size, paths)
    # save the trained tokenizer into the specified folder
    tokenizer.save_tokenizer(args.output_dir, vocab_only=args.save_vocab_only)


if __name__ == '__main__':
    main()
```
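Regarding the TODO above: a minimal sketch, assuming the script was run with `--save_vocab_only False --output_dir tokenizer` so that a single `tokenizer.json` file was written, of how that file could be wrapped for the `transformers` API via `PreTrainedTokenizerFast` (the directory name `hf_tokenizer` is arbitrary):

```python
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

# Load the single-file tokenizer produced by Tokenizer.save()
raw_tok = Tokenizer.from_file("tokenizer.json")

# Wrap it so it exposes the usual transformers tokenizer interface
hf_tok = PreTrainedTokenizerFast(tokenizer_object=raw_tok,
                                 eos_token="<|endoftext|>")

print(hf_tok("Tämä on testilause.")["input_ids"])  # hypothetical sample sentence

# save_pretrained() writes tokenizer.json plus tokenizer_config.json,
# which AutoTokenizer.from_pretrained("hf_tokenizer") can then load directly.
hf_tok.save_pretrained("hf_tokenizer")
```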