Commit 803205a8 authored by Risto Luukkonen
update creat_gpt_tokenizer.py

**Example**
```
sinteractive
module load pytorch
python3 creat_gpt_tokenizer.py --data 1-percent-sample.txt --output_dir tokenizer --vocab_size 50257 --save_vocab_only
```
```python
# Byte-Level BPE (BBPE) tokenizer built with the Transformers and Tokenizers
# (Hugging Face) libraries
import argparse
import os
import sys
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer


class BPE_token(object):
    def __init__(self):
        # Byte-level BPE with NFKC unicode normalization, as used by GPT-2
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([NFKC()])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, vocab_size, paths):
        trainer = BpeTrainer(vocab_size=vocab_size,
                             show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=["<|endoftext|>"])
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, vocab_only=True):
        if not os.path.exists(location):
            os.makedirs(location)
        if vocab_only:
            # writes vocab.json and merges.txt into the output directory
            self.tokenizer.model.save(location)
        else:
            # writes a single <location>.json file loadable with Tokenizer.from_file()
            self.tokenizer.save(location + ".json")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, required=True,
                        help='path to a .txt file or to a directory of .txt files')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='output dir for tokenizer')
    parser.add_argument('--vocab_size', type=int, default=50257,
                        help='tokenizer vocab size (GPT-2 uses 50257)')
    parser.add_argument('--save_vocab_only', action='store_true',
                        help="If set, save merges.txt and vocab.json; otherwise save a "
                             "single file that can be loaded with Tokenizer.from_file() "
                             "but is not as easy to use with the AutoTokenizer API")
    # TODO: see how Tokenizer.from_file() can be used with transformers.AutoTokenizer
    args = parser.parse_args()
    print(args)

    vocab_size = args.vocab_size
    print("Create tokenizer with vocab size", vocab_size)

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Get the list of paths to corpus files
    if os.path.isdir(args.data):
        paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
        if len(paths) > 200:
            print(f"WARNING: file count is {len(paths)}, training may take a while...")
    elif args.data.split('.')[-1] != 'txt':
        print("data format needed is plain text with a .txt suffix")
        sys.exit(1)
    else:
        paths = [args.data]
    print("files:", len(paths))

    # Train a Byte-Level BPE (BBPE) tokenizer, as introduced by OpenAI with GPT-2,
    # with <|endoftext|> as the special token.
    # (tokenizers.ByteLevelBPETokenizer is an equivalent convenience class:
    # train(files=paths, vocab_size=vocab_size, min_frequency=2,
    # special_tokens=["<|endoftext|>"]), enable_truncation(max_length=1024)
    # and save_model(args.output_dir).)
    tokenizer = BPE_token()
    tokenizer.bpe_train(vocab_size, paths)
    # Save the trained tokenizer in the specified output folder
    tokenizer.save_tokenizer(args.output_dir, vocab_only=args.save_vocab_only)


if __name__ == '__main__':
    main()
```
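The saved files can then be pulled back into the Hugging Face `transformers` API. A minimal sketch, assuming the example invocation above (`--output_dir tokenizer` with `--save_vocab_only`, which writes `tokenizer/vocab.json` and `tokenizer/merges.txt`); the Finnish test sentence is only an illustration:

```python
from transformers import GPT2TokenizerFast

# Byte-level BPE files written by save_tokenizer(..., vocab_only=True)
tok = GPT2TokenizerFast(vocab_file="tokenizer/vocab.json",
                        merges_file="tokenizer/merges.txt")

ids = tok.encode("Tämä on testilause.")
print(ids)
print(tok.decode(ids))  # byte-level BPE round-trips back to the original text

# For the single-file variant (without --save_vocab_only), the saved JSON can be
# wrapped as a fast tokenizer instead (one possible answer to the TODO above):
#   from transformers import PreTrainedTokenizerFast
#   tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
```

GPT2TokenizerFast takes exactly the vocab.json/merges.txt pair that the BPE model writes, which is presumably why the help text calls the vocab-only save the easier route to the AutoTokenizer-style API.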