gpt_model_proto · Commits

Commit 803205a8, authored 3 years ago by Risto Luukkonen (parent: fc82462c)

update creat_gpt_tokenizer.py
Showing 2 changed files with 65 additions and 44 deletions:

gpt-fin/tokenization/README.md (+8 −0)
gpt-fin/tokenization/create_gpt_tokenizer.py (+57 −44)
gpt-fin/tokenization/README.md (new file 100644, +8 −0)
**Example**
```
sinteractive
module load pytorch
python3 create_gpt_tokenizer.py --data 1-percent-sample.txt --output_dir tokenizer --vocab_size 50257 --save_vocab_only
```
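A quick way to sanity-check the result: with `--save_vocab_only`, the output directory should end up containing `vocab.json` and `merges.txt`, which can be reloaded with the same tokenizers library. A minimal sketch, assuming the example's `tokenizer` output directory and a made-up test sentence:

```
from tokenizers import ByteLevelBPETokenizer

# Paths assume the example above: --output_dir tokenizer with --save_vocab_only,
# which leaves vocab.json and merges.txt in that directory.
tok = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")
enc = tok.encode("Tämä on testi.")
print(enc.tokens)
print(enc.ids)
```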
gpt-fin/tokenization/create_gpt_tokenizer.py (+57 −44)
```
# Byte Level BPE (BBPE) tokenizers from Transformers and Tokenizers (Hugging Face libraries)
import os
import sys
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
import argparse


class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([NFKC()])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, vocab_size, paths):
        trainer = BpeTrainer(vocab_size=vocab_size, show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=["<|endoftext|>"])
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, vocab_only=True):
        if not os.path.exists(location):
            os.makedirs(location)
        if vocab_only:
            self.tokenizer.model.save(location)
        else:
            self.tokenizer.save(location + ".json")


from transformers import GPT2TokenizerFast
from pathlib import Path
import argparse
import os


def main():
    parser.add_argument('--data', type=str, help='path to data dir')
    parser.add_argument('--output_dir', type=str, help='output dir for tokenizer')
    parser.add_argument('--vocab_size', type=int, default=50257, help='tokenizer vocab size')
    # Train a Byte Level BPE (BBPE) tokenizer on data
    args = parser.parse_args()
    print(args)
    # Get GPT2 tokenizer_en vocab size
    vocab_size = args.vocab_size
    print("Create tokenizer with vocab size", vocab_size)
    # ByteLevelBPETokenizer represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    from tokenizers import ByteLevelBPETokenizer
    bbpe_tok_fin = ByteLevelBPETokenizer()
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)
    # Get list of paths to corpus files
    paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
    if len(paths) > 100:
        print(f"Warning: file count is {len(paths)} and processing may take a very long time")
    print("files: ", len(paths))
    # Customize training with <|endoftext|> special GPT2 token
    bbpe_tok_fin.train(files=paths, vocab_size=vocab_size, min_frequency=2,
                       special_tokens=["<|endoftext|>"])
    # Get sequence length max of 1024
    bbpe_tok_fin.enable_truncation(max_length=1024)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str)
    parser.add_argument('--output_dir', type=str, required=True)
    parser.add_argument('--vocab_size', type=int, default=50257)
    parser.add_argument('--save_vocab_only', default=True, action='store_true',
                        help="If true, saves merges.txt and vocab.json,\n\
                        else saves a single file that can be loaded with Tokenizer.from_file() but isn't so easy to use with the AutoTokenizer API")
    # TODO see how Tokenizer.from_file() can be used with transformers.AutoTokenizer()
    args = parser.parse_args()
    from pathlib import Path
    if os.path.isdir(args.data):
        paths = [str(x) for x in Path(args.data).glob("**/*.txt")]
        if len(paths) > 200:
            print(f"WARNING: file count is {len(paths)}, trainer may take a while...")
    elif args.data.split('.')[-1] != 'txt':
        print("data format needed is plain text with .txt-suffix")
        sys.exit(1)
    else:
        paths = [args.data]

    # save tokenizer
    bbpe_tok_fin.save_model(args.output_dir)
    tokenizer = BPE_token()
    # train the tokenizer model
    tokenizer.bpe_train(args.vocab_size, paths)
    # saving the tokenized data in our specified folder
    tokenizer.save_tokenizer(args.output_dir)


if __name__ == '__main__':
    main()
main()
```
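On the TODO about using the saved files from the transformers side: a minimal, untested sketch of the two loading paths, assuming the example's `tokenizer` output directory (the name is not fixed by the script). `GPT2TokenizerFast` can be pointed at the two-file output written with `--save_vocab_only`, while `PreTrainedTokenizerFast` wraps the single `tokenizer.json` produced when `vocab_only=False`:

```
from transformers import GPT2TokenizerFast, PreTrainedTokenizerFast

# Two-file output (vocab_only=True): vocab.json + merges.txt in the output dir.
# The "tokenizer" directory name comes from the README example, not the script.
gpt2_tok = GPT2TokenizerFast(vocab_file="tokenizer/vocab.json",
                             merges_file="tokenizer/merges.txt")
print(gpt2_tok.encode("Tämä on testi."))

# Single-file output (vocab_only=False): save_tokenizer() writes "<output_dir>.json",
# e.g. "tokenizer.json" here, which PreTrainedTokenizerFast can wrap directly.
flat_tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
print(flat_tok.encode("Tämä on testi."))
```

From there, calling `save_pretrained()` on either object should yield a directory that `AutoTokenizer.from_pretrained()` can load, though that step is not exercised by this commit.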