-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy path: train_tokenizer.sh
More file actions
33 lines (29 loc) · 1.26 KB
/
train_tokenizer.sh
File metadata and controls
33 lines (29 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
# Train a BPE tokenizer on a text corpus (driver for `python -m train_tokenizer`).
set -euo pipefail

# --- configuration -----------------------------------------------------------
dataset_name=olmo2_p99_truncate
vocab_size=128000          # target tokenizer vocabulary size
num_bytes=$((10**10))      # bytes of training text to use (10 GB, decimal)
# GPT-style pre-tokenization split pattern (Unicode-aware: letter runs with
# optional leading punctuation, 1-3 digit number chunks, punctuation runs,
# newline runs, and trailing/plain whitespace).
regex_string="[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
corpus_dir=/gscratch/xlab/alisaliu/pretokenization/data/${dataset_name}/train # a directory containing txt files for tokenizer training
# Render a count as a short human-readable string (e.g. 10G, 100M, 128K).
# Uses decimal magnitudes (1G = 10^9), matching the original formatting;
# values below 1000 are printed unchanged. Empty/missing input is treated as 0.
# Arguments: $1 - non-negative integer
# Outputs:   the formatted string on stdout
humanize() {
  local n=${1:-0}
  if (( n >= 10**9 )); then
    echo "$(( n / 10**9 ))G"
  elif (( n >= 10**6 )); then
    echo "$(( n / 10**6 ))M"
  elif (( n >= 10**3 )); then
    echo "$(( n / 10**3 ))K"
  else
    echo "$n"
  fi
}
# Short labels used in the output directory name (e.g. 10G, 128K).
# NOTE(review): vocab sizes >= 10^6 now format as M/G instead of thousands of K;
# identical output for the configured values.
num_bytes_str=$(humanize "$num_bytes")
vocab_size_str=$(humanize "$vocab_size")
# Derive a self-describing output directory name and launch tokenizer training.
output_dir=tokenizer_json/bpe_${dataset_name}_${num_bytes_str}_${vocab_size_str}
echo "output_dir: $output_dir"
# Quote every expansion so paths/patterns with spaces survive word-splitting.
python -m train_tokenizer \
    --output_dir "$output_dir" \
    --corpus_dir "$corpus_dir" \
    --num_bytes "$num_bytes" \
    --vocab_size "$vocab_size" \
    --regex_string "$regex_string"