- Step1: clone necessary repositories and create environments
# install dependencies for superbpe in a virtual environment
git clone https://github.com/xiulinyang/superbpe.git
conda create -n superbpe python=3.12 rust
conda activate superbpe
pip install -r requirements.txt
# install the remaining dependencies (for causal-lm-training) in a separate environment
git clone https://github.com/xiulinyang/causal-lm-training.git
cd causal-lm-training
pip install -e .
- Step2: put the training/dev/test data under the `data` folder.
- Step3: train models using the following script. Note that you can customize the training/model/data hyperparameters in `generate_config.py`.
bash train_model.sh $LANG $vocab_size $tokenizer_type $model_type
- Step4: Evaluation
- Probability-based metrics:
python src/clm/evaluation/perplexities.py $experiment "$model_name" log-ppl
- KS Metric:
python src/clm/evaluation/ks_distribution.py $experiment "$model_name"
- Multilingual-BLiMP:
python src/clm/evaluation/get_multiblimp_scores.py $experiment "$model_name"
- SAS (TBD)