# Patch overview (leading text before the first "diff --git" header is skipped by patch tools).
#
# This patch updates the SLURM job scripts under scripts_on_cluster/. As far as this
# chunk shows, it touches four files:
#
# 1. bodyseg_training/job.run (modified)
#    - wall-time limit raised from 48:00:00 to 72:00:00; log file renamed to output_20251127a.log.
#    - training_trial_name becomes "trial_20251127a".
#    - adds contrastive_pretraining_trial_name / _epoch / _local_step variables and builds the
#      feature-extractor checkpoint path from them instead of hard-coding trial_20251117a.
#    - train/val data dirs move from atomic_batches/ to atomic_batches/4variants/.
#    - output-basedir now uses $training_trial_name instead of a hard-coded trial name.
#
# 2. contrastive_pretraining_training/job.run -> 2variants.run (rename, similarity 94%)
#    - wall-time limit raised to 72:00:00; log file renamed to output_20251125b_2variants.log.
#    - trial_name becomes "trial_20251125b_2variants".
#    - --training-data-config.atomic-batch-n-variants changes from 4 to 2.
#
# 3. contrastive_pretraining_training/low_lr.run (new file, 70 lines)
#    - job name contr_pretrain_lr3e-5; trial_name "trial_20251125a_lowlr".
#    - runs run_contrastive_pretraining.py for 10 epochs with --optimizer-config.adam-lr 3e-5.
#    - reads train/val data from atomic_batches_4variants/ with atomic-batch-n-variants 4.
#
# 4. keypoints3d_training/job.run (modified)
#    - job name loses its date suffix ("keypoints3d"); log file renamed to output_20251127a.log.
#    - training_trial_name becomes "trial_20251127a"; same three contrastive_pretraining_*
#      variables are introduced and used to build the checkpoint path.
#    - train/val data dirs move from atomic_batches/ to atomic_batches/4variants/.
#    - NOTE(review): only the hunks up to "@@ -19,37 +19,40 @@" are visible here; later hunks
#      of this file may exist beyond the end of this chunk.
#
# Reviewer notes (assumptions to confirm, not verified from this patch alone):
# - NOTE(review): low_lr.run uses "atomic_batches_4variants/" while bodyseg and keypoints3d use
#   "atomic_batches/4variants/", and the visible val-data-dirs context line of 2variants.run
#   still uses plain "atomic_batches/". Confirm that each of these directories exists on the
#   cluster and that the differing layouts are intentional.
# - NOTE(review): bodyseg's val-data-dirs entry (BO_Gal4_fly1_trial001) also appears in its
#   train-data-dirs list, whereas keypoints3d validates on BO_Gal4_fly5_trial001, which is not
#   in its train list. Confirm the bodyseg overlap is intended.
# - NOTE(review): the checkpoint identifiers epoch009 / step003055 are carried over unchanged
#   from the previous trial_20251117a path; confirm trial_20251125a_lowlr actually produced
#   checkpoint_epoch009_step003055.feature_extractor.pth.
diff --git a/scripts_on_cluster/bodyseg_training/job.run b/scripts_on_cluster/bodyseg_training/job.run index 73e663b..7a1154c 100644 --- a/scripts_on_cluster/bodyseg_training/job.run +++ b/scripts_on_cluster/bodyseg_training/job.run @@ -5,11 +5,11 @@ #SBATCH --ntasks 1 #SBATCH --cpus-per-task 16 #SBATCH --mem 92GB -#SBATCH --time 48:00:00 +#SBATCH --time 72:00:00 #SBATCH --partition=h100 #SBATCH --qos=normal #SBATCH --gres=gpu:1 -#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/bodyseg_training/output_20251118a.log +#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/bodyseg_training/output_20251127a.log echo "Hello from $(hostname)" @@ -19,7 +19,10 @@ conda activate poseforge cd $HOME/poseforge training_cli_path="src/poseforge/pose/bodyseg/scripts/run_bodyseg_training.py" -training_trial_name="trial_20251118a" +training_trial_name="trial_20251127a" +contrastive_pretraining_trial_name="trial_20251125a_lowlr" +contrastive_pretraining_epoch="epoch009" +contrastive_pretraining_local_step="step003055" echo "Training starting at $(date)" @@ -30,32 +33,32 @@ python -u $training_cli_path \ --model-architecture-config.final-upsampler-n-hidden-channels 32 \ --model-architecture-config.confidence-method entropy \ --model-weights-config.feature-extractor-weights \ - "bulk_data/pose_estimation/contrastive_pretraining/trial_20251117a/checkpoints/checkpoint_epoch009_step003055.feature_extractor.pth" \ + "bulk_data/pose_estimation/contrastive_pretraining/$contrastive_pretraining_trial_name/checkpoints/checkpoint_${contrastive_pretraining_epoch}_${contrastive_pretraining_local_step}.feature_extractor.pth" \ --loss-config.weight-dice 1.0 \ --loss-config.weight-ce 1.0 \ --training-data-config.train-data-dirs \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial003" \ - 
"bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial005" \ + 
"bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial005" \ --training-data-config.val-data-dirs \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial001" \ --training-data-config.input-image-size 256 256 \ --training-data-config.atomic-batch-n-samples 32 \ --training-data-config.atomic-batch-n-variants 4 \ @@ -67,7 +70,7 @@ python -u $training_cli_path \ --optimizer-config.learning-rate-segmentation-head 3e-4 \ --optimizer-config.weight-decay 1e-5 \ --training-artifacts-config.output-basedir \ - "bulk_data/pose_estimation/bodyseg/trial_20251118a/" \ + "bulk_data/pose_estimation/bodyseg/$training_trial_name/" \ --training-artifacts-config.logging-interval 10 \ --training-artifacts-config.checkpoint-interval 1000 \ --training-artifacts-config.validation-interval 1000 \ diff --git a/scripts_on_cluster/contrastive_pretraining_training/job.run b/scripts_on_cluster/contrastive_pretraining_training/2variants.run similarity index 94% rename from scripts_on_cluster/contrastive_pretraining_training/job.run rename to scripts_on_cluster/contrastive_pretraining_training/2variants.run index befbd3c..8ba3069 100644 --- 
a/scripts_on_cluster/contrastive_pretraining_training/job.run +++ b/scripts_on_cluster/contrastive_pretraining_training/2variants.run @@ -5,11 +5,11 @@ #SBATCH --ntasks 1 #SBATCH --cpus-per-task 16 #SBATCH --mem 90GB -#SBATCH --time 48:00:00 +#SBATCH --time 72:00:00 #SBATCH --partition=h100 #SBATCH --qos=normal #SBATCH --gres=gpu:1 -#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/contrastive_pretraining_training/output_20251117a.log +#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/contrastive_pretraining_training/output_20251125b_2variants.log echo "Hello from $(hostname)" @@ -20,7 +20,7 @@ conda activate poseforge cd $HOME/poseforge echo "Training starting at $(date)" -trial_name="trial_20251117a" +trial_name="trial_20251125b_2variants" python -u src/poseforge/pose/contrast/scripts/run_contrastive_pretraining.py \ --n-epochs 10 \ @@ -53,7 +53,7 @@ python -u src/poseforge/pose/contrast/scripts/run_contrastive_pretraining.py \ --training-data-config.val-data-dirs \ "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial001" \ --training-data-config.atomic-batch-n-samples 32 \ - --training-data-config.atomic-batch-n-variants 4 \ + --training-data-config.atomic-batch-n-variants 2 \ --training-data-config.train-batch-size 960 \ --training-data-config.val-batch-size 256 \ --training-data-config.image-size 256 256 \ diff --git a/scripts_on_cluster/contrastive_pretraining_training/low_lr.run b/scripts_on_cluster/contrastive_pretraining_training/low_lr.run new file mode 100644 index 0000000..c519def --- /dev/null +++ b/scripts_on_cluster/contrastive_pretraining_training/low_lr.run @@ -0,0 +1,70 @@ +#!/bin/bash -l + +#SBATCH --job-name contr_pretrain_lr3e-5 +#SBATCH --nodes 1 +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 16 +#SBATCH --mem 90GB +#SBATCH --time 72:00:00 +#SBATCH --partition=h100 +#SBATCH --qos=normal +#SBATCH --gres=gpu:1 +#SBATCH --output 
/home/sibwang/poseforge/scripts_on_cluster/contrastive_pretraining_training/output_20251125a_lowlr.log + +echo "Hello from $(hostname)" + +. ~/spack/share/spack/setup-env.sh +spack load ffmpeg +conda activate poseforge + +cd $HOME/poseforge +echo "Training starting at $(date)" + +trial_name="trial_20251125a_lowlr" + +python -u src/poseforge/pose/contrast/scripts/run_contrastive_pretraining.py \ + --n-epochs 10 \ + --seed 42 \ + --model-architecture-config.projection-head-hidden-dim 512 \ + --model-architecture-config.projection-head-output-dim 256 \ + --model-weights-config.feature-extractor-weights "IMAGENET1K_V1" \ + --loss-config.info-nce-temperature 0.1 \ + --training-data-config.train-data-dirs \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial001" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial002" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial003" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial004" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial005" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly2_trial001" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly2_trial002" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly2_trial003" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly2_trial004" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly2_trial005" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly3_trial001" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly3_trial002" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly3_trial003" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly3_trial004" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly3_trial005" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly4_trial001" \ + 
"bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly4_trial002" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly4_trial003" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly4_trial004" \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly4_trial005" \ + --training-data-config.val-data-dirs \ + "bulk_data/pose_estimation/atomic_batches_4variants/BO_Gal4_fly1_trial001" \ + --training-data-config.atomic-batch-n-samples 32 \ + --training-data-config.atomic-batch-n-variants 4 \ + --training-data-config.train-batch-size 960 \ + --training-data-config.val-batch-size 256 \ + --training-data-config.image-size 256 256 \ + --training-data-config.n-workers 4 \ + --optimizer-config.adam-lr 3e-5 \ + --optimizer-config.adam-weight-decay 1e-4 \ + --training-artifacts-config.output-basedir \ + "bulk_data/pose_estimation/contrastive_pretraining/$trial_name" \ + --training-artifacts-config.logging-interval 10 \ + --training-artifacts-config.checkpoint-interval 500 \ + --training-artifacts-config.validation-interval 200 \ + --training-artifacts-config.n-batches-per-validation 100 + +echo "Training ends at $(date)" diff --git a/scripts_on_cluster/keypoints3d_training/job.run b/scripts_on_cluster/keypoints3d_training/job.run index 678ebcb..5e623d4 100644 --- a/scripts_on_cluster/keypoints3d_training/job.run +++ b/scripts_on_cluster/keypoints3d_training/job.run @@ -1,6 +1,6 @@ #!/bin/bash -l -#SBATCH --job-name keypoints3d-20251118a +#SBATCH --job-name keypoints3d #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --cpus-per-task 16 @@ -9,7 +9,7 @@ #SBATCH --partition=h100 #SBATCH --qos=normal #SBATCH --gres=gpu:1 -#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/keypoints3d_training/output_20251118a.log +#SBATCH --output /home/sibwang/poseforge/scripts_on_cluster/keypoints3d_training/output_20251127a.log echo "Hello from $(hostname)" @@ -19,37 +19,40 @@ conda activate poseforge cd $HOME/poseforge 
training_cli_path="src/poseforge/pose/keypoints3d/scripts/run_keypoints3d_training.py" -training_trial_name="trial_20251118a" +training_trial_name="trial_20251127a" +contrastive_pretraining_trial_name="trial_20251125a_lowlr" +contrastive_pretraining_epoch="epoch009" +contrastive_pretraining_local_step="step003055" echo "Training starting at $(date)" python -u $training_cli_path \ --n-epochs 30 \ --model-weights-config.feature-extractor-weights \ - "bulk_data/pose_estimation/contrastive_pretraining/trial_20251117a/checkpoints/checkpoint_epoch009_step003055.feature_extractor.pth" \ + "bulk_data/pose_estimation/contrastive_pretraining/$contrastive_pretraining_trial_name/checkpoints/checkpoint_${contrastive_pretraining_epoch}_${contrastive_pretraining_local_step}.feature_extractor.pth" \ --training-data-config.train-data-dirs \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly1_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly2_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial001" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly3_trial005" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial001" \ - 
"bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial002" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial003" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial004" \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly4_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly1_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly2_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly3_trial005" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial001" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial002" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial003" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial004" \ + "bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly4_trial005" \ --training-data-config.val-data-dirs \ - "bulk_data/pose_estimation/atomic_batches/BO_Gal4_fly5_trial001" \ + 
"bulk_data/pose_estimation/atomic_batches/4variants/BO_Gal4_fly5_trial001" \ --training-data-config.input-image-size 256 256 \ --training-data-config.atomic-batch-n-samples 32 \ --training-data-config.atomic-batch-n-variants 4 \