#!/bin/bash
# python3 offline_compress_m1.py \
#     --input_dir data/m1 \
#     --output_dir test_data/m1 \
#     --firstbyte_prob_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/ac_unigram_probs/python500k_unigram_prob.json \
#     --model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000

NUM_GPUS=8
total_jsonls=8
total_jobs=1
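
# Active run: split each JSONL chunk into entropy-based windows.
# One background job is launched per (chunk, process) pair, chunks are assigned
# to GPUs round-robin, and each job logs to jsonl<IDX>_process<ID>_total<N>.log.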
for JSONL_IDX in $(seq 1 $total_jsonls); do
    for index in $(seq 0 $((total_jobs - 1))); do
        echo "Starting job $index..."
        GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
        CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_entropy_window_split.py \
            --input_file /mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G/ocp.chunk.${JSONL_IDX}.jsonl \
            --output_dir ocpython_subsampled_50G_entropy90_splits \
            --entropy_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
            --compression_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
            --data_batch_size 256 \
            --max_entropy_batch_size 256 --max_compression_batch_size 8192 \
            --num_workers 1 --process_id $index --num_processes $total_jobs \
            --base_global_quantile 0.9 --base_monotonic_quantile 0.9 \
            --chunk_size 2048 > jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log 2>&1 &
    done
done
wait
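
# Commented-out variant: offline compression over the 0.90-quantile entropy splits,
# pairing the 40M entropy checkpoint with the 6M compression checkpoint.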
# for JSONL_IDX in $(seq 1 $total_jsonls); do
#     for index in $(seq 0 $((total_jobs - 1))); do
#         echo "Starting job $index..."
#         GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#         CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_compress_m1_entropy_splits.py \
#             --input_file /mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G/ocp.chunk.${JSONL_IDX}.jsonl \
#             --output_dir ocpython_subsampled_50G_entropy90_splits \
#             --entropy_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
#             --compression_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000 \
#             --data_batch_size 64 --output_window_size 24 --max_window_size 64 \
#             --max_entropy_batch_size 256 --max_compression_batch_size 8192 \
#             --num_workers 1 --process_id $index --num_processes $total_jobs \
#             --base_global_quantile 0.90 --base_monotonic_quantile 0.90 \
#             --chunk_size 2048 > jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log 2>&1 &
#     done
# done
# wait
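
# Commented-out variant: same compression pass with 0.95 quantiles, writing to
# ocpython_subsampled_50G_entropy95_splits.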
# for JSONL_IDX in $(seq 1 $total_jsonls); do
#     for index in $(seq 0 $((total_jobs - 1))); do
#         echo "Starting job $index..."
#         GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
#         CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_compress_m1_entropy_splits.py \
#             --input_file /mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G/ocp.chunk.${JSONL_IDX}.jsonl \
#             --output_dir ocpython_subsampled_50G_entropy95_splits \
#             --entropy_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
#             --compression_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_6M_lr1e-2_steps50k_bs128_seqlen512/checkpoints/0000050000 \
#             --data_batch_size 64 --output_window_size 24 --max_window_size 64 \
#             --max_entropy_batch_size 256 --max_compression_batch_size 8192 \
#             --num_workers 1 --process_id $index --num_processes $total_jobs \
#             --base_global_quantile 0.95 --base_monotonic_quantile 0.95 \
#             --chunk_size 2048 > jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log 2>&1 &
#     done
# done
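
# Commented-out variant: fixed output-window compression (offline_compress_m1_outputwindow_v3.py),
# one chunk per GPU (no round-robin wrap), writing to ocpython_subsampled_50G_outputwindow_24.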
# for JSONL_IDX in $(seq 1 $total_jsonls); do
#     for index in $(seq 0 $((total_jobs - 1))); do
#         echo "Starting job $index..."
#         GPU_IDX=$(( JSONL_IDX - 1 ))
#         CUDA_VISIBLE_DEVICES=${GPU_IDX} python3 offline_compress_m1_outputwindow_v3.py \
#             --input_file /mnt/hdfs/user/linzheng/data/ocpython_subsampled_50G/ocp.chunk.${JSONL_IDX}.jsonl \
#             --output_dir ocpython_subsampled_50G_outputwindow_24 \
#             --model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python/checkpoints/0000200000 \
#             --data_batch_size 512 --output_window_size 24 --max_m1_batch_size 4096 --max_window_size 64 \
#             --num_workers 1 --process_id $index --num_processes $total_jobs \
#             > gpu${GPU_IDX}_process${index}_total${total_jobs}.log 2>&1 &
#     done
# done