#!/bin/bash

# Dataset selection.
#
# Arguments: $1 - dataset name; defaults to "ocp_subsampled_5G" when unset or
#                 empty. Supported: ocp_subsampled_5G, ocp_subsampled_50G.
# Sets:      input_dir, splits_dir, compression_dir (read by the stages below).
# Exits:     1 on an unknown dataset name or when the subsample cannot be built.
DATA=${1:-"ocp_subsampled_5G"}

case "$DATA" in
  ocp_subsampled_5G)
    input_dir="ocp_subsampled_5G"
    splits_dir="ocp_subsampled_5G_entropy90_splits_ac"
    compression_dir="ocp_subsampled_5G_enumerative_compression_ac"

    #### Step 1: setup subsampled data for evaluation
    if [[ ! -d "$input_dir" ]]; then
      source_jsonl="/mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl"
      # Build inside a staging directory and rename it only once every step
      # succeeded. Otherwise a failed head/split would leave a partial
      # "$input_dir" behind and the next run would skip the rebuild.
      staging_dir="${input_dir}.partial.$$"
      if ! {
        mkdir -- "$staging_dir" \
          && head -n 1050000 -- "$source_jsonl" > "${staging_dir}/subsample.jsonl" \
          && split -n r/8 --suffix-length=1 --numeric-suffixes=1 \
            --additional-suffix=.jsonl \
            "${staging_dir}/subsample.jsonl" "${staging_dir}/ocp.chunk." \
          && rm -- "${staging_dir}/subsample.jsonl" \
          && mv -T -- "$staging_dir" "$input_dir"
      }; then
        echo "Failed to build '$input_dir' from '$source_jsonl'." >&2
        rm -rf -- "$staging_dir"
        exit 1
      fi
    else
      echo "Directory '$input_dir' already exists."
    fi
    ;;
  ocp_subsampled_50G)
    # The 50G subsample is read from a fixed location; nothing is created here.
    input_dir="/mnt/hdfs/linzheng/data/ocpython_subsampled_50G"
    splits_dir="ocpython_subsampled_50G_entropy92_splits"
    compression_dir="ocpython_subsampled_50G_entropy92_enumerative_compression"
    ;;
  *)
    # An unsupported dataset is an error: report on stderr and exit non-zero
    # so callers do not mistake it for a successful run.
    echo "Unknown $DATA." >&2
    exit 1
    ;;
esac

# Directory that receives one log file per background job.
LOG_DIR="ac_result"
mkdir -p -- "${LOG_DIR}"

# Both model roles currently point at the same checkpoint; give
# compression_model_path its own value here if they ever need to differ.
entropy_model_path="/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000"
compression_model_path="${entropy_model_path}"

# ---- Stage 1: entropy-window splitting ----
# Starts one background offline_entropy_window_split.py process for every
# (input chunk, process index) pair. Chunk i is pinned to GPU
# (i - 1) % NUM_GPUS through CUDA_VISIBLE_DEVICES, and each process writes its
# stdout/stderr to its own file under $LOG_DIR.
NUM_GPUS=4
total_jsonls=4
total_jobs=1

# PIDs of the launched jobs, so each exit status can be checked individually.
split_pids=()

for ((JSONL_IDX = 1; JSONL_IDX <= total_jsonls; JSONL_IDX++)); do
    for ((index = 0; index < total_jobs; index++)); do
        echo "Starting job $index..."

        GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
        # NOTE(review): the 5G splits_dir is named "entropy90" while the
        # quantiles passed here are 0.92 - confirm which one is intended.
        CUDA_VISIBLE_DEVICES="${GPU_IDX}" python3 offline_entropy_window_split.py \
            --input_file "${input_dir}/ocp.chunk.${JSONL_IDX}.jsonl" \
            --output_dir "$splits_dir" \
            --entropy_model_path "$entropy_model_path" --compression_model_path "$compression_model_path" \
            --data_batch_size 256 --max_entropy_batch_size 256 \
            --num_workers 1 --process_id "$index" --num_processes "$total_jobs" \
            --base_global_quantile 0.92 --base_monotonic_quantile 0.92 \
            --chunk_size 2048 > "${LOG_DIR}/split_jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log" 2>&1 &
        split_pids+=("$!")
    done
done

# A bare `wait` returns 0 even when jobs failed, so wait on every PID and
# count the failures; the following stage consumes this stage's output.
split_failures=0
for pid in "${split_pids[@]}"; do
    wait "$pid" || split_failures=$((split_failures + 1))
done

if (( split_failures > 0 )); then
    echo "${split_failures} split job(s) failed; see logs in '${LOG_DIR}'." >&2
    exit 1
fi


# ---- Stage 2: arithmetic-coding compression ----
# Starts one background offline_entropy_window_compress_v2.py process for
# every (split file, process index) pair, pinned to GPU
# (i - 1) % NUM_GPUS, with output logged per process under $LOG_DIR.
# NOTE(review): total_jsonls=1 means only ocp.chunk.1_out_0.jsonl is
# compressed here - confirm that processing a single split file is intended.
NUM_GPUS=1
total_jsonls=1
total_jobs=1

# PIDs of the launched jobs, so each exit status can be checked individually.
compress_pids=()

for ((JSONL_IDX = 1; JSONL_IDX <= total_jsonls; JSONL_IDX++)); do
    for ((index = 0; index < total_jobs; index++)); do
        echo "Starting job $index..."

        GPU_IDX=$(( (JSONL_IDX - 1) % NUM_GPUS ))
        CUDA_VISIBLE_DEVICES="${GPU_IDX}" python3 offline_entropy_window_compress_v2.py \
            --input_file "${splits_dir}/ocp.chunk.${JSONL_IDX}_out_0.jsonl" \
            --output_dir "$compression_dir" \
            --compressor arithmetic \
            --entropy_model_path "$entropy_model_path" --compression_model_path "$compression_model_path" \
            --data_batch_size 128 --max_compression_batch_size 1024 \
            --num_workers 1 --process_id "$index" --num_processes "$total_jobs" > "${LOG_DIR}/compress_jsonl${JSONL_IDX}_process${index}_total${total_jobs}.log" 2>&1 &
        compress_pids+=("$!")
    done
done

# Block until every compression job has finished and propagate failures;
# otherwise the script would return 0 while the jobs are still running.
compress_failures=0
for pid in "${compress_pids[@]}"; do
    wait "$pid" || compress_failures=$((compress_failures + 1))
done

if (( compress_failures > 0 )); then
    echo "${compress_failures} compression job(s) failed; see logs in '${LOG_DIR}'." >&2
    exit 1
fi


# # Debug: uncomment the lines below to launch a single compression job by hand with CUDA_LAUNCH_BLOCKING=1.
# CUDA_LAUNCH_BLOCKING=1 python3 offline_entropy_window_compress_v2.py \
# --input_file ocp_subsampled_5G_entropy90_splits_ac/ocp.chunk.1_out_0.jsonl \
# --output_dir ocp_subsampled_5G_enumerative_compression_ac \
# --compressor arithmetic \
# --entropy_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
# --compression_model_path /mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python/checkpoints/0000200000 \
# --data_batch_size 128 --max_compression_batch_size 1024 \
# --num_workers 1 --process_id 0 --num_processes 1 > ac_result/compress_jsonl1_process0_total1.log 2>&1 &