{
    "output_dir": "/workspace/experiments/vietnamese_finetune/vi_finetune_20260405_033526",
    "data_config": "configs/data_vietnamese_finetune.json",
    "llm_name_or_path": "Qwen/Qwen3-0.6B",
    "audio_vocab_size": 1025,
    "audio_mask_id": 1024,
    "num_audio_codebook": 8,
    "audio_codebook_weights": [
        8,
        8,
        6,
        6,
        4,
        4,
        2,
        2
    ],
    "drop_cond_ratio": 0.1,
    "prompt_ratio_range": [
        0.0,
        0.3
    ],
    "mask_ratio_range": [
        0.0,
        1.0
    ],
    "language_ratio": 1.0,
    "use_pinyin_ratio": 0.0,
    "instruct_ratio": 0.0,
    "only_instruct_ratio": 0.0,
    "resume_from_checkpoint": null,
    "init_from_checkpoint": "k2-fsa/OmniVoice",
    "learning_rate": 3e-05,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "steps": 8000,
    "seed": 42,
    "lr_scheduler_type": "cosine",
    "warmup_type": "ratio",
    "warmup_ratio": 0.03,
    "warmup_steps": 0,
    "batch_tokens": 32768,
    "gradient_accumulation_steps": 2,
    "num_workers": 8,
    "mixed_precision": "bf16",
    "allow_tf32": true,
    "use_deepspeed": false,
    "deepspeed_config": null,
    "log_with": "tensorboard,wandb",
    "logging_steps": 50,
    "eval_steps": 500,
    "save_steps": 1000,
    "keep_last_n_checkpoints": 5
}