diff --git a/train.py b/train.py
index 3ac6503..668914a 100644
--- a/train.py
+++ b/train.py
@@ -442,7 +442,7 @@ SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam)
 WEIGHT_DECAY = 0.2 # cautious weight decay for Muon
 ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2
 WARMUP_RATIO = 0.0 # fraction of time budget for LR warmup
-WARMDOWN_RATIO = 0.7 # fraction of time budget for LR warmdown
+WARMDOWN_RATIO = 0.75 # fraction of time budget for LR warmdown
 FINAL_LR_FRAC = 0.05 # final LR as fraction of initial
 
 # Model size