diff --git a/train.py b/train.py
index e7ce028..e8792c6 100644
--- a/train.py
+++ b/train.py
@@ -443,7 +443,7 @@ WEIGHT_DECAY = 0.2 # cautious weight decay for Muon
 ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2
 WARMUP_RATIO = 0.0 # fraction of time budget for LR warmup
 WARMDOWN_RATIO = 0.7 # fraction of time budget for LR warmdown
-FINAL_LR_FRAC = 0.0 # final LR as fraction of initial
+FINAL_LR_FRAC = 0.05 # final LR as fraction of initial
 
 # Model size
 DEPTH = 9 # number of transformer layers