diff --git a/train.py b/train.py
index d1c6b9f..3ac6503 100644
--- a/train.py
+++ b/train.py
@@ -435,7 +435,7 @@ WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
 
 # Optimization
 TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
-EMBEDDING_LR = 0.8 # learning rate for token embeddings (Adam)
+EMBEDDING_LR = 0.9 # learning rate for token embeddings (Adam)
 UNEMBEDDING_LR = 0.005 # learning rate for lm_head (Adam)
 MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)
 SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam)