From 7f63c17076a3bdd65f28f513fa888165ab0a8ea6 Mon Sep 17 00:00:00 2001
From: autoresearch
Date: Sun, 8 Mar 2026 07:05:48 +0000
Subject: [PATCH] unembedding LR 0.006 to 0.005

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 41bf1e8..e06466b 100644
--- a/train.py
+++ b/train.py
@@ -436,7 +436,7 @@ WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
 # Optimization
 TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
 EMBEDDING_LR = 0.8 # learning rate for token embeddings (Adam)
-UNEMBEDDING_LR = 0.006 # learning rate for lm_head (Adam)
+UNEMBEDDING_LR = 0.005 # learning rate for lm_head (Adam)
 MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)
 SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam)
 WEIGHT_DECAY = 0.2 # cautious weight decay for Muon