embedding LR 0.8 to 0.9 (re-test with WD)

autoresearch
2026-03-08 14:23:02 +00:00
parent 637f82f215
commit b1d50048d9
1 file changed, 1 insertion(+), 1 deletion(-)
@@ -435,7 +435,7 @@ WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
 # Optimization
 TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
-EMBEDDING_LR = 0.8 # learning rate for token embeddings (Adam)
+EMBEDDING_LR = 0.9 # learning rate for token embeddings (Adam)
 UNEMBEDDING_LR = 0.005 # learning rate for lm_head (Adam)
 MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)
 SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam)
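
For context, a minimal sketch of how per-group learning rates like these are typically wired into optimizer parameter groups. This is an illustration only, not the repo's actual training code: the toy model, its attribute names, and the use of plain Adam in place of the repo's Muon optimizer for matrix parameters are all assumptions.

import torch
import torch.nn as nn

# Constants from the config above (post-commit values).
EMBEDDING_LR = 0.9
UNEMBEDDING_LR = 0.005
MATRIX_LR = 0.04
SCALAR_LR = 0.5

class ToyLM(nn.Module):
    """Stand-in model; the real architecture is not shown in this diff."""
    def __init__(self, vocab=256, dim=32):
        super().__init__()
        self.wte = nn.Embedding(vocab, dim)               # token embeddings
        self.proj = nn.Linear(dim, dim, bias=False)       # a matrix parameter
        self.scale = nn.Parameter(torch.ones(1))          # a per-layer scalar
        self.lm_head = nn.Linear(dim, vocab, bias=False)  # unembedding

model = ToyLM()

# Adam covers embeddings, unembedding, and scalars, each at its own LR.
adam = torch.optim.Adam([
    {"params": model.wte.parameters(),     "lr": EMBEDDING_LR},
    {"params": model.lm_head.parameters(), "lr": UNEMBEDDING_LR},
    {"params": [model.scale],              "lr": SCALAR_LR},
])

# Per the comments above, 2-D matrix parameters go to Muon at MATRIX_LR;
# plain Adam stands in here only so the sketch runs without that dependency.
matrix_opt = torch.optim.Adam(model.proj.parameters(), lr=MATRIX_LR)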