Lower unembedding LR from 0.006 to 0.005
This commit is contained in:
@@ -436,7 +436,7 @@ WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
|
||||
# Optimization
|
||||
TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
|
||||
EMBEDDING_LR = 0.8 # learning rate for token embeddings (Adam)
|
||||
UNEMBEDDING_LR = 0.006 # learning rate for lm_head (Adam)
|
||||
UNEMBEDDING_LR = 0.005 # learning rate for lm_head (Adam)
|
||||
MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)
|
||||
SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam)
|
||||
WEIGHT_DECAY = 0.2 # cautious weight decay for Muon
|
||||
|
||||
Reference in New Issue
Block a user