halve batch size from 524K to 262K for more steps in 5 min

This commit is contained in:
autoresearch
2026-03-08 03:38:47 +00:00
parent 500114a035
commit bea057bc08
+1 -1
View File
@@ -434,7 +434,7 @@ HEAD_DIM = 128 # target head dimension for attention
WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context
# Optimization
TOTAL_BATCH_SIZE = 2**19 # ~524K tokens per optimizer step
TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
EMBEDDING_LR = 0.6 # learning rate for token embeddings (Adam)
UNEMBEDDING_LR = 0.004 # learning rate for lm_head (Adam)
MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)