From bea057bc083643799bb52655983f49533da9ff4d Mon Sep 17 00:00:00 2001 From: autoresearch Date: Sun, 8 Mar 2026 03:38:47 +0000 Subject: [PATCH] halve batch size 524K to 262K for more steps in 5 min --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 6994fb9..ec85895 100644 --- a/train.py +++ b/train.py @@ -434,7 +434,7 @@ HEAD_DIM = 128 # target head dimension for attention WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context # Optimization -TOTAL_BATCH_SIZE = 2**19 # ~524K tokens per optimizer step +TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step EMBEDDING_LR = 0.6 # learning rate for token embeddings (Adam) UNEMBEDDING_LR = 0.004 # learning rate for lm_head (Adam) MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon)