From 8363d52e8d2210b91f7d088b7765df7c75624659 Mon Sep 17 00:00:00 2001 From: autoresearch Date: Sun, 8 Mar 2026 04:01:58 +0000 Subject: [PATCH] SSSSL window pattern (4:1 short:long ratio) --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 37bdb2c..c0efde8 100644 --- a/train.py +++ b/train.py @@ -431,7 +431,7 @@ class MuonAdamW(torch.optim.Optimizer): # Model architecture ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO HEAD_DIM = 128 # target head dimension for attention -WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context +WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context # Optimization TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step