SSSSL window pattern (4:1 short:long ratio)

This commit is contained in:
autoresearch
2026-03-08 04:01:58 +00:00
parent 4e6697f68d
commit 8363d52e8d
+1 -1
View File
@@ -431,7 +431,7 @@ class MuonAdamW(torch.optim.Optimizer):
# Model architecture
ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO
HEAD_DIM = 128 # target head dimension for attention
WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context
WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
# Optimization
TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step