SSSSL window pattern (4:1 short:long ratio)
This commit is contained in:
@@ -431,7 +431,7 @@ class MuonAdamW(torch.optim.Optimizer):
|
||||
# Model architecture
|
||||
ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO
|
||||
HEAD_DIM = 128 # target head dimension for attention
|
||||
WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context
|
||||
WINDOW_PATTERN = "SSSSL" # sliding window pattern: L=full, S=half context
|
||||
|
||||
# Optimization
|
||||
TOTAL_BATCH_SIZE = 2**18 # ~262K tokens per optimizer step
|
||||
|
||||
Reference in New Issue
Block a user