depth 9 aspect_ratio 57 (extra layer, dim ~512)

This commit is contained in:
autoresearch
2026-03-08 03:44:34 +00:00
parent bea057bc08
commit 7f2a65c9a5
+2 -2
View File
@@ -429,7 +429,7 @@ class MuonAdamW(torch.optim.Optimizer):
# ---------------------------------------------------------------------------
# Model architecture
-ASPECT_RATIO = 64 # model_dim = depth * ASPECT_RATIO
+ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO
HEAD_DIM = 128 # target head dimension for attention
WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context
@@ -446,7 +446,7 @@ WARMDOWN_RATIO = 0.5 # fraction of time budget for LR warmdown
FINAL_LR_FRAC = 0.0 # final LR as fraction of initial
# Model size
-DEPTH = 8 # number of transformer layers
+DEPTH = 9 # number of transformer layers
DEVICE_BATCH_SIZE = 128 # per-device batch size (reduce if OOM)
# ---------------------------------------------------------------------------