depth 9, aspect_ratio 57 (extra layer, dim ~512)
This commit is contained in:
@@ -429,7 +429,7 @@ class MuonAdamW(torch.optim.Optimizer):

 # ---------------------------------------------------------------------------
 # Model architecture
-ASPECT_RATIO = 64        # model_dim = depth * ASPECT_RATIO
+ASPECT_RATIO = 57        # model_dim = depth * ASPECT_RATIO
 HEAD_DIM = 128           # target head dimension for attention
 WINDOW_PATTERN = "SSSL"  # sliding window pattern: L=full, S=half context

@@ -446,7 +446,7 @@ WARMDOWN_RATIO = 0.5  # fraction of time budget for LR warmdown
 FINAL_LR_FRAC = 0.0  # final LR as fraction of initial

 # Model size
-DEPTH = 8                # number of transformer layers
+DEPTH = 9                # number of transformer layers
 DEVICE_BATCH_SIZE = 128  # per-device batch size (reduce if OOM)

 # ---------------------------------------------------------------------------

Reference in New Issue
Block a user