depth 9, aspect_ratio 57 (extra layer, dim ~512)
This commit is contained in:
@@ -429,7 +429,7 @@ class MuonAdamW(torch.optim.Optimizer):

 # ---------------------------------------------------------------------------
 # Model architecture
-ASPECT_RATIO = 64        # model_dim = depth * ASPECT_RATIO
+ASPECT_RATIO = 57        # model_dim = depth * ASPECT_RATIO
 HEAD_DIM = 128           # target head dimension for attention
 WINDOW_PATTERN = "SSSL"  # sliding window pattern: L=full, S=half context

@@ -446,7 +446,7 @@ WARMDOWN_RATIO = 0.5  # fraction of time budget for LR warmdown
 FINAL_LR_FRAC = 0.0  # final LR as fraction of initial

 # Model size
-DEPTH = 8                # number of transformer layers
+DEPTH = 9                # number of transformer layers
 DEVICE_BATCH_SIZE = 128  # per-device batch size (reduce if OOM)

 # ---------------------------------------------------------------------------

Reference in New Issue
Block a user