From 7f2a65c9a5335adaa9fb3cf5f675c92102e8db26 Mon Sep 17 00:00:00 2001 From: autoresearch Date: Sun, 8 Mar 2026 03:44:34 +0000 Subject: [PATCH] depth 9 aspect_ratio 57 (extra layer, dim ~512) --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index ec85895..b8018f8 100644 --- a/train.py +++ b/train.py @@ -429,7 +429,7 @@ class MuonAdamW(torch.optim.Optimizer): # --------------------------------------------------------------------------- # Model architecture -ASPECT_RATIO = 64 # model_dim = depth * ASPECT_RATIO +ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO HEAD_DIM = 128 # target head dimension for attention WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context @@ -446,7 +446,7 @@ WARMDOWN_RATIO = 0.5 # fraction of time budget for LR warmdown FINAL_LR_FRAC = 0.0 # final LR as fraction of initial # Model size -DEPTH = 8 # number of transformer layers +DEPTH = 9 # number of transformer layers DEVICE_BATCH_SIZE = 128 # per-device batch size (reduce if OOM) # ---------------------------------------------------------------------------