From 7f2a65c9a5335adaa9fb3cf5f675c92102e8db26 Mon Sep 17 00:00:00 2001
From: autoresearch <autoresearch@local>
Date: Sun, 8 Mar 2026 03:44:34 +0000
Subject: [PATCH] DEPTH 8->9, ASPECT_RATIO 64->57 (extra layer, dim 9*57=513 ~512)

---
 train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index ec85895..b8018f8 100644
--- a/train.py
+++ b/train.py
@@ -429,7 +429,7 @@ class MuonAdamW(torch.optim.Optimizer):
 # ---------------------------------------------------------------------------
 
 # Model architecture
-ASPECT_RATIO = 64       # model_dim = depth * ASPECT_RATIO
+ASPECT_RATIO = 57       # model_dim = depth * ASPECT_RATIO
 HEAD_DIM = 128          # target head dimension for attention
 WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context
 
@@ -446,7 +446,7 @@ WARMDOWN_RATIO = 0.5    # fraction of time budget for LR warmdown
 FINAL_LR_FRAC = 0.0     # final LR as fraction of initial
 
 # Model size
-DEPTH = 8               # number of transformer layers
+DEPTH = 9               # number of transformer layers
 DEVICE_BATCH_SIZE = 128  # per-device batch size (reduce if OOM)
 
 # ---------------------------------------------------------------------------