From f5979a746462b104064955d0c75df2bb35ed56f1 Mon Sep 17 00:00:00 2001 From: autoresearch Date: Sun, 8 Mar 2026 10:14:06 +0000 Subject: [PATCH] init scale 0.7x --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 867d33c..51a6db9 100644 --- a/train.py +++ b/train.py @@ -152,7 +152,7 @@ class GPT(nn.Module): torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001) # Transformer blocks n_embd = self.config.n_embd - s = 0.8 * 3**0.5 * n_embd**-0.5 + s = 0.7 * 3**0.5 * n_embd**-0.5 for block in self.transformer.h: torch.nn.init.uniform_(block.attn.c_q.weight, -s, s) torch.nn.init.uniform_(block.attn.c_k.weight, -s, s)