diff --git a/train.py b/train.py index 867d33c..51a6db9 100644 --- a/train.py +++ b/train.py @@ -152,7 +152,7 @@ class GPT(nn.Module): torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001) # Transformer blocks n_embd = self.config.n_embd - s = 0.8 * 3**0.5 * n_embd**-0.5 + s = 0.7 * 3**0.5 * n_embd**-0.5 for block in self.transformer.h: torch.nn.init.uniform_(block.attn.c_q.weight, -s, s) torch.nn.init.uniform_(block.attn.c_k.weight, -s, s)