diff --git a/train.py b/train.py
index 668914a..c6c377b 100644
--- a/train.py
+++ b/train.py
@@ -278,7 +278,7 @@ class GPT(nn.Module):
             x = block(x, ve, cos_sin, self.window_sizes[i])
         x = norm(x)
 
-        softcap = 15
+        softcap = 17
         logits = self.lm_head(x)
         logits = logits.float()
         logits = softcap * torch.tanh(logits / softcap)