From 59e9dd9aab75d2a3e46e32c9458beb9a0a2d147f Mon Sep 17 00:00:00 2001 From: autoresearch Date: Sun, 8 Mar 2026 04:13:30 +0000 Subject: [PATCH] RoPE base frequency 10K to 200K --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 11d27e7..fdef2cb 100644 --- a/train.py +++ b/train.py @@ -179,7 +179,7 @@ class GPT(nn.Module): for ve in self.value_embeds.values(): ve.to(dtype=torch.bfloat16) - def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): + def _precompute_rotary_embeddings(self, seq_len, head_dim, base=200000, device=None): if device is None: device = self.transformer.wte.weight.device channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)