Increase RoPE base frequency from 10K to 200K

This commit is contained in:
autoresearch
2026-03-08 04:13:30 +00:00
parent 7da0b673a1
commit 59e9dd9aab
+1 -1
View File
@@ -179,7 +179,7 @@ class GPT(nn.Module):
for ve in self.value_embeds.values():
ve.to(dtype=torch.bfloat16)
-def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None):
+def _precompute_rotary_embeddings(self, seq_len, head_dim, base=200000, device=None):
if device is None:
device = self.transformer.wte.weight.device
channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)