add experiment results log (125 experiments, best val_bpb=0.969686)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit fedfef398b (parent 216eeb8d6e)
Author: autoresearch
Date:   2026-03-08 16:22:01 +00:00
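For context, val_bpb below is presumably validation bits per byte: next-token cross-entropy converted from nats to bits and normalized by bytes of raw text rather than tokens, which keeps scores comparable across tokenizers. A minimal sketch of that conversion, assuming this normalization (the repo's exact definition may differ):

    import math

    def bits_per_byte(sum_loss_nats, total_bytes):
        # Cross-entropy summed in nats over the val set, converted to bits
        # (divide by ln 2) and normalized by the byte count of the raw text.
        return sum_loss_nats / (math.log(2) * total_bytes)
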
commit val_bpb memory_gb status description
baseline 0.997900 44.0 keep baseline
bea057b 0.986041 43.9 keep halve batch 524K to 262K (more steps in 5 min)
7f2a65c 0.981773 60.2 keep depth 9 aspect_ratio 57 (extra layer dim ~512)
187e419 0.982603 60.2 discard add 5% warmup
4e6697f 0.981201 60.2 keep warmdown 0.5 to 0.7
8363d52 0.980903 60.2 keep SSSSL window pattern (4:1 short:long)
7da0b67 0.979969 60.2 keep short window 1/8 context (256 tokens)
59e9dd9 0.978784 60.2 keep RoPE base frequency 10K to 200K
7d047e4 0.975524 60.2 keep embedding LR 0.6 to 0.8
c4ce95c 0.975895 60.2 discard unembedding LR 0.004 to 0.008
0640555 0.974729 60.2 keep x0_lambda init 0.1 to 0.05
772dada 0.974119 60.2 keep FINAL_LR_FRAC 0.0 to 0.05
ccf6012 0.974903 60.2 discard matrix LR 0.04 to 0.045
aa8f408 0.973104 60.2 keep unembedding LR 0.004 to 0.006
889dbed 0.973799 60.2 discard random seed 42 to 137
e05a87d 0.000000 0.0 crash batch 131K (assert fail: not divisible by device batch)
0dc6130 0.974134 60.2 discard embedding LR 0.8 to 1.0
fa14910 0.973824 60.2 discard softcap 15 to 20
187db84 0.973659 60.2 discard warmdown 0.7 to 0.8
3ec9cfa 0.979340 53.7 discard depth 10 aspect 51 dim 512 (too narrow)
2913af9 0.973177 60.2 discard weight decay 0.2 to 0.15
a7aa309 0.972849 60.2 keep Muon momentum warmup 300 to 200 steps
6b6b241 0.973385 60.2 discard VE gate channels 32 to 48
77d7a47 0.973121 60.2 discard scalar LR 0.5 to 0.3
d6cad11 0.973130 60.2 discard Adam beta1 0.8 to 0.85
b19f649 0.978313 60.2 discard remove cautious WD mask (much worse)
9aa1e29 0.974490 60.2 discard FINAL_LR_FRAC 0.05 to 0.1
aa61e0b 0.973658 60.2 discard gradient clipping max_norm=1.0
31b838e 0.973706 60.2 discard Muon ns_steps 5 to 4
ebff004 0.973076 60.2 discard LR scale reference 768 to 640
d3c7143 0.973339 60.2 discard Muon final momentum 0.95 to 0.96
a6b2ac7 0.973119 60.2 discard Muon beta2 0.95 to 0.90
6b59591 0.972821 60.2 discard VE gate scale 2 to 3 (flat)
d41c1df 0.976114 60.2 discard resid_lambda init 1.0 to 0.9
97aa364 0.973828 60.2 discard matrix LR 0.04 to 0.035
5db6ed4 0.979735 59.5 discard VE only last 3 layers (much worse)
8189d27 0.973894 60.2 discard embedding LR 0.8 to 0.9
7f63c17 0.972779 60.2 keep unembedding LR 0.006 to 0.005
d0662d1 0.974038 60.2 discard RoPE base 200K to 400K
d3840ec 0.974356 60.2 discard constant WD at 0.1 (decaying better)
264a05b 0.972694 60.2 keep add WD 0.01 to lm_head
7d3f0e4 0.972847 60.2 discard softcap 15 to 13
00a7c09 0.979754 72.6 discard depth 11 dim 640 (too big, too few steps)
674a510 0.975033 60.2 discard add WD 0.01 to embeddings (hurts)
b1f02f7 0.975328 60.2 discard add 2% warmup (any warmup hurts)
81261f5 0.973469 60.2 discard halve value embedding LR
51f0499 0.972844 60.2 discard x0_lambda beta1 0.96 to 0.90
de48b64 0.974912 60.2 discard SSSL pattern (more long layers hurt steps)
01a3c69 0.973105 60.2 discard FINAL_LR_FRAC 0.05 to 0.02
86c7e66 0.974639 60.2 discard lm_head init std 0.001 to 0.01
489bb99 0.976462 60.2 discard x0_lambda init 0.0 (x0 skip important)
a16391b 0.973059 60.2 discard rotary precompute 10x to 2x
8dd93ec 0.972712 60.2 discard VE LR 1.5x (flat)
802d184 0.974123 60.2 discard embedding init std 1.0 to 2.0
2b9a688 0.974331 60.2 discard sqrt WD schedule
ffcb3c2 0.972982 60.2 discard Muon start momentum 0.85 to 0.80
3cde993 0.974655 66.2 discard depth 10 same dim 640 (too few steps)
8be9036 0.975285 53.9 discard depth 8 dim 640 (too shallow)
2271cc2 0.974190 60.2 discard WD follows LR schedule
46cf5f2 0.983719 54.6 discard parallel attn+MLP (much worse)
59316b9 0.973312 60.2 discard warmdown 0.7 to 0.65
c4b0731 0.973803 57.4 discard MLP hidden 4x to 3.5x
6193116 0.973173 60.2 discard RoPE base 200K to 150K
c1f79a6 0.973005 60.2 discard FINAL_LR_FRAC 0.05 to 0.03
ee60bf7 0.976203 60.2 discard SSSSSL pattern (too few long layers)
a7b953a 0.973088 60.2 discard lm_head WD 0.01 to 0.05
41d50a8 0.972258 60.2 keep reduce transformer init scale by 0.8x
991abb2 0.972721 60.2 discard init scale 0.6x (0.8 better)
f5979a7 0.972128 60.2 keep init scale 0.7x
2216fd6 0.973025 60.2 discard init scale 0.65x (0.7 better)
ddcd35a 0.972587 60.2 discard embedding init std 1.0 to 0.7
8934eec 0.972776 60.2 discard lm_head init std 0.001 to 0.002
92b4765 0.973847 60.2 discard small random init for c_proj (worse)
d385aa7 0.972901 60.2 discard scalar LR 0.5 to 0.7
db37d12 0.973155 60.2 discard unembedding LR 0.005 to 0.004
f04daec 0.973155 60.2 discard weight decay 0.2 to 0.25
d931c3a 0.975790 60.2 discard x0_lambda init 0.05 to 0.04 (worse)
c5a4645 0.972216 60.2 discard VE init scale 0.5x of transformer init
30f1b8d 0.973361 60.2 discard cosine warmdown schedule (linear better)
5a9c951 0.972877 63.1 discard MLP hidden 4x to 4.5x (fewer steps)
ab8f970 0.975964 60.2 discard decreasing resid_lambda init (hurts)
2a3f587 0.972901 60.2 discard softcap 15 to 14
362937e 0.972495 60.2 discard VE gate channels 32 to 16
0d77d4d 0.972621 60.2 discard Adam beta2 0.95 to 0.99
4eebd43 0.973493 60.2 discard x0_lambda LR 2x
b85567f 0.979987 52.0 discard multi-query attention n_kv_head=1 (too few KV heads)
0da44e6 0.973545 60.2 discard small nonzero init for c_proj (zero better)
d6c139a 0.973831 60.2 discard embedding init std 1.0 to 0.5
d70987b 3.215849 60.2 discard weight tying (shared embed/unembed, broken)
bff5cda 0.975852 59.5 discard VE every 3rd layer (too few VEs)
5953d58 0.973423 60.2 discard WD constant until warmdown then decay
d1eb994 0.974314 60.2 discard smaller QK init 0.5x (uniform init matters for Muon)
3c19fba 0.974046 60.2 discard depth-dependent init scale 1/sqrt(layer+1)
119065a 0.972335 60.2 discard init scale 0.7 to 0.72
97dda85 0.972097 60.2 keep init scale 0.7 to 0.68
58b8b7a 0.972350 60.2 discard init scale 0.68 to 0.66 (0.68 better)
70c2737 0.972731 60.2 discard Muon NorMuon beta2 0.95 to 0.98
8232e01 0.973000 60.2 discard resid_lambda LR 0.01x to 0.04x
21389c4 0.973723 60.2 discard Adam beta1 0.8 to 0.9
e4c0f3e 0.974043 60.2 discard short window 1/6 context (slower)
2e2a2f8 0.972632 60.2 discard short window 1/10 context (quality loss)
9db7b86 0.972744 60.2 discard lm_head init std 0.001 to 0.0005
ece9101 0.972009 60.2 keep tiny embedding WD 0.001
b07c56b 0.972438 60.2 discard embedding WD 0.001 to 0.002
1a85362 0.971058 60.2 keep tiny VE WD 0.001
73c77ca 0.970655 60.2 keep VE WD 0.001 to 0.002
637f82f 0.970433 60.2 keep VE WD 0.002 to 0.003
c152812 0.970644 60.2 discard VE WD 0.003 to 0.005 (0.003 better)
efd2171 0.970703 60.2 discard embedding WD 0.001 to 0.002
328de7c 0.970612 60.2 discard lm_head WD 0.01 to 0.02
c0c2349 0.970758 60.2 discard lm_head WD 0.01 to 0.005
b1d5004 0.969952 60.2 keep embedding LR 0.8 to 0.9 (with WD)
2ca8872 0.970767 60.2 discard embedding LR 0.9 to 1.0
74a3b33 0.970759 60.2 discard unembedding LR 0.005 to 0.006
d1f68da 0.970106 60.2 discard embedding WD 0.001 to 0.002 (with LR 0.9)
ebbe8c0 0.971004 60.2 discard matrix LR 0.04 to 0.045
b9ee7d6 0.970040 60.2 discard VE WD 0.003 to 0.004
2f0a8ec 0.970573 60.2 discard Muon WD 0.2 to 0.22
438a26e 0.969686 60.2 keep warmdown 0.7 to 0.75
d9322b9 0.970244 60.2 discard warmdown 0.75 to 0.8
8876cf3 0.969714 60.2 discard FINAL_LR_FRAC 0.05 to 0.03
80330e2 0.970135 60.2 discard x0_lambda init 0.05 to 0.06
2f0cec6 0.970678 60.2 discard RoPE base 200K to 300K
c044a14 0.970212 60.2 discard VE gate scale 2 to 3
80a519a 0.969857 60.2 discard VE LR 1.5x with WD
a6b6476 0.970286 60.2 discard Muon momentum warmup 200 to 150 steps
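
The most-revisited knobs above are the LR-schedule ones: warmdown fraction (4e6697f, 438a26e), FINAL_LR_FRAC (772dada), and the discarded warmup runs (187e419, b1f02f7). A minimal sketch of the schedule these rows imply, assuming warmdown is the fraction of training spent in a linear decay and FINAL_LR_FRAC is the floor as a fraction of peak LR; names are illustrative, not the repo's actual identifiers:

    def lr_multiplier(step, total_steps, warmdown_frac=0.75, final_lr_frac=0.05):
        # Constant phase, no warmup (both warmup runs, 187e419 and b1f02f7, hurt).
        warmdown_start = int((1.0 - warmdown_frac) * total_steps)
        if step < warmdown_start:
            return 1.0
        # Linear decay from 1.0 to final_lr_frac over the last warmdown_frac of
        # training (linear beat cosine in 30f1b8d); defaults are the kept values.
        progress = (step - warmdown_start) / max(1, total_steps - warmdown_start)
        return 1.0 - (1.0 - final_lr_frac) * progress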
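
The window-pattern rows (8363d52, 7da0b67, plus the discarded SSSL and SSSSSL variants) trade attention cost against quality by giving most layers a short sliding window and a few layers full context. One plausible reading, assuming the pattern string repeats across layers; seq_len 2048 follows from "1/8 context (256 tokens)":

    def layer_window_sizes(n_layers, seq_len=2048, pattern="SSSSL"):
        # 'S' layers attend within a short window of 1/8 context (7da0b67);
        # 'L' layers attend over the full sequence.
        short = seq_len // 8  # 256 tokens
        return [seq_len if pattern[i % len(pattern)] == "L" else short
                for i in range(n_layers)]

    # e.g. layer_window_sizes(9) -> [256, 256, 256, 256, 2048, 256, 256, 256, 256]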