Line Chart · 折线图#confidence-band#ema-smoothing#latex#boxed-axes

Self-Distillation · LiveCodeBench training curve with confidence band

Self-Distillation · LiveCodeBench 训练曲线 + 置信区间阴影

Continuous training curve on LiveCodeBench v6: SDPO (green) vs GRPO (blue), each with a half-transparent confidence band (alpha≈0.20) plotted under the EMA-smoothed mean. A grey dotted Claude reference line sits on top. LaTeX Computer Modern serif, 4-side box, ticks pointing inward, frameless legend with the proposed method in bold. The shipped script also produces a second figure (model scaling) — see the sibling `line_selfdistill_scale` entry.

LiveCodeBench v6 上的连续训练曲线：SDPO（绿）vs GRPO（蓝），每条线下方配半透明置信区间（alpha≈0.20），上方是灰色点线 Claude 参考。LaTeX Computer Modern serif，四边框，刻度向内，无边框图例（主方法粗体）。同一脚本还会生成第二张 model scaling 图（见 line_selfdistill_scale 条目）。

@paper · 来自论文

Reinforcement Learning via Self-Distillation

通过自蒸馏的强化学习

ETH Zurich, MPI, MIT, Stanford (lasgroup/SDPO) · arXiv 2026

arXiv:2601.20802 ↗PDF ↗code ↗

// original from paper · 论文原图

// reproduced via line_selfdistill_train.py · 脚本复现download png

line_selfdistill_train.py

download .py

"""
复现 image2 & image3: Self-distillation 论文折线图
image2: 连续训练曲线 + 置信区间阴影 + 水平参考线
image3: 离散点折线 + 置信区间阴影（模型规模 scaling）
来源：Reinforcement learning via self-distillation
"""

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# ── 预分析结论 ─────────────────────────────────────────────
# 字体：serif，接近 LaTeX Computer Modern，启用 usetex
# 加粗：标题 normal | 图例 SDPO bold | 其他 normal
# Spine：只保留左/下（开口式）
# Grid：无
# 颜色：绿 #3A8B3A (SDPO) | 蓝 #3B6BB5 (GRPO) | 灰 #999999 (base)
# 阴影：主线颜色 alpha=0.15 的半透明填充
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'],
    'axes.unicode_minus': False,
})

C_SDPO  = '#2CA02C'   # matplotlib tab green
C_GRPO  = '#1F77B4'   # matplotlib tab blue
C_BASE  = '#BCBCBC'   # 浅灰，原图 base model

# ══════════════════════════════════════════════════════════
# 图 2：连续训练曲线（LiveCodeBench v6）
# ══════════════════════════════════════════════════════════
np.random.seed(42)

steps = np.linspace(0, 20000, 400)   # 更多点 + EMA 后更平滑

def raw_curve(start, end, steps, noise=0.008):
    t = steps / steps[-1]
    curve = start + (end - start) * (1 - np.exp(-4 * t))
    curve += np.random.normal(0, noise, len(steps)) * (1 - t * 0.7)
    return curve

def ema(arr, alpha=0.96):
    """Exponential moving average — 模拟论文中对 training log 的平滑"""
    out = np.zeros_like(arr)
    out[0] = arr[0]
    for i in range(1, len(arr)):
        out[i] = alpha * out[i - 1] + (1 - alpha) * arr[i]
    return out

# 中心线：先生成有噪声的曲线，再 EMA 平滑（与论文一致）
sdpo_mean = ema(raw_curve(0.285, 0.490, steps, noise=0.006))
sdpo_std  = 0.012 * np.exp(-2 * steps / steps[-1]) + 0.007
grpo_mean = ema(raw_curve(0.285, 0.415, steps, noise=0.006))
grpo_std  = 0.010 * np.exp(-2 * steps / steps[-1]) + 0.006

fig2, ax2 = plt.subplots(figsize=(6.5, 4.4))

ax2.fill_between(steps, sdpo_mean - sdpo_std, sdpo_mean + sdpo_std,
                 color=C_SDPO, alpha=0.20)
ax2.fill_between(steps, grpo_mean - grpo_std, grpo_mean + grpo_std,
                 color=C_GRPO, alpha=0.20)
ax2.plot(steps, sdpo_mean, color=C_SDPO, lw=2.5, label=r'\textbf{SDPO}')
ax2.plot(steps, grpo_mean, color=C_GRPO, lw=2.5, label='GRPO')
# 原图 Claude 参考线为稀疏圆点线，非虚线
ax2.axhline(0.406, color='#AAAAAA', lw=1.8,
            linestyle=(0, (1, 2)), label='Claude Sonnet 4')

ax2.set_xlim(0, 20000)
ax2.set_ylim(0.28, 0.52)
ax2.set_xlabel('Number of Generations', fontsize=13)
ax2.set_ylabel('Accuracy', fontsize=13)
ax2.set_title('LiveCodeBench v6', fontsize=15, pad=7)

ax2.xaxis.set_major_formatter(ticker.FuncFormatter(
    lambda x, _: f'{int(x/1000)}k' if x > 0 else '0'))
ax2.xaxis.set_major_locator(ticker.MultipleLocator(5000))
ax2.yaxis.set_major_locator(ticker.MultipleLocator(0.05))

leg2 = ax2.legend(fontsize=11, loc='lower right',
                  framealpha=0, edgecolor='none',
                  handlelength=2.2, borderaxespad=0.5, labelspacing=0.3)
for text in leg2.get_texts():
    if 'SDPO' in text.get_text():
        text.set_fontweight('bold')

# 四边框 + 向内刻度（与原图一致）
for sp in ax2.spines.values():
    sp.set_visible(True)
    sp.set_linewidth(1.5)
ax2.tick_params(direction='in', length=5, width=1.2, labelsize=11)
ax2.grid(False)

fig2.tight_layout(pad=0.9)
fig2.savefig('line_selfdistill_v6_repro.png',
             dpi=300, facecolor='white')
plt.close(fig2)
print('saved: line_selfdistill_v6_repro.png')

# ══════════════════════════════════════════════════════════
# 图 3：模型 scaling 折线（Model scaling Qwen3）
# ══════════════════════════════════════════════════════════
param_labels = ['0.6', '1.7', '4', '8']
param_x      = [0.6,   1.7,   4,   8]
x_pos        = [0, 1, 2, 3]   # 等间距排列，x 轴用 param_labels

sdpo_pts = [0.215, 0.333, 0.450, 0.490]
grpo_pts = [0.212, 0.295, 0.400, 0.414]
base_pts = [0.095, 0.150, 0.233, 0.284]

sdpo_std3 = [0.005, 0.006, 0.008, 0.006]
grpo_std3 = [0.005, 0.006, 0.007, 0.006]

fig3, ax3 = plt.subplots(figsize=(10, 5))   # 2:1 宽高比，与原图一致

ax3.fill_between(x_pos,
                 [v - s for v, s in zip(sdpo_pts, sdpo_std3)],
                 [v + s for v, s in zip(sdpo_pts, sdpo_std3)],
                 color=C_SDPO, alpha=0.18)
ax3.fill_between(x_pos,
                 [v - s for v, s in zip(grpo_pts, grpo_std3)],
                 [v + s for v, s in zip(grpo_pts, grpo_std3)],
                 color=C_GRPO, alpha=0.18)

MEC = 'black'   # 原图标记点有黑色描边
ax3.plot(x_pos, sdpo_pts, color=C_SDPO, lw=2.5,
         marker='o', ms=7, mfc=C_SDPO,
         markeredgecolor=MEC, markeredgewidth=1.0,
         label=r'\textbf{SDPO}')
ax3.plot(x_pos, grpo_pts, color=C_GRPO, lw=2.5,
         marker='o', ms=7, mfc=C_GRPO,
         markeredgecolor=MEC, markeredgewidth=1.0,
         label='GRPO')
ax3.plot(x_pos, base_pts, color=C_BASE, lw=2.5,   # 与主线同粗
         marker='o', ms=7, mfc=C_BASE,
         markeredgecolor=MEC, markeredgewidth=1.0,
         label='base model')

ax3.set_xticks(x_pos)
ax3.set_xticklabels(param_labels, fontsize=11)
ax3.set_xlim(-0.35, 3.35)
ax3.set_ylim(0.08, 0.51)   # 与原图 0.1~0.5 刻度对齐，留极小顶部空
ax3.set_xlabel('Model parameters (B)', fontsize=13)
ax3.set_ylabel(r'\textit{Accuracy}', fontsize=13)
ax3.set_title('Model scaling (Qwen3)', fontsize=15, pad=7)
ax3.yaxis.set_major_locator(ticker.MultipleLocator(0.1))

# 图例移右下（原图位置）
leg3 = ax3.legend(fontsize=11, loc='lower right',
                  bbox_to_anchor=(0.98, 0.02),
                  framealpha=0, edgecolor='none',
                  handlelength=2.2, borderaxespad=0.5, labelspacing=0.3)
for text in leg3.get_texts():
    if 'SDPO' in text.get_text():
        text.set_fontweight('bold')

# 四边框 + 向内刻度
for sp in ax3.spines.values():
    sp.set_visible(True)
    sp.set_linewidth(1.5)
ax3.tick_params(direction='in', length=5, width=1.2, labelsize=11)
ax3.grid(False)

fig3.tight_layout(pad=0.9)
fig3.savefig('line_selfdistill_scaling_repro.png',
             dpi=300, facecolor='white')
plt.close(fig3)
print('saved: line_selfdistill_scaling_repro.png')