Calibration scorecard

Can you trust these numbers?

A dashboard full of probabilities is only as valuable as those probabilities are honest. This page asks three questions of the model on the held-out 2025 season: Are its probabilities calibrated? Can it out-rank the experts? And does the blend’s hedge actually lower error over the long run?

Code

// Shared colour palette, keyed by role.
// On this page `expert`, `data` and `mixture` colour the three model series
// (chart colour ranges and table swatches); the remaining keys are not read by
// any cell visible here.
palette = ({
  accent: "#93c54b",
  accentDark: "#7aa83c",
  expert: "#b5651d",   // warm brown for the expert-driven (Model A) signal
  data:   "#3a6ea5",   // muted blue for the data-driven (Model B) signal
  mixture: "#3e3f3a",  // ink for the blended predictive
  sand: "#f8f5f0",
  // The next four key names parallel the narrativeProbs bins (bust, held_up,
  // strong, league_winner) — presumably their fill colours; confirm at the
  // call sites on the other pages.
  bust: "#c9b8a3",
  held: "#dfe7c8",
  strong: "#b9d68a",
  leagueWinner: "#93c54b"  // same hex as `accent`
})

// Render a probability on the 0–1 scale as a whole-number percent string.
//   - null / undefined / NaN (after numeric coercion)  → "—"
//   - below 1%                                         → "<1%"
//   - above 99%                                        → ">99%"
//   - otherwise                                        → e.g. "37%"
// The two capped labels implement the "no false precision" rule: the display
// never shows a flat 0% or 100%.
fmtPct = function (p) {
  const isMissing = p === null || p === undefined || isNaN(p);
  if (isMissing) return "—";
  if (p > 0.99) return ">99%";
  if (p < 0.01) return "<1%";
  return `${(100 * p).toFixed(0)}%`;
}

// Format a fantasy-point value to one decimal place.
//   x: number (or BigInt, which some query results use for integer columns).
//   Returns "—" for anything that is not a finite number: null, undefined,
//   NaN, ±Infinity, strings, objects.
// Values that round to zero from below are shown as "0.0" rather than "-0.0",
// which is what toFixed produces for small negatives such as -0.04.
fmtFp = function (x) {
  const value = typeof x === "bigint" ? Number(x) : x;
  if (typeof value !== "number" || !Number.isFinite(value)) return "—";
  const text = value.toFixed(1);
  return text === "-0.0" ? "0.0" : text;
}

// Convert three exceedance probabilities into four narrative-bin probabilities.
//   pFloor, pTarget, pCeiling: P(score exceeds floor / target / ceiling).
//   Returns { bust, held_up, strong, league_winner }: the probability of
//   landing below the floor, between floor and target, between target and
//   ceiling, and above the ceiling.
// Each bin is clamped at zero independently, so if the inputs are not
// monotone (pFloor ≥ pTarget ≥ pCeiling) the bins need not sum to 1.
narrativeProbs = function (pFloor, pTarget, pCeiling) {
  const nonNegative = (value) => Math.max(0, value);
  const belowFloor = 1 - pFloor;
  const floorToTarget = pFloor - pTarget;
  const targetToCeiling = pTarget - pCeiling;
  return {
    bust: nonNegative(belowFloor),
    held_up: nonNegative(floorToTarget),
    strong: nonNegative(targetToCeiling),
    league_winner: nonNegative(pCeiling)
  };
}

// Bucket an expert-consensus rank (ECR) into its tier label.
//   ecr: rank value; null / undefined / NaN → null.
//   Returns one of "1-5", "6-12", "13-24", "25-48", "49-96", "97+".
// The bounds are inclusive upper limits, intended to mirror the ecr_tier bins
// in the exported data.
ecrTier = function (ecr) {
  if (ecr == null || isNaN(ecr)) return null;
  const tierUpperBounds = [
    [5, "1-5"],
    [12, "6-12"],
    [24, "13-24"],
    [48, "25-48"],
    [96, "49-96"]
  ];
  const match = tierUpperBounds.find(([upper]) => ecr <= upper);
  return match ? match[1] : "97+";
}

// Lookup from archetype key (snake_case) to the lowercase display text shown
// on badges. A key that is absent from this table looks up as undefined.
// No cell visible on this page reads it.
archetypeLabel = ({
  fill_in_situation: "fill-in situation",
  emerging_player_elevation: "emerging player",
  late_season_expansion: "late-season expansion",
  recent_role_change: "recent role change",
  rookie_or_low_sample: "rookie / low sample",
  stable_veteran: "stable veteran",
  star_returning: "star returning"
})

Code

// Data sources for this page.
// The two parquet attachments are registered with DuckDB under the table
// names `cal` and `wk`.
db = DuckDBClient.of({
  cal: FileAttachment("data/calibration.parquet"),
  wk: FileAttachment("data/weekly_summary.parquet")
})
// Whole-table reads; the cells below filter and aggregate them with array
// methods and d3 rather than SQL.
// Columns read from `calibration`: slot, threshold, predictive, pred_mid,
// obs_freq, boot_lo, boot_hi, n.
calibration = db.query(`SELECT * FROM cal`)
// Columns read from `weekly`: hit12_* and hit24_* for ecr, cross_blend, pure_data.
weekly = db.query(`SELECT * FROM wk`)
// Read as cfg.thresholds[slot][threshold] to get the fantasy-point line.
cfg = FileAttachment("data/locked_config.json").json()
// Fields read: expert / data / blend (each with mae, rmse, crps),
// n_player_weeks, and times_closest (expert, data, n_decided).
accuracy = FileAttachment("data/accuracy_summary.json").json()

// Background for the leading cell in the accuracy table; the RGB triple is
// palette.accent (#93c54b) at 18% opacity.
ACCENT_TINT = "rgba(147, 197, 75, 0.18)"
// Maps the `predictive` column of the calibration table to a display label.
// Rows whose `predictive` value is not a key here are dropped by the
// calibration cell.
SRC_LABEL = ({ expert_marginal: "Expert", data_marginal: "Data", cross_blend: "Blend" })
// Parallel arrays: SRC_RANGE[i] is the colour for SRC_DOMAIN[i]. Keep the
// two in the same order.
SRC_DOMAIN = ["Expert", "Data", "Blend"]
SRC_RANGE = [palette.expert, palette.data, palette.mixture]

1 · Are the probabilities calibrated?

Code

{
  const tv = cfg.thresholds[calSlot][calThr];
  const rows = calibration
    .filter(d => d.slot === calSlot && d.threshold === calThr && SRC_LABEL[d.predictive] != null)
    .map(d => ({ ...d, src: SRC_LABEL[d.predictive] }))
    .sort((a, b) => d3.ascending(a.src, b.src) || d3.ascending(a.pred_mid, b.pred_mid));
  if (!rows.length) return html`<em>No calibration bins for this slot and threshold.</em>`;
  const N_MIN = 5;
  const denseRows = rows.filter(d => d.n >= N_MIN);
  const sparseRows = rows.filter(d => d.n < N_MIN);
  const plot = Plot.plot({
    width: 540, height: 520, aspectRatio: 1, marginLeft: 54, marginBottom: 46,
    x: { label: "Predicted P(clear) →", domain: [0, 1], tickFormat: "%", grid: true },
    y: { label: "↑ Observed frequency", domain: [0, 1], tickFormat: "%", grid: true },
    color: { legend: true, domain: SRC_DOMAIN, range: SRC_RANGE },
    marks: [
      Plot.line([{ x: 0, y: 0 }, { x: 1, y: 1 }], { x: "x", y: "y", stroke: "#bbb", strokeDasharray: "4,4" }),
      // bootstrap bands and connected line only for bins with n ≥ 5
      Plot.areaY(denseRows, { x: "pred_mid", y1: "boot_lo", y2: "boot_hi", fill: "src", z: "src", fillOpacity: 0.15, curve: "linear" }),
      Plot.line(denseRows, { x: "pred_mid", y: "obs_freq", stroke: "src", z: "src", strokeWidth: 1.5, curve: "linear" }),
      // dense dots (filled, full color)
      Plot.dot(denseRows, { x: "pred_mid", y: "obs_freq", fill: "src", r: 3.5, title: d => `${d.src}: predicted ${Math.round(100*d.pred_mid)}%, observed ${Math.round(100*d.obs_freq)}% (n=${d.n})` }),
      // sparse dots (hollow grey, labeled with n)
      Plot.dot(sparseRows, { x: "pred_mid", y: "obs_freq", stroke: "#aaa", fill: "white", r: 3.5, title: d => `${d.src}: predicted ${Math.round(100*d.pred_mid)}%, observed ${Math.round(100*d.obs_freq)}% (n=${d.n} — sparse bin)` }),
      Plot.text(sparseRows, { x: "pred_mid", y: "obs_freq", text: d => `n=${d.n}`, dy: -10, fontSize: 9, fill: "#999" })
    ]
  });
  // n-weighted mean |predicted − observed| per model for this slot/threshold
  const eceByModel = d3.rollup(rows,
    v => { const N = d3.sum(v, d => d.n); return N > 0 ? d3.sum(v, d => d.n * Math.abs(d.pred_mid - d.obs_freq)) / N : null; },
    d => d.src
  );
  const eceLine = SRC_DOMAIN.map(s => {
    const v = eceByModel.get(s);
    const col = SRC_RANGE[SRC_DOMAIN.indexOf(s)];
    return html`<span style="margin-right:1.1rem;white-space:nowrap;">
      <span style="display:inline-block;width:10px;height:10px;background:${col};border-radius:2px;margin-right:3px;vertical-align:middle;"></span>
      <strong>${s}</strong> ${v != null ? (v * 100).toFixed(1) + " pp" : "—"}
    </span>`;
  });
  return html`<div>
    <div style="font-size:0.85rem;color:var(--rc-muted);margin-bottom:0.4rem;">Clearing the <strong>${calSlot} ${calThr}</strong> line (${tv} fp). Points on the diagonal are perfectly calibrated. Filled dots and shaded bands: bins with n ≥ 5. Hollow grey dots with n labels: sparse bins (n &lt; 5) — lines and bands not drawn through these.</div>
    ${plot}
    <div style="font-size:0.82rem;margin-top:0.5rem;color:var(--rc-muted);">Average prediction error for this slot/threshold (n-weighted mean |predicted − observed|): ${eceLine}</div>
  </div>`;
}

A reliability curve asks whether the probabilities mean what they say. Player-weeks are grouped into ten bins by their predicted chance of clearing a threshold, and for each bin we plot how often the threshold was actually cleared against how often the model said it would be. A perfectly calibrated model falls on the diagonal; points above it mean the event happened more often than predicted (the model was too cautious), and points below mean it happened less often than predicted (the model was overconfident). The shaded band is a bootstrap interval around each point, drawn only for bins with n ≥ 5.

What do the curves actually show? Across all slots and thresholds the three models are well-calibrated: the average gap between predicted and observed frequency is only about two percentage points for each, smallest for the blend (1.9) and largest for the data model (2.5). Through the dense low-to-middle range, where most receivers sit in a given week, the points track the diagonal closely, so a stated 20% or 40% chance of clearing a line means roughly what it says. The high-probability end is sparser and noisier, since only a minority of receivers ever carry a large chance of clearing the tougher lines, and what tendency there is runs slightly toward under-confidence: when a model does call a high probability, the event tends to happen a touch more often than predicted. The blend is the best-calibrated of the three by a small margin, consistent with its edge on CRPS below.

2 · Can the model out-rank the experts?

This site’s deployed “Blend” model is not built to out-predict the experts. Rather, our Bayesian Blend model is designed to combine information from expert consensus rankings and from player usage/game environment data in a principled way, so that we can generate actionable probabilities, transparently assess accuracy and communicate the uncertainty around predictions, and surface potentially meaningful disagreements between expert consensus and usage/game data. Still, we know you will be curious about how the Blend model stacks up against using the expert consensus alone. Let’s start by focusing on how often each model gets the “top” WRs correct each week.

Code

{
  const m = k => Math.round(100 * d3.mean(weekly, d => d[k]));
  return html`<p>Across the 2025 season, our deployed blend was about as accurate as ranking receivers by raw expert consensus (ECR) alone. ECR hit <strong>${m("hit12_ecr")}%</strong> of the weekly top 12 and <strong>${m("hit24_ecr")}%</strong> of the top 24, against ${m("hit12_cross_blend")}% / ${m("hit24_cross_blend")}% for the blend and ${m("hit12_pure_data")}% / ${m("hit24_pure_data")}% for the data model.</p>`;
}

Code

{
  const mean = k => d3.mean(weekly, d => d[k]);
  const data = [
    { method: "ECR (experts)", k: "top-12 hit rate", v: mean("hit12_ecr") },
    { method: "Blend", k: "top-12 hit rate", v: mean("hit12_cross_blend") },
    { method: "Data", k: "top-12 hit rate", v: mean("hit12_pure_data") },
    { method: "ECR (experts)", k: "top-24 hit rate", v: mean("hit24_ecr") },
    { method: "Blend", k: "top-24 hit rate", v: mean("hit24_cross_blend") },
    { method: "Data", k: "top-24 hit rate", v: mean("hit24_pure_data") }
  ];
  return Plot.plot({
    width: 640, height: 320, marginLeft: 48, marginBottom: 40,
    fx: { label: null },
    x: { axis: null, domain: ["ECR (experts)", "Blend", "Data"] },
    y: { label: "weekly hit rate ↑", domain: [0, Math.max(0.55, d3.max(data, d => d.v) * 1.15)], tickFormat: "%", grid: true },
    color: { legend: true, domain: ["ECR (experts)", "Blend", "Data"], range: [palette.expert, palette.mixture, palette.data] },
    marks: [
      Plot.barY(data, { fx: "k", x: "method", y: "v", fill: "method" }),
      Plot.text(data, { fx: "k", x: "method", y: "v", text: d => `${Math.round(100 * d.v)}%`, dy: -6, fontSize: 11 }),
      Plot.ruleY([0])
    ]
  });
}

Ranking here is by raw expert consensus rank (ECR), the published ordering of players before any modeling. That is deliberately different from the “Expert” view used elsewhere on this dashboard, which is a model anchored on that consensus and recalibrated against past outcomes. ECR is the right benchmark for ranking because it is what the experts actually publish, and it is what the model was tested against — and across 2025 it ranked receivers at least as well as, or slightly better than, the blend on both top-12 and top-24 weekly hit rate. That ECR ranks best is not surprising, since expert consensus projections compress a wealth of rich and varied information sources into a single usable rankings signal. Again, the model’s contribution was never intended to be a better ranking; rather, it is actionable probabilities, transparent uncertainty, and surfaced disagreement.

3 · Does the hedge lower error over the long run?

Code

{
  const a = accuracy;
  const f2 = x => (x == null || isNaN(x)) ? "—" : x.toFixed(2);
  const rows = [
    { label: "Expert", color: palette.expert, ...a.expert },
    { label: "Data", color: palette.data, ...a.data },
    { label: "Blend", color: palette.mixture, ...a.blend }
  ];
  const minMAE = Math.min(...rows.map(r => r.mae));
  const minRMSE = Math.min(...rows.map(r => r.rmse));
  const minCRPS = Math.min(...rows.map(r => r.crps));
  const eq = (x, y) => Math.abs(x - y) < 1e-9;
  const tint = on => on ? `background:${ACCENT_TINT};` : "";
  const cell = (v, on) => html`<td style="padding:3px 12px;text-align:right;${tint(on)}">${f2(v)}</td>`;
  const body = rows.map(r => html`<tr>
    <td style="padding:3px 12px;text-align:left;"><span style="display:inline-block;width:10px;height:10px;background:${r.color};border-radius:2px;margin-right:6px;vertical-align:middle;"></span>${r.label}</td>
    ${cell(r.mae, eq(r.mae, minMAE))}
    ${cell(r.rmse, eq(r.rmse, minRMSE))}
    ${cell(r.crps, eq(r.crps, minCRPS))}
  </tr>`);
  const table = html`<table style="border-collapse:collapse;font-size:0.88rem;">
    <thead><tr>
      <th style="padding:4px 12px;text-align:left;border-bottom:2px solid var(--rc-sand-panel);">source</th>
      <th style="padding:4px 12px;text-align:right;border-bottom:2px solid var(--rc-sand-panel);">MAE</th>
      <th style="padding:4px 12px;text-align:right;border-bottom:2px solid var(--rc-sand-panel);">RMSE</th>
      <th style="padding:4px 12px;text-align:right;border-bottom:2px solid var(--rc-sand-panel);">CRPS</th>
    </tr></thead>
    <tbody>${body}</tbody>
  </table>
  <div style="font-size:0.76rem;color:var(--rc-muted);margin-top:3px;">Lower is better in every column; the tinted cell leads. Over ${a.n_player_weeks.toLocaleString()} player-weeks with a realized score.</div>`;

  // two-way Expert-vs-Data times-closest split
  const tc = a.times_closest;
  const eShare = 100 * tc.expert / tc.n_decided, dShare = 100 * tc.data / tc.n_decided;
  const splitBar = html`<div style="margin-top:0.9rem;max-width:520px;">
    <div style="font-size:0.8rem;color:var(--rc-muted);margin-bottom:2px;">Which mean landed closer, week by week (Expert vs Data, ${tc.n_decided.toLocaleString()} weeks)</div>
    <div style="display:flex;height:20px;border-radius:3px;overflow:hidden;border:1px solid var(--rc-sand-panel);">
      <div style="width:${eShare.toFixed(1)}%;background:${palette.expert};color:#fff;font-size:0.72rem;display:flex;align-items:center;justify-content:center;">Expert ${Math.round(eShare)}%</div>
      <div style="width:${dShare.toFixed(1)}%;background:${palette.data};color:#fff;font-size:0.72rem;display:flex;align-items:center;justify-content:center;">Data ${Math.round(dShare)}%</div>
    </div>
  </div>`;

  const takeaway = html`<p style="margin-top:0.9rem;">As with hit rates for the "top" players, the Expert and Blend models show highly similar accuracy across all player-week data for the full 2025 season (mean absolute error, or MAE, of ${f2(a.expert.mae)} for Expert vs. ${f2(a.blend.mae)} for Blend), with the Data model falling slightly behind. However, the blend earns the lowest CRPS (the <em>continuous ranked probability score</em>, a proper scoring rule used to evaluate probabilistic predictions) of ${f2(a.blend.crps)} across the season, compared to ${f2(a.expert.crps)} for Expert and ${f2(a.data.crps)} for Data. Its win here reflects the payoff of not having to guess which source — Expert or Data — will land closer to the realized fantasy score each week. Since the Expert model is closest on some weeks and the Data model wins other weeks, neither is reliably the better bet in advance; combining them, with somewhat more weight on the usage and game-environment data, lowers the Blend model's error over the long run even though it rarely wins any single week outright.</p>`;

  return html`<div>${table}${splitBar}${takeaway}</div>`;
}

The reliability curves, ranking hit rates, and accuracy scores on this page are computed at the deployed (default) settings on the 2025 holdout; the interactive controls on the other pages are for sensitivity exploration, and calibration at non-default settings is not separately guaranteed here.

What is CRPS, and why include it?

CRPS, the continuous ranked probability score, measures how well an entire predicted distribution matches what actually happened, rewarding a forecast for placing probability mass near the realized score and penalizing both misplaced confidence and excessive vagueness. Lower is better. Unlike the average error on the mean, it credits a model for being honest about its uncertainty rather than only for landing close on average, which is why it is the score these models were selected on and the most complete single measure of forecast quality shown here.