{
  "version": "2.0",
  "updated": "2026-04-23",
  "note": "Every entry below corresponds to a real adapter in runner/adapters/. The registry is the catalogue of benchmarks we CAN run end-to-end — the verified-runs list (data/runs.json) starts empty until the first real on-chain attestation lands.",
  "benchmarks": [
    {
      "id": "humaneval",
      "name": "HumanEval",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/openai/openai_humaneval",
      "paperUrl": "https://arxiv.org/abs/2107.03374",
      "leaderboardUrl": "https://paperswithcode.com/sota/code-generation-on-humaneval",
      "upstreamRepo": "https://github.com/openai/human-eval",
      "license": "MIT",
      "problemCount": 164,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "164 Python programming problems, pass@1 via deterministic test execution.",
      "adapter": "humaneval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "humaneval-plus",
      "name": "HumanEval+",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/evalplus/humanevalplus",
      "paperUrl": "https://arxiv.org/abs/2305.01210",
      "leaderboardUrl": "https://evalplus.github.io/leaderboard.html",
      "upstreamRepo": "https://github.com/evalplus/evalplus",
      "license": "Apache-2.0",
      "problemCount": 164,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "EvalPlus hardening — ~80× more test cases than HumanEval.",
      "adapter": "humaneval_plus",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mbpp",
      "name": "MBPP",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/google-research-datasets/mbpp",
      "paperUrl": "https://arxiv.org/abs/2108.07732",
      "leaderboardUrl": "https://paperswithcode.com/sota/code-generation-on-mbpp",
      "upstreamRepo": "https://github.com/google-research/google-research/tree/master/mbpp",
      "license": "CC-BY-4.0",
      "problemCount": 974,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "974 entry-level Python problems with auto-graded tests.",
      "adapter": "mbpp",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mbpp-plus",
      "name": "MBPP+",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/evalplus/mbppplus",
      "paperUrl": "https://arxiv.org/abs/2305.01210",
      "leaderboardUrl": "https://evalplus.github.io/leaderboard.html",
      "license": "Apache-2.0",
      "problemCount": 378,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "EvalPlus hardening over the sanitized MBPP subset.",
      "adapter": "mbpp_plus",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "bigcodebench",
      "name": "BigCodeBench",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/bigcode/bigcodebench",
      "paperUrl": "https://arxiv.org/abs/2406.15877",
      "leaderboardUrl": "https://bigcode-bench.github.io/",
      "upstreamRepo": "https://github.com/bigcode-project/bigcodebench",
      "license": "Apache-2.0",
      "problemCount": 1140,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "1140 realistic Python tasks invoking complex libraries (pandas, matplotlib, sklearn).",
      "adapter": "bigcodebench",
      "adapterStatus": "live",
      "pricingTier": "agent"
    },
    {
      "id": "livecodebench",
      "name": "LiveCodeBench",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/livecodebench/code_generation_lite",
      "paperUrl": "https://arxiv.org/abs/2403.07974",
      "leaderboardUrl": "https://livecodebench.github.io/leaderboard.html",
      "upstreamRepo": "https://github.com/LiveCodeBench/LiveCodeBench",
      "license": "CC-BY-4.0",
      "problemCount": 500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Contest-style coding problems with monthly time windows — contamination-resistant.",
      "adapter": "livecodebench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "swe-bench-verified",
      "name": "SWE-bench Verified",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified",
      "paperUrl": "https://arxiv.org/abs/2310.06770",
      "leaderboardUrl": "https://www.swebench.com/",
      "upstreamRepo": "https://github.com/princeton-nlp/SWE-bench",
      "license": "CC0-1.0",
      "problemCount": 500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "medium",
      "oneliner": "500 real GitHub issues, human-vetted, Docker test harness.",
      "adapter": "swe_bench_verified",
      "adapterStatus": "docker",
      "pricingTier": "agent"
    },
    {
      "id": "swe-bench-lite",
      "name": "SWE-bench Lite",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite",
      "paperUrl": "https://arxiv.org/abs/2310.06770",
      "leaderboardUrl": "https://www.swebench.com/lite.html",
      "license": "CC0-1.0",
      "problemCount": 300,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "medium",
      "oneliner": "300-issue lightweight subset of SWE-bench.",
      "adapter": "swe_bench_lite",
      "adapterStatus": "docker",
      "pricingTier": "agent"
    },
    {
      "id": "multi-swe-bench",
      "name": "Multi-SWE-bench",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/multi-swe-bench/multi-swe-bench",
      "paperUrl": "https://arxiv.org/abs/2504.02605",
      "leaderboardUrl": "https://multi-swe-bench.github.io/",
      "license": "Apache-2.0",
      "problemCount": 1632,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "SWE-bench-style issues across Java, Go, Rust, TypeScript, C++.",
      "adapter": "multi_swe_bench",
      "adapterStatus": "docker",
      "pricingTier": "agent"
    },
    {
      "id": "swe-rebench",
      "name": "SWE-rebench",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/nebius/SWE-rebench",
      "paperUrl": "https://swerebench.ai/",
      "leaderboardUrl": "https://swerebench.ai/",
      "license": "MIT",
      "problemCount": 600,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "Monthly-refreshed SWE-bench variant, contamination-resistant.",
      "adapter": "swe_rebench",
      "adapterStatus": "docker",
      "pricingTier": "agent"
    },
    {
      "id": "tau-bench",
      "name": "τ-Bench",
      "category": "agent-framework",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/sierra-research/tau-bench",
      "paperUrl": "https://arxiv.org/abs/2406.12045",
      "leaderboardUrl": "https://github.com/sierra-research/tau-bench#results",
      "license": "MIT",
      "problemCount": 230,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "Multi-turn tool-use trajectories in retail and airline environments.",
      "adapter": "tau_bench",
      "adapterStatus": "external",
      "externalReason": "agent-harness",
      "pricingTier": "agent"
    },
    {
      "id": "gaia",
      "name": "GAIA",
      "category": "agent-framework",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/gaia-benchmark/GAIA",
      "paperUrl": "https://arxiv.org/abs/2311.12983",
      "leaderboardUrl": "https://huggingface.co/spaces/gaia-benchmark/leaderboard",
      "license": "Apache-2.0",
      "problemCount": 466,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "General AI assistants — 466 real-world tasks, exact-match scoring.",
      "adapter": "gaia",
      "adapterStatus": "live",
      "pricingTier": "agent"
    },
    {
      "id": "webarena",
      "name": "WebArena",
      "category": "agent-framework",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/web-arena-x/webarena",
      "paperUrl": "https://arxiv.org/abs/2307.13854",
      "leaderboardUrl": "https://webarena.dev/",
      "license": "Apache-2.0",
      "problemCount": 812,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "Realistic web navigation tasks in hosted shopping / CMS / GitLab environments.",
      "adapter": "webarena",
      "adapterStatus": "external",
      "externalReason": "agent-harness",
      "pricingTier": "agent"
    },
    {
      "id": "mmlu",
      "name": "MMLU",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/cais/mmlu",
      "paperUrl": "https://arxiv.org/abs/2009.03300",
      "leaderboardUrl": "https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu",
      "license": "MIT",
      "problemCount": 14042,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "57 subjects, 4-choice MCQ. The classic knowledge benchmark.",
      "adapter": "mmlu",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mmlu-pro",
      "name": "MMLU-Pro",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
      "paperUrl": "https://arxiv.org/abs/2406.01574",
      "leaderboardUrl": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
      "license": "MIT",
      "problemCount": 12032,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "MMLU's hardened successor — 10 options, harder reasoning.",
      "adapter": "mmlu_pro",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "gpqa",
      "name": "GPQA Diamond",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/Idavidrein/gpqa",
      "paperUrl": "https://arxiv.org/abs/2311.12022",
      "leaderboardUrl": "https://github.com/idavidrein/gpqa",
      "license": "CC-BY-4.0",
      "problemCount": 448,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "PhD-level physics, chemistry, biology. Google-proof.",
      "adapter": "gpqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "bbh",
      "name": "BIG-Bench Hard",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/lukaemon/bbh",
      "paperUrl": "https://arxiv.org/abs/2210.09261",
      "leaderboardUrl": "https://github.com/suzgunmirac/BIG-Bench-Hard",
      "license": "MIT",
      "problemCount": 6511,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "23 challenging reasoning tasks pulled from BIG-Bench.",
      "adapter": "bbh",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "arc-challenge",
      "name": "ARC-Challenge",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/allenai/ai2_arc",
      "paperUrl": "https://arxiv.org/abs/1803.05457",
      "leaderboardUrl": "https://leaderboard.allenai.org/arc/submissions/public",
      "license": "CC-BY-SA-4.0",
      "problemCount": 1172,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 128
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Grade-school science MCQ, harder subset.",
      "adapter": "arc_challenge",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "hellaswag",
      "name": "HellaSwag",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/Rowan/hellaswag",
      "paperUrl": "https://arxiv.org/abs/1905.07830",
      "leaderboardUrl": "https://rowanzellers.com/hellaswag/",
      "license": "MIT",
      "problemCount": 10042,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Commonsense sentence-completion MCQ.",
      "adapter": "hellaswag",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "winogrande",
      "name": "WinoGrande",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/allenai/winogrande",
      "paperUrl": "https://arxiv.org/abs/1907.10641",
      "license": "Apache-2.0",
      "problemCount": 1267,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 16
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Commonsense coreference, binary choice (Winograd-style).",
      "adapter": "winogrande",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "piqa",
      "name": "PIQA",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/ybisk/piqa",
      "paperUrl": "https://arxiv.org/abs/1911.11641",
      "license": "AFL-3.0",
      "problemCount": 1838,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 16
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Physical commonsense reasoning, binary choice.",
      "adapter": "piqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "commonsenseqa",
      "name": "CommonsenseQA",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/tau/commonsense_qa",
      "paperUrl": "https://arxiv.org/abs/1811.00937",
      "license": "MIT",
      "problemCount": 1221,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 16
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "5-way MCQ over ConceptNet-derived commonsense.",
      "adapter": "commonsenseqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "musr",
      "name": "MuSR",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/TAUR-Lab/MuSR",
      "paperUrl": "https://arxiv.org/abs/2310.16049",
      "license": "MIT",
      "problemCount": 756,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Multistep soft reasoning: murder mysteries, team allocation.",
      "adapter": "musr",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "agieval",
      "name": "AGIEval",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/baber/agieval",
      "paperUrl": "https://arxiv.org/abs/2304.06364",
      "license": "MIT",
      "problemCount": 8062,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Human-exam questions: SAT, GRE, Chinese civil service.",
      "adapter": "agieval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "livebench",
      "name": "LiveBench",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/livebench/live_bench",
      "paperUrl": "https://arxiv.org/abs/2406.19314",
      "leaderboardUrl": "https://livebench.ai/",
      "upstreamRepo": "https://github.com/LiveBench/LiveBench",
      "license": "Apache-2.0",
      "problemCount": 960,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Monthly-refreshed benchmark across math, coding, reasoning, data analysis, language.",
      "adapter": "livebench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "gsm8k",
      "name": "GSM8K",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/openai/gsm8k",
      "paperUrl": "https://arxiv.org/abs/2110.14168",
      "leaderboardUrl": "https://paperswithcode.com/sota/arithmetic-reasoning-on-gsm8k",
      "license": "MIT",
      "problemCount": 1319,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "8.5k grade-school arithmetic word problems.",
      "adapter": "gsm8k",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "math",
      "name": "MATH",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/hendrycks/competition_math",
      "paperUrl": "https://arxiv.org/abs/2103.03874",
      "leaderboardUrl": "https://paperswithcode.com/sota/math-word-problem-solving-on-math",
      "license": "MIT",
      "problemCount": 5000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Competition math problems, LaTeX boxed answers.",
      "adapter": "math",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "aime",
      "name": "AIME 2024",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024",
      "paperUrl": "https://artofproblemsolving.com/wiki/index.php/AIME_Problems_and_Solutions",
      "leaderboardUrl": "https://matharena.ai/",
      "license": "unknown",
      "problemCount": 30,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "American Invitational Math Exam. Integer answers 0-999.",
      "adapter": "aime",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mgsm",
      "name": "MGSM",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/juletxara/mgsm",
      "paperUrl": "https://arxiv.org/abs/2210.03057",
      "license": "MIT",
      "problemCount": 2750,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Multilingual GSM8K across 11 languages.",
      "adapter": "mgsm",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "theoremqa",
      "name": "TheoremQA",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/wenhu/TheoremQA",
      "paperUrl": "https://arxiv.org/abs/2305.12524",
      "license": "MIT",
      "problemCount": 800,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Theorem-grounded STEM problems requiring numeric/expression answers.",
      "adapter": "theoremqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "frontiermath",
      "name": "FrontierMath",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://epoch.ai/frontiermath",
      "paperUrl": "https://arxiv.org/abs/2411.04872",
      "leaderboardUrl": "https://epoch.ai/frontiermath",
      "license": "research-only",
      "problemCount": 300,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "Epoch AI's frontier math benchmark. Dataset gated.",
      "adapter": "frontiermath",
      "adapterStatus": "external",
      "externalReason": "gated",
      "pricingTier": "external"
    },
    {
      "id": "humanitys-last-exam",
      "name": "Humanity's Last Exam",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://agi.safe.ai/",
      "paperUrl": "https://arxiv.org/abs/2501.14249",
      "leaderboardUrl": "https://agi.safe.ai/",
      "license": "research-only",
      "problemCount": 3000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "Expert-crafted frontier benchmark. Dataset gated.",
      "adapter": "humanitys_last_exam",
      "adapterStatus": "external",
      "externalReason": "gated",
      "pricingTier": "external"
    },
    {
      "id": "longmemeval",
      "name": "LongMemEval",
      "category": "memory",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/xiaowu0162/longmemeval",
      "paperUrl": "https://arxiv.org/abs/2410.10813",
      "leaderboardUrl": "https://github.com/xiaowu0162/LongMemEval",
      "license": "Apache-2.0",
      "problemCount": 500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 10.0,
      "contaminationRisk": "low",
      "oneliner": "Long-conversation memory Q&A. GPT-4o canonical judge.",
      "adapter": "longmemeval",
      "adapterStatus": "live",
      "pricingTier": "long-context"
    },
    {
      "id": "niah",
      "name": "NIAH · Needle in a Haystack",
      "category": "memory",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/gkamradt/LLMTest_NeedleInAHaystack",
      "paperUrl": "https://github.com/gkamradt/LLMTest_NeedleInAHaystack#readme",
      "license": "MIT",
      "problemCount": 20,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 128
      },
      "costEstimateUsd": 10.0,
      "contaminationRisk": "low",
      "oneliner": "Retrieve a seeded fact from 4k → 128k token contexts. Deterministic generator.",
      "adapter": "niah",
      "adapterStatus": "live",
      "pricingTier": "long-context"
    },
    {
      "id": "ruler",
      "name": "RULER",
      "category": "memory",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/NVIDIA/RULER",
      "paperUrl": "https://arxiv.org/abs/2404.06654",
      "leaderboardUrl": "https://github.com/NVIDIA/RULER#results",
      "license": "Apache-2.0",
      "problemCount": 2600,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 10.0,
      "contaminationRisk": "low",
      "oneliner": "NVIDIA's 13-task long-context suite at configurable window sizes.",
      "adapter": "ruler",
      "adapterStatus": "live",
      "pricingTier": "long-context"
    },
    {
      "id": "frames",
      "name": "FRAMES",
      "category": "rag",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/google/frames-benchmark",
      "paperUrl": "https://arxiv.org/abs/2409.12941",
      "leaderboardUrl": "https://github.com/google/frames-benchmark",
      "license": "Apache-2.0",
      "problemCount": 824,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Multi-document factuality / retrieval benchmark.",
      "adapter": "frames",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "squad-v2",
      "name": "SQuAD 2.0",
      "category": "rag",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/rajpurkar/squad_v2",
      "paperUrl": "https://arxiv.org/abs/1806.03822",
      "license": "CC-BY-SA-4.0",
      "problemCount": 11873,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 128
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Reading comprehension with unanswerable questions.",
      "adapter": "squad",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "triviaqa",
      "name": "TriviaQA",
      "category": "rag",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/mandarjoshi/trivia_qa",
      "paperUrl": "https://arxiv.org/abs/1705.03551",
      "license": "Apache-2.0",
      "problemCount": 17944,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 64
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Trivia QA, no-context closed-book evaluation.",
      "adapter": "triviaqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "drop",
      "name": "DROP",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/ucinlp/drop",
      "paperUrl": "https://arxiv.org/abs/1903.00161",
      "license": "CC-BY-4.0",
      "problemCount": 9536,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 64
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "high",
      "oneliner": "Reading comprehension with discrete reasoning (numerical / multi-span).",
      "adapter": "drop",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "ifeval",
      "name": "IFEval",
      "category": "llm",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/google/IFEval",
      "paperUrl": "https://arxiv.org/abs/2311.07911",
      "leaderboardUrl": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
      "license": "Apache-2.0",
      "problemCount": 541,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Instruction-following with verifiable constraints (format, length, keywords).",
      "adapter": "ifeval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mt-bench",
      "name": "MT-Bench",
      "category": "llm",
      "metric": "score",
      "maxScore": 10.0,
      "datasetUrl": "https://huggingface.co/datasets/HuggingFaceH4/mt_bench_prompts",
      "paperUrl": "https://arxiv.org/abs/2306.05685",
      "leaderboardUrl": "https://lmsys.org/",
      "license": "Apache-2.0",
      "problemCount": 80,
      "decoding": {
        "temperature": 0.7,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "80 multi-turn open-ended prompts, GPT-4 judged on 10-point Likert.",
      "adapter": "mt_bench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "arena-hard",
      "name": "Arena-Hard-Auto",
      "category": "llm",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/lmarena-ai/arena-hard-auto-v0.1",
      "paperUrl": "https://lmsys.org/blog/2024-04-19-arena-hard/",
      "leaderboardUrl": "https://huggingface.co/spaces/lmarena-ai/arena-hard-browser",
      "license": "Apache-2.0",
      "problemCount": 500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "500 tough user prompts, pairwise judged vs. GPT-4-Turbo baseline.",
      "adapter": "arena_hard",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "alpaca-eval",
      "name": "AlpacaEval 2",
      "category": "llm",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/tatsu-lab/alpaca_eval",
      "paperUrl": "https://arxiv.org/abs/2305.14387",
      "leaderboardUrl": "https://tatsu-lab.github.io/alpaca_eval/",
      "license": "CC-BY-NC-4.0",
      "problemCount": 805,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "805 prompts, pairwise length-controlled win-rate.",
      "adapter": "alpacaeval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "wildbench",
      "name": "WildBench",
      "category": "llm",
      "metric": "score",
      "maxScore": 10.0,
      "datasetUrl": "https://huggingface.co/datasets/allenai/WildBench",
      "paperUrl": "https://arxiv.org/abs/2406.04770",
      "leaderboardUrl": "https://huggingface.co/spaces/allenai/WildBench",
      "license": "Apache-2.0",
      "problemCount": 1024,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "In-the-wild user requests, rubric-scored.",
      "adapter": "wildbench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "truthfulqa",
      "name": "TruthfulQA MC1",
      "category": "safety",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/truthfulqa/truthful_qa",
      "paperUrl": "https://arxiv.org/abs/2109.07958",
      "license": "Apache-2.0",
      "problemCount": 817,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "817 tricky questions, single-correct multiple choice.",
      "adapter": "truthfulqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "harmbench",
      "name": "HarmBench",
      "category": "safety",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/walledai/HarmBench",
      "paperUrl": "https://arxiv.org/abs/2402.04249",
      "leaderboardUrl": "https://www.harmbench.org/",
      "license": "MIT",
      "problemCount": 510,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Refusal rate on harmful prompts. Higher = safer.",
      "adapter": "harmbench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mmmu",
      "name": "MMMU",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/MMMU/MMMU",
      "paperUrl": "https://arxiv.org/abs/2311.16502",
      "leaderboardUrl": "https://mmmu-benchmark.github.io/",
      "license": "Apache-2.0",
      "problemCount": 11500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "11.5k multimodal questions across 30 disciplines.",
      "adapter": "mmmu",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mmbench",
      "name": "MMBench",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/lmms-lab/MMBench",
      "paperUrl": "https://arxiv.org/abs/2307.06281",
      "leaderboardUrl": "https://mmbench.opencompass.org.cn/",
      "license": "Apache-2.0",
      "problemCount": 3217,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 16
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "3.2k multimodal MCQ covering fine-grained visual skills.",
      "adapter": "mmbench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mathvista",
      "name": "MathVista",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/AI4Math/MathVista",
      "paperUrl": "https://arxiv.org/abs/2310.02255",
      "leaderboardUrl": "https://mathvista.github.io/",
      "license": "CC-BY-SA-4.0",
      "problemCount": 1000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Math reasoning with visual context (1k testmini).",
      "adapter": "mathvista",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "chartqa",
      "name": "ChartQA",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/HuggingFaceM4/ChartQA",
      "paperUrl": "https://arxiv.org/abs/2203.10244",
      "license": "CC-BY-4.0",
      "problemCount": 2500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Question-answering over charts with 5% numeric tolerance.",
      "adapter": "chartqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mteb",
      "name": "MTEB",
      "category": "embedding",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/spaces/mteb/leaderboard",
      "paperUrl": "https://arxiv.org/abs/2210.07316",
      "leaderboardUrl": "https://huggingface.co/spaces/mteb/leaderboard",
      "upstreamRepo": "https://github.com/embeddings-benchmark/mteb",
      "license": "Apache-2.0",
      "problemCount": 58,
      "decoding": {},
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "58 embedding tasks across retrieval, clustering, classification, STS.",
      "adapter": "mteb",
      "adapterStatus": "external",
      "externalReason": "embedding-harness",
      "pricingTier": "external"
    },
    {
      "id": "mbxp",
      "name": "MBXP",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/mbxp/mbxp",
      "paperUrl": "https://arxiv.org/abs/2210.14868",
      "license": "Apache-2.0",
      "problemCount": 974,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Multilingual MBPP (Python, JS, Go, Rust, etc.).",
      "adapter": "mbxp",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "simple-evals",
      "name": "OpenAI Simple-Evals",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/openai/simple-evals",
      "paperUrl": "https://github.com/openai/simple-evals#readme",
      "license": "MIT",
      "problemCount": 0,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 0.0,
      "contaminationRisk": "medium",
      "oneliner": "OpenAI's canonical re-implementations. Meta-adapter pointing at subtasks.",
      "adapter": "openai_simple_evals",
      "adapterStatus": "external",
      "externalReason": "meta",
      "pricingTier": "external"
    },
    {
      "id": "math-500",
      "name": "MATH-500",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500",
      "paperUrl": "https://arxiv.org/abs/2305.20050",
      "leaderboardUrl": "https://github.com/openai/simple-evals",
      "license": "MIT",
      "problemCount": 500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "OpenAI's 500-problem subset of Hendrycks MATH, canonical for frontier evals.",
      "adapter": "math_500",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "simpleqa",
      "name": "SimpleQA",
      "category": "safety",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/basicv8vc/SimpleQA",
      "paperUrl": "https://arxiv.org/abs/2411.04368",
      "leaderboardUrl": "https://github.com/openai/simple-evals",
      "license": "MIT",
      "problemCount": 4326,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "OpenAI's factuality benchmark. 4,326 short-answer questions, GPT-4o judged.",
      "adapter": "simpleqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "mmlu-redux",
      "name": "MMLU-Redux",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux-2.0",
      "paperUrl": "https://arxiv.org/abs/2406.04127",
      "license": "CC-BY-4.0",
      "problemCount": 5700,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Error-corrected MMLU. Broken items removed, filtered for clean evaluation.",
      "adapter": "mmlu_redux",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "humaneval-pack",
      "name": "HumanEvalPack",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/bigcode/humanevalpack",
      "paperUrl": "https://arxiv.org/abs/2308.07124",
      "license": "Apache-2.0",
      "problemCount": 984,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "HumanEval in 6 languages (Python, JS, Java, Go, C++, Rust).",
      "adapter": "humaneval_pack",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "arc-agi",
      "name": "ARC-AGI",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/fchollet/ARC-AGI",
      "paperUrl": "https://arxiv.org/abs/1911.01547",
      "leaderboardUrl": "https://arcprize.org/",
      "license": "MIT",
      "problemCount": 800,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Chollet's abstraction and reasoning corpus. Grid-puzzle transformations.",
      "adapter": "arc_agi",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "zebralogic",
      "name": "ZebraLogic",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/allenai/ZebraLogic",
      "paperUrl": "https://arxiv.org/abs/2502.01100",
      "leaderboardUrl": "https://huggingface.co/spaces/allenai/ZebraLogic",
      "license": "Apache-2.0",
      "problemCount": 1000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Einstein-style logic-grid puzzles with constraint-satisfaction scoring.",
      "adapter": "zebralogic",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "longbench",
      "name": "LongBench",
      "category": "memory",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/THUDM/LongBench",
      "paperUrl": "https://arxiv.org/abs/2308.14508",
      "leaderboardUrl": "https://github.com/THUDM/LongBench",
      "license": "MIT",
      "problemCount": 4500,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 10.0,
      "contaminationRisk": "low",
      "oneliner": "21-task long-context evaluation averaging 18k tokens per example.",
      "adapter": "longbench",
      "adapterStatus": "live",
      "pricingTier": "long-context"
    },
    {
      "id": "infinitebench",
      "name": "InfiniteBench",
      "category": "memory",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/xinrongzhang2022/InfiniteBench",
      "paperUrl": "https://arxiv.org/abs/2402.13718",
      "leaderboardUrl": "https://github.com/OpenBMB/InfiniteBench",
      "license": "Apache-2.0",
      "problemCount": 3946,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 10.0,
      "contaminationRisk": "low",
      "oneliner": "100k+ token benchmark spanning retrieval, QA, math, code, summarization.",
      "adapter": "infinitebench",
      "adapterStatus": "live",
      "pricingTier": "long-context"
    },
    {
      "id": "beir",
      "name": "BEIR",
      "category": "embedding",
      "metric": "score",
      "maxScore": 1.0,
      "datasetUrl": "https://huggingface.co/BeIR",
      "paperUrl": "https://arxiv.org/abs/2104.08663",
      "leaderboardUrl": "https://github.com/beir-cellar/beir",
      "license": "Apache-2.0",
      "problemCount": 18,
      "decoding": {},
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "18-task heterogeneous IR benchmark. nDCG@10 primary metric.",
      "adapter": "beir",
      "adapterStatus": "external",
      "externalReason": "embedding-harness",
      "pricingTier": "external"
    },
    {
      "id": "cruxeval",
      "name": "CRUXEval",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/cruxeval-org/cruxeval",
      "paperUrl": "https://arxiv.org/abs/2401.03065",
      "leaderboardUrl": "https://crux-eval.github.io/",
      "license": "MIT",
      "problemCount": 800,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Code reasoning. Predict input or output of Python functions.",
      "adapter": "cruxeval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "repobench",
      "name": "RepoBench",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/tianyang/repobench",
      "paperUrl": "https://arxiv.org/abs/2306.03091",
      "leaderboardUrl": "https://github.com/Leolty/repobench",
      "license": "Apache-2.0",
      "problemCount": 5000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Repository-level code auto-completion. Retrieval plus next-line generation.",
      "adapter": "repobench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "bird",
      "name": "BIRD-SQL",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://bird-bench.github.io/",
      "paperUrl": "https://arxiv.org/abs/2305.03111",
      "leaderboardUrl": "https://bird-bench.github.io/",
      "license": "CC-BY-SA-4.0",
      "problemCount": 12751,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "Realistic text-to-SQL over large, messy databases. Execution-matched.",
      "adapter": "bird",
      "adapterStatus": "external",
      "externalReason": "embedding-harness",
      "pricingTier": "external"
    },
    {
      "id": "spider",
      "name": "Spider",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/xlangai/spider",
      "paperUrl": "https://arxiv.org/abs/1809.08887",
      "leaderboardUrl": "https://yale-lily.github.io/spider",
      "license": "CC-BY-SA-4.0",
      "problemCount": 1034,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "10181 question and SQL pairs across 200 databases, 138 domains.",
      "adapter": "spider",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "medqa",
      "name": "MedQA",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/bigbio/med_qa",
      "paperUrl": "https://arxiv.org/abs/2009.13081",
      "license": "MIT",
      "problemCount": 1273,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 64
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "USMLE-style medical licensing exam questions.",
      "adapter": "medqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "legalbench",
      "name": "LegalBench",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/nguha/legalbench",
      "paperUrl": "https://arxiv.org/abs/2308.11462",
      "leaderboardUrl": "https://hazyresearch.stanford.edu/legalbench/",
      "license": "Apache-2.0",
      "problemCount": 90000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 128
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "162 legal reasoning tasks covering contracts, rules, issues, conclusions.",
      "adapter": "legalbench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "olympiadbench",
      "name": "OlympiadBench",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/Hothan/OlympiadBench",
      "paperUrl": "https://arxiv.org/abs/2402.14008",
      "leaderboardUrl": "https://github.com/OpenBMB/OlympiadBench",
      "license": "Apache-2.0",
      "problemCount": 8476,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "8476 bilingual olympiad-level math and physics problems.",
      "adapter": "olympiadbench",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "osworld",
      "name": "OSWorld",
      "category": "agent-framework",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/xlang-ai/OSWorld",
      "paperUrl": "https://arxiv.org/abs/2404.07972",
      "leaderboardUrl": "https://os-world.github.io/",
      "license": "Apache-2.0",
      "problemCount": 369,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 4096
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "369 real computer-use tasks across Linux and Windows environments.",
      "adapter": "osworld",
      "adapterStatus": "external",
      "externalReason": "agent-harness",
      "pricingTier": "agent"
    },
    {
      "id": "visualwebarena",
      "name": "VisualWebArena",
      "category": "agent-framework",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://github.com/web-arena-x/visualwebarena",
      "paperUrl": "https://arxiv.org/abs/2401.13649",
      "leaderboardUrl": "https://jykoh.com/vwa",
      "license": "Apache-2.0",
      "problemCount": 910,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 2048
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "Web navigation with vision across 3 hosted sites.",
      "adapter": "visualwebarena",
      "adapterStatus": "external",
      "externalReason": "agent-harness",
      "pricingTier": "agent"
    },
    {
      "id": "mixeval",
      "name": "MixEval",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/MixEval/MixEval",
      "paperUrl": "https://arxiv.org/abs/2406.06565",
      "leaderboardUrl": "https://mixeval.github.io/",
      "license": "Apache-2.0",
      "problemCount": 2000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Monthly-refreshed curated mix-bench. LMArena-correlated evaluation.",
      "adapter": "mixeval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "halueval",
      "name": "HaluEval",
      "category": "safety",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/pminervini/HaluEval",
      "paperUrl": "https://arxiv.org/abs/2305.11747",
      "license": "MIT",
      "problemCount": 35000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 16
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Hallucination detection. Binary classification on generated answers.",
      "adapter": "halueval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "multif",
      "name": "MultiIF",
      "category": "llm",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/facebook/Multi-IF",
      "paperUrl": "https://arxiv.org/abs/2410.15553",
      "license": "Apache-2.0",
      "problemCount": 4501,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 1024
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Multilingual IFEval across 8 languages.",
      "adapter": "multif",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "flores-200",
      "name": "FLORES-200",
      "category": "llm",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/facebook/flores",
      "paperUrl": "https://arxiv.org/abs/2207.04672",
      "leaderboardUrl": "https://github.com/facebookresearch/flores",
      "license": "CC-BY-SA-4.0",
      "problemCount": 2009,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 512
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Machine translation across 200 languages. chrF++ primary metric.",
      "adapter": "flores",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "c-eval",
      "name": "C-Eval",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/ceval/ceval-exam",
      "paperUrl": "https://arxiv.org/abs/2305.08322",
      "leaderboardUrl": "https://cevalbenchmark.com/",
      "license": "Apache-2.0",
      "problemCount": 13948,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "Chinese evaluation benchmark, 52 disciplines.",
      "adapter": "ceval",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "asdiv",
      "name": "ASDiv",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/allenai/asdiv",
      "paperUrl": "https://aclanthology.org/2020.acl-main.92/",
      "license": "CC-BY-NC-4.0",
      "problemCount": 2305,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "2305 elementary math word problems with answer types.",
      "adapter": "asdiv",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "svamp",
      "name": "SVAMP",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/ChilleD/SVAMP",
      "paperUrl": "https://arxiv.org/abs/2103.07191",
      "license": "MIT",
      "problemCount": 1000,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 256
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "medium",
      "oneliner": "1000 variation-aware math word problems.",
      "adapter": "svamp",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "swe-bench-multimodal",
      "name": "SWE-bench Multimodal",
      "category": "code-agent",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Multimodal",
      "paperUrl": "https://arxiv.org/abs/2310.06770",
      "leaderboardUrl": "https://www.swebench.com/",
      "license": "CC0-1.0",
      "problemCount": 619,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 8192
      },
      "costEstimateUsd": 25.0,
      "contaminationRisk": "low",
      "oneliner": "JavaScript issues with visual context. Docker harness required.",
      "adapter": "swe_bench_multimodal",
      "adapterStatus": "docker",
      "pricingTier": "agent"
    },
    {
      "id": "video-mme",
      "name": "Video-MME",
      "category": "reasoning",
      "metric": "accuracy",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/lmms-lab/Video-MME",
      "paperUrl": "https://arxiv.org/abs/2405.21075",
      "leaderboardUrl": "https://video-mme.github.io/",
      "license": "CC-BY-4.0",
      "problemCount": 2700,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 32
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "900 videos, 2700 questions. Multimodal video understanding.",
      "adapter": "videomme",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "docvqa",
      "name": "DocVQA",
      "category": "reasoning",
      "metric": "score",
      "maxScore": 100.0,
      "datasetUrl": "https://huggingface.co/datasets/lmms-lab/DocVQA",
      "paperUrl": "https://arxiv.org/abs/2007.00398",
      "leaderboardUrl": "https://rrc.cvc.uab.es/?ch=17",
      "license": "CC-BY-4.0",
      "problemCount": 5349,
      "decoding": {
        "temperature": 0.0,
        "max_tokens": 128
      },
      "costEstimateUsd": 5.0,
      "contaminationRisk": "low",
      "oneliner": "Document visual question answering. ANLS metric.",
      "adapter": "docvqa",
      "adapterStatus": "live",
      "pricingTier": "standard"
    },
    {
      "id": "lmarena-elo",
      "name": "LMArena Elo",
      "category": "llm",
      "metric": "score",
      "maxScore": 1500.0,
      "datasetUrl": "https://lmarena.ai/",
      "paperUrl": "https://arxiv.org/abs/2403.04132",
      "leaderboardUrl": "https://lmarena.ai/",
      "license": "LMArena-ToS",
      "problemCount": 0,
      "decoding": {},
      "costEstimateUsd": 0.0,
      "contaminationRisk": "low",
      "oneliner": "Chatbot Arena human-preference Elo. Catalog-only (crowdsourced).",
      "adapter": "lmarena",
      "adapterStatus": "external",
      "externalReason": "crowdsourced",
      "pricingTier": "external"
    }
  ]
}
