{
  "version": "1",
  "name": "validation-set-v1",
  "description": "Hand-curated fixture set for measuring /api/verify precision/recall. Five categories: lancet-fabricated (three illustrative cases from Topaz et al. 2026 Suppl Appx 2, verbatim — real DOI/PMID + invented title), known-good (real verified DOI + canonical title), wrong-doi (real paper's title paired with somebody else's real DOI — Topaz 'citation error' subtype), paraphrase (real DOI + paraphrased title, scoped to land in mismatch/low so the LLM screen has work to do), and invented (no real paper at all). Real-paper metadata sourced via scholar-sidekick MCP resolveIdentifier on 2026-05-12 — see per-entry `source` fields for provenance. IMMUTABILITY: published numbers cite a specific fixture version. Once this file is referenced from /citation-integrity, its entries MUST NOT change. Add or revise entries by creating validation-set-v2.json and re-measuring; do not edit in place.",
  "created_at": "2026-05-12",
  "immutable": true,
  "entries": [
    {
      "id": "lancet-example-a",
      "category": "lancet-fabricated",
      "description": "Lancet Suppl Appx 2 Example A. The fabricated citation's DOI and PMID each resolve to DIFFERENT real papers (both in J Occup Environ Med). Claim's DOI 10.1097/JOM.0000000000002567 points to Ramos et al. 'Occupational Balance and Depressive Symptoms During the COVID-19 Pandemic' (2022). The fabricated title 'Impact of enhanced safety protocols on ICU admissions in the construction industry' does not exist anywhere.",
      "source": "Topaz et al. Lancet 2026 Suppl Appx 2 p. 5 (verbatim — docs/references/mmc1.pdf)",
      "claimed": {
        "title": "Impact of enhanced safety protocols on ICU admissions in the construction industry: A longitudinal analysis",
        "authors": [
          { "family": "Doe", "given": "J" },
          { "family": "Smith", "given": "R" }
        ],
        "year": 2023,
        "container": "Journal of Occupational and Environmental Medicine",
        "doi": "10.1097/JOM.0000000000002567",
        "pmid": "36730737"
      },
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "lancet-example-b",
      "category": "lancet-fabricated",
      "description": "Lancet Suppl Appx 2 Example B. PMID and DOI are consistent — both point to Guisnet et al. 'Three-Dimensional Fruit Tissue Habitats for Culturing Caenorhabditis elegans' (Current Protocols 2021). The fabricated title combines two real methodologies (DMM, PTX) into a protocol that does not exist.",
      "source": "Topaz et al. Lancet 2026 Suppl Appx 2 p. 5–6 (verbatim — docs/references/mmc1.pdf)",
      "claimed": {
        "title": "A Protocol for the Use of DMM/PTX-Induced Mouse Models of Osteoarthritis and Rheumatoid Arthritis",
        "authors": [
          { "family": "Krustev", "given": "E" },
          { "family": "Rioux", "given": "D" },
          { "family": "McDougall", "given": "J J" }
        ],
        "year": 2021,
        "container": "Current Protocols",
        "doi": "10.1002/cpz1.288",
        "pmid": "34767311"
      },
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "lancet-example-c",
      "category": "lancet-fabricated",
      "description": "Lancet Suppl Appx 2 Example C. PMID and DOI are consistent — both point to Graf & Bernardi 'ChatGPT in Research: Balancing Ethics, Transparency and Advancement' (Neuroscience 2023, vol 515). The fabricated title combines three real neuroscience concepts (microglial modulation, CB2, fibromyalgia) into a plausible-sounding study that does not exist.",
      "source": "Topaz et al. Lancet 2026 Suppl Appx 2 p. 6 (verbatim — docs/references/mmc1.pdf)",
      "claimed": {
        "title": "Microglial Modulation via Cannabinoid Receptor 2 Alleviates Fibromyalgia-Related Central Sensitization and Pain Hypersensitivity",
        "authors": [
          { "family": "Chen", "given": "F" },
          { "family": "Liu", "given": "Y" },
          { "family": "Wang", "given": "H" },
          { "family": "Zhang", "given": "X" },
          { "family": "Li", "given": "J" },
          { "family": "Yang", "given": "K" }
        ],
        "year": 2023,
        "container": "Neuroscience",
        "doi": "10.1016/j.neuroscience.2023.02.008",
        "pmid": "36813155"
      },
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "good-topaz-lancet-2026",
      "category": "known-good",
      "description": "Topaz et al. 2026 — the source paper for Phase 12i. Real DOI, real title verbatim from Crossref.",
      "source": "scholar-sidekick MCP resolveIdentifier (Crossref) 2026-05-12",
      "claimed": {
        "title": "Fabricated citations: an audit across 2·5 million biomedical papers",
        "authors": [
          { "family": "Topaz", "given": "Maxim" },
          { "family": "Roguin", "given": "Nir" }
        ],
        "year": 2026,
        "container": "The Lancet",
        "doi": "10.1016/S0140-6736(26)00603-3"
      },
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "good-bnt162b2-polack",
      "category": "known-good",
      "description": "Polack et al. BNT162b2 phase 3 vaccine trial (NEJM 2020). Real DOI, canonical title.",
      "source": "scholar-sidekick MCP resolveIdentifier (Crossref) 2026-05-12",
      "claimed": {
        "title": "Safety and Efficacy of the BNT162b2 mRNA Covid-19 Vaccine",
        "authors": [{ "family": "Polack", "given": "Fernando P." }],
        "year": 2020,
        "container": "New England Journal of Medicine",
        "doi": "10.1056/NEJMoa2034577"
      },
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "good-attention-vaswani",
      "category": "known-good",
      "description": "Vaswani et al. Transformer paper. arXiv resolver path; canonical title.",
      "source": "scholar-sidekick MCP resolveIdentifier (arXiv) 2026-05-12",
      "claimed": {
        "title": "Attention Is All You Need",
        "authors": [{ "family": "Vaswani", "given": "Ashish" }],
        "year": 2017,
        "arxiv": "1706.03762"
      },
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "good-bert-devlin",
      "category": "known-good",
      "description": "Devlin et al. BERT paper. arXiv resolver path; canonical title.",
      "source": "scholar-sidekick MCP resolveIdentifier (arXiv) 2026-05-12",
      "claimed": {
        "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "authors": [{ "family": "Devlin", "given": "Jacob" }],
        "year": 2018,
        "arxiv": "1810.04805"
      },
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "good-chatgpt-neuroscience",
      "category": "known-good",
      "description": "Graf & Bernardi 2023 editorial — the real paper that Lancet Example C's DOI/PMID resolve to. Real DOI + correct canonical title; verifier should return matched.",
      "source": "scholar-sidekick MCP resolveIdentifier (Crossref) 2026-05-12",
      "claimed": {
        "title": "ChatGPT in Research: Balancing Ethics, Transparency and Advancement",
        "authors": [{ "family": "Graf", "given": "Akseli" }],
        "year": 2023,
        "container": "Neuroscience",
        "doi": "10.1016/j.neuroscience.2023.02.008"
      },
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "wrong-doi-topaz-on-bnt-doi",
      "category": "wrong-doi",
      "description": "Topaz title (real paper) paired with Polack BNT162b2 DOI (different real paper). Tests CITADEL 'citation error' subtype: verifier should detect title mismatch on the resolved record, then title-search the claim and surface the actual paper as a candidate → verdict=ambiguous.",
      "source": "Synthetic — both DOIs verified via resolveIdentifier",
      "claimed": {
        "title": "Fabricated citations: an audit across 2·5 million biomedical papers",
        "authors": [{ "family": "Topaz", "given": "Maxim" }],
        "year": 2026,
        "doi": "10.1056/NEJMoa2034577"
      },
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "wrong-doi-attention-on-topaz-doi",
      "category": "wrong-doi",
      "description": "Vaswani Attention title paired with Topaz Lancet DOI. Tests wrong-identifier path with cross-discipline confusion (CS vs biomedicine).",
      "source": "Synthetic — both DOIs verified",
      "claimed": {
        "title": "Attention Is All You Need",
        "authors": [{ "family": "Vaswani", "given": "Ashish" }],
        "year": 2017,
        "doi": "10.1016/S0140-6736(26)00603-3"
      },
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "wrong-doi-bert-on-attention-arxiv",
      "category": "wrong-doi",
      "description": "BERT title paired with Vaswani Attention arXiv id. Same-discipline (both ML/NLP) wrong-identifier confusion — harder for similarity to disambiguate but distinct enough to mismatch on title.",
      "source": "Synthetic — both arXiv IDs verified",
      "claimed": {
        "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "authors": [{ "family": "Devlin", "given": "Jacob" }],
        "year": 2018,
        "arxiv": "1706.03762"
      },
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "wrong-doi-bnt-on-topaz-doi",
      "category": "wrong-doi",
      "description": "BNT162b2 trial title paired with Topaz Lancet DOI. Same broad domain (biomedicine, both in elite general-medical journals) — verifier must distinguish on title content alone.",
      "source": "Synthetic — both DOIs verified",
      "claimed": {
        "title": "Safety and Efficacy of the BNT162b2 mRNA Covid-19 Vaccine",
        "authors": [{ "family": "Polack", "given": "Fernando P." }],
        "year": 2020,
        "doi": "10.1016/S0140-6736(26)00603-3"
      },
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      }
    },
    {
      "id": "paraphrase-bert-no-acronym",
      "category": "paraphrase",
      "description": "Devlin BERT paper with the 'BERT:' acronym stripped. Confirmed via live probe 2026-05-12 to land in mismatch/low — the bucket the LLM screen is designed to rescue.",
      "source": "Synthetic — paraphrase confirmed via live verifier probe",
      "claimed": {
        "title": "Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "authors": [{ "family": "Devlin" }],
        "year": 2018,
        "arxiv": "1810.04805"
      },
      "expected_pre_llm": {
        "verdict": "mismatch",
        "confidence_in": ["low"]
      },
      "expected_with_llm": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      }
    },
    {
      "id": "paraphrase-chatgpt-abbreviated",
      "category": "paraphrase",
      "description": "Graf & Bernardi ChatGPT-in-Research editorial — claim drops 'in Research:' and 'Balancing', leaves only the topic triad. Confirmed via live probe to land in mismatch/low.",
      "source": "Synthetic — paraphrase confirmed via live verifier probe",
      "claimed": {
        "title": "ChatGPT: Ethics, Transparency, Advancement",
        "authors": [{ "family": "Graf" }],
        "year": 2023,
        "doi": "10.1016/j.neuroscience.2023.02.008"
      },
      "expected_pre_llm": {
        "verdict": "mismatch",
        "confidence_in": ["low"]
      },
      "expected_with_llm": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      }
    },
    {
      "id": "paraphrase-bnt-phase3",
      "category": "paraphrase",
      "description": "Polack BNT162b2 trial described as 'Phase 3 trial of …'. Drops 'Safety and Efficacy of the', reorders. Confirmed via live probe to land in mismatch/low.",
      "source": "Synthetic — paraphrase confirmed via live verifier probe",
      "claimed": {
        "title": "Phase 3 trial of BNT162b2 mRNA Covid-19 vaccine",
        "authors": [{ "family": "Polack" }],
        "year": 2020,
        "doi": "10.1056/NEJMoa2034577"
      },
      "expected_pre_llm": {
        "verdict": "mismatch",
        "confidence_in": ["low"]
      },
      "expected_with_llm": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      }
    },
    {
      "id": "paraphrase-topaz-reordered",
      "category": "paraphrase",
      "description": "Topaz Lancet title reordered — 'Audit of fabricated citations across biomedical papers'. Drops the '2·5 million' quantifier. Confirmed via live probe to land in mismatch/low.",
      "source": "Synthetic — paraphrase confirmed via live verifier probe",
      "claimed": {
        "title": "Audit of fabricated citations across biomedical papers",
        "authors": [{ "family": "Topaz" }],
        "year": 2026,
        "doi": "10.1016/S0140-6736(26)00603-3"
      },
      "expected_pre_llm": {
        "verdict": "mismatch",
        "confidence_in": ["low"]
      },
      "expected_with_llm": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      }
    },
    {
      "id": "invented-fake-doi-plausible-title",
      "category": "invented",
      "description": "Plausible-sounding clinical title with a made-up DOI in the 10.1234/ range. Verifier resolver returns 'no_record_in_registries' → not_found. Search aggregator runs on title; expected to return zero hits.",
      "source": "Synthetic",
      "claimed": {
        "title": "Quantum trifold pancake synthesis in mice — a randomised crossover trial",
        "authors": [{ "family": "Fictional", "given": "A" }],
        "year": 2023,
        "doi": "10.1234/quantum-pancakes-99999"
      },
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "invented-title-only-no-identifier",
      "category": "invented",
      "description": "Title-only claim with no identifier. Verifier skips resolver and runs title-search across all three registries. Title is deliberately implausible; expected to return zero candidates → not_found.",
      "source": "Synthetic",
      "claimed": {
        "title": "Telomere shortening predicts onset of imaginary disorder XYZ",
        "authors": [{ "family": "Nonexistent" }],
        "year": 2024
      },
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "invented-fake-pmid",
      "category": "invented",
      "description": "Implausibly large PMID. PubMed adapter returns 404; verifier should map to not_found via the 12i.1b resolveEnvelope wrapper.",
      "source": "Synthetic",
      "claimed": {
        "title": "Effects of zeroth-order entropy on second-system architecture",
        "authors": [{ "family": "Imaginary" }],
        "year": 2020,
        "pmid": "999999999"
      },
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      }
    },
    {
      "id": "invented-fake-doi-resolves-to-nothing",
      "category": "invented",
      "description": "Made-up DOI in a vendor prefix range that does not exist. Verifier should return not_found.",
      "source": "Synthetic",
      "claimed": {
        "title": "Universal cure for procrastination via gut-brain axis modulation",
        "authors": [{ "family": "Madeup" }],
        "year": 2022,
        "doi": "10.9999/this-doi-does-not-exist-987654"
      },
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      }
    }
  ]
}
