{
  "fixture_version": "1",
  "fixture_name": "validation-set-v1",
  "measured_at": "2026-05-12T02:50:48.332Z",
  "base": "http://localhost:3000",
  "with_llm": true,
  "counts": {
    "total": 20,
    "passed": 20,
    "failed": 0,
    "skipped": 0,
    "true_positive": 11,
    "true_negative": 9,
    "false_positive": 0,
    "false_negative": 0
  },
  "metrics": {
    "precision": 1,
    "recall": 1,
    "f1": 1,
    "pass_rate": 1
  },
  "by_category": {
    "lancet-fabricated": {
      "total": 3,
      "passed": 3,
      "http_failed": 0,
      "pass_rate": 1
    },
    "known-good": {
      "total": 5,
      "passed": 5,
      "http_failed": 0,
      "pass_rate": 1
    },
    "wrong-doi": {
      "total": 4,
      "passed": 4,
      "http_failed": 0,
      "pass_rate": 1
    },
    "paraphrase": {
      "total": 4,
      "passed": 4,
      "http_failed": 0,
      "pass_rate": 1
    },
    "invented": {
      "total": 4,
      "passed": 4,
      "http_failed": 0,
      "pass_rate": 1
    }
  },
  "results": [
    {
      "id": "lancet-example-a",
      "category": "lancet-fabricated",
      "http_status": 200,
      "verdict": "mismatch",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 2248,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "lancet-example-b",
      "category": "lancet-fabricated",
      "http_status": 200,
      "verdict": "mismatch",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 4356,
      "attempts": 2,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "lancet-example-c",
      "category": "lancet-fabricated",
      "http_status": 200,
      "verdict": "mismatch",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "mismatch",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 58885,
      "attempts": 2,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 10
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "good-topaz-lancet-2026",
      "category": "known-good",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 741,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare"],
        "resolved_via": "crossref",
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "good-bnt162b2-polack",
      "category": "known-good",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 752,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare"],
        "resolved_via": "crossref",
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "good-attention-vaswani",
      "category": "known-good",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 342,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare"],
        "resolved_via": "arxiv",
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "good-bert-devlin",
      "category": "known-good",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 303,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare"],
        "resolved_via": "arxiv",
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "good-chatgpt-neuroscience",
      "category": "known-good",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 1182,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare"],
        "resolved_via": "crossref",
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "wrong-doi-topaz-on-bnt-doi",
      "category": "wrong-doi",
      "http_status": 200,
      "verdict": "ambiguous",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 1535,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 1
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "wrong-doi-attention-on-topaz-doi",
      "category": "wrong-doi",
      "http_status": 200,
      "verdict": "ambiguous",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 2327,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 10
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "wrong-doi-bert-on-attention-arxiv",
      "category": "wrong-doi",
      "http_status": 200,
      "verdict": "mismatch",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 2180,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "arxiv",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 7
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "wrong-doi-bnt-on-topaz-doi",
      "category": "wrong-doi",
      "http_status": 200,
      "verdict": "ambiguous",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict_in": ["ambiguous", "mismatch"],
        "confidence_in": ["high", "medium"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 3172,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 2
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 10
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "paraphrase-bert-no-acronym",
      "category": "paraphrase",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "low",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 35745,
      "attempts": 2,
      "provenance": {
        "stages_run": ["compare", "search", "llm_screen"],
        "resolved_via": "arxiv",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 7
          }
        ],
        "llm_screen": {
          "applied": true,
          "model": "anthropic/claude-haiku-4.5",
          "prompt_version": "2026-05-12",
          "verdict": "informal_abbreviation",
          "reasoning": "The claimed title is 'Pre-training of Deep Bidirectional Transformers for Language Understanding' by Devlin, while the resolved title is 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' by Devlin. The resolved title includes the acronym 'BERT:' as a prefix, but the substantive content of both titles is identical. The claimed title is simply the full title without the acronym prefix. This is a clear case of informal abbreviation where the citation omitted the model acronym (BERT) but preserved the complete descriptive title. The first authors match, and the core topic and specificity are identical. This is definitely referring to the same seminal paper on bidirectional transformer pre-training for language understanding.",
          "cost_usd": 0.0011296000000000001
        }
      }
    },
    {
      "id": "paraphrase-chatgpt-abbreviated",
      "category": "paraphrase",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "low",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 5964,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search", "llm_screen"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 1
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 3
          }
        ],
        "llm_screen": {
          "applied": true,
          "model": "anthropic/claude-haiku-4.5",
          "prompt_version": "2026-05-12",
          "verdict": "informal_abbreviation",
          "reasoning": "The claimed title 'ChatGPT: Ethics, Transparency, Advancement' is a shortened version of the resolved title 'ChatGPT in Research: Balancing Ethics, Transparency and Advancement'. Both papers share the same first author (Graf) and cover the same core topic: the ethical, transparency, and advancement considerations related to ChatGPT. The claimed title omits 'in Research' and 'Balancing' but preserves the three main thematic elements (Ethics, Transparency, Advancement). This is a typical informal abbreviation where the essential subject matter and specificity are maintained while using more concise wording.",
          "cost_usd": 0.0010072
        }
      }
    },
    {
      "id": "paraphrase-bnt-phase3",
      "category": "paraphrase",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "low",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 6702,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search", "llm_screen"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 1
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 10
          }
        ],
        "llm_screen": {
          "applied": true,
          "model": "anthropic/claude-haiku-4.5",
          "prompt_version": "2026-05-12",
          "verdict": "informal_abbreviation",
          "reasoning": "The claimed title 'Phase 3 trial of BNT162b2 mRNA Covid-19 vaccine' is a shortened, informal description of the resolved title 'Safety and Efficacy of the BNT162b2 mRNA Covid-19 Vaccine.' Both reference the same vaccine candidate (BNT162b2) and the same clinical trial. The claimed title focuses on the trial phase (Phase 3), while the resolved title emphasizes the measured outcomes (safety and efficacy). These are two ways of describing the same landmark clinical trial paper. The first authors match (Polack), confirming this is the same publication. The core topic—evaluation of the BNT162b2 mRNA vaccine—is identical, and the difference is merely in how concisely the main findings are referenced.",
          "cost_usd": 0.0011728
        }
      }
    },
    {
      "id": "paraphrase-topaz-reordered",
      "category": "paraphrase",
      "http_status": 200,
      "verdict": "matched",
      "confidence": "low",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "matched",
        "confidence_in": ["high", "medium", "low"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 5874,
      "attempts": 1,
      "provenance": {
        "stages_run": ["compare", "search", "llm_screen"],
        "resolved_via": "crossref",
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 10
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 1
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 1
          }
        ],
        "llm_screen": {
          "applied": true,
          "model": "anthropic/claude-haiku-4.5",
          "prompt_version": "2026-05-12",
          "verdict": "informal_abbreviation",
          "reasoning": "The claimed title 'Audit of fabricated citations across biomedical papers' and the resolved title 'Fabricated citations: an audit across 2·5 million biomedical papers' refer to the same paper. Both titles describe an audit of fabricated citations in biomedical literature, with the same first author (Topaz). The claimed title is a shortened, informal version that omits the specific scale ('2·5 million biomedical papers') but preserves the core topic and essence of the work. This is a clear case of title abbreviation/paraphrase rather than a different paper.",
          "cost_usd": 0.0009784
        }
      }
    },
    {
      "id": "invented-fake-doi-plausible-title",
      "category": "invented",
      "http_status": 200,
      "verdict": "not_found",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 3870,
      "attempts": 1,
      "provenance": {
        "stages_run": ["search"],
        "resolved_via": null,
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 0
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "invented-title-only-no-identifier",
      "category": "invented",
      "http_status": 200,
      "verdict": "not_found",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 1045,
      "attempts": 1,
      "provenance": {
        "stages_run": ["search"],
        "resolved_via": null,
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 0
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "invented-fake-pmid",
      "category": "invented",
      "http_status": 200,
      "verdict": "not_found",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 2273,
      "attempts": 1,
      "provenance": {
        "stages_run": ["search"],
        "resolved_via": null,
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 0
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    },
    {
      "id": "invented-fake-doi-resolves-to-nothing",
      "category": "invented",
      "http_status": 200,
      "verdict": "not_found",
      "confidence": "high",
      "error_code": null,
      "error_message": null,
      "expected": {
        "verdict": "not_found",
        "confidence_in": ["high"]
      },
      "match": {
        "ok": true
      },
      "elapsed_ms": 3343,
      "attempts": 1,
      "provenance": {
        "stages_run": ["search"],
        "resolved_via": null,
        "registries_searched": [
          {
            "registry": "crossref",
            "ok": true,
            "count": 0
          },
          {
            "registry": "pubmed",
            "ok": true,
            "count": 0
          },
          {
            "registry": "openalex",
            "ok": true,
            "count": 0
          }
        ],
        "llm_screen": {
          "applied": false,
          "reason": "verdict_not_eligible"
        }
      }
    }
  ]
}
