{"markup":"\u003C?xml version=\u00221.0\u0022 encoding=\u0022UTF-8\u0022 ?\u003E\n    \u003Chtml version=\u0022HTML+RDFa+MathML 1.1\u0022\n    xmlns:content=\u0022http:\/\/purl.org\/rss\/1.0\/modules\/content\/\u0022\n    xmlns:dc=\u0022http:\/\/purl.org\/dc\/terms\/\u0022\n    xmlns:foaf=\u0022http:\/\/xmlns.com\/foaf\/0.1\/\u0022\n    xmlns:og=\u0022http:\/\/ogp.me\/ns#\u0022\n    xmlns:rdfs=\u0022http:\/\/www.w3.org\/2000\/01\/rdf-schema#\u0022\n    xmlns:sioc=\u0022http:\/\/rdfs.org\/sioc\/ns#\u0022\n    xmlns:sioct=\u0022http:\/\/rdfs.org\/sioc\/types#\u0022\n    xmlns:skos=\u0022http:\/\/www.w3.org\/2004\/02\/skos\/core#\u0022\n    xmlns:xsd=\u0022http:\/\/www.w3.org\/2001\/XMLSchema#\u0022\n    xmlns:mml=\u0022http:\/\/www.w3.org\/1998\/Math\/MathML\u0022\u003E\n  \u003Chead\u003E\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022\/\/cdn.jsdelivr.net\/qtip2\/2.2.1\/jquery.qtip.min.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_YjAJQgxDlFX6S-O02jj9jCrVbrwlY3CGgCg1FzPlvBs.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\nif(typeof window.MathJax === \u0022undefined\u0022) window.MathJax = { menuSettings: { zoom: \u0022Click\u0022 } };\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_waP91NpgGpectm_6Y2XDEauLJ8WCSCBKmmA87unpp2E.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.googletagmanager.com\/gtag\/js?id=G-0K57TCX5BY\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\nwindow.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag(\u0022js\u0022, new 
Date());gtag(\u0022set\u0022, \u0022developer_id.dMDhkMT\u0022, true);gtag(\u0022config\u0022, \u0022G-0K57TCX5BY\u0022, {\u0022groups\u0022:\u0022default\u0022,\u0022anonymize_ip\u0022:true});\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\njQuery.extend(Drupal.settings, {\u0022basePath\u0022:\u0022\\\/\u0022,\u0022pathPrefix\u0022:\u0022\u0022,\u0022highwire\u0022:{\u0022ac\u0022:{\u0022medrxiv;2025.12.08.25341823v1\u0022:{\u0022access\u0022:{\u0022full\u0022:true},\u0022pisa_id\u0022:\u0022medrxiv;2025.12.08.25341823v1\u0022,\u0022apath\u0022:\u0022\u0022,\u0022jcode\u0022:\u0022medrxiv\u0022}},\u0022processed\u0022:[\u0022highwire_math\u0022],\u0022markup\u0022:[{\u0022requested\u0022:\u0022full-text\u0022,\u0022variant\u0022:\u0022full-text\u0022,\u0022view\u0022:\u0022full\u0022,\u0022pisa\u0022:\u0022medrxiv;2025.12.08.25341823v1\u0022}]},\u0022instances\u0022:\u0022{\\u0022highwire_abstract_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:20,\\u0022height\\u0022:20,\\u0022border\\u0022:1,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-abstract-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-abstract-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022right center\\u0022,\\u0022my\\u0022:\\u0022left center\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022shift\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter click \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave 
\\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}},\\u0022highwire_author_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:15,\\u0022height\\u0022:15,\\u0022border\\u0022:1,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-author-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-author-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022top center\\u0022,\\u0022my\\u0022:\\u0022bottom center\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave \\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}},\\u0022highwire_reflinks_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:15,\\u0022height\\u0022:15,\\u0022border\\u0022:1,\\u0022mimic\\u0022:\\u0022top center\\u0022,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-ref-link-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-ref-link-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022bottom left\\u0022,\\u0022my\\u0022:\\u0022top left\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022flip\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave 
\\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}}}\u0022,\u0022qtipDebug\u0022:\u0022{\\u0022leaveElement\\u0022:0}\u0022,\u0022googleanalytics\u0022:{\u0022account\u0022:[\u0022G-0K57TCX5BY\u0022],\u0022trackOutbound\u0022:1,\u0022trackMailto\u0022:1,\u0022trackDownload\u0022:1,\u0022trackDownloadExtensions\u0022:\u00227z|aac|arc|arj|asf|asx|avi|bin|csv|doc(x|m)?|dot(x|m)?|exe|flv|gif|gz|gzip|hqx|jar|jpe?g|js|mp(2|3|4|e?g)|mov(ie)?|msi|msp|pdf|phps|png|ppt(x|m)?|pot(x|m)?|pps(x|m)?|ppam|sld(x|m)?|thmx|qtm?|ra(m|r)?|sea|sit|tar|tgz|torrent|txt|wav|wma|wmv|wpd|xls(x|m|b)?|xlt(x|m)|xlam|xml|z|zip\u0022,\u0022trackColorbox\u0022:1},\u0022ajaxPageState\u0022:{\u0022js\u0022:{\u0022\\\/\\\/cdn.jsdelivr.net\\\/qtip2\\\/2.2.1\\\/jquery.qtip.min.js\u0022:1,\u0022sites\\\/all\\\/modules\\\/highwire\\\/highwire\\\/plugins\\\/highwire_markup_process\\\/js\\\/highwire_article_reference_popup.js\u0022:1,\u0022sites\\\/all\\\/modules\\\/highwire\\\/highwire\\\/plugins\\\/highwire_markup_process\\\/js\\\/highwire_at_symbol.js\u0022:1,\u00220\u0022:1,\u0022sites\\\/all\\\/modules\\\/contrib\\\/google_analytics\\\/googleanalytics.js\u0022:1,\u0022https:\\\/\\\/www.googletagmanager.com\\\/gtag\\\/js?id=G-0K57TCX5BY\u0022:1,\u00221\u0022:1}}});\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 href=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/advagg_css\/css__dn-cpI1YtkU_iLHgA5WhlkxgYWyat_IxjF_B-WSYrpE__a9hIbt0eaZ7d5nhwnm2weG8R_2eXK4EvoOx9dOxouHE__QrrGUc7CpljPR5Aph-ukPbcwtK4AWrHGwCEXJ_k1V_c.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 href=\u0022\/\/cdn.jsdelivr.net\/qtip2\/2.2.1\/jquery.qtip.min.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 
href=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/advagg_css\/css__HGACIFBlu2o05y3afvqlt5wrE_5Dn6MXsexfuEpeIwg__t4SOPxucAPoV3Os7g8dXqyMB1HRXQridRJ82X7nE33E__QrrGUc7CpljPR5Aph-ukPbcwtK4AWrHGwCEXJ_k1V_c.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink rel=\u0027stylesheet\u0027 type=\u0027text\/css\u0027 href=\u0027\/sites\/all\/modules\/contrib\/panels\/plugins\/layouts\/onecol\/onecol.css\u0027 \/\u003E\u003C\/head\u003E\u003Cbody\u003E\u003Cdiv class=\u0022panels-ajax-tab-panel panels-ajax-tab-panel-article-tab-full-text\u0022\u003E\u003Cdiv class=\u0022panel-display panel-1col clearfix\u0022 \u003E\n  \u003Cdiv class=\u0022panel-panel panel-col\u0022\u003E\n    \u003Cdiv\u003E\u003Cdiv class=\u0022panel-pane pane-highwire-markup\u0022 \u003E\n  \n      \n  \n  \u003Cdiv class=\u0022pane-content\u0022\u003E\n    \u003Cdiv class=\u0022highwire-markup\u0022\u003E\u003Cdiv xmlns=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022 data-highwire-cite-ref-tooltip-instance=\u0022highwire_reflinks_tooltip\u0022 class=\u0022content-block-markup\u0022 xmlns:xhtml=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022\u003E\u003Cdiv class=\u0022article fulltext-view \u0022\u003E\u003Cspan class=\u0022highwire-journal-article-marker-start\u0022\u003E\u003C\/span\u003E\u003Cdiv class=\u0022section abstract\u0022 id=\u0022abstract-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EABSTRACT\u003C\/h2\u003E\u003Cdiv id=\u0022sec-1\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-2\u0022\u003E\u003Cstrong\u003EBackground\u003C\/strong\u003E Large Language Models (LLMs) have demonstrated impressive capabilities in medical knowledge tasks, achieving 60-80% accuracy on licensing examinations. 
However, their reliability and consistency in clinical diagnosis\u2014critical for clinical trustworthiness\u2014remain incompletely characterized.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-2\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-3\u0022\u003E\u003Cstrong\u003EObjective\u003C\/strong\u003E To systematically evaluate the consistency and diagnostic accuracy of state-of-the-art LLMs in binary medical diagnosis, examining the relationship between reproducibility and diagnostic performance.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-3\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-4\u0022\u003E\u003Cstrong\u003EMethods\u003C\/strong\u003E We evaluated three frontier LLMs (GPT-4o, Gemini-2.0-Flash, Qwen-Plus) on heart disease diagnosis using 100 diverse clinical cases from the UCI Heart Disease dataset. Each model performed 4 independent assessments per case (1,200 total predictions). We tested two prompt variations (\u201cExpert Cardiologist\u201d vs \u201cNeutral Assessor\u201d) and measured intra-model consistency, inter-model agreement, diagnostic accuracy, and prompt sensitivity using a SQLite-based checkpoint system.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-4\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-5\u0022\u003E\u003Cstrong\u003EResults\u003C\/strong\u003E All models achieved exceptional intra-model consistency (99-100%), with Qwen-Plus demonstrating perfect reproducibility (100%). Inter-model agreement was similarly high (98-99%), indicating convergent reasoning patterns. However, diagnostic accuracy remained at approximately 50%, equivalent to random guessing. Models exhibited strong systematic bias toward positive diagnosis (49-51 false positives vs 0-1 false negatives per 100 cases). Prompt variation had minimal impact (\u22643% prediction changes), and error patterns were highly systematic, with all models making identical errors on 48-51% of cases. 
This created a consistency-accuracy gap of approximately 50 percentage points.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-5\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-6\u0022\u003E\u003Cstrong\u003EConclusions\u003C\/strong\u003E Our findings reveal a critical dissociation between consistency and accuracy in LLM medical diagnosis. While LLMs demonstrate remarkable reproducibility\u2013a desirable property for clinical tools\u2013their systematic tendency toward over-diagnosis and limited discriminative accuracy constrain direct clinical utility. The high inter-model agreement on errors suggests fundamental limitations in applying general-purpose LLMs to medical diagnosis rather than model-specific artifacts. Results suggest LLMs may be better suited as supplementary decision-support tools with human oversight rather than primary diagnostic systems. Future development should prioritize discriminative fine-tuning on labeled diagnostic datasets and calibration techniques to address systematic biases.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-6\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EINTRODUCTION\u003C\/h2\u003E\u003Cp id=\u0022p-17\u0022\u003ELarge Language Models (LLMs) have emerged as promising tools for clinical applications, demonstrating impressive performance on medical licensing examinations and case analysis [\u003Ca id=\u0022xref-ref-1-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-1\u0022\u003E1\u003C\/a\u003E-\u003Ca id=\u0022xref-ref-3-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-3\u0022\u003E3\u003C\/a\u003E]. Recent evaluations show frontier models achieving 60-80% accuracy on USMLE exams and other standardized medical assessments, approaching or exceeding average human physician performance on knowledge-based tasks. 
However, their deployment in clinical settings raises critical questions about reliability and consistency.\u003C\/p\u003E\u003Cp id=\u0022p-18\u0022\u003EUnlike traditional diagnostic tools expected to yield reproducible results, LLMs employ stochastic generation mechanisms that can lead to varying outputs for identical inputs. Temperature parameters, sampling methods, and prompt formulations can all influence model behavior, potentially undermining clinical reproducibility. Despite this concern, most evaluations of LLMs in medical contexts focus on single-run accuracy without assessing reproducibility across repeated assessments [\u003Ca id=\u0022xref-ref-4-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-4\u0022\u003E4\u003C\/a\u003E].\u003C\/p\u003E\u003Cdiv id=\u0022sec-7\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EResearch Gap\u003C\/h3\u003E\u003Cp id=\u0022p-19\u0022\u003EThe existing literature demonstrates a significant gap in understanding the relationship between consistency and accuracy in LLM medical diagnosis. While studies have evaluated diagnostic accuracy on various medical tasks [\u003Ca id=\u0022xref-ref-5-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-5\u0022\u003E5\u003C\/a\u003E,\u003Ca id=\u0022xref-ref-6-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-6\u0022\u003E6\u003C\/a\u003E], few have systematically examined whether high accuracy is accompanied by high consistency, or whether models can be consistently wrong. 
Additionally, the influence of prompt engineering\u2014often cited as a method to improve LLM performance\u2014remains incompletely characterized for diagnostic tasks [\u003Ca id=\u0022xref-ref-7-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-7\u0022\u003E7\u003C\/a\u003E].\u003C\/p\u003E\u003Cp id=\u0022p-20\u0022\u003EThree key questions remain unanswered:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-1\u0022\u003E\u003Cli id=\u0022list-item-1\u0022\u003E\u003Cp id=\u0022p-21\u0022\u003E\u003Cstrong\u003EHow reproducible are LLM diagnostic assessments?\u003C\/strong\u003E Do models provide consistent diagnoses when presented with identical clinical information multiple times?\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-2\u0022\u003E\u003Cp id=\u0022p-22\u0022\u003E\u003Cstrong\u003EWhat is the relationship between consistency and accuracy?\u003C\/strong\u003E Can models be highly consistent but systematically inaccurate, or does consistency guarantee accuracy?\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-3\u0022\u003E\u003Cp id=\u0022p-23\u0022\u003E\u003Cstrong\u003EHow sensitive are diagnostic decisions to prompt formulation?\u003C\/strong\u003E Can prompt engineering effectively modulate diagnostic behavior, or is behavior deeply encoded in model weights?\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-8\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EStudy Objectives\u003C\/h3\u003E\u003Cp id=\u0022p-24\u0022\u003EThis study addresses these gaps through comprehensive evaluation of three state-of-the-art LLMs on binary heart disease diagnosis. 
Our specific aims were to:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-2\u0022\u003E\u003Cli id=\u0022list-item-4\u0022\u003E\u003Cp id=\u0022p-25\u0022\u003E\u003Cstrong\u003EQuantify intra-model consistency\u003C\/strong\u003E by measuring agreement across repeated independent assessments of identical cases\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-5\u0022\u003E\u003Cp id=\u0022p-26\u0022\u003E\u003Cstrong\u003EEvaluate inter-model agreement\u003C\/strong\u003E to determine whether diagnostic patterns are model-specific or generalizable\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-6\u0022\u003E\u003Cp id=\u0022p-27\u0022\u003E\u003Cstrong\u003EMeasure diagnostic accuracy\u003C\/strong\u003E relative to ground truth and compare with consistency metrics\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-7\u0022\u003E\u003Cp id=\u0022p-28\u0022\u003E\u003Cstrong\u003EAssess prompt sensitivity\u003C\/strong\u003E by comparing two distinct prompt formulations\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-8\u0022\u003E\u003Cp id=\u0022p-29\u0022\u003E\u003Cstrong\u003EAnalyze error patterns\u003C\/strong\u003E to determine whether mistakes are random or systematic\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003Cp id=\u0022p-30\u0022\u003EUnderstanding these relationships is critical for responsible deployment of LLMs in clinical settings, where both accuracy and reproducibility are essential for patient safety and physician trust.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-9\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EMETHODS\u003C\/h2\u003E\u003Cdiv id=\u0022sec-10\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EDataset and Study Design\u003C\/h3\u003E\u003Cp id=\u0022p-31\u0022\u003EWe utilized the UCI Heart Disease dataset [\u003Ca id=\u0022xref-ref-8-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-8\u0022\u003E8\u003C\/a\u003E], a 
widely-used benchmark containing clinical data from 303 patients evaluated for coronary artery disease. The dataset includes 13 clinical parameters:\u003C\/p\u003E\u003Cul class=\u0022list-unord \u0022 id=\u0022list-3\u0022\u003E\u003Cli id=\u0022list-item-9\u0022\u003E\u003Cp id=\u0022p-32\u0022\u003E\u003Cstrong\u003EDemographics:\u003C\/strong\u003E Age, sex\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-10\u0022\u003E\u003Cp id=\u0022p-33\u0022\u003E\u003Cstrong\u003ESymptoms:\u003C\/strong\u003E Chest pain type (4 categories: typical angina, atypical angina, non-anginal pain, asymptomatic)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-11\u0022\u003E\u003Cp id=\u0022p-34\u0022\u003E\u003Cstrong\u003EVital signs:\u003C\/strong\u003E Resting blood pressure (mm Hg)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-12\u0022\u003E\u003Cp id=\u0022p-35\u0022\u003E\u003Cstrong\u003ELaboratory values:\u003C\/strong\u003E Serum cholesterol (mg\/dl), fasting blood sugar (\u0026gt;120 mg\/dl: yes\/no)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-13\u0022\u003E\u003Cp id=\u0022p-36\u0022\u003E\u003Cstrong\u003EElectrocardiography:\u003C\/strong\u003E Resting ECG findings (normal, ST-T wave abnormality, left ventricular hypertrophy)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-14\u0022\u003E\u003Cp id=\u0022p-37\u0022\u003E\u003Cstrong\u003EExercise testing:\u003C\/strong\u003E Maximum heart rate achieved (beats\/min), exercise-induced angina (yes\/no), ST depression induced by exercise relative to rest (mm), slope of peak exercise ST segment (upsloping, flat, downsloping)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-15\u0022\u003E\u003Cp id=\u0022p-38\u0022\u003E\u003Cstrong\u003EImaging:\u003C\/strong\u003E Number of major vessels (0-3) colored by fluoroscopy\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-16\u0022\u003E\u003Cp id=\u0022p-39\u0022\u003E\u003Cstrong\u003EThalassemia 
test:\u003C\/strong\u003E Normal, fixed defect, reversible defect\u003C\/p\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cp id=\u0022p-40\u0022\u003EThe binary outcome indicated presence (1) or absence (0) of significant coronary artery stenosis based on angiography.\u003C\/p\u003E\u003Cp id=\u0022p-41\u0022\u003ETo ensure diverse representation across the clinical spectrum, we performed k-means clustering (k=2) on all features and selected 50 cases from each cluster via stratified random sampling. This yielded 100 test cases with balanced disease prevalence (51% positive, 49% negative), representing varied clinical presentations from low-risk to high-risk profiles.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-11\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EModels Evaluated\u003C\/h3\u003E\u003Cp id=\u0022p-42\u0022\u003EWe evaluated three frontier LLMs representing different architectural families and training approaches:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-4\u0022\u003E\u003Cli id=\u0022list-item-17\u0022\u003E\u003Cp id=\u0022p-43\u0022\u003E\u003Cstrong\u003EGPT-4o\u003C\/strong\u003E (OpenAI): Multimodal model with extensive medical training\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-18\u0022\u003E\u003Cp id=\u0022p-44\u0022\u003E\u003Cstrong\u003EGemini-2.0-Flash\u003C\/strong\u003E (Google): Efficient variant with strong reasoning capabilities\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-19\u0022\u003E\u003Cp id=\u0022p-45\u0022\u003E\u003Cstrong\u003EQwen-Plus\u003C\/strong\u003E (Alibaba): Large-scale Chinese-English bilingual model\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003Cp id=\u0022p-46\u0022\u003EAll models were accessed via official APIs with temperature=0.7 to balance determinism and natural language generation. 
Each API call was independent, ensuring no information leakage between runs.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-12\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrompt Design\u003C\/h3\u003E\u003Cp id=\u0022p-47\u0022\u003EWe tested two prompt formulations to assess sensitivity to framing:\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-13\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrompt A (\u201cExpert Cardiologist\u201d)\u003C\/h3\u003E\u003Cspan class=\u0022preformat\u0022\u003E\nYou are Dr. CardioExpert, a highly experienced cardiologist with over\n20 years of\nclinical practice. You are reviewing a patient\u2019s clinical data to determine if they\nhave heart disease. Based on the following clinical parameters, provide your diagnosis.\n[Clinical data provided]\nDoes this patient have heart disease? Answer with:\n\u201cYes\u201d if you believe heart disease is present\n\u201cNo\u201d if you believe heart disease is absent\nProvide a brief 2-3 sentence justification for your diagnosis.\n\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-14\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrompt B (\u201cNeutral Assessor\u201d)\u003C\/h3\u003E\u003Cspan class=\u0022preformat\u0022\u003E\nYou are a medical AI assistant trained to provide accurate and\nbalanced diagnostic\nassessments. Your goal is to carefully evaluate clinical data and\nprovide a diagnosis,\navoiding both over-diagnosis and under-diagnosis. Based on the\nfollowing clinical\nparameters, provide your diagnosis.\n[Clinical data provided]\nDoes this patient have heart disease? Answer with:\n\u201cYes\u201d if the clinical evidence suggests heart disease is present\n\u201cNo\u201d if the clinical evidence suggests heart disease is absent\nProvide a brief 2-3 sentence justification for your diagnosis.\n\u003C\/span\u003E\u003Cp id=\u0022p-48\u0022\u003EBoth prompts provided identical clinical data with clear parameter definitions. 
The key difference was framing: Prompt A emphasized expert authority (potentially inducing medical conservatism), while Prompt B emphasized balanced assessment.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-15\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EExperimental Protocol\u003C\/h3\u003E\u003Cp id=\u0022p-49\u0022\u003EEach model performed 4 independent diagnostic assessments for each of the 100 test cases, yielding 1,200 predictions per prompt (100 cases \u00d7 4 runs \u00d7 3 models). We implemented a SQLite-based checkpoint system with the following features:\u003C\/p\u003E\u003Cul class=\u0022list-unord \u0022 id=\u0022list-5\u0022\u003E\u003Cli id=\u0022list-item-20\u0022\u003E\u003Cp id=\u0022p-50\u0022\u003E\u003Cstrong\u003EImmediate data persistence:\u003C\/strong\u003E Each prediction saved immediately after API response\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-21\u0022\u003E\u003Cp id=\u0022p-51\u0022\u003E\u003Cstrong\u003EDuplicate prevention:\u003C\/strong\u003E UNIQUE constraint on (test_id, run_id, model) prevented accidental re-runs\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-22\u0022\u003E\u003Cp id=\u0022p-52\u0022\u003E\u003Cstrong\u003EAutomatic resumption:\u003C\/strong\u003E System detected completed runs and continued from last checkpoint\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-23\u0022\u003E\u003Cp id=\u0022p-53\u0022\u003E\u003Cstrong\u003EComprehensive logging:\u003C\/strong\u003E Timestamps, justifications, and error messages recorded for all predictions\u003C\/p\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cp id=\u0022p-54\u0022\u003EThe system enabled reliable execution across multiple days despite API rate limits and connection issues, ensuring complete data collection.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-16\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EOutcome Measures\u003C\/h3\u003E\u003Cdiv id=\u0022sec-17\u0022 
class=\u0022subsection\u0022\u003E\u003Ch4\u003EPrimary outcomes\u003C\/h4\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-6\u0022\u003E\u003Cli id=\u0022list-item-24\u0022\u003E\u003Cp id=\u0022p-55\u0022\u003E\u003Cstrong\u003EIntra-model consistency:\u003C\/strong\u003E For each case, we calculated the proportion of 4 runs with majority agreement (\u22652\/4 identical predictions). Perfect consistency = 100% (all 4 runs identical).\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-25\u0022\u003E\u003Cp id=\u0022p-56\u0022\u003E\u003Cstrong\u003EDiagnostic accuracy:\u003C\/strong\u003E Using majority voting per case (\u22652\/4 runs for final diagnosis), we calculated:\u003C\/p\u003E\u003Cul class=\u0022list-unord \u0022 id=\u0022list-7\u0022\u003E\u003Cli id=\u0022list-item-26\u0022\u003E\u003Cp id=\u0022p-57\u0022\u003EAccuracy: (TP + TN) \/ Total\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-27\u0022\u003E\u003Cp id=\u0022p-58\u0022\u003ESensitivity (Recall): TP \/ (TP + FN)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-28\u0022\u003E\u003Cp id=\u0022p-59\u0022\u003ESpecificity: TN \/ (TN + FP)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-29\u0022\u003E\u003Cp id=\u0022p-60\u0022\u003EPrecision: TP \/ (TP + FP)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-30\u0022\u003E\u003Cp id=\u0022p-61\u0022\u003EF1-score: 2 \u00d7 (Precision \u00d7 Recall) \/ (Precision + Recall)\u003C\/p\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-31\u0022\u003E\u003Cp id=\u0022p-62\u0022\u003E\u003Cstrong\u003EInter-model agreement:\u003C\/strong\u003E Pairwise agreement between models and Cohen\u2019s kappa for chance-corrected agreement\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-18\u0022 class=\u0022subsection\u0022\u003E\u003Ch4\u003ESecondary outcomes\u003C\/h4\u003E\u003Cp id=\u0022p-63\u0022\u003E\u003Cstrong\u003E4. 
Prompt sensitivity:\u003C\/strong\u003E Proportion of cases with identical predictions across both prompt formulations\u003C\/p\u003E\u003Cp id=\u0022p-64\u0022\u003E\u003Cstrong\u003E5. Error pattern analysis:\u003C\/strong\u003E Classification of cases as all-correct (all 3 models right), all-wrong (all 3 models wrong), or mixed outcomes\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-19\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EStatistical Analysis\u003C\/h3\u003E\u003Cp id=\u0022p-65\u0022\u003EWe calculated descriptive statistics (mean, standard deviation, range) for all metrics. Confusion matrices visualized diagnostic patterns. Cohen\u2019s kappa assessed inter-model agreement with chance correction. All analyses used Python with pandas, NumPy, scikit-learn, and scipy. Statistical significance was set at p\u0026lt;0.05 (two-tailed tests).\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-20\u0022\u003E\u003Ch2 class=\u0022\u0022\u003ERESULTS\u003C\/h2\u003E\u003Cdiv id=\u0022sec-21\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E1. Intra-Model Consistency: Exceptional Reproducibility\u003C\/h3\u003E\u003Cp id=\u0022p-66\u0022\u003EAll three models demonstrated remarkably high intra-model consistency across repeated assessments (\u003Ca id=\u0022xref-table-wrap-1-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T1\u0022\u003ETable 1\u003C\/a\u003E). Qwen-Plus achieved perfect consistency (100%) with Prompt A, never varying across 4 independent runs for any of the 100 cases. 
GPT-4o and Gemini-2.0-Flash showed 99.0-99.5% average consistency.\u003C\/p\u003E\u003Cdiv id=\u0022T1\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1121911\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1121911\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1121911\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 1.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EIntra-Model Consistency Metrics\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp 
id=\u0022p-67\u0022\u003ENotably, 96-100% of cases achieved perfect agreement (4\/ 4 identical predictions), and minimum consistency never fell below 50% (indicating at least 2\/4 runs agreed in all cases). This demonstrates that LLMs apply reasoning patterns consistently rather than generating random outputs.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-22\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E2. Inter-Model Agreement: High Consensus\u003C\/h3\u003E\u003Cp id=\u0022p-68\u0022\u003EModels showed 98-100% pairwise agreement, indicating remarkably similar reasoning patterns despite different architectures and training procedures (\u003Ca id=\u0022xref-table-wrap-2-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T2\u0022\u003ETable 2\u003C\/a\u003E).\u003C\/p\u003E\u003Cdiv id=\u0022T2\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1121913\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1121913\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 
data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1121913\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 2.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EInter-Model Agreement\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-69\u0022\u003EThree-way agreement (all models concurring) occurred in 98-99% of cases. Cohen\u2019s kappa values near zero reflected extreme class imbalance (nearly all positive predictions) rather than lack of agreement\u2014when one model predicted \u201cdisease,\u201d others almost always agreed.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-23\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E3. Diagnostic Accuracy: Limited Despite High Consistency\u003C\/h3\u003E\u003Cp id=\u0022p-70\u0022\u003EDiagnostic accuracy approximated random guessing (48-51%) despite 99-100% consistency (\u003Ca id=\u0022xref-table-wrap-3-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T3\u0022\u003ETable 3\u003C\/a\u003E). 
This created a consistency-accuracy gap of approximately 50 percentage points.\u003C\/p\u003E\u003Cdiv id=\u0022T3\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1121907\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1121907\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1121907\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 3.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EDiagnostic Performance Metrics\u003C\/span\u003E\u003Cdiv class=\u0022sb-div 
caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-71\u0022\u003EModels achieved perfect or near-perfect recall (98-100%), indicating excellent sensitivity for detecting disease. However, specificity was extremely poor (\u223c0-2%), generating 49-51 false positives versus only 0-1 false negatives. This pattern suggests systematic positive diagnosis bias rather than random errors.\u003C\/p\u003E\u003Cp id=\u0022p-72\u0022\u003ERepresentative confusion matrices (\u003Ca id=\u0022xref-fig-1-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F1\u0022\u003EFigure 1\u003C\/a\u003E) showed models predicted \u201cdisease present\u201d for nearly all cases, with true negatives \u2248 0 across all model-prompt combinations.\u003C\/p\u003E\u003Cdiv id=\u0022F1\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F1.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Confusion Matrices for All Model-Prompt Combinations Six confusion matrices (2\u0026#xD7;3 grid) showing diagnostic patterns for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus with both Expert and Neutral prompts. All matrices show systematic positive bias with TN \u0026#x2248; 0, FP \u0026#x2248; 49-51, FN \u0026#x2248; 0-1, TP \u0026#x2248; 51. 
Annotations include accuracy, sensitivity, and specificity for each.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;Confusion Matrices for All Model-Prompt Combinations\u0026lt;\/span\u0026gt; Six confusion matrices (2\u0026#xD7;3 grid) showing diagnostic patterns for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus with both Expert and Neutral prompts. All matrices show systematic positive bias with TN \u0026#x2248; 0, FP \u0026#x2248; 49-51, FN \u0026#x2248; 0-1, TP \u0026#x2248; 51. Annotations include accuracy, sensitivity, and specificity for each.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 1.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F1.medium.gif\u0022 width=\u0022440\u0022 height=\u0022297\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 1.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F1.medium.gif\u0022 width=\u0022440\u0022 height=\u0022297\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F1.large.jpg?download=true\u0022 
class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 1.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F1.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022 xmlns:xhtml=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 1.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EConfusion Matrices for All Model-Prompt Combinations\u003C\/span\u003E\u003Cp id=\u0022p-73\u0022 class=\u0022first-child\u0022\u003ESix confusion matrices (2\u00d73 grid) showing diagnostic patterns for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus with both Expert and Neutral prompts. All matrices show systematic positive bias with TN \u2248 0, FP \u2248 49-51, FN \u2248 0-1, TP \u2248 51. Annotations include accuracy, sensitivity, and specificity for each.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-24\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E4. Prompt Sensitivity: Minimal Impact\u003C\/h3\u003E\u003Cp id=\u0022p-74\u0022\u003EChanging from \u201cExpert Cardiologist\u201d to \u201cNeutral Assessor\u201d prompt had minimal effect (\u003Ca id=\u0022xref-table-wrap-4-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T4\u0022\u003ETable 4\u003C\/a\u003E). 
GPT-4o showed zero sensitivity (100% identical predictions), while Gemini and Qwen changed only 1-3 predictions (1-3% of cases).\u003C\/p\u003E\u003Cdiv id=\u0022T4\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1121908\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1121908\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1121908\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 4.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EPrompt Robustness Analysis\u003C\/span\u003E\u003Cdiv class=\u0022sb-div 
caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-75\u0022\u003EThis suggests diagnostic behavior is deeply encoded in model weights rather than easily modifiable through surface-level prompt variations. Notably, prompt changes slightly worsened accuracy, suggesting the \u201cneutral\u201d framing did not reduce positive bias as hypothesized.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-25\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E5. Error Pattern Analysis: Systematic, Not Random\u003C\/h3\u003E\u003Cp id=\u0022p-76\u0022\u003EErrors were highly systematic rather than random (\u003Ca id=\u0022xref-table-wrap-5-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T5\u0022\u003ETable 5\u003C\/a\u003E). In 98-99% of cases, all three models either succeeded together or failed together. Only 1-2% showed model disagreement.\u003C\/p\u003E\u003Cdiv id=\u0022T5\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1121910\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1121910\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 
rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1121910\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 5.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EError Consistency Patterns\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-77\u0022\u003EThis indicates models share fundamental limitations or biases rather than making independent errors. Qualitative analysis of justifications revealed models consistently cited elevated cholesterol, abnormal ECG findings, or exercise abnormalities as disease evidence even when ground truth indicated absence of significant stenosis. This suggests confusion between cardiovascular risk factors and diagnostic criteria for coronary disease.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-26\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003E6. Consistency vs Accuracy Trade-off: The Central Finding\u003C\/h3\u003E\u003Cp id=\u0022p-78\u0022\u003E\u003Ca id=\u0022xref-fig-2-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F2\u0022\u003EFigure 2\u003C\/a\u003E visualizes the consistency-accuracy relationship, revealing the critical dissociation. 
All models cluster in the high-consistency, low-accuracy quadrant, demonstrating that exceptional reproducibility (99-100%) coexists with chance level accuracy (\u223c50%).\u003C\/p\u003E\u003Cdiv id=\u0022F2\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F2.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Comprehensive Consistency Analysis (7 Panels) (A) Intra model consistency bar chart showing 99-100% consistency across all models (B) Perfect agreement rate (4\/4 runs identical) showing 96-100% (C) Consistency vs accuracy scatter plot revealing 50-point gap (D) Inter-model agreement heatmap showing 98-100% pairwise agreement (E) Error pattern distribution (all-correct: 48-50%, all-wrong: 48-51%, mixed: 1-2%) (F) Prompt robustness showing 0-3 prediction changes per model (G) Consistency distribution histogram showing clustering at 100%\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;Comprehensive Consistency Analysis (7 Panels)\u0026lt;\/span\u0026gt; (A) Intra model consistency bar chart showing 99-100% consistency across all models (B) Perfect agreement rate (4\/4 runs identical) showing 96-100% (C) Consistency vs accuracy scatter plot revealing 50-point gap (D) Inter-model agreement heatmap showing 98-100% pairwise agreement (E) Error pattern distribution (all-correct: 48-50%, all-wrong: 48-51%, mixed: 1-2%) (F) Prompt robustness showing 0-3 prediction changes per model (G) 
Consistency distribution histogram showing clustering at 100%\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 2.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F2.medium.gif\u0022 width=\u0022440\u0022 height=\u0022330\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 2.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F2.medium.gif\u0022 width=\u0022440\u0022 height=\u0022330\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F2.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 2.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F2.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 2.\u003C\/span\u003E \u003Cspan 
class=\u0022caption-title\u0022\u003EComprehensive Consistency Analysis (7 Panels)\u003C\/span\u003E\u003Cp id=\u0022p-79\u0022 class=\u0022first-child\u0022\u003E(A) Intra model consistency bar chart showing 99-100% consistency across all models (B) Perfect agreement rate (4\/4 runs identical) showing 96-100% (C) Consistency vs accuracy scatter plot revealing 50-point gap (D) Inter-model agreement heatmap showing 98-100% pairwise agreement (E) Error pattern distribution (all-correct: 48-50%, all-wrong: 48-51%, mixed: 1-2%) (F) Prompt robustness showing 0-3 prediction changes per model (G) Consistency distribution histogram showing clustering at 100%\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022F3\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F3.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Prompt Comparison (4 Panels) (A) Accuracy comparison (Expert vs Neutral) showing minimal difference (B) False positive rate comparison (both 49-51%) (C) Prediction agreement across prompts (98-100%) (D) Sample case showing identical justifications despite prompt difference\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;Prompt Comparison (4 Panels)\u0026lt;\/span\u0026gt; (A) Accuracy comparison (Expert vs Neutral) showing minimal difference (B) False positive rate comparison (both 49-51%) (C) 
Prediction agreement across prompts (98-100%) (D) Sample case showing identical justifications despite prompt difference\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 3.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F3.medium.gif\u0022 width=\u0022440\u0022 height=\u0022349\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 3.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F3.medium.gif\u0022 width=\u0022440\u0022 height=\u0022349\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F3.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 3.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F3.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 
3.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EPrompt Comparison (4 Panels)\u003C\/span\u003E\u003Cp id=\u0022p-80\u0022 class=\u0022first-child\u0022\u003E(A) Accuracy comparison (Expert vs Neutral) showing minimal difference (B) False positive rate comparison (both 49-51%) (C) Prediction agreement across prompts (98-100%) (D) Sample case showing identical justifications despite prompt difference\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-81\u0022\u003EThis 50-percentage-point gap represents the core finding: \u003Cstrong\u003ELLMs reliably apply learned reasoning patterns, but those patterns are systematically biased toward positive diagnosis\u003C\/strong\u003E.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-27\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EDISCUSSION\u003C\/h2\u003E\u003Cdiv id=\u0022sec-28\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrincipal Findings\u003C\/h3\u003E\u003Cp id=\u0022p-82\u0022\u003EThis study demonstrates a critical dissociation between consistency and accuracy in LLM medical diagnosis. Three state-of-the-art models achieved exceptional intra-model consistency (99-100%) and high inter-model agreement (98-99%), yet diagnostic accuracy remained at approximately 50%\u2014equivalent to random guessing. This created a consistency-accuracy gap of \u223c50 percentage points, revealing that \u003Cstrong\u003ELLMs can be reliably wrong\u003C\/strong\u003E.\u003C\/p\u003E\u003Cp id=\u0022p-83\u0022\u003EThe systematic nature of errors\u201448-51% of cases had all models wrong together\u2014indicates shared fundamental limitations rather than random fluctuations. 
Prompt engineering had minimal impact (\u22643% prediction changes), suggesting diagnostic behavior is deeply encoded rather than easily modifiable.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-29\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EThe Consistency-Accuracy Paradox\u003C\/h3\u003E\u003Cp id=\u0022p-84\u0022\u003EHigh consistency without high accuracy indicates LLMs reliably apply learned reasoning patterns, but those patterns are systematically biased. This \u201cconsistent wrongness\u201d is arguably more concerning than random errors for several reasons:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-8\u0022\u003E\u003Cli id=\u0022list-item-32\u0022\u003E\u003Cp id=\u0022p-85\u0022\u003E\u003Cstrong\u003EUndermines calibration:\u003C\/strong\u003E Physicians may develop false confidence in reproducible but incorrect assessments\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-33\u0022\u003E\u003Cp id=\u0022p-86\u0022\u003E\u003Cstrong\u003EDifficult to detect:\u003C\/strong\u003E Without ground truth, consistency may be mistaken for accuracy\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-34\u0022\u003E\u003Cp id=\u0022p-87\u0022\u003E\u003Cstrong\u003ESystematic harm:\u003C\/strong\u003E Consistent over-diagnosis leads to predictable patterns of unnecessary testing and treatment\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003Cp id=\u0022p-88\u0022\u003ESeveral mechanisms may explain this paradox:\u003C\/p\u003E\u003Cdiv id=\u0022sec-30\u0022 class=\u0022subsection\u0022\u003E\u003Ch4\u003EMedical Conservatism Bias\u003C\/h4\u003E\u003Cp id=\u0022p-89\u0022\u003ELLMs trained on medical literature may encode the clinical heuristic that missing disease (false negative) carries greater consequences than over-diagnosis (false positive). 
This \u201cbetter safe than sorry\u201d reasoning, while defensible in human clinical practice with subsequent testing, becomes problematic when applied mechanistically to binary classification.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-31\u0022 class=\u0022subsection\u0022\u003E\u003Ch4\u003ERisk Factor Conflation\u003C\/h4\u003E\u003Cp id=\u0022p-90\u0022\u003EQualitative analysis suggests models conflate cardiovascular risk factors with diagnostic findings. Elevated cholesterol increases long-term disease risk but doesn\u2019t constitute diagnostic evidence of current coronary stenosis. LLMs trained on general medical text emphasizing risk management may struggle to distinguish \u201chigh-risk patient\u201d from \u201cdisease positive patient.\u201d\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-32\u0022 class=\u0022subsection\u0022\u003E\u003Ch4\u003ELack of Discriminative Training\u003C\/h4\u003E\u003Cp id=\u0022p-91\u0022\u003EUnlike supervised models trained explicitly on diagnostic labels with balanced loss functions, LLMs learn from general medical text emphasizing disease description more than differential diagnosis. This leaves them poorly calibrated for binary classification tasks.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-33\u0022 class=\u0022subsection\u0022\u003E\u003Ch4\u003ETraining Data Imbalance\u003C\/h4\u003E\u003Cp id=\u0022p-92\u0022\u003EMedical literature disproportionately discusses disease-positive cases, potentially biasing LLMs toward assuming disease presence when clinical findings are ambiguous.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-34\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrompt Insensitivity: Deep-Rooted Behavior\u003C\/h3\u003E\u003Cp id=\u0022p-93\u0022\u003EThe minimal prompt impact (GPT: 0%, Gemini: 2%, Qwen: 1% prediction changes) was unexpected. 
Reframing from \u201cexpert cardiologist\u201d (potentially inducing conservative bias) to \u201cneutral assessor\u201d (emphasizing balance) should have reduced over-diagnosis if behavior were prompt-modifiable.\u003C\/p\u003E\u003Cp id=\u0022p-94\u0022\u003EThis insensitivity suggests diagnostic reasoning is encoded in model weights through pre-training and cannot be substantially altered through instruction-based prompting alone. This has important implications:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-9\u0022\u003E\u003Cli id=\u0022list-item-35\u0022\u003E\u003Cp id=\u0022p-95\u0022\u003E\u003Cstrong\u003EPrompt engineering has limits:\u003C\/strong\u003E Instruction-based approaches may be insufficient for medical tasks requiring calibrated decision thresholds\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-36\u0022\u003E\u003Cp id=\u0022p-96\u0022\u003E\u003Cstrong\u003EFine-tuning may be necessary:\u003C\/strong\u003E Supervised training on labeled diagnostic datasets with balanced objectives may be required\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-37\u0022\u003E\u003Cp id=\u0022p-97\u0022\u003E\u003Cstrong\u003EBehavioral inertia:\u003C\/strong\u003E Models may resist behavioral changes that conflict with deeply encoded patterns from pre-training\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-35\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EInter-Model Agreement: Shared Limitations\u003C\/h3\u003E\u003Cp id=\u0022p-98\u0022\u003EThe 98-99% inter-model agreement despite different architectures (GPT: decoder-only transformer, Gemini: multimodal, Qwen: bilingual) and training procedures suggests observed limitations reflect fundamental challenges in applying general-purpose LLMs to medical diagnosis rather than model-specific artifacts.\u003C\/p\u003E\u003Cp id=\u0022p-99\u0022\u003EPossible explanations include:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 
id=\u0022list-10\u0022\u003E\u003Cli id=\u0022list-item-38\u0022\u003E\u003Cp id=\u0022p-100\u0022\u003E\u003Cstrong\u003ESimilar training data:\u003C\/strong\u003E Major models likely train on overlapping medical text corpora\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-39\u0022\u003E\u003Cp id=\u0022p-101\u0022\u003E\u003Cstrong\u003EConvergent learning:\u003C\/strong\u003E Different architectures may converge on similar medical conservatism heuristics\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-40\u0022\u003E\u003Cp id=\u0022p-102\u0022\u003E\u003Cstrong\u003EShared limitations in processing structured data:\u003C\/strong\u003E All models receive tabular clinical data as text, potentially losing important numerical relationships\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-41\u0022\u003E\u003Cp id=\u0022p-103\u0022\u003E\u003Cstrong\u003ECommon threshold calibration failure:\u003C\/strong\u003E Binary classification requires well-calibrated decision boundaries, which general-purpose LLMs lack\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-36\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EClinical Implications\u003C\/h3\u003E\u003Cp id=\u0022p-104\u0022\u003E\u003Cstrong\u003ECurrent LLMs are not ready for primary diagnostic applications\u003C\/strong\u003E requiring binary classification. 
The \u223c50% accuracy is unacceptable clinically and could lead to:\u003C\/p\u003E\u003Cul class=\u0022list-unord \u0022 id=\u0022list-11\u0022\u003E\u003Cli id=\u0022list-item-42\u0022\u003E\u003Cp id=\u0022p-105\u0022\u003E\u003Cstrong\u003EHarmful over-diagnosis:\u003C\/strong\u003E 49-51% false positive rate means half of healthy patients would receive incorrect disease diagnosis\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-43\u0022\u003E\u003Cp id=\u0022p-106\u0022\u003E\u003Cstrong\u003EUnnecessary downstream testing:\u003C\/strong\u003E Cascade of follow-up tests, imaging, and procedures\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-44\u0022\u003E\u003Cp id=\u0022p-107\u0022\u003E\u003Cstrong\u003EPatient anxiety and cost:\u003C\/strong\u003E Psychological burden and financial impact of false diagnoses\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-45\u0022\u003E\u003Cp id=\u0022p-108\u0022\u003E\u003Cstrong\u003EResource misallocation:\u003C\/strong\u003E Diverting limited healthcare resources to false positive cases\u003C\/p\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cp id=\u0022p-109\u0022\u003EIn a typical screening scenario with 50% disease prevalence (as in our test set), deploying these models would result in: - \u003Cstrong\u003E98-100% of disease cases correctly identified\u003C\/strong\u003E (excellent sensitivity) - \u003Cstrong\u003EOnly 0-2% of healthy cases correctly identified\u003C\/strong\u003E (terrible specificity) - \u003Cstrong\u003ENet effect: \u223c50% unnecessary diagnoses\u003C\/strong\u003E\u003C\/p\u003E\u003Cp id=\u0022p-110\u0022\u003EDespite limitations for primary diagnosis, LLMs\u2019 high consistency suggests potential roles as:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-12\u0022\u003E\u003Cli id=\u0022list-item-46\u0022\u003E\u003Cp id=\u0022p-111\u0022\u003E\u003Cstrong\u003ESecond opinion tools:\u003C\/strong\u003E Reproducibility could build physician confidence when 
LLM agrees with human assessment\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-47\u0022\u003E\u003Cp id=\u0022p-112\u0022\u003E\u003Cstrong\u003ETriage assistance:\u003C\/strong\u003E High sensitivity (98-100%) suitable for initial screening where false positives are acceptable\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-48\u0022\u003E\u003Cp id=\u0022p-113\u0022\u003E\u003Cstrong\u003EMedical education:\u003C\/strong\u003E Consistent reasoning patterns useful for training scenarios\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-49\u0022\u003E\u003Cp id=\u0022p-114\u0022\u003E\u003Cstrong\u003EResearch hypothesis generation:\u003C\/strong\u003E Systematic patterns may reveal interesting clinical relationships\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003Cp id=\u0022p-115\u0022\u003E\u003Cstrong\u003EAll such applications require human oversight\u003C\/strong\u003E given poor specificity.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-37\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ETechnical Implications\u003C\/h3\u003E\u003Cp id=\u0022p-116\u0022\u003EResults suggest general-purpose LLMs lack discriminative capabilities for diagnostic classification tasks. 
Future development should consider:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-13\u0022\u003E\u003Cli id=\u0022list-item-50\u0022\u003E\u003Cp id=\u0022p-117\u0022\u003E\u003Cstrong\u003ESupervised fine-tuning:\u003C\/strong\u003E Training on labeled diagnostic datasets with balanced loss functions to improve calibration\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-51\u0022\u003E\u003Cp id=\u0022p-118\u0022\u003E\u003Cstrong\u003EReinforcement learning from physician feedback:\u003C\/strong\u003E Learning from expert-verified diagnostic decisions\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-52\u0022\u003E\u003Cp id=\u0022p-119\u0022\u003E\u003Cstrong\u003EThreshold calibration techniques:\u003C\/strong\u003E Post-hoc calibration methods for binary classification (e.g., Platt scaling, isotonic regression)\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-53\u0022\u003E\u003Cp id=\u0022p-120\u0022\u003E\u003Cstrong\u003EHybrid architectures:\u003C\/strong\u003E Combining LLM reasoning with specialized classifiers for final diagnosis\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-54\u0022\u003E\u003Cp id=\u0022p-121\u0022\u003E\u003Cstrong\u003EStructured data processing:\u003C\/strong\u003E Developing methods to better handle numerical clinical parameters\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-55\u0022\u003E\u003Cp id=\u0022p-122\u0022\u003E\u003Cstrong\u003EAdversarial training:\u003C\/strong\u003E Exposing models to challenging cases where risk factors don\u2019t indicate current disease\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-38\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EComparison with Human Physicians\u003C\/h3\u003E\u003Cp id=\u0022p-123\u0022\u003EWhile direct human comparison was beyond our scope, literature provides context. 
Traditional machine learning models (SVM, Random Forest, XGBoost) achieve 90-93% accuracy on UCI Heart Disease dataset with similar features [\u003Ca id=\u0022xref-ref-9-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-9\u0022\u003E9\u003C\/a\u003E]. Human cardiologists evaluating identical clinical data achieve 85-95% accuracy in research settings [\u003Ca id=\u0022xref-ref-10-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-10\u0022\u003E10\u003C\/a\u003E].\u003C\/p\u003E\u003Cp id=\u0022p-124\u0022\u003ELLMs\u2019 50% accuracy is substantially below both traditional ML and human performance, despite impressive performance on medical knowledge exams. This suggests: - \u003Cstrong\u003EKnowledge \u2260 application:\u003C\/strong\u003E Answering factual questions differs from applying knowledge to specific cases - \u003Cstrong\u003EDifferent cognitive demands:\u003C\/strong\u003E Diagnosis requires discriminative reasoning, not just knowledge recall - \u003Cstrong\u003ENeed for task-specific training:\u003C\/strong\u003E General medical knowledge insufficient for specialized diagnostic tasks\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-39\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ELimitations\u003C\/h3\u003E\u003Cp id=\u0022p-125\u0022\u003ESeveral limitations warrant consideration:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-14\u0022\u003E\u003Cli id=\u0022list-item-56\u0022\u003E\u003Cp id=\u0022p-126\u0022\u003E\u003Cstrong\u003ESingle condition:\u003C\/strong\u003E Heart disease may not generalize to other diagnostic domains\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-57\u0022\u003E\u003Cp id=\u0022p-127\u0022\u003E\u003Cstrong\u003EBinary classification:\u003C\/strong\u003E Real diagnosis often involves multi-class, probabilistic, or hierarchical assessment\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-58\u0022\u003E\u003Cp id=\u0022p-128\u0022\u003E\u003Cstrong\u003EDataset 
age:\u003C\/strong\u003E 1988 UCI dataset uses diagnostic criteria potentially outdated by current standards\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-59\u0022\u003E\u003Cp id=\u0022p-129\u0022\u003E\u003Cstrong\u003ELimited sample size:\u003C\/strong\u003E 100 cases, though with 4 runs each provides robust consistency estimates\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-60\u0022\u003E\u003Cp id=\u0022p-130\u0022\u003E\u003Cstrong\u003EStructured input only:\u003C\/strong\u003E Missing important narrative information from patient history\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-61\u0022\u003E\u003Cp id=\u0022p-131\u0022\u003E\u003Cstrong\u003EThree models tested:\u003C\/strong\u003E Limited sampling of LLM landscape\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-62\u0022\u003E\u003Cp id=\u0022p-132\u0022\u003E\u003Cstrong\u003EAPI-only access:\u003C\/strong\u003E Inability to analyze internal mechanisms or attention patterns\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-63\u0022\u003E\u003Cp id=\u0022p-133\u0022\u003E\u003Cstrong\u003ESingle temperature setting:\u003C\/strong\u003E Only tested temperature=0.7\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-64\u0022\u003E\u003Cp id=\u0022p-134\u0022\u003E\u003Cstrong\u003ENo clinical context:\u003C\/strong\u003E Models lack patient history, physical exam, or prior test results\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-40\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EFuture Directions\u003C\/h3\u003E\u003Cp id=\u0022p-135\u0022\u003EImportant future work includes:\u003C\/p\u003E\u003Col class=\u0022list-ord \u0022 id=\u0022list-15\u0022\u003E\u003Cli id=\u0022list-item-65\u0022\u003E\u003Cp id=\u0022p-136\u0022\u003E\u003Cstrong\u003EMechanistic studies:\u003C\/strong\u003E Analyzing which clinical parameters LLMs prioritize and how they combine 
evidence\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-66\u0022\u003E\u003Cp id=\u0022p-137\u0022\u003E\u003Cstrong\u003EImprovement strategies:\u003C\/strong\u003E Testing fine-tuning, ensemble methods, and hybrid approaches\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-67\u0022\u003E\u003Cp id=\u0022p-138\u0022\u003E\u003Cstrong\u003EBroader evaluation:\u003C\/strong\u003E Diverse diagnostic tasks, specialties, and patient populations\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-68\u0022\u003E\u003Cp id=\u0022p-139\u0022\u003E\u003Cstrong\u003EHuman comparison:\u003C\/strong\u003E Direct comparison with physician performance on identical cases\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-69\u0022\u003E\u003Cp id=\u0022p-140\u0022\u003E\u003Cstrong\u003ELongitudinal assessment:\u003C\/strong\u003E Evaluating consistency across model updates and versions\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-70\u0022\u003E\u003Cp id=\u0022p-141\u0022\u003E\u003Cstrong\u003ETheoretical development:\u003C\/strong\u003E Formal frameworks for consistency-accuracy trade-offs\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-71\u0022\u003E\u003Cp id=\u0022p-142\u0022\u003E\u003Cstrong\u003ECalibration methods:\u003C\/strong\u003E Developing post-hoc techniques to improve threshold decisions\u003C\/p\u003E\u003C\/li\u003E\u003Cli id=\u0022list-item-72\u0022\u003E\u003Cp id=\u0022p-143\u0022\u003E\u003Cstrong\u003EClinical integration studies:\u003C\/strong\u003E Real-world pilot implementations with human oversight\u003C\/p\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-41\u0022\u003E\u003Ch2 class=\u0022\u0022\u003ECONCLUSIONS\u003C\/h2\u003E\u003Cp id=\u0022p-144\u0022\u003EThis study provides rigorous evidence that large language models achieve exceptional consistency (99-100%) but limited accuracy ( \u223c50%) in binary medical 
diagnosis. This consistency-accuracy dissociation represents a fundamental challenge for clinical deployment.\u003C\/p\u003E\u003Cdiv id=\u0022sec-42\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EKey findings\u003C\/h3\u003E\u003Cp id=\u0022p-145\u0022\u003E1. High consistency does not guarantee accuracy\u2014models can be reliably wrong 2. Diagnostic behavior is resistant to prompt engineering, suggesting deep encoding in model weights 3. Errors are systematic rather than random, with all models failing together on \u223c50% of cases 4. Strong positive diagnosis bias (49-51 false positives, 0-1 false negatives) indicates conservatism or risk factor conflation\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-43\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EImplications for practice\u003C\/h3\u003E\u003Cp id=\u0022p-146\u0022\u003ECurrent general-purpose LLMs are \u003Cstrong\u003Enot ready for primary diagnostic applications\u003C\/strong\u003E requiring binary classification - Their exceptional reproducibility is clinically valuable but must be paired with human oversight - They may serve useful roles in triage, education, and research with appropriate safeguards\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-44\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EImplications for development\u003C\/h3\u003E\u003Cp id=\u0022p-147\u0022\u003EGeneral-purpose pre-training insufficient for discriminative diagnostic tasks - Future models require supervised fine-tuning on labeled diagnostic data - Hybrid architectures combining LLM reasoning with specialized classifiers may be necessary - Calibration techniques essential for threshold-based clinical decisions\u003C\/p\u003E\u003Cp id=\u0022p-148\u0022\u003EThis work contributes to nuanced understanding of LLM capabilities and limitations in healthcare, informing responsible development and deployment of AI-assisted clinical decision support systems. 
While the promise of LLMs in medicine remains substantial, realizing that promise will require addressing fundamental challenges in discriminative reasoning and decision threshold calibration.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-45\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EAUTHOR CONTRIBUTIONS\u003C\/h2\u003E\u003Cp id=\u0022p-150\u0022\u003E\u003Cstrong\u003EConceptualization:\u003C\/strong\u003E Syaiful Bachri Mustamin, Dwi Anggriani\u003C\/p\u003E\u003Cp id=\u0022p-151\u0022\u003E\u003Cstrong\u003EMethodology:\u003C\/strong\u003E Dwi Anggriani, Muhammad Atnang, Syaiful Bachri Mustamin\u003C\/p\u003E\u003Cp id=\u0022p-152\u0022\u003E\u003Cstrong\u003ESoftware:\u003C\/strong\u003E Dwi Anggriani, Muhammad Atnang\u003C\/p\u003E\u003Cp id=\u0022p-153\u0022\u003E\u003Cstrong\u003EFormal Analysis:\u003C\/strong\u003E Dwi Anggriani, Kartini Aprilia Pratiwi Nuzry\u003C\/p\u003E\u003Cp id=\u0022p-154\u0022\u003E\u003Cstrong\u003EInvestigation:\u003C\/strong\u003E Dwi Anggriani, Syaiful Bachri Mustamin, Muhammad Atnang\u003C\/p\u003E\u003Cp id=\u0022p-155\u0022\u003E\u003Cstrong\u003EData Curation:\u003C\/strong\u003E Dwi Anggriani, Muhammad Atnang\u003C\/p\u003E\u003Cp id=\u0022p-156\u0022\u003E\u003Cstrong\u003EWriting \u2013 Original Draft:\u003C\/strong\u003E Dwi Anggriani\u003C\/p\u003E\u003Cp id=\u0022p-157\u0022\u003E\u003Cstrong\u003EWriting \u2013 Review \u0026amp; Editing:\u003C\/strong\u003E All authors\u003C\/p\u003E\u003Cp id=\u0022p-158\u0022\u003E\u003Cstrong\u003EVisualization:\u003C\/strong\u003E Dwi Anggriani, Kartini Aprilia Pratiwi Nuzry\u003C\/p\u003E\u003Cp id=\u0022p-159\u0022\u003E\u003Cstrong\u003ESupervision:\u003C\/strong\u003E Syaiful Bachri Mustamin\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-46\u0022\u003E\u003Ch2 class=\u0022\u0022\u003ECOMPETING INTERESTS\u003C\/h2\u003E\u003Cp id=\u0022p-160\u0022\u003EThe authors declare no competing 
interests.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-47\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EFUNDING\u003C\/h2\u003E\u003Cp id=\u0022p-161\u0022\u003E[Specify funding sources or state \u201cThis research received no specific grant from any funding agency in the public, commercial, or not-for-profit sectors.\u201d]\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section data-availability\u0022 id=\u0022sec-48\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EDATA AVAILABILITY\u003C\/h2\u003E\u003Cp id=\u0022p-162\u0022\u003EThe UCI Heart Disease dataset is publicly available at \u003Ca href=\u0022https:\/\/archive.ics.uci.edu\/ml\/datasets\/heart+disease\u0022\u003Ehttps:\/\/archive.ics.uci.edu\/ml\/datasets\/heart+disease\u003C\/a\u003E. Code and analysis scripts are available at [GitHub repository - to be created]. Model predictions and analysis results are available upon reasonable request to the corresponding author.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-49\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EETHICS\u003C\/h2\u003E\u003Cp id=\u0022p-163\u0022\u003EThis study used publicly available de-identified data and did not involve human subjects research. Institutional review board approval was not required.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-50\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EPREPRINT STATEMENT\u003C\/h2\u003E\u003Cp id=\u0022p-164\u0022\u003EThis article is a preprint and has not been peer-reviewed. 
It reports new medical research that has not yet been evaluated and should not be used to guide clinical practice.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-51\u0022\u003E\u003Ch2 class=\u0022\u0022\u003ETABLES\u003C\/h2\u003E\u003Cp id=\u0022p-165\u0022\u003E[All 5 tables from Results section included above: Intra-model consistency, Inter-model agreement, Diagnostic performance, Prompt robustness, Error patterns]\u003C\/p\u003E\u003Cp id=\u0022p-166\u0022\u003EReady for upload to medRxiv: YES\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-52\u0022 class=\u0022subsection display-objects\u0022\u003E\u003Cdiv id=\u0022F4\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F4.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022ROC Curves Three panels showing ROC curves for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus. All curves cluster near diagonal (AUC \u0026#x2248; 0.50), indicating no discriminative ability. Optimal threshold markers show values of 0.1-0.2 (extremely low), confirming models output high probabilities for nearly all cases.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;ROC Curves\u0026lt;\/span\u0026gt; Three panels showing ROC curves for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus. All curves cluster near diagonal (AUC \u0026#x2248; 0.50), indicating no discriminative ability. 
Optimal threshold markers show values of 0.1-0.2 (extremely low), confirming models output high probabilities for nearly all cases.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Supplementary Figure S1.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F4.medium.gif\u0022 width=\u0022440\u0022 height=\u0022147\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Supplementary Figure S1.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F4.medium.gif\u0022 width=\u0022440\u0022 height=\u0022147\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F4.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Supplementary Figure S1.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F4.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv 
class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003ESupplementary Figure S1.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EROC Curves\u003C\/span\u003E\u003Cp id=\u0022p-167\u0022 class=\u0022first-child\u0022\u003EThree panels showing ROC curves for GPT-4o, Gemini-2.0-Flash, and Qwen-Plus. All curves cluster near diagonal (AUC \u2248 0.50), indicating no discriminative ability. Optimal threshold markers show values of 0.1-0.2 (extremely low), confirming models output high probabilities for nearly all cases.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022F5\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F5.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Prediction Probability Distributions Three panels showing prediction probability distributions stratified by ground truth. Blue histograms (no-disease cases) and red histograms (disease cases) show minimal separation, with both clustering near probability=1.0. This explains poor specificity \u0026#x2013; models assign high disease probability regardless of ground truth.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;Prediction Probability Distributions\u0026lt;\/span\u0026gt; Three panels showing prediction probability distributions stratified by ground truth. 
Blue histograms (no-disease cases) and red histograms (disease cases) show minimal separation, with both clustering near probability=1.0. This explains poor specificity \u0026#x2013; models assign high disease probability regardless of ground truth.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Supplementary Figure S2.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F5.medium.gif\u0022 width=\u0022440\u0022 height=\u0022146\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Supplementary Figure S2.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F5.medium.gif\u0022 width=\u0022440\u0022 height=\u0022146\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F5.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Supplementary Figure S2.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F5.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in 
new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003ESupplementary Figure S2.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EPrediction Probability Distributions\u003C\/span\u003E\u003Cp id=\u0022p-168\u0022 class=\u0022first-child\u0022\u003EThree panels showing prediction probability distributions stratified by ground truth. Blue histograms (no-disease cases) and red histograms (disease cases) show minimal separation, with both clustering near probability=1.0. This explains poor specificity \u2013 models assign high disease probability regardless of ground truth.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022F6\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F6.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Feature Correlations with Predictions Two panels: (A) Bar chart showing correlations between clinical features and model predictions. Highest correlations: ca (vessels on fluoroscopy) r=0.42, oldpeak (ST depression) r=0.38, suggesting models weight these features heavily. 
(B) Scatter plots for top 4 predictive features showing relationships with average model predictions.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1548761257\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;\u0026lt;span xmlns=\u0026quot;http:\/\/www.w3.org\/1999\/xhtml\u0026quot; class=\u0026quot;caption-title\u0026quot;\u0026gt;Feature Correlations with Predictions\u0026lt;\/span\u0026gt; Two panels: (A) Bar chart showing correlations between clinical features and model predictions. Highest correlations: ca (vessels on fluoroscopy) r=0.42, oldpeak (ST depression) r=0.38, suggesting models weight these features heavily. (B) Scatter plots for top 4 predictive features showing relationships with average model predictions.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Supplementary Figure S3.\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F6.medium.gif\u0022 width=\u0022440\u0022 height=\u0022165\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Supplementary Figure S3.\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F6.medium.gif\u0022 width=\u0022440\u0022 height=\u0022165\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca 
href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F6.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Supplementary Figure S3.\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/12\/09\/2025.12.08.25341823\/F6.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003ESupplementary Figure S3.\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EFeature Correlations with Predictions\u003C\/span\u003E\u003Cp id=\u0022p-169\u0022 class=\u0022first-child\u0022\u003ETwo panels: (A) Bar chart showing correlations between clinical features and model predictions. Highest correlations: ca (vessels on fluoroscopy) r=0.42, oldpeak (ST depression) r=0.38, suggesting models weight these features heavily. 
(B) Scatter plots for top 4 predictive features showing relationships with average model predictions.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section ack\u0022 id=\u0022ack-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EACKNOWLEDGMENTS\u003C\/h2\u003E\u003Cp id=\u0022p-149\u0022\u003EWe acknowledge OpenAI, Google, and Alibaba for providing API access to GPT-4o, Gemini-2.0-Flash, and Qwen-Plus respectively, which made this research possible.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section ref-list\u0022 id=\u0022ref-list-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EREFERENCES\u003C\/h2\u003E\u003Col class=\u0022cit-list ref-use-labels\u0022\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E1.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-1-1\u0022 title=\u0022View reference 1. in text\u0022 id=\u0022ref-1\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.1\u0022 data-doi=\u002210.1038\/s41591-023-02448-8\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EThirunavukarasu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EAJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ETing\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EDSJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EElangovan\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan 
class=\u0022cit-article-title\u0022\u003ELarge language models in medicine\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ENat Med\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E29\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E8\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E1930\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E1940\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1038\/s41591-023-02448-8\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DNat%2BMed%26rft_id%253Dinfo%253Adoi%252F10.1038%252Fs41591-023-02448-8%26rft_id%253Dinfo%253Apmid%252F37460753%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1038\/s41591-023-02448-8\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=37460753\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F12%2F09%2F2025.12.08.25341823.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan 
class=\u0022ref-label\u0022\u003E2.\u003C\/span\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal no-rev-xref\u0022 id=\u0022cit-2025.12.08.25341823v1.2\u0022 data-doi=\u002210.1056\/NEJMsr2214184\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELee\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EP\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EBubeck\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EPetro\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ.\u003C\/span\u003E\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EBenefits, limits, and risks of GPT-4 as an AI chatbot for medicine\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EN Engl J Med\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E388\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E13\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E1233\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E1239\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1056\/NEJMsr2214184\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DN%2BEngl%2BJ%2BMed%26rft.volume%253D388%26rft.spage%253D1233%26rft_id%253Dinfo%253Adoi%252F10.1056%252FNEJMsr2214184%26rft_id%253Dinfo%253Apmid%252F36988602%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1056\/NEJMsr2214184\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=36988602\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F12%2F09%2F2025.12.08.25341823.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E3.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-3-1\u0022 
title=\u0022View reference 3. in text\u0022 id=\u0022ref-3\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.3\u0022 data-doi=\u002210.1038\/s41586-023-06291-2\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESinghal\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAzizi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ETu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ET\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003ELarge language models encode clinical knowledge\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ENature\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E620\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E7972\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E172\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E180\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1038\/s41586-023-06291-2\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DNature%26rft_id%253Dinfo%253Adoi%252F10.1038%252Fs41586-023-06291-2%26rft_id%253Dinfo%253Apmid%252F37438534%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1038\/s41586-023-06291-2\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=37438534\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F12%2F09%2F2025.12.08.25341823.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E4.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-4-1\u0022 title=\u0022View reference 4. 
in text\u0022 id=\u0022ref-4\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.4\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EBrown\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ETB\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EMann\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EB\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ERyder\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EN\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003ELanguage models are few-shot learners\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EAdvances in Neural Information Processing Systems\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2020\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E33\u003C\/span\u003E:\u003Cspan class=\u0022cit-fpage\u0022\u003E1877\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E1901\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DAdvances%2Bin%2BNeural%2BInformation%2BProcessing%2BSystems%26rft.volume%253D33%26rft.spage%253D1877%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E5.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-5-1\u0022 title=\u0022View reference 5. 
in text\u0022 id=\u0022ref-5\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.5\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ENori\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKing\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EN\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EMcKinney\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ESM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003ECapabilities of GPT-4 on medical challenge problems\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EarXiv preprint\u003C\/abbr\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-arxiv-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003EarXiv:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-arxiv\u0022\u003E2303.13375\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-arxiv-sep\u0022\u003E.\u003C\/span\u003E \u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E6.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-6-1\u0022 title=\u0022View reference 6. 
in text\u0022 id=\u0022ref-6\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.6\u0022 data-doi=\u002210.1371\/journal.pdig.0000198\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKung\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ETH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ECheatham\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EMedenilla\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EPerformance of ChatGPT on USMLE: Potential for AI-assisted medical education using large language models\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EPLOS Digit Health\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E2\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E2\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003Ee0000198\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1371\/journal.pdig.0000198\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DPLOS%2BDigit%2BHealth%26rft.volume%253D2%26rft.spage%253De0000198%26rft_id%253Dinfo%253Adoi%252F10.1371%252Fjournal.pdig.0000198%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1371\/journal.pdig.0000198\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E7.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-7-1\u0022 title=\u0022View reference 7. 
in text\u0022 id=\u0022ref-7\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.7\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EWhite\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EFu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EQ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHays\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EA prompt pattern catalog to enhance prompt engineering with ChatGPT\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EarXiv preprint\u003C\/abbr\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-arxiv-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003EarXiv:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-arxiv\u0022\u003E2302.11382\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-arxiv-sep\u0022\u003E.\u003C\/span\u003E \u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E8.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-8-1\u0022 title=\u0022View reference 8. 
in text\u0022 id=\u0022ref-8\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-web\u0022 id=\u0022cit-2025.12.08.25341823v1.8\u0022 data-doi=\u002210.24432\/C52P4X\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJanosi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESteinbrunn\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EW\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EPfisterer\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDetrano\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ER.\u003C\/span\u003E\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EHeart Disease [Dataset]\u003C\/span\u003E. \u003Cspan class=\u0022cit-source\u0022\u003EUCI Machine Learning Repository\u003C\/span\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E1988\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.24432\/C52P4X\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E.\u003C\/span\u003E Available from: \u003Ca href=\u0022https:\/\/archive.ics.uci.edu\/dataset\/45\/heart+disease\u0022\u003Ehttps:\/\/archive.ics.uci.edu\/dataset\/45\/heart+disease\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DUCI%2BMachine%2BLearning%2BRepository%26rft_id%253Dinfo%253Adoi%252F10.24432%252FC52P4X%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.24432\/C52P4X\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E9.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-9-1\u0022 title=\u0022View reference 9. 
in text\u0022 id=\u0022ref-9\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.9\u0022 data-doi=\u002210.1016\/j.imu.2021.100655\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EShorewala\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EV.\u003C\/span\u003E\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EEarly detection of coronary heart disease using ensemble techniques\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EInformatics in Medicine Unlocked\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2021\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E26\u003C\/span\u003E:\u003Cspan class=\u0022cit-fpage\u0022\u003E100655\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1016\/j.imu.2021.100655\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DInformatics%2Bin%2BMedicine%2BUnlocked%26rft.volume%253D26%26rft.spage%253D100655%26rft_id%253Dinfo%253Adoi%252F10.1016%252Fj.imu.2021.100655%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1016\/j.imu.2021.100655\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi 
cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E10.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-10-1\u0022 title=\u0022View reference 10. in text\u0022 id=\u0022ref-10\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.12.08.25341823v1.10\u0022 data-doi=\u002210.1016\/0002-9149(89)90524-9\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDetrano\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ER\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJanosi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESteinbrunn\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EW\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EInternational application of a new probability algorithm for the diagnosis of coronary artery disease\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EAm J Cardiol\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E1989\u003C\/span\u003E;\u003Cspan class=\u0022cit-vol\u0022\u003E64\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E5\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E304\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E310\u003C\/span\u003E.\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id-scheme\u0022\u003Edoi:\u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E10.1016\/0002-9149(89)90524-9\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DThe%2BAmerican%2Bjournal%2Bof%2Bcardiology%26rft.stitle%253DAm%2BJ%2BCardiol%26rft.aulast%253DDetrano%26rft.auinit1%253DR.%26rft.volume%253D64%26rft.issue%253D5%26rft.spage%253D304%26rft.epage%253D310%26rft.atitle%253DInternational%2Bapplication%2Bof%2Ba%2Bnew%2Bprobability%2Balgorithm%2Bfor%2Bthe%2Bdiagnosis%2Bof%2Bcoronary%2Bartery%2Bdisease.%26rft_id%253Dinfo%253Adoi%252F10.1016%252F0002-9149%252889%252990524-9%26rft_id%253Dinfo%253Apmid%252F2756873%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1016\/0002-9149(89)90524-9\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=2756873\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F12%2F09%2F2025.12.08.25341823.atom\u0022 
class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=A1989AG92200010\u0026amp;link_type=ISI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-newisilink cit-ref-sprinkles-webofscience\u0022\u003E\u003Cspan\u003EWeb of Science\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cspan class=\u0022highwire-journal-article-marker-end\u0022\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cspan class=\u0022related-urls\u0022\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E  \u003C\/div\u003E\n\n  \n  \u003C\/div\u003E\n\u003C\/div\u003E\n  \u003C\/div\u003E\n\u003C\/div\u003E\n\u003C\/div\u003E\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_zP7WWIfzbyzvaM63L39cNV2juU_1XVH7wduFK9gcMNI.js\u0022\u003E\u003C\/script\u003E\n\u003C\/body\u003E\u003C\/html\u003E"}