{"markup":"\u003C?xml version=\u00221.0\u0022 encoding=\u0022UTF-8\u0022 ?\u003E\n    \u003Chtml version=\u0022HTML+RDFa+MathML 1.1\u0022\n    xmlns:content=\u0022http:\/\/purl.org\/rss\/1.0\/modules\/content\/\u0022\n    xmlns:dc=\u0022http:\/\/purl.org\/dc\/terms\/\u0022\n    xmlns:foaf=\u0022http:\/\/xmlns.com\/foaf\/0.1\/\u0022\n    xmlns:og=\u0022http:\/\/ogp.me\/ns#\u0022\n    xmlns:rdfs=\u0022http:\/\/www.w3.org\/2000\/01\/rdf-schema#\u0022\n    xmlns:sioc=\u0022http:\/\/rdfs.org\/sioc\/ns#\u0022\n    xmlns:sioct=\u0022http:\/\/rdfs.org\/sioc\/types#\u0022\n    xmlns:skos=\u0022http:\/\/www.w3.org\/2004\/02\/skos\/core#\u0022\n    xmlns:xsd=\u0022http:\/\/www.w3.org\/2001\/XMLSchema#\u0022\n    xmlns:mml=\u0022http:\/\/www.w3.org\/1998\/Math\/MathML\u0022\u003E\n  \u003Chead\u003E\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022\/\/cdn.jsdelivr.net\/qtip2\/2.2.1\/jquery.qtip.min.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_YjAJQgxDlFX6S-O02jj9jCrVbrwlY3CGgCg1FzPlvBs.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\nif(typeof window.MathJax === \u0022undefined\u0022) window.MathJax = { menuSettings: { zoom: \u0022Click\u0022 } };\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_waP91NpgGpectm_6Y2XDEauLJ8WCSCBKmmA87unpp2E.js\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.googletagmanager.com\/gtag\/js?id=G-0K57TCX5BY\u0022\u003E\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\nwindow.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag(\u0022js\u0022, new 
Date());gtag(\u0022set\u0022, \u0022developer_id.dMDhkMT\u0022, true);gtag(\u0022config\u0022, \u0022G-0K57TCX5BY\u0022, {\u0022groups\u0022:\u0022default\u0022,\u0022anonymize_ip\u0022:true});\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Cscript type=\u0022text\/javascript\u0022\u003E\n\u003C!--\/\/--\u003E\u003C![CDATA[\/\/\u003E\u003C!--\njQuery.extend(Drupal.settings, {\u0022basePath\u0022:\u0022\\\/\u0022,\u0022pathPrefix\u0022:\u0022\u0022,\u0022highwire\u0022:{\u0022ac\u0022:{\u0022medrxiv;2025.10.14.25338040v1\u0022:{\u0022access\u0022:{\u0022full\u0022:true},\u0022pisa_id\u0022:\u0022medrxiv;2025.10.14.25338040v1\u0022,\u0022apath\u0022:\u0022\u0022,\u0022jcode\u0022:\u0022medrxiv\u0022}},\u0022processed\u0022:[\u0022highwire_math\u0022],\u0022markup\u0022:[{\u0022requested\u0022:\u0022full-text\u0022,\u0022variant\u0022:\u0022full-text\u0022,\u0022view\u0022:\u0022full\u0022,\u0022pisa\u0022:\u0022medrxiv;2025.10.14.25338040v1\u0022}]},\u0022instances\u0022:\u0022{\\u0022highwire_abstract_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:20,\\u0022height\\u0022:20,\\u0022border\\u0022:1,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-abstract-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-abstract-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022right center\\u0022,\\u0022my\\u0022:\\u0022left center\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022shift\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter click \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave 
\\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}},\\u0022highwire_author_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:15,\\u0022height\\u0022:15,\\u0022border\\u0022:1,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-author-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-author-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022top center\\u0022,\\u0022my\\u0022:\\u0022bottom center\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave \\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}},\\u0022highwire_reflinks_tooltip\\u0022:{\\u0022content\\u0022:{\\u0022text\\u0022:\\u0022\\u0022},\\u0022style\\u0022:{\\u0022tip\\u0022:{\\u0022width\\u0022:15,\\u0022height\\u0022:15,\\u0022border\\u0022:1,\\u0022mimic\\u0022:\\u0022top center\\u0022,\\u0022offset\\u0022:0,\\u0022corner\\u0022:true},\\u0022classes\\u0022:\\u0022qtip-custom hw-tooltip hw-ref-link-tooltip qtip-shadow qtip-rounded\\u0022,\\u0022classes_custom\\u0022:\\u0022hw-tooltip hw-ref-link-tooltip\\u0022},\\u0022position\\u0022:{\\u0022at\\u0022:\\u0022bottom left\\u0022,\\u0022my\\u0022:\\u0022top left\\u0022,\\u0022viewport\\u0022:true,\\u0022adjust\\u0022:{\\u0022method\\u0022:\\u0022flip\\u0022}},\\u0022show\\u0022:{\\u0022event\\u0022:\\u0022mouseenter \\u0022,\\u0022solo\\u0022:true},\\u0022hide\\u0022:{\\u0022event\\u0022:\\u0022mouseleave 
\\u0022,\\u0022fixed\\u0022:1,\\u0022delay\\u0022:\\u0022100\\u0022}}}\u0022,\u0022qtipDebug\u0022:\u0022{\\u0022leaveElement\\u0022:0}\u0022,\u0022googleanalytics\u0022:{\u0022account\u0022:[\u0022G-0K57TCX5BY\u0022],\u0022trackOutbound\u0022:1,\u0022trackMailto\u0022:1,\u0022trackDownload\u0022:1,\u0022trackDownloadExtensions\u0022:\u00227z|aac|arc|arj|asf|asx|avi|bin|csv|doc(x|m)?|dot(x|m)?|exe|flv|gif|gz|gzip|hqx|jar|jpe?g|js|mp(2|3|4|e?g)|mov(ie)?|msi|msp|pdf|phps|png|ppt(x|m)?|pot(x|m)?|pps(x|m)?|ppam|sld(x|m)?|thmx|qtm?|ra(m|r)?|sea|sit|tar|tgz|torrent|txt|wav|wma|wmv|wpd|xls(x|m|b)?|xlt(x|m)|xlam|xml|z|zip\u0022,\u0022trackColorbox\u0022:1},\u0022ajaxPageState\u0022:{\u0022js\u0022:{\u0022\\\/\\\/cdn.jsdelivr.net\\\/qtip2\\\/2.2.1\\\/jquery.qtip.min.js\u0022:1,\u0022sites\\\/all\\\/modules\\\/highwire\\\/highwire\\\/plugins\\\/highwire_markup_process\\\/js\\\/highwire_article_reference_popup.js\u0022:1,\u0022sites\\\/all\\\/modules\\\/highwire\\\/highwire\\\/plugins\\\/highwire_markup_process\\\/js\\\/highwire_at_symbol.js\u0022:1,\u00220\u0022:1,\u0022sites\\\/all\\\/modules\\\/contrib\\\/google_analytics\\\/googleanalytics.js\u0022:1,\u0022https:\\\/\\\/www.googletagmanager.com\\\/gtag\\\/js?id=G-0K57TCX5BY\u0022:1,\u00221\u0022:1}}});\n\/\/--\u003E\u003C!]]\u003E\n\u003C\/script\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 href=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/advagg_css\/css__dn-cpI1YtkU_iLHgA5WhlkxgYWyat_IxjF_B-WSYrpE__a9hIbt0eaZ7d5nhwnm2weG8R_2eXK4EvoOx9dOxouHE__QrrGUc7CpljPR5Aph-ukPbcwtK4AWrHGwCEXJ_k1V_c.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 href=\u0022\/\/cdn.jsdelivr.net\/qtip2\/2.2.1\/jquery.qtip.min.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink type=\u0022text\/css\u0022 rel=\u0022stylesheet\u0022 
href=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/advagg_css\/css__HGACIFBlu2o05y3afvqlt5wrE_5Dn6MXsexfuEpeIwg__t4SOPxucAPoV3Os7g8dXqyMB1HRXQridRJ82X7nE33E__QrrGUc7CpljPR5Aph-ukPbcwtK4AWrHGwCEXJ_k1V_c.css\u0022 media=\u0022all\u0022 \/\u003E\n\u003Clink rel=\u0027stylesheet\u0027 type=\u0027text\/css\u0027 href=\u0027\/sites\/all\/modules\/contrib\/panels\/plugins\/layouts\/onecol\/onecol.css\u0027 \/\u003E\u003C\/head\u003E\u003Cbody\u003E\u003Cdiv class=\u0022panels-ajax-tab-panel panels-ajax-tab-panel-article-tab-full-text\u0022\u003E\u003Cdiv class=\u0022panel-display panel-1col clearfix\u0022 \u003E\n  \u003Cdiv class=\u0022panel-panel panel-col\u0022\u003E\n    \u003Cdiv\u003E\u003Cdiv class=\u0022panel-pane pane-highwire-markup\u0022 \u003E\n  \n      \n  \n  \u003Cdiv class=\u0022pane-content\u0022\u003E\n    \u003Cdiv class=\u0022highwire-markup\u0022\u003E\u003Cdiv xmlns=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022 data-highwire-cite-ref-tooltip-instance=\u0022highwire_reflinks_tooltip\u0022 class=\u0022content-block-markup\u0022 xmlns:xhtml=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022\u003E\u003Cdiv class=\u0022article fulltext-view \u0022\u003E\u003Cspan class=\u0022highwire-journal-article-marker-start\u0022\u003E\u003C\/span\u003E\u003Cdiv class=\u0022section abstract\u0022 id=\u0022abstract-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EAbstract\u003C\/h2\u003E\u003Cdiv id=\u0022sec-1\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-2\u0022\u003E\u003Cstrong\u003EBackground\u003C\/strong\u003E Large language models (LLMs) have demonstrated rapid advancements in natural language understanding and generation, prompting their integration into biomedical research, clinical practice, and professional education. 
However, systematic evaluation of LLMs in specialty-specific domains such as dentistry and periodontology remains limited, particularly regarding multidimensional performance metrics.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-2\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-3\u0022\u003E\u003Cstrong\u003EObjective\u003C\/strong\u003E To conduct a comprehensive, multidimensional assessment of commercially available LLMs: GPT-4.0, GPT-5.0, and Claude SONNET 4.0 on the American Academy of Periodontology in-service examination, focusing on response accuracy, self-assessed confidence calibration, citation validity, and hallucination prevalence.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-3\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-4\u0022\u003E\u003Cstrong\u003EMethods\u003C\/strong\u003E Models were evaluated on the 2024 AAP In-Service Examination (331 questions) using two formats: Full Test (all questions at once) and Individual Question (one at a time). Prompts were standardized; models selected answers, and for GPT-5.0 and Claude SONNET 4.0, also provided confidence ratings and citations. Citation validity was assessed using a human-in-the-loop protocol with expert review. Statistical analyses included chi-square, McNemar\u2019s, and logistic regression to assess accuracy, question fatigue, confidence calibration, and citation reliability.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-4\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-5\u0022\u003E\u003Cstrong\u003EResults\u003C\/strong\u003E LLMs achieved high overall accuracy (78\u201387%), with the Individual Question format consistently yielding higher scores than Full Test, though differences were not statistically significant.\u003C\/p\u003E\u003Cp id=\u0022p-6\u0022\u003EAccuracy was highest in fact-dense domains (biochemistry, physiology, microbiology) and lowest in integrative domains (diagnosis, therapy). 
Significant question fatigue was observed in GPT-5.0 Full Test mode (OR = 0.997, p = 0.035), but not in Individual Question mode.\u003C\/p\u003E\u003Cp id=\u0022p-7\u0022\u003EConfidence scores predicted accuracy, with the strongest calibration in Individual Question mode. Citation analysis revealed frequent hallucinations, mostly critically erroneous, and citation validity was independent of answer accuracy.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-5\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-8\u0022\u003E\u003Cstrong\u003EConclusions\u003C\/strong\u003E LLMs can answer a broad spectrum of periodontal specialty questions, but their reliability varies with context and information presentation. While promising as adjunctive tools, their outputs\u2014especially for complex reasoning and citations\u2014require rigorous human review in educational and research settings to ensure accuracy and safety.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-6\u0022 class=\u0022subsection\u0022\u003E\u003Cp id=\u0022p-9\u0022\u003E\u003Cstrong\u003EAuthor Summary\u003C\/strong\u003E Artificial intelligence chatbots are rapidly entering medical education, yet we lack a comprehensive understanding of their reliability when students depend on them for learning. We developed a multidimensional evaluation framework to systematically assess AI performance beyond simple accuracy, examining how these systems behave across different medical topics, question types, and presentation formats.\u003C\/p\u003E\u003Cp id=\u0022p-10\u0022\u003EUsing 331 real dental examination questions, we tested three major AI systems, analyzing not only correctness but also confidence calibration\u2014whether AI confidence levels match actual accuracy\u2014and implementing human-in-the-loop verification to check if cited sources actually exist.\u003C\/p\u003E\u003Cp id=\u0022p-11\u0022\u003EOur findings highlight critical vulnerabilities in current AI systems. 
Most alarmingly, these chatbots fabricated nearly half of their citations while maintaining unwavering confidence in both correct and incorrect responses. This combination of overconfidence and misinformation means students cannot distinguish reliable from unreliable AI responses. Additionally, we documented progressive performance decline during sequential questioning, similar to human cognitive fatigue.\u003C\/p\u003E\u003Cp id=\u0022p-12\u0022\u003EWhile we know AI systems generate rather than retrieve information, our research demonstrates the real-world consequences of this limitation. As artificial intelligence integrates into education, healthcare diagnostics, and insurance decisions, these findings underscore the urgent need for better evaluation frameworks and user education about AI limitations.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-7\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EIntroduction\u003C\/h2\u003E\u003Cp id=\u0022p-25\u0022\u003EArtificial intelligence (AI) has rapidly evolved into a transformative technology across scientific and clinical disciplines. Among its most significant developments are large language models (LLMs), which are trained on vast corpora of text to generate coherent, human-like language. Their ability to recognize linguistic patterns and predict plausible continuations has enabled widespread application in knowledge retrieval, summarization, translation, and dialogue systems. Their integration into healthcare has been particularly notable, with applications ranging from clinical documentation and literature synthesis to diagnostic support and educational innovation.\u003C\/p\u003E\u003Cp id=\u0022p-26\u0022\u003EDentistry has begun to embrace AI in parallel with medicine, with early work demonstrating utility in diagnostic imaging, caries detection, treatment planning, and predictive analytics. 
(\u003Ca id=\u0022xref-ref-1-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-1\u0022\u003E1\u003C\/a\u003E) More recently, LLMs have been explored as tools for dental education and clinical training, where they may supplement student learning, provide rapid access to foundational knowledge, and support preparation for standardized examinations. (\u003Ca id=\u0022xref-ref-2-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-2\u0022\u003E2\u003C\/a\u003E,\u003Ca id=\u0022xref-ref-3-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-3\u0022\u003E3\u003C\/a\u003E)\u003C\/p\u003E\u003Cp id=\u0022p-27\u0022\u003EIt is important to recognize, however, that LLMs are not designed to discover or verify truth. Instead, they function as statistical models that predict the most probable sequence of words given prior input. This capability allows them to generate fluent and contextually appropriate language, but it also constrains their factual reliability. Unlike structured databases, LLMs do not store or retrieve information in a queryable format. Although their outputs can be supplemented by external databases or curated sources, current methods provide limited transparency or control over how such sources are integrated or weighted. (\u003Ca id=\u0022xref-ref-4-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-4\u0022\u003E4\u003C\/a\u003E)\u003C\/p\u003E\u003Cp id=\u0022p-28\u0022\u003EThese limitations are especially concerning in domains where factual precision is critical, such as medicine and research. 
Consequently, the systematic evaluation of LLMs has become a central priority, with growing efforts devoted to developing robust metrics and standardized approaches for assessing their performance across multiple dimensions.\u003C\/p\u003E\u003Cp id=\u0022p-29\u0022\u003ETo date, much of the research evaluating LLMs\u2019 performance and accuracy in medicine has employed standardized, high-stakes assessments such as the United States Medical Licensing Examination (USMLE), with systematic reviews confirming variable but generally improving performance across model generations. (\u003Ca id=\u0022xref-ref-5-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-5\u0022\u003E5\u003C\/a\u003E\u2013\u003Ca id=\u0022xref-ref-7-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-7\u0022\u003E7\u003C\/a\u003E) In dentistry, similar efforts have examined LLM performance on the National Board Dental Examination (NBDE\/INBDE) and related professional assessments (\u003Ca id=\u0022xref-ref-8-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-8\u0022\u003E8\u003C\/a\u003E\u2013\u003Ca id=\u0022xref-ref-10-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-10\u0022\u003E10\u003C\/a\u003E), while a recent study in the \u003Cem\u003EJournal of Periodontology\u003C\/em\u003E reported on the use of the AAP Periodontics In-Service Examination to benchmark ChatGPT against specialty-level questions (\u003Ca id=\u0022xref-ref-11-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-11\u0022\u003E11\u003C\/a\u003E). These studies underscore the value of standardized examinations as rigorous testbeds for evaluating LLM performance. While such studies primarily assess accuracy and reasoning, they have largely overlooked the ability of LLMs to provide verifiable citations. The widespread use of these models has revealed a consistent problem of fabricated or \u201challucinated\u201d references, where outputs may include plausible-sounding but nonexistent sources or misattributed citations. 
(\u003Ca id=\u0022xref-ref-12-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-12\u0022\u003E12\u003C\/a\u003E) This issue is particularly problematic in educational settings, where students may lack the experience to critically evaluate references and risk accepting fabricated citations as valid. Such limitations not only undermine trust in AI-assisted learning but also pose risks to the integrity of scholarly work.\u003C\/p\u003E\u003Cp id=\u0022p-30\u0022\u003EGiven these gaps, the present study was designed to provide a multidimensional evaluation of LLM performance in periodontology using the AAP In-Service Examination. The objectives were to compare model accuracy across successive generations, assess the influence of prompt optimization and test presentation format on performance, quantify the rate of citation hallucination, and evaluate the validity of self-reported confidence in responses. By employing a specialty-specific professional examination as the test environment, this study aimed to generate a comprehensive assessment of LLM capabilities and limitations in a context where factual reliability, citation integrity, and calibrated confidence are critical.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-8\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EMethods\u003C\/h2\u003E\u003Cdiv id=\u0022sec-9\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EStudy Design\u003C\/h3\u003E\u003Cp id=\u0022p-31\u0022\u003EThis study employed a comparative performance assessment framework to evaluate large language models (LLMs) in the domain of periodontology. A controlled experimental design (\u003Ca id=\u0022xref-fig-1-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F1\u0022\u003EFigure 1\u003C\/a\u003E) was used to ensure reproducibility and transparency. Evaluation focused on two dimensions: response accuracy and the reliability of model-generated citations, the latter validated through a human-in-the-loop (HITL) protocol. 
Three transformer-based models were selected based on published performance benchmarks and multimodal capabilities: ChatGPT-4.0 and ChatGPT-5.0 (OpenAI, San Francisco, CA, USA), and Claude SONNET 4.0 (Anthropic, San Francisco, CA, USA). The citation validity was tested only for Chat GPT 5.0 (OpenAI, San Francisco, CA, USA), and Claude SONNET 4.0 (Anthropic, San Francisco, CA, USA).\u003C\/p\u003E\u003Cdiv id=\u0022F1\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F1.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Study flow for evaluating large language model performance on a specialty periodontology examination\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Study flow for evaluating large language model performance on a specialty periodontology examination\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 1:\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F1.medium.gif\u0022 width=\u0022440\u0022 height=\u0022159\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 1:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F1.medium.gif\u0022 
width=\u0022440\u0022 height=\u0022159\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F1.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 1:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F1.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022 xmlns:xhtml=\u0022http:\/\/www.w3.org\/1999\/xhtml\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 1:\u003C\/span\u003E \u003Cp id=\u0022p-32\u0022 class=\u0022first-child\u0022\u003EStudy flow for evaluating large language model performance on a specialty periodontology examination\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-10\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EEvaluation Dataset\u003C\/h3\u003E\u003Cp id=\u0022p-33\u0022\u003EThe 2024 American Academy of Periodontology (AAP) In-Service Examination, comprising 331 multiple-choice questions, was used as the evaluation instrument. 
For structured analysis, the investigators categorized the items on the exam into four domains: general factual knowledge, author-specific\/study-specific factual knowledge, analytical reasoning, and image-based questions.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-11\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EPrompt Design\u003C\/h3\u003E\u003Cp id=\u0022p-34\u0022\u003ETwo exam delivery formats were employed. In the Full Test (FT) format, the entire question set was presented simultaneously, simulating unrestricted access. In the Individual Question (IQ) format, questions were delivered one at a time, mirroring the experience of a traditional test. Both ChatGPT models were evaluated under FT and IQ conditions, whereas Claude SONNET 4.0 was assessed only in IQ format.\u003C\/p\u003E\u003Cp id=\u0022p-35\u0022\u003EPrompt formulation was standardized across all experimental conditions to minimize variability and ensure that observed differences in performance reflected intrinsic model capabilities rather than disparities in instruction. All models were required to select the most appropriate answer choice for each multiple-choice question, and prompts were written to be unambiguous, concise, and task-specific, with contextual information included only when necessary to support question comprehension. 
A single core instruction was employed for baseline evaluation: \u003Cspan class=\u0022underline\u0022\u003E\u201cSelect the single most accurate answer choice for the following multiple-choice question using all relevant knowledge and sources available to you.\u201d\u003C\/span\u003E For ChatGPT-5.0 and Claude SONNET 4.0, additional instructions were included to (1) indicate their confidence in the selected answer as a percentage (0\u2013100%), and (2) provide verifiable citations including Digital Object Identifiers (DOIs), PubMed identifiers (PMIDs), or persistent URLs to support their response choice. These additional tasks were applied exclusively to these models to enable assessment of confidence calibration and citation validity alongside overall accuracy.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-12\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EHuman-in-the-Loop Citation Validation\u003C\/h3\u003E\u003Cp id=\u0022p-36\u0022\u003EA human-in-the-loop protocol was implemented specifically for citation validation.\u003C\/p\u003E\u003Cp id=\u0022p-37\u0022\u003EReferences generated by ChatGPT-5.0 and Claude SONNET 4.0 were reviewed by a single evaluator trained in biomedical literature retrieval. Each citation was categorized using a three-tier taxonomy: completely valid \u2013 existing and accessible citations accurately supporting the information provided; partially correct \u2013 real citations with minor inaccuracies or partial claim support; and critically wrong \u2013 fabricated, inaccessible, or unrelated citations. 
This process enabled systematic detection of hallucinated or misattributed references.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-13\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EStatistical Analysis\u003C\/h2\u003E\u003Cp id=\u0022p-38\u0022\u003EStatistical analyses were conducted using IBM SPSS Statistics (version 31, IBM Corp., Armonk, NY, USA) and Python (version 3.12.10) to evaluate model performance, assess citation validity, and explore secondary relationships. Descriptive statistics and chi-square tests of independence were used to summarize and compare response accuracy across models.\u003C\/p\u003E\u003Cp id=\u0022p-39\u0022\u003EWithin each model version, the effect of question presentation format was evaluated using chi-square tests, and McNemar\u2019s test was applied to discordant question-level pairs. For models that generated confidence scores (GPT-5.0 and Claude SONNET 4.0), logistic regression was used to examine the relationship between confidence and accuracy. Citation validity was analyzed by categorizing references into predefined validity groups and comparing their distributions between models using chi-square tests. 
The potential effect of token length on accuracy was investigated using question position as a proxy, with logistic regression modeling accuracy as the dependent variable and question index as a continuous predictor, and the Cochran\u2013Armitage trend test applied to assess linear trends across ordered question bins.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-14\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EResults\u003C\/h2\u003E\u003Cdiv id=\u0022sec-15\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EOverall Model Performance\u003C\/h3\u003E\u003Cp id=\u0022p-40\u0022\u003EA total of 331 multiple-choice questions from the 2024 AAP In-Service Examination were evaluated across four experimental conditions: GPT-4.0 Full Test (FT), GPT-4.0 Individual Question (IQ), GPT-5.0 FT, and GPT-5.0 IQ. Overall model accuracy varied significantly by version and delivery format. GPT-5.0 achieved the highest performance in IQ mode, correctly answering 279 of 331 questions (84.29%), compared with 274 of 331 (82.78%) for GPT-4.0 IQ. However, contrary to the individual question results, GPT-5.0 in FT mode achieved 80.85% accuracy (266 of 329 questions), substantially higher than GPT-4.0 FT at 78.12% (257 of 329 questions).\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-16\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ESection and domain specific performance patterns\u003C\/h3\u003E\u003Cp id=\u0022p-41\u0022\u003EA network visualization of model performance across the 2024 American Academy of Periodontology In-Service Examination demonstrated consistent topic-dependent trends (\u003Ca id=\u0022xref-fig-2-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F2\u0022\u003EFigure 2\u003C\/a\u003E). 
The visualization illustrates the distribution of model accuracy across nine content topics, with nodes representing large language model configurations (blue) and examination topics (yellow), and edge color and thickness indicating performance levels (green: \u22650.90, red: 0.80\u2013 0.89, gray: \u0026lt;0.80). Descriptive analysis revealed that certain subject areas exhibited systematically higher accuracy scores across all tested models, particularly Biochemistry and Physiology (mean accuracy = 0.968) and Microbiology and Immunology (mean accuracy = 0.960), which can also be visualized as green connections (\u22650.9 accuracy) with multiple LLMs in the network visualization. Conversely, several domains proved universally challenging, with Diagnosis (mean accuracy = 0.710) and Periodontal Therapy (mean accuracy = 0.748) showing predominantly red connections (\u0026lt;0.8 accuracy) across all model variants.\u003C\/p\u003E\u003Cdiv id=\u0022F2\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F2.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Network visualization of large language model performance across examination sections.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Network visualization of large language model performance across examination sections.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 2:\u0022 
src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F2.medium.gif\u0022 width=\u0022440\u0022 height=\u0022306\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 2:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F2.medium.gif\u0022 width=\u0022440\u0022 height=\u0022306\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F2.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 2:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F2.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 2:\u003C\/span\u003E \u003Cp id=\u0022p-42\u0022 class=\u0022first-child\u0022\u003ENetwork visualization of large language model performance across examination sections.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-43\u0022\u003EThis pattern remained consistent despite variations 
in model training approaches, with both Full Test and Individual Question versions of ChatGPT 4.0 and 5.0, as well as Claude SONNET 4.0, exhibiting similar relative performance hierarchies across subject domains. The network visualization further reveals that while individual models varied in overall performance\u2014with Claude SONNET 4.0 achieving the highest overall accuracy (0.874) and ChatGPT 5.0 FT the lowest (0.799)\u2014the ranking of subject difficulty remained remarkably stable, suggesting that certain dental education domains present inherent challenges for current large language model architectures that transcend specific training methodologies or model parameters.\u003C\/p\u003E\u003Cp id=\u0022p-44\u0022\u003E\u003Ca id=\u0022xref-fig-3-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F3\u0022\u003EFigure 3\u003C\/a\u003E demonstrates marked heterogeneity in large language model (LLM) performance across dental education question taxonomies, revealing distinct competency profiles that transcend model architecture and training methodologies. 
Radar chart analysis of 331 questions across four cognitive domains\u2014factual recall (n=186), author-referenced queries (n=126), image-based interpretation (n=10), and analytical reasoning (n=9)\u2014exposed systematic performance variations that remained consistent across five state-of-the-art models: ChatGPT 4.0 (FT \u0026amp; IQ), ChatGPT 5.0 (FT \u0026amp;IQ), and Claude SONNET 4.0.\u003C\/p\u003E\u003Cdiv id=\u0022F3\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F3.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Radar plot comparing model accuracy across four question categories\u0026#x2014;factual, author-specific, analytical, and image-based\u0026#x2014;for all LLM configurations.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Radar plot comparing model accuracy across four question categories\u0026#x2014;factual, author-specific, analytical, and image-based\u0026#x2014;for all LLM configurations.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 3:\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F3.medium.gif\u0022 width=\u0022440\u0022 height=\u0022284\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 
alt=\u0022Figure 3:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F3.medium.gif\u0022 width=\u0022440\u0022 height=\u0022284\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F3.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 3:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F3.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 3:\u003C\/span\u003E \u003Cp id=\u0022p-45\u0022 class=\u0022first-child\u0022\u003ERadar plot comparing model accuracy across four question categories\u2014factual, author-specific, analytical, and image-based\u2014for all LLM configurations.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-17\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EComparison within successive generations of LLMs: Chat GPT 4.0 vs. 
Chat GPT 5.0\u003C\/h3\u003E\u003Cp id=\u0022p-46\u0022\u003EA chi-square test of independence was conducted to compare the overall accuracy of GPT-4.0 and GPT-5.0 under Full Test (FT), Individual Question (IQ), and combined conditions. GPT-5.0 exhibited modest but consistent improvements in accuracy across all testing conditions, although none of the differences reached statistical significance (\u03b1 = 0.05) (\u003Ca id=\u0022xref-table-wrap-4-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T4\u0022\u003ETable 4\u003C\/a\u003E). The largest difference was observed in the FT condition (+2.74 percentage points; \u03c7\u00b2 = 0.596, \u003Cem\u003Ep\u003C\/em\u003E = 0.440), while the improvement in the IQ condition was smaller (+1.51 percentage points; \u03c7\u00b2 = 0.176, \u003Cem\u003Ep\u003C\/em\u003E = 0.675). Overall, GPT-5.0 achieved 82.58% accuracy compared with 80.45% for GPT-4.0 (\u03c7\u00b2 = 0.850, \u003Cem\u003Ep\u003C\/em\u003E = 0.357), and all effect sizes indicated small magnitudes of difference (Cramer\u2019s V \u2264 0.030). These findings suggest incremental advancements in model capability between generations, though without statistically significant performance gains.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-18\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EEffect of Presentation Format (Full Test (FT) vs. Individual Question (IQ))\u003C\/h3\u003E\u003Cp id=\u0022p-47\u0022\u003EComparison of model performance across presentation formats revealed a consistent pattern favoring the Individual Question (IQ) condition over the Full Test (FT) condition (\u003Ca id=\u0022xref-table-wrap-2-1\u0022 class=\u0022xref-table\u0022 href=\u0022#T2\u0022\u003ETable 2\u003C\/a\u003E). Although differences did not reach conventional levels of statistical significance (\u03b1 = 0.05), IQ presentation was associated with higher accuracy across all analyses. 
GPT-4.0 achieved 82.98% accuracy in IQ format compared to 78.12% in FT format, representing a 4.86 percentage point improvement (\u03c7\u00b2 = 2.18, \u003Cem\u003Ep\u003C\/em\u003E = 0.140). GPT-5.0 demonstrated a similar directional effect, with accuracy improving from 80.85% (FT) to 84.19% (IQ), a 3.34 percentage point increase (\u03c7\u00b2 = 1.05, \u003Cem\u003Ep\u003C\/em\u003E = 0.305). When both models were combined, overall performance improved by 4.10 percentage points (\u03c7\u00b2 = 3.41, \u003Cem\u003Ep\u003C\/em\u003E = 0.065), approaching statistical significance (Cramer\u2019s V = 0.051).\u003C\/p\u003E\u003Cdiv id=\u0022T1\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1088526\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca href=\u0022\/highwire\/markup\/1088526\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1088526\u0022 
class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 1:\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EDescriptive performance of large language models (LLMs) across subject domains and testing conditions.\u003C\/span\u003E\u003Cp id=\u0022p-48\u0022 class=\u0022first-child\u0022\u003EAccuracy of GPT-4.0 and GPT-5.0 was evaluated using the 2024 American Academy of Periodontology (AAP) In-Service Examination (331 multiple-choice questions). Results are presented as the number and percentage of correct responses for each section under Full Test (FT) and Individual Question (IQ) conditions. Completion rates and missing responses are also reported. IQ presentation consistently yielded higher accuracy than FT for both models, and GPT-5.0 outperformed GPT-4.0 across all sections.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022T2\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1088522\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca 
href=\u0022\/highwire\/markup\/1088522\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1088522\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 2:\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EChi-square test of independence comparing GPT-4.0 and GPT-5.0 performance across presentation conditions.\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022T3\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1088527\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca 
href=\u0022\/highwire\/markup\/1088527\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1088527\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 3:\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EMcNemar\u2019s test results for paired comparison of Full Test and Individual Question conditions.\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022T4\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1088521\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca 
href=\u0022\/highwire\/markup\/1088521\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1088521\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 4:\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003ELogistic Regression Analysis of Question Order and Confidence as Predictors of Accuracy Across Models\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022T5\u0022 class=\u0022table pos-float\u0022\u003E\u003Cdiv class=\u0022table-inline table-callout-links\u0022\u003E\u003Cdiv class=\u0022callout\u0022\u003E\u003Cspan\u003EView this table:\u003C\/span\u003E\u003Cul class=\u0022callout-links\u0022\u003E\u003Cli class=\u0022view-inline first\u0022\u003E\u003Ca href=\u0022\u0022 class=\u0022table-expand-inline\u0022 data-table-url=\u0022\/highwire\/markup\/1088519\/expansion?postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0026amp;table-expand-inline=1\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView inline\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022view-popup\u0022\u003E\u003Ca 
href=\u0022\/highwire\/markup\/1088519\/expansion?width=1000\u0026amp;height=500\u0026amp;iframe=true\u0026amp;postprocessors=highwire_tables%2Chighwire_reclass%2Chighwire_figures%2Chighwire_math%2Chighwire_inline_linked_media%2Chighwire_embed\u0022 class=\u0022colorbox colorbox-load table-expand-popup\u0022 rel=\u0022gallery-fragment-tables\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EView popup\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022download-ppt last\u0022\u003E\u003Ca href=\u0022\/highwire\/powerpoint\/1088519\u0022 class=\u0022highwire-figure-link highwire-figure-link-ppt\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload powerpoint\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022table-caption\u0022\u003E\u003Cspan class=\u0022table-label\u0022\u003ETable 5:\u003C\/span\u003E \u003Cspan class=\u0022caption-title\u0022\u003EUnivariable Logistic Regression - Citation Validity Predicting Accuracy\u003C\/span\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-49\u0022\u003EPaired question-level analyses further supported this trend. McNemar\u2019s test revealed that GPT-4.0 achieved more correct responses in IQ mode than FT mode on discordant items (39 vs. 23; \u03c7\u00b2 = 3.63, \u003Cem\u003Ep\u003C\/em\u003E = 0.057), indicating a performance improvement approaching statistical significance. A similar pattern was observed for GPT-5.0, though the difference was not statistically significant (38 vs. 27; \u03c7\u00b2 = 1.54, \u003Cem\u003Ep\u003C\/em\u003E = 0.215). 
Together, these results suggest that presenting questions individually may confer practical advantages by reducing contextual interference or cognitive overload, leading to modest but consistent performance gains.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-19\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ECalibration: Confidence\u2013Accuracy Relationship\u003C\/h3\u003E\u003Cp id=\u0022p-50\u0022\u003EBinary logistic regression analysis demonstrated that confidence scores were statistically significant predictors of response accuracy across all models (\u003Cem\u003Ep\u003C\/em\u003E \u0026lt; 0.001) (Table 4). GPT-5.0 in the Individual Question format showed the strongest association (OR = 1.125 per confidence point), indicating a 12.5% increase in the odds of a correct response for each unit increase in confidence. Claude SONNET 4.0 exhibited the highest overall classification accuracy (86.4%), although with the weakest discriminative ability (ROC AUC = 0.589), while GPT-5.0 FT demonstrated moderate predictive performance (OR = 1.018). All models showed excellent sensitivity (99.6\u201399.7%), reflecting reliable prediction of correct answers, but poor specificity (0\u20131.9%), indicating limited ability to identify incorrect responses. These findings confirm that model-generated confidence scores reliably tracked internal certainty and were significantly associated with accuracy, supporting their potential utility for evaluating AI reliability in clinical and educational settings.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-20\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EModel Accuracy Degradation Across Token Length\u003C\/h3\u003E\u003Cp id=\u0022p-51\u0022\u003EGPT-5.0 Full Test demonstrated significant question fatigue (OR = 0.997 per question, \u003Cem\u003Ep\u003C\/em\u003E = 0.035) with strong linear decline (R\u00b2 = 0.784), indicating a 0.3% decrease in accuracy odds per question. 
Confidence showed marginal significance (OR = 1.013, \u003Cem\u003Ep\u003C\/em\u003E = 0.068). GPT-5.0 Individual Question mode showed no significant question fatigue effect (OR = 0.998, \u003Cem\u003Ep\u003C\/em\u003E = 0.223) with minimal linear correlation (R\u00b2 = 0.095), but maintained strong confidence\u2013accuracy calibration (OR = 1.127, \u003Cem\u003Ep\u003C\/em\u003E \u0026lt; 0.001). Claude SONNET 4.0 Individual Question exhibited the most pronounced question fatigue (OR = 0.995 per question, \u003Cem\u003Ep\u003C\/em\u003E = 0.011) with moderate linear decline (R\u00b2 = 0.666) and no significant confidence effect (OR = 1.020, \u003Cem\u003Ep\u003C\/em\u003E = 0.247).\u003C\/p\u003E\u003Cp id=\u0022p-52\u0022\u003EScatter plots in \u003Ca id=\u0022xref-fig-4-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F4\u0022\u003EFigure 4\u003C\/a\u003E illustrate the relationship between question position (x-axis) and predicted probability of a correct response (y-axis) derived from multivariable logistic regression models for three large language models. Each panel represents a different model and evaluation format: (A) GPT-5.0 \u2013 Full Test condition, (B) GPT-5.0 \u2013 Individual Question condition, and (C) Claude SONNET 4.0 \u2013 Individual Question condition. Blue points indicate predicted probabilities for individual questions, with solid lines representing fitted regression trends and shaded areas showing 95% confidence intervals. All three models exhibited a downward trajectory in accuracy as question number increased, suggesting potential cumulative context effects. 
This decline was most pronounced and statistically significant in GPT-5.0 Full Test mode (OR = 0.997 per question, \u003Cem\u003Ep\u003C\/em\u003E = 0.035, R\u00b2 = 0.784) and in Claude SONNET 4.0 Individual Question mode (OR = 0.995, \u003Cem\u003Ep\u003C\/em\u003E = 0.011, R\u00b2 = 0.666), while GPT-5.0 Individual Question showed a milder, non-significant trend (OR = 0.998, \u003Cem\u003Ep\u003C\/em\u003E = 0.223, R\u00b2 = 0.095).\u003C\/p\u003E\u003Cdiv id=\u0022F4\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F4.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Multivariable logistic regression analyses examining the relationship between question position (proxy for cumulative context exposure) and model accuracy are shown for three large language models. Each panel plots the predicted probability of a correct response (y-axis) against question number (x-axis), with fitted regression lines and 95% confidence intervals. (A) GPT-5.0 in Full Test mode demonstrated significant question fatigue (OR = 0.997 per question, p = 0.035, R\u0026#xB2; = 0.784), indicating a progressive 0.3% reduction in accuracy odds per question. (B) GPT-5.0 in Individual Question mode showed no significant decline across question sequence (OR = 0.998, p = 0.223, R\u0026#xB2; = 0.095), maintaining stable performance with strong confidence\u0026#x2013;accuracy calibration (OR = 1.127, p \u0026lt; 0.001). 
(C) Claude SONNET 4.0 in Individual Question mode exhibited a significant decline in accuracy with increasing question number (OR = 0.995 per question, p = 0.011, R\u0026#xB2; = 0.666), consistent with model-specific susceptibility to cumulative context effects.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Multivariable logistic regression analyses examining the relationship between question position (proxy for cumulative context exposure) and model accuracy are shown for three large language models. Each panel plots the predicted probability of a correct response (y-axis) against question number (x-axis), with fitted regression lines and 95% confidence intervals. (A) GPT-5.0 in Full Test mode demonstrated significant question fatigue (OR = 0.997 per question, p = 0.035, R\u0026#xB2; = 0.784), indicating a progressive 0.3% reduction in accuracy odds per question. (B) GPT-5.0 in Individual Question mode showed no significant decline across question sequence (OR = 0.998, p = 0.223, R\u0026#xB2; = 0.095), maintaining stable performance with strong confidence\u0026#x2013;accuracy calibration (OR = 1.127, p \u0026lt; 0.001). 
(C) Claude SONNET 4.0 in Individual Question mode exhibited a significant decline in accuracy with increasing question number (OR = 0.995 per question, p = 0.011, R\u0026#xB2; = 0.666), consistent with model-specific susceptibility to cumulative context effects.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 4:\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F4.medium.gif\u0022 width=\u0022237\u0022 height=\u0022440\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 4:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F4.medium.gif\u0022 width=\u0022237\u0022 height=\u0022440\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F4.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 4:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F4.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new 
tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 4:\u003C\/span\u003E \u003Cp id=\u0022p-53\u0022 class=\u0022first-child\u0022\u003EMultivariable logistic regression analyses examining the relationship between question position (proxy for cumulative context exposure) and model accuracy are shown for three large language models. Each panel plots the predicted probability of a correct response (y-axis) against question number (x-axis), with fitted regression lines and 95% confidence intervals. (A) GPT-5.0 in Full Test mode demonstrated significant question fatigue (OR = 0.997 per question, p = 0.035, R\u00b2 = 0.784), indicating a progressive 0.3% reduction in accuracy odds per question. (B) GPT-5.0 in Individual Question mode showed no significant decline across question sequence (OR = 0.998, p = 0.223, R\u00b2 = 0.095), maintaining stable performance with strong confidence\u2013accuracy calibration (OR = 1.127, p \u0026lt; 0.001). (C) Claude SONNET 4.0 in Individual Question mode exhibited a significant decline in accuracy with increasing question number (OR = 0.995 per question, p = 0.011, R\u00b2 = 0.666), consistent with model-specific susceptibility to cumulative context effects.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-21\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ECitation Validity and Accuracy\u003C\/h3\u003E\u003Cp id=\u0022p-54\u0022\u003EIn contrast to confidence scores, citation validity did not significantly predict response accuracy for any model (\u003Cem\u003Ep\u003C\/em\u003E \u0026gt; 0.05) (Table 6). 
Although effect directions varied\u2014with valid citations associated with a 35.9% increase in odds of a correct response for GPT-5.0 IQ and decreases of 31.4% and 11.7% for GPT-5.0 FT and Claude SONNET 4.0, respectively\u2014none of these associations were statistically significant.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-22\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ECitation Hallucination Assessment\u003C\/h3\u003E\u003Cp id=\u0022p-55\u0022\u003ECitation validity evaluation revealed substantial differences across models. Claude SONNET 4.0 demonstrated the lowest hallucination rate at 5.59% (18\/322), GPT-4.0 IQ showed 32.01% (105\/328), and GPT-5.0 FT exhibited the highest rate at 51.53% (135\/262). Citations were systematically classified using a three-tier validation system: fully verifiable, partially correct (minor errors in volume\/page numbers), and critically wrong (fabricated authors, studies, journals, or completely unrelated content).\u003C\/p\u003E\u003Cp id=\u0022p-56\u0022\u003EThe heatmap in \u003Ca id=\u0022xref-fig-5-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F5\u0022\u003EFigure 5\u003C\/a\u003E illustrates citation validity across three large language models (GPT-5.0 FT, GPT-5.0 IQ, and Claude SONNET 4.0) for 331 questions organized into nine topical sections (S1\u2013S9). Each thin vertical strip represents one question\u2019s citation score, color-coded from red (0 = critically wrong) through orange (1 = partially correct, multiple errors) and yellow (2 = partially correct, minor errors) to green (3 = fully verified), with gray indicating missing responses. Claude SONNET 4.0 exhibits a predominance of green strips (\u0026gt;90% fully verified) in all sections, reflecting consistently high citation accuracy. GPT-5.0 IQ shows substantial green interspersed with orange and red strips\u2014approximately two-thirds fully verified\u2014indicating moderate performance. 
In contrast, GPT-5.0 FT displays frequent red and gray gaps, especially in sections S4, S7, and S8, underscoring its lower overall reliability and higher rates of missing or incorrect citations. The black vertical separators delineate topical sections, revealing that some sections (e.g., S5 and S6) pose greater challenges for all models, as evidenced by increased yellow and red densities. Overall, this compact visualization highlights clear performance gradients among the models and identifies specific content areas where citation validity deteriorates.\u003C\/p\u003E\u003Cdiv id=\u0022F5\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F5.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Heatmap showing citation validity across nine exam sections for three LLMs, with green indicating valid citations, yellow partially valid, red invalid, and gray missing.\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Heatmap showing citation validity across nine exam sections for three LLMs, with green indicating valid citations, yellow partially valid, red invalid, and gray missing.\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 5:\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 
data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F5.medium.gif\u0022 width=\u0022440\u0022 height=\u0022223\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 5:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F5.medium.gif\u0022 width=\u0022440\u0022 height=\u0022223\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F5.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 5:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F5.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 5:\u003C\/span\u003E \u003Cp id=\u0022p-57\u0022 class=\u0022first-child\u0022\u003EHeatmap showing citation validity across nine exam sections for three LLMs, with green indicating valid citations, yellow partially valid, red invalid, and gray missing.\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-58\u0022\u003ECitation performance varied markedly by question category and model 
configuration (\u003Ca id=\u0022xref-fig-6-1\u0022 class=\u0022xref-fig\u0022 href=\u0022#F6\u0022\u003EFigure 6\u003C\/a\u003E). For non-specific factual questions, Claude SONNET 4.0 demonstrated the highest citation validity, with 96% of references fully verified, compared with 61% for GPT-5.0 IQ and 41% for GPT-5.0 FT. Notably, GPT-5.0 FT exhibited the highest incidence of critically incorrect citations (45%) in this category. Across author-specific questions, all models achieved improved citation accuracy, with Claude SONNET 4.0 verifying 93% of references, GPT-5.0 IQ 63%, and GPT-5.0 FT 61%, suggesting that contextual cues such as author names enhance citation reliability.\u003C\/p\u003E\u003Cdiv id=\u0022F6\u0022 class=\u0022fig pos-float type-figure  odd\u0022\u003E\u003Cdiv class=\u0022highwire-figure\u0022\u003E\u003Cdiv class=\u0022fig-inline-img-wrapper\u0022\u003E\u003Cdiv class=\u0022fig-inline-img\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F6.large.jpg?width=800\u0026amp;height=600\u0026amp;carousel=1\u0022 title=\u0022Proportion of citation validity outcomes by question type for three LLM configurations (GPT-5.0 FT, GPT-5.0 IQ, and Claude SONNET 4.0).\u0022 class=\u0022highwire-fragment fragment-images colorbox-load\u0022 rel=\u0022gallery-fragment-images-1730309226\u0022 data-figure-caption=\u0022\u0026lt;div class=\u0026quot;highwire-markup\u0026quot;\u0026gt;Proportion of citation validity outcomes by question type for three LLM configurations (GPT-5.0 FT, GPT-5.0 IQ, and Claude SONNET 4.0).\u0026lt;\/div\u0026gt;\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003E\u003Cspan class=\u0022hw-responsive-img\u0022\u003E\u003Cimg class=\u0022highwire-fragment fragment-image lazyload\u0022 alt=\u0022Figure 6:\u0022 src=\u0022data:image\/gif;base64,R0lGODlhAQABAIAAAAAAAP\/\/\/yH5BAEAAAAALAAAAAABAAEAAAIBRAA7\u0022 
data-src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F6.medium.gif\u0022 width=\u0022440\u0022 height=\u0022242\u0022\/\u003E\u003Cnoscript\u003E\u003Cimg class=\u0022highwire-fragment fragment-image\u0022 alt=\u0022Figure 6:\u0022 src=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F6.medium.gif\u0022 width=\u0022440\u0022 height=\u0022242\u0022\/\u003E\u003C\/noscript\u003E\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cul class=\u0022highwire-figure-links inline\u0022\u003E\u003Cli class=\u0022download-fig first\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F6.large.jpg?download=true\u0022 class=\u0022highwire-figure-link highwire-figure-link-download\u0022 title=\u0022Download Figure 6:\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EDownload figure\u003C\/a\u003E\u003C\/li\u003E\u003Cli class=\u0022new-tab last\u0022\u003E\u003Ca href=\u0022https:\/\/www.medrxiv.org\/content\/medrxiv\/early\/2025\/10\/17\/2025.10.14.25338040\/F6.large.jpg\u0022 class=\u0022highwire-figure-link highwire-figure-link-newtab\u0022 target=\u0022_blank\u0022 data-icon-position=\u0022\u0022 data-hide-link-title=\u00220\u0022\u003EOpen in new tab\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv class=\u0022fig-caption\u0022\u003E\u003Cspan class=\u0022fig-label\u0022\u003EFigure 6:\u003C\/span\u003E \u003Cp id=\u0022p-59\u0022 class=\u0022first-child\u0022\u003EProportion of citation validity outcomes by question type for three LLM configurations (GPT-5.0 FT, GPT-5.0 IQ, and Claude SONNET 4.0).\u003C\/p\u003E\u003Cdiv class=\u0022sb-div caption-clear\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cp id=\u0022p-60\u0022\u003EImage-based questions were also handled with relatively high validity, with Claude SONNET 4.0 verifying 
89% of citations and GPT-5.0 IQ 80%, while GPT-5.0 FT achieved 67%. Analytical questions produced the most accurate outputs relative to non-specific factual items: Claude SONNET 4.0 verified 89% of citations, GPT-5.0 IQ 76%, and GPT-5.0 FT 73%, with most remaining errors classified as partially valid rather than critically incorrect.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-23\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ERelationship of Citation Source to Citation Validity\u003C\/h3\u003E\u003Cp id=\u0022p-61\u0022\u003EPearson\u2019s chi-square tests revealed a highly significant association between citation source type (article vs. book) and citation validity across all three models (\u03c7\u00b2 \u2265 236, df = 3, Bonferroni-adjusted p \u0026lt; 1 \u00d7 10\u207b\u2075\u2070). These results indicate that the likelihood of a citation being verified is strongly dependent on the type of source from which it is drawn.\u003C\/p\u003E\u003Cp id=\u0022p-62\u0022\u003EIn practical terms, all models demonstrated a higher proportion of fully validated citations when referencing journal articles compared with books. For example, in the GPT-5.0 FT condition, approximately 62% of article-derived citations were fully verified compared with only 36% of book-based citations, with the latter contributing disproportionately to critically incorrect or missing references. Similar patterns were observed for GPT-5.0 IQ and Claude SONNET 4.0, underscoring a systematic bias in citation reliability based on source type.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-24\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EDiscussion\u003C\/h2\u003E\u003Cp id=\u0022p-63\u0022\u003ELarge language models (LLMs) have demonstrated substantial advancements in capability over recent years, with increasing integration across biomedical research, clinical practice, and professional education. 
(\u003Ca id=\u0022xref-ref-13-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-13\u0022\u003E13\u003C\/a\u003E) These models exhibit sophisticated capacity for synthesizing complex information and generating contextually appropriate responses, establishing their utility as computational tools within scientific and clinical frameworks.\u003C\/p\u003E\u003Cp id=\u0022p-64\u0022\u003ENevertheless, responsible implementation necessitates comprehensive evaluation methodologies that transcend conventional performance metrics.\u003C\/p\u003E\u003Cp id=\u0022p-65\u0022\u003EConventional assessment paradigms have predominantly employed static benchmarks including the Massive Multitask Language Understanding (MMLU), TruthfulQA, and General Language Understanding Evaluation (GLUE) tasks. (\u003Ca id=\u0022xref-ref-14-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-14\u0022\u003E14\u003C\/a\u003E) While these standardized assessments provide foundational performance data, they demonstrate limited capacity to predict model behavior under real-world valid conditions. Contemporary evaluation initiatives within biomedical domains have concentrated primarily on task accuracy, operationally defined as the proportion of correct responses under controlled experimental conditions. The present investigation, while not constituting a comprehensive system-level or longitudinal assessment, was designed within this methodological framework. 
The study focused on critical behavioral dimensions including response accuracy, confidence calibration, citation validity, and hallucination frequency, thereby addressing key considerations for clinical and educational applications.\u003C\/p\u003E\u003Cdiv id=\u0022sec-25\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EComparing Model Performances\u003C\/h3\u003E\u003Cp id=\u0022p-66\u0022\u003EOur findings demonstrate that commercially available LLMs achieve substantial performance levels on specialty-specific assessments, with overall accuracy ranging from approximately 78% to 87% depending on model architecture and delivery methodology. These results corroborate emerging literature documenting progressive performance improvements across successive LLM generations and establish that general-purpose models, absent domain-specific fine-tuning, can effectively respond to complex professional-level inquiries. However, accuracy exhibited heterogeneous distribution across content domains.\u003C\/p\u003E\u003Cp id=\u0022p-67\u0022\u003ESuperior performance was observed in knowledge-intensive domains including biochemistry, physiology, and microbiology, where questions predominantly require retrieval of well-established factual information. Conversely, integrative domains\u2014particularly diagnosis and periodontal therapy\u2014presented greater challenges across all evaluated models.\u003C\/p\u003E\u003Cp id=\u0022p-68\u0022\u003EPerformance analysis across question typologies revealed that LLMs demonstrated proficiency with both factual and analytical question formats. Notably, contextualized questions yielded superior performance compared to broader conceptual inquiries, as illustrated in the radar plot visualization. The observed superiority of LLM performance on factual versus analytical periodontal questions aligns with patterns documented in a comprehensive systematic review and network meta-analysis by Wang et al. 
Their analysis of 35,896 medical questions across 168 studies demonstrated consistent LLM superiority on \u201cobjective questions\u201d (characterized by clear, quantifiable responses) compared to \u201copen-ended questions\u201d (requiring complex reasoning without predetermined solutions).(\u003Ca id=\u0022xref-ref-6-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-6\u0022\u003E6\u003C\/a\u003E) This performance pattern remained consistent across examination sections despite variations in model training methodologies. Both full-test and individual question implementations of ChatGPT 4.0, ChatGPT 5.0, and Claude SONNET 4.0 exhibited comparable relative performance hierarchies across subject domains. Network visualization analysis revealed that while individual models demonstrated varying overall performance, subject difficulty rankings remained remarkably stable, indicating that specific dental education domains present inherent challenges for current LLM architectures. Cross-generational model performance analysis revealed that GPT-5.0 demonstrated modest yet consistent improvements relative to GPT-4.0 across all testing conditions. The most substantial performance gain was observed in the full-test format (+2.74 percentage points), with attenuated differences in the individual-question format (+1.51 percentage points). However, these differences did not achieve statistical significance (\u03c7\u00b2 = 0.850, p = 0.357; Cramer\u2019s V = 0.025), indicating that while iterative model refinements contribute to incremental accuracy improvements, their impact remains constrained within specialized domain assessments.\u003C\/p\u003E\u003Cp id=\u0022p-69\u0022\u003EThese findings present a notable contrast to broader benchmark evaluations that document substantial performance gains for GPT-5 relative to GPT-4 across various domains. 
In standardized assessments, GPT-5 achieved 94.6% accuracy on the AIME 2025 mathematics benchmark compared to GPT-4\u2019s approximately 52% performance and attained 74.9% accuracy on SWE-bench Verified coding tasks versus GPT-4\u2019s 30-50% range. (\u003Ca id=\u0022xref-ref-15-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-15\u0022\u003E15\u003C\/a\u003E,\u003Ca id=\u0022xref-ref-16-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-16\u0022\u003E16\u003C\/a\u003E)\u003C\/p\u003E\u003Cp id=\u0022p-70\u0022\u003EHealthcare applications demonstrate a more nuanced performance profile. While GPT-5 exhibited reduced hallucination rates to 1.6% on selected medical benchmarks compared to GPT-4o\u2019s 15.8%, performance varies considerably across different healthcare tasks, with heterogeneous results on clinical reasoning and medical knowledge assessments.(\u003Ca id=\u0022xref-ref-16-2\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-16\u0022\u003E16\u003C\/a\u003E,\u003Ca id=\u0022xref-ref-17-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-17\u0022\u003E17\u003C\/a\u003E) The observed disparity between general benchmark improvements and domain-specific performance suggests that while GPT-5 demonstrates marked advances in structured cognitive tasks such as mathematics and coding, its impact on specialized professional applications remains inconsistent. 
(\u003Ca id=\u0022xref-ref-18-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-18\u0022\u003E18\u003C\/a\u003E) This pattern indicates that generational improvements in large language models may not uniformly translate to enhanced performance across all professional domains, necessitating domain-specific evaluation frameworks for clinical and educational applications.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-26\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EComparing the method of data presentation\u003C\/h3\u003E\u003Cp id=\u0022p-71\u0022\u003ELLMs are rapidly evolving, and the ways in which users interact with them continue to adapt as our understanding of their capabilities deepens. Prompt engineering has always been central to optimizing model performance; however, our approach to it has matured \u2014 moving from the assumption that longer, more elaborate prompts are inherently better to the recognition that prompts should instead be information-rich, detailed, and contextually specific, yet concise and focused.(\u003Ca id=\u0022xref-ref-19-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-19\u0022\u003E19\u003C\/a\u003E) Another critical dimension of LLM behavior relates to the influence of input length and conversational duration on accuracy.\u003C\/p\u003E\u003Cp id=\u0022p-72\u0022\u003ESeveral studies demonstrate that performance can degrade as context length increases, with models showing diminished ability to retrieve or reason over information buried deeper within extended inputs. This so-called \u201clost-in-the-middle\u201d phenomenon underscores how the structure and sequencing of information directly affect output quality. 
(\u003Ca id=\u0022xref-ref-20-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-20\u0022\u003E20\u003C\/a\u003E,\u003Ca id=\u0022xref-ref-21-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-21\u0022\u003E21\u003C\/a\u003E) Some models, such as Claude SONNET 4.0, even prompt users to begin a new conversation when the input becomes too long, reflecting the practical implications of this limitation.\u003C\/p\u003E\u003Cp id=\u0022p-73\u0022\u003ETo examine how these dynamics shape model performance, our study deliberately varied data presentation strategies: in one condition, the entire assessment was provided at once to maximize contextual availability, whereas in another, questions were presented sequentially to minimize token load. This design allowed us to investigate whether well-documented declines in performance with increasing context length persist across different presentation modes and whether strategic segmentation can mitigate accuracy loss in specialized tasks.\u003C\/p\u003E\u003Cp id=\u0022p-74\u0022\u003EOur results reveal two distinct but complementary phenomena. First, presentation format demonstrated a consistent directional effect on outcome accuracy. Individual Question presentation produced improvements across both models, with combined analysis approaching statistical significance (\u03c7\u00b2 = 3.41, p = 0.065). GPT-4.0 showed a 4.86 percentage point improvement (p = 0.140), while GPT-5.0 demonstrated a 3.34 percentage point gain (p = 0.305). McNemar\u2019s paired analysis corroborated this pattern, with GPT-4.0 approaching significance (p = 0.057) and GPT-5.0 showing similar directional effects (p = 0.215). 
Although statistical significance was not achieved, the directional consistency across models suggests that individual question presentation may mitigate contextual interference or cognitive overload, leading to modest but practically relevant performance improvements.\u003C\/p\u003E\u003Cp id=\u0022p-75\u0022\u003ESecond, analysis of question order as a proxy for increasing token load revealed differential \u2018question fatigue\u2019 effects across models and presentation formats. GPT-5.0 in full-test mode demonstrated significant accuracy degradation, with odds of a correct response decreasing by 0.3% per question (OR = 0.997, p = 0.035, R\u00b2 = 0.784). This effect was not observed in GPT-5.0 individual question mode, which showed no significant decline (OR = 0.998, p = 0.223, R\u00b2 = 0.095). Claude SONNET 4.0 in individual question mode also exhibited significant question fatigue (OR = 0.995, p = 0.011, R\u00b2 = 0.666), with a more pronounced decline in accuracy across the sequence, although its overall accuracy remained high. These findings suggest that question fatigue is not simply a function of cumulative computational burden, but rather reflects complex interactions between model architecture, presentation format, and context management. The progressive decline in full-test mode aligns with theories of attention saturation and positional encoding inefficiency in transformer architectures, consistent with Levy et al. 
(2024) findings of decreased reasoning accuracy as input length increased from 250 to 3000 tokens.(\u003Ca id=\u0022xref-ref-22-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-22\u0022\u003E22\u003C\/a\u003E)\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-27\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EConfidence and Calibration: Insights into Model Self-Assessment\u003C\/h3\u003E\u003Cp id=\u0022p-76\u0022\u003EConfidence estimation was incorporated as a key parameter because it reflects an AI model\u2019s capacity to assess the reliability of its own outputs, a critical property for effective deployment in educational and clinical decision-support contexts. Optimal performance requires that self-reported confidence align closely with the probability of correct responses, a characteristic termed calibration. Analysis revealed that while confidence estimates from both GPT-5.0 and Claude SONNET 4.0 were not perfectly calibrated to response accuracy, they demonstrated statistically significant predictive relationships across all evaluated systems.\u003C\/p\u003E\u003Cp id=\u0022p-77\u0022\u003EBinary logistic regression confirmed that confidence scores served as significant predictors of model accuracy, with GPT-5.0 in Individual Question mode exhibiting the strongest predictive association, followed by Claude SONNET 4.0 and GPT-5.0 in Full Test mode. This hierarchy suggests that segmented input presentation enhances confidence-accuracy calibration.\u003C\/p\u003E\u003Cp id=\u0022p-78\u0022\u003EThe observed relationship between confidence and accuracy, while statistically significant, indicates that confidence metrics provide useful but imperfect indicators of response reliability. 
These findings underscore the necessity for complementary validation mechanisms capable of more effectively identifying uncertain or potentially incorrect responses in high-performing AI systems deployed in professional contexts.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-28\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003ECitation Integrity and Hallucination\u003C\/h3\u003E\u003Cp id=\u0022p-79\u0022\u003EPerhaps the most consequential limitation revealed by this study was the substantial variability in citation quality. While both GPT-5.0 and Claude SONNET 4.0 generated references when prompted, a considerable proportion were partially incorrect or wholly fabricated\u2014a phenomenon termed \u201ccitation hallucination.\u201d This behavior reflects the generative nature of LLMs, which predict text patterns rather than retrieve verified database records. Although models frequently produced references with plausible formatting and content, many could not be traced to authentic sources or failed to support their associated claims. This limitation is particularly problematic in educational and research settings, where users may rely on citations for further reading, evidence validation, or scholarly writing. Fabricated references not only undermine user trust but also risk propagating misinformation if accepted uncritically.\u003C\/p\u003E\u003Cp id=\u0022p-80\u0022\u003EAnalysis revealed that citation validity was independent of answer accuracy, similar to that seen by Danesh et al., 2025. (\u003Ca id=\u0022xref-ref-23-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-23\u0022\u003E23\u003C\/a\u003E) Human-in-the-loop assessment demonstrated that hallucinated citations existed across a spectrum of severity. 
The three-tier classification system differentiated between fully verifiable citations, partially correct citations with minor errors (incorrect volume numbers, page references), and critically erroneous citations containing fabricated authors, non-existent studies, incorrect journal attributions, or wholly unrelated content. The predominance of critically erroneous hallucinations\u2014rather than minor bibliographic inaccuracies\u2014indicates that AI citation fabrication represents a qualitatively distinct problem from human citation errors, consistent with findings reported by Aljamaan et al. (\u003Ca id=\u0022xref-ref-24-1\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-24\u0022\u003E24\u003C\/a\u003E)\u003C\/p\u003E\u003Cp id=\u0022p-81\u0022\u003ECitation performance varied systematically across question categories. Author-specific and analytical questions achieved consistently higher citation accuracy across all models compared to non-specific factual questions, suggesting that contextual specificity significantly influences citation reliability. Non-specific factual questions exhibited the highest incidence of critically incorrect citations, paralleling patterns observed in answer accuracy metrics. These findings underscore the critical need for enhanced citation validation frameworks, including improved retrieval-augmented generation (RAG) mechanisms, real-time database cross-referencing systems, or hybrid architectures that integrate generative capabilities with verified bibliographic databases. 
Such implementations would ensure that model-generated references maintain both authenticity and contextual relevance, thereby addressing the fundamental disconnect between pattern-based text generation and evidence-based scholarly citation practices.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv id=\u0022sec-29\u0022 class=\u0022subsection\u0022\u003E\u003Ch3\u003EBroader Implications and Future Directions\u003C\/h3\u003E\u003Cp id=\u0022p-82\u0022\u003EThese findings collectively demonstrate both the potential and current constraints of LLMs as educational and knowledge-support tools in specialized dental practice. The observed high overall accuracy indicates that such models can effectively augment learning processes and assessment preparation, particularly for reinforcing foundational knowledge and simulating examination-style questioning protocols. However, persistent limitations in reasoning-intensive clinical domains, suboptimal confidence calibration, and systematic citation reliability deficits necessitate continued human oversight for safe and effective implementation.\u003C\/p\u003E\u003Cp id=\u0022p-83\u0022\u003EFuture investigations should extend this multidimensional evaluation framework through several key directions: (\u003Ca id=\u0022xref-ref-1-3\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-1\u0022\u003E1\u003C\/a\u003E) longitudinal assessments spanning multiple examination cycles to evaluate temporal stability of model performance, (\u003Ca id=\u0022xref-ref-2-3\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-2\u0022\u003E2\u003C\/a\u003E) expansion of citation validation protocols incorporating larger, diverse reviewer panels to enhance reliability of bibliographic assessment, and (\u003Ca id=\u0022xref-ref-3-2\u0022 class=\u0022xref-bibr\u0022 href=\u0022#ref-3\u0022\u003E3\u003C\/a\u003E) integration of advanced calibration metrics including Brier scores, expected calibration error, and reliability diagrams to provide more nuanced 
confidence-accuracy relationships.\u003C\/p\u003E\u003Cp id=\u0022p-84\u0022\u003EAdditionally, comparative analyses across multiple dental specialties and integration of real-world clinical scenarios would strengthen the ecological validity of these findings. As LLM architectures continue advancing, such comprehensive, specialty-specific evaluation protocols will prove essential for guiding evidence-based integration into dental education curricula, clinical decision-support systems, and scholarly research workflows. The framework established in this study provides a foundation for systematic assessment of emerging AI technologies in professional healthcare education contexts.\u003C\/p\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-30\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EConclusion\u003C\/h2\u003E\u003Cp id=\u0022p-85\u0022\u003EWhile large language models demonstrate the capacity to answer a wide range of dental assessment questions, their reliability is variable and highly dependent on both the context and the manner in which information is presented. This variability underscores that, although LLMs can serve as valuable adjuncts for knowledge reinforcement and assessment preparation, their outputs cannot be assumed accurate or trustworthy in all scenarios. In educational and research applications, it is essential that all LLM-generated responses, particularly those involving complex reasoning or citation, are subject to rigorous human review. 
Ongoing human oversight and validation remain critical to ensure the integrity and safety of information, and future research should focus on developing robust frameworks for integrating LLMs responsibly within dental education and scholarly practice.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-31\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EEthics Approval Statement\u003C\/h2\u003E\u003Cp id=\u0022p-86\u0022\u003EThis study did not involve human participants or identifiable patient data. All questions were derived from publicly available educational resources. The study was conducted in alignment with principles of responsible AI research, emphasizing transparency, reproducibility, and human oversight in citation validation.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-32\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EDeclaration of Interest Statement\u003C\/h2\u003E\u003Cp id=\u0022p-87\u0022\u003EThe authors declare that they have no competing interests.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-33\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EFunding Sources\u003C\/h2\u003E\u003Cp id=\u0022p-88\u0022\u003EThis research did not receive any specific grant from funding agencies in the public, commercial, or not-for-profit sectors.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section\u0022 id=\u0022sec-34\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EAuthor Contributions\u003C\/h2\u003E\u003Cp id=\u0022p-89\u0022\u003EP.D. conceived the study design, conducted data collection and assessment, performed statistical analysis, and drafted the manuscript. R.H. contributed to study conception and data collection. 
Both authors reviewed and approved the final version of the manuscript.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section data-availability\u0022 id=\u0022sec-35\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EData Availability\u003C\/h2\u003E\u003Cp id=\u0022p-90\u0022\u003EThe datasets generated and analyzed during this study are available from the corresponding author upon reasonable request. The complete dataset includes: AI model responses to all 331 American Academy of Periodontology examination questions, including accuracy assessments, confidence scores, and generated citations for GPT-4o, GPT-4o mini, and Claude-3.5 Sonnet models; human expert validation results for all AI-generated citations, including verification status and authenticity scores; and statistical analysis outputs including confidence calibration metrics, accuracy measurements across question categories, and question fatigue analysis. Data requests should be directed to the corresponding author.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section ack\u0022 id=\u0022ack-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EAcknowledgements\u003C\/h2\u003E\u003Cp id=\u0022p-91\u0022\u003EThe authors thank Hui Bian, Ph.D., Research \u0026amp; Statistics Consultant, Office for Faculty Excellence, East Carolina University, for providing valuable statistical consultation and guidance during the data analysis phase of this study.\u003C\/p\u003E\u003C\/div\u003E\u003Cdiv class=\u0022section ref-list\u0022 id=\u0022ref-list-1\u0022\u003E\u003Ch2 class=\u0022\u0022\u003EReferences\u003C\/h2\u003E\u003Col class=\u0022cit-list ref-use-labels\u0022\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E1.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-1-1\u0022 title=\u0022View reference 1. 
in text\u0022 id=\u0022ref-1\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.1\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESunali\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ED\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKhanna\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDhaimade\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EPA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKhanna\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EArtificial Intelligence: Transforming Dentistry Today\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EIndian Journal of Basic and Applied Medical Research [Internet]\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2017\u003C\/span\u003E;(\u003Cspan class=\u0022cit-vol\u0022\u003E6\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E161\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E7\u003C\/span\u003E. 
Available from: \u003Ca href=\u0022https:\/\/www.ijbamr.com\u0022\u003Ewww.ijbamr.com\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DIndian%2BJournal%2Bof%2BBasic%2Band%2BApplied%2BMedical%2BResearch%2B%255BInternet%26rft.volume%253D6%26rft.spage%253D161%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E2.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-2-1\u0022 title=\u0022View reference 2. in text\u0022 id=\u0022ref-2\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.2\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EFang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EQ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EReynaldi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ER\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAraminta\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EAS\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKamal\u003C\/span\u003E  \u003Cspan 
class=\u0022cit-name-given-names\u0022\u003EI\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESaini\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EP\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAfshari\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EFS\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EArtificial Intelligence (AI)-driven dental education: Exploring the role of chatbots in a clinical learning environment\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EJ Prosthet Dent\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Oct;\u003Cspan class=\u0022cit-vol\u0022\u003E134\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E4\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E1296\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E303\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DJ%2BProsthet%2BDent%26rft.volume%253D134%26rft.spage%253D1296%26rft_id%253Dinfo%253Apmid%252F38644064%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=38644064\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F10%2F17%2F2025.10.14.25338040.atom\u0022 class=\u0022cit-ref-sprinkles 
cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E3.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-3-1\u0022 title=\u0022View reference 3. in text\u0022 id=\u0022ref-3\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.3\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EBhatia\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EAP\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELambat\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJain\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ET\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EA Comparative Analysis of Conventional and Chat-Generative Pre-trained Transformer-Assisted Teaching Methods in Undergraduate Dental Education\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ECureus\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E May 9;\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E4.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-4-1\u0022 title=\u0022View reference 4. 
in text\u0022 id=\u0022ref-4\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.4\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-article-title\u0022\u003EThe rise of large language models\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ENat Comput Sci\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Sep 24;\u003Cspan class=\u0022cit-vol\u0022\u003E5\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E9\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E689\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E90\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DNat%2BComput%2BSci%26rft.volume%253D5%26rft.spage%253D689%26rft_id%253Dinfo%253Apmid%252F40993230%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=40993230\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F10%2F17%2F2025.10.14.25338040.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E5.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-5-1\u0022 title=\u0022View reference 5. 
in text\u0022 id=\u0022ref-5\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.5\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKnoedler\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EL\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKnoedler\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHoch\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ECC\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EPrantl\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EL\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EFrank\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESoiderer\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EL\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EIn-depth analysis of ChatGPT\u2019s performance based on specific signaling words and phrases in the question stem of 2377 USMLE step 1 style questions\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ESci Rep\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E Dec 1;\u003Cspan class=\u0022cit-vol\u0022\u003E14\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E1\u003C\/span\u003E).\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E6.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-6-1\u0022 title=\u0022View reference 6. in text\u0022 id=\u0022ref-6\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.6\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EWang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EL\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EZhuang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EB\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHuang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EFang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EWang\u003C\/span\u003E  \u003Cspan 
class=\u0022cit-name-given-names\u0022\u003EC\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EAccuracy of Large Language Models When Answering Clinical Research Questions: Systematic Review and Network Meta-Analysis. Vol. 27\u003C\/span\u003E, \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EJournal of Medical Internet Research. JMIR Publications Inc\u003C\/abbr\u003E.; \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E7.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-7-1\u0022 title=\u0022View reference 7. in text\u0022 id=\u0022ref-7\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.7\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELiu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EOkuhara\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ET\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EChang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EXY\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EShirabe\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ER\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan 
class=\u0022cit-name-surname\u0022\u003ENishiie\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EY\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EOkada\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EPerformance of ChatGPT Across Different Versions in Medical Licensing Examinations Worldwide: Systematic Review and Meta-Analysis. Vol. 26\u003C\/span\u003E, \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EJournal of Medical Internet Research. JMIR Publications Inc\u003C\/abbr\u003E.; \u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E8.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-8-1\u0022 title=\u0022View reference 8. 
in text\u0022 id=\u0022ref-8\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.8\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESnigdha\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ENT\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EBatul\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ER\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKarobari\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EMI\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAdil\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EAH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDawasaz\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EAA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHameed\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EMS\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EAssessing the Performance of ChatGPT 3.5 and ChatGPT 4 in Operative Dentistry and Endodontics: An Exploratory Study\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EHum Behav Emerg Technol\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E Jan 8;2024(\u003Cspan class=\u0022cit-vol\u0022\u003E1\u003C\/span\u003E).\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E9.\u003C\/span\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal no-rev-xref\u0022 id=\u0022cit-2025.10.14.25338040v1.9\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKinikoglu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EI\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EEvaluating ChatGPT and Google Gemini Performance and Implications in Turkish Dental Education\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ECureus\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Jan 11;\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E10.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-10-1\u0022 title=\u0022View reference 10. 
in text\u0022 id=\u0022ref-10\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.10\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJin\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EHK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELee\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EHE\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKim\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EEY\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EPerformance of ChatGPT-3.5 and GPT-4 in national licensing examinations for medicine, pharmacy, dentistry, and nursing: a systematic review and meta-analysis\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EBMC Med Educ\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E Dec 1;\u003Cspan class=\u0022cit-vol\u0022\u003E24\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E1\u003C\/span\u003E).\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E11.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-11-1\u0022 title=\u0022View reference 11. 
in text\u0022 id=\u0022ref-11\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.11\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EPazouki\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EF\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EVardar-Sengul\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EArtificial intelligence in dental education: ChatGPT\u2019s performance on the periodontic in-service examination\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EJ Periodontol\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E Jul 1;\u003Cspan class=\u0022cit-vol\u0022\u003E95\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E7\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E682\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E7\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DJ%2BPeriodontol%26rft.volume%253D95%26rft.spage%253D682%26rft_id%253Dinfo%253Apmid%252F38197146%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=38197146\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F10%2F17%2F2025.10.14.25338040.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E12.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-12-1\u0022 title=\u0022View reference 12. 
in text\u0022 id=\u0022ref-12\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.12\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EGravel\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ED\u2019Amours-Gravel\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EOsmanlliu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EE\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003ELearning to Fake It: Limited Responses and Fabricated References Provided by ChatGPT for Medical Questions\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EMayo Clinic Proceedings: Digital Health\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2023\u003C\/span\u003E Sep;\u003Cspan class=\u0022cit-vol\u0022\u003E1\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E3\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E226\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E34\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DMayo%2BClinic%2BProceedings%253A%2BDigital%2BHealth%26rft.volume%253D1%26rft.spage%253D226%26rft_id%253Dinfo%253Apmid%252F40206627%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=40206627\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F10%2F17%2F2025.10.14.25338040.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E13.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-13-1\u0022 title=\u0022View reference 13. 
in text\u0022 id=\u0022ref-13\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.13\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EChoi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EWC\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EChang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ECI\u003C\/span\u003E\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EChatGPT-5 in Education: New Capabilities and Opportunities for Teaching and Learning [Internet]\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E. Available from: \u003Ca href=\u0022https:\/\/www.preprints.org\/manuscript\/202508.0684\/v1\u0022\u003Ehttps:\/\/www.preprints.org\/manuscript\/202508.0684\/v1\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E14.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-14-1\u0022 title=\u0022View reference 14. in text\u0022 id=\u0022ref-14\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.14\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003ELLM evaluation: Metrics, frameworks, and best practices | genai-research \u2013 Weights \u0026amp; Biases [Internet]. [cited \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Oct 1]. 
Available from: \u003Ca href=\u0022https:\/\/wandb.ai\/onlineinference\/genai-research\/reports\/LLM-evaluation-Metrics-frameworks-and-best-practices--VmlldzoxMTMxNjQ4NA#evaluation-methodologies\u0022\u003Ehttps:\/\/wandb.ai\/onlineinference\/genai-research\/reports\/LLM-evaluation-Metrics-frameworks-and-best-practices--VmlldzoxMTMxNjQ4NA#evaluation-methodologies\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E15.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-15-1\u0022 title=\u0022View reference 15. in text\u0022 id=\u0022ref-15\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.15\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDinc\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EU\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESarkar\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESchubert\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EP\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ESemrau\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EWeissmann\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ET\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan 
class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EKarius\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EBenchmarking GPT-5 in Radiation Oncology: Measurable Gains, but Persistent Need for Expert Oversight\u003C\/abbr\u003E. \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Aug 29 [cited 2025 Oct 1]; Available from: \u003Ca href=\u0022https:\/\/arxiv.org\/pdf\/2508.21777\u0022\u003Ehttps:\/\/arxiv.org\/pdf\/2508.21777\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E16.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-16-1\u0022 title=\u0022View reference 16. in text\u0022 id=\u0022ref-16\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.16\u0022\u003E\u003Cdiv class=\u0022cit-metadata unstructured\u0022\u003EIntroducing GPT-5 | OpenAI [Internet]. [cited 2025 Oct 1]. Available from: \u003Ca href=\u0022https:\/\/openai.com\/index\/introducing-gpt-5\/\u0022\u003Ehttps:\/\/openai.com\/index\/introducing-gpt-5\/\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E17.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-17-1\u0022 title=\u0022View reference 17. 
in text\u0022 id=\u0022ref-17\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.17\u0022\u003E\u003Cdiv class=\u0022cit-metadata unstructured\u0022\u003EGPT-4 vs GPT-5: How a 74.9% SWE-bench Score Rewires Coding | by Lionel Owono | Aug, 2025 | AWS in Plain English [Internet]. [cited 2025 Oct 1]. Available from: \u003Ca href=\u0022https:\/\/aws.plainenglish.io\/gpt-4-vs-gpt-5-how-a-74-9-swe-bench-score-rewires-coding-fa2b08907067\u0022\u003Ehttps:\/\/aws.plainenglish.io\/gpt-4-vs-gpt-5-how-a-74-9-swe-bench-score-rewires-coding-fa2b08907067\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E18.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-18-1\u0022 title=\u0022View reference 18. in text\u0022 id=\u0022ref-18\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.18\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EFerrag\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EMA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ETihanyi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EN\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDebbah\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM.\u003C\/span\u003E\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EReasoning beyond limits: Advances and open problems for LLMs\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EICT Express\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Sep;\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E19.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-19-1\u0022 title=\u0022View reference 19. in text\u0022 id=\u0022ref-19\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.19\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003EThe Ultimate Guide to Prompt Engineering in \u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E | Lakera \u2013 Protecting AI teams that disrupt the world. [Internet]. [cited 2025 Oct 2]. Available from: \u003Ca href=\u0022https:\/\/www.lakera.ai\/blog\/prompt-engineering-guide\u0022\u003Ehttps:\/\/www.lakera.ai\/blog\/prompt-engineering-guide\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E20.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-20-1\u0022 title=\u0022View reference 20. 
in text\u0022 id=\u0022ref-20\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.20\u0022 data-doi=\u002210.1162\/tacl\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELiu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ENF\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELin\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHewitt\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EJ\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EParanjape\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EBevilacqua\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EPetroni\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EF\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003ELost in the Middle: How Language Models Use Long Contexts\u003C\/span\u003E. 
\u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EAvailable from\u003C\/abbr\u003E\u003Cspan class=\u0022cit-pub-id-sep cit-pub-id-doi-sep\u0022\u003E: \u003C\/span\u003E\u003Cspan class=\u0022cit-pub-id cit-pub-id-doi\u0022\u003E\u003Cspan class=\u0022cit-pub-id-scheme-doi\u0022\u003Edoi:\u003C\/span\u003E10.1162\/tacl\u003C\/span\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DAvailable%2Bfrom%26rft_id%253Dinfo%253Adoi%252F10.1162%252Ftacl%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=10.1162\/tacl\u0026amp;link_type=DOI\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-doi cit-ref-sprinkles-crossref\u0022\u003E\u003Cspan\u003ECrossRef\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E21.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-21-1\u0022 title=\u0022View reference 21. 
in text\u0022 id=\u0022ref-21\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.21\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EY\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EWu\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EB\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EHuang\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EY\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELuan\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003ES\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EDeveloping trustworthy artificial intelligence: insights from research on interpersonal, human-automation, and human-AI trust\u003C\/span\u003E. Vol. \u003Cspan class=\u0022cit-vol\u0022\u003E15\u003C\/span\u003E, \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EFrontiers in Psychology. Frontiers Media SA\u003C\/abbr\u003E; \u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E22.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-22-1\u0022 title=\u0022View reference 22. 
in text\u0022 id=\u0022ref-22\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.22\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ELevy\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EM\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJacoby\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EGoldberg\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EY.\u003C\/span\u003E\u003C\/span\u003E \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003ESame Task, More Tokens: the Impact of Input Length on the Reasoning Performance of Large Language Models [Internet]\u003C\/abbr\u003E. Vol. 1. Available from: \u003Ca href=\u0022https:\/\/github.com\/alonj\/Same-Task-More-Tokens\u0022\u003Ehttps:\/\/github.com\/alonj\/Same-Task-More-Tokens\u003C\/a\u003E\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E23.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-23-1\u0022 title=\u0022View reference 23. 
in text\u0022 id=\u0022ref-23\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.23\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EF\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EDanesh\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E. \u003Cspan class=\u0022cit-article-title\u0022\u003EChatGPT\u2019s risk of misinformation in dentistry\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EThe Journal of the American Dental Association\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2025\u003C\/span\u003E Jan;\u003Cspan class=\u0022cit-vol\u0022\u003E156\u003C\/span\u003E(\u003Cspan class=\u0022cit-issue\u0022\u003E1\u003C\/span\u003E):\u003Cspan class=\u0022cit-fpage\u0022\u003E3\u003C\/span\u003E\u2013\u003Cspan class=\u0022cit-lpage\u0022\u003E5\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DThe%2BJournal%2Bof%2Bthe%2BAmerican%2BDental%2BAssociation%26rft.volume%253D156%26rft.spage%253D3%26rft_id%253Dinfo%253Apmid%252F38878024%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003Ca href=\u0022\/lookup\/external-ref?access_num=38878024\u0026amp;link_type=MED\u0026amp;atom=%2Fmedrxiv%2Fearly%2F2025%2F10%2F17%2F2025.10.14.25338040.atom\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-medline\u0022\u003E\u003Cspan\u003EPubMed\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan class=\u0022ref-label\u0022\u003E24.\u003C\/span\u003E\u003Ca class=\u0022rev-xref-ref\u0022 href=\u0022#xref-ref-24-1\u0022 title=\u0022View reference 24. 
in text\u0022 id=\u0022ref-24\u0022\u003E\u21b5\u003C\/a\u003E\u003Cdiv class=\u0022cit ref-cit ref-journal\u0022 id=\u0022cit-2025.10.14.25338040v1.24\u0022\u003E\u003Cdiv class=\u0022cit-metadata\u0022\u003E\u003Ccite\u003E\u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAljamaan\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EF\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003ETemsah\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EMH\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAltamimi\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EI\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAl-Eyadhy\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EJamal\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EA\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-auth\u0022\u003E\u003Cspan class=\u0022cit-name-surname\u0022\u003EAlhasan\u003C\/span\u003E  \u003Cspan class=\u0022cit-name-given-names\u0022\u003EK\u003C\/span\u003E\u003C\/span\u003E, \u003Cspan class=\u0022cit-etal\u0022\u003Eet al.\u003C\/span\u003E \u003Cspan class=\u0022cit-article-title\u0022\u003EReference Hallucination Score for Medical Artificial Intelligence Chatbots: Development and Usability Study\u003C\/span\u003E. \u003Cabbr class=\u0022cit-jnl-abbrev\u0022\u003EJMIR Med Inform\u003C\/abbr\u003E. 
\u003Cspan class=\u0022cit-pub-date\u0022\u003E2024\u003C\/span\u003E Jul 31;\u003Cspan class=\u0022cit-vol\u0022\u003E12\u003C\/span\u003E:\u003Cspan class=\u0022cit-fpage\u0022\u003Ee54345\u003C\/span\u003E.\u003C\/cite\u003E\u003C\/div\u003E\u003Cdiv class=\u0022cit-extra\u0022\u003E\u003Ca href=\u0022{openurl}?query=rft.jtitle%253DJMIR%2BMed%2BInform%26rft.volume%253D12%26rft.spage%253D54345e%26rft.genre%253Darticle%26rft_val_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Ajournal%26ctx_ver%253DZ39.88-2004%26url_ver%253DZ39.88-2004%26url_ctx_fmt%253Dinfo%253Aofi%252Ffmt%253Akev%253Amtx%253Actx\u0022 class=\u0022cit-ref-sprinkles cit-ref-sprinkles-openurl cit-ref-sprinkles-open-url\u0022\u003E\u003Cspan\u003EOpenUrl\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/li\u003E\u003C\/ol\u003E\u003C\/div\u003E\u003Cspan class=\u0022highwire-journal-article-marker-end\u0022\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cspan class=\u0022related-urls\u0022\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E  \u003C\/div\u003E\n\n  \n  \u003C\/div\u003E\n\u003C\/div\u003E\n  \u003C\/div\u003E\n\u003C\/div\u003E\n\u003C\/div\u003E\u003Cscript type=\u0022text\/javascript\u0022 src=\u0022https:\/\/www.medrxiv.org\/sites\/default\/files\/js\/js_zP7WWIfzbyzvaM63L39cNV2juU_1XVH7wduFK9gcMNI.js\u0022\u003E\u003C\/script\u003E\n\u003C\/body\u003E\u003C\/html\u003E"}