% Conference paper "RefineNet" in the MICCAI 2025 proceedings (Lecture Notes in
% Computer Science). Cleaned up from a repository export: names are stored in
% "Last, First" form without escaped braces, the DOI is stored bare, and
% case-sensitive words in the titles are brace-protected.
% NOTE(review): address holds a country rather than the publisher's city, and
% year is 2026 although the conference took place in 2025 - confirm both
% against the publisher's record before relying on them.
@inproceedings{04c222f9656f4fefa7772deca6acdcc0,
  author    = {Zhang, Ningyi and Gao, Yuan and Wang, Xin and Chan, Ka Hou and Wu, Jian and Lam, Chan Tong and Wang, Shanshan and Sun, Yue and Im, Sio Kei and Tan, Tao},
  title     = {{RefineNet}: Elevating Medical Foundation Models Through Quality-Centric Data Curation by {MLLM}-Annotated Proxy Distillation},
  editor    = {Gee, James C. and Hong, Jaesung and Sudre, Carole H. and Golland, Polina and Park, Jinah and Alexander, Daniel C. and Iglesias, Juan Eugenio and Venkataraman, Archana and Kim, Jong Hyo},
  booktitle = {Medical Image Computing and Computer Assisted Intervention, {MICCAI} 2025 - 28th International Conference, 2025, Proceedings},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2026},
  pages     = {498--508},
  doi       = {10.1007/978-3-032-05141-7_48},
  isbn      = {9783032051400},
  language  = {English},
  keywords  = {Medical data curation, foundation models, multimodal learning, quality assessment},
  abstract  = {The rapid advancement of medical foundation models creates unprecedented demand for large-scale training data, yet existing medical repositories remain contaminated by heterogeneous mixtures of high- and low-quality image-text pairs---a severe data pollution problem that significantly bottlenecks model performance and optimization. While manual curation could theoretically ensure quality, it is impractical for managing large-scale datasets effectively. To address this critical challenge, we introduce RefineNet---a scalable framework that systematically refines data quality by distilling multimodal large language model (MLLM) insights into an offline reward model. RefineNet innovatively decouples human decision-making for quality assessment into two key dimensions: image-text fidelity and semantic consistency. By strategically filtering and curating datasets, RefineNet demonstrates remarkable performance improvements across diagnostic tasks. Specifically, our method selects 50\% high-quality data subsets that outperform full-data baselines by 9.15\% in Recall@10 (retrieval), 85.59 AUC (classification), and 72.59\% accuracy (visual question answering). Moreover, RefineNet achieves notable agreement with human expert judgments (Pearson's r = 0.67), providing clinicians an auditable bridge between automated curation and validation.},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2026. 28th International Conference on Medical Image Computing and Computer Assisted Intervention, MICCAI 2025. Conference date: 23-09-2025 through 27-09-2025.},
}