@inproceedings{wang-etal-2025-getting,
title = "Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data",
author = "Wang, Haonan and
Huang, Minbin and
Huang, Runhui and
Hong, Lanqing and
Xu, Hang and
Hu, Tianyang and
Liang, Xiaodan and
Li, Zhenguo and
Cheng, Hong and
Kawaguchi, Kenji",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.399/",
doi = "10.18653/v1/2025.naacl-long.399",
pages = "7854--7873",
ISBN = "979-8-89176-189-6",
abstract = "Contrastive Language-Image Pre-training (CLIP) has become the standard for cross- modal image-text representation learning. Improving CLIP typically requires additional data and retraining with new loss functions, but these demands raise resource and time costs, limiting practical use. In this work, we introduce HELIP, a cost-effective strategy that improves CLIP models by exploiting challenging text-image pairs within existing datasets in continuous training. This eliminates the need for additional data or extensive retraining. Moreover, HELIP integrates effortlessly into current training pipelines with minimal code modifications, allowing for quick and seamless implementation. On comprehensive benchmarks, HELIP consistently boosts existing models. In particular, within just two epochs of training, it improves zero-shot classification accuracy on ImageNet for SLIP models pre-trained on CC3M, CC12M, and YFCC15M datasets by 3.05{\%}, 4.47{\%}, and 10.1{\%} , respectively. In addition, on fine-grained classification datasets, HELIP improves the zero-shot performance of CLIP and SLIP by an average of 8.4{\%} and 18.6{\%}, and their linear probe performance by an average of 9.5{\%} and 3.0{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-getting">
<titleInfo>
<title>Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haonan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minbin</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Runhui</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lanqing</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hang</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyang</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaodan</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenguo</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenji</namePart>
<namePart type="family">Kawaguchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Contrastive Language-Image Pre-training (CLIP) has become the standard for cross-modal image-text representation learning. Improving CLIP typically requires additional data and retraining with new loss functions, but these demands raise resource and time costs, limiting practical use. In this work, we introduce HELIP, a cost-effective strategy that improves CLIP models by exploiting challenging text-image pairs within existing datasets in continuous training. This eliminates the need for additional data or extensive retraining. Moreover, HELIP integrates effortlessly into current training pipelines with minimal code modifications, allowing for quick and seamless implementation. On comprehensive benchmarks, HELIP consistently boosts existing models. In particular, within just two epochs of training, it improves zero-shot classification accuracy on ImageNet for SLIP models pre-trained on CC3M, CC12M, and YFCC15M datasets by 3.05%, 4.47%, and 10.1%, respectively. In addition, on fine-grained classification datasets, HELIP improves the zero-shot performance of CLIP and SLIP by an average of 8.4% and 18.6%, and their linear probe performance by an average of 9.5% and 3.0%.</abstract>
<identifier type="citekey">wang-etal-2025-getting</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.399</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.399/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>7854</start>
<end>7873</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data
%A Wang, Haonan
%A Huang, Minbin
%A Huang, Runhui
%A Hong, Lanqing
%A Xu, Hang
%A Hu, Tianyang
%A Liang, Xiaodan
%A Li, Zhenguo
%A Cheng, Hong
%A Kawaguchi, Kenji
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F wang-etal-2025-getting
%X Contrastive Language-Image Pre-training (CLIP) has become the standard for cross-modal image-text representation learning. Improving CLIP typically requires additional data and retraining with new loss functions, but these demands raise resource and time costs, limiting practical use. In this work, we introduce HELIP, a cost-effective strategy that improves CLIP models by exploiting challenging text-image pairs within existing datasets in continuous training. This eliminates the need for additional data or extensive retraining. Moreover, HELIP integrates effortlessly into current training pipelines with minimal code modifications, allowing for quick and seamless implementation. On comprehensive benchmarks, HELIP consistently boosts existing models. In particular, within just two epochs of training, it improves zero-shot classification accuracy on ImageNet for SLIP models pre-trained on CC3M, CC12M, and YFCC15M datasets by 3.05%, 4.47%, and 10.1%, respectively. In addition, on fine-grained classification datasets, HELIP improves the zero-shot performance of CLIP and SLIP by an average of 8.4% and 18.6%, and their linear probe performance by an average of 9.5% and 3.0%.
%R 10.18653/v1/2025.naacl-long.399
%U https://aclanthology.org/2025.naacl-long.399/
%U https://doi.org/10.18653/v1/2025.naacl-long.399
%P 7854-7873
Markdown (Informal)
[Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data](https://aclanthology.org/2025.naacl-long.399/) (Wang et al., NAACL 2025)
ACL
Haonan Wang, Minbin Huang, Runhui Huang, Lanqing Hong, Hang Xu, Tianyang Hu, Xiaodan Liang, Zhenguo Li, Hong Cheng, and Kenji Kawaguchi. 2025. Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7854–7873, Albuquerque, New Mexico. Association for Computational Linguistics.
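
The abstract describes HELIP only at a high level: mine "challenging" text-image pairs from the existing training set and reuse them during continued contrastive training, with no extra data. As a rough illustration of that idea, here is a minimal PyTorch sketch. The joint pair-similarity heuristic, the function names, the in-batch (rather than whole-dataset) mining, and the loss (standard symmetric InfoNCE with mined pairs appended as extra negatives) are all assumptions made for this sketch, not the paper's exact method.

```python
import torch
import torch.nn.functional as F

def mine_hard_pairs(img_feats: torch.Tensor, txt_feats: torch.Tensor, k: int = 5) -> torch.Tensor:
    """For each pair i, return indices of the k pairs whose image AND text
    embeddings are both close to pair i's -- one simple proxy for the
    'challenging' pairs the abstract mentions (an assumption, not the
    paper's definition)."""
    img = F.normalize(img_feats, dim=-1)
    txt = F.normalize(txt_feats, dim=-1)
    # Joint similarity is high only when both modalities agree.
    sim = (img @ img.T) * (txt @ txt.T)
    sim.fill_diagonal_(float("-inf"))   # a pair is not its own hard pair
    return sim.topk(k, dim=-1).indices  # shape (N, k)

def clip_loss_with_hard_negatives(img_feats, txt_feats, hard_img, hard_txt,
                                  temperature: float = 0.07) -> torch.Tensor:
    """Symmetric InfoNCE over the batch, with mined hard pairs appended as
    extra in-batch negatives (a hypothetical variant, not the paper's loss)."""
    img = F.normalize(torch.cat([img_feats, hard_img]), dim=-1)
    txt = F.normalize(torch.cat([txt_feats, hard_txt]), dim=-1)
    logits = img @ txt.T / temperature
    n = img_feats.size(0)
    targets = torch.arange(n, device=logits.device)
    # Only the original n pairs act as anchors; hard pairs only add negatives.
    loss_i2t = F.cross_entropy(logits[:n], targets)
    loss_t2i = F.cross_entropy(logits.t()[:n], targets)
    return 0.5 * (loss_i2t + loss_t2i)

if __name__ == "__main__":
    n, d = 32, 512  # toy batch of CLIP-style embeddings
    img, txt = torch.randn(n, d), torch.randn(n, d)
    flat = mine_hard_pairs(img, txt, k=4).reshape(-1)
    print(clip_loss_with_hard_negatives(img, txt, img[flat], txt[flat]).item())
```

In this sketch the hard pairs are mined within a single batch for simplicity; per the abstract, HELIP mines them from the full existing dataset, so a faithful implementation would precompute embeddings over the whole training set and look up each sample's hard pairs during continued training.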