% ICASSP 2025 conference paper: "Refiner" (compositional zero-shot learning).
% The publisher value is double-braced so list-aware processors keep it as one item
% despite the inner "and"; acronyms and the method name are braced to survive recasing.
@inproceedings{53982e34f2ca43f9985073f7de9f7d7f,
  author    = {Zhang, Xiao and Jing, Haodong and Chen, Hui and Ma, Yongqiang and Zheng, Nanning},
  title     = {{Refiner}: Fine-grained Cross-modal Concepts Refinement for Compositional {Zero-Shot} Learning},
  booktitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2025 - Proceedings},
  editor    = {Rao, Bhaskar D. and Trancoso, Isabel and Sharma, Gaurav and Mehta, Neelesh B.},
  series    = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  publisher = {{Institute of Electrical and Electronics Engineers Inc.}},
  year      = {2025},
  doi       = {10.1109/ICASSP49660.2025.10887898},
  language  = {english},
  keywords  = {Compositional Zero-shot Learning, Cross-Modal Fusion, Fine-Grained Refinement, Multimodal Models},
  abstract  = {Recent Compositional Zero-Shot Learning (CZSL) methods increasingly adopt the pre-trained vision-language models to capture the contextual relations between image and text spaces. However, the single-class-token design from Transformer-based encoder inevitably captures contextual information from unrelated objects and background, thus hindering the modeling of fine-grained class-specific visual features. Suffering from cross-modal gap, prior methods also struggle to improve compositional recognition performance. To address these issues, we propose a fine-grained cross-modal concepts refinement framework, termed as Refiner, which comprises two pivotal components: (i) the fine-grained concepts refinement of image embeddings to capture state-object context within visual scenes, and (ii) the cross-modal information fusion to mitigate the modality gap. By leveraging learnable query vectors to capture region-specific semantic information pertinent to composition labels, our approach refines visual representations with fine-grained state-object context information. As for cross-modal information fusion, we construct a robust image-to-text mapping by aligning visual embeddings with states, objects, and compositions, respectively. Extensive experiments demonstrate that our Refiner achieves new state-of-the-art performance across all popular benchmarks in both closed- and open-world settings.},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE. 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2025; Conference date: 06-04-2025 Through 11-04-2025},
}