% Conference paper: CMFG, published in the MMM 2023 proceedings (Lecture Notes in Computer Science).
% The citation key is kept exactly as it was so existing citations of this entry still resolve.
% The DOI is stored bare (no LaTeX escaping); url-aware styles typeset the underscore themselves.
% NOTE(review): the LNCS volume number is not recorded in this entry; add a volume field once confirmed.
@inproceedings{07f2fbc161de46f7a6034b29095c5220,
  author    = "Zhao, Shengwei and Liu, Yuying and Du, Shaoyi and Tian, Zhiqiang and Qu, Ting and Xu, Linhai",
  title     = "{CMFG}: Cross-Model Fine-Grained Feature Interaction for Text-Video Retrieval",
  booktitle = "MultiMedia Modeling - 29th International Conference, MMM 2023, Proceedings",
  editor    = "Dang-Nguyen, Duc-Tien and Gurrin, Cathal and Smeaton, Alan F. and Larson, Martha and Rudinac, Stevan and Dao, Minh-Son and Trattner, Christoph and Chen, Phoebe",
  series    = "Lecture Notes in Computer Science",
  publisher = "Springer Science and Business Media Deutschland GmbH",
  year      = "2023",
  pages     = "435--445",
  doi       = "10.1007/978-3-031-27818-1_36",
  isbn      = "9783031278174",
  language  = "english",
  keywords  = "Cross-model, Fine-grained, Text-video retrieval",
  note      = "Publisher Copyright: {\textcopyright} 2023, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 29th International Conference on MultiMedia Modeling, MMM 2023 ; Conference date: 09-01-2023 Through 12-01-2023",
  abstract  = "As a fundamental task in the multimodal domain, text-to-video retrieval task has received great attention in recent years. Most of the current research focuses on the interaction between cross-modal coarse-grained features. However, the feature granularity of retrieval models has not been fully explored. Therefore, we introduce video internal region information into cross-modal retrieval and propose a cross-model fine-grained feature retrieval framework. Videos are represented as video-frame-region triple features, texts are represented as sentence-word dual features, and the cross-similarity between visual features and text features is computed through token-wise interaction. It effectively extracts the detailed information in the video, guides the model to pay attention to the effective video region information and keyword information in the sentence, and reduces the adverse effects of redundant words and interfering frames. On the most popular retrieval dataset MSRVTT, the framework achieves state-of-the-art results (51.1@1). Excellent experimental results demonstrate the superiority of fine-grained feature interaction.",
}