@inproceedings{67e4088b75674ca89cb04535677f37f0,
title = "{LVLM-FDA}: Protecting Large Vision-Language Models via Fast Detection of Malicious Attempts",
abstract = "Despite the impressive advancements of large vision-language models (LVLMs) in image understanding and reasoning, their susceptibility to safety risks—such as jailbreak attacks—remains a significant challenge for their real-world applications. To address this, we propose a fast yet safe protecting approach, named LVLM-FDA, which detects malicious attempts in inputs by leveraging the internal representations of LVLMs. By examining the representations across different attention heads, we aim to identify the most discriminative malicious features that can be distinguished from benign ones with high generalization accuracy. Therefore, we introduce a metric called separation probability, which provides a lower bound on the generalization accuracy of a classifier tasked with binary classification of malicious features. We can build a detector that identifies potentially harmful content in outputs by selecting the attention heads that generate the representations with the highest separation probability between the malicious and benign inputs. This detector can be seamlessly integrated into the generation process with minimal computational overhead during inference, offering a strong harmful response detector for modern LVLMs. It can be further applied to add an identification prompt to mitigate the safety risks further. Our experiments on various prompt-based attacks show that our method reduces inference time by at least 15\% while achieving a better defense performance compared to existing methods, as well as keep the general ability of LVLMs, demonstrating the effectiveness and efficiency of our approach in securing LVLMs. The code for our method is available at https://github.com/Chen-Boxu/LVLM-FDA.",
keywords = "AI security, Large vision-language models, LVLM safety, Separation probability",
author = "Boxu Chen and Chaoyi Wang and Le Yang and Ziwei Zheng and Cong Wang and Qian Wang and Chao Shen",
note = "Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2026.; 18th International Conference on Knowledge Science, Engineering and Management, KSEM 2025; Conference date: 04-08-2025 Through 07-08-2025",
year = "2026",
doi = "10.1007/978-981-95-3001-4_2",
language = "English",
isbn = "9789819530007",
series = "Lecture Notes in Computer Science",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "17--34",
editor = "Tianqing Zhu and Wanlei Zhou and Congcong Zhu",
booktitle = "Knowledge Science, Engineering and Management - 18th International Conference, KSEM 2025, Proceedings",
}