@article{Xuan_Zhang_Chen_Yang_Yan_2020,
  title        = {Cross-Modal Attention Network for Temporal Inconsistent Audio-Visual Event Localization},
  author       = {Xuan, Hanyu and Zhang, Zhenyu and Chen, Shuo and Yang, Jian and Yan, Yan},
  journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {34},
  number       = {01},
  pages        = {279--286},
  year         = {2020},
  month        = {Apr.},
  url          = {https://ojs.aaai.org/index.php/AAAI/article/view/5361},
  doi          = {10.1609/aaai.v34i01.5361},
  abstractNote = {In human multi-modality perception systems, the benefits of integrating auditory and visual information are extensive, as they provide plentiful supplementary cues for understanding events. Despite some recent methods proposed for this task, they cannot handle practical conditions with temporal inconsistency. Inspired by the human system, which places different focus on specific locations, time segments, and media while performing multi-modality perception, we propose an attention-based method to simulate this process. Similar to the human mechanism, our network can adaptively select "where" to attend, "when" to attend, and "which" to attend for audio-visual event localization. In this way, even with large temporal inconsistency between vision and audio, our network is able to adaptively trade information between different modalities and successfully achieve event localization. Our method achieves state-of-the-art performance on the AVE (Audio-Visual Event) dataset collected from real life. In addition, we systematically investigate audio-visual event localization tasks. The visualization results also help us better understand how our model works.},
}