@article{Wu_Liu_Wang_Li_2019,
  title        = {Differential Networks for Visual Question Answering},
  author       = {Wu, Chenfei and Liu, Jinlai and Wang, Xiaojie and Li, Ruifan},
  journal      = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {33},
  number       = {01},
  pages        = {8997--9004},
  year         = {2019},
  month        = jul,
  url          = {https://ojs.aaai.org/index.php/AAAI/article/view/4930},
  DOI          = {10.1609/aaai.v33i01.33018997},
  abstractNote = {The task of \emph{Visual Question Answering} (VQA) has emerged in recent years for its potential applications. To address the VQA task, the model should fuse feature elements from both images and questions efficiently. Existing models fuse image feature element $v_i$ and question feature element $q_i$ directly, such as an element product $v_i q_i$. Those solutions largely ignore the following two key points: 1) Whether $v_i$ and $q_i$ are in the same space. 2) How to reduce the observation noises in $v_i$ and $q_i$. We argue that two differences between those two feature elements themselves, like $(v_i - v_j)$ and $(q_i - q_j)$, are more probably in the same space. And the difference operation would be beneficial to reduce observation noise. To achieve this, we first propose Differential Networks (DN), a novel plug-and-play module which enables differences between pair-wise feature elements. With the tool of DN, we then propose DN based Fusion (DF), a novel model for VQA task. We achieve state-of-the-art results on four publicly available datasets. Ablation studies also show the effectiveness of difference operations in DF model.}
}