% Conference paper: dual-path modality fusion for video grounding (ICMEW 2024).
% - "Animal Kingdom" is brace-protected in the title so sentence-casing styles keep its capitals.
% - No series field: the exported value was a verbatim copy of booktitle.
% - NOTE(review): address holds a country, not the publisher's city -- confirm and replace.
@inproceedings{dff07aae726e485eba725573ce601ea8,
  author    = {Xiong, Chengpeng and Chen, Zhengxuan and Long, Nuoer and Un, {Kin Seong} and Li, Zhuolin and Chen, Shaobin and Tan, Tao and Lam, {Chan Tong} and Sun, Yue},
  title     = {Enhancing Video Grounding with Dual-Path Modality Fusion on {Animal Kingdom} Datasets},
  booktitle = {2024 {IEEE} International Conference on Multimedia and Expo Workshops, {ICMEW} 2024},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2024},
  doi       = {10.1109/ICMEW63481.2024.10645365},
  language  = {English},
  keywords  = {Animal Kingdom Dataset, Modality Fusion, Video Grounding},
  abstract  = {Video grounding, which involves aligning spoken language descriptions with corresponding video segments, plays a critical role in advancing multimedia content understanding. Despite progress enabled by deep learning in multi-modal learning, this task faces significant challenges within complex datasets such as the Animal Kingdom, which features diverse and intricate natural scenes. Motivated by the need to enhance cross-modal alignment and achieve robust localization, this study introduces a refined approach based on the Uni-VTG model. We enhance the model through the integration of dual-path modality fusion and a sophisticated multi-modal encoder. This method employs a dual-path mechanism to effectively fuse modalities and an advanced training strategy tailored for the complex requirements of the Animal Kingdom dataset. The evaluation on this dataset shows significant improvements in accuracy and robustness, as well as an enhanced mean Intersection over Union (IoU), validating the effectiveness of our approach in navigating the complexities of natural environment video grounding.},
  note      = {Publisher Copyright: {\textcopyright} 2024 IEEE. Conference date: 15-07-2024 through 19-07-2024},
}