@inbook{2c2f2ef691d640869af59f42fb746049,
title = "An Investigation of CNN-CARU for Image Captioning",
abstract = "The goal of image description is to extract essential information from an image and produce a description of its content. Such a description can be obtained directly from a human-understandable description of an image of interest (a retrieval-based approach describing the object(s) and their actions) or generated by an encoder–decoder neural network. The challenge for the learning model is that it must project the media feature into natural language, producing the description in a different feature domain, and it may therefore suffer from misidentification of scene or semantic elements. In this chapter, we address these challenges by introducing a novel image captioning framework that combines generation and retrieval. A CNN-CARU model is introduced, in which the image is first encoded by a CNN-based network, and multiple captions are then generated for the target image by a CARU-based RNN decoder.",
keywords = "CARU, CNN, Encoder–decoder network, Image captioning, NLP",
author = "Im, {Sio Kei} and Chan, {Ka Hou}",
note = "Publisher Copyright: {\textcopyright} 2024, The Author(s), under exclusive license to Springer Nature Switzerland AG.",
year = "2024",
doi = "10.1007/978-3-031-36670-3_2",
language = "English",
series = "Signals and Communication Technology",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "15--23",
booktitle = "Signals and Communication Technology",
address = "Germany",
}