Automatically translating images to texts involves image scene understanding and language modeling. In this paper, we propose a novel model, termed RefineCap, that refines the output vocabulary of the language decoder using decoder-guided visual semantics, and implicitly learns the mapping between visual tag words and images. The proposed Visual-Concept Refinement method can allow the generator to attend to semantic details in the image, thereby generating more semantically descriptive captions. Our model achieves superior performance on the MS-COCO dataset in comparison with previous visual-concept based models.
% RefineCap (Chai, Jin, Xing) -- arXiv preprint, identifier 2109.03529.
% Stored as a misc entry with eprint fields rather than a made-up journal
% name, so each bibliography style formats the arXiv identifier itself.
% The product name in the title is braced to survive sentence-casing styles.
@misc{arxiv.2109.03529,
  author        = {Chai, Yekun and Jin, Shuo and Xing, Junliang},
  title         = {{RefineCap}: Concept-Aware Refinement for Image Captioning},
  year          = {2021},
  eprint        = {2109.03529},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2109.03529},
}