Recently, researchers have gradually realized that, in some cases, self-supervised pre-training on large-scale Internet data works better than training on high-quality/manually labeled datasets, and that multimodal/large models perform better than single- or bimodal/small models. In this paper, we propose WavBriVL, a robust audio representation learning method based on Bridging-Vision-and-Language (BriVL). WavBriVL projects audio, images, and text into a shared embedding space, so that multimodal applications can be realized. We present a qualitative evaluation of images generated from WavBriVL's shared embedding space, with two main purposes: (1) learning the correlation between audio and images; and (2) exploring a new way of image generation, namely using audio to generate pictures. Experimental results show that this method can effectively generate appropriate images from audio.
@comment{arXiv preprint: the identifier is stored in the eprint/archiveprefix fields rather than in a journal field.}
@misc{arxiv.2303.04585,
  author        = {Fang, Sen and Wu, Yangjian and Gao, Bowen and Cai, Jingwen and Teoh, Teik Toe},
  title         = {Exploring Efficient-Tuned Learning Audio Representation Method from {BriVL}},
  year          = {2023},
  eprint        = {2303.04585},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2303.04585},
  url           = {https://arxiv.org/abs/2303.04585},
}