In this work, we present an alternative approach to making an agent compositional through the use of a diagnostic classifier. Because of the need for explainable agents in automated decision processes, we attempt to interpret the latent space of an RL agent to identify its current objective in a complex language instruction. Results show that the classification process causes changes in the hidden states that make them more easily interpretable, but also causes a shift in zero-shot performance on novel instructions. Lastly, we limit the supervisory signal on the classification, and observe a similar but less notable effect.
% arXiv preprint: the identifier is stored in eprint/archiveprefix, not in a journal field.
@misc{arxiv.2001.04418,
  author        = {van der Meer, Michiel and Pirotta, Matteo and Bruni, Elia},
  title         = {Exploiting Language Instructions for Interpretable and Compositional Reinforcement Learning},
  year          = {2020},
  eprint        = {2001.04418},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2001.04418},
}