We propose Dual Approximation Policy Optimization (DAPO), a framework that incorporates general function approximation into policy mirror descent methods. In contrast to the popular approach of using the L2-norm to measure function approximation errors, DAPO uses the dual Bregman divergence induced by the mirror map for policy projection. This duality framework has both theoretical and practical implications: not only does it achieve fast linear convergence with general function approximation, but it also includes several well-known practical methods as special cases, immediately providing strong convergence guarantees.
% arXiv preprint: the identifier lives in the eprint fields, not in a journal string.
@misc{arxiv.2410.01249,
  author        = {Xiong, Zhihan and Fazel, Maryam and Xiao, Lin},
  title         = {Dual Approximation Policy Optimization},
  year          = {2024},
  eprint        = {2410.01249},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2410.01249},
}