Recent advances in preference optimization have demonstrated significant potential for improving mathematical reasoning capabilities in large language models (LLMs). While current approaches leverage high-quality pairwise preference data through outcome-based criteria like answer correctness or consistency, they fundamentally neglect the internal logical coherence of responses. To overcome this limitation, we propose Probability-Consistent Preference Optimization (PCPO), a novel framework that establishes dual quantitative metrics for preference selection: (1) surface-level answer correctness and (2) intrinsic token-level probability consistency across responses. Extensive experiments show that PCPO consistently outperforms existing approaches that rely on outcome-only criteria across a diverse range of LLMs and benchmarks. Our code is publicly available at https://github.com/YunqiaoYang/PCPO.
% arXiv preprint: the identifier is stored in eprint/archiveprefix (not in a journal field); url is a fallback for styles that ignore eprint.
@misc{arxiv.2505.23540,
  author        = {Yang, Yunqiao and Ren, Houxing and Lu, Zimu and Wang, Ke and Shi, Weikang and Zhou, Aojun and Pan, Junting and Zhan, Mingjie and Li, Hongsheng},
  title         = {Probability-Consistent Preference Optimization for Enhanced {LLM} Reasoning},
  year          = {2025},
  eprint        = {2505.23540},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2505.23540},
}