In this paper, we investigate potential biases in datasets used to make drug binding predictions using machine learning. We examine a recently published metric, the Asymmetric Validation Embedding (AVE) bias, which is used to quantify such bias and to detect overfitting. We compare it to a slightly revised version and introduce a new weighted metric. We find that the new metrics make it possible to quantify overfitting without overly limiting the training data, and that they produce models with greater predictive value.
% arXiv preprint. The identifier is stored in the structured eprint fields so
% eprint-aware styles and tools can link it; `journal` is kept unchanged so
% classic styles that ignore eprint fields still print the identifier.
@article{arxiv.2001.03207,
  author        = {Davis, Brian and Mcloughlin, Kevin and Allen, Jonathan and Ellingson, Sally},
  title         = {Split Optimization for Protein/Ligand Binding Models},
  journal       = {arXiv preprint arXiv:2001.03207},
  year          = {2020},
  eprint        = {2001.03207},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2001.03207},
}