Robust estimation of the essential matrix, which encodes the relative position and orientation of two cameras, is a fundamental step in structure-from-motion pipelines. Recent deep-learning-based methods have achieved accurate estimation by using complex network architectures that involve graphs, attention layers, and hard pruning steps. Here, we propose a simpler network architecture based on Deep Sets. Given a collection of point matches extracted from two images, our method identifies outlier point matches and models the displacement noise in inlier matches. A weighted DLT module uses these predictions to regress the essential matrix. Our network achieves accurate recovery that is superior to that of existing networks with significantly more complex architectures.
% arXiv preprint: the identifier is stored in the eprint fields rather than in a journal field.
@misc{arxiv.2406.17414,
  author        = {Moran, Dror and Margalit, Yuval and Trostianetsky, Guy and Khatib, Fadi and Galun, Meirav and Basri, Ronen},
  title         = {Consensus Learning with {Deep Sets} for Essential Matrix Estimation},
  year          = {2024},
  eprint        = {2406.17414},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2406.17414},
}