Understanding user interface (UI) functionality is a useful yet challenging task for both machines and people. In this paper, we investigate a machine learning approach for screen correspondence, which allows reasoning about UIs by mapping their elements onto previously encountered examples with known functionality and properties. We describe and implement a model that incorporates element semantics, appearance, and text to support correspondence computation without requiring any labeled examples. Through a comprehensive performance evaluation, we show that our approach improves upon baselines by incorporating multi-modal properties of UIs. Finally, we show three example applications where screen correspondence facilitates better UI understanding for humans and machines: (i) instructional overlay generation, (ii) semantic UI element search, and (iii) automated interface testing.
@comment{arXiv preprint. The arXiv identifier is stored in eprint and archiveprefix instead of a journal field, so that styles can format and link it; the citation key is unchanged.}
@misc{arxiv.2301.08372,
  author        = {Wu, Jason and Swearngin, Amanda and Zhang, Xiaoyi and Nichols, Jeffrey and Bigham, Jeffrey P.},
  title         = {Screen Correspondence: Mapping Interchangeable Elements between {UIs}},
  year          = {2023},
  eprint        = {2301.08372},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2301.08372},
}