Attributing the output of a neural network to the contribution of given input elements is a way of shedding light on the black-box nature of neural networks. Due to the complexity of modern network architectures, current gradient-based attribution methods provide very noisy or coarse results. We propose to prune a neural network for a single given input to keep only the neurons that highly contribute to the prediction. We show that by input-specific pruning, network gradients change from reflecting local (noisy) importance information to reflecting global importance. Our proposed method is efficient and generates fine-grained attribution maps. We further provide a theoretical justification of the pruning approach, relating it to perturbations, and validate it through a novel experimental setup. Our method is evaluated on multiple benchmarks: sanity checks, pixel perturbation, and Remove-and-Retrain (ROAR). These benchmarks evaluate the method from different perspectives, and our method performs better than other methods across all evaluations.
% arXiv preprint: the identifier is stored in the eprint fields instead of a made-up journal name.
% NOTE(review): year is kept as 2020, but the identifier prefix 1911 points to a November 2019 first submission; confirm which version is being cited.
@misc{arxiv.1911.11081,
  author        = {Khakzar, Ashkan and Baselizadeh, Soroosh and Khanduja, Saurabh and Rupprecht, Christian and Kim, Seong Tae and Navab, Nassir},
  title         = {Improving Feature Attribution through Input-specific Network Pruning},
  year          = {2020},
  eprint        = {1911.11081},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1911.11081},
}