We address the problem of mean estimation in very high dimensions, in the high probability regime parameterized by failure probability δ. For a distribution with covariance Σ, let its "effective dimension" be d_eff = {Tr(Σ)}/{λ_{max}(Σ)}. For the regime where d_eff = ω(log^2 (1/δ)), we show the first algorithm whose sample complexity is optimal to within 1+o(1) factor. The algorithm has a surprisingly simple structure: 1) re-center the samples using a known sub-Gaussian estimator, 2) carefully choose an easy-to-compute positive integer t and then remove the t samples farthest from the origin and 3) return the sample mean of the remaining samples. The core of the analysis relies on a novel vector Bernstein-type tail bound, showing that under general conditions, the sample mean of a bounded high-dimensional distribution is highly concentrated around a spherical shell.
@comment{Conference paper from ITCS 2022, published in LIPIcs volume 215 as article 98 (pages 98:1--98:21). Only the proper-noun compound in the title is brace-protected so that bibliography styles can still apply their own casing.}
@inproceedings{lee_et_al:LIPIcs.ITCS.2022.98,
  author    = {Lee, Jasper C. H. and Valiant, Paul},
  title     = {Optimal {Sub-Gaussian} Mean Estimation in Very High Dimensions},
  booktitle = {13th Innovations in Theoretical Computer Science Conference ({ITCS} 2022)},
  editor    = {Braverman, Mark},
  series    = {Leibniz International Proceedings in Informatics ({LIPIcs})},
  volume    = {215},
  pages     = {98:1--98:21},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2022},
  isbn      = {978-3-95977-217-4},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.ITCS.2022.98},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.ITCS.2022.98},
  urn       = {urn:nbn:de:0030-drops-156942},
  annote    = {Keywords: High-dimensional mean estimation},
}
Feedback for Dagstuhl Publishing