Given n points in 𝓁_p^d, we consider the problem of partitioning points into k clusters with associated centers. The cost of a clustering is the sum of p-th powers of distances of points to their cluster centers. For p ∈ [1,2], we design sketches of size poly(log(nd),k,1/ε) such that the cost of the optimal clustering can be estimated to within factor 1+ε, despite the fact that the compressed representation does not contain enough information to recover the cluster centers or the partition into clusters. This leads to a streaming algorithm for estimating the clustering cost with space poly(log(nd),k,1/ε). We also obtain a distributed memory algorithm, where the n points are arbitrarily partitioned amongst m machines, each of which sends information to a central party who then computes an approximation of the clustering cost. Prior to this work, no such streaming or distributed-memory algorithm was known with sublinear dependence on d for p ∈ [1,2).
@InProceedings{charikar_et_al:LIPIcs.ICALP.2022.38, author = {Charikar, Moses and Waingarten, Erik}, title = {{Polylogarithmic Sketches for Clustering}}, booktitle = {49th International Colloquium on Automata, Languages, and Programming (ICALP 2022)}, pages = {38:1--38:20}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-235-8}, ISSN = {1868-8969}, year = {2022}, volume = {229}, editor = {Boja\'{n}czyk, Miko{\l}aj and Merelli, Emanuela and Woodruff, David P.}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.ICALP.2022.38}, URN = {urn:nbn:de:0030-drops-163793}, doi = {10.4230/LIPIcs.ICALP.2022.38}, annote = {Keywords: sketching, clustering} }
Feedback for Dagstuhl Publishing