Given a data stream 𝒟 = ⟨ a₁, a₂, …, a_m ⟩ of m elements where each a_i ∈ [n], the Distinct Elements problem is to estimate the number of distinct elements in 𝒟. Distinct Elements has been a subject of theoretical and empirical investigations over the past four decades resulting in space optimal algorithms for it. All the current state-of-the-art algorithms are, however, beyond the reach of an undergraduate textbook owing to their reliance on the usage of notions such as pairwise independence and universal hash functions. We present a simple, intuitive, sampling-based space-efficient algorithm whose description and the proof are accessible to undergraduates with the knowledge of basic probability theory.
% Citation record for the ESA 2022 paper in LIPIcs volume 244.
% pages uses the article-number form (article 34, pages 1 to 6).
% Only the problem name in the title is brace-protected, so the
% bibliography style stays free to apply its own casing to the rest.
@inproceedings{chakraborty_et_al:LIPIcs.ESA.2022.34,
  author    = {Chakraborty, Sourav and Vinodchandran, N. V. and Meel, Kuldeep S.},
  title     = {{Distinct Elements} in Streams: An Algorithm for the (Text) Book},
  booktitle = {30th Annual European Symposium on Algorithms (ESA 2022)},
  editor    = {Chechik, Shiri and Navarro, Gonzalo and Rotenberg, Eva and Herman, Grzegorz},
  series    = {Leibniz International Proceedings in Informatics (LIPIcs)},
  volume    = {244},
  pages     = {34:1--34:6},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2022},
  isbn      = {978-3-95977-247-1},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.ESA.2022.34},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.ESA.2022.34},
  urn       = {urn:nbn:de:0030-drops-169725},
  annote    = {Keywords: {$F_0$} Estimation, Streaming, Sampling},
}
Feedback for Dagstuhl Publishing