The suffix array contains the lexicographical order of all suffixes of a text. It is one of the most well-studied text indices with applications in bioinformatics, compression, and pattern matching. The main bottleneck of distributed-memory suffix array construction algorithms is their memory requirements. Even careful implementations require 30×-60× the input size as working memory. We present a scalable and lightweight distributed-memory adaptation of the difference cover (DCX) suffix array construction algorithm. Our approach relies on novel bucketing and random chunk redistribution techniques which reduce our memory requirement to 20×-26× the input size for medium-sized inputs and to 14×-15× for large-sized inputs. Regarding running time, we achieve speedups of up to 5× over current state-of-the-art distributed suffix array construction algorithms.
@comment{ESA 2025 conference paper (LIPIcs volume 351, article 47) on distributed suffix array construction.}
@inproceedings{haag_et_al:LIPIcs.ESA.2025.47,
  author    = {Haag, Manuel and Kurpicz, Florian and Sanders, Peter and Schimek, Matthias},
  title     = {Fast and Lightweight Distributed Suffix Array Construction},
  booktitle = {33rd Annual European Symposium on Algorithms ({ESA} 2025)},
  editor    = {Benoit, Anne and Kaplan, Haim and Wild, Sebastian and Herman, Grzegorz},
  series    = {Leibniz International Proceedings in Informatics ({LIPIcs})},
  volume    = {351},
  pages     = {47:1--47:18},
  year      = {2025},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  isbn      = {978-3-95977-395-9},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.ESA.2025.47},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.ESA.2025.47},
  urn       = {urn:nbn:de:0030-drops-245154},
  annote    = {Keywords: Distributed Computing, Suffix Array Construction},
}
Feedback for Dagstuhl Publishing