Creative Commons Attribution 4.0 International license
The LZ77 [Lempel and Ziv, 1977] compression scheme is ubiquitous: it lies at the core of everyday general-purpose standard compressors such as gzip or zstd, but also behind the scenes of many applications such as the compression of payloads transmitted in networks. Computing the exact LZ77 parsing is largely solved in theory: it can be done in sublinear time and space, in compressed space and in external memory, to name but a few scenarios. However, these approaches are often impractical for everyday use due to their intensive time or space requirements. Standard compressors tackle this issue by introducing heuristics that go hand in hand with sophisticated encoding schemes to achieve very good compression fast and in small space; however, they only have a local view (e.g., a sliding window) on the input, potentially missing out on long-range repetitions that may be located far apart from one another. In this work, we design and implement - in C++ and leveraging shared-memory parallelism - compression pipelines that first precompress the input using an approximate LZ77 parsing taking care of long-range repetitions. This then serves as an assist to standard compressors for producing a succinct encoding of the remaining short and local repetitions. Similar approaches have been considered by [Kosolobov et al., 2020] and [Nalbach, 2024], respectively using Relative Lempel-Ziv [Kuruppu et al., 2010] or the string synchronizing set [Kempa & Kociumaka, 2019]. We fill a gap by taking the route via the prefix-free parsing [Boucher et al., 2019], using an intermediate result of [Hong et al., 2023]. On large repetitive inputs of tens of gigabytes, our pipelines are orders of magnitude faster than the state of the art for computing the exact LZ77 parsing, use less space than the input size and still - despite producing more phrases - achieve the best overall compression in comparison to related work.
@comment{Conference paper in the SEA 2026 proceedings (LIPIcs volume 371). Only the acronym in the title is brace-protected so styles can still apply their own casing.}
@inproceedings{dinklage:LIPIcs.SEA.2026.16,
  author    = {Dinklage, Patrick},
  title     = {Efficient Large-Scale Text Precompression via Approximate {LZ77} Parsings},
  booktitle = {24th International Symposium on Experimental Algorithms (SEA 2026)},
  editor    = {Aum{\"u}ller, Martin and Finocchi, Irene},
  series    = {Leibniz International Proceedings in Informatics (LIPIcs)},
  volume    = {371},
  pages     = {16:1--16:20},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2026},
  isbn      = {978-3-95977-422-2},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.SEA.2026.16},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.SEA.2026.16},
  urn       = {urn:nbn:de:0030-drops-260204},
  annote    = {Keywords: compression, algorithm engineering, parallel computation},
}
archived version