This paper investigates the size in bits of the LZ77 encoding, which is the most popular and efficient variant of the Lempel-Ziv encodings used in data compression. We prove that, for a wide natural class of variable-length encoders for LZ77 phrases, the size of the greedily constructed LZ77 encoding on constant alphabets is within a factor O(log n / log log log n) of the optimal LZ77 encoding, where n is the length of the processed string. We describe a series of examples showing that, surprisingly, this bound is tight, thus improving both the previously known upper and lower bounds. Further, we obtain a more detailed bound O(min{z, log n / log log z}), which uses the number z of phrases in the greedy LZ77 encoding as a parameter, and construct a series of examples showing that this bound is tight even for binary alphabet. We then investigate the problem on non-constant alphabets: we show that the known O(log n) bound is tight even for alphabets of logarithmic size, and provide tight bounds for some other important cases.
@InProceedings{kosolobov:LIPIcs.STACS.2018.46, author = {Kosolobov, Dmitry}, title = {{Relations Between Greedy and Bit-Optimal LZ77 Encodings}}, booktitle = {35th Symposium on Theoretical Aspects of Computer Science (STACS 2018)}, pages = {46:1--46:14}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-062-0}, ISSN = {1868-8969}, year = {2018}, volume = {96}, editor = {Niedermeier, Rolf and Vall\'{e}e, Brigitte}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.STACS.2018.46}, URN = {urn:nbn:de:0030-drops-84894}, doi = {10.4230/LIPIcs.STACS.2018.46}, annote = {Keywords: Lempel-Ziv, LZ77 encoding, greedy LZ77, bit optimal LZ77} }
Feedback for Dagstuhl Publishing