Creative Commons Attribution 3.0 Unported license
We show that a wide class of dictionary compression methods (including LZ77, LZ78, grammar compressors as well as parsing-based structures) require |S|H_k(S) + Omega (|S|k log sigma/log_sigma |S|) bits to encode their output. This matches known upper bounds and improves the information-theoretic lower bound of |S|H_k(S). To this end, we abstract the crucial properties of parsings created by those methods, construct a certain family of strings and analyze the parsings of those strings. We also show that for k = alpha log_sigma |S|, where 0 < alpha < 1 is a constant, the aforementioned methods produce an output of size at least 1/(1-alpha)|S|H_k(S) bits. Thus our results separate dictionary compressors from context-based one (such as PPM) and BWT-based ones, as the those include methods achieving |S|H_k(S) + O(sigma^k log sigma) bits, i.e. the redundancy depends on k and sigma but not on |S|.
@InProceedings{ganczorz:LIPIcs.CPM.2019.11,
author = {Ga\'{n}czorz, Micha{\l}},
title = {{Entropy Lower Bounds for Dictionary Compression}},
booktitle = {30th Annual Symposium on Combinatorial Pattern Matching (CPM 2019)},
pages = {11:1--11:18},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
ISBN = {978-3-95977-103-0},
ISSN = {1868-8969},
year = {2019},
volume = {128},
editor = {Pisanti, Nadia and P. Pissis, Solon},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
address = {Dagstuhl, Germany},
URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.CPM.2019.11},
URN = {urn:nbn:de:0030-drops-104822},
doi = {10.4230/LIPIcs.CPM.2019.11},
annote = {Keywords: compression, empirical entropy, parsing, lower bounds}
}