Creative Commons Attribution 3.0 Unported license
We revisit the fundamental problem of dictionary look-up with mismatches. Given a set (dictionary) of d strings of length m and an integer k, we must preprocess it into a data structure to answer the following queries: Given a query string Q of length m, find all strings in the dictionary that are at Hamming distance at most k from Q. Chan and Lewenstein (CPM 2015) showed a data structure for k = 1 with optimal query time O(m/w + occ), where w is the size of a machine word and occ is the size of the output. The data structure occupies O(w d log^{1+epsilon} d) extra bits of space (beyond the entropy-bounded space required to store the dictionary strings). In this work we give a solution with similar bounds for a much wider range of values k. Namely, we give a data structure that has O(m/w + log^k d + occ) query time and uses O(w d log^k d) extra bits of space.
@inproceedings{gawrychowski_et_al:LIPIcs.MFCS.2018.66,
  author    = {Gawrychowski, Pawel and Landau, Gad M. and Starikovskaya, Tatiana},
  title     = {Fast Entropy-Bounded String Dictionary Look-Up with Mismatches},
  booktitle = {43rd International Symposium on Mathematical Foundations of Computer Science (MFCS 2018)},
  editor    = {Potapov, Igor and Spirakis, Paul and Worrell, James},
  series    = {Leibniz International Proceedings in Informatics (LIPIcs)},
  volume    = {117},
  pages     = {66:1--66:15},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2018},
  isbn      = {978-3-95977-086-6},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.MFCS.2018.66},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.MFCS.2018.66},
  urn       = {urn:nbn:de:0030-drops-96486},
  annote    = {Keywords: Dictionary look-up, Hamming distance, compact data structures}
}