The FM-index provides an important solution for efficient retrieval and search in textual big data. Its variants have been widely used in many fields including information retrieval, genome analysis, and web searching. In this paper, we propose improvements via a new compressed representation of the wavelet tree of the Burrows-Wheeler transform of the input text, which incorporates the gap γ-encoding. Our theoretical analysis shows that the new index, called FM-Adaptive, achieves asymptotic space optimality within a factor of 2 in the leading term, but it has better compression and faster retrieval in practice than the competitive optimal compression boosting used in previous FM-indexes. We present a practical improved locate algorithm that provides substantially faster locating time based upon memoization, which takes advantage of the overlapping subproblems property. We design a lookup table for accelerated decoding to support fast pattern matching in a text. Extensive experiments demonstrate that FM-Adaptive provides faster query performance, often by a considerable amount, and/or comparable or better compression than other state-of-the-art FM-index methods.
% Contribution 5 (pages 5:1--5:23) to the Manzini Festschrift, OASIcs volume 131.
% Only the case-sensitive names in the title are brace-protected, so a style
% can still apply its own sentence or title casing to the remaining words.
@inproceedings{huo_et_al:OASIcs.Manzini.5,
  author    = {Huo, Hongwei and He, Zongtao and Liu, Pengfei and Vitter, Jeffrey Scott},
  title     = {{FM-Adaptive}: A Practical Data-Aware {FM-Index}},
  booktitle = {The Expanding World of Compressed Data: A Festschrift for Giovanni Manzini's 60th Birthday},
  editor    = {Ferragina, Paolo and Gagie, Travis and Navarro, Gonzalo},
  series    = {Open Access Series in Informatics (OASIcs)},
  volume    = {131},
  pages     = {5:1--5:23},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2025},
  isbn      = {978-3-95977-390-4},
  issn      = {2190-6807},
  doi       = {10.4230/OASIcs.Manzini.5},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.Manzini.5},
  urn       = {urn:nbn:de:0030-drops-239139},
  annote    = {Keywords: Text indexing, Burrows-Wheeler transform, Compressed wavelet trees, Entropy-compressed, Compressed data structures},
}
Feedback for Dagstuhl Publishing