We consider document listing on string collections, that is, finding in which strings a given pattern appears. In particular, we focus on repetitive collections: a collection of size $N$ over alphabet $[1,a]$ is composed of $D$ copies of a string of size $n$, and $s$ single-character edits are applied on the copies. We introduce the first document listing index with size $\tilde{O}(n + s)$, precisely $O((n \lg a + s \lg^2 N) \lg D)$ bits, and with useful worst-case time guarantees: given a pattern of length $m$, the index reports the $\mathit{ndoc}$ strings where it appears in time $O(m^2 + m \lg N \, (\lg D + \lg^{e} N) \, \mathit{ndoc})$, for any constant $e > 0$.
% Conference paper in the CPM 2017 proceedings (LIPIcs volume 78, article 4).
@inproceedings{navarro:LIPIcs.CPM.2017.4,
  author    = {Navarro, Gonzalo},
  title     = {Document Listing on Repetitive Collections with Guaranteed Performance},
  booktitle = {28th Annual Symposium on Combinatorial Pattern Matching ({CPM} 2017)},
  editor    = {K{\"a}rkk{\"a}inen, Juha and Radoszewski, Jakub and Rytter, Wojciech},
  series    = {Leibniz International Proceedings in Informatics (LIPIcs)},
  volume    = {78},
  pages     = {4:1--4:13},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2017},
  isbn      = {978-3-95977-039-2},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.CPM.2017.4},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.CPM.2017.4},
  urn       = {urn:nbn:de:0030-drops-73268},
  annote    = {Keywords: repetitive string collections, document listing, grammar compression, range minimum queries, succinct data structures},
}
Feedback for Dagstuhl Publishing