FM-indexes are a crucial data structure in DNA alignment, but searching with them usually takes at least one random access per character in the query pattern. Ferragina and Fischer [Ferragina and Fischer, 2007] observed in 2007 that word-based indexes often use fewer random accesses than character-based indexes, and thus support faster searches. Since DNA lacks natural word-boundaries, however, it is necessary to parse it somehow before applying word-based FM-indexing. Last year, Deng et al. [Deng et al., 2022] proposed parsing genomic data by induced suffix sorting, and showed the resulting word-based FM-indexes support faster counting queries than standard FM-indexes when patterns are a few thousand characters or longer. In this paper we show that using prefix-free parsing - which takes parameters that let us tune the average length of the phrases - instead of induced suffix sorting, gives a significant speedup for patterns of only a few hundred characters. We implement our method and demonstrate it is between 3 and 18 times faster than competing methods on queries to GRCh38. And was consistently faster on queries made to 25,000, 50,000 and 100,000 SARS-CoV-2 genomes. Hence, it is very clear that our method accelerates the performance of count over all state-of-the-art methods with a minor increase in the memory.
@InProceedings{hong_et_al:LIPIcs.WABI.2023.13, author = {Hong, Aaron and Oliva, Marco and K\"{o}ppl, Dominik and Bannai, Hideo and Boucher, Christina and Gagie, Travis}, title = {{Acceleration of FM-Index Queries Through Prefix-Free Parsing}}, booktitle = {23rd International Workshop on Algorithms in Bioinformatics (WABI 2023)}, pages = {13:1--13:16}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-294-5}, ISSN = {1868-8969}, year = {2023}, volume = {273}, editor = {Belazzougui, Djamal and Ouangraoua, A\"{i}da}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2023.13}, URN = {urn:nbn:de:0030-drops-186390}, doi = {10.4230/LIPIcs.WABI.2023.13}, annote = {Keywords: FM-index, pangenomics, scalability, word-based indexing, random access} }
Feedback for Dagstuhl Publishing