Creative Commons Attribution 3.0 Unported license
The problem of finding factors of a text string which are identical or similar to a given pattern string is a central problem in computer science. A generalised version of this problem consists in implementing an index over the text to support efficient on-line pattern queries. We study this problem in the case where the text is weighted: for every position of the text and every letter of the alphabet a probability of occurrence of this letter at this position is given. Sequences of this type, also called position weight matrices, are commonly used to represent imprecise or uncertain data. A weighted sequence may represent many different strings, each with probability of occurrence equal to the product of probabilities of its letters at subsequent positions. Given a probability threshold 1/z, we say that a pattern string P matches a weighted text at position i if the product of probabilities of the letters of P at positions i,...,i+|P|-1 in the text is at least 1/z. In this article, we present an O(nz)-time construction of an O(nz)-sized index that can answer pattern matching queries in a weighted text in optimal time improving upon the state of the art by a factor of z log z. Other applications of this data structure include an O(nz)-time construction of the weighted prefix table and an O(nz)-time computation of all covers of a weighted sequence, which improve upon the state of the art by the same factor.
@InProceedings{barton_et_al:LIPIcs.CPM.2016.4,
author = {Barton, Carl and Kociumaka, Tomasz and Pissis, Solon P. and Radoszewski, Jakub},
title = {{Efficient Index for Weighted Sequences}},
booktitle = {27th Annual Symposium on Combinatorial Pattern Matching (CPM 2016)},
pages = {4:1--4:13},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
ISBN = {978-3-95977-012-5},
ISSN = {1868-8969},
year = {2016},
volume = {54},
editor = {Grossi, Roberto and Lewenstein, Moshe},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
address = {Dagstuhl, Germany},
URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.CPM.2016.4},
URN = {urn:nbn:de:0030-drops-60807},
doi = {10.4230/LIPIcs.CPM.2016.4},
annote = {Keywords: weighted sequence, position weight matrix, indexing, weighted suffix tree}
}