We study a document retrieval problem in the new framework where D text documents are organized in a category tree with a pre-defined number h of categories. This situation occurs e.g. with taxomonic trees in biology or subject classification systems for scientific literature. Given a string pattern p and a category (level in the category tree), we wish to efficiently retrieve the t categorical units containing this pattern and belonging to the category. We propose several efficient solutions for this problem. One of them uses n(logσ(1+o(1))+log D+O(h)) + O(Δ) bits of space and O(|p|+t) query time, where n is the total length of the documents, σ the size of the alphabet used in the documents and Δ is the total number of nodes in the category tree. Another solution uses n(logσ(1+o(1))+O(log D))+O(Δ)+O(Dlog n) bits of space and O(|p|+tlog D) query time. We finally propose other solutions which are more space-efficient at the expense of a slight increase in query time.
@InProceedings{belazzougui_et_al:LIPIcs.CPM.2020.4, author = {Belazzougui, Djamal and Kucherov, Gregory}, title = {{Efficient Tree-Structured Categorical Retrieval}}, booktitle = {31st Annual Symposium on Combinatorial Pattern Matching (CPM 2020)}, pages = {4:1--4:11}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-149-8}, ISSN = {1868-8969}, year = {2020}, volume = {161}, editor = {G{\o}rtz, Inge Li and Weimann, Oren}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.CPM.2020.4}, URN = {urn:nbn:de:0030-drops-121299}, doi = {10.4230/LIPIcs.CPM.2020.4}, annote = {Keywords: pattern matching, document retrieval, category tree, space-efficient data structures} }
Feedback for Dagstuhl Publishing