,
Vinamratha Pattar
,
Mingfu Shao
Creative Commons Attribution 4.0 International license
Sequence similarity estimation is essential for many bioinformatics tasks, including functional annotation, phylogenetic analysis, and overlap graph construction. Alignment-free methods aim to solve large-scale sequence similarity estimation by mapping sequences to more easily comparable features that can approximate edit distances efficiently. Substrings or k-mers, as the dominant choice of features, face an unavoidable compromise between sensitivity and specificity when selecting the proper k-value. Recently, subsequence-based features have shown improved performance, but they are computationally demanding, and determining the ideal subsequence length remains an intricate art. In this work, we introduce SubseqSketch, a novel alignment-free scheme that maps a sequence to an integer vector, where the entries correspond to dynamic, rather than fixed, lengths of random subsequences. The cosine similarity between these vectors exhibits a strong correlation with the edit similarity between the original sequences. Through experiments on benchmark datasets, we demonstrate that SubseqSketch is both efficient and effective across various alignment-free tasks, including nearest neighbor search and phylogenetic clustering. A C++ implementation of SubseqSketch is openly available at https://github.com/Shao-Group/SubseqSketch.
@InProceedings{chen_et_al:LIPIcs.WABI.2025.7,
author = {Chen, Ke and Pattar, Vinamratha and Shao, Mingfu},
title = {{Sequence Similarity Estimation by Random Subsequence Sketching}},
booktitle = {25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
pages = {7:1--7:17},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
ISBN = {978-3-95977-386-7},
ISSN = {1868-8969},
year = {2025},
volume = {344},
editor = {Brejov\'{a}, Bro\v{n}a and Patro, Rob},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
address = {Dagstuhl, Germany},
URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.7},
URN = {urn:nbn:de:0030-drops-239332},
doi = {10.4230/LIPIcs.WABI.2025.7},
annote = {Keywords: Alignment-free sequence comparison, Phylogenetic clustering, Nearest neighbor search, Edit distance embedding}
}
archived version