A fundamental operation in computational genomics is to reduce the input sequences to their constituent k-mers. For maximum performance of downstream applications it is important to store the k-mers in small space, while keeping the representation easy and efficient to use (i.e. without k-mer repetitions and in plain text). Recently, heuristics were presented to compute a near-minimum such representation. We present an algorithm to compute a minimum representation in optimal (linear) time and use it to evaluate the existing heuristics. For that, we present a formalisation of arc-centric bidirected de Bruijn graphs and carefully prove that it accurately models the k-mer spectrum of the input. Our algorithm first constructs the de Bruijn graph in linear time in the length of the input strings (for a fixed-size alphabet). Then it uses a Eulerian-cycle-based algorithm to compute the minimum representation, in time linear in the size of the output.
@InProceedings{schmidt_et_al:LIPIcs.WABI.2022.2, author = {Schmidt, Sebastian and Alanko, Jarno N.}, title = {{Eulertigs: Minimum Plain Text Representation of k-mer Sets Without Repetitions in Linear Time}}, booktitle = {22nd International Workshop on Algorithms in Bioinformatics (WABI 2022)}, pages = {2:1--2:21}, series = {Leibniz International Proceedings in Informatics (LIPIcs)}, ISBN = {978-3-95977-243-3}, ISSN = {1868-8969}, year = {2022}, volume = {242}, editor = {Boucher, Christina and Rahmann, Sven}, publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik}, address = {Dagstuhl, Germany}, URL = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2022.2}, URN = {urn:nbn:de:0030-drops-170361}, doi = {10.4230/LIPIcs.WABI.2022.2}, annote = {Keywords: Spectrum preserving string sets, Eulerian cycle, Suffix tree, Bidirected arc-centric de Bruijn graph, k-mer based methods} }
Feedback for Dagstuhl Publishing