LIPIcs, Volume 344

25th International Conference on Algorithms for Bioinformatics (WABI 2025)




Event

WABI 2025, August 20-22, 2025, University of Maryland, College Park, MD, USA

Editors

Broňa Brejová
  • Comenius University in Bratislava, Slovakia
Rob Patro
  • University of Maryland, College Park, MD, USA

Publication Details

  • published at: 2025-08-15
  • Publisher: Schloss Dagstuhl – Leibniz-Zentrum für Informatik
  • ISBN: 978-3-95977-386-7

Documents
Document
Complete Volume
LIPIcs, Volume 344, WABI 2025, Complete Volume

Authors: Broňa Brejová and Rob Patro


Abstract
LIPIcs, Volume 344, WABI 2025, Complete Volume

Cite as

25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 1-430, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@Proceedings{brejova_et_al:LIPIcs.WABI.2025,
  title =	{{LIPIcs, Volume 344, WABI 2025, Complete Volume}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{1--430},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025},
  URN =		{urn:nbn:de:0030-drops-243498},
  doi =		{10.4230/LIPIcs.WABI.2025},
  annote =	{Keywords: LIPIcs, Volume 344, WABI 2025, Complete Volume}
}
Document
Front Matter
Front Matter, Table of Contents, Preface, Conference Organization

Authors: Broňa Brejová and Rob Patro


Abstract
Front Matter, Table of Contents, Preface, Conference Organization

Cite as

25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 0:i-0:xvi, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{brejova_et_al:LIPIcs.WABI.2025.0,
  author =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  title =	{{Front Matter, Table of Contents, Preface, Conference Organization}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{0:i--0:xvi},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.0},
  URN =		{urn:nbn:de:0030-drops-243477},
  doi =		{10.4230/LIPIcs.WABI.2025.0},
  annote =	{Keywords: Front Matter, Table of Contents, Preface, Conference Organization}
}
Document
Invited Talk
Recursive Parsing and Grammar Compression in the Era of Pangenomics (Invited Talk)

Authors: Christina Boucher


Abstract
Prefix-Free Parsing (PFP) and its recursive variant (RPFP) provide a scalable framework for compressing and indexing large genomic datasets. By enabling efficient construction of succinct data structures, these methods support fast and memory-efficient read alignment across thousands of genomes. Their deterministic and modular design makes them especially well-suited for pangenomics and large-scale sequence analysis.
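The trigger-based phrase splitting behind PFP can be sketched in a few lines of Python. This toy version is not from the talk: real PFP pads the text with sentinels and selects triggers with a Karp-Rabin rolling hash, whereas the hash rule and the parameters `w` and `p` below are arbitrary stand-ins. What it does show is the key invariant that consecutive phrases overlap by exactly w characters, so the parse losslessly determines the text:

```python
def pfp_parse(text, w=2, p=5):
    """Toy prefix-free parse: cut `text` into phrases ending at w-length
    'trigger' windows chosen by a hash rule, so consecutive phrases
    overlap by exactly w characters.  Illustrative only: real PFP pads
    with sentinels and uses a Karp-Rabin rolling hash."""
    def is_trigger(window):
        return sum(window.encode()) % p == 0  # stand-in for a rolling hash
    phrases, start = [], 0
    for i in range(len(text) - w + 1):
        if i > start and is_trigger(text[i:i + w]):
            phrases.append(text[start:i + w])  # phrase ends with the trigger
            start = i                          # next phrase begins at it
    phrases.append(text[start:])
    return phrases
```

Because each phrase ends with the trigger that starts the next one, concatenating the first phrase with every later phrase minus its w-character prefix reconstructs the input exactly.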

Cite as

Christina Boucher. Recursive Parsing and Grammar Compression in the Era of Pangenomics (Invited Talk). In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 1:1-1:2, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{boucher:LIPIcs.WABI.2025.1,
  author =	{Boucher, Christina},
  title =	{{Recursive Parsing and Grammar Compression in the Era of Pangenomics}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{1:1--1:2},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.1},
  URN =		{urn:nbn:de:0030-drops-239278},
  doi =		{10.4230/LIPIcs.WABI.2025.1},
  annote =	{Keywords: Prefix-Free Parsing, Recursive Prefix-Free Parsing, Grammar-Based Compression, Succinct Data Structures, RePair Compression}
}
Document
Invited Talk
We Are What We Index; a Primer for the Wheeler Graph Era (Invited Talk)

Authors: Ben Langmead


Abstract
Since the arrival of second-generation sequencing, we have needed to build indexes over reference sequences - e.g. genomes and transcriptomes - in order to solve read alignment and classification problems efficiently [Langmead et al., 2009; Li and Durbin, 2009; Li et al., 2009]. The rule has been: what we can index determines what we can do. When indexing strings, we can use methods like suffix arrays [Manber and Myers, 1993], the Burrows-Wheeler Transform (BWT) [Burrows and Wheeler, 1994] / FM Index [Ferragina and Manzini, 2000], or k-mer indexes [Marchet et al., 2021]. What if we want to index objects more complex than strings? A pangenome, for example, is a large collection of similar strings, e.g. the hundreds of assemblies that make up the Human Pangenome Reference [Liao et al., 2023] or all the bacteria in the Refseq database [Goldfarb et al., 2025]. We may wish to combine these strings into a multiple sequence alignment (MSA) or a graph first. Can we index those efficiently? In many useful cases the answer is "yes," but in others the answer is "no." The story of how we learned exactly when the answer is "yes" versus "no" unfolded through a sequence of insights. Here we review this story, eventually arriving at the definition of Wheeler graphs as discovered and formalized by Gagie, Manzini and Sirén [Gagie et al., 2017]. We will focus on indexes based on the BWT, since these (a) are lossless full-text indexes, (b) are widely used in practice [Langmead et al., 2009; Li and Durbin, 2009], and (c) form the theoretical throughline for all the indexing strategies on the path to Wheeler graphs. We will trace the BWT-based indexing story from the early days of the FM Index, through its step-by-step gobbling up of trees (XBW-transform [Ferragina et al., 2005]) and de Bruijn Graphs (BOSS representation [Bowe et al., 2012]), and to the eventual formalization of Wheeler graphs [Gagie et al., 2017].
Along the way, we will define and update our notions of what it means to track a consecutive range of elements in the structure, and what it means for an index to be efficient. We will also connect these notions to automata [Sipser, 1996], noting how the indexability of Wheeler graphs (also called Wheeler automata) is connected to the mechanics of how to efficiently represent and simulate a finite automaton [Alanko et al., 2021]. With this context, we can imagine improved indexes for the future of genomics and pangenomics. De Bruijn graphs are extremely practical and are the most widely used among the non-string data structures that are also Wheeler graphs. But we might prefer other options. For example, de Bruijn graphs have the undesirable property that they usually encode not only the true longer-than-k substrings of the original text, but also "false" substrings that span repeats. Related to this, paths through the de Bruijn graph can "glue" substrings together that are horizontally distant in the MSA. Could other Wheeler graphs be practical alternatives to de Bruijn graphs? For instance, the original GCSA study by Sirén, Välimäki and Mäkinen proposed a way to convert a multiple alignment into an automaton that either is a Wheeler graph or can be made into one [Sirén et al., 2014]. This warrants further exploration, possibly with the help of improved tools for solving the NP-complete problem of recognizing whether a graph is a Wheeler graph [Chao et al., 2023]. The notion of BWT tunnels [Baier, 2018] gives another route: we can begin with the concatenated pangenome strings and compress them by identifying and collapsing BWT tunnels. This yields a Wheeler graph that is compressed like the de Bruijn graph, but without departing from the exact contents or coordinate systems of the original genomes.
The future might need us to explore all these Wheeler-graph indexes, along with the also highly practical and always-improving world of indexes built over collections of strings [Gagie et al., 2018].
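The "consecutive range of elements" idea at the heart of BWT-based indexing can be illustrated with a minimal FM-index-style backward search in Python. This is a textbook sketch, not code from the talk; naive O(n) rank queries stand in for the compressed rank structures used in practice:

```python
def bwt(s):
    """Burrows-Wheeler Transform via sorted rotations of s + '$'."""
    s += "$"
    rotations = sorted(s[i:] + s[:i] for i in range(len(s)))
    return "".join(r[-1] for r in rotations)

def backward_search(bwt_str, pattern):
    """Count occurrences of `pattern` by maintaining a consecutive range
    [lo, hi) of suffix-array rows, shrinking it one character at a time
    from the end of the pattern -- the core range-tracking idea."""
    # C[c] = number of characters in the text strictly smaller than c
    C = {c: sum(1 for x in bwt_str if x < c) for c in set(bwt_str)}
    def rank(c, i):  # occurrences of c in bwt_str[:i] (naive O(n))
        return bwt_str[:i].count(c)
    lo, hi = 0, len(bwt_str)
    for c in reversed(pattern):
        if c not in C:
            return 0
        lo = C[c] + rank(c, lo)
        hi = C[c] + rank(c, hi)
        if lo >= hi:
            return 0
    return hi - lo
```

Each loop iteration is one LF-mapping step: the range of rows prefixed by the current pattern suffix stays consecutive, which is precisely the property that Wheeler graphs generalize beyond strings.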

Cite as

Ben Langmead. We Are What We Index; a Primer for the Wheeler Graph Era (Invited Talk). In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 2:1-2:2, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{langmead:LIPIcs.WABI.2025.2,
  author =	{Langmead, Ben},
  title =	{{We Are What We Index; a Primer for the Wheeler Graph Era}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{2:1--2:2},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.2},
  URN =		{urn:nbn:de:0030-drops-239288},
  doi =		{10.4230/LIPIcs.WABI.2025.2},
  annote =	{Keywords: Indexing, Burrows-Wheeler Transform}
}
Document
Approximability of Longest Run Subsequence and Complementary Minimization Problems

Authors: Yuichi Asahiro, Mingyang Gong, Jesper Jansson, Guohui Lin, Sichen Lu, Eiji Miyano, Hirotaka Ono, Toshiki Saitoh, and Shunichi Tanaka


Abstract
We study the polynomial-time approximability of the Longest Run Subsequence problem (LRS for short) and its complementary minimization variant, the Minimum Run Subsequence Deletion problem (MRSD for short). For a string S = s₁ ⋯ s_n over an alphabet Σ, a subsequence S' of S is S' = s_{i₁} ⋯ s_{i_p} such that 1 ≤ i₁ < i₂ < … < i_p ≤ |S|. A run of a symbol σ ∈ Σ in S is a maximal substring of consecutive occurrences of σ. A run subsequence S' of S is a subsequence of S in which every symbol σ ∈ Σ occurs in at most one run. The co-subsequence S̄' of the subsequence S' = s_{i₁} ⋯ s_{i_p} in S is the subsequence obtained by deleting all the characters of S' from S, i.e., S̄' = s_{j₁} ⋯ s_{j_{n-p}} such that j₁ < j₂ < … < j_{n-p} and {j₁, …, j_{n-p}} = {1, …, n} ⧵ {i₁, …, i_p}. Given a string S, the goal of LRS (resp., MRSD) is to find a run subsequence S^* of S such that the length |S^*| is maximized (resp., the number |S̄^*| of deleted symbols from S is minimized) over all the run subsequences of S. Let k be the maximum number of symbol occurrences in the input S. It is known that LRS and MRSD are APX-hard even if k = 2. In this paper, we show that LRS can be approximated in polynomial time within factors of (k+2)/3 for k = 2 or 3, and 2(k+1)/5 for every k ≥ 4. Furthermore, we show that MRSD can be approximated in linear time within a factor of (k+4)/4 if k is even and (k+3)/4 if k is odd.
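To make the definitions above concrete, here is a small Python sketch (not from the paper) that tests the run-subsequence property and solves LRS by exhaustive search on tiny inputs. The paper's contribution is polynomial-time approximation; this exponential baseline merely illustrates the objective:

```python
from itertools import combinations

def is_run_subsequence(t):
    """True iff every symbol of t occurs in at most one run
    (a run = maximal block of equal consecutive symbols)."""
    seen = set()
    for i, c in enumerate(t):
        if i == 0 or t[i - 1] != c:  # a new run of c starts here
            if c in seen:
                return False
            seen.add(c)
    return True

def brute_force_lrs(s):
    """Exact LRS by enumerating subsequences, longest first.
    Exponential time: only for tiny inputs."""
    for p in range(len(s), 0, -1):
        for idx in combinations(range(len(s)), p):
            t = "".join(s[i] for i in idx)
            if is_run_subsequence(t):
                return t
    return ""
```

For example, "aba" is not a run subsequence (the symbol a spans two runs), while deleting one character of "abab" yields the valid run subsequence "abb".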

Cite as

Yuichi Asahiro, Mingyang Gong, Jesper Jansson, Guohui Lin, Sichen Lu, Eiji Miyano, Hirotaka Ono, Toshiki Saitoh, and Shunichi Tanaka. Approximability of Longest Run Subsequence and Complementary Minimization Problems. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 3:1-3:20, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{asahiro_et_al:LIPIcs.WABI.2025.3,
  author =	{Asahiro, Yuichi and Gong, Mingyang and Jansson, Jesper and Lin, Guohui and Lu, Sichen and Miyano, Eiji and Ono, Hirotaka and Saitoh, Toshiki and Tanaka, Shunichi},
  title =	{{Approximability of Longest Run Subsequence and Complementary Minimization Problems}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{3:1--3:20},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.3},
  URN =		{urn:nbn:de:0030-drops-239290},
  doi =		{10.4230/LIPIcs.WABI.2025.3},
  annote =	{Keywords: Longest run subsequence, minimum run subsequence deletion, approximation algorithm}
}
Document
Haplotype-Aware Long-Read Error Correction

Authors: Parvesh Barak, Daniel Gibney, and Chirag Jain


Abstract
Error correction of long reads is an important initial step in genome assembly workflows. For organisms with ploidy greater than one, it is important to preserve haplotype-specific variation during read correction. This challenge has driven the development of several haplotype-aware correction methods. However, existing methods are based on either ad-hoc heuristics or deep learning approaches. In this paper, we introduce a rigorous formulation for this problem. Our approach builds on the minimum error correction framework used in reference-based haplotype phasing. We prove that the proposed formulation for error correction of reads in the de novo setting, i.e., without using a reference genome, is NP-hard. To make our exact algorithm scale to large datasets, we introduce practical heuristics. Experiments using PacBio HiFi sequencing datasets from human and plant genomes show that our approach achieves accuracy comparable to state-of-the-art methods. The software is freely available at https://github.com/at-cg/HALE.
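The minimum error correction (MEC) objective that the formulation builds on can be illustrated with a brute-force Python sketch (illustrative only, not the paper's algorithm): partition the reads between two haplotypes and count the entries that must be flipped so that each side agrees with its per-site consensus:

```python
from itertools import product

def mec_cost(reads, assignment):
    """MEC cost of one bipartition: at each site, the minority entries
    on each side must be corrected (flipped).  -1 marks missing data."""
    cost = 0
    n_sites = len(reads[0])
    for side in (0, 1):
        group = [r for r, a in zip(reads, assignment) if a == side]
        for j in range(n_sites):
            col = [r[j] for r in group if r[j] != -1]
            if col:
                ones = sum(col)
                cost += min(ones, len(col) - ones)
    return cost

def brute_force_mec(reads):
    """Exact MEC by enumerating all 2^n read bipartitions (tiny inputs;
    the problem is NP-hard in general)."""
    return min(mec_cost(reads, a)
               for a in product((0, 1), repeat=len(reads)))
```

With two clean haplotypes the optimum is 0; every sequencing error in the matrix adds at least one forced flip, which is the quantity the correction step tries to minimize.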

Cite as

Parvesh Barak, Daniel Gibney, and Chirag Jain. Haplotype-Aware Long-Read Error Correction. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 4:1-4:20, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{barak_et_al:LIPIcs.WABI.2025.4,
  author =	{Barak, Parvesh and Gibney, Daniel and Jain, Chirag},
  title =	{{Haplotype-Aware Long-Read Error Correction}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{4:1--4:20},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.4},
  URN =		{urn:nbn:de:0030-drops-239300},
  doi =		{10.4230/LIPIcs.WABI.2025.4},
  annote =	{Keywords: Genome assembly, phasing, clustering, overlap graph, consensus}
}
Document
Improved Algorithms for Bi-Partition Function Computation

Authors: John D. Bridgers, Jan Hoinka, S. Cenk Sahinalp, Salem Malikic, Teresa M. Przytycka, and Funda Ergun


Abstract
The evolutionary history of a tumor, inferred from single-cell sequencing data, is typically represented as a tree in which each subtree corresponds to a clade of cells seeded by a specific set of mutations. Traditional methods usually identify a single most likely tree for downstream analyses, such as detecting driver mutations, studying mutation co-occurrence patterns, and identifying common evolutionary trajectories. However, the reliability of such inferred trees, particularly their topology, clade composition, and mutational placements, often remains uncertain. To quantify this uncertainty, the concept of a Bi-partition Function was recently introduced, providing a probabilistic measure of how reliably a mutation seeds a given clade of cells. The single available algorithm for estimating the Bi-partition Function relies on simplifying assumptions and uses sampling for limited exploration of the tree-space. In this paper, we introduce the first exact algorithm for computing the Bi-partition Function. Our algorithm scales linearly with the number of mutations but exhibits super-exponential complexity with respect to the number of cells. Despite this complexity, it establishes crucial ground truth values, essential for accurately benchmarking and validating approximate methods. Additionally, we present a GPU-accelerated version of the available sampling-based algorithm, significantly boosting the computational performance through large-scale parallelization, enabling more accurate Bi-partition Function estimates via deeper exploration of the tree spaces. We compare our methods on synthetic datasets, demonstrating that especially when the number of mutations sufficiently exceeds the number of cells, our GPU-accelerated sampling algorithm closely approximates the exact ground truth values.

Cite as

John D. Bridgers, Jan Hoinka, S. Cenk Sahinalp, Salem Malikic, Teresa M. Przytycka, and Funda Ergun. Improved Algorithms for Bi-Partition Function Computation. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 5:1-5:18, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{bridgers_et_al:LIPIcs.WABI.2025.5,
  author =	{Bridgers, John D. and Hoinka, Jan and Sahinalp, S. Cenk and Malikic, Salem and Przytycka, Teresa M. and Ergun, Funda},
  title =	{{Improved Algorithms for Bi-Partition Function Computation}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{5:1--5:18},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.5},
  URN =		{urn:nbn:de:0030-drops-239318},
  doi =		{10.4230/LIPIcs.WABI.2025.5},
  annote =	{Keywords: Tumor Evolution, Bi-partition Function, Single-Cell Sequencing, Algorithms}
}
Document
Fast Pseudoalignment Queries on Compressed Colored de Bruijn Graphs

Authors: Alessio Campanelli, Giulio Ermanno Pibiri, and Rob Patro


Abstract
Motivation. Indexes for the colored de Bruijn graph (c-dBG) play a crucial role in computational biology by facilitating complex tasks such as read mapping and assembly. These indexes map k-mers (substrings of length k) appearing in a large collection of reference strings to the set of identifiers of the strings where they appear. These sets, colloquially referred to as color sets, tend to occupy large quantities of memory, especially for large pangenomes. Our previous work thus focused on leveraging the repetitiveness of the color sets to improve the space effectiveness of the resulting index. As a matter of fact, repetition-aware indexes can be up to one order of magnitude smaller on large pangenomes compared to indexes that do not exploit such repetitiveness. Such improved space effectiveness, on the other hand, imposes an overhead at query time when performing tasks such as pseudoalignment that require the collection and processing of multiple related color sets. Methods. In this paper, we show how to avoid this overhead. We devise novel query algorithms tailored to the specific repetition-aware representations adopted by the Fulgor index, a state-of-the-art c-dBG index, to significantly improve its pseudoalignment efficiency without consuming additional space. Results. Our results indicate that with increasing redundancy in the pangenomes, the compression factor provided by the Fulgor index increases, while the relative query time actually decreases. For example, while the space of the Fulgor index improves by 2.5× with repetition-aware compression and its query time improves by 1.6× on a collection of 5,000 Salmonella enterica genomes, these factors become (6.1×, 2.8×) and (11.2×, 3.2×) for 50,000 and 150,000 genomes respectively. For an even larger collection of 300,000 genomes, we obtained an index that is 22.3× smaller and 2.2× faster.
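The color-set queries that dominate pseudoalignment cost can be sketched with a naive Python index: a plain hash map stands in for the compressed c-dBG, and the full-intersection rule is assumed for illustration (Fulgor supports other threshold rules as well):

```python
def build_color_map(references, k):
    """Map each k-mer to its color set: the ids of the references
    containing it.  (Hash map stand-in for a compressed c-dBG index.)"""
    colors = {}
    for rid, ref in enumerate(references):
        for i in range(len(ref) - k + 1):
            colors.setdefault(ref[i:i + k], set()).add(rid)
    return colors

def pseudoalign(read, colors, k):
    """Full-intersection pseudoalignment: references that contain every
    k-mer of the read that is present in the index."""
    result = None
    for i in range(len(read) - k + 1):
        cs = colors.get(read[i:i + k])
        if cs is None:
            continue  # k-mer absent from the index: skip it
        result = set(cs) if result is None else result & cs
    return result or set()
```

The repeated set intersections over related color sets are exactly the work that the paper's repetition-aware query algorithms reorganize.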

Cite as

Alessio Campanelli, Giulio Ermanno Pibiri, and Rob Patro. Fast Pseudoalignment Queries on Compressed Colored de Bruijn Graphs. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 6:1-6:21, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{campanelli_et_al:LIPIcs.WABI.2025.6,
  author =	{Campanelli, Alessio and Pibiri, Giulio Ermanno and Patro, Rob},
  title =	{{Fast Pseudoalignment Queries on Compressed Colored de Bruijn Graphs}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{6:1--6:21},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.6},
  URN =		{urn:nbn:de:0030-drops-239327},
  doi =		{10.4230/LIPIcs.WABI.2025.6},
  annote =	{Keywords: Colored de Bruijn graphs, Pseudoalignment, Repetition-aware compression}
}
Document
Sequence Similarity Estimation by Random Subsequence Sketching

Authors: Ke Chen, Vinamratha Pattar, and Mingfu Shao


Abstract
Sequence similarity estimation is essential for many bioinformatics tasks, including functional annotation, phylogenetic analysis, and overlap graph construction. Alignment-free methods aim to solve large-scale sequence similarity estimation by mapping sequences to more easily comparable features that can approximate edit distances efficiently. Substrings or k-mers, as the dominant choice of features, face an unavoidable compromise between sensitivity and specificity when selecting the proper k-value. Recently, subsequence-based features have shown improved performance, but they are computationally demanding, and determining the ideal subsequence length remains an intricate art. In this work, we introduce SubseqSketch, a novel alignment-free scheme that maps a sequence to an integer vector, where the entries correspond to dynamic, rather than fixed, lengths of random subsequences. The cosine similarity between these vectors exhibits a strong correlation with the edit similarity between the original sequences. Through experiments on benchmark datasets, we demonstrate that SubseqSketch is both efficient and effective across various alignment-free tasks, including nearest neighbor search and phylogenetic clustering. A C++ implementation of SubseqSketch is openly available at https://github.com/Shao-Group/SubseqSketch.
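A hedged guess at the flavor of the scheme: the sketch below maps a sequence to an integer vector where entry i records how far the i-th random test sequence can be greedily matched as a subsequence. This is only one plausible reading of "dynamic lengths of random subsequences" and the published SubseqSketch construction may differ; cosine similarity then compares the vectors as in the abstract:

```python
import random

def sketch(s, tests):
    """Entry i = length of the longest prefix of tests[i] occurring as a
    subsequence of s (greedy left-to-right matching is optimal for this).
    An illustrative guess, not the paper's exact construction."""
    vec = []
    for t in tests:
        it = iter(s)
        matched = 0
        for c in t:
            if c in it:  # consumes `it` up to and including the match
                matched += 1
            else:
                break
        vec.append(matched)
    return vec

def cosine(u, v):
    """Cosine similarity between two sketch vectors."""
    dot = sum(a * b for a, b in zip(u, v))
    nu = sum(a * a for a in u) ** 0.5
    nv = sum(b * b for b in v) ** 0.5
    return dot / (nu * nv) if nu and nv else 0.0
```

On this toy version, sketches of near-identical strings are almost parallel, while a dissimilar string scores noticeably lower, mirroring the claimed correlation with edit similarity.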

Cite as

Ke Chen, Vinamratha Pattar, and Mingfu Shao. Sequence Similarity Estimation by Random Subsequence Sketching. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 7:1-7:17, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{chen_et_al:LIPIcs.WABI.2025.7,
  author =	{Chen, Ke and Pattar, Vinamratha and Shao, Mingfu},
  title =	{{Sequence Similarity Estimation by Random Subsequence Sketching}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{7:1--7:17},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.7},
  URN =		{urn:nbn:de:0030-drops-239332},
  doi =		{10.4230/LIPIcs.WABI.2025.7},
  annote =	{Keywords: Alignment-free sequence comparison, Phylogenetic clustering, Nearest neighbor search, Edit distance embedding}
}
Document
Lossless Pangenome Indexing Using Tag Arrays

Authors: Parsa Eskandar, Benedict Paten, and Jouni Sirén


Abstract
Pangenome graphs represent genomic variation by encoding multiple haplotypes within a unified graph structure. However, efficient and lossless indexing of such structures remains challenging due to the scale and complexity of pangenomic data. We present a practical and scalable indexing framework based on tag arrays, which annotate positions in the Burrows-Wheeler transform (BWT) with graph coordinates. Our method extends the FM-index with a run-length compressed tag structure that enables efficient retrieval of all unique graph locations where a query pattern appears. We introduce a novel construction algorithm that combines unique k-mers, graph-based extensions, and haplotype traversal to compute the tag array in a memory-efficient manner. To support large genomes, we process each chromosome independently and then merge the results into a unified index using properties of the multi-string BWT and r-index. Our evaluation on the HPRC graphs demonstrates that the tag array structure compresses effectively, scales well with added haplotypes, and preserves accurate mapping information across diverse regions of the genome. This indexing method enables lossless and haplotype-aware querying in complex pangenomes and offers a practical indexing layer for developing scalable aligners and downstream graph-based analysis tools.
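The run-length compression that makes per-position tags affordable can be sketched directly. This is a generic RLE toy, not the paper's data structure, which couples the runs to the BWT and the r-index:

```python
def rle(tags):
    """Run-length encode a tag array as (tag, run_length) pairs -- the
    kind of compression that exploits long runs of equal graph
    coordinates along the BWT."""
    runs = []
    for t in tags:
        if runs and runs[-1][0] == t:
            runs[-1] = (t, runs[-1][1] + 1)
        else:
            runs.append((t, 1))
    return runs

def tag_at(runs, i):
    """Recover tags[i] from the run-length encoding."""
    for t, length in runs:
        if i < length:
            return t
        i -= length
    raise IndexError(i)
```

Space is proportional to the number of runs rather than the number of BWT positions, which is why the structure "compresses effectively" and "scales well with added haplotypes" as similar haplotypes lengthen the runs.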

Cite as

Parsa Eskandar, Benedict Paten, and Jouni Sirén. Lossless Pangenome Indexing Using Tag Arrays. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 8:1-8:20, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{eskandar_et_al:LIPIcs.WABI.2025.8,
  author =	{Eskandar, Parsa and Paten, Benedict and Sir\'{e}n, Jouni},
  title =	{{Lossless Pangenome Indexing Using Tag Arrays}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{8:1--8:20},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.8},
  URN =		{urn:nbn:de:0030-drops-239348},
  doi =		{10.4230/LIPIcs.WABI.2025.8},
  annote =	{Keywords: pangenome indexing, Burrows-Wheeler transform, tag arrays}
}
Document
Dolphyin: A Combinatorial Algorithm for Identifying 1-Dollo Phylogenies in Cancer

Authors: Daniel W. Feng and Mohammed El-Kebir


Abstract
Several recent cancer phylogeny inference methods have used the k-Dollo evolutionary model for single-nucleotide variants. Specifically, in this problem one is given an m × n binary matrix B and seeks a rooted tree T with m leaves that correspond to the m rows of B, where each node of T is labeled by a binary state for each of the n characters subject to the restriction that each character is gained at most once (0-to-1 transition) and subsequently lost at most k times (1-to-0 transitions). The 1-Dollo variant (k = 1), also known as the persistent perfect phylogeny, in which each character may be lost at most once, has been studied extensively, but its hardness remains an open question. Here, we prove that the 1-Dollo Linear Phylogeny (1DLP) problem, where we additionally require the resulting 1-Dollo phylogeny T to be linear, is equivalent to verifying whether the input matrix B adheres to the Consecutive Ones Property (C1P), which can be solved in polynomial time. Due to this equivalence, several known NP-hardness results for relevant variants of C1P carry over to 1DLP, including the minimization of false negatives (0-to-1 modifications to the input matrix B) or the allowance of 2 gains and 2 losses. We furthermore show how to recursively decompose any, not necessarily linear, 1-Dollo phylogeny T into several 1-Dollo linear phylogenies, connected by matching branching points. We extend this characterization to matrices B that admit 1-Dollo phylogenies, giving necessary and sufficient conditions for the existence of a novel decomposition of B into several submatrices and corresponding branching points. This decomposition forms the basis of Dolphyin, a new exponential-time algorithm for inferring 1-Dollo phylogenies that efficiently leverages the determination of linear 1-Dollo phylogenies as a subroutine. Dolphyin can also be applied to input matrices B with false negatives.
We demonstrate that Dolphyin is runtime-competitive with a previous integer linear programming based algorithm SPhyR on simulated datasets. We additionally analyze simulated datasets with false negative errors and find that in the median case, Dolphyin infers 1-Dollo phylogenies with inferred error rates at or below the ground truth rate. Finally, we apply Dolphyin to 99 acute myeloid leukemia single-cell sequencing datasets, finding that the majority of the cancers can be explained by 1-Dollo phylogenies with false negative error rates in line with the used sequencing technology. Availability. Dolphyin is available at: https://github.com/elkebir-group/Dolphyin.
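The C1P side of the equivalence is easy to state as code. The brute-force Python check below (illustrative only; actual C1P recognition runs in linear time, e.g. with PQ-trees) asks whether some column permutation makes every row's 1s contiguous:

```python
from itertools import permutations

def has_consecutive_ones(row):
    """True iff the 1s of `row` occupy a contiguous block."""
    ones = [j for j, x in enumerate(row) if x]
    return not ones or ones[-1] - ones[0] + 1 == len(ones)

def is_c1p(matrix):
    """Brute-force Consecutive Ones Property test over all column
    permutations.  Exponential; for tiny matrices only."""
    n_cols = len(matrix[0])
    return any(
        all(has_consecutive_ones([row[j] for j in perm]) for row in matrix)
        for perm in permutations(range(n_cols))
    )
```

The second test case below is the edge-vertex incidence matrix of a 4-cycle: each of its four rows needs its two 1s adjacent, but a linear order of four columns has only three adjacent pairs, so no permutation works.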

Cite as

Daniel W. Feng and Mohammed El-Kebir. Dolphyin: A Combinatorial Algorithm for Identifying 1-Dollo Phylogenies in Cancer. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 9:1-9:23, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{feng_et_al:LIPIcs.WABI.2025.9,
  author =	{Feng, Daniel W. and El-Kebir, Mohammed},
  title =	{{Dolphyin: A Combinatorial Algorithm for Identifying 1-Dollo Phylogenies in Cancer}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{9:1--9:23},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.9},
  URN =		{urn:nbn:de:0030-drops-239356},
  doi =		{10.4230/LIPIcs.WABI.2025.9},
  annote =	{Keywords: Intra-tumor heterogeneity, persistent perfect phylogeny, consecutive ones property, combinatorics}
}
Document
DiVerG: Scalable Distance Index for Validation of Paired-End Alignments in Sequence Graphs

Authors: Ali Ghaffaari, Alexander Schönhuth, and Tobias Marschall


Abstract
Determining the distance between two loci within a genomic region is a recurrent operation in various tasks in computational genomics. A notable example of this task arises in paired-end read mapping as a form of validation of distances between multiple alignments. While straightforward for a single genome, graph-based reference structures render the operation considerably more involved. Given the sheer number of such queries in a typical read mapping experiment, an efficient algorithm for answering distance queries is crucial. In this paper, we introduce DiVerG, a compact data structure as well as a fast and scalable algorithm, for constructing distance indexes for general sequence graphs on multi-core CPU and many-core GPU architectures. DiVerG is based on PairG [Jain et al., 2019], but overcomes the limitations of PairG by exploiting the extensive potential for improvements in terms of scalability and space efficiency. As a consequence, DiVerG can process substantially larger datasets, such as whole human genomes, which are unmanageable by PairG. DiVerG offers faster index construction time and consistently faster query time, with gains proportional to the size of the underlying compact data structure. We demonstrate that our method performs favorably on multiple real datasets at various scales. DiVerG achieves superior performance over PairG, e.g., a 2.5-4x speed-up in query time, a 44-340x smaller index size, and 3-50x faster construction time for the genome graph of the MHC region, a particularly variable region of the human genome. The implementation is available at: https://github.com/cartoonist/diverg
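The query such a distance index answers can be stated as a naive BFS sketch in Python: is there a path from u to v whose length lies within a given range? This is the per-query baseline that index structures like PairG and DiVerG replace with a precomputed sparse-matrix reachability index:

```python
from collections import deque

def within_distance(adj, u, v, lo, hi):
    """True iff some path from u to v has length d with lo <= d <= hi.
    Plain BFS over (vertex, depth) pairs; `adj` maps a vertex to its
    successor list.  Naive baseline, not the indexed approach."""
    seen = {(u, 0)}
    frontier = deque([(u, 0)])
    while frontier:
        x, d = frontier.popleft()
        if x == v and lo <= d <= hi:
            return True
        if d == hi:
            continue  # never extend past the upper distance bound
        for y in adj.get(x, []):
            if (y, d + 1) not in seen:
                seen.add((y, d + 1))
                frontier.append((y, d + 1))
    return False
```

In paired-end validation, u and v are the alignment loci of the two mates and [lo, hi] is the admissible insert-size range; answering millions of such queries is what motivates a precomputed index.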

Cite as

Ali Ghaffaari, Alexander Schönhuth, and Tobias Marschall. DiVerG: Scalable Distance Index for Validation of Paired-End Alignments in Sequence Graphs. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 10:1-10:24, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{ghaffaari_et_al:LIPIcs.WABI.2025.10,
  author =	{Ghaffaari, Ali and Sch\"{o}nhuth, Alexander and Marschall, Tobias},
  title =	{{DiVerG: Scalable Distance Index for Validation of Paired-End Alignments in Sequence Graphs}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{10:1--10:24},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.10},
  URN =		{urn:nbn:de:0030-drops-239369},
  doi =		{10.4230/LIPIcs.WABI.2025.10},
  annote =	{Keywords: Sequence graph, distance index, read mapping, sparse matrix}
}
Document
Mutational Signature Refitting on Sparse Pan-Cancer Data

Authors: Gal Gilad, Teresa M. Przytycka, and Roded Sharan


Abstract
Mutational processes shape cancer genomes, leaving characteristic marks that are termed signatures. The level of activity of each such process, or its signature exposure, provides important information on the disease, improving patient stratification and the prediction of drug response. Thus, there is growing interest in developing refitting methods that decipher those exposures. Previous work in this domain was unsupervised in nature, employing algebraic decomposition and probabilistic inference methods. Here we provide a supervised approach to the problem of signature refitting and show its superiority over current methods. Our method, SuRe, leverages a neural network model to capture correlations between signature exposures in real data. We show that SuRe outperforms previous methods on sparse mutation data from tumor-type-specific data sets, as well as pan-cancer data sets, with an increasing advantage as the data become sparser. We further demonstrate its utility in clinical settings.

Cite as

Gal Gilad, Teresa M. Przytycka, and Roded Sharan. Mutational Signature Refitting on Sparse Pan-Cancer Data. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 11:1-11:23, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{gilad_et_al:LIPIcs.WABI.2025.11,
  author =	{Gilad, Gal and Przytycka, Teresa M. and Sharan, Roded},
  title =	{{Mutational Signature Refitting on Sparse Pan-Cancer Data}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{11:1--11:23},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.11},
  URN =		{urn:nbn:de:0030-drops-239374},
  doi =		{10.4230/LIPIcs.WABI.2025.11},
  annote =	{Keywords: mutational signatures, signature refitting, cancer genomics, genomic data analysis, somatic mutations}
}
Document
Extension of Partial Atom-To-Atom Maps: Uniqueness and Algorithms

Authors: Marcos E. González Laffitte, Tieu-Long Phan, and Peter F. Stadler


Abstract
Chemical reaction databases typically report the molecular structures of reactant and product compounds, as well as their stoichiometry, but lack information, in particular, on the correspondence of reactant and product atoms. These atom-to-atom maps (AAM), however, are crucial for applications including chemical synthesis planning in organic chemistry and the analysis of isotope labeling experiments in modern metabolomics. AAMs therefore need to be reconstructed computationally. The situation is further aggravated by the fact that chemically correct AAMs are, fundamentally, determined by quantum-mechanical phenomena and thus cannot be reliably computed by solving graph-theoretical optimization problems defined by the reactant and product structures. A viable solution for this problem is to shift the focus to first identifying a partial AAM containing the reaction center, i.e., covering the atoms incident with all bonds that change during a reaction. This then leads to the problem of extending the partial map to the full reaction. The AAM of a reaction is faithfully represented by the Imaginary Transition State (ITS) graph, providing a convenient graph-theoretic framework to address the questions of when and how a partial AAM can be extended. We show that a unique extension exists if and only if the partial AAM covers the reaction center. In this case the extension can be computed by solving a constrained graph-isomorphism search between specific subgraphs of ITS graphs. We close by benchmarking different tools for this task.

Cite as

Marcos E. González Laffitte, Tieu-Long Phan, and Peter F. Stadler. Extension of Partial Atom-To-Atom Maps: Uniqueness and Algorithms. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 12:1-12:26, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{gonzalezlaffitte_et_al:LIPIcs.WABI.2025.12,
  author =	{Gonz\'{a}lez Laffitte, Marcos E. and Phan, Tieu-Long and Stadler, Peter F.},
  title =	{{Extension of Partial Atom-To-Atom Maps: Uniqueness and Algorithms}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{12:1--12:26},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.12},
  URN =		{urn:nbn:de:0030-drops-239410},
  doi =		{10.4230/LIPIcs.WABI.2025.12},
  annote =	{Keywords: atom-to-atom maps, imaginary transition state (ITS) graphs, condensed graph of the reaction (CGR), chemical reaction mechanisms, molecular graphs, metabolic networks, chemical synthesis planning, constrained graph isomorphism}
}
Document
Spark: Sparsified Hierarchical Energy Minimization of RNA Pseudoknots

Authors: Mateo Gray, Sebastian Will, and Hosna Jabbari


Abstract
Motivation. Determining RNA structure is essential for understanding RNA function and interaction networks. Although experimental techniques yield high‑accuracy structures, they are costly and time‑consuming; thus, computational approaches - especially minimum‑free‑energy (MFE) prediction algorithms - are indispensable. Accurately predicting pseudoknots, however, remains challenging because their inclusion usually leads to prohibitive computational complexity. Recent work demonstrated that sparsification can improve the efficiency of complex pseudoknot prediction algorithms such as Knotty. This finding suggests similar gains are possible for already efficient algorithms like HFold, which targets a complementary class of hierarchically constrained pseudoknots. Results. We introduce Spark, an exact, fully sparsified algorithm for predicting pseudoknotted RNA structures. Like its non‑sparsified predecessor HFold, Spark searches for the minimum‑energy structure under the HotKnots 2.0 energy model, a pseudoknot extension of the Turner model. Because the sparsification is non‑heuristic, Spark preserves the asymptotic time‑ and space‑complexity guarantees of HFold while greatly reducing the constant factors. We benchmarked the performance of Spark against HFold and, as a pseudoknot‑free baseline, RNAfold. Compared with HFold, Spark substantially lowers both run time and memory usage, while achieving run‑time figures close to those of RNAfold. Across all tested sequence lengths, Spark used the least memory and consistently ran faster than HFold. Conclusion. By extending non‑heuristic sparsification to hierarchical pseudoknot prediction, Spark delivers an exceptionally fast and memory‑efficient tool for accurate prediction of pseudoknotted RNA structures, enabling routine analysis of long sequences. The algorithm broadens the practical scope of computational RNA biology and provides a solid foundation for future advances in structure‑based functional annotation. Availability.
Spark’s implementation and detailed results are available at https://github.com/TheCOBRALab/Spark.
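For background on the interval dynamic programming that MFE folding algorithms refine, the classic Nussinov recursion maximizes the number of nested (pseudoknot-free) base pairs. This is a minimal sketch of that textbook baseline only, not Spark's sparsified energy model or the HotKnots 2.0 energy function:

```python
def nussinov(seq, min_loop=3):
    """Nussinov DP: maximum number of nested base pairs in an RNA sequence,
    with a minimum hairpin loop of min_loop unpaired bases."""
    pairs = {("A", "U"), ("U", "A"), ("G", "C"),
             ("C", "G"), ("G", "U"), ("U", "G")}
    n = len(seq)
    if n == 0:
        return 0
    dp = [[0] * n for _ in range(n)]
    for span in range(min_loop + 1, n):        # interval length j - i
        for i in range(n - span):
            j = i + span
            best = max(dp[i + 1][j], dp[i][j - 1])   # i or j unpaired
            if (seq[i], seq[j]) in pairs:            # pair i with j
                best = max(best, dp[i + 1][j - 1] + 1)
            for k in range(i + 1, j):                # bifurcation
                best = max(best, dp[i][k] + dp[k + 1][j])
            dp[i][j] = best
    return dp[0][n - 1]
```

Energy-based predictors such as HFold and Spark replace the pair count with Turner-model free energies and add further recursion cases, but keep this interval structure.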

Cite as

Mateo Gray, Sebastian Will, and Hosna Jabbari. Spark: Sparsified Hierarchical Energy Minimization of RNA Pseudoknots. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 13:1-13:18, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{gray_et_al:LIPIcs.WABI.2025.13,
  author =	{Gray, Mateo and Will, Sebastian and Jabbari, Hosna},
  title =	{{Spark: Sparsified Hierarchical Energy Minimization of RNA Pseudoknots}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{13:1--13:18},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.13},
  URN =		{urn:nbn:de:0030-drops-239383},
  doi =		{10.4230/LIPIcs.WABI.2025.13},
  annote =	{Keywords: RNA, MFE, Secondary Structure Prediction, Pseudoknot, Sparsification, Space Complexity, Time Complexity}
}
Document
Human Readable Compression of GFA Paths Using Grammar-Based Code

Authors: Peter Heringer and Daniel Doerr


Abstract
Pangenome graphs offer a compact and comprehensive representation of genomic diversity, improving tasks such as variant calling, genotyping, and other downstream analyses. Although the underlying graph structures scale sublinearly with the number of haplotypes, the widely used GFA file format suffers from rapidly growing file sizes due to the explicit and repetitive encoding of haplotype paths. In this work, we introduce an extension to the GFA format that enables efficient grammar-based compression of haplotype paths while retaining human readability. In addition, grammar-based encoding provides an efficient in-memory data structure that does not require decompression and even improves the runtime of many computational tasks that involve haplotype comparisons. We present sqz, a method that makes use of the proposed format extension to encode haplotype paths using byte pair encoding, a grammar-based compression scheme. We evaluate sqz on recent human pangenome graphs from Heumos et al. and the Human Pangenome Reference Consortium (HPRC), comparing it to the existing compressors bgzip, gbz, and sequitur. sqz scales sublinearly with the number of haplotypes in a pangenome graph and consistently achieves higher compression ratios than sequitur, up to 5 times better compression than bgzip on the HPRC graphs, and up to 10 times better on the graph from Heumos et al. When combined with bgzip, sqz matches or exceeds the compression ratio of gbz across all our datasets. These results demonstrate the potential of our proposed extension of the GFA format in reducing haplotype path redundancy and improving storage efficiency for pangenome graphs.
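Byte pair encoding, the grammar-based scheme sqz builds on, can be illustrated on a toy path of segment symbols: repeatedly replace the most frequent adjacent symbol pair with a fresh nonterminal, recording a grammar rule for each replacement. This is a generic sketch of the idea only, not sqz's actual format or implementation; all names are illustrative:

```python
from collections import Counter

def byte_pair_encode(seq, min_count=2):
    """Greedy byte pair encoding of a symbol sequence into a shorter
    sequence plus a straight-line grammar (nonterminal -> symbol pair)."""
    rules, next_id, seq = {}, 0, list(seq)
    while True:
        pair_counts = Counter(zip(seq, seq[1:]))
        if not pair_counts:
            break
        pair, count = pair_counts.most_common(1)[0]
        if count < min_count:
            break
        nt = f"R{next_id}"
        next_id += 1
        rules[nt] = pair
        out, i = [], 0            # replace non-overlapping occurrences
        while i < len(seq):
            if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
                out.append(nt)
                i += 2
            else:
                out.append(seq[i])
                i += 1
        seq = out
    return seq, rules

def expand(sym, rules):
    """Decode one symbol back to the original segment symbols."""
    if sym not in rules:
        return [sym]
    left, right = rules[sym]
    return expand(left, rules) + expand(right, rules)
```

Because the grammar can be traversed without decompressing, haplotype comparisons can operate directly on the encoded form, which is the property the abstract highlights.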

Cite as

Peter Heringer and Daniel Doerr. Human Readable Compression of GFA Paths Using Grammar-Based Code. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 14:1-14:19, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{heringer_et_al:LIPIcs.WABI.2025.14,
  author =	{Heringer, Peter and Doerr, Daniel},
  title =	{{Human Readable Compression of GFA Paths Using Grammar-Based Code}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{14:1--14:19},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.14},
  URN =		{urn:nbn:de:0030-drops-239395},
  doi =		{10.4230/LIPIcs.WABI.2025.14},
  annote =	{Keywords: pangenomics, pangenome graphs, compression, grammar-based code, byte pair encoding}
}
Document
Average-Tree Phylogenetic Diversity of Networks

Authors: Leo van Iersel, Mark Jones, Jannik Schestag, Celine Scornavacca, and Mathias Weller


Abstract
Phylogenetic diversity is a measure used to quantify the biodiversity of a set of species. Here, we introduce the "average-tree" phylogenetic diversity score in rooted binary phylogenetic networks and consider algorithms for computing and maximizing the score on a given network. Basically, the score is the weighted average of the phylogenetic diversity scores in all trees displayed by the network, with the weights determined by the inheritance probabilities on the reticulation edges used in the embeddings. We show that computing the score of a given set of taxa in a given network is #P-hard, directly implying #P-hardness of finding a subset of k taxa achieving maximum diversity score and, thereby, ruling out polynomial-time algorithms for these problems unless the polynomial hierarchy collapses. However, we show that both problems can be solved efficiently if the input network is close to being a tree in the sense that its reticulation number is small. More precisely, we prove that we can solve the optimization problem in networks with n leaves and r reticulations in 2^{𝒪(r)}⋅ n⋅ k time. Using experiments on data produced by simulating a reticulate-evolution process, we show that our algorithm runs efficiently on networks with hundreds of taxa and tens of reticulations.
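The network score above averages the classical phylogenetic diversity of the displayed trees. For background, a minimal sketch of tree phylogenetic diversity (the total branch length on the union of root-to-taxon paths), assuming a simple parent-pointer representation; this is the base measure, not the authors' network algorithm:

```python
def phylogenetic_diversity(parent, length, taxa):
    """Sum of branch lengths on the union of root-to-taxon paths.
    parent: child -> parent (root has no entry); length: child -> branch length."""
    used = set()
    for taxon in taxa:
        v = taxon
        while v in parent and v not in used:  # stop at root or a shared path
            used.add(v)
            v = parent[v]
    return sum(length[v] for v in used)
```

The average-tree score weights such per-tree values by the inheritance probabilities of the reticulation edges used in each embedding.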

Cite as

Leo van Iersel, Mark Jones, Jannik Schestag, Celine Scornavacca, and Mathias Weller. Average-Tree Phylogenetic Diversity of Networks. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 15:1-15:20, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{vaniersel_et_al:LIPIcs.WABI.2025.15,
  author =	{van Iersel, Leo and Jones, Mark and Schestag, Jannik and Scornavacca, Celine and Weller, Mathias},
  title =	{{Average-Tree Phylogenetic Diversity of Networks}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{15:1--15:20},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.15},
  URN =		{urn:nbn:de:0030-drops-239405},
  doi =		{10.4230/LIPIcs.WABI.2025.15},
  annote =	{Keywords: phylogenetic diversity, phylogenetic networks, network phylogenetic diversity, algorithms, computational complexity}
}
Document
Estimation of Substitution and Indel Rates via k-mer Statistics

Authors: Mahmudur Rahman Hera, Paul Medvedev, David Koslicki, and Antonio Blanca


Abstract
Methods utilizing k-mers are widely used in bioinformatics, yet our understanding of their statistical properties under realistic mutation models remains incomplete. Previously, substitution-only mutation models have been considered to derive precise expectations and variances for mutated k-mers and intervals of mutated and non-mutated sequences. In this work, we consider a mutation model that incorporates insertions and deletions in addition to single-nucleotide substitutions. Within this framework, we derive closed-form k-mer-based estimators for the three fundamental mutation parameters: the substitution, deletion, and insertion rates. We provide theoretical guarantees in the form of concentration inequalities, ensuring the accuracy of our estimators under reasonable model assumptions. Empirical evaluations on simulated evolution of genomic sequences confirm our theoretical findings, demonstrating that accounting for insertion and deletion signals allows for accurate estimation of mutation rates and improves upon the results obtained by considering a substitution-only model. An implementation estimating the mutation parameters from a pair of FASTA files is available here: https://github.com/KoslickiLab/estimate_rates_using_mutation_model.git. The results presented in this manuscript can be reproduced using the code available here: https://github.com/KoslickiLab/est_rates_experiments.git.
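The substitution-only baseline that this work extends can be stated compactly: if each position mutates independently with probability p, a k-mer survives unchanged with probability (1-p)^k, giving the point estimate p = 1 - q^(1/k) from the retained fraction q of k-mers. A sketch of that baseline only (not the paper's indel-aware estimators; function names are illustrative):

```python
def kmer_set(s, k):
    """Distinct k-mers of a string."""
    return {s[i:i + k] for i in range(len(s) - k + 1)}

def estimate_substitution_rate(ref, mutated, k):
    """Substitution-only point estimate: under i.i.d. substitutions at rate p,
    a k-mer survives with probability (1-p)^k, so p = 1 - q^(1/k), where q is
    the fraction of reference k-mers retained in the mutated sequence."""
    ref_kmers = kmer_set(ref, k)
    q = len(ref_kmers & kmer_set(mutated, k)) / len(ref_kmers)
    return 1.0 - q ** (1.0 / k)
```

Indels both destroy k-mers and shift the surviving ones, which is why the paper derives separate closed-form estimators for the substitution, deletion, and insertion rates rather than using this single relation.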

Cite as

Mahmudur Rahman Hera, Paul Medvedev, David Koslicki, and Antonio Blanca. Estimation of Substitution and Indel Rates via k-mer Statistics. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 16:1-16:15, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{rahmanhera_et_al:LIPIcs.WABI.2025.16,
  author =	{Rahman Hera, Mahmudur and Medvedev, Paul and Koslicki, David and Blanca, Antonio},
  title =	{{Estimation of Substitution and Indel Rates via k-mer Statistics}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{16:1--16:15},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.16},
  URN =		{urn:nbn:de:0030-drops-239422},
  doi =		{10.4230/LIPIcs.WABI.2025.16},
  annote =	{Keywords: k-mers, mutation rate, indel, alignment-free, estimation, substitution, insertion, deletion}
}
Document
An Efficient Data Structure and Algorithm for Long-Match Query in Run-Length Compressed BWT

Authors: Ahsan Sanaullah, Degui Zhi, and Shaojie Zhang


Abstract
String matching problems in bioinformatics typically involve finding exact substring matches between a query and a reference text. Previous formulations often focus on maximum exact matches (MEMs). However, multiple occurrences of substrings of the query in the text that are long enough but not maximal may not be captured by MEMs. Such long matches can be informative, especially when the text is a collection of similar sequences such as genomes. In this paper, we describe a new type of match between a pattern and a text that is not necessarily maximal in the query but still contains useful matching information: locally maximal exact matches (LEMs). There is usually a large number of LEMs, so we consider only those above some length threshold ℒ. These are referred to as long LEMs. The purpose of long LEMs is to capture substring matches between a query and a text that are not necessarily maximal in the pattern but still long enough to be important. Therefore, efficient algorithms for finding long LEMs are desired for such datasets. However, these datasets are too large to query on traditional string indexes. Fortunately, these datasets are very repetitive. Recently, compressed string indexes that take advantage of the redundancy in the data but retain efficient querying capability have been proposed as a solution. We therefore give an efficient algorithm for computing all the long LEMs of a query and a text in a BWT runs compressed string index. We describe an O(m+occ) expected time algorithm that relies on an O(r)-word string index for outputting all long LEMs of a pattern with respect to a text given the matching statistics of the pattern with respect to the text. Here m is the length of the query, occ is the number of long LEMs outputted, and r is the number of runs in the BWT of the text. The O(r)-word string index we describe relies on an adaptation of the move data structure by Nishimoto and Tabei.
We are able to support LCP[i] queries in constant time given SA[i]. In other words, we answer PLCP[i] queries in constant time. These PLCP queries enable the efficient long LEM query. Long LEMs may provide useful similarity information between a pattern and a text that MEMs may ignore. This information is particularly useful in pangenome and biobank scale haplotype panel contexts.

Cite as

Ahsan Sanaullah, Degui Zhi, and Shaojie Zhang. An Efficient Data Structure and Algorithm for Long-Match Query in Run-Length Compressed BWT. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 17:1-17:25, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{sanaullah_et_al:LIPIcs.WABI.2025.17,
  author =	{Sanaullah, Ahsan and Zhi, Degui and Zhang, Shaojie},
  title =	{{An Efficient Data Structure and Algorithm for Long-Match Query in Run-Length Compressed BWT}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{17:1--17:25},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.17},
  URN =		{urn:nbn:de:0030-drops-239433},
  doi =		{10.4230/LIPIcs.WABI.2025.17},
  annote =	{Keywords: BWT, LEM, Long LEM, MEM, Run Length Compressed BWT, Move Data Structure, Pangenome}
}
Document
Identifying Breakpoint Median Genomes: A Branching Algorithm Approach

Authors: Poly H. da Silva, Arash Jamshidpey, and David Sankoff


Abstract
Genome comparison often involves quantifying dissimilarities between genomes with identical gene sets, commonly using breakpoints - points where adjacent genes in one genome are not adjacent in another. The concept of a median genome, used for comparison of multiple genomes, aims to find a genome that minimizes the total distance to all genomes in a given set. While median genomes are useful for extracting common genomic information and estimating ancestral traits, the existence of multiple divergent medians raises concerns about their accuracy in reflecting the true ancestor. The median problem is known to be NP-hard, particularly for unichromosomal genomes, and solving it becomes increasingly challenging under different genome distance models. In this work, we introduce a novel branching algorithm to efficiently find all breakpoint medians of k linear unichromosomal genomes, represented as unsigned permutations. This algorithm constructs a rooted labeled tree, where the sequence of labels along each complete ray defines a genome, providing a structured and efficient way to explore the space of candidate medians by narrowing the search to a well-defined and significantly smaller subset of the permutation space. We validate our approach with experiments on randomly generated sets of three permutations. The results show that our method successfully finds the exact medians and also identifies many near-optimal approximations. Our experiments further show that most medians lie relatively close to the input permutations, in agreement with prior theoretical results.
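A breakpoint, as used above, is an adjacency of one genome that is absent from the other. A minimal sketch of the pairwise breakpoint distance for unsigned unichromosomal genomes, treating adjacencies as unordered pairs and omitting telomere caps for simplicity; this illustrates the distance being minimized, not the authors' branching algorithm for medians:

```python
def breakpoint_distance(g1, g2):
    """Number of adjacencies of g1 absent from g2 (unsigned genomes as
    permutations; {a, b} matches both orders, so a reversed genome is
    at distance 0). Telomere caps are omitted in this simple version."""
    adjacencies2 = {frozenset(p) for p in zip(g2, g2[1:])}
    return sum(1 for p in zip(g1, g1[1:]) if frozenset(p) not in adjacencies2)
```

A breakpoint median of genomes g1, ..., gk is then a permutation minimizing the sum of this distance to all k inputs, the quantity the branching algorithm searches for.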

Cite as

Poly H. da Silva, Arash Jamshidpey, and David Sankoff. Identifying Breakpoint Median Genomes: A Branching Algorithm Approach. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 18:1-18:18, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{dasilva_et_al:LIPIcs.WABI.2025.18,
  author =	{da Silva, Poly H. and Jamshidpey, Arash and Sankoff, David},
  title =	{{Identifying Breakpoint Median Genomes: A Branching Algorithm Approach}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{18:1--18:18},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.18},
  URN =		{urn:nbn:de:0030-drops-239447},
  doi =		{10.4230/LIPIcs.WABI.2025.18},
  annote =	{Keywords: Breakpoint distance, median genomes, phylogeny reconstruction, random permutations}
}
Document
Which Phylogenetic Networks Are Level-k Networks with Additional Arcs? Structure and Algorithms

Authors: Takatora Suzuki and Momoko Hayamizu


Abstract
Reticulate evolution gives rise to complex phylogenetic networks, making their interpretation challenging. A typical approach is to extract trees within such networks. Since Francis and Steel’s seminal paper, "Which Phylogenetic Networks are Merely Trees with Additional Arcs?" (2015), tree-based phylogenetic networks and their support trees (spanning trees with the same root and leaf-set as a given network) have been extensively studied. However, not all phylogenetic networks are tree-based, and for the study of reticulate evolution, it is often more biologically relevant to identify support networks rather than trees. This study generalizes Hayamizu’s structure theorem, which yielded optimal algorithms for various computational problems on support trees of rooted almost-binary phylogenetic networks, to extend the theoretical framework for support trees to support networks. This allows us to obtain a direct-product characterization of each of three sets: all, minimal, and minimum support networks, for a given network. Each characterization yields optimal algorithms for counting and generating the support networks of each type. Applications include a linear-time algorithm for finding a support network with the fewest reticulations (i.e., the minimum tier). We also provide exact and heuristic algorithms for finding a support network with the minimum level, both running in exponential time but practical across a reasonably wide range of reticulation numbers.

Cite as

Takatora Suzuki and Momoko Hayamizu. Which Phylogenetic Networks Are Level-k Networks with Additional Arcs? Structure and Algorithms. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 19:1-19:19, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{suzuki_et_al:LIPIcs.WABI.2025.19,
  author =	{Suzuki, Takatora and Hayamizu, Momoko},
  title =	{{Which Phylogenetic Networks Are Level-k Networks with Additional Arcs? Structure and Algorithms}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{19:1--19:19},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.19},
  URN =		{urn:nbn:de:0030-drops-239454},
  doi =		{10.4230/LIPIcs.WABI.2025.19},
  annote =	{Keywords: Phylogenetic networks, Support networks, Level-k networks, Tier-k networks, Structure theorem, Enumeration, Optimization}
}
Document
A k-mer-Based Estimator of the Substitution Rate Between Repetitive Sequences

Authors: Haonan Wu, Antonio Blanca, and Paul Medvedev


Abstract
K-mer-based analysis of genomic data is ubiquitous, but the presence of repetitive k-mers continues to pose problems for the accuracy of many methods. For example, the Mash tool (Ondov et al. 2016) can accurately estimate the substitution rate between two low-repetitive sequences from their k-mer sketches; however, it is inaccurate on repetitive sequences such as the centromere of a human chromosome. Follow-up work by Blanca et al. (2021) has attempted to model how mutations affect k-mer sets based on strong assumptions that the sequence is non-repetitive and that mutations do not create spurious k-mer matches. However, the theoretical foundations for extending an estimator like Mash to work in the presence of repeat sequences have been lacking. In this work, we relax the non-repetitive assumption and propose a novel estimator for the mutation rate. We derive theoretical bounds on our estimator’s bias. Our experiments show that it remains accurate for repetitive genomic sequences, such as the alpha satellite higher order repeats in centromeres. We demonstrate our estimator’s robustness across diverse datasets and various ranges of the substitution rate and k-mer size. Finally, we show how sketching can be used to avoid dealing with large k-mer sets while retaining accuracy. Our software is available at https://github.com/medvedevgroup/Repeat-Aware_Substitution_Rate_Estimator.

Cite as

Haonan Wu, Antonio Blanca, and Paul Medvedev. A k-mer-Based Estimator of the Substitution Rate Between Repetitive Sequences. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 20:1-20:20, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)



@InProceedings{wu_et_al:LIPIcs.WABI.2025.20,
  author =	{Wu, Haonan and Blanca, Antonio and Medvedev, Paul},
  title =	{{A k-mer-Based Estimator of the Substitution Rate Between Repetitive Sequences}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{20:1--20:20},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.20},
  URN =		{urn:nbn:de:0030-drops-239465},
  doi =		{10.4230/LIPIcs.WABI.2025.20},
  annote =	{Keywords: k-mers, sketching, mutation rates}
}
Document
Linear-Space Subquadratic-Time String Alignment Algorithm for Arbitrary Scoring Matrices

Authors: Ryosuke Yamano and Tetsuo Shibuya


Abstract
Theoretically, the fastest algorithm by Crochemore et al. for computing the alignment of two given strings of size n over a constant alphabet takes O(n²/log n) time. The algorithm uses Lempel–Ziv parsing to divide the dynamic programming matrix into blocks and utilizes the repetitive structure. It is the only previously known subquadratic-time algorithm that can handle scoring matrices of arbitrary weights. However, this algorithm takes O(n²/log n) space, and reducing the space while preserving the time complexity has been an open problem for more than 20 years. We present a solution to this issue by achieving an O(n) space algorithm that maintains O(n²/log n) time. The classical refinement by Hirschberg reduces the space complexity of the textbook O(n²) algorithm to O(n) while preserving the quadratic time. However, applying this technique to the algorithm of Crochemore et al. has been considered challenging because their method requires O(n² / log n) space even when computing only the alignment score. Our modification enables the application of Hirschberg’s refinement, allowing traceback computation in O(n) space while preserving the O(n² / log n) overall time complexity. Our algorithm can be applied to both global and local string alignment problems.
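The linear-space score computation that Hirschberg's refinement builds on can be sketched as follows. This is the textbook O(n²)-time, O(n)-space dynamic program with an arbitrary scoring matrix, not the O(n²/log n) algorithm of Crochemore et al. or the paper's modification; the function name and the dict-based scoring-matrix representation are illustrative.

```python
def align_score_linear_space(x, y, score, gap):
    """Global alignment (Needleman-Wunsch) score using only two DP rows,
    i.e. O(min-side) space instead of a full O(n^2) table.
    `score` maps character pairs to arbitrary weights; `gap` is the
    (typically negative) per-character gap penalty."""
    prev = [j * gap for j in range(len(y) + 1)]  # DP row for i - 1
    for i in range(1, len(x) + 1):
        cur = [i * gap] + [0] * len(y)           # DP row for i
        for j in range(1, len(y) + 1):
            cur[j] = max(prev[j - 1] + score[x[i - 1], y[j - 1]],  # (mis)match
                         prev[j] + gap,                            # gap in y
                         cur[j - 1] + gap)                         # gap in x
        prev = cur
    return prev[-1]
```

Hirschberg's refinement recovers the alignment itself (not just the score) in linear space by running this forward pass and a symmetric backward pass to find a midpoint of the optimal path, then recursing on the two halves.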

Cite as

Ryosuke Yamano and Tetsuo Shibuya. Linear-Space Subquadratic-Time String Alignment Algorithm for Arbitrary Scoring Matrices. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 21:1-21:14, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)


Copy BibTex To Clipboard

@InProceedings{yamano_et_al:LIPIcs.WABI.2025.21,
  author =	{Yamano, Ryosuke and Shibuya, Tetsuo},
  title =	{{Linear-Space Subquadratic-Time String Alignment Algorithm for Arbitrary Scoring Matrices}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{21:1--21:14},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.21},
  URN =		{urn:nbn:de:0030-drops-239479},
  doi =		{10.4230/LIPIcs.WABI.2025.21},
  annote =	{Keywords: String alignment, dynamic programming, linear space algorithms}
}
Document
Design of Worst-Case-Optimal Spaced Seeds

Authors: Jens Zentgraf and Sven Rahmann


Abstract
Read mapping (and alignment) is a fundamental problem in biological sequence analysis. For speed and computational efficiency, many popular read mappers tolerate only a few differences between the read and the corresponding part of the reference genome, which leads to reference bias: Reads with too many differences are not guaranteed to be mapped correctly or at all, because to even consider a genomic position, a sufficiently long exact match (seed) must exist. While pangenomes and their graph-based representations provide one way to avoid reference bias by enlarging the reference, we explore an orthogonal approach and consider stronger substitution-tolerant primitives, namely spaced seeds or gapped k-mers. Given two integers k ≤ w, one considers k selected positions, described by a mask, from each length-w window in a sequence. In the existing literature, masks with certain probabilistic guarantees have been designed for small values of k. Here, for the first time, we take a combinatorial approach from a worst-case perspective. For any mask, using integer linear programs, we find least favorable distributions of sequence changes in two different senses: (1) minimizing the number of unchanged windows; (2) minimizing the number of positions covered by unchanged windows. Then, among all masks or all symmetric masks of a given shape (k,w), we find the set of best masks that maximize these minima. As a result, we obtain robust masks, even for large numbers of changes. We illustrate the properties of these masks by constructing a challenging set of reads that contain many approximately equidistributed substitutions (but no indels) that many existing tools cannot map, even though they are in principle easily mappable (apart from the large number of changes) because they originate from selected non-repetitive regions of the human reference genome. 
We observe that the majority of these reads can be mapped with a simple alignment-free approach using the chosen spaced masks, whereas seeding approaches based on contiguous k-mers fail.
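The spaced-seed primitive itself can be sketched in a few lines: a mask selects k positions from each length-w window, so a substitution at an unselected position leaves that window's seed intact. This is only an illustration of the gapped k-mer extraction step; the mask and function names are hypothetical, not those of the paper.

```python
def spaced_seeds(seq, mask):
    """Extract gapped k-mers: for each length-w window of seq, keep the
    characters at the mask's selected positions ('1' = keep, '0' = skip).
    Substitutions at '0' positions do not change the extracted seed,
    which is what makes spaced seeds substitution-tolerant."""
    w = len(mask)
    keep = [i for i, c in enumerate(mask) if c == "1"]
    return [
        "".join(seq[i + p] for p in keep)
        for i in range(len(seq) - w + 1)
    ]
```

For example, with mask "101" a substitution at the middle position of a window changes the window but not its seed, so that window still matches the reference.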

Cite as

Jens Zentgraf and Sven Rahmann. Design of Worst-Case-Optimal Spaced Seeds. In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 22:1-22:17, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)


Copy BibTex To Clipboard

@InProceedings{zentgraf_et_al:LIPIcs.WABI.2025.22,
  author =	{Zentgraf, Jens and Rahmann, Sven},
  title =	{{Design of Worst-Case-Optimal Spaced Seeds}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{22:1--22:17},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.22},
  URN =		{urn:nbn:de:0030-drops-239488},
  doi =		{10.4230/LIPIcs.WABI.2025.22},
  annote =	{Keywords: Spaced seed, Gapped k-mer, Integer linear program (ILP), Worst-case design, Reference bias}
}
Document
Extended Abstract
Partitioned Multi-MUM Finding for Scalable Pangenomics (Extended Abstract)

Authors: Vikram S. Shivakumar and Ben Langmead


Abstract
Pangenome collections continue to grow and proliferate to hundreds of high-quality genomes, for example, the expanded v2 version of the Human Pangenome Reference Consortium (HPRC) dataset spanning 474 human haplotypes [Liao et al., 2023]. As the size and complexity of these collections grow, it is increasingly important that our methods for studying and indexing pangenomes be scalable and updateable. Maximal Unique Matches (multi-MUMs), exact substring matches present exactly once in all sequences in a pangenome collection, represent conserved anchor sequences that can comprise a common coordinate system. We previously proposed a framework and tool called Mumemto for rapidly identifying multi-MUMs during construction of a compressed pangenome index [Shivakumar and Langmead, 2025]. Using prefix-free parsing (PFP) [Boucher et al., 2019], a compressed-space method for computing full-text indexes, Mumemto outperforms existing methods for identifying multi-MUMs. However, two drawbacks remain: updateability and scalability. Mumemto can become memory-intensive for large pangenomes (> 300 human genomes), and as newly assembled genomes are added to a pangenome collection, Mumemto requires re-running on the entire updated collection. To address this, we developed a partition-merging approach to compute multi-MUMs with Mumemto. We introduce two strategies for merging multi-MUMs computed across different collections (see Figure 1), enabling parallelization across partitions and simple computation of multi-MUMs for incrementally-updated collections. The first strategy requires a common sequence in each partition (which we call "anchor-based merging"), which serves as a coordinate system to identify multi-MUM overlaps between partitions. By tracking the next longest match for all multi-MUMs and unique matches (UMs) in an auxiliary data structure, intersections between matches can be filtered out if they are no longer unique in the union collection.
The second strategy identifies overlaps directly from the multi-MUM substrings (called "string-based merging"). The overlaps are identified by running Mumemto over the extracted multi-MUM sequences and are similarly filtered out if they are too short to be considered unique. Lastly, we propose an extension to anchor-based merging to enable the computation of partial multi-MUMs, present in only a subset of sequences in the union set. The partition-merging framework introduces a tradeoff space in Mumemto between running time and memory, depending on partition size and the number of threads. Running parallel, per-partition Mumemto processes and merging the results reduces the running time but increases the peak memory footprint, while running a single Mumemto thread over each partition serially yields a longer running time but a smaller memory footprint. To evaluate this tradeoff, we computed multi-MUMs across 474 haplotypes of chr19 from the HPRC v2 dataset [Liao et al., 2023] and 69 assemblies of A. thaliana [Lian et al., 2024] (Table 1). The string-based method also enables merging multi-MUMs between disjoint collections, for example subclades in a phylogenetic tree. By merging multi-MUMs along the shape of the tree, we can compute matches at internal nodes of the tree along with the root, revealing clade-specific conservation and structural variation. Multi-MUM merging also enables interspecific match computation, which was previously infeasible with Mumemto due to high memory usage for highly-diverse input sequence collections. We used partition-merging to compute multi-MUMs across 29 primate assemblies and found a correspondence to ultraconserved elements previously found across mammalian genomes [Cummins et al., 2024]. We show that a partitioned Mumemto enables scalability to growing pangenome collections and expands the applicability of Mumemto to larger, more diverse datasets.
As a result, Mumemto is the only method capable of computing exact matches across the entire HPRC v2 dataset (474 haplotypes [Liao et al., 2023]), and can easily incorporate future releases of assemblies without recomputation. This increases the scope for exploration of genomic conservation and variation and highlights the potential for Mumemto as a core method for future pangenomics and comparative genomics research. The partitioned Mumemto framework is implemented in v1.3.0 and is available open-source at https://github.com/vikshiv/mumemto.
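For intuition about the objects being merged, a multi-MUM can be characterized by a brute-force reference implementation on small inputs: a substring occurring exactly once in every sequence, kept only if it is not contained in a longer such substring. This sketch is illustrative only (names are hypothetical); Mumemto itself computes multi-MUMs in compressed space via prefix-free parsing rather than by enumeration.

```python
def multi_mums(seqs, min_len=1):
    """Brute-force multi-MUMs: substrings that occur exactly once in every
    sequence, filtered to maximal ones. Exponentially slower than Mumemto's
    PFP-based approach; usable only as a tiny reference on toy inputs."""
    first = seqs[0]

    def count(s, sub):
        # Count (possibly overlapping) occurrences of sub in s.
        return sum(1 for i in range(len(s) - len(sub) + 1)
                   if s.startswith(sub, i))

    candidates = set()
    n = len(first)
    for i in range(n):
        for j in range(i + min_len, n + 1):
            sub = first[i:j]
            if all(count(s, sub) == 1 for s in seqs):
                candidates.add(sub)
    # Keep only maximal matches: since each candidate is unique in every
    # sequence, a candidate contained in a longer candidate is extendable.
    return sorted(m for m in candidates
                  if not any(m != o and m in o for o in candidates))
```

The partition-merging strategies described above start from per-partition outputs of this kind and discard matches that lose uniqueness (or maximality) once the partitions are combined.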

Cite as

Vikram S. Shivakumar and Ben Langmead. Partitioned Multi-MUM Finding for Scalable Pangenomics (Extended Abstract). In 25th International Conference on Algorithms for Bioinformatics (WABI 2025). Leibniz International Proceedings in Informatics (LIPIcs), Volume 344, pp. 23:1-23:4, Schloss Dagstuhl – Leibniz-Zentrum für Informatik (2025)


Copy BibTex To Clipboard

@InProceedings{shivakumar_et_al:LIPIcs.WABI.2025.23,
  author =	{Shivakumar, Vikram S. and Langmead, Ben},
  title =	{{Partitioned Multi-MUM Finding for Scalable Pangenomics}},
  booktitle =	{25th International Conference on Algorithms for Bioinformatics (WABI 2025)},
  pages =	{23:1--23:4},
  series =	{Leibniz International Proceedings in Informatics (LIPIcs)},
  ISBN =	{978-3-95977-386-7},
  ISSN =	{1868-8969},
  year =	{2025},
  volume =	{344},
  editor =	{Brejov\'{a}, Bro\v{n}a and Patro, Rob},
  publisher =	{Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address =	{Dagstuhl, Germany},
  URL =		{https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.WABI.2025.23},
  URN =		{urn:nbn:de:0030-drops-239490},
  doi =		{10.4230/LIPIcs.WABI.2025.23},
  annote =	{Keywords: Pangenomics, Comparative genomics, Compressed indexing}
}