@article{64bdcb47fa22482ab953c73d9e822379, title = "GenMap: ultra-fast computation of genome mappability", abstract = "MOTIVATION: Computing the uniqueness of k-mers for each position of a genome while allowing for up to e mismatches is computationally challenging. However, it is crucial for many biological applications such as the design of guide RNA for CRISPR experiments. More formally, the uniqueness or (k, e)-mappability can be described for every position as the reciprocal value of how often this k-mer occurs approximately in the genome, i.e. with up to e mismatches. RESULTS: We present a fast method GenMap to compute the (k, e)-mappability. We extend the mappability algorithm, such that it can also be computed across multiple genomes where a k-mer occurrence is only counted once per genome. This allows for the computation of marker sequences or finding candidates for probe design by identifying approximate k-mers that are unique to a genome or that are present in all genomes. GenMap supports different formats such as binary output, wig and bed files as well as csv files to export the location of all approximate k-mers for each genomic position. AVAILABILITY AND IMPLEMENTATION: GenMap can be installed via bioconda. Binaries and C++ source code are available on https://github.com/cpockrandt/genmap.", author = "Christopher Pockrandt and Mai Alzamel and Iliopoulos, {Costas S.} and Knut Reinert", year = "2020", month = "6", day = "1", doi = "10.1093/bioinformatics/btaa222", language = "English", volume = "36", pages = "3687--3692", journal = "Bioinformatics (Oxford, England)", issn = "1367-4811", number = "12", }
@article{494a1cbddd84401e94ac745ec58e638e, title = "Faster algorithms for 1-mappability of a sequence", abstract = "In the k-mappability problem, we are given a string x of length n and integers m and k, and we are asked to count, for each length-m factor y of x, the number of other factors of length m of x that are at Hamming distance at most k from y. We focus here on the version of the problem where k=1. There exists an algorithm to solve this problem for k=1 requiring time O(mnlogn/loglogn) using space O(n). Here we present two new algorithms that require worst-case time O(mn) and O(nlognloglogn), respectively, and space O(n), thus greatly improving the previous result. Moreover, we present another algorithm that requires average-case time and space O(n) for integer alphabets of size σ if m=Ω(log σn). Notably, we show that this algorithm is generalizable for arbitrary k, requiring average-case time O(kn) and space O(n) if m=Ω(klog σn), assuming that the letters are independent and uniformly distributed random variables. Finally, we provide an experimental evaluation of our average-case algorithm demonstrating its competitiveness to the state-of-the-art implementation.", keywords = "Algorithms on strings, Hamming distance, Sequence mappability", author = "Mai Alzamel and Panagiotis Charalampopoulos and Iliopoulos, {Costas S.} and Pissis, {Solon P.} and Jakub Radoszewski and Wing-Kin Sung", year = "2019", month = "5", day = "23", doi = "10.1016/j.tcs.2019.04.026", language = "English", journal = "Theoretical Computer Science", issn = "0304-3975", publisher = "Elsevier", }
@article{a57e0f5cc48746efb0094195583ee140, title = "Off-line and on-line algorithms for closed string factorization", abstract = "A string X=X[1..n], n>1, is said to be closed if it has a nonempty proper prefix that is also a suffix, but that otherwise occurs nowhere else in X; for n=1, every X is closed. Closed strings were introduced by Fici in [1] as objects of combinatorial interest. Recently Badkobeh et al.[2] described a variety of algorithms to factor a given string into closed factors. In particular, they studied the Longest Closed Factorization (LCF) problem, which greedily computes the decomposition X=X1X2⋯Xk, where X1 is the longest closed prefix of X, X2 the longest closed prefix of X with prefix X1 removed, and so on. In this paper we present an O(logn) amortized per character algorithm to compute LCF on-line, where n is the length of the string. We also introduce the Minimum Closed Factorization (MCF) problem, which identifies the minimum number of closed factors that cover X. We first describe an off-line O(nlog2n)-time algorithm to compute MCF(X), then we present an on-line algorithm for the same problem. In fact, we show that MCF(X) can be computed in O(Llogn) time from MCF(X′), where X′=X[1..n−1], and L is the largest integer such that the suffix X[n−L+1..n] is a substring of X′.", keywords = "Closed string, On-line, Minimum factorization", author = "Mai Alzamel and Iliopoulos, {Costas S.} and W.F. Smyth and Wing-Kin Sung", year = "2018", month = "10", day = "30", doi = "10.1016/j.tcs.2018.10.033", language = "English", journal = "Theoretical Computer Science", issn = "0304-3975", publisher = "Elsevier", }
@article{7930cd53f3664c09b011342b26c4e343, title = "Efficient Computation of Palindromes in Sequences with Uncertainties Extension Version", author = "Alzamel, {Mai Abdulaziz M} and Chang Liu and Jia Gao and Costas Iliopoulos", year = "2018", language = "English", volume = "163", pages = "253--266", journal = "FUNDAMENTA INFORMATICAE", issn = "0169-2968", publisher = "IOS Press", number = "3", }