String searching functions
A set of functions for finding all the occurrences (aka "matches" or "hits") of a given pattern (typically short) in a (typically long) reference sequence or set of reference sequences (aka the subject)
matchPattern(pattern, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto") countPattern(pattern, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto") vmatchPattern(pattern, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", ...) vcountPattern(pattern, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", ...)
pattern |
The pattern string. |
subject |
An XString, XStringViews or MaskedXString
object for An XStringSet or XStringViews object for
|
max.mismatch, min.mismatch |
The maximum and minimum number of mismatching letters allowed (see
|
with.indels |
If (a) nedit(P, S') <= max.mismatch (b) for every substring S1 of S': nedit(P, S1) > nedit(P, S') (c) for every substring S2 of S that contains S': nedit(P, S2) >= nedit(P, S') One nice property of "best local matches" is that their first and last letters are guaranteed to match the letters in P that they align with. |
fixed |
If |
algorithm |
One of the following: |
... |
Additional arguments for methods. |
Available algorithms are: “naive exact”, “naive inexact”,
“Boyer-Moore-like”, “shift-or” and “indels”.
Not all of them can be used in all situations: restrictions
apply depending on the "search criteria" i.e. on the values of
the pattern
, subject
, max.mismatch
,
min.mismatch
, with.indels
and fixed
arguments.
It is important to note that the algorithm
argument
is not part of the search criteria. This is because the supported
algorithms are interchangeable, that is, if 2 different algorithms
are compatible with a given search criteria, then choosing one or
the other will not affect the result (but will most likely affect
the performance). So there is no "wrong choice" of algorithm (strictly
speaking).
Using algorithm="auto"
(the default) is recommended because
then the best suited algorithm will automatically be selected among
the set of algorithms that are valid for the given search criteria.
An XStringViews object for matchPattern
.
A single integer for countPattern
.
An MIndex object for vmatchPattern
.
An integer vector for vcountPattern
, with each element in
the vector corresponding to the number of matches in the corresponding
element of subject
.
Use matchPDict
if you need to match a (big) set of patterns
against a reference sequence.
Use pairwiseAlignment
if you need to solve a (Needleman-Wunsch)
global alignment, a (Smith-Waterman) local alignment, or an (ends-free)
overlap alignment problem.
## --------------------------------------------------------------------- ## A. matchPattern()/countPattern() ## --------------------------------------------------------------------- ## A simple inexact matching example with a short subject: x <- DNAString("AAGCGCGATATG") m1 <- matchPattern("GCNNNAT", x) m1 m2 <- matchPattern("GCNNNAT", x, fixed=FALSE) m2 as.matrix(m2) ## With DNA sequence of yeast chromosome number 1: data(yeastSEQCHR1) yeast1 <- DNAString(yeastSEQCHR1) PpiI <- "GAACNNNNNCTC" # a restriction enzyme pattern match1.PpiI <- matchPattern(PpiI, yeast1, fixed=FALSE) match2.PpiI <- matchPattern(PpiI, yeast1, max.mismatch=1, fixed=FALSE) ## With a genome containing isolated Ns: library(BSgenome.Celegans.UCSC.ce2) chrII <- Celegans[["chrII"]] alphabetFrequency(chrII) matchPattern("N", chrII) matchPattern("TGGGTGTCTTT", chrII) # no match matchPattern("TGGGTGTCTTT", chrII, fixed=FALSE) # 1 match ## Using wildcards ("N") in the pattern on a genome containing N-blocks: library(BSgenome.Dmelanogaster.UCSC.dm3) chrX <- maskMotif(Dmelanogaster$chrX, "N") as(chrX, "Views") # 4 non masked regions matchPattern("TTTATGNTTGGTA", chrX, fixed=FALSE) ## Can also be achieved with no mask: masks(chrX) <- NULL matchPattern("TTTATGNTTGGTA", chrX, fixed="subject") ## --------------------------------------------------------------------- ## B. vmatchPattern()/vcountPattern() ## --------------------------------------------------------------------- ## Load Fly upstream sequences (i.e. the sequences 2000 bases upstream of ## annotated transcription starts): dm3_upstream_filepath <- system.file("extdata", "dm3_upstream2000.fa.gz", package="Biostrings") dm3_upstream <- readDNAStringSet(dm3_upstream_filepath) dm3_upstream Ebox <- DNAString("CANNTG") subject <- dm3_upstream mindex <- vmatchPattern(Ebox, subject, fixed="subject") nmatch_per_seq <- elementNROWS(mindex) # Get the number of matches per # subject element. sum(nmatch_per_seq) # Total number of matches. table(nmatch_per_seq) ## Let's have a closer look at one of the upstream sequences with most ## matches: i0 <- which.max(nmatch_per_seq) subject0 <- subject[[i0]] ir0 <- mindex[[i0]] # matches in 'subject0' as an IRanges object ir0 Views(subject0, ir0) # matches in 'subject0' as a Views object ## --------------------------------------------------------------------- ## C. WITH INDELS ## --------------------------------------------------------------------- library(BSgenome.Celegans.UCSC.ce2) subject <- Celegans$chrI pattern1 <- DNAString("ACGGACCTAATGTTATC") pattern2 <- DNAString("ACGGACCTVATGTTRTC") ## Allowing up to 2 mismatching letters doesn't give any match: m1a <- matchPattern(pattern1, subject, max.mismatch=2) ## But allowing up to 2 edit operations gives 3 matches: system.time(m1b <- matchPattern(pattern1, subject, max.mismatch=2, with.indels=TRUE)) m1b ## pairwiseAlignment() returns the (first) best match only: if (interactive()) { mat <- nucleotideSubstitutionMatrix(match=1, mismatch=0, baseOnly=TRUE) ## Note that this call to pairwiseAlignment() will need to ## allocate 733.5 Mb of memory (i.e. length(pattern) * length(subject) ## * 3 bytes). system.time(pwa <- pairwiseAlignment(pattern1, subject, type="local", substitutionMatrix=mat, gapOpening=0, gapExtension=1)) pwa } ## With IUPAC ambiguities in the pattern: m2a <- matchPattern(pattern2, subject, max.mismatch=2, fixed="subject") m2b <- matchPattern(pattern2, subject, max.mismatch=2, with.indels=TRUE, fixed="subject") ## All the matches in 'm1b' and 'm2a' should also appear in 'm2b': stopifnot(suppressWarnings(all(ranges(m1b) %in% ranges(m2b)))) stopifnot(suppressWarnings(all(ranges(m2a) %in% ranges(m2b)))) ## --------------------------------------------------------------------- ## D. WHEN 'with.indels=TRUE', ONLY "BEST LOCAL MATCHES" ARE REPORTED ## --------------------------------------------------------------------- ## With deletions in the subject: subject <- BString("ACDEFxxxCDEFxxxABCE") matchPattern("ABCDEF", subject, max.mismatch=2, with.indels=TRUE) matchPattern("ABCDEF", subject, max.mismatch=2) ## With insertions in the subject: subject <- BString("AiBCDiEFxxxABCDiiFxxxAiBCDEFxxxABCiDEF") matchPattern("ABCDEF", subject, max.mismatch=2, with.indels=TRUE) matchPattern("ABCDEF", subject, max.mismatch=2) ## With substitutions (note that the "best local matches" can introduce ## indels and therefore be shorter than 6): subject <- BString("AsCDEFxxxABDCEFxxxBACDEFxxxABCEDF") matchPattern("ABCDEF", subject, max.mismatch=2, with.indels=TRUE) matchPattern("ABCDEF", subject, max.mismatch=2)
Please choose more modern alternatives, such as Google Chrome or Mozilla Firefox.