Get chromosome information for an Ensembl species
getChromInfoFromEnsembl
returns chromosome information like
sequence names, lengths and circularity flags for a given Ensembl
species e.g. Human, Cow, Saccharomyces cerevisiae, etc...
getChromInfoFromEnsembl(species, release=NA, division=NA, use.grch37=FALSE, assembled.molecules.only=FALSE, include.non_ref.sequences=FALSE, include.contigs=FALSE, include.clones=FALSE, map.NCBI=FALSE, recache=FALSE, as.Seqinfo=FALSE)
species |
A single string specifying the name of an Ensembl species e.g.
Alternatively the name of an assembly (e.g. |
release |
The Ensembl release to query e.g. 89. If set to |
division |
|
use.grch37 |
NOT TESTED YET!
|
assembled.molecules.only |
NOT IMPLEMENTED YET! |
include.non_ref.sequences |
TODO: DOCUMENT THIS! |
include.contigs |
Whether or not sequences for which |
include.clones |
Whether or not sequences for which |
map.NCBI |
If Note that not all rows in the "Ensembl chrom info" data frame are
necessarily mapped to a row in the "NCBI chrom info" data frame.
For the unmapped rows the NCBI columns in the final data frame
are filled with NAs ( The primary use case for using |
recache |
|
as.Seqinfo |
|
COMING SOON...
For getChromInfoFromEnsembl
: By default, a 7-column data frame
with columns:
name
: character.
length
: integer.
coord_system
: factor.
synonyms
: list.
toplevel
: logical.
non_ref
: logical.
circular
: logical.
and with attribute species_info
which contains details about
the species that was used to obtaine the data.
If map.NCBI
is TRUE
, then 7 "NCBI columns" are added
to the result:
NCBI.SequenceName
: character.
NCBI.SequenceRole
: factor.
NCBI.AssignedMolecule
: factor.
NCBI.GenBankAccn
: character.
NCBI.Relationship
: factor.
NCBI.RefSeqAccn
: character.
NCBI.AssemblyUnit
: factor.
Note that the names of the "NCBI columns" are those returned
by getChromInfoFromNCBI
but with the NCBI.
prefix added to them.
H. Pagès
getChromInfoFromNCBI
and
getChromInfoFromUCSC
for getting chromosome
information for an NCBI assembly or UCSC genome.
Seqinfo objects.
## --------------------------------------------------------------------- ## A. BASIC EXAMPLES ## --------------------------------------------------------------------- ## Internet access required! ## === Worm === ## https://uswest.ensembl.org/Caenorhabditis_elegans celegans <- getChromInfoFromEnsembl("celegans") attr(celegans, "species_info") getChromInfoFromEnsembl("celegans", as.Seqinfo=TRUE) celegans <- getChromInfoFromEnsembl("celegans", map.NCBI=TRUE) ## === Yeast === ## https://uswest.ensembl.org/Saccharomyces_cerevisiae scerevisiae <- getChromInfoFromEnsembl("scerevisiae") attr(scerevisiae, "species_info") getChromInfoFromEnsembl("scerevisiae", as.Seqinfo=TRUE) scerevisiae <- getChromInfoFromEnsembl("scerevisiae", map.NCBI=TRUE) ## Arabidopsis thaliana: athaliana <- getChromInfoFromEnsembl("athaliana", division="plants", map.NCBI=TRUE) attr(athaliana, "species_info") ## --------------------------------------------------------------------- ## Temporary stuff that needs to go away... ## --------------------------------------------------------------------- ## TODO: Check all species for which an NCBI assembly is registered! ## Checked so far (with current Ensembl release i.e. 99): ## - celegans OK ## - scerevisiae OK ## - athaliana OK ## - btaurus OK ## - sscrofa OK ## Not run: ## WORK IN PROGRESS!!! library(GenomeInfoDb) .do_join <- GenomeInfoDb:::.do_join .map_Ensembl_seqlevels_to_NCBI_seqlevels <- GenomeInfoDb:::.map_Ensembl_seqlevels_to_NCBI_seqlevels .map_Ensembl_seqlevels_to_NCBI_seqlevels( paste0("ENS_", 1:26), CharacterList(c(list(c(aa="INSDC1", bb="GNBK7"), c("INSDC2", "RefSeq3")), rep(list(NULL), 23), list("NCBI_7"))), paste0("NCBI_", 1:10), paste0("GNBK", c(1:8, NA, 9)), c(paste0("REFSEQ", c(1:7, 1, 1)), NA), verbose=TRUE ) map_to_NCBI <- function(Ensembl_chrom_info, NCBI_chrom_info, special_mappings=NULL) { .map_Ensembl_seqlevels_to_NCBI_seqlevels( Ensembl_chrom_info[ , "name"], Ensembl_chrom_info[ , "synonyms"], NCBI_chrom_info[ , "SequenceName"], NCBI_chrom_info[ , "GenBankAccn"], NCBI_chrom_info[ , "RefSeqAccn"], special_mappings=special_mappings, verbose=TRUE) } ## ------------------------------------------------------------------------ ## Human ## https://uswest.ensembl.org/Homo_sapiens/ ## Based on GRCh38.p13 (GCA_000001405.28) ## Return 944 rows human_chrom_info <- getChromInfoFromEnsembl("hsapiens") # 1 id: 131550 <- ref chromosome # CHR_HSCHR1_1_CTG3 id: 131561 <- non-ref chromosome # HSCHR1_1_CTG3 id: 131562 <- scaffold (no scaffold is non_ref) ## Map to NCBI ## Summary: ## - 639/640 NCBI sequences are reverse-mapped. ## - Restricted mapping is one-to-one. GRCh38.p13 <- getChromInfoFromNCBI("GRCh38.p13") L2R <- map_to_NCBI(human_chrom_info, GRCh38.p13) ## The only sequence in GRCh38.p13 that cannot be mapped to Ensembl is ## HG2139_PATCH (was introduced in GRCh38.p2)! Why? What's special about ## this patch? GRCh38.p13$mapped <- tabulate(L2R, nbins=nrow(GRCh38.p13)) != 0L table(GRCh38.p13$SequenceRole, GRCh38.p13$mapped) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 0 261 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 1 112 # novel-patch 0 72 human_chrom_info <- .do_join(human_chrom_info, GRCh38.p13, L2R) table(human_chrom_info$SequenceRole, human_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 0 25 # alt-scaffold 261 0 # unlocalized-scaffold 0 42 # unplaced-scaffold 0 127 # pseudo-scaffold 0 0 # fix-patch 112 0 # novel-patch 72 0 #hsa_seqlevels <- readRDS("hsapiens_gene_ensembl_txdb_seqlevels.rds") ## ------------------------------------------------------------------------ ## Mouse ## https://uswest.ensembl.org/Mus_musculus/ ## Based on GRCm38.p6 (GCA_000001635.8) ## Return 258 rows mouse_chrom_info <- getChromInfoFromEnsembl("mmusculus") ## Map to NCBI ## Summary: ## - 139/239 NCBI sequences are reverse-mapped. ## - Restricted mapping is NOT one-to-one: 2 Ensembl sequences (NC_005089.1 ## and MT) are both mapped to NCBI MT. GRCm38.p6 <- getChromInfoFromNCBI("GRCm38.p6") L2R <- map_to_NCBI(mouse_chrom_info, GRCm38.p6) ## 100 sequences in GRCm38.p6 are not mapped: GRCm38.p6$mapped <- tabulate(L2R, nbins=nrow(GRCm38.p6)) != 0L table(GRCm38.p6$SequenceRole, GRCm38.p6$mapped) # FALSE TRUE # assembled-molecule 0 22 # alt-scaffold 99 0 # unlocalized-scaffold 0 22 # unplaced-scaffold 0 22 # pseudo-scaffold 0 0 # fix-patch 1 64 # novel-patch 0 9 ## OK so Ensembl doesn't include the alt-scaffolds for Mouse. BUT WHAT ## HAPPENED TO THIS ONE fix-patch SEQUENCE (MG4237_PATCH) THAT IS NOT ## MAPPED? Found it in seq_region_synonym table! It's seq_region_id=100405. ## Hey but that seq_region_id is **NOT** in the seq_region table!!! THIS ## VIOLATES FOREIGN KEY CONSTRAINT!!!! mouse_chrom_info <- .do_join(mouse_chrom_info, GRCm38.p6, L2R) ## Ensembl does NOT comsider NC_005089.1 (duplicate entry for MT) toplevel: mouse_chrom_info[mouse_chrom_info$SequenceName # name length coord_system synonyms toplevel # 184 NC_005089.1 16299 scaffold FALSE # 201 MT 16299 chromosome NC_005089.1, chrM, AY172335.1 TRUE # SequenceName GenBankAccn RefSeqAccn # 184 MT AY172335.1 NC_005089.1 # 201 MT AY172335.1 NC_005089.1 ## ------------------------------------------------------------------------ ## Rat ## https://uswest.ensembl.org/Rattus_norvegicus/ ## Based on Rnor_6.0 (GCA_000001895.4) # Return 1418 rows rat_chrom_info <- getChromInfoFromEnsembl("rnorvegicus") ## Map to NCBI ## Summary: ## - 955/955 NCBI sequences are reverse-mapped. ## - Reverse mapping is one-to-many: 2 Ensembl sequences (NC_001665.2 and MT) ## are mapped to NCBI MT. Rnor_6.0 <- getChromInfoFromNCBI("Rnor_6.0") L2R <- map_to_NCBI(rat_chrom_info, Rnor_6.0) rat_chrom_info <- .do_join(rat_chrom_info, Rnor_6.0, L2R) ## Ensembl does NOT comsider NC_001665.2 (duplicate entry for MT) toplevel: rat_chrom_info[rat_chrom_info$SequenceName # name length coord_system synonyms toplevel # 1417 NC_001665.2 16313 scaffold FALSE # 1418 MT 16313 chromosome NC_001665.2, AY172581.1, chrM TRUE # SequenceName GenBankAccn RefSeqAccn # 1417 MT AY172581.1 NC_001665.2 # 1418 MT AY172581.1 NC_001665.2 table(rat_chrom_info$SequenceRole, rat_chrom_info$toplevel) # FALSE TRUE # assembled-molecule 1 23 # alt-scaffold 0 0 # unlocalized-scaffold 0 354 # unplaced-scaffold 0 578 # pseudo-scaffold 0 0 # fix-patch 0 0 # novel-patch 0 0 ## End(Not run)
Please choose more modern alternatives, such as Google Chrome or Mozilla Firefox.