diff --git a/IsolateCanonicalVariant.R b/IsolateCanonicalVariant.R new file mode 100644 index 0000000000000000000000000000000000000000..a39747ef66724a9dde52ad92b4f2978d97d3621f --- /dev/null +++ b/IsolateCanonicalVariant.R @@ -0,0 +1,74 @@ +# #<----------------------------> +# # Please include this section when distributing and/or using this code. +# # Please read and abide by the terms of the included LICENSE +# # +# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) +# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues +# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE +# # +# # PURPOSE: +# # From a given vector of annotations for a particular DNA change +# # this function selects the canonical variant (if present) +# # by cross referencing the MANE Select and RefSeq Select sets. +# # +# # Logic flow: +# # - If there is only one annotation; that is selected +# # - If canonical transcript is not found in MANE Select + RefSeq select +# # or a matching transcript ID is not found in the annotation then; +# # The mutation with to the the highest position (residue number) is selected. +# # - If a match for canonical isoform is found then; +# # that particular mutation is selected +# # +# #<----------------------------> + +# Installing dependencies +dependencies <- c("stringi", "doParallel") +missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] +if(length(missing_packages)) install.packages(missing_packages) +rm(missing_packages,dependencies) + +IsolateCanonicalVariant <- function (AAchangeAnnotations){ + library(doParallel) + #--- Change to master when ready + # importing resources + refseq <- readRDS(url("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/develop/asset/RefSeqSelect_Gene_Transcript.RDS"),"rb") + source("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/MutSiteFind.R") + + # initializing cluster + myCluster <- makeCluster(parallel::detectCores(), type = "FORK",useXDR=F,.combine=cbind); registerDoParallel(myCluster);print(myCluster) + + file.create("log.txt") + message(paste0("Logging processed mutations at: ",getwd(),"/log.txt")) + + # computation + results <- foreach(MutInfo = AAchangeAnnotations,.combine = c) %dopar% { + GENE <- can.isoform <- l <- l2 <- NA + l=unique(unlist(stringi::stri_split_fixed( str = MutInfo, pattern = ","),use.names = F,recursive = F)) + + if(length(l)==1){ + l2 <- unlist(stringi::stri_split_fixed( str = l, pattern = ":"),use.names = F,recursive = F) + MUTATION <- stringi::stri_replace_first_fixed(str = l2[stringi::stri_detect_regex(str = l2, use.names = F, pattern = "^p\\.")],pattern = "p.",replacement = "") + } else { + GENE <- stringi::stri_sub(str = l[1],from = 1,to = stringi::stri_locate_first_fixed(str = l[1],pattern = ":")[,"end"]-1) + can.isoform <- refseq$transcript_accession[refseq$gene_id==GENE] + if(length(can.isoform)==0 | !any(stringi::stri_detect_fixed(str = l,pattern = can.isoform))){ + # Canonical isoform not found in RefSeq, or Match for Canonical isoform not found in AAchangeAnnotations + l2 <- unlist(stringi::stri_split_fixed( str = l, pattern = ":"),use.names = F,recursive = F) + l3 <- stringi::stri_replace_all_fixed(str = l2[stringi::stri_startswith_fixed(str = l2,"p.")],pattern = "p.",replacement = "") + MUTATION <- l3[which.max(MutSiteFind(l3))] + } else { + # Canonical isoform found + l=l[grep(can.isoform,l)] + l2 <- unlist(stringi::stri_split_fixed( str = l, pattern = ":"),use.names = F,recursive = F) + MUTATION <- stringi::stri_replace_first_fixed(str = l2[stringi::stri_detect_regex(str = l2, use.names = F, pattern = "^p\\.")],pattern = "p.",replacement = "") + } + } + if(length(MUTATION)==0){ + MUTATION <- NA + } + cat(paste(MUTATION,"\n"),file="log.txt", append=T) + return(MUTATION) + } + stopCluster(myCluster) + return(results) +} diff --git a/MutSiteFind.R b/MutSiteFind.R index 1174e3dda34f9e5530d1340a74bc8f7b7f4b6d44..709b666c8a28a7562383642b166d2fa125a7016c 100644 --- a/MutSiteFind.R +++ b/MutSiteFind.R @@ -17,6 +17,7 @@ dependencies <- c("stringi") missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] if(length(missing_packages)) install.packages(missing_packages) +rm(missing_packages,dependencies) MutSiteFind <- function(MutationColumn){ return(unlist(x = stringi::stri_extract_first_regex(str = MutationColumn,pattern = "[[:digit:]]+"), use.names = F)) diff --git a/unparalogMutations.R b/unparalogMutations.R index 91683579cbc3bf50a6f8d24e23d2ed86bf678e27..d4795631c720724c956494e6be60d9adc5abb305 100644 --- a/unparalogMutations.R +++ b/unparalogMutations.R @@ -33,6 +33,7 @@ dependencies <- c("stringi", "progress") missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] if(length(missing_packages)) install.packages(missing_packages) +rm(missing_packages,dependencies) unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){ # Sanity checks @@ -51,7 +52,7 @@ unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.") } # Cleanup - rm(missing_packages,dependencies,check_paralog_sep,check_annotation_sep) + rm(check_paralog_sep,check_annotation_sep) gc() current.idx <- 1 # nrow(DATA)+1