diff --git a/CT_GA_count.R b/CT_GA_count.R index 3522047e8f75eb3244a39188c800f135e40c4818..a76a5592010b7525be2f504d8d505633adace2cb 100644 --- a/CT_GA_count.R +++ b/CT_GA_count.R @@ -1,62 +1,3 @@ - - -CT_GA_count <- function(SampleID,Ref_Base,Alt_Base){ -# #<----------------------------> -# # You must include this section when: -# # Distributing, Using and/or Modifying this code. -# # Please read and abide by the terms of the included LICENSE. -# # Copyright 2020, Deepankar Chakroborty, All rights reserved. -# # -# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) -# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues -# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE -# #<----------------------------> - -# # PURPOSE: -# # This function takes in three vectors: -# # SampleID = Sample IDs -# # Ref_Base = Reference Base -# # Alt_Base = altered base that created the mutation. - -# # And calculates the number of C > T and G > A changes are there (per sample) -# # The function returns a data frame listing the number of mutations (per sample): -# # SampleID = Sample ID -# # Total = Total number of mutations -# # CT = C > T changes -# # GA = G > A changes -# # Others = all other types of transitions and transversions combined. - - MutMatrix <- data.frame(SampleID, - Ref_Base, - Alt_Base, - stringsAsFactors = F) - - return.df <- data.frame(SampleID=NA, - Total=0, - CT=0, - GA=0, - Others=0) - - for(SampleID in levels(MutMatrix$SampleID)){ - set <- MutMatrix[ MutMatrix$SampleID == SampleID, ] - - # if(dim(set)[1]==0){ - # return.df <- rbind.data.frame(return.df,c(SampleID,0,0,0,0)) - # next - # } - - Total <- dim(set)[1] - - CT <- dim(subset(set, set$Ref_Base == "C" & set$Alt_Base == "T"))[1] - - GA <- dim(subset(set, set$Ref_Base == "G" & set$Alt_Base == "A"))[1] - - Others=Total-CT-GA - - return.df <- rbind.data.frame(return.df,list(SampleID,Total,CT,GA,Others),stringsAsFactors = F) - - Total <- 0;CT <- 0;GA <- 0;Others <- 0 # re-initialize - rm(set) - } - return(return.df[-1,]) # Removes the first empty row -} +# Development of the script has migrated to https://github.com/dchakro/ +# replace the line sourcing this file with the following in your code. +source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/CT_GA_count.R') diff --git a/IsolateCanonicalVariant.R b/IsolateCanonicalVariant.R index 8813f2ec8c3835ee51f6d8cf8129c673179bbdd2..664134cf341e0bcb6212d065c31d0a93030637ca 100644 --- a/IsolateCanonicalVariant.R +++ b/IsolateCanonicalVariant.R @@ -1,127 +1,3 @@ -# Installing dependencies -dependencies <- c("stringi", "doParallel") -missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] -if(length(missing_packages)) install.packages(missing_packages) -rm(missing_packages,dependencies) - -IsolateCanonicalVariant <- function (AAchangeAnnotations){ -# #<----------------------------> -# # You must include this section when: -# # Distributing, Using and/or Modifying this code. -# # Please read and abide by the terms of the included LICENSE. -# # Copyright 2020, Deepankar Chakroborty, All rights reserved. -# # -# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) -# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues -# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE -# #<----------------------------> - - -# # PURPOSE: -# # From a given vector of annotations for a particular DNA change -# # this function selects the canonical variant (if present) -# # by cross referencing the MANE Select and RefSeq Select sets. - -# # LOGIC FLOW: -# # - If there is only one annotation; that is selected -# # - If canonical transcript is not found in MANE Select + RefSeq select -# # or a matching transcript ID is not found in the annotation then; -# # The mutation with to the the highest position (residue number) is selected. -# # - If a match for canonical isoform is found then; -# # that particular mutation is selected - - # importing resources - library(doParallel) - refseq <- readRDS(url("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/asset/RefSeqSelect_Gene_Transcript.RDS"),"rb") - source("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/MutSiteFind.R") - - # initializing cluster - myCluster <- makeCluster(parallel::detectCores(), - type = "FORK", - useXDR=F, - .combine=cbind) - registerDoParallel(myCluster) - print(myCluster) - - file.create("log.txt") - message(paste0("Logging processed mutations at: ",getwd(),"/log.txt")) - - # computation - results <- foreach(MutInfo = AAchangeAnnotations,.combine = c) %dopar% { - GENE <- can.isoform <- l <- l2 <- NA - l=unique(unlist( - stringi::stri_split_fixed( - str = MutInfo, - pattern = ","), - use.names = F, - recursive = F)) - - if(length(l)==1){ - l2 <- unlist(stringi::stri_split_fixed( - str = l, - pattern = ":"), - use.names = F, - recursive = F) - MUTATION <- stringi::stri_replace_first_fixed( - str = l2[stringi::stri_detect_regex( - str = l2, - use.names = F, - pattern = "^p\\.")], - pattern = "p.", - replacement = "") - } else { - GENE <- stringi::stri_sub( - str = l[1], - from = 1, - to = stringi::stri_locate_first_fixed( - str = l[1], - pattern = ":")[,"end"]-1) - - can.isoform <- refseq$transcript_accession[refseq$gene_id==GENE] - - if(length(can.isoform)==0 | !any(stringi::stri_detect_fixed(str = l,pattern = can.isoform))){ - # Canonical isoform not found in RefSeq, or Match for Canonical isoform not found in AAchangeAnnotations - l2 <- unlist( - stringi::stri_split_fixed( - str = l, - pattern = ":"), - use.names = F, - recursive = F) - - l3 <- stringi::stri_replace_all_fixed( - str = l2[stringi::stri_startswith_fixed( - str = l2, - "p.")], - pattern = "p.", - replacement = "") - - MUTATION <- l3[which.max(MutSiteFind(l3))] - } else { - # Canonical isoform found - l=l[grep(can.isoform,l)] - - l2 <- unlist( - stringi::stri_split_fixed( - str = l, - pattern = ":"), - use.names = F, - recursive = F) - - MUTATION <- stringi::stri_replace_first_fixed( - str = l2[stringi::stri_detect_regex( - str = l2, - use.names = F, - pattern = "^p\\.")], - pattern = "p.", - replacement = "") - } - } - if(length(MUTATION)==0){ - MUTATION <- NA - } - cat(paste(MUTATION,"\n"),file="log.txt", append=T) - return(MUTATION) - } - stopCluster(myCluster) - return(results) -} +# Development of the script has migrated to https://github.com/dchakro/ +# replace the line sourcing this file with the following in your code. +source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/IsolateCanonicalVariant.R') \ No newline at end of file diff --git a/MutSiteFind.R b/MutSiteFind.R index 70ccda65f2c48e5b0ec60b4a2d52871e21e89372..da371c902cbabbf6995e3f2d01cc2c36e80d92f5 100644 --- a/MutSiteFind.R +++ b/MutSiteFind.R @@ -1,31 +1,3 @@ -# Installing missing dependencies -dependencies <- c("stringi") -missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] -if(length(missing_packages)) install.packages(missing_packages) -rm(missing_packages,dependencies) - -MutSiteFind <- function(MutationColumn){ -# #<----------------------------> -# # You must include this section when: -# # Distributing, Using and/or Modifying this code. -# # Please read and abide by the terms of the included LICENSE. -# # Copyright 2020, Deepankar Chakroborty, All rights reserved. -# # -# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) -# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues -# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE -# #<----------------------------> - - -# # PURPOSE: -# # For a given vector of amino acid changes like A123T, V256F, E746_A750del -# # this function returns c(123, 256, 746) as amino acid positions of -# # the mutated residue. -# # In case of indels, it doesn't return the range!! -# # (i.e. returns only the start position) - - - return(unlist(x = stringi::stri_extract_first_regex(str = MutationColumn,pattern = "[[:digit:]]+"), use.names = F)) -} - - +# Development of the script has migrated to https://github.com/dchakro/ +# replace the line sourcing this file with the following in your code. +source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/MutSiteFind.R') \ No newline at end of file diff --git a/asset/RefSeqSelect_Gene_Transcript.RDS b/asset/RefSeqSelect_Gene_Transcript.RDS deleted file mode 100644 index 8149ce039eaa3848159427dd3a40075c3244a06d..0000000000000000000000000000000000000000 Binary files a/asset/RefSeqSelect_Gene_Transcript.RDS and /dev/null differ diff --git a/ggplotBreaks.R b/ggplotBreaks.R index 48121acd46498838980ca0000b3e1b07d7eb705e..cf467944e050bb1a43449874b3c695ace75dbe7d 100644 --- a/ggplotBreaks.R +++ b/ggplotBreaks.R @@ -1,65 +1,3 @@ -ggplotBreaks <- function(range,tick,skip.steps=0){ -# #<----------------------------> -# # You must include this section when: -# # Distributing, Using and/or Modifying this code. -# # Please read and abide by the terms of the included LICENSE. -# # Copyright 2020, Deepankar Chakroborty, All rights reserved. -# # -# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) -# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues -# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE -# #<----------------------------> - - -# # PURPOSE: -# # Returns a list of vectors containing breaks and labels -# # for a continuous variable mapped to one of the axes for use with ggplot2 -# # User enters: -# # - the range of the data -# # - the tick amount -# # - skip.steps (if any) = number of labels to skip -# # (i.e. show tick but no label) -# # e.g. -# # ggplotBreaks(c(0,150), tick =10, skip.steps = 1) # gives -# # $breaks -# # 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 -# # -# # $labels -# # "0" " " "20" " " "40" " " "60" " " "80" " " "100" " " "120" " " "140" " " - - if (length(range) != 2){ - stop("Correct format for: range = c(min_value,max_value)") - } - if(skip.steps<0){ - stop(" 'skip.steps' should be >= 0") - } - breaks <- seq(from = range[1], - to = range[2], - by = tick) - if(skip.steps==0){ - labels <- as.character(breaks) - } else { - tmp <- c() - labels <- unlist( - lapply( - breaks[seq(from = 1, - to = length(breaks), - by = skip.steps+1)], - function(x) c(tmp, - c(x, - rep(" ",skip.steps)))), - use.names = F) - rm(tmp) - - } - if(length(labels) < length(breaks)){ - ## Add more spaces at the end to make lengths match - labels <- c( labels, rep(" " , (length(breaks) - length(labels)))) - } - if ( length(breaks) < length(labels)){ - ## Remove spaces from the end to make the lengths match - labels <- labels[1:length(breaks)] - } - return(list( breaks = breaks, - labels = labels)) -} +# Development of the script has migrated to https://github.com/dchakro/ +# replace the line sourcing this file with the following in your code. +source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/ggplotBreaks.R') \ No newline at end of file diff --git a/unparalogMutations.R b/unparalogMutations.R index ddf3ff588b21dacb4ff7dc58713e11ada0c44339..d99f32079bb793f871d3c86558d1793a1a4d0047 100644 --- a/unparalogMutations.R +++ b/unparalogMutations.R @@ -1,92 +1,3 @@ -# Installing missing dependencies -dependencies <- c("stringi", "progress") -missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] -if(length(missing_packages)) install.packages(missing_packages) -rm(missing_packages,dependencies) - -unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){ -# #<----------------------------> -# # You must include this section when: -# # Distributing, Using and/or Modifying this code. -# # Please read and abide by the terms of the included LICENSE. -# # Copyright 2020, Deepankar Chakroborty, All rights reserved. -# # -# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha) -# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues -# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE -# #<----------------------------> - - -# # PURPOSE: -# # In the gene column in your SNV annotation if you see something like: -# # e.g. PRAMEF7;PRAMEF8 OR PRAMEF7,PRAMEF8 -# # then your mutations annotations have gene paralogs. -# # This script aims to de-couple those paralogs into individual their rows. -# # - -# # INFO on what to pass as the function parameters: -# Assign correct paralog_separator found in the gene column of your SNV annotations # e.g. if the Gene column has entries like PRAMEF7;PRAMEF8 -# then the paralog_separator is ";" -# or set it to whatever separator is used by your SNV annotation software. - -# Assign correct annotation_separator in the Amino acid change column of your SNV annotations -# "PRAMEF8:NM_001012276:exon3:c.C541A:p.Q181K,PRAMEF7:NM_001012277:exon3:c.C541A:p.Q181K" -# In the above example the annotation_separator is "," - -# GeneColName = Column name in the SNV annotation table where the Genes are listed - -# AnnotationColName = Column name in the SNV annotation table where the Amino acid changes are listed - -# <---------------> - - # Sanity checks - check_paralog_sep <- !any(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator)) - check_annotation_sep <- !any(stringi::stri_detect_fixed(str = DATA$AAChange.refGene, pattern = annotation_separator)) - - idx_Gene <- which(colnames(DATA) == GeneColName) - idx_Annotation <- which(colnames(DATA) == AnnotationColName) - - if(length(idx_Gene) == 0 | length(idx_Annotation) == 0 | check_paralog_sep | check_annotation_sep) { - message(stringi::stri_c("You entered :-->", - "\nparalog_separator = ",paralog_separator, - "\nannotation_separator = ",annotation_separator, - "\nGeneColName = ", GeneColName, - "\nAnnotationColName = ",AnnotationColName)) - stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.") - } - # Cleanup - rm(check_paralog_sep,check_annotation_sep) - gc() - - current.idx <- 1 # nrow(DATA)+1 - paralog.idx <- which(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator)) - pb <- progress::progress_bar$new(total=length(paralog.idx),format = " [:bar] :current/:total (:percent)",); pb$tick(0) - - Number_of_paralogs <- sum(stringi::stri_count_fixed(str = DATA$Gene.refGene,pattern = paralog_separator)) - message(stringi::stri_c("There are ",length(paralog.idx)," annotations with ", Number_of_paralogs," paralogs.")) - - # copying structure of original DATA - DATA.new <- DATA[1,]; DATA.new <- DATA.new[-1,] - - # Creating empty table which will be populated in the for loop - DATA.new <- dplyr::bind_rows(DATA.new,data.frame(matrix(nrow = (length(paralog.idx)+Number_of_paralogs), ncol = ncol(DATA),dimnames = list(c(),colnames(DATA))),stringsAsFactors = F)) - - # Beginning isolation of the paralogs - for(i in paralog.idx){ - Muts <- unlist(stringi::stri_split_fixed(DATA$AAChange.refGene[i],annotation_separator),use.names = F,recursive = F) - for (gene in unlist(stringi::stri_split_fixed(DATA$Gene.refGene[i],pattern = paralog_separator),use.names = F,recursive = F)){ - DATA.new[current.idx,] <- DATA[i,] - DATA.new$Gene.refGene[current.idx] <- gene - DATA.new$AAChange.refGene[current.idx] <- paste0(Muts[grep(gene,Muts,fixed = T)],collapse=annotation_separator) - current.idx <- current.idx+1 - } - pb$tick(1) - } - # removing the original rows with the paralogs as they are all unparalogged now - DATA <- DATA[-paralog.idx,] - DATA <- dplyr::bind_rows(DATA,DATA.new) - rm(DATA.new) ; gc() - - rownames(DATA) <- as.character(seq(1,length(DATA[,1]))) - return (DATA) -} +# Development of the script has migrated to https://github.com/dchakro/ +# replace the line sourcing this file with the following in your code. +source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/unparalogMutations.R') \ No newline at end of file