Migrate to github.com/dchakro

c137839e · robocopAlpha · d4465971 · c137839e · c137839e · c137839e
Commit c137839e authored 4 years ago by robocopAlpha
--- a/CT_GA_count.R
+++ b/CT_GA_count.R
+# Development of the script has migrated to https://github.com/dchakro/
+# replace the line sourcing this file with the following in your code.
-CT_GA_count <- function(SampleID,Ref_Base,Alt_Base){
+source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/CT_GA_count.R')
-# #<---------------------------->
-# # You must include this section when:
-# # Distributing, Using and/or Modifying this code.
-# # Please read and abide by the terms of the included LICENSE.
-# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
-# #
-# #  Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
-# #  Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
-# #  License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
-# #<---------------------------->
-# # PURPOSE:
-# # This function takes in three vectors:
-# #     SampleID = Sample IDs
-# #     Ref_Base = Reference Base
-# #     Alt_Base = altered base that created the mutation.
-# #  And calculates the number of C > T and G > A changes are there (per sample)
-# #  The function returns a data frame listing the number of mutations (per sample):
-# #    SampleID = Sample ID
-# #    Total = Total number of mutations
-# #    CT = C > T changes
-# #    GA = G > A changes
-# #    Others = all other types of transitions and transversions combined.
-  MutMatrix <- data.frame(SampleID,
-                          Ref_Base,
-                          Alt_Base,
-                          stringsAsFactors = F)
-  return.df <- data.frame(SampleID=NA,
-                          Total=0,
-                          CT=0,
-                          GA=0,
-                          Others=0)
-  for(SampleID in levels(MutMatrix$SampleID)){
-    set  <-  MutMatrix[ MutMatrix$SampleID == SampleID, ]
-    # if(dim(set)[1]==0){
-    #   return.df <- rbind.data.frame(return.df,c(SampleID,0,0,0,0))
-    #   next
-    # }
-    Total <- dim(set)[1]
-    CT <- dim(subset(set, set$Ref_Base == "C" & set$Alt_Base == "T"))[1]
-    GA <- dim(subset(set, set$Ref_Base == "G" & set$Alt_Base == "A"))[1]
-    Others=Total-CT-GA
-    return.df <- rbind.data.frame(return.df,list(SampleID,Total,CT,GA,Others),stringsAsFactors = F)
-    Total <- 0;CT <- 0;GA <- 0;Others <- 0 # re-initialize
-    rm(set)
-  }
-  return(return.df[-1,]) # Removes the first empty row
-}
--- a/IsolateCanonicalVariant.R
+++ b/IsolateCanonicalVariant.R
-# Installing dependencies
+# Development of the script has migrated to https://github.com/dchakro/
-dependencies <- c("stringi", "doParallel")
+# replace the line sourcing this file with the following in your code.
-missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])]
+source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/IsolateCanonicalVariant.R')
-if(length(missing_packages)) install.packages(missing_packages) 
\ No newline at end of file
-rm(missing_packages,dependencies)
-IsolateCanonicalVariant <- function (AAchangeAnnotations){
-# #<---------------------------->
-# # You must include this section when:
-# # Distributing, Using and/or Modifying this code. 
-# # Please read and abide by the terms of the included LICENSE.
-# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
-# #
-# #  Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
-# #  Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
-# #  License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
-# #<---------------------------->
-# #  PURPOSE:
-# #  From a given vector of annotations for a particular DNA change
-# #  this function selects the canonical variant (if present)
-# #  by cross referencing the MANE Select and RefSeq Select sets.
-# #  LOGIC FLOW:
-# #  - If there is only one annotation; that is selected
-# #  - If canonical transcript is not found in MANE Select + RefSeq select 
-# #     or a matching transcript ID is not found in the annotation then;
-# #     The mutation with to the the highest position (residue number) is selected.
-# #  - If a match for canonical isoform is found then; 
-# #        that particular mutation is selected
-  # importing resources
-  library(doParallel)
-  refseq <- readRDS(url("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/asset/RefSeqSelect_Gene_Transcript.RDS"),"rb")
-  source("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/MutSiteFind.R")
-  # initializing cluster
-  myCluster <- makeCluster(parallel::detectCores(), 
-                           type = "FORK",
-                           useXDR=F,
-                           .combine=cbind)
-  registerDoParallel(myCluster)
-  print(myCluster)
-  file.create("log.txt")
-  message(paste0("Logging processed mutations at: ",getwd(),"/log.txt"))
-  # computation
-  results <- foreach(MutInfo = AAchangeAnnotations,.combine = c) %dopar% {
-    GENE <- can.isoform <- l <- l2 <- NA
-    l=unique(unlist(
-      stringi::stri_split_fixed(
-        str = MutInfo,
-        pattern = ","),
-      use.names = F,
-      recursive = F))
-    if(length(l)==1){
-      l2 <- unlist(stringi::stri_split_fixed( 
-          str = l, 
-          pattern = ":"),
-        use.names = F,
-        recursive = F)
-      MUTATION <- stringi::stri_replace_first_fixed(
-        str = l2[stringi::stri_detect_regex(
-          str = l2, 
-          use.names = F, 
-          pattern = "^p\\.")],
-        pattern = "p.",
-        replacement = "")
-    } else {
-      GENE <- stringi::stri_sub(
-        str = l[1],
-        from = 1,
-        to = stringi::stri_locate_first_fixed(
-          str = l[1],
-          pattern = ":")[,"end"]-1)
-      can.isoform <- refseq$transcript_accession[refseq$gene_id==GENE]
-      if(length(can.isoform)==0 | !any(stringi::stri_detect_fixed(str = l,pattern = can.isoform))){
-        # Canonical isoform not found in RefSeq, or Match for Canonical isoform not found in AAchangeAnnotations
-        l2 <- unlist(
-          stringi::stri_split_fixed( 
-            str = l, 
-            pattern = ":"),
-          use.names = F,
-          recursive = F)
-        l3 <- stringi::stri_replace_all_fixed(
-          str = l2[stringi::stri_startswith_fixed(
-            str = l2,
-            "p.")],
-          pattern = "p.",
-          replacement = "")
-        MUTATION <- l3[which.max(MutSiteFind(l3))]
-      } else {
-        # Canonical isoform found
-        l=l[grep(can.isoform,l)]
-        l2 <- unlist(
-          stringi::stri_split_fixed( 
-            str = l, 
-            pattern = ":"),
-          use.names = F,
-          recursive = F)
-        MUTATION <- stringi::stri_replace_first_fixed(
-          str = l2[stringi::stri_detect_regex(
-            str = l2, 
-            use.names = F, 
-            pattern = "^p\\.")],
-          pattern = "p.",
-          replacement = "")
-      }
-    }
-    if(length(MUTATION)==0){
-      MUTATION <- NA
-    }
-    cat(paste(MUTATION,"\n"),file="log.txt", append=T)
-    return(MUTATION)
-  }
-  stopCluster(myCluster)
-  return(results)
-}
--- a/MutSiteFind.R
+++ b/MutSiteFind.R
-# Installing missing dependencies
+# Development of the script has migrated to https://github.com/dchakro/
-dependencies <- c("stringi")
+# replace the line sourcing this file with the following in your code.
-missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])]
+source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/MutSiteFind.R')
-if(length(missing_packages)) install.packages(missing_packages) 
\ No newline at end of file
-rm(missing_packages,dependencies)
-MutSiteFind <- function(MutationColumn){
-# #<---------------------------->
-# # You must include this section when:
-# # Distributing, Using and/or Modifying this code. 
-# # Please read and abide by the terms of the included LICENSE.
-# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
-# #
-# #  Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
-# #  Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
-# #  License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
-# #<---------------------------->
-# #  PURPOSE:
-# #  For a given vector of amino acid changes like A123T, V256F, E746_A750del
-# #  this function returns c(123, 256, 746) as amino acid positions of
-# #  the mutated residue. 
-# #     In case of indels, it doesn't return the range!! 
-# #        (i.e. returns only the start position)
-  return(unlist(x = stringi::stri_extract_first_regex(str = MutationColumn,pattern = "[[:digit:]]+"), use.names = F))
-}
--- a/asset/RefSeqSelect_Gene_Transcript.RDS
+++ b/asset/RefSeqSelect_Gene_Transcript.RDS
--- a/ggplotBreaks.R
+++ b/ggplotBreaks.R
-ggplotBreaks <- function(range,tick,skip.steps=0){
+# Development of the script has migrated to https://github.com/dchakro/
-# #<---------------------------->
+# replace the line sourcing this file with the following in your code.
-# # You must include this section when:
+source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/ggplotBreaks.R')
-# # Distributing, Using and/or Modifying this code. 
\ No newline at end of file
-# # Please read and abide by the terms of the included LICENSE.
-# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
-# #
-# #  Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
-# #  Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
-# #  License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
-# #<---------------------------->
-# #  PURPOSE:
-# #  Returns a list of vectors containing breaks and labels 
-# # for a continuous variable mapped to one of the axes for use with ggplot2
-# #  User enters: 
-# #     - the range of the data 
-# #     - the tick amount
-# #     - skip.steps (if any) = number of labels to skip 
-# #                             (i.e. show tick but no label)
-# #     e.g.
-# #         ggplotBreaks(c(0,150), tick =10, skip.steps = 1) # gives
-# #  $breaks
-# #     0  10  20  30  40  50  60  70  80  90 100 110 120 130 140 150
-# # 
-# #  $labels
-# #    "0"  " "  "20"  " "  "40"  " "  "60"  " "  "80"  " "  "100"  " "  "120"  " "  "140"  " "
-  if (length(range) != 2){
-    stop("Correct format for: range = c(min_value,max_value)")
-  }
-  if(skip.steps<0){
-    stop(" 'skip.steps' should be >= 0")
-  }
-  breaks <- seq(from = range[1],
-                to = range[2],
-                by = tick)
-  if(skip.steps==0){
-    labels <- as.character(breaks)  
-  } else {
-    tmp <- c()
-    labels <- unlist(
-      lapply(
-        breaks[seq(from = 1,
-                   to = length(breaks),
-                   by = skip.steps+1)],
-        function(x) c(tmp,
-                      c(x,
-                        rep(" ",skip.steps)))),
-                     use.names = F)
-    rm(tmp)
-  }
-  if(length(labels) < length(breaks)){
-    ## Add more spaces at the end to make lengths match
-    labels <- c( labels, rep(" " , (length(breaks) - length(labels))))
-  }
-  if ( length(breaks) < length(labels)){
-    ## Remove spaces from the end to make the lengths match
-    labels <- labels[1:length(breaks)]
-  }
-  return(list( breaks = breaks,
-              labels = labels))
-}
--- a/unparalogMutations.R
+++ b/unparalogMutations.R
-# Installing missing dependencies
+# Development of the script has migrated to https://github.com/dchakro/
-dependencies <- c("stringi", "progress")
+# replace the line sourcing this file with the following in your code.
-missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])]
+source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/unparalogMutations.R')
-if(length(missing_packages)) install.packages(missing_packages)
\ No newline at end of file
-rm(missing_packages,dependencies)
-unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){
-# #<---------------------------->
-# # You must include this section when:
-# # Distributing, Using and/or Modifying this code. 
-# # Please read and abide by the terms of the included LICENSE.
-# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
-# #
-# #  Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
-# #  Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
-# #  License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
-# #<---------------------------->
-# #  PURPOSE:
-# #  In the gene column in your SNV annotation if you see something like:
-# #  e.g. PRAMEF7;PRAMEF8  OR  PRAMEF7,PRAMEF8
-# #       then your mutations annotations have gene paralogs.
-# #  This script aims to de-couple those paralogs into individual their rows.
-# #
-# # INFO on what to pass as the function parameters:
-# Assign correct paralog_separator found in the gene column of your SNV annotations # e.g. if the Gene column has entries like PRAMEF7;PRAMEF8 
-# then the paralog_separator is ";" 
-# or set it to whatever separator is used by your SNV annotation software.
-# Assign correct annotation_separator in the Amino acid change column of your SNV annotations
-# "PRAMEF8:NM_001012276:exon3:c.C541A:p.Q181K,PRAMEF7:NM_001012277:exon3:c.C541A:p.Q181K"
-# In the above example the annotation_separator is ","
-# GeneColName = Column name in the SNV annotation table where the Genes are listed
-# AnnotationColName = Column name in the SNV annotation table where the Amino acid changes are listed
-# <--------------->
-   # Sanity checks
-   check_paralog_sep <- !any(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
-   check_annotation_sep <- !any(stringi::stri_detect_fixed(str =  DATA$AAChange.refGene, pattern = annotation_separator))
-   idx_Gene <- which(colnames(DATA) == GeneColName)
-   idx_Annotation <- which(colnames(DATA) == AnnotationColName)
-   if(length(idx_Gene) == 0 | length(idx_Annotation) == 0 | check_paralog_sep | check_annotation_sep) {
-      message(stringi::stri_c("You entered :-->",
-                              "\nparalog_separator = ",paralog_separator,
-                              "\nannotation_separator = ",annotation_separator,
-                              "\nGeneColName = ", GeneColName,
-                              "\nAnnotationColName = ",AnnotationColName))
-   stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.")
-   }
-   # Cleanup
-   rm(check_paralog_sep,check_annotation_sep)
-   gc()
-   current.idx <- 1 # nrow(DATA)+1
-   paralog.idx <- which(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
-   pb <- progress::progress_bar$new(total=length(paralog.idx),format = " [:bar] :current/:total (:percent)",); pb$tick(0)
-   Number_of_paralogs <- sum(stringi::stri_count_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
-   message(stringi::stri_c("There are ",length(paralog.idx)," annotations with ", Number_of_paralogs," paralogs."))
-   # copying structure of original DATA
-   DATA.new <- DATA[1,]; DATA.new <- DATA.new[-1,] 
-   # Creating empty table which will be populated in the for loop
-   DATA.new <- dplyr::bind_rows(DATA.new,data.frame(matrix(nrow = (length(paralog.idx)+Number_of_paralogs), ncol = ncol(DATA),dimnames = list(c(),colnames(DATA))),stringsAsFactors = F))
-   # Beginning isolation of the paralogs
-   for(i in paralog.idx){
-      Muts <- unlist(stringi::stri_split_fixed(DATA$AAChange.refGene[i],annotation_separator),use.names = F,recursive = F)
-      for (gene in unlist(stringi::stri_split_fixed(DATA$Gene.refGene[i],pattern = paralog_separator),use.names = F,recursive = F)){
-         DATA.new[current.idx,] <- DATA[i,]
-         DATA.new$Gene.refGene[current.idx] <- gene
-         DATA.new$AAChange.refGene[current.idx] <- paste0(Muts[grep(gene,Muts,fixed = T)],collapse=annotation_separator)
-         current.idx <- current.idx+1
-      }
-      pb$tick(1)
-   }
-   # removing the original rows with the paralogs as they are all unparalogged now
-   DATA <- DATA[-paralog.idx,]
-   DATA <- dplyr::bind_rows(DATA,DATA.new)
-   rm(DATA.new) ; gc()
-   rownames(DATA) <- as.character(seq(1,length(DATA[,1])))
-   return (DATA)
-}