Skip to content
Snippets Groups Projects
Commit c137839e authored by robocopAlpha's avatar robocopAlpha
Browse files

Migrate to github.com/dchakro

parent d4465971
Branches
No related tags found
No related merge requests found
# Development of the script has migrated to https://github.com/dchakro/
# replace the line sourcing this file with the following in your code.
CT_GA_count <- function(SampleID,Ref_Base,Alt_Base){ source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/CT_GA_count.R')
# #<---------------------------->
# # You must include this section when:
# # Distributing, Using and/or Modifying this code.
# # Please read and abide by the terms of the included LICENSE.
# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
# #
# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
# #<---------------------------->
# # PURPOSE:
# # This function takes in three vectors:
# # SampleID = Sample IDs
# # Ref_Base = Reference Base
# # Alt_Base = altered base that created the mutation.
# # And calculates the number of C > T and G > A changes are there (per sample)
# # The function returns a data frame listing the number of mutations (per sample):
# # SampleID = Sample ID
# # Total = Total number of mutations
# # CT = C > T changes
# # GA = G > A changes
# # Others = all other types of transitions and transversions combined.
MutMatrix <- data.frame(SampleID,
Ref_Base,
Alt_Base,
stringsAsFactors = F)
return.df <- data.frame(SampleID=NA,
Total=0,
CT=0,
GA=0,
Others=0)
for(SampleID in levels(MutMatrix$SampleID)){
set <- MutMatrix[ MutMatrix$SampleID == SampleID, ]
# if(dim(set)[1]==0){
# return.df <- rbind.data.frame(return.df,c(SampleID,0,0,0,0))
# next
# }
Total <- dim(set)[1]
CT <- dim(subset(set, set$Ref_Base == "C" & set$Alt_Base == "T"))[1]
GA <- dim(subset(set, set$Ref_Base == "G" & set$Alt_Base == "A"))[1]
Others=Total-CT-GA
return.df <- rbind.data.frame(return.df,list(SampleID,Total,CT,GA,Others),stringsAsFactors = F)
Total <- 0;CT <- 0;GA <- 0;Others <- 0 # re-initialize
rm(set)
}
return(return.df[-1,]) # Removes the first empty row
}
# Installing dependencies # Development of the script has migrated to https://github.com/dchakro/
dependencies <- c("stringi", "doParallel") # replace the line sourcing this file with the following in your code.
missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/IsolateCanonicalVariant.R')
if(length(missing_packages)) install.packages(missing_packages) \ No newline at end of file
rm(missing_packages,dependencies)
IsolateCanonicalVariant <- function (AAchangeAnnotations){
# #<---------------------------->
# # You must include this section when:
# # Distributing, Using and/or Modifying this code.
# # Please read and abide by the terms of the included LICENSE.
# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
# #
# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
# #<---------------------------->
# # PURPOSE:
# # From a given vector of annotations for a particular DNA change
# # this function selects the canonical variant (if present)
# # by cross referencing the MANE Select and RefSeq Select sets.
# # LOGIC FLOW:
# # - If there is only one annotation; that is selected
# # - If canonical transcript is not found in MANE Select + RefSeq select
# # or a matching transcript ID is not found in the annotation then;
# # The mutation with to the the highest position (residue number) is selected.
# # - If a match for canonical isoform is found then;
# # that particular mutation is selected
# importing resources
library(doParallel)
refseq <- readRDS(url("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/asset/RefSeqSelect_Gene_Transcript.RDS"),"rb")
source("https://gitlab.utu.fi/deecha/shared_scripts/-/raw/master/MutSiteFind.R")
# initializing cluster
myCluster <- makeCluster(parallel::detectCores(),
type = "FORK",
useXDR=F,
.combine=cbind)
registerDoParallel(myCluster)
print(myCluster)
file.create("log.txt")
message(paste0("Logging processed mutations at: ",getwd(),"/log.txt"))
# computation
results <- foreach(MutInfo = AAchangeAnnotations,.combine = c) %dopar% {
GENE <- can.isoform <- l <- l2 <- NA
l=unique(unlist(
stringi::stri_split_fixed(
str = MutInfo,
pattern = ","),
use.names = F,
recursive = F))
if(length(l)==1){
l2 <- unlist(stringi::stri_split_fixed(
str = l,
pattern = ":"),
use.names = F,
recursive = F)
MUTATION <- stringi::stri_replace_first_fixed(
str = l2[stringi::stri_detect_regex(
str = l2,
use.names = F,
pattern = "^p\\.")],
pattern = "p.",
replacement = "")
} else {
GENE <- stringi::stri_sub(
str = l[1],
from = 1,
to = stringi::stri_locate_first_fixed(
str = l[1],
pattern = ":")[,"end"]-1)
can.isoform <- refseq$transcript_accession[refseq$gene_id==GENE]
if(length(can.isoform)==0 | !any(stringi::stri_detect_fixed(str = l,pattern = can.isoform))){
# Canonical isoform not found in RefSeq, or Match for Canonical isoform not found in AAchangeAnnotations
l2 <- unlist(
stringi::stri_split_fixed(
str = l,
pattern = ":"),
use.names = F,
recursive = F)
l3 <- stringi::stri_replace_all_fixed(
str = l2[stringi::stri_startswith_fixed(
str = l2,
"p.")],
pattern = "p.",
replacement = "")
MUTATION <- l3[which.max(MutSiteFind(l3))]
} else {
# Canonical isoform found
l=l[grep(can.isoform,l)]
l2 <- unlist(
stringi::stri_split_fixed(
str = l,
pattern = ":"),
use.names = F,
recursive = F)
MUTATION <- stringi::stri_replace_first_fixed(
str = l2[stringi::stri_detect_regex(
str = l2,
use.names = F,
pattern = "^p\\.")],
pattern = "p.",
replacement = "")
}
}
if(length(MUTATION)==0){
MUTATION <- NA
}
cat(paste(MUTATION,"\n"),file="log.txt", append=T)
return(MUTATION)
}
stopCluster(myCluster)
return(results)
}
# Installing missing dependencies # Development of the script has migrated to https://github.com/dchakro/
dependencies <- c("stringi") # replace the line sourcing this file with the following in your code.
missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/MutSiteFind.R')
if(length(missing_packages)) install.packages(missing_packages) \ No newline at end of file
rm(missing_packages,dependencies)
MutSiteFind <- function(MutationColumn){
# #<---------------------------->
# # You must include this section when:
# # Distributing, Using and/or Modifying this code.
# # Please read and abide by the terms of the included LICENSE.
# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
# #
# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
# #<---------------------------->
# # PURPOSE:
# # For a given vector of amino acid changes like A123T, V256F, E746_A750del
# # this function returns c(123, 256, 746) as amino acid positions of
# # the mutated residue.
# # In case of indels, it doesn't return the range!!
# # (i.e. returns only the start position)
return(unlist(x = stringi::stri_extract_first_regex(str = MutationColumn,pattern = "[[:digit:]]+"), use.names = F))
}
File deleted
ggplotBreaks <- function(range,tick,skip.steps=0){ # Development of the script has migrated to https://github.com/dchakro/
# #<----------------------------> # replace the line sourcing this file with the following in your code.
# # You must include this section when: source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/ggplotBreaks.R')
# # Distributing, Using and/or Modifying this code. \ No newline at end of file
# # Please read and abide by the terms of the included LICENSE.
# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
# #
# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
# #<---------------------------->
# # PURPOSE:
# # Returns a list of vectors containing breaks and labels
# # for a continuous variable mapped to one of the axes for use with ggplot2
# # User enters:
# # - the range of the data
# # - the tick amount
# # - skip.steps (if any) = number of labels to skip
# # (i.e. show tick but no label)
# # e.g.
# # ggplotBreaks(c(0,150), tick =10, skip.steps = 1) # gives
# # $breaks
# # 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150
# #
# # $labels
# # "0" " " "20" " " "40" " " "60" " " "80" " " "100" " " "120" " " "140" " "
if (length(range) != 2){
stop("Correct format for: range = c(min_value,max_value)")
}
if(skip.steps<0){
stop(" 'skip.steps' should be >= 0")
}
breaks <- seq(from = range[1],
to = range[2],
by = tick)
if(skip.steps==0){
labels <- as.character(breaks)
} else {
tmp <- c()
labels <- unlist(
lapply(
breaks[seq(from = 1,
to = length(breaks),
by = skip.steps+1)],
function(x) c(tmp,
c(x,
rep(" ",skip.steps)))),
use.names = F)
rm(tmp)
}
if(length(labels) < length(breaks)){
## Add more spaces at the end to make lengths match
labels <- c( labels, rep(" " , (length(breaks) - length(labels))))
}
if ( length(breaks) < length(labels)){
## Remove spaces from the end to make the lengths match
labels <- labels[1:length(breaks)]
}
return(list( breaks = breaks,
labels = labels))
}
# Installing missing dependencies # Development of the script has migrated to https://github.com/dchakro/
dependencies <- c("stringi", "progress") # replace the line sourcing this file with the following in your code.
missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])] source('https://raw.githubusercontent.com/dchakro/shared_Rscripts/master/unparalogMutations.R')
if(length(missing_packages)) install.packages(missing_packages) \ No newline at end of file
rm(missing_packages,dependencies)
unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){
# #<---------------------------->
# # You must include this section when:
# # Distributing, Using and/or Modifying this code.
# # Please read and abide by the terms of the included LICENSE.
# # Copyright 2020, Deepankar Chakroborty, All rights reserved.
# #
# # Author : Deepankar Chakroborty (https://gitlab.utu.fi/deecha)
# # Report issues: https://gitlab.utu.fi/deecha/shared_scripts/-/issues
# # License: https://gitlab.utu.fi/deecha/shared_scripts/-/blob/master/LICENSE
# #<---------------------------->
# # PURPOSE:
# # In the gene column in your SNV annotation if you see something like:
# # e.g. PRAMEF7;PRAMEF8 OR PRAMEF7,PRAMEF8
# # then your mutations annotations have gene paralogs.
# # This script aims to de-couple those paralogs into individual their rows.
# #
# # INFO on what to pass as the function parameters:
# Assign correct paralog_separator found in the gene column of your SNV annotations # e.g. if the Gene column has entries like PRAMEF7;PRAMEF8
# then the paralog_separator is ";"
# or set it to whatever separator is used by your SNV annotation software.
# Assign correct annotation_separator in the Amino acid change column of your SNV annotations
# "PRAMEF8:NM_001012276:exon3:c.C541A:p.Q181K,PRAMEF7:NM_001012277:exon3:c.C541A:p.Q181K"
# In the above example the annotation_separator is ","
# GeneColName = Column name in the SNV annotation table where the Genes are listed
# AnnotationColName = Column name in the SNV annotation table where the Amino acid changes are listed
# <--------------->
# Sanity checks
check_paralog_sep <- !any(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
check_annotation_sep <- !any(stringi::stri_detect_fixed(str = DATA$AAChange.refGene, pattern = annotation_separator))
idx_Gene <- which(colnames(DATA) == GeneColName)
idx_Annotation <- which(colnames(DATA) == AnnotationColName)
if(length(idx_Gene) == 0 | length(idx_Annotation) == 0 | check_paralog_sep | check_annotation_sep) {
message(stringi::stri_c("You entered :-->",
"\nparalog_separator = ",paralog_separator,
"\nannotation_separator = ",annotation_separator,
"\nGeneColName = ", GeneColName,
"\nAnnotationColName = ",AnnotationColName))
stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.")
}
# Cleanup
rm(check_paralog_sep,check_annotation_sep)
gc()
current.idx <- 1 # nrow(DATA)+1
paralog.idx <- which(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
pb <- progress::progress_bar$new(total=length(paralog.idx),format = " [:bar] :current/:total (:percent)",); pb$tick(0)
Number_of_paralogs <- sum(stringi::stri_count_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
message(stringi::stri_c("There are ",length(paralog.idx)," annotations with ", Number_of_paralogs," paralogs."))
# copying structure of original DATA
DATA.new <- DATA[1,]; DATA.new <- DATA.new[-1,]
# Creating empty table which will be populated in the for loop
DATA.new <- dplyr::bind_rows(DATA.new,data.frame(matrix(nrow = (length(paralog.idx)+Number_of_paralogs), ncol = ncol(DATA),dimnames = list(c(),colnames(DATA))),stringsAsFactors = F))
# Beginning isolation of the paralogs
for(i in paralog.idx){
Muts <- unlist(stringi::stri_split_fixed(DATA$AAChange.refGene[i],annotation_separator),use.names = F,recursive = F)
for (gene in unlist(stringi::stri_split_fixed(DATA$Gene.refGene[i],pattern = paralog_separator),use.names = F,recursive = F)){
DATA.new[current.idx,] <- DATA[i,]
DATA.new$Gene.refGene[current.idx] <- gene
DATA.new$AAChange.refGene[current.idx] <- paste0(Muts[grep(gene,Muts,fixed = T)],collapse=annotation_separator)
current.idx <- current.idx+1
}
pb$tick(1)
}
# removing the original rows with the paralogs as they are all unparalogged now
DATA <- DATA[-paralog.idx,]
DATA <- dplyr::bind_rows(DATA,DATA.new)
rm(DATA.new) ; gc()
rownames(DATA) <- as.character(seq(1,length(DATA[,1])))
return (DATA)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment