Skip to content
Snippets Groups Projects
Commit 4f722e54 authored by robocopAlpha's avatar robocopAlpha
Browse files

working code

parent 802746ae
No related branches found
No related tags found
No related merge requests found
# #<---------------------------->
# # Author : Deepankar Chakroborty
# # Report issues:
# # Please include this when distributing my code.
# # In the gene column in your SNV annotation if you see something like:
# # e.g. PRAMEF7;PRAMEF8 OR PRAMEF7,PRAMEF8
# # then your mutations annotations have gene paralogs.
# #
# # This script aims to de-couple those paralogs into individual their rows.
# #
# #<---------------------------->
### Info:
# Assign correct paralog_separator found in the gene column of your SNV annotations # e.g. if the Gene column has entries like PRAMEF7;PRAMEF8
# then the paralog_separator is ";"
# or set it to whatever separator is used by your SNV annotation software.
# Assign correct annotation_separator in the Amino acid change column of your SNV annotations
# "PRAMEF8:NM_001012276:exon3:c.C541A:p.Q181K,PRAMEF7:NM_001012277:exon3:c.C541A:p.Q181K"
# In the above example the annotation_separator is ","
# GeneColName = Column name in the SNV annotation table where the Genes are listed
# AnnotationColName = Column name in the SNV annotation table where the Amino acid changes are listed
# <--------------->
unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){
# Installing missing dependencies
dependencies <- c("stringi", "progress")
missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])]
if(length(missing_packages)) install.packages(missing_packages)
# Sanity checks
check_paralog_sep <- !any(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
check_annotation_sep <- !any(stringi::stri_detect_fixed(str = DATA$AAChange.refGene, pattern = annotation_separator))
idx_Gene <- which(colnames(DATA) == GeneColName)
idx_Annotation <- which(colnames(DATA) == AnnotationColName)
if(length(idx_Gene) == 0 | length(idx_Annotation) == 0 | check_paralog_sep | check_annotation_sep) {
message(stringi::stri_c("You entered :-->",
"\nparalog_separator = ",paralog_separator,
"\nannotation_separator = ",annotation_separator,
"\nGeneColName = ", GeneColName,
"\nAnnotationColName = ",AnnotationColName))
stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.")
}
# Cleanup
rm(missing_packages,dependencies,check_paralog_sep,check_annotation_sep)
gc()
current.idx <- nrow(DATA)+1
paralog.idx <- which(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
pb <- progress::progress_bar$new(total=length(paralog.idx),format = " [:bar] :current/:total (:percent)",); pb$tick(0)
Number_of_paralogs <- sum(stringi::stri_count_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
message(stringi::stri_c("There are ",length(paralog.idx)," annotations with ", Number_of_paralogs," paralogs."))
# copying structure of original DATA
DATA.add <- DATA[1,]; DATA.add <- DATA.add[-1,]
# Adding the empty rows to original table.
# These rows will be populated in the for loop
DATA.add <- dplyr::bind_rows(DATA.add,data.frame(matrix(nrow = (length(paralog.idx)+Number_of_paralogs), ncol = ncol(DATA),dimnames = list(c(),colnames(DATA))),stringsAsFactors = F))
DATA <- dplyr::bind_rows(DATA,DATA.add)
rm(DATA.add) ; gc()
# Beginning isolation of the paralogs
for(i in paralog.idx){
Muts <- unlist(stringi::stri_split_fixed(DATA$AAChange.refGene[i],annotation_separator),use.names = F,recursive = F)
for (gene in unlist(stringi::stri_split_fixed(DATA$Gene.refGene[i],pattern = paralog_separator),use.names = F,recursive = F)){
DATA[current.idx,] <- DATA[i,]
DATA$Gene.refGene[current.idx] <- gene
DATA$AAChange.refGene[current.idx] <- paste0(Muts[grep(gene,Muts,fixed = T)],collapse=",")
current.idx <- current.idx+1
}
pb$tick(1)
}
# removing the original rows with the paralogs as they are all unparalogged now
DATA <- DATA[-paralog.idx,]
rownames(DATA) <- as.character(seq(1,length(DATA[,1])))
return (DATA)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment