From 4f722e5464c576bc20f11fdc27eda794b278066a Mon Sep 17 00:00:00 2001
From: robocopAlpha <deep@iMac.T5>
Date: Wed, 29 Jul 2020 11:37:33 +0300
Subject: [PATCH] working code

---
 unparalogMutations.R | 84 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/unparalogMutations.R b/unparalogMutations.R
index e69de29..e72d2c3 100644
--- a/unparalogMutations.R
+++ b/unparalogMutations.R
@@ -0,0 +1,84 @@
+# #<---------------------------->
+# #  Author : Deepankar Chakroborty
+# #  Report issues: 
+# #  Please include this when distributing my code. 
+# #  In the gene column in your SNV annotation if you see something like:
+# #  e.g. PRAMEF7;PRAMEF8  OR  PRAMEF7,PRAMEF8
+# #       then your mutations annotations have gene paralogs.
+# #
+# #  This script aims to de-couple those paralogs into individual their rows.
+# #
+# #<---------------------------->
+
+### Info:
+# Assign correct paralog_separator found in the gene column of your SNV annotations # e.g. if the Gene column has entries like PRAMEF7;PRAMEF8 
+# then the paralog_separator is ";" 
+# or set it to whatever separator is used by your SNV annotation software.
+
+# Assign correct annotation_separator in the Amino acid change column of your SNV annotations
+# "PRAMEF8:NM_001012276:exon3:c.C541A:p.Q181K,PRAMEF7:NM_001012277:exon3:c.C541A:p.Q181K"
+# In the above example the annotation_separator is ","
+
+# GeneColName = Column name in the SNV annotation table where the Genes are listed
+
+# AnnotationColName = Column name in the SNV annotation table where the Amino acid changes are listed
+
+# <--------------->
+
+unparalog <- function(DATA, paralog_separator = ";", annotation_separator = ",", GeneColName , AnnotationColName ){
+   # Installing missing dependencies
+   dependencies <- c("stringi", "progress")
+   missing_packages <- dependencies[!(dependencies %in% installed.packages()[, "Package"])]
+   if(length(missing_packages)) install.packages(missing_packages)      
+   
+   # Sanity checks
+   check_paralog_sep <- !any(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
+   check_annotation_sep <- !any(stringi::stri_detect_fixed(str =  DATA$AAChange.refGene, pattern = annotation_separator))
+   
+   idx_Gene <- which(colnames(DATA) == GeneColName)
+   idx_Annotation <- which(colnames(DATA) == AnnotationColName)
+   
+   if(length(idx_Gene) == 0 | length(idx_Annotation) == 0 | check_paralog_sep | check_annotation_sep) {
+      message(stringi::stri_c("You entered :-->",
+                              "\nparalog_separator = ",paralog_separator,
+                              "\nannotation_separator = ",annotation_separator,
+                              "\nGeneColName = ", GeneColName,
+                              "\nAnnotationColName = ",AnnotationColName))
+   stop("Inconsistencies with these input parameters.\n Ensure they are correct and try again.")
+   }
+   # Cleanup
+   rm(missing_packages,dependencies,check_paralog_sep,check_annotation_sep)
+   gc()
+   
+   current.idx <- nrow(DATA)+1
+   paralog.idx <- which(stringi::stri_detect_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
+   pb <- progress::progress_bar$new(total=length(paralog.idx),format = " [:bar] :current/:total (:percent)",); pb$tick(0)
+   
+   Number_of_paralogs <- sum(stringi::stri_count_fixed(str = DATA$Gene.refGene,pattern = paralog_separator))
+   message(stringi::stri_c("There are ",length(paralog.idx)," annotations with ", Number_of_paralogs," paralogs."))
+   
+   # copying structure of original DATA
+   DATA.add <- DATA[1,]; DATA.add <- DATA.add[-1,] 
+   
+   # Adding the empty rows to original table. 
+   # These rows will be populated in the for loop
+   DATA.add <- dplyr::bind_rows(DATA.add,data.frame(matrix(nrow = (length(paralog.idx)+Number_of_paralogs), ncol = ncol(DATA),dimnames = list(c(),colnames(DATA))),stringsAsFactors = F))
+   DATA <- dplyr::bind_rows(DATA,DATA.add)
+   rm(DATA.add) ; gc()
+   
+   # Beginning isolation of the paralogs
+   for(i in paralog.idx){
+      Muts <- unlist(stringi::stri_split_fixed(DATA$AAChange.refGene[i],annotation_separator),use.names = F,recursive = F)
+      for (gene in unlist(stringi::stri_split_fixed(DATA$Gene.refGene[i],pattern = paralog_separator),use.names = F,recursive = F)){
+         DATA[current.idx,] <- DATA[i,]
+         DATA$Gene.refGene[current.idx] <- gene
+         DATA$AAChange.refGene[current.idx] <- paste0(Muts[grep(gene,Muts,fixed = T)],collapse=",")
+         current.idx <- current.idx+1
+      }
+      pb$tick(1)
+   }
+   # removing the original rows with the paralogs as they are all unparalogged now
+   DATA <- DATA[-paralog.idx,]
+   rownames(DATA) <- as.character(seq(1,length(DATA[,1])))
+   return (DATA)
+}
-- 
GitLab