BiologicalDataWithR course, eight files

5f419b3e · Jouni Kettunen · 5f419b3e · 5f419b3e · 5f419b3e · 5f419b3e
Commit 5f419b3e authored Oct 21, 2021 by Jouni Kettunen
--- a/README.md
+++ b/README.md
+# Bioinfromatics Data with R
+
+The files in here are from the course BIOI4440 Biological Data Analysis with R.
+
+The course covered aspects of running data-analysis with BioConductor packages
+
+The data includes e.g. high-throughput sequencing, ELISA and mass spectrometry data.
+
+Weekly numbers correspond the context by following:
+1. Introduction to R statistical environment
+1. Practicing R programming
+1. Simple sequence analysis
+1. Annotating gene groups
+1. Transcriptomics: gene expression microarrays and qRT-PCR
+1. Deciphering the regulome: from ChIP to ChIP-seq
+
+Some of the scripts need external datasets that are not involved. Any unauthorized usage is prohibited.
--- a/Week1Kettunen.R
+++ b/Week1Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: JOUNI KETTUNEN 
+
+# I use RStudio so I just clicked from the right corner below
+# and set my working directory to BioR21 folder
+setwd("~/BioR21/Week1")
+
+# imported the .csv via ready made tools
+# right top corner: import dataset, text, tab separators
+# heading yes and Automatic (first columns)
+furin.file <- read.delim("~/BioR21/furin_data.csv")
+
+# get the six first rows
+head(furin.file)
+
+# get the last six rows
+tail(furin.file)
+
+# display the structure of furin.file
+str(furin.file)
+
+# whole data plotted
+plot(furin.file)
+
+
+# getting help
+help("jpeg")
+
+#just "randomly" setting some values and seeing what happens
+jpeg(filename = "Naive.WT.1_plot.jpg",
+     width = 480, height = 480, units = "cm", pointsize = 10,
+     quality = 75, res = NA)
+# Naive.WT.1 plotted
+plot(furin.file$Naive.WT.1)
+#shutting the jpeg
+dev.off()
+# and the file is saved
+
+# might this be better than just saving the image straightly
+# from the plots -tab?
--- a/Week2Kettunen.R
+++ b/Week2Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+# STUDENT NAME: JOUNI KETTUNEN
+
+# this wasn't that hard to do on my own
+# for some reason I have often these kind of "ignition problems"
+
+# like looping: I've done it in Python and Java and R basic course
+# but for some reason felt like someone had pressen red button
+# and erased all my knowledge
+
+
+# set a new working directory for the exercises
+setwd("~/BioR21/Week2")
+# used the R-studios automatic function to get the .csv file
+# import dataset -> from text -> heading yes -> row names automatic 
+furin_data <- read.delim("~/BioR21/furin_data.csv")
+
+# empty vectors a & b
+# bit stupid names, but didn't want to change 
+# as I had them already defined
+a <- c()
+b <- c()
+#start for loop, range needs to be a vector
+# i.e. 1 to number of rows
+for (i in 1:nrow(furin_data)) {
+  # assign the filter value to a variable
+  k <- furin_data[i,3]/furin_data[i,5]
+  # use the variable for filtering
+  if((k > 2) | (k < 0.5)) {
+    # save the k to a local variable
+    outputk <- k
+    # concatenate the previously defined vectors and local variables
+    b <- c (b, outputk)
+    # b holds the values
+    nams <- furin_data$NAME[i]
+    # a holds the names
+    a <- c(a,nams)
+    # finally print the names that fullfill the conditions
+    print(furin_data$NAME[i])
+  }
+}
+
+mydf <- data.frame(fraction = b)
+#create a data frame with 1 column (ratio values)
+
+# JUST FOR PRACTICE, CODE BELOW
+
+
+help("make.names")
+# use make.names to transform duplicates to unique ids
+# data frame can't have duplicate id's
+rownames(mydf) = make.names(a, unique = TRUE)
+
+
+#rename the data frame 1st column (note that starts from 1 not 0)
+names(mydf)[1] <- "ratio"
+#write the .csv to a data frame
+write.csv(mydf, "filteredDF.csv")
+
--- a/Week3Kettunen.R
+++ b/Week3Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: Jouni Kettunen
+
+
+
+
+setwd("~/BioR21/Week3")
+
+# First thought that these had to be separately created
+# and thus left them be such
+
+# 100.000 random numbers
+vals <- rnorm(1000000)
+# create a matrix containing random numbers
+mym <- matrix(vals, ncol = 10, byrow = TRUE)
+
+# just to test the structure of the matrix
+
+a <- mym[1:5]
+b <- mym[6:10]
+
+testin <- t.test(mym[1:5], mym[6:10])
+
+testin$p.value
+
+# ctrl + i for indentation
+
+#set up a counter for the values
+counter = 0
+# system.time to measure how long the code runs
+system.time(
+  # loop from 1 to number of rows
+  for (i in 1:nrow(mym)) {
+    # again I thought that we needed a vector for latter purpose
+    # while the oneliner was nice solutio, I left this as such as it was
+    # to show independent thinking
+    
+    # take values from every row(1 - number of rows) column-wise from 1-5 and 6-10
+    savedtest <- t.test(mym[i,1:5],mym[i,6:10])
+    # if the p-value is significant
+    if (savedtest$p.value < 0.05) {
+      #increase the counter by 1
+      counter = counter +1
+    }
+    #print(mym[i,1:5])
+  })
+ print(c("Number of items", counter))
--- a/Week4Kettunen.R
+++ b/Week4Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: Jouni Kettunen
+
+# set a new working directory for this week
+setwd("~/BioR21/Week4")
+
+# create a matrix with 100000 rows and 20 columns and fill it with random numbers
+# had to use calculator to check the amount of needed numbers
+mymat <- matrix(rnorm(2000000), ncol = 20, byrow = TRUE)
+
+# empty vectors for the p-values
+p5 <- c()
+p10 <- c()
+
+#for loop to calculate the t-test with 5 first rows
+for (i in 1:nrow(mymat)) {
+  k <- (t.test(mymat[i,1:5],mymat[i,6:10]))
+  output <- k$p.value
+  p5 <- c(p5, output)
+}
+# attach the p-values holding p5 vector to matrix
+mymat <- cbind(mymat, p5 = p5)
+
+#adjust the p-value and attach it to the matrix
+mymat <- cbind(mymat, p5adjust = (p.adjust(p5)))
+
+
+#for loop to calculate the t-test with 10 rows
+for (i in 1:nrow(mymat)) {
+  x <- (t.test(mymat[i,1:10],mymat[i,11:20]))
+  output <- x$p.value
+  p10 <- c(p10, output)
+}
+
+# attach the p-values holding p10 vector to matrix
+mymat <- cbind(mymat, p10 = p10)
+#adjust the p-value and attach it to the matrix
+mymat <- cbind(mymat, p10adjust = (p.adjust(p10)))
+
+#transform the matrix to dataframe for easier handling
+myframe <- as.data.frame(mymat)
+
+#find the number of significant p-values 
+sum(myframe$p5<0.05)
+sum(myframe$p10<0.05)
+sum(myframe$p5adjust<0.05)
+sum(myframe$p10adjust<0.05)
+
+
+### These are some alternative solutions I tried ###
+
+# I created a subset of the dataframe, but this didn't
+# work as I thought. If i use AND operator between columns I don't get
+# any values as the condition is impossible. 
+# If I use the OR, I get also some values that aren't < 0.05
+
+newframe <- subset(myframe, p5 < 0.05 | p10 < 0.05 | p5adjust < 0.05 | p10adjust < 0.05,
+                   select = c("p5", "p10", "p5adjust", "p10adjust"))
+
+# the frame is filtered to 9043 rows 
+nrow(newframe)
+
+# and the values are here
+sum(newframe$p5<0.05)
+
+#selecting columns one by one would've been possibility
+this <- subset(myframe, p10adjust < 0.05, select = "p10adjust")
+
--- a/Week5Kettunen.R
+++ b/Week5Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: Jouni Kettunen 
+
+
+# generate a vector with the given accessions
+FGF2s <- c("NP_001997.5", "NP_001103711.1", "NP_990764.1", "NP_062178.1", "NP_032032.1", "XP_003432529.1")
+
+library(rentrez)
+
+otherFGF2 <- entrez_fetch(db = "protein", id = FGF2s, rettype="fasta")
+
+# make it into list to write into computer
+anotherFGF2 <- list(otherFGF2)
+
+#use the data.tables library to write file
+library(data.table)
+
+# write the file to computer¨with fwrite (Stack Overflow tells that it is fastest method)
+fwrite(anotherFGF2, file = "fastaFGF2.fasta")
+
+# use the seqinr to
+library(seqinr)
+
+# read the file from computer
+FGF2fasta <- read.fasta(file = "fastaFGF2.fasta",seqtype = "AA")
+
+# get the headers
+getAnnot(FGF2fasta)
+
+library(stringr)
+
+# tell that we are searching []
+chars <- '\\[.+]'
+
+# extract the names from the annotation
+names <- str_extract(unlist(getAnnot(FGF2fasta)),chars)
+
+# remove the [ brackets ]
+names <- gsub(pattern = "[[:punct:]]",names, replacement="")
+# the "[[:punct:]]" contains the all punctuation marks (incl. []) in reg ex
+
+# use the biomaRt library to get data
+library(biomaRt)
+
+# make a mart object before use
+mart <- useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl")
+
+# save the data to vector
+toStats <- getSequence(FGF2fasta)
+
+# make a 3x2 canvas
+par(mfrow=c(3,2))
+
+# create plots with loop
+# title creates the headers
+for (i in 1:6) {
+  AAstat(toStats[[i]])
+  title(names[i])
+}
+# change back to 1x1 canvas
+par(mfrow=c(1,1))
+
+# edit:
+# I tried this with the ape package as it is also a list formatted
+# and voilá, works so beautifully
+
+# trace(read.GenBank, edit = TRUE)
+# FGF2seq<-read.GenBank(access.nb = FGF2s,as.character=T)
+#for (i in 1:6) {
+ # AAstat(toupper(FGF2seq[[i]]))
+  # title(names[i])
+#}
\ No newline at end of file
--- a/Week6Kettunen.r
+++ b/Week6Kettunen.r
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: Jouni Kettunen
+
+
+
+
+# A data frame with 25 rows and 3 columns
+
+elisa.data <- data.frame(matrix(NA, nrow = 25, ncol = 3))
+
+# name the columns
+names(elisa.data)[1] <- "label"
+names(elisa.data)[2]<- "OD"
+names(elisa.data)[3] <- "conc"
+
+# fill the first column with names
+elisa.data$label <- c(rep("ref", 5), rep("treat",10), rep("cont",10))
+
+# a random number for optical density
+my_OD <- c(runif(1, min=1, max=100))
+
+# five known values (controls)
+# diluted by 10
+for (i in 2:5){
+  my_OD[i] <- my_OD[i-1]/10
+}
+# attach them to the OD column
+elisa.data$OD[1:5] <- my_OD
+
+# fill the rest part of OD with random numbers
+elisa.data$OD[6:25] <- runif(20)
+
+# an imaginary concentration
+# and diluttions of it by 10
+ref_conc <- 150 
+for (i in 2:5) {
+  ref_conc[i] <- ref_conc[i-1]/10
+}
+#attach the values to the reference column
+elisa.data$conc[1:5] <- ref_conc 
+
+# load DRC library
+library(drc)
+
+# create a model for the measurements
+elisamodel <- drm(formula = OD ~ conc, 
+                  data = elisa.data[1:5,], fct = LL.4())
+
+
+# a new data frame with values from the original dataset 
+elisa.data_predicted <- elisa.data[6:25,]
+
+# fill the concentration column with values predicted on the basis of known values
+elisa.data_predicted$conc <- predict(elisamodel, data.frame(OD=elisa.data$OD[6:25]))
+
+# create a boxplot to compare the groups 
+boxplot(conc ~label, data = elisa.data_predicted)
+
--- a/Week7Kettunen.R
+++ b/Week7Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
+
+# STUDENT NAME: Jouni Kettunen 
+
+#### TASK1 ####
+
+# get an overview of the Theoph dataset with help
+help(Theoph)
+
+# the dose is given in the datasets "Dose" column, only timespan and n are mentioned in text
+# "Twelve subjects were given oral doses of theophylline then serum concentrations were measured at 11 time points over the next 25 hours."
+
+#returns the sturcture of dataset and telling it's data.frame and etc.
+str(Theoph)
+
+
+#inspecting the first six values
+head(Theoph)
+
+# get a summary of the statistics 
+summary(Theoph)
+
+#### TASK2 ####
+
+# save the reference to the dataset to a local variable for further manipulation
+Theo_data <- Theoph
+
+# At first I only managed to do the plot that was in the help page
+
+# here's what they use in the example of the help page of Theoph (except show.given is FALSE)
+coplot(conc ~ Time | Subject, data = Theoph, show.given = TRUE)
+# in a way I think this is quite handy, but not that clear by default
+
+# Then I saw that you used the ggplot and decided to have a go
+
+# Load ggplot and dplyr to use piping
+library(ggplot2)
+library(dplyr)
+
+# pipe the dataframe, group it and color it by the Subjects. All in one graph.
+ggplot(Theo_data %>% group_by(Subject), 
+       aes(x = Time, y = conc, color = Subject)) + geom_line() + ggtitle("Teophilin concentrations")
+# This is the best!
+
+
+# For loop is still a mystery, it has been always the most difficult thing to me in all languages
+
+par(mfrow = c(3,4))
+
+# I understand the example
+for (i in 1:12){
+  plot(i*c(1:10))
+}
+# but how to implement, no idea.
+
+# But during these desperate coding years I've tried to rely on Mr. Google and Stackoverflow. 
+# And here's what I found out. Not that I truly understand, but still.
+
+# Make canvas in size 3x4
+par(mfrow = c(3, 4))
+
+# Make an ordered data frame
+Theo_data2 <- Theoph%>% group_by(Subject)
+
+# for x in unique() gives all unique values (levels should work also)
+for (cat in unique(Theo_data2$Subject)){
+  
+  # this subsets the data by selecting the corresponding Subject
+  d <- subset(Theo_data2, Theo_data2$Subject == cat)
+  
+  # and this plots the subjects data conc~time manner
+  plot(d$Time, d$conc, xlim = range(d$Time), ylim = range(d$conc),
+       
+       # couldn't extract the Subject info from the data frame, so I used the looping variable
+       # this naturally works only in these 1 to something plots
+       main = paste("Plot of Subject: ", cat))
+}
+
+# return the canvas size
+par(mfrow = c(1,1))
+
+#### TASK3 ####
+
+# Three individual dataframes
+
+# dose is constant so no need for max and mean, just pick the values with either or
+theoDose <- aggregate(list(Dose = Theo_data$Dose),by=list(Subject = Theo_data$Subject),mean)
+
+# Max and Mean for the concentration
+theoMaxC <- aggregate(list(maxC = Theo_data$conc),by=list(Subject = Theo_data$Subject),max)
+theomeanC <- aggregate(list(meanC = Theo_data$conc),by=list(Subject = Theo_data$Subject),mean)
+
+### TASK 4 ###
+
+# Fo easier operations, combine the three data frames (only value columns needed from last 2)
+
+doseConcentration <- data.frame(theoDose, maxC = theoMaxC$maxC, meanC = theomeanC$meanC)
+
+# Investigate the relationships of dose vs concentration with ggplot
+
+# plot dose vs. max concentration, gray area (to my understanding) is the SD
+ggplot(doseConcentration %>% group_by(Subject) 
+       ,(aes(x = maxC, y = Dose))) + geom_point() + geom_smooth(method = "lm") 
+
+# Plot dose vs mean concentration. No need to re-pipe, as the frame is already in
+ggplot(doseConcentration,(aes(x = meanC, y = Dose))) + geom_point() + geom_smooth(method = "lm")
+
+# Really ugly looking dispersion with several outliers!
+
+# Relationship between max and mean concentration
+ggplot(doseConcentration,(aes(x = meanC, y = maxC))) + geom_point() + geom_smooth(method = "lm")
+# This is something to be expected and to occur. Nicely plotted around the line.
+
+### Task 5 & 6 ###
+
+# After googling decided to use easier way to combine correlation and plot directly
+
+# load to make some graphs and correlations
+library(ggpubr)
+
+# pearson correlation and the graph for the values
+ggscatter(doseConcentration, x = "meanC", y = "maxC", add = "reg.line", 
+          conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
+          xlab = "Concentration mean", ylab = "Concentration maximum")
+
+ggscatter(doseConcentration, x = "meanC", y = "Dose", add = "reg.line", 
+          conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
+          xlab = "Mean concentration", ylab = "Dose")
+
+# for some reason the colors dont work properly and the graphs are bit archaic 
+ggscatter(doseConcentration, x = "maxC", y = "Dose", add = "reg.line", 
+          conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
+          xlab = "Max concentration", ylab = "Dose", palette = c("springfield"))
+
+# to sum it up: only meanc and maxC had a statistically significant correlation.
+# rest two had a weak positive correlation
+
--- a/Week8Kettunen.R
+++ b/Week8Kettunen.R
+# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITH CONSULTING GOOGLE AND FOLLOWING OTHERS SOLUTIONS
+
+# STUDENT NAME: Jouni Kettunen
+
+# After going into Moodle I found out that this weeks solutions were already there, not nice.
+# Keeping my pride, I decided to return this flawed and hard googled work despite having an opportunity to do everything right by following the examples
+# It's divided into original part and after part, the latter having solutions done in the class and me implementing snippets I found from internet.
+
+# Set the working directory
+setwd("~/BioR21/Week8")
+
+#### TASK 1 #####
+
+
+# load the igraph for visualization
+
+library(igraph)
+
+# make the yeast.graphml to an igraph object
+yeastgraph <- read_graph("yeast.graphml", format = "graphml")
+
+#### TASK 2 #####
+
+# create the Global Efficiency function
+global.efficiency <- function(graph) {
+  
+  # variable n to hold the number of vertices
+  N <- vcount (graph)
+ 
+  # dij to hold the shortest paths value
+  dij <- shortest.paths(graph)
+  
+  #replaces zeros as infinite values
+  dij[dij == 0] <- Inf
+  
+  # calculation part
+  out <- (1/(N*(N-1))) * sum(1/dij)
+  return(out)
+}
+
+global.efficiency(yeastgraph)
+
+#### TASK 3 & 4 (original) #####
+
+# my skills are far far far faaaaaaaar away to do any own implementations
+# so here's the thing you advised; "emulation"
+dummy.vulnerability <- function () {
+  return(rnorm(1))
+}
+
+dummy.vulnerability()
+
+# I didn't anymore know what to do, so I really got frustrated
+
+# I'm sorry for the hard words but I left them to show my frustration
+
+# I do understand that this is an university level course not a kindergarten, but still
+# PLEASE: give more instructions when things are this hard. I really found myself hanging loose and got really frustrated
+# This after all is a matter of continuing or not. If this is so hard and no help or no explanations, why to continue why to try at all?
+
+# no idea how to use the function, but I did it like this
+V(yeastgraph)$vul <- rnorm(vcount(yeastgraph)) 
+
+# hopefully values have changed
+V(yeastgraph)$vul
+
+#### TASK 5 (original) ####
+
+# is this the plotting function?
+plot(yeastgraph)
+
+V(yeastgraph)$color <- "pink"
+# i understand that you can do a lot of coloring, but how. Why there wasn't any proper examples?!
+
+#### TASK 3-5 (after) ####
+
+# I feel that the function part is a mindless copy of others solution, I wish you would've explained this more thoroughly
+
+# Task 3
+
+vulnerability <- function(graph) {
+ # variable E counts the efficiency and holds its value
+  E <- global.efficiency(graph)
+  
+  # an empty vector
+  result <- c()
+  
+  # start from the first vertex and travel to the last
+  for(i in 1:vcount(graph)) {
+    
+    # Atte or Antti had this nice printing option to show where we are in every 100th vertice
+    if(i %% 100 == 0){
+      print(paste("prosessing vertex no ", i))
+    }
+    
+    # I'm not sure what this does, but it was needed
+    # are we saving instead of deleting things?
+    ver.del <- delete.vertices(graph, V(graph)[i])
+    
+    # count the efficiency for the altered 
+    E.i <- global.efficiency(ver.del)
+    V <- (E-E.i)/E
+    result <- c (result,V)
+  }
+  return(result)
+}
+
+# Task 4
+#assign the vulnerability as a node attribute
+V(yeastgraph)$vul <- vulnerability(yeastgraph)
+
+# Task 5
+
+# This is what I got after hours of googling and my own try-outs
+
+# load the ready made colorlibrary
+library(rcartocolor)
+
+# find appropriate colorscale from colors that suit also to colorblind
+display_carto_all(type = "all", colorblind_friendly = T)
+
+# attach the colorscale to a colorRampPalette function that is used in the coloring (found this from Stackoverflow)
+rbPal <- colorRampPalette(carto_pal(7, "SunsetDark"))
+
+# use the 7 colors for the nodes
+# is 7 enough?
+V(yeastgraph)$Col <- rbPal(7)[as.numeric(cut(V(yeastgraph)$vul,breaks = 7))]
+
+# inspect that the colors really are there
+unique(V(yeastgraph)$Col)
+
+# trying to make a reasonable graph with different layout options, found this and the latter part from "Network Visualization Cookbook"
+coords <- layout_with_fr(yeastgraph, niter = 10000)
+coords2 <- layout_with_graphopt(yeastgraph, niter = 10000)
+
+# plot the network with adjusted layout and node options to get a better view
+# aspect size asp = 0 to get more area covered, reduce vertex size from default 15 to 6, color edge to white and color vertices by their vulnerability
+plot.igraph(yeastgraph, layout = coords, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
+plot.igraph(yeastgraph, layout = coords2, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
+
+# Neither of these look good, they are just one messy pile of labels.
+# The good thing is that when I used the pseudofunction to create colors for the "vulnerability", the vertix colors actually change accordingly!
+
+# Figured out that tkplot can also be used with same attributes as plot.igraph, bad side is that it seems to take a long time to finish. 
+# Good side is that it shows the colors even better than plain plot.
+
+tkplot(yeastgraph, layout = coords, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
\ No newline at end of file