Skip to content
Snippets Groups Projects
Commit 5f419b3e authored by Jouni Kettunen's avatar Jouni Kettunen
Browse files

BiologicalDataWithR course, eight files

parents
No related branches found
No related tags found
No related merge requests found
# Bioinfromatics Data with R
The files in here are from the course BIOI4440 Biological Data Analysis with R.
The course covered aspects of running data-analysis with BioConductor packages
The data includes e.g. high-throughput sequencing, ELISA and mass spectrometry data.
Weekly numbers correspond the context by following:
1. Introduction to R statistical environment
1. Practicing R programming
1. Simple sequence analysis
1. Annotating gene groups
1. Transcriptomics: gene expression microarrays and qRT-PCR
1. Deciphering the regulome: from ChIP to ChIP-seq
Some of the scripts need external datasets that are not involved. Any unauthorized usage is prohibited.
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: JOUNI KETTUNEN
# I use RStudio so I just clicked from the right corner below
# and set my working directory to BioR21 folder
setwd("~/BioR21/Week1")
# imported the .csv via ready made tools
# right top corner: import dataset, text, tab separators
# heading yes and Automatic (first columns)
furin.file <- read.delim("~/BioR21/furin_data.csv")
# get the six first rows
head(furin.file)
# get the last six rows
tail(furin.file)
# display the structure of furin.file
str(furin.file)
# whole data plotted
plot(furin.file)
# getting help
help("jpeg")
#just "randomly" setting some values and seeing what happens
jpeg(filename = "Naive.WT.1_plot.jpg",
width = 480, height = 480, units = "cm", pointsize = 10,
quality = 75, res = NA)
# Naive.WT.1 plotted
plot(furin.file$Naive.WT.1)
#shutting the jpeg
dev.off()
# and the file is saved
# might this be better than just saving the image straightly
# from the plots -tab?
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: JOUNI KETTUNEN
# this wasn't that hard to do on my own
# for some reason I have often these kind of "ignition problems"
# like looping: I've done it in Python and Java and R basic course
# but for some reason felt like someone had pressen red button
# and erased all my knowledge
# set a new working directory for the exercises
setwd("~/BioR21/Week2")
# used the R-studios automatic function to get the .csv file
# import dataset -> from text -> heading yes -> row names automatic
furin_data <- read.delim("~/BioR21/furin_data.csv")
# empty vectors a & b
# bit stupid names, but didn't want to change
# as I had them already defined
a <- c()
b <- c()
#start for loop, range needs to be a vector
# i.e. 1 to number of rows
for (i in 1:nrow(furin_data)) {
# assign the filter value to a variable
k <- furin_data[i,3]/furin_data[i,5]
# use the variable for filtering
if((k > 2) | (k < 0.5)) {
# save the k to a local variable
outputk <- k
# concatenate the previously defined vectors and local variables
b <- c (b, outputk)
# b holds the values
nams <- furin_data$NAME[i]
# a holds the names
a <- c(a,nams)
# finally print the names that fullfill the conditions
print(furin_data$NAME[i])
}
}
mydf <- data.frame(fraction = b)
#create a data frame with 1 column (ratio values)
# JUST FOR PRACTICE, CODE BELOW
help("make.names")
# use make.names to transform duplicates to unique ids
# data frame can't have duplicate id's
rownames(mydf) = make.names(a, unique = TRUE)
#rename the data frame 1st column (note that starts from 1 not 0)
names(mydf)[1] <- "ratio"
#write the .csv to a data frame
write.csv(mydf, "filteredDF.csv")
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: Jouni Kettunen
setwd("~/BioR21/Week3")
# First thought that these had to be separately created
# and thus left them be such
# 100.000 random numbers
vals <- rnorm(1000000)
# create a matrix containing random numbers
mym <- matrix(vals, ncol = 10, byrow = TRUE)
# just to test the structure of the matrix
a <- mym[1:5]
b <- mym[6:10]
testin <- t.test(mym[1:5], mym[6:10])
testin$p.value
# ctrl + i for indentation
#set up a counter for the values
counter = 0
# system.time to measure how long the code runs
system.time(
# loop from 1 to number of rows
for (i in 1:nrow(mym)) {
# again I thought that we needed a vector for latter purpose
# while the oneliner was nice solutio, I left this as such as it was
# to show independent thinking
# take values from every row(1 - number of rows) column-wise from 1-5 and 6-10
savedtest <- t.test(mym[i,1:5],mym[i,6:10])
# if the p-value is significant
if (savedtest$p.value < 0.05) {
#increase the counter by 1
counter = counter +1
}
#print(mym[i,1:5])
})
print(c("Number of items", counter))
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: Jouni Kettunen
# set a new working directory for this week
setwd("~/BioR21/Week4")
# create a matrix with 100000 rows and 20 columns and fill it with random numbers
# had to use calculator to check the amount of needed numbers
mymat <- matrix(rnorm(2000000), ncol = 20, byrow = TRUE)
# empty vectors for the p-values
p5 <- c()
p10 <- c()
#for loop to calculate the t-test with 5 first rows
for (i in 1:nrow(mymat)) {
k <- (t.test(mymat[i,1:5],mymat[i,6:10]))
output <- k$p.value
p5 <- c(p5, output)
}
# attach the p-values holding p5 vector to matrix
mymat <- cbind(mymat, p5 = p5)
#adjust the p-value and attach it to the matrix
mymat <- cbind(mymat, p5adjust = (p.adjust(p5)))
#for loop to calculate the t-test with 10 rows
for (i in 1:nrow(mymat)) {
x <- (t.test(mymat[i,1:10],mymat[i,11:20]))
output <- x$p.value
p10 <- c(p10, output)
}
# attach the p-values holding p10 vector to matrix
mymat <- cbind(mymat, p10 = p10)
#adjust the p-value and attach it to the matrix
mymat <- cbind(mymat, p10adjust = (p.adjust(p10)))
#transform the matrix to dataframe for easier handling
myframe <- as.data.frame(mymat)
#find the number of significant p-values
sum(myframe$p5<0.05)
sum(myframe$p10<0.05)
sum(myframe$p5adjust<0.05)
sum(myframe$p10adjust<0.05)
### These are some alternative solutions I tried ###
# I created a subset of the dataframe, but this didn't
# work as I thought. If i use AND operator between columns I don't get
# any values as the condition is impossible.
# If I use the OR, I get also some values that aren't < 0.05
newframe <- subset(myframe, p5 < 0.05 | p10 < 0.05 | p5adjust < 0.05 | p10adjust < 0.05,
select = c("p5", "p10", "p5adjust", "p10adjust"))
# the frame is filtered to 9043 rows
nrow(newframe)
# and the values are here
sum(newframe$p5<0.05)
#selecting columns one by one would've been possibility
this <- subset(myframe, p10adjust < 0.05, select = "p10adjust")
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: Jouni Kettunen
# generate a vector with the given accessions
FGF2s <- c("NP_001997.5", "NP_001103711.1", "NP_990764.1", "NP_062178.1", "NP_032032.1", "XP_003432529.1")
library(rentrez)
otherFGF2 <- entrez_fetch(db = "protein", id = FGF2s, rettype="fasta")
# make it into list to write into computer
anotherFGF2 <- list(otherFGF2)
#use the data.tables library to write file
library(data.table)
# write the file to computer¨with fwrite (Stack Overflow tells that it is fastest method)
fwrite(anotherFGF2, file = "fastaFGF2.fasta")
# use the seqinr to
library(seqinr)
# read the file from computer
FGF2fasta <- read.fasta(file = "fastaFGF2.fasta",seqtype = "AA")
# get the headers
getAnnot(FGF2fasta)
library(stringr)
# tell that we are searching []
chars <- '\\[.+]'
# extract the names from the annotation
names <- str_extract(unlist(getAnnot(FGF2fasta)),chars)
# remove the [ brackets ]
names <- gsub(pattern = "[[:punct:]]",names, replacement="")
# the "[[:punct:]]" contains the all punctuation marks (incl. []) in reg ex
# use the biomaRt library to get data
library(biomaRt)
# make a mart object before use
mart <- useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl")
# save the data to vector
toStats <- getSequence(FGF2fasta)
# make a 3x2 canvas
par(mfrow=c(3,2))
# create plots with loop
# title creates the headers
for (i in 1:6) {
AAstat(toStats[[i]])
title(names[i])
}
# change back to 1x1 canvas
par(mfrow=c(1,1))
# edit:
# I tried this with the ape package as it is also a list formatted
# and voilá, works so beautifully
# trace(read.GenBank, edit = TRUE)
# FGF2seq<-read.GenBank(access.nb = FGF2s,as.character=T)
#for (i in 1:6) {
# AAstat(toupper(FGF2seq[[i]]))
# title(names[i])
#}
\ No newline at end of file
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: Jouni Kettunen
# A data frame with 25 rows and 3 columns
elisa.data <- data.frame(matrix(NA, nrow = 25, ncol = 3))
# name the columns
names(elisa.data)[1] <- "label"
names(elisa.data)[2]<- "OD"
names(elisa.data)[3] <- "conc"
# fill the first column with names
elisa.data$label <- c(rep("ref", 5), rep("treat",10), rep("cont",10))
# a random number for optical density
my_OD <- c(runif(1, min=1, max=100))
# five known values (controls)
# diluted by 10
for (i in 2:5){
my_OD[i] <- my_OD[i-1]/10
}
# attach them to the OD column
elisa.data$OD[1:5] <- my_OD
# fill the rest part of OD with random numbers
elisa.data$OD[6:25] <- runif(20)
# an imaginary concentration
# and diluttions of it by 10
ref_conc <- 150
for (i in 2:5) {
ref_conc[i] <- ref_conc[i-1]/10
}
#attach the values to the reference column
elisa.data$conc[1:5] <- ref_conc
# load DRC library
library(drc)
# create a model for the measurements
elisamodel <- drm(formula = OD ~ conc,
data = elisa.data[1:5,], fct = LL.4())
# a new data frame with values from the original dataset
elisa.data_predicted <- elisa.data[6:25,]
# fill the concentration column with values predicted on the basis of known values
elisa.data_predicted$conc <- predict(elisamodel, data.frame(OD=elisa.data$OD[6:25]))
# create a boxplot to compare the groups
boxplot(conc ~label, data = elisa.data_predicted)
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITHOUT CONSULTING
# STUDENT NAME: Jouni Kettunen
#### TASK1 ####
# get an overview of the Theoph dataset with help
help(Theoph)
# the dose is given in the datasets "Dose" column, only timespan and n are mentioned in text
# "Twelve subjects were given oral doses of theophylline then serum concentrations were measured at 11 time points over the next 25 hours."
#returns the sturcture of dataset and telling it's data.frame and etc.
str(Theoph)
#inspecting the first six values
head(Theoph)
# get a summary of the statistics
summary(Theoph)
#### TASK2 ####
# save the reference to the dataset to a local variable for further manipulation
Theo_data <- Theoph
# At first I only managed to do the plot that was in the help page
# here's what they use in the example of the help page of Theoph (except show.given is FALSE)
coplot(conc ~ Time | Subject, data = Theoph, show.given = TRUE)
# in a way I think this is quite handy, but not that clear by default
# Then I saw that you used the ggplot and decided to have a go
# Load ggplot and dplyr to use piping
library(ggplot2)
library(dplyr)
# pipe the dataframe, group it and color it by the Subjects. All in one graph.
ggplot(Theo_data %>% group_by(Subject),
aes(x = Time, y = conc, color = Subject)) + geom_line() + ggtitle("Teophilin concentrations")
# This is the best!
# For loop is still a mystery, it has been always the most difficult thing to me in all languages
par(mfrow = c(3,4))
# I understand the example
for (i in 1:12){
plot(i*c(1:10))
}
# but how to implement, no idea.
# But during these desperate coding years I've tried to rely on Mr. Google and Stackoverflow.
# And here's what I found out. Not that I truly understand, but still.
# Make canvas in size 3x4
par(mfrow = c(3, 4))
# Make an ordered data frame
Theo_data2 <- Theoph%>% group_by(Subject)
# for x in unique() gives all unique values (levels should work also)
for (cat in unique(Theo_data2$Subject)){
# this subsets the data by selecting the corresponding Subject
d <- subset(Theo_data2, Theo_data2$Subject == cat)
# and this plots the subjects data conc~time manner
plot(d$Time, d$conc, xlim = range(d$Time), ylim = range(d$conc),
# couldn't extract the Subject info from the data frame, so I used the looping variable
# this naturally works only in these 1 to something plots
main = paste("Plot of Subject: ", cat))
}
# return the canvas size
par(mfrow = c(1,1))
#### TASK3 ####
# Three individual dataframes
# dose is constant so no need for max and mean, just pick the values with either or
theoDose <- aggregate(list(Dose = Theo_data$Dose),by=list(Subject = Theo_data$Subject),mean)
# Max and Mean for the concentration
theoMaxC <- aggregate(list(maxC = Theo_data$conc),by=list(Subject = Theo_data$Subject),max)
theomeanC <- aggregate(list(meanC = Theo_data$conc),by=list(Subject = Theo_data$Subject),mean)
### TASK 4 ###
# Fo easier operations, combine the three data frames (only value columns needed from last 2)
doseConcentration <- data.frame(theoDose, maxC = theoMaxC$maxC, meanC = theomeanC$meanC)
# Investigate the relationships of dose vs concentration with ggplot
# plot dose vs. max concentration, gray area (to my understanding) is the SD
ggplot(doseConcentration %>% group_by(Subject)
,(aes(x = maxC, y = Dose))) + geom_point() + geom_smooth(method = "lm")
# Plot dose vs mean concentration. No need to re-pipe, as the frame is already in
ggplot(doseConcentration,(aes(x = meanC, y = Dose))) + geom_point() + geom_smooth(method = "lm")
# Really ugly looking dispersion with several outliers!
# Relationship between max and mean concentration
ggplot(doseConcentration,(aes(x = meanC, y = maxC))) + geom_point() + geom_smooth(method = "lm")
# This is something to be expected and to occur. Nicely plotted around the line.
### Task 5 & 6 ###
# After googling decided to use easier way to combine correlation and plot directly
# load to make some graphs and correlations
library(ggpubr)
# pearson correlation and the graph for the values
ggscatter(doseConcentration, x = "meanC", y = "maxC", add = "reg.line",
conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
xlab = "Concentration mean", ylab = "Concentration maximum")
ggscatter(doseConcentration, x = "meanC", y = "Dose", add = "reg.line",
conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
xlab = "Mean concentration", ylab = "Dose")
# for some reason the colors dont work properly and the graphs are bit archaic
ggscatter(doseConcentration, x = "maxC", y = "Dose", add = "reg.line",
conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson",
xlab = "Max concentration", ylab = "Dose", palette = c("springfield"))
# to sum it up: only meanc and maxC had a statistically significant correlation.
# rest two had a weak positive correlation
# THIS CODE IS MY OWN WORK, IT WAS WRITTEN WITH CONSULTING GOOGLE AND FOLLOWING OTHERS SOLUTIONS
# STUDENT NAME: Jouni Kettunen
# After going into Moodle I found out that this weeks solutions were already there, not nice.
# Keeping my pride, I decided to return this flawed and hard googled work despite having an opportunity to do everything right by following the examples
# It's divided into original part and after part, the latter having solutions done in the class and me implementing snippets I found from internet.
# Set the working directory
setwd("~/BioR21/Week8")
#### TASK 1 #####
# load the igraph for visualization
library(igraph)
# make the yeast.graphml to an igraph object
yeastgraph <- read_graph("yeast.graphml", format = "graphml")
#### TASK 2 #####
# create the Global Efficiency function
global.efficiency <- function(graph) {
# variable n to hold the number of vertices
N <- vcount (graph)
# dij to hold the shortest paths value
dij <- shortest.paths(graph)
#replaces zeros as infinite values
dij[dij == 0] <- Inf
# calculation part
out <- (1/(N*(N-1))) * sum(1/dij)
return(out)
}
global.efficiency(yeastgraph)
#### TASK 3 & 4 (original) #####
# my skills are far far far faaaaaaaar away to do any own implementations
# so here's the thing you advised; "emulation"
dummy.vulnerability <- function () {
return(rnorm(1))
}
dummy.vulnerability()
# I didn't anymore know what to do, so I really got frustrated
# I'm sorry for the hard words but I left them to show my frustration
# I do understand that this is an university level course not a kindergarten, but still
# PLEASE: give more instructions when things are this hard. I really found myself hanging loose and got really frustrated
# This after all is a matter of continuing or not. If this is so hard and no help or no explanations, why to continue why to try at all?
# no idea how to use the function, but I did it like this
V(yeastgraph)$vul <- rnorm(vcount(yeastgraph))
# hopefully values have changed
V(yeastgraph)$vul
#### TASK 5 (original) ####
# is this the plotting function?
plot(yeastgraph)
V(yeastgraph)$color <- "pink"
# i understand that you can do a lot of coloring, but how. Why there wasn't any proper examples?!
#### TASK 3-5 (after) ####
# I feel that the function part is a mindless copy of others solution, I wish you would've explained this more thoroughly
# Task 3
vulnerability <- function(graph) {
# variable E counts the efficiency and holds its value
E <- global.efficiency(graph)
# an empty vector
result <- c()
# start from the first vertex and travel to the last
for(i in 1:vcount(graph)) {
# Atte or Antti had this nice printing option to show where we are in every 100th vertice
if(i %% 100 == 0){
print(paste("prosessing vertex no ", i))
}
# I'm not sure what this does, but it was needed
# are we saving instead of deleting things?
ver.del <- delete.vertices(graph, V(graph)[i])
# count the efficiency for the altered
E.i <- global.efficiency(ver.del)
V <- (E-E.i)/E
result <- c (result,V)
}
return(result)
}
# Task 4
#assign the vulnerability as a node attribute
V(yeastgraph)$vul <- vulnerability(yeastgraph)
# Task 5
# This is what I got after hours of googling and my own try-outs
# load the ready made colorlibrary
library(rcartocolor)
# find appropriate colorscale from colors that suit also to colorblind
display_carto_all(type = "all", colorblind_friendly = T)
# attach the colorscale to a colorRampPalette function that is used in the coloring (found this from Stackoverflow)
rbPal <- colorRampPalette(carto_pal(7, "SunsetDark"))
# use the 7 colors for the nodes
# is 7 enough?
V(yeastgraph)$Col <- rbPal(7)[as.numeric(cut(V(yeastgraph)$vul,breaks = 7))]
# inspect that the colors really are there
unique(V(yeastgraph)$Col)
# trying to make a reasonable graph with different layout options, found this and the latter part from "Network Visualization Cookbook"
coords <- layout_with_fr(yeastgraph, niter = 10000)
coords2 <- layout_with_graphopt(yeastgraph, niter = 10000)
# plot the network with adjusted layout and node options to get a better view
# aspect size asp = 0 to get more area covered, reduce vertex size from default 15 to 6, color edge to white and color vertices by their vulnerability
plot.igraph(yeastgraph, layout = coords, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
plot.igraph(yeastgraph, layout = coords2, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
# Neither of these look good, they are just one messy pile of labels.
# The good thing is that when I used the pseudofunction to create colors for the "vulnerability", the vertix colors actually change accordingly!
# Figured out that tkplot can also be used with same attributes as plot.igraph, bad side is that it seems to take a long time to finish.
# Good side is that it shows the colors even better than plain plot.
tkplot(yeastgraph, layout = coords, asp = 0, vertex.color = V(yeastgraph)$Col, vertex.frame.color = "white", vertex.size = 6, vertex.size2 = NA)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment