#libraries
library(DESeq2)
library(tibble)

##set working directory
setwd("C:/Users/thood/Documents/R/TCGA/DESeq Results EML4 vs NOn-Fused")


#remove any global variables
rm(list=ls())

#read in dataset
#dataset <- as.matrix(read.csv("counts.csv", row.names=1))
colData <- read.csv('clinical_recurrent.csv', row.names = 1)
#colData <- rownames_to_column(colData, var="ID")

dataset <- read.csv("All594_60483_Fused_recurrent_vs_normal.csv", row.names=1)

#dataset <- rownames_to_column(dataset, var="ID")

cleaned<-as.matrix(cleaned)

genes <- read.csv("gene.list.csv")

dataset_pcgenes <- dataset[dataset$dataframe...1. %in% genes$genes,]

dataset_pcgenes_trans <- t(dataset_pcgenes)

colnames(dataset_pcgenes_trans) = dataset_pcgenes_trans[1, ]
dataset_pcgenes_trans = dataset_pcgenes_trans[-1, ]

dataset_pcgenes_trans <- as.data.frame(dataset_pcgenes_trans)

dataset_pcgenes_trans <- rownames_to_column(dataset_pcgenes_trans, var="ID")

cleaned <- dataset_pcgenes_trans[dataset_pcgenes_trans$ID %in% rownames(colData),]

rownames(cleaned) = cleaned[,1]
cleaned = cleaned[,-1]

cleaned <- t(cleaned)

cleaned <- as.data.frame(cleaned)

#subset the data into allergic and healthy
#dataset_normal <- subset(dataset[,(grep(, colnames(dataset)))])
#colData_normal <- subset(colData[(grep("*.11A", rownames(colData))),])



#cleaned <- as.matrix(cleaned)

#cleaned[,] <-round(cleaned[,],0)

#df <- as.data.frame(cleaned)
write.csv(cleaned, "final_data_recurrent_fusions_vs_non.csv")

dataset<- read.csv("final_data_recurrent_fusions_vs_non.csv", row.names = 1)

cleaned <- as.matrix(dataset)

#check that they match
all(rownames(colData) %in% colnames(cleaned))


#put both files in the same order
cleaned <- cleaned[, rownames(colData)]
all(rownames(colData) == colnames(cleaned))

#convert patient to factor
#colData$patient <- factor(colData$patient)

#create your DESeqData set = dds
#dds <- DESeqDataSetFromMatrix(countData = dataset, colData=colData, design = ~ patient + condition)
dds <- DESeqDataSetFromMatrix(countData = cleaned, colData=colData, design = ~condition)
dds

#prefiltering
dds <- dds[rowSums(counts(dds)) > 1, ]

#creating levels for reference comparison
dds$condition <- relevel(dds$condition, ref = "Non.Fused")
dds$condition <- droplevels(dds$condition)


#differential expression analysis
#library(BiocParallel)
dds <- DESeq(dds)
res <- results(dds)

res
summary(res)

#look at adjusted p values
sum(res$padj < .1, na.rm=TRUE)

indices <- which(res$padj <.1, na.omit(res$padj))
genenamesPadJ10 <- rownames(res)[indices]
genenamesPadJ10

#create a csv file
write.csv(genenamesPadJ10, "genenamesPadj10_recurrent_fusions_vs_non.csv")
write.csv(res, "res_all_recurrent_fusions_vs_non.csv")

#look at .05
res05 <-results(dds, alpha = .05)
summary(res05)

sum(res05$padj < 0.05, na.rm=TRUE)

indices05 <- which(res05$padj <.05)
genenamesPadJ05 <- rownames(res05)[indices05]
genenamesPadJ05

#write a csv file
write.csv(genenamesPadJ05, "genenamesPadj05_recurrent_fusions_vs_non.csv")
write.csv(res05, "res05_recurrent_fusions_vs_non.csv")

#look at .01
res01 <-results(dds, alpha = .01)
summary(res01)

sum(res01$padj < 0.01, na.rm=TRUE)

indices01 <- which(res01$padj <.01)
genenamesPadJ01 <- rownames(res01)[indices01]
genenamesPadJ01

write.csv(genenamesPadJ01, "genenamesPadj01_recurrent_fusions_vs_non.csv")
write.csv(res01, "res01_recurrent_fusions_vs_non.csv")

#look at .001
res001 <-results(dds, alpha = .001)
summary(res001)

sum(res001$padj < 0.001, na.rm=TRUE)

indices001 <- which(res001$padj <.001)
genenamesPadJ001 <- rownames(res001)[indices001]
genenamesPadJ001 

#create csv file
write.csv(genenamesPadJ001, "genenamesPadj001_recurrent_fusions_vs_non.csv")
write.csv(res001, "res001_recurrent_fusions_vs_non.csv")


#plot
plotMA(res, main="DESeq2", ylim=c(-8,8))
plotMA(res05, main="DESeq2", ylim=c(-8,8))
plotMA(res001, main="DESeq2", ylim=c(-8,8))

#pull out top adj p value genes by sorting and grabbing top 25
allres001 <- res001[indices001,]
sortbypvalue <- allres001[order(allres001$padj),]
rownames(sortbypvalue[1:25,])

#extracting normCounts
normCount <- counts(dds, normalized=TRUE)

#only get normalized counts for the genes we are interested in 
normCount_all_non_genes_001 <- normCount[rownames(normCount) %in% genenamesPadJ001,]

#write normalized counts
write.csv(normCount, "recurrent_fusions_vs_non_normCount_padj001.csv")
write.csv(normCount_all_non_genes_001, "recurrent_fusions_vs_non_normCount_2219_padj001.csv")


# #testing normalization methods
# raw <- counts(dds, normalized=FALSE)
# raw_df<-as.data.frame(raw)
# sizes_test <- estimateSizeFactorsForMatrix(raw)
# new_norm_test <- raw/do.call(rbind, rep(list(sizes_test), 19403))
# new_norm_test_df<- as.data.frame(new_norm_test)
# 
# tester<- dataset_pcgenes
# rownames(tester) <- tester[,1]
# tester<- tester[,-1]
# 
# #tester_2 <- tester[rowSums(tester) > 2, ]
# 
# tester_2 <- tester[rownames(tester) %in% rownames(normCount_df),]
# 
# tester_2_m <- as.matrix(tester_2)
# sizes_test2 <- estimateSizeFactorsForMatrix(tester_2_m)
# new_norm_test2 <- tester_2_m/do.call(rbind, rep(list(sizes_test2), 19403))
# new_norm_test_df2<- as.data.frame(new_norm_test2)
# 
# compare<- cbind(new_norm_test_df2[,"TCGA.44.2655.Normal"], new_norm_test_df[,"TCGA.44.2655.Normal"])
# 
# #create a normalized count file for all patients
# write.csv(new_norm_test_df2, "normCountsAllPatients.csv")



#getting transformed values from dds
rld<- rlog(dds, blind=FALSE)
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vsd.fast <- vst(dds, blind=FALSE)


#effects of transformation on variance
library(vsn)
notAllZero <- (rowSums(counts(dds))>0)
meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1))
meanSdPlot(assay(rld[notAllZero,]))
meanSdPlot(assay(vsd[notAllZero,]))

#Heatmap of count matrix
library(pheatmap)
library(grid)
#select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
select2 <- order(rowMeans(normCount_all_non_genes_001), decreasing = TRUE)

#subset the original datset for the rowmeans to get the right indices lined up
gn <- rownames(normCount_all_non_genes_001)[select2]

# defaults to log2(x+1)
nt <- normTransform(dds) 
log2.norm.counts <- assay(nt)[indices001,]
#log2.norm.counts <- normCount_114_genes
#log2.norm.counts <- assay(nt)[gn,]
#datf <- as.data.frame(colData(dds)[,c("patient", "condition")])
datf <- as.data.frame(colData(dds)[,"condition", drop=FALSE])
map<-pheatmap(log2.norm.counts, cluster_rows=TRUE, cluster_cols = TRUE, show_rownames=FALSE, annotation_col = datf, fontsize_col =5, main = "Log2 norm Counts")
#pheatmap(assay(rld)[indices001,], cluster_rows=FALSE, show_rownames=FALSE, annotation_col = datf, fontsize_col =3, main = "regularized log transformation")
#pheatmap(assay(vsd)[indices001,], cluster_rows=FALSE, show_rownames=FALSE,annotation_col = datf, fontsize_col =3, main = "variance stabilizing")
log2.norm.counts[map$tree_col$order,]
#map<-pheatmap(log2.norm.counts, cluster_rows=TRUE, cluster_cols = TRUE, show_rownames=TRUE, annotation_col = datf, main = "Log2 norm Counts", width = 45, height=45, filename="test.pdf")