library(WGCNA)
library(DESeq2)
library(genefilter)
library(ensembldb)
library(tidyverse)
library(pheatmap)
library(fastcluster)
library(limma)
library(cluster)
library(factoextra)
library(thematic)

# controls plot themes at high level 
thematic_on(bg = '#222222', fg = 'white', font = font_spec(scale = 2))
# WGCNA or weighted gene coexpression network analysis

# first step is loading the data from your gene counts information
# we give the ensembl gene IDs as the row names 
counts <- read.csv('WGCNA_counts.csv', header = T)

counts.IP <- data.frame(counts) %>%
  column_to_rownames('X')

# load the sample information
samples.IP <- read.csv('WGCNA_samples.csv', header = T, row.names = 2)
samples.IP <- samples.IP[, -1]

# load data with DESeq2 combining the counts information and sample information
dds <- DESeqDataSetFromMatrix(counts.IP, samples.IP, design = ~Group)
dds$Group <- relevel(dds$Group, ref = 'Sal.One')


# getting the length of genes to calculate fold change per kb million (fkpm)
# get length of all genes from previously computed data 
lengths.genes <- read.csv('LengthData.csv', row.names = 1)
length_means <- DataFrame(rowMeans(lengths.genes))
mcols(dds)$basepairs <- length_means$rowMeans.lengths.genes.

# get fpkm data from DESeq2 counts
# transposed to give form for WGCNA
fpkm_matrix <- t(fpkm(dds))

# Calculate sample distance and cluster the samples
# produces a stepwise dendrogram based on euclidean distance 
sampleTree <- hclust(dist(fpkm_matrix), method = "average")

# plot sample tree
par(cex = 1.3);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="",
     cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)


# Choose a set of soft threshold parameters for the analysis
powers = c(c(1:20), seq(from = 22, to=30, by=2))
sft = pickSoftThreshold(fpkm_matrix, powerVector = powers, verbose = 5) 

# Scale-free topology fit index as a function of the soft-thresholding power
# our model suggests we should choose a power = 3 for the topological
# overlap matrix calculations; see sft$powerEstimate
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");

# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red") 

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity")) 
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

# now we can examine an automatic method for generating the topological
# overlap matrix based on k means clustering before hierarchical clustering
cor <- WGCNA::cor
net <- blockwiseModules(fpkm_matrix, power = 10, saveTOMs = TRUE,
                        checkMissingData = TRUE,
                        saveTOMFileBase = 'blockwiseTOM')
cor<- stats::cor

# saving the wgcna results
readr::write_rds(net, file = 'WGCNA_results.rds')

# or load in the wgcna results
net <- readr::read_rds('WGCNA_results.rds')

# plotting the results
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

# design matrix based on the Group of the animals
des_mat <- model.matrix(~samples.IP$Group)

# lmFit() needs a transposed version of the matrix
fit <- limma::lmFit(t(net$MEs), design = des_mat)

# Apply empirical Bayes to smooth standard errors
fit <- limma::eBayes(fit)

# Apply multiple testing correction and obtain stats
# this shows which modules are the most differentially expressed
# due to Group type comparing all groups against the Fent Five
stats_df <- limma::topTable(fit, number = ncol(net$MEs)) %>%
  tibble::rownames_to_column("module")

# look at the brown module more closely
brown_df <- data.frame(net$MEs$MEbrown)
brown_df$Sample.IP <- row.names(net$MEs)
brown_df$Group <- samples.IP$Group

# FentFive group MEs are higher than all other groups
# in the brown module
ggplot(
  brown_df,
  aes(
    x = Group,
    y = net.MEs.MEbrown
  )
) +
  # a boxplot with outlier points hidden (they will be in the sina plot)
  geom_boxplot(width = 0.2, outlier.shape = NA) +
  # A sina plot to show all of the individual data points
  ggforce::geom_sina(maxwidth = 0.3) +
  theme_classic()

# show genes assigned to each ME
gene_module_key <- tibble::enframe(net$colors,
    name = "gene", value = "module") %>%
  dplyr::mutate(module = paste0("ME", module))

# get all the genes in a given module
gene_module_key %>%
  dplyr::filter(module == "MEdarkred")

# merge some modules based on hierarchical clustering
merged <- mergeCloseModules(fpkm_matrix,
                            colors = net$colors,
                            MEs = net$MEs,
                            cutHeight = 0.4)

mergedlabels <- merged$colors
mergedColors <- labels2colors(mergedlabels)
genes2colors <- as.data.frame(mergedlabels) %>%
  rownames_to_column('GENEID')

# plotting wald stats for a given module
dds <- DESeqDataSetFromMatrix(counts.IP, samples.IP, design = ~Group)
keep <- rowSums( counts(dds) >= 5 ) >= 2
dds <- dds[keep,]
dds$Group <- relevel(dds$Group, ref = 'Sal.One')
dds <- DESeq(dds)
res.SalFive_vs_SalOne <- results(dds, 
                                 contrast = c('Group', 'Sal.Five', 'Sal.One'))

res.SalFive_vs_SalOne <- data.frame(res.SalFive_vs_SalOne) %>%
  rownames_to_column("GENEID") %>%
  left_join(genes2colors)

# boxplot
ggplot(data = res.FentFive_vs_FentOne, aes(x = mergedlabels, y = stat, fill = mergedlabels)) +
  geom_boxplot(notch = TRUE, show.legend = FALSE, outlier.shape = NA) +
  scale_fill_manual(values = str_sort(unique((mergedlabels)))) +
  ylab('Wald statistic') +
  ggtitle('Fentanyl Five vs Fentanyl One') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = 3) +
  geom_hline(yintercept = -3)

# violin plot
ggplot(res.SalFive_vs_SalOne, aes(x = mergedlabels, y = stat, fill = mergedlabels)) +
  geom_violin(show.legend = FALSE) +
  geom_hline(yintercept = c(2, -2)) +
  scale_fill_manual(values = str_sort(unique((mergedlabels)))) +
  xlab('Wald statistic') +
  ggtitle('Sal.Five vs Sal.One') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = 3) +
  geom_hline(yintercept = -3)

# fentanyl five vs fentanyl one
dds <- DESeqDataSetFromMatrix(counts.IP, samples.IP, design = ~Group)
dds$Group <- relevel(dds$Group, ref = 'Fent.One')
dds <- DESeq(dds)
res2 <- results(dds, contrast = c('Group', 'Fent.Five', 'Fent.One'))

res2 <- data.frame(res2) %>%
  rownames_to_column("GENEID")
res2$moduleColor <- mergedColors
res2$moduleColor <- as.factor(res$moduleColor)

ggplot(data = res2, aes(x = moduleColor, y = stat, fill = moduleColor)) +
  geom_boxplot(notch = TRUE, show.legend = FALSE, outlier.shape = NA) +
  scale_fill_manual(values = str_sort(unique((mergedColors)))) +
  ylab('Wald statistic') +
  ggtitle('Fent.Five vs Fent.One') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
# getting the adjacency matrix from the data
# DO NOT PERFORM THE METHODS BELOW UNLESS YOU HAVE A COMPUTER
# WITH AT LEAST 32 Gb RAM
cor <- WGCNA::cor
adj <- adjacency(fpkm_matrix, power = 10, type = 'signed')

# Turn adjacency into topological overlap matrix
TOM = TOMsimilarity(adj, TOMType = "signed")
dissTOM = 1-TOM

#hierichical clustering using the agglomerative nesting algorithm
# methods to assess similarity of samples like euclidean distance
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

# function to compute agglomerative coefficient
ac <- function(x) {
  agnes(fpkm_matrix, method = x)$ac
}

# Agglomerative coefficient of each agglomeration method
map_dbl(m, ac)

# plotting the output of the clustering using Ward 
# as the linkage method
res.agnes <- agnes(x = fpkm_matrix, # data matrix
                   stand = TRUE, # Standardize the data
                   metric = "euclidean", # metric for distance matrix
                   method = "ward" # Linkage method
)

fviz_dend(res.agnes, k = 4)

#assign gene names from adjacency matrix to dissTOM
rownames(dissTOM) = rownames(adj) 
colnames(dissTOM) = colnames(adj) 

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "ward.D2")

#We set the minimum module size at 10:
minModuleSize = 10;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            minClusterSize = minModuleSize);
table(dynamicMods)

# Convert numeric lables into colors
moduleColors = labels2colors(dynamicMods)
table(moduleColors)


# Plot the dendrogram and colors underneath
sizeGrWindow(5,6)
plotDendroAndColors(geneTree, moduleColors, "Module Colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")


#Define numbers of genes and samples
nGenes = ncol(fpkm_matrix)
nSamples = nrow(fpkm_matrix)

# Recalculate MEs with color labels
# correlations for each gene
MEs0 = moduleEigengenes(fpkm_matrix, mergedColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, samples.IP)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# heat map of module eigen-genes and samples
pheatmap(MEs, cluster_col=T, cluster_row=T, show_rownames=F, show_colnames=T,
         fontsize=6)

# Heatmap of module eigen-genes and groups
col_ann <- samples.IP[, c(2, 7)]
rownames(col_ann) <- rownames(samples.IP)
col_ann <- data.frame(col_ann)
col_ann$Group <- as.factor(col_ann$Group)
col_ann <- col_ann[order(col_ann$Group),]
col_ann$Sex <- as.factor(col_ann$Sex)
head(col_ann)
ann_color <- list("col_ann" = c("Fent.Five" = "red",
                                "Fent.One" = "tan2",
                                "Sal.Five" = "cyan4",
                                "Sal.One" = "lightblue"))

data <- data.frame(MEs)
data <- data[order(match(rownames(MEs), rownames(col_ann))),]
dim(MEs)

#pdf(file="newMEs.pdf",heigh=60,width=20)

pheatmap(data ,cluster_col=T, cluster_row=F, show_rownames=F,
         show_colnames=T, fontsize=6,
         annotation_row = col_ann, annotation_colors = ann_color)

#example of intramodular analysis
# identifying genes with high GS and MM
module <- "blue"
column <- match(module, modNames)
moduleGenes <- moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
x <- abs(geneModuleMembership[moduleGenes, column])
y <- abs(geneTraitSignificance[moduleGenes, 1])
limit <- range(c(x,y)) 
r <- round(cor(x,y),2)

plot(    x, 
         y,
         xlab =paste("Module Membership in", module, "module"), 
         ylab ="Gene significance for stage", col = module)
fit <-lm(y~x)
pintra <- summary(fit)$coefficients[,4][[2]] #p-value
abline(fit, col='orange')
legend('topleft', col = "orange", lty = 1, box.lty = 1 ,legend = 'regression line', cex= 0.8) 
mtext(paste('correlation = ', r, ",", "P-value = ", pintra))
title(paste("Module membership vs. gene significance\n"))
text(y~x, labels=rownames(exp_dri_norm)[moduleColors=="blue"],data=exp_dri_norm, cex=0.8, font=4, pos = 2)


# identifying genes with high GS and MM
# values for each gene
intra_modular_analysis <- data.frame(abs(geneModuleMembership[moduleGenes, column]),
                                     abs(geneTraitSignificance[moduleGenes, 1]))
rownames(intra_modular_analysis) = colnames(fpkm_matrix)[moduleColors=="blue"]
View(intra_modular_analysis)

# high intramodular connectivity ~ high kwithin => hub genes
# kwithin: connectivity of the each driver gene in the blue module 
# to all other genes in entire set of genes
connectivity <- intramodularConnectivity(adj, moduleColors)

# for the blue module specifically get the high connectivity genes
connectivity <- connectivity[colnames(adj)[moduleColors=="blue"],] 
order.kWithin <- order(connectivity$kWithin, decreasing = TRUE)
connectivity <- connectivity[order.kWithin,] #order rows following kWithin

#top 20 genes that have a high connectivity to other genes in the turquoise module
connectivity <- connectivity[1:20,] 
View(connectivity)

# correlate module eigengenes with Group
module_eigengenes <- moduleEigengenes(fpkm_matrix, mergedColors)

Group.out <- binarizeCategoricalColumns(samples.IP$Group,
                                        includePairwise = TRUE,
                                        includeLevelVsAll = FALSE,
                                        minCount = 1)
row.names(Group.out) <- row.names(samples.IP)

module.Groups.cor <- cor(module_eigengenes$eigengenes, Group.out, use = 'p')
module.Groups.cor.pvals <- corPvalueStudent(module.Groups.cor, nSamples)

# plotting a heat map
heatmap.data <- merge(module_eigengenes$eigengenes, Group.out, by = 'row.names')

CorLevelPlot(heatmap.data, x = names(heatmap.data)[35:40],
             y = names(heatmap.data)[2:34],
             col = c('blue1', 'skyblue', 'white', 'pink', 'red'))
             

# intramodular analysis to identify hub/driver genes
# calculate the module membership and associcated pvalues first

module.membership <- cor(module_eigengenes$eigengenes, fpkm_matrix, use = 'p')
module.membership.pvals <- corPvalueStudent(module.membership, nSamples)

# calculate gene significance and associated pvals
gene.signif.corr <- cor(fpkm_matrix, Group.out$data.Fent.Five.vs.all, use = 'p')
gene.signif.corr.pvals <- corPvalueStudent(gene.signif.corr, nSamples)

# dark red module hubs
darkred_hubs <- module.membership.pvals[9, ]
names(darkred_hubs) <- colnames(module.membership.pvals)
darkred_hubs <- sort(darkred_hubs)[1:100]

darkred_hubs <- as.data.frame(darkred_hubs) %>%
  rownames_to_column('GENEID')
left_join(darkred_genes, darkred_hubs)

# plotting the stupidly large dissimilarity matrix

dissTOM <- 1 - TOMsimilarityFromExpr(fpkm_matrix[, goodGenes(fpkm_matrix)],
                                     power = 10)

genetree <- hclust(as.dist(dissTOM), method = 'average')

# warning this takes forever to plot
TOMplot(dissTOM, dendro = genetree)

# determining which modules are significantly different based on wald stat
# in fent five vs fent one samples

fentfive.vs.fentone.model <- glm(stat ~ 0 + mergedlabels,
                                family = gaussian, res.FentFive_vs_FentOne)
summary(fentfive.vs.fentone.model)

model2 <- lm(stat ~ 0 + mergedlabels, data = res.FentFive_vs_FentOne)
nbin