This is the analysis of an RNA sequencing experiment performed by Jack McGinty. Jack sorted tuft cells from the first 5 cm (proximal) and last 5 cm (distal) of the small intestine of unmanipulated B6 mice (n=4 of each). Tuft cells were gated as EPCAM+ CD45low/- CD24+ SigF+. This reproducible and dynamic report was created using Rmarkdown and the Knitr package, and summarizes the basic code and outputs (plots, tables, etc) produced during the analysis. Thank you to Daniel Beiting and DIY.transcriptomics (diytranscriptomics.com) for instruction and code.
A variety of R packages were used for this analysis. All graphics and data wrangling were handled using the tidyverse suite of packages. All packages used are available from the Comprehensive R Archive Network (CRAN), Bioconductor.org, or Github.
Read mapping was performed by the Benaroya Research Institute sequencing core. Base calls were processed to FASTQs on BaseSpace (Illumina), and a base call quality-trimming step was applied to remove low-confidence base calls from the ends of reads. The FASTQs were aligned to the GRCm38 mouse reference genome, using STAR v.2.4.2a and gene counts were generated using htseq-count.
library(rhdf5) #provides functions for handling hdf5 file formats (kallisto outputs bootstraps in this format)
library(tidyverse) # provides access to Hadley Wickham's collection of R packages for data science, which we will use throughout the course
library(tximport) # package for getting Kallisto results into R
library(ensembldb) #helps deal with ensembl
library(EnsDb.Mmusculus.v79) #replace with your organism-specific database package
library(edgeR) # well known package for differential expression analysis, but we only use for the DGEList object and for normalization methods
library(matrixStats) # let's us easily calculate stats on rows or columns of a data matrix
library(cowplot) # allows you to combine multiple plots in one figure
library(DT) # for making interactive tables
library(plotly) # for making interactive plots
library(gt) # A layered 'grammar of tables' - think ggplot, but for tables
library(limma) # venerable package for differential gene expression using linear modeling
library(edgeR)
library(RColorBrewer) #need colors to make heatmaps
library(gplots) #the heatmap2 function in this package is a primary tool for making heatmaps
library(heatmaply) #for making interactive heatmaps using plotly
library(plyr)
library(GSEABase) #functions and methods for Gene Set Enrichment Analysis
library(Biobase) #base functions for bioconductor; required by GSEABase
library(GSVA) #Gene Set Variation Analysis, a non-parametric and unsupervised method for estimating variation of gene set enrichment across samples.
library(gprofiler2) #tools for accessing the GO enrichment results using g:Profiler web resources
library(clusterProfiler) # provides a suite of tools for functional enrichment analysis
library(msigdbr) # access to msigdb collections directly within R
library(enrichplot) # great for making the standard GSEA enrichment plots
library(dplyr)
# read in study design ----
targets <- read_tsv("study_design.txt")
#Want to convert P233-3 rawcounts.csv into a data frame, specifying that row names are in column 1
Counts <- read_csv("rawcounts.csv")
#Want to combine reads from duplicate gene names in order to make “symbol” column into row names
#Cuts rows from 53,806 down to 53,656
Counts <- Counts %>%
group_by(symbol) %>%
dplyr::summarise(across(c('NC1-Prox', 'NC2-Prox', 'NC3-Prox', 'NC4-Prox', 'NC1-Dist', 'NC2-Dist', 'NC3-Dist', 'NC4-Dist'), sum)) %>%
as.data.frame()
#Look for missing values in “symbol” column
which(is.na(Counts$symbol))## [1] 53646
#Missing value is at 53646, which means it’s the last value; get rid of it by taking all values except 53656
Counts <- Counts[-c(53646), ]
#Converts “symbol” column into row names so data frame can be read into a matrix
Counts <- column_to_rownames(Counts, var='symbol')# capture sample labels from the study design file
sampleLabels <- targets$Short_Name
myDGEList <- DGEList(Counts)
log2.cpm <- cpm(myDGEList, log=TRUE)
# 'coerce' data matrix to a dataframe to use tidyverse tools on it
log2.cpm.df <- as_tibble(log2.cpm, rownames = "geneID")
# add sample names to this dataframe (we lost these when we read our data in with tximport)
colnames(log2.cpm.df) <- c("geneID", sampleLabels)
# use the tidy package to 'pivot' dataframe (from wide to long)
log2.cpm.df.pivot <- pivot_longer(log2.cpm.df, # dataframe to be pivoted
cols = Prox_1:Dist_4, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
# save pivoted table to plot later
P1 <-ggplot(log2.cpm.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="unfiltered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
# now set some cut-off to get rid of genes/transcripts with low counts
# again using rowSums to tally up the 'TRUE' results of a simple evaluation
# how many genes had more than 1 CPM (TRUE) in at least 4 samples
# The line below is important! This is where the filtering starts
# Be sure to adjust this cutoff for the number of samples in the smallest group of comparison.
cpm <- cpm(myDGEList)
keepers <- rowSums(cpm>1)>=4
# now use base R's simple subsetting method to filter DGEList based on the logical produced above
myDGEList.filtered <- myDGEList[keepers,]
log2.cpm.filtered <- cpm(myDGEList.filtered, log=TRUE)
log2.cpm.filtered.df <- as_tibble(log2.cpm.filtered, rownames = "geneID")
colnames(log2.cpm.filtered.df) <- c("geneID", sampleLabels)
# pivot this FILTERED data
log2.cpm.filtered.df.pivot <- pivot_longer(log2.cpm.filtered.df, # dataframe to be pivoted
cols = Prox_1:Dist_4, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
#save plot of filtered data
P2 <-ggplot(log2.cpm.filtered.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="filtered, non-normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
# Normalize your data ----
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")
# use the 'cpm' function from EdgeR to get counts per million from your normalized data
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm, rownames = "geneID")
colnames(log2.cpm.filtered.norm.df) <- c("geneID", sampleLabels)
# pivot this NORMALIZED data
log2.cpm.filtered.norm.df.pivot <- pivot_longer(log2.cpm.filtered.norm.df, # dataframe to be pivoted
cols = Prox_1:Dist_4, # column names to be stored as a SINGLE variable
names_to = "samples", # name of that new variable (column)
values_to = "expression") # name of new variable (column) storing all the values (data)
#save plot of normalized data
P3 <-ggplot(log2.cpm.filtered.norm.df.pivot) +
aes(x=samples, y=expression, fill=samples) +
geom_violin(trim = FALSE, show.legend = FALSE) +
stat_summary(fun = "median",
geom = "point",
shape = 95,
size = 10,
color = "black",
show.legend = FALSE) +
labs(y="log2 expression", x = "sample",
title="Log2 Counts per Million (CPM)",
subtitle="filtered, TMM normalized",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
# use the 'plot_grid' function from the cowplot package to put these together in a figure
plot_grid(P1, P2, P3, labels = c('A', 'B', 'C'), label_size = 12)Filtering was carried out to remove lowly expressed genes. Genes with less than 1 count per million (CPM) in at least 4 or more samples filtered out. This reduced the number of genes from 53645 to 13729.
Filtered data were clustered to generate a dendrogram
#hierarchical clustering can only work on a data matrix, not a data frame
distance <- dist(t(log2.cpm.filtered.norm), method = "maximum") #other distance methods are "euclidean", maximum", "manhattan", "canberra", "binary" or "minkowski"
clusters <- hclust(distance, method = "average") #other agglomeration methods are "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", or "centroid"
plot(clusters, labels=sampleLabels)# Identify variables of interest in study design file
group <- targets$Group
group <- factor(group)
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
pc.var<-pca.res$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per<-round(pc.var/sum(pc.var)*100, 1) # we can then use these eigenvalues to calculate the percentage variance explained by each PC
# Visualize your PCA result ------------------
#lets first plot any two PCs against each other
pca.res.df <- as_tibble(pca.res$x)
ggplot(pca.res.df) +
aes(x=PC1, y=PC2, label=sampleLabels, color = group) +
geom_point(size=4) +
#stat_ellipse() +
xlab(paste0("PC1 (",pc.per[1],"%",")")) +
ylab(paste0("PC2 (",pc.per[2],"%",")")) +
labs(title="PCA plot",
caption=paste0("produced on ", Sys.time())) +
coord_fixed() +
theme_bw()The table shown below is interactive. You can sort and search the data directly from the table.
# use dplyr 'mutate' function to add new columns based on existing data
mydata.df <- log2.cpm.filtered.norm.df %>%
mutate(Prox.AVG = (Prox_1 + Prox_2 + Prox_3 + Prox_4)/4,
Dist.AVG = (Dist_1 + Dist_2 + Dist_3 + Dist_4)/4,
#now make columns comparing each of the averages above that you're interested in
LogFC_Dist_v_Prox = (Dist.AVG - Prox.AVG)) %>%
mutate_if(is.numeric, round, 2)
# Make an interactive table using the DT package ----
datatable(mydata.df[,c(1:12)],
extensions = c('KeyTable', "FixedHeader"),
filter = 'top',
options = list(keys = TRUE,
searchHighlight = TRUE,
pageLength = 10,
lengthMenu = c("10", "25", "50", "100")))To identify differentially expressed genes, precision weights were first applied to each gene based on its mean-variance relationship using VOOM, then data was normalized using the TMM method in EdgeR. Linear modeling and bayesian stats were employed via Limma to find genes that were up- or down-regulated by 4-fold or more, with a false-discovery rate (FDR) of 0.01.
# Part 4 essentials: differential gene expression ----
group <- factor(targets$Group)
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = TRUE)fit <- lmFit(v.DEGList.filtered.norm, design)
contrast.matrix <- makeContrasts(Dist_Prox = Distal - Prox,
levels=design)
fits <- contrasts.fit(fit, contrast.matrix)
ebFit <- eBayes(fits)
#set cutoff and summarize results
results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=1)
summary(results)## Dist_Prox
## Down 112
## NotSig 13535
## Up 82
This is an interactive volcano plot for Distal vs. Proximal
#volcano plot Distal vs. Proximal
TopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC")
TopHits.df <- TopHits %>%
as_tibble(rownames = "geneID")
vplot <- ggplot(TopHits.df) +
aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneID)) +
geom_point(size=2) +
geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) +
geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) +
geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) +
#annotate("rect", xmin = 1, xmax = 12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#BE684D") +
#annotate("rect", xmin = -1, xmax = -12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#2C467A") +
labs(title="Volcano plot",
subtitle = "Distal vs. Proximal",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
ggplotly(vplot)This is an annotated static volcano plot
#annotations on for static
vplot <- ggplot(TopHits.df) +
aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneID)) +
geom_point(size=2) +
geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) +
geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) +
geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) +
annotate("rect", xmin = 1, xmax = 10, ymin = -log10(0.01), ymax = 6, alpha=.2, fill="#BE684D") +
annotate("rect", xmin = -1, xmax = -10, ymin = -log10(0.01), ymax = 6, alpha=.2, fill="#2C467A") +
labs(title="Volcano plot",
subtitle = "Distal vs. Proximal",
caption=paste0("produced on ", Sys.time())) +
theme_bw()
plot(vplot)The table below shows all genes that were identified as differentially expressed.
Use the column sort function of the table below to see genes based on these calculated values.
#table of DEGs
colnames(v.DEGList.filtered.norm$E) <- sampleLabels
diffGenes <- v.DEGList.filtered.norm$E[rowSums(abs(results)) !=0,]
diffGenes.df <- as_tibble(diffGenes, rownames = "geneID")
datatable(diffGenes.df,
extensions = c('KeyTable', "FixedHeader"),
caption = 'Table 1: DEGs',
options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
formatRound(columns=c(2:9), digits=2)Pearson correlation was used to cluster 194 differentially expressed genes, which were then represented as heatmap with the data scaled by Zscore for each row.
# modules and pairwise analysis ----
myheatcolors <- rev(brewer.pal(name="RdBu", n=11))
clustRows <- hclust(as.dist(1-cor(t(diffGenes), method="pearson")), method="complete") #cluster rows by pearson correlation
clustColumns <- hclust(as.dist(1-cor(diffGenes, method="spearman")), method="complete")
module.assign <- cutree(clustRows, k=2)
module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9)
module.color <- module.color[as.vector(module.assign)]
heatmap.2(diffGenes,
Rowv=as.dendrogram(clustRows),
Colv=as.dendrogram(clustColumns),
RowSideColors=module.color,
col=myheatcolors, scale='row', labRow=NA,
density.info="none", trace="none",
cexRow=1, cexCol=1, margins=c(8,20))Next, we analyzed each module in the heat map to see if it contained genes in the SI tuft cell signature (generated from data in Nadjsombati, McGinty et al Immunity 2018 PMID: 30021144)
Module 1 - genes down in distal vs. proximal
#genes down in distal vs. prox
modulePick <- 1
myModule_down <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
myModule_down <- as.data.frame(myModule_down)
myModule_down <- rownames_to_column(myModule_down, var = "geneID")
#any of these genes in the SI tuft signature?
tuft <- read_tsv("tuft_DE.txt")
matches <- match_df(myModule_down, tuft, on = "geneID")
matchesModule 2 - genes up in distal vs. proximal. These genes constitute the distal tuft cell “signature” used in later analyses
#genes up in distal vs. prox
modulePick <- 2
myModule_up <- diffGenes[names(module.assign[module.assign %in% modulePick]),]
myModule_up <- as.data.frame(myModule_up)
myModule_up <- rownames_to_column(myModule_up, var = "geneID")
#any of these genes in the SI tuft signature?
matches <- match_df(myModule_up, tuft, on = "geneID")
matchesMost of the differences between proximal and distal tuft cells in the SI are probably differences that are shared by all epithelial cells in these regions. We did not have proximal and distal non-tuft cells as a control to confirm this. Only ~10 tuft cell specific genes are up/down regulated between proximal and distal tuft cells. Ptprc (CD45), Sucnr1 and Gnat3 are the most notable among these. They are all expressed at higher levels in the distal SI.
The output from running ‘sessionInfo’ is shown below and details all packages and version necessary to reproduce the results in this report.
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] enrichplot_1.14.2 msigdbr_7.5.1
## [3] clusterProfiler_4.2.2 gprofiler2_0.2.1
## [5] GSVA_1.42.0 GSEABase_1.56.0
## [7] graph_1.72.0 annotate_1.72.0
## [9] XML_3.99-0.10 plyr_1.8.7
## [11] heatmaply_1.3.0 viridis_0.6.2
## [13] viridisLite_0.4.0 gplots_3.1.3
## [15] RColorBrewer_1.1-3 gt_0.6.0
## [17] plotly_4.10.0 DT_0.24
## [19] cowplot_1.1.1 matrixStats_0.62.0
## [21] edgeR_3.36.0 limma_3.50.3
## [23] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.18.4
## [25] AnnotationFilter_1.18.0 GenomicFeatures_1.46.5
## [27] AnnotationDbi_1.56.2 Biobase_2.54.0
## [29] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [31] IRanges_2.28.0 S4Vectors_0.32.4
## [33] BiocGenerics_0.40.0 tximport_1.22.0
## [35] forcats_0.5.1 stringr_1.4.0
## [37] dplyr_1.0.9 purrr_0.3.4
## [39] readr_2.1.2 tidyr_1.2.0
## [41] tibble_3.1.8 ggplot2_3.3.6
## [43] tidyverse_1.3.2 rhdf5_2.38.1
## [45] knitr_1.39 tinytex_0.40
## [47] rmarkdown_2.14
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.54.0
## [3] bit64_4.0.5 irlba_2.3.5
## [5] DelayedArray_0.20.0 data.table_1.14.2
## [7] KEGGREST_1.34.0 RCurl_1.98-1.8
## [9] generics_0.1.3 ScaledMatrix_1.2.0
## [11] RSQLite_2.2.15 shadowtext_0.1.2
## [13] bit_4.0.4 tzdb_0.3.0
## [15] webshot_0.5.3 xml2_1.3.3
## [17] lubridate_1.8.0 SummarizedExperiment_1.24.0
## [19] assertthat_0.2.1 gargle_1.2.0
## [21] xfun_0.32 hms_1.1.1
## [23] jquerylib_0.1.4 babelgene_22.3
## [25] evaluate_0.16 TSP_1.2-1
## [27] fansi_1.0.3 restfulr_0.0.15
## [29] progress_1.2.2 caTools_1.18.2
## [31] dendextend_1.16.0 dbplyr_2.2.1
## [33] readxl_1.4.0 igraph_1.3.4
## [35] DBI_1.1.3 htmlwidgets_1.5.4
## [37] googledrive_2.0.0 ellipsis_0.3.2
## [39] crosstalk_1.2.0 backports_1.4.1
## [41] biomaRt_2.50.3 sparseMatrixStats_1.6.0
## [43] MatrixGenerics_1.6.0 vctrs_0.4.1
## [45] SingleCellExperiment_1.16.0 cachem_1.0.6
## [47] withr_2.5.0 ggforce_0.3.3
## [49] vroom_1.5.7 GenomicAlignments_1.30.0
## [51] treeio_1.18.1 prettyunits_1.1.1
## [53] DOSE_3.20.1 ape_5.6-2
## [55] lazyeval_0.2.2 crayon_1.5.1
## [57] pkgconfig_2.0.3 labeling_0.4.2
## [59] tweenr_1.0.2 nlme_3.1-159
## [61] ProtGenerics_1.26.0 seriation_1.3.6
## [63] rlang_1.0.4 lifecycle_1.0.1
## [65] downloader_0.4 registry_0.5-1
## [67] filelock_1.0.2 BiocFileCache_2.2.1
## [69] modelr_0.1.8 rsvd_1.0.5
## [71] cellranger_1.1.0 polyclip_1.10-0
## [73] Matrix_1.4-1 aplot_0.1.6
## [75] Rhdf5lib_1.16.0 reprex_2.0.1
## [77] googlesheets4_1.0.1 png_0.1-7
## [79] rjson_0.2.21 bitops_1.0-7
## [81] KernSmooth_2.23-20 rhdf5filters_1.6.0
## [83] Biostrings_2.62.0 blob_1.2.3
## [85] DelayedMatrixStats_1.16.0 qvalue_2.26.0
## [87] gridGraphics_0.5-1 beachmat_2.10.0
## [89] scales_1.2.0 memoise_2.0.1
## [91] magrittr_2.0.3 zlibbioc_1.40.0
## [93] compiler_4.1.2 scatterpie_0.1.7
## [95] BiocIO_1.4.0 Rsamtools_2.10.0
## [97] cli_3.3.0 XVector_0.34.0
## [99] patchwork_1.1.1 MASS_7.3-58.1
## [101] tidyselect_1.1.2 stringi_1.7.8
## [103] highr_0.9 yaml_2.3.5
## [105] GOSemSim_2.20.0 BiocSingular_1.10.0
## [107] locfit_1.5-9.6 ggrepel_0.9.3
## [109] grid_4.1.2 sass_0.4.2
## [111] fastmatch_1.1-3 tools_4.1.2
## [113] parallel_4.1.2 rstudioapi_0.13
## [115] foreach_1.5.2 gridExtra_2.3
## [117] farver_2.1.1 ggraph_2.0.6
## [119] digest_0.6.29 Rcpp_1.0.9
## [121] broom_1.0.0 httr_1.4.3
## [123] colorspace_2.0-3 rvest_1.0.2
## [125] fs_1.5.2 splines_4.1.2
## [127] yulab.utils_0.0.5 tidytree_0.4.0
## [129] graphlayouts_0.8.1 ggplotify_0.1.0
## [131] xtable_1.8-6 jsonlite_1.8.0
## [133] ggtree_3.2.1 tidygraph_1.2.1
## [135] ggfun_0.0.6 R6_2.5.1
## [137] pillar_1.8.0 htmltools_0.5.3
## [139] glue_1.6.2 fastmap_1.1.0
## [141] BiocParallel_1.28.3 codetools_0.2-18
## [143] fgsea_1.20.0 utf8_1.2.2
## [145] lattice_0.20-45 bslib_0.4.0
## [147] curl_4.3.2 gtools_3.9.3
## [149] GO.db_3.14.0 munsell_0.5.0
## [151] DO.db_2.9 GenomeInfoDbData_1.2.7
## [153] iterators_1.0.14 HDF5Array_1.22.1
## [155] haven_2.5.0 reshape2_1.4.4
## [157] gtable_0.3.0