0.1 Introduction

This is the analysis of an RNA sequencing experiment performed by Jakob von Moltke and Jack McGinty. Jack sorted tuft cells from the entire small intestine of unmanipulated Flare25+/- Chat-GFP+/- mice (n=4 of each). Tuft cells were gated as EPCAM+ CD45low/- Flare25+ and then Chat-GFP positive and negative cells were sorted. This reproducible and dynamic report was created using Rmarkdown and the Knitr package, and summarizes the basic code and outputs (plots, tables, etc) produced during the analysis. Thank you to Daniel Beiting and DIY.transcriptomics (diytranscriptomics.com) for instruction and code.


0.2 R packages used

A variety of R packages were used for this analysis. All graphics and data wrangling were handled using the tidyverse suite of packages. All packages used are available from the Comprehensive R Archive Network (CRAN), Bioconductor.org, or Github.


0.3 Read mapping

Read mapping was performed by the Benaroya Research Institute sequencing core. Base calls were processed to FASTQs on BaseSpace (Illumina), and a base call quality-trimming step was applied to remove low-confidence base calls from the ends of reads. The FASTQs were aligned to the GRCm38 mouse reference genome, using STAR v.2.4.2a and gene counts were generated using htseq-count.

0.3.1 Importing count data into R

library(tidyverse) # data wrangling and all ggplot2 graphics (attaches dplyr, tidyr, readr, etc.)
library(edgeR) # well known package for differential expression analysis, but we only use for the DGEList object and for normalization methods
library(matrixStats) # lets us easily calculate stats on rows or columns of a data matrix
library(cowplot) # plot_grid() for combining multiple plots in one figure
library(dplyr)
library(limma) # venerable package for differential gene expression using linear modeling
library(gt) # A layered 'grammar of tables' - think ggplot, but for tables
library(plotly) # interactive graphics (ggplotly for the volcano plot)
library(RColorBrewer) #need colors to make heatmaps
library(heatmaply)
library(gplots) #the heatmap.2 function in this package is a primary tool for making heatmaps
library(plyr) # match_df(); NOTE: loaded after dplyr, so masked verbs must be
              # called with an explicit namespace (e.g. dplyr::summarise)
library(DT) # for making interactive tables
library(clusterProfiler) # GSEA
library(enrichplot) # gseaplot2 enrichment plots
library(GSEABase)


# read in your study design
targets <- read_tsv("studydesign.txt")
sampleLabels <- targets$sample

#BRI core provided mapped reads, which we need to import for analysis

# Read the P233-3 rawcounts.csv into a data frame (gene symbols in the "symbol" column)
Counts <- read.delim("rawcounts.csv", sep = ',', header = TRUE)

# Combine reads from duplicate gene names so that "symbol" can become row names
# Cuts rows from 53,806 down to 53,656
Counts <- Counts %>%
  group_by(symbol) %>%
  dplyr::summarise(across(where(is.numeric), sum)) %>% # sum counts across every sample column
  as.data.frame()

#Look for missing values in "symbol" column
which(is.na(Counts$symbol))
## [1] 53656
# Drop any row with a missing gene symbol. Filtering on is.na() is robust to the
# row's position, unlike hard-coding the index of the NA row.
Counts <- Counts[!is.na(Counts$symbol), ]

#Converts "symbol" column into row names so data frame can be read into a matrix
Counts <- column_to_rownames(Counts, var = "symbol")

0.4 Preprocessing

0.4.1 Impact of filtering and normalization

# Build a DGEList from the raw counts and compute (log2) counts per million ----
myDGEList <- DGEList(Counts)
cpm <- cpm(myDGEList)
log2.cpm <- cpm(myDGEList, log=TRUE)


# Coerce the log2 CPM matrix to a tibble so tidyverse tools can be used on it
log2.cpm.df <- as_tibble(log2.cpm, rownames = "symbol")
# Restore the sample names (column names were lost when the data were read in)
colnames(log2.cpm.df) <- c("symbol", sampleLabels)
# Reshape from wide to long: one row per gene/sample combination
log2.cpm.df.pivot <- log2.cpm.df %>%
  pivot_longer(cols = -symbol,              # every column except the gene symbol
               names_to = "samples",        # new column holding the sample names
               values_to = "expression")    # new column holding the log2 CPM values


# Violin plot of the unfiltered, non-normalized log2 CPM distributions
p1 <- ggplot(log2.cpm.df.pivot,
             aes(x = samples, y = expression, fill = samples)) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median",
               geom = "point",
               shape = 95,
               size = 10,
               color = "black",
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="unfiltered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()


# Filter your data ----
# The line below is important! This is where the filtering starts
# Be sure to adjust this cutoff for the number of samples in the smallest group of comparison.
# Keep genes with >1 CPM in at least 4 samples (4 = size of each group here)
keepers <- rowSums(cpm>1)>=4
# now use base R's simple subsetting method to filter your DGEList based on the logical produced above
myDGEList.filtered <- myDGEList[keepers,]
dim(myDGEList.filtered)
## [1] 13746     8
# Recompute log2 CPM on the filtered genes and rebuild the tibble, as before
log2.cpm.filtered <- cpm(myDGEList.filtered, log=TRUE)
log2.cpm.filtered.df <- as_tibble(log2.cpm.filtered, rownames = "symbol")
colnames(log2.cpm.filtered.df) <- c("symbol", sampleLabels)
# pivot this FILTERED data, just as you did earlier
log2.cpm.filtered.df.pivot <- pivot_longer(log2.cpm.filtered.df, # dataframe to be pivoted
                                           cols = Chat_n1:Chat_p4, # column names to be stored as a SINGLE variable
                                           names_to = "samples", # name of that new variable (column)
                                           values_to = "expression") # name of new variable (column) storing all the values (data)


# Violin plot of the filtered (but not yet normalized) log2 CPM distributions
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
  aes(x=samples, y=expression, fill=samples) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 95, 
               size = 10, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="filtered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()

# Normalize your data ----
# TMM scaling factors are stored in the DGEList and applied by cpm() below
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")

# use the 'cpm' function from edgeR to get counts per million from the normalized data
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm, rownames = "symbol")
colnames(log2.cpm.filtered.norm.df) <- c("symbol", sampleLabels)
# Reshape the NORMALIZED data from wide to long, just as before
log2.cpm.filtered.norm.df.pivot <- log2.cpm.filtered.norm.df %>%
  pivot_longer(cols = -symbol,              # every column except the gene symbol
               names_to = "samples",        # new column holding the sample names
               values_to = "expression")    # new column holding the log2 CPM values


# Violin plot of the filtered, TMM-normalized log2 CPM distributions
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot,
             aes(x = samples, y = expression, fill = samples)) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median",
               geom = "point",
               shape = 95,
               size = 10,
               color = "black",
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="filtered, TMM normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()


# Combine the three violin plots into a single figure with cowplot
plot_grid(p1, p2, p3, labels = c('A', 'B', 'C'), label_size = 12)

Filtering was carried out to remove lowly expressed genes. Genes that did not have more than 1 count per million (CPM) in at least 4 samples were filtered out. This reduced the number of genes from 53,655 to 13,746.


0.5 Dendrogram

Filtered data were clustered to generate a dendrogram

# Hierarchical clustering works on a data matrix, not a data frame; samples are
# clustered on the transposed (sample x gene) log2 CPM matrix.
# Other distance methods: "euclidean", "manhattan", "canberra", "binary", "minkowski"
sample.dist <- dist(t(log2.cpm.filtered.norm), method = "maximum")
# Other agglomeration methods: "ward.D", "ward.D2", "single", "complete", "mcquitty", "median", "centroid"
sample.clusters <- hclust(sample.dist, method = "average")
plot(sample.clusters, labels = sampleLabels)

0.6 PCA plot of filtered and normalized data

# Identify variables of interest in study design file
group <- factor(targets$group)

# PCA on the transposed matrix (samples as rows); data left unscaled
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
pc.var <- pca.res$sdev^2                        # eigenvalues: variance captured by each PC
pc.per <- round(pc.var / sum(pc.var) * 100, 1)  # percent of total variance explained per PC

# Visualize your PCA result ------------------
# Plot the first two PCs, labeled by sample and colored by group
pca.res.df <- as_tibble(pca.res$x)
ggplot(pca.res.df,
       aes(x = PC1, y = PC2, label = sampleLabels, color = group)) +
  geom_point(size = 2) +
  geom_label() +
  #stat_ellipse() +
  xlab(paste0("PC1 (",pc.per[1],"%",")")) +
  ylab(paste0("PC2 (",pc.per[2],"%",")")) +
  labs(title="PCA plot",
       caption=paste0("produced on ", Sys.time())) +
  #coord_fixed() + #keeps coordinates locked so that scaling plot differently doesn't change scale
  theme_bw()

At this point we assessed the libraries and PCA and noticed that Chat_n1 had an unusual distribution of reads and did not cluster with Chat_n2–n4 on the PCA. Indeed, Chat_n1 accounted for a huge amount of the variation in the PCA. For these reasons we considered Chat_n1 an outlier and decided to drop it and reprocess the remaining libraries.

0.7 Remove Chat_n1

# Remove the outlier sample Chat_n1 (count column X1_Chat_n) from the Counts
# data frame. Dropping it by name is more robust than a positional column range.
Counts <- dplyr::select(Counts, -X1_Chat_n)

# remove Chat_n1 from the study design and sample labels object,
# filtering by sample name rather than by row position (and avoiding the
# ineffective drop = TRUE on a tibble row subset)
targets <- read_tsv("studydesign.txt")
targets <- targets[targets$sample != "Chat_n1", ]
sampleLabels <- targets$sample

0.8 Repeat data wrangling

# Make a DGElist from your counts, and plot ----
# Rebuild the DGEList from the counts with the outlier sample removed (7 samples now)
myDGEList <- DGEList(Counts)
cpm <- cpm(myDGEList) 
log2.cpm <- cpm(myDGEList, log=TRUE)

# 'coerce' your data matrix to a dataframe so that you can use tidyverse tools on it
log2.cpm.df <- as_tibble(log2.cpm, rownames = "symbol")
# add your sample names to this dataframe (column names were lost when the data were read in)
colnames(log2.cpm.df) <- c("symbol", sampleLabels) #creates character vector of "symbol" and sample labels
# use the tidy package to 'pivot' your dataframe (from wide to long)
log2.cpm.df.pivot <- pivot_longer(log2.cpm.df, # dataframe to be pivoted
                                  cols = Chat_n2:Chat_p4, # column names to be stored as a SINGLE variable
                                  names_to = "samples", # name of that new variable (column)
                                  values_to = "expression") # name of new variable (column) storing all the values (data)


# Violin plot of the unfiltered, non-normalized log2 CPM distributions (7 samples)
p1 <- ggplot(log2.cpm.df.pivot) +
  aes(x=samples, y=expression, fill=samples) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 95, 
               size = 10, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="unfiltered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()


# Filter your data ----
# The line below is important! This is where the filtering starts
# Be sure to adjust this cutoff for the number of samples in the smallest group of comparison.
# After dropping Chat_n1 the smallest group (Chat-) has 3 samples, so the
# cutoff is lowered to 3: keep genes with >1 CPM in at least 3 samples
keepers <- rowSums(cpm>1)>=3
# now use base R's simple subsetting method to filter your DGEList based on the logical produced above
myDGEList.filtered <- myDGEList[keepers,]
dim(myDGEList.filtered)
## [1] 14789     7
# Recompute log2 CPM on the filtered genes and rebuild the tibble, as before
log2.cpm.filtered <- cpm(myDGEList.filtered, log=TRUE)
log2.cpm.filtered.df <- as_tibble(log2.cpm.filtered, rownames = "symbol")
colnames(log2.cpm.filtered.df) <- c("symbol", sampleLabels)
# pivot this FILTERED data, just as you did earlier
log2.cpm.filtered.df.pivot <- pivot_longer(log2.cpm.filtered.df, # dataframe to be pivoted
                                           cols = Chat_n2:Chat_p4, # column names to be stored as a SINGLE variable
                                           names_to = "samples", # name of that new variable (column)
                                           values_to = "expression") # name of new variable (column) storing all the values (data)


# Violin plot of the filtered (but not yet normalized) log2 CPM distributions
p2 <- ggplot(log2.cpm.filtered.df.pivot) +
  aes(x=samples, y=expression, fill=samples) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 95, 
               size = 10, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="filtered, non-normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()

# Normalize your data ----
# TMM scaling factors are stored in the DGEList and applied by cpm() below
myDGEList.filtered.norm <- calcNormFactors(myDGEList.filtered, method = "TMM")
# take a look at this new DGEList object...how has it changed?

# use the 'cpm' function from EdgeR to get counts per million from your normalized data
log2.cpm.filtered.norm <- cpm(myDGEList.filtered.norm, log=TRUE)
log2.cpm.filtered.norm.df <- as_tibble(log2.cpm.filtered.norm, rownames = "symbol")
colnames(log2.cpm.filtered.norm.df) <- c("symbol", sampleLabels)
# pivot this NORMALIZED data, just as you did earlier
log2.cpm.filtered.norm.df.pivot <- pivot_longer(log2.cpm.filtered.norm.df, # dataframe to be pivoted
                                                cols = Chat_n2:Chat_p4, # column names to be stored as a SINGLE variable
                                                names_to = "samples", # name of that new variable (column)
                                                values_to = "expression") # name of new variable (column) storing all the values (data)


# Violin plot of the filtered, TMM-normalized log2 CPM distributions
p3 <- ggplot(log2.cpm.filtered.norm.df.pivot) +
  aes(x=samples, y=expression, fill=samples) +
  geom_violin(trim = FALSE, show.legend = FALSE) +
  stat_summary(fun = "median", 
               geom = "point", 
               shape = 95, 
               size = 10, 
               color = "black", 
               show.legend = FALSE) +
  labs(y="log2 expression", x = "sample",
       title="Log2 Counts per Million (CPM)",
       subtitle="filtered, TMM normalized",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()

# what if we wanted to put all three violin plots together?
# go back and assign each plot to a variable (rather than printing to the plots viewer)
# here we assigned the last 3 plots to p1, p2 and p3
# we'll use the 'plot_grid' function from the cowplot package to put these together in a figure
plot_grid(p1, p2, p3, labels = c('A', 'B', 'C'), label_size = 12)

Filtering was carried out to remove lowly expressed genes. Because the smallest group (Chat-) now contains only 3 samples, the cutoff was lowered: genes that did not have more than 1 count per million (CPM) in at least 3 samples were filtered out. This reduced the number of genes from 53,655 to 14,789.


0.9 Dendrogram

Filtered data were clustered to generate a dendrogram

# Identify variables of interest in study design file ----
# Re-derive the group factor from the study design now that Chat_n1 is removed
group <- targets$group
group <- factor(group)

#hierarchical clustering can only work on a data matrix, not a data frame
#try using filtered and unfiltered data...how does this change the result?
#try other distance methods (e.g. switch from 'maximum' to 'euclidean')...how does this change the result?
distance <- dist(t(log2.cpm.filtered.norm), method = "maximum") #other distance methods are "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"
clusters <- hclust(distance, method = "average") #other agglomeration methods are "ward.D", "ward.D2", "single", "complete", "mcquitty", "median", or "centroid"
plot(clusters, labels=sampleLabels)

0.10 PCA plot of filtered and normalized data

# PCA repeated on the 7 remaining samples after dropping Chat_n1
pca.res <- prcomp(t(log2.cpm.filtered.norm), scale.=F, retx=T)
pc.var<-pca.res$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per<-round(pc.var/sum(pc.var)*100, 1) # we can then use these eigenvalues to calculate the percentage variance explained by each PC

# Visualize your PCA result ------------------
#lets first plot any two PCs against each other
#We know how much each sample contributes to each PC (loadings), so let's plot
pca.res.df <- as_tibble(pca.res$x)
ggplot(pca.res.df) +
  aes(x=PC1, y=PC2, label=sampleLabels, color=group) +
  geom_point(size=2) +
  geom_label() +
  #stat_ellipse() +
  xlab(paste0("PC1 (",pc.per[1],"%",")")) + 
  ylab(paste0("PC2 (",pc.per[2],"%",")")) +
  labs(title="PCA plot",
       caption=paste0("produced on ", Sys.time())) +
  #coord_fixed() + #keeps coordinates locked so that scaling plot differently doesn't change scale
  theme_bw()

0.10.1 Table of filtered and normalized data

The table shown below is interactive. You can sort and search the data directly from the table.

# Use dplyr 'verbs' to modify our dataframe ----
# Compute per-group average log2 CPM and the log fold change (Chat+ minus Chat-)
mydata.df <- log2.cpm.filtered.norm.df %>% 
  mutate(negative.AVG = (Chat_n2 + Chat_n3 + Chat_n4)/3,
         positive.AVG = (Chat_p1 + Chat_p2 + Chat_p3 + Chat_p4)/4,
         # difference of log2 averages = log2 fold change
         LogFC = (positive.AVG - negative.AVG)) %>% 
  mutate(across(where(is.numeric), ~ round(.x, 2))) # across() replaces the superseded mutate_if


# Make an interactive table using the DT package ----
# Select columns by name rather than position so the table is robust to
# any change in column order: gene symbol plus the three derived columns
datatable(mydata.df[, c("symbol", "negative.AVG", "positive.AVG", "LogFC")], 
          extensions = c('KeyTable', "FixedHeader"), 
          filter = 'top',
          options = list(keys = TRUE, 
                         searchHighlight = TRUE, 
                         pageLength = 10, 
                         lengthMenu = c("10", "25", "50", "100")))


0.11 Volcano plots

To identify differentially expressed genes, precision weights were first applied to each gene based on its mean-variance relationship using VOOM, then data was normalized using the TMM method in EdgeR. Linear modeling and Bayesian stats were employed via Limma to find genes that were up- or down-regulated by 2-fold or more (|log2FC| ≥ 1), with a false-discovery rate (FDR) of 0.01.

# Part 4 essentials: differential gene expression ----

# Set up your design matrix ----
# ~0 + group omits the intercept, giving one column per group level
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)


# Model mean-variance trend and fit linear model to data ----
# Use VOOM function from Limma package to model the mean-variance relationship
# Input is the filtered, TMM-normalized DGEList built above
v.DEGList.filtered.norm <- voom(myDGEList.filtered.norm, design, plot = TRUE)

# fit a linear model to your data
fit <- lmFit(v.DEGList.filtered.norm, design)

# Contrast matrix ----
# Single contrast: Chat+ minus Chat- (positive logFC = up in Chat+)
contrast.matrix <- makeContrasts(Chat = Chat_pos - Chat_neg,
                                 levels=design)

# extract the linear model fit -----
fits <- contrasts.fit(fit, contrast.matrix)
#get Bayesian stats (moderated t-statistics) for your linear model fit
ebFit <- eBayes(fits)

#set cutoff and summarize results
# BH-adjusted p < 0.01 AND |log2FC| >= 1 (i.e. 2-fold or more)
# NOTE(review): limma docs discourage combining an lfc cutoff with eBayes
# p-values; treat() is the recommended alternative -- confirm intent
results <- decideTests(ebFit, method="global", adjust.method="BH", p.value=0.01, lfc=1)
summary(results)
##         Chat
## Down      57
## NotSig 14684
## Up        48

This is an interactive volcano plot for Chat+ vs Chat-

#volcano plot Chat+ vs Chat-
# number=40000 exceeds the 14,789 filtered genes, so topTable returns ALL genes
TopHits <- topTable(ebFit, adjust ="BH", coef=1, number=40000, sort.by="logFC")
TopHits.df <- TopHits %>%
  as_tibble(rownames = "geneID")

# The 'text' aesthetic supplies the hover label when rendered with ggplotly below
vplot <- ggplot(TopHits.df) +
  aes(y=-log10(adj.P.Val), x=logFC, text = paste("Symbol:", geneID)) +
  geom_point(size=2) +
  # dashed thresholds: FDR = 0.01 (horizontal) and +/- 1 log2FC (vertical)
  geom_hline(yintercept = -log10(0.01), linetype="longdash", colour="grey", size=1) +
  geom_vline(xintercept = 1, linetype="longdash", colour="#BE684D", size=1) +
  geom_vline(xintercept = -1, linetype="longdash", colour="#2C467A", size=1) +
  #annotate("rect", xmin = 1, xmax = 12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#BE684D") +
  #annotate("rect", xmin = -1, xmax = -12, ymin = -log10(0.01), ymax = 7.5, alpha=.2, fill="#2C467A") +
  labs(title="Volcano plot",
       subtitle = "Chat+ vs Chat-",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()

# render as an interactive plotly widget
ggplotly(vplot)

This is an annotated static volcano plot

# Static volcano plot with shaded significance regions (Chat+ vs. Chat-)
fdr.line <- -log10(0.01)  # FDR threshold on the -log10 scale
vplot <- ggplot(TopHits.df,
                aes(y = -log10(adj.P.Val), x = logFC,
                    text = paste("Symbol:", geneID))) +
  geom_point(size = 2) +
  # dashed thresholds: FDR (horizontal) and +/- 1 log2FC (vertical)
  geom_hline(yintercept = fdr.line, linetype = "longdash", colour = "grey", size = 1) +
  geom_vline(xintercept = 1, linetype = "longdash", colour = "#BE684D", size = 1) +
  geom_vline(xintercept = -1, linetype = "longdash", colour = "#2C467A", size = 1) +
  # shade the significant up- (right) and down-regulated (left) regions
  annotate("rect", xmin = 1, xmax = 5, ymin = fdr.line, ymax = 5, alpha = .2, fill = "#BE684D") +
  annotate("rect", xmin = -1, xmax = -5, ymin = fdr.line, ymax = 5, alpha = .2, fill = "#2C467A") +
  labs(title="Volcano plot",
       subtitle = "Chat+ vs. Chat-",
       caption=paste0("produced on ", Sys.time())) +
  theme_bw()

plot(vplot)


0.12 Table of DEGs

The table below shows all genes that were identified as differentially expressed.

Use the column sort function of the table below to see genes based on these calculated values.

#table of DEGs
# apply the sample labels to the voom expression matrix for display
colnames(v.DEGList.filtered.norm$E) <- sampleLabels
# keep genes flagged up (+1) or down (-1) in any contrast by decideTests
diffGenes <- v.DEGList.filtered.norm$E[rowSums(abs(results)) !=0,]
diffGenes.df <- as_tibble(diffGenes, rownames = "geneID")
datatable(diffGenes.df, 
          extensions = c('KeyTable', "FixedHeader"), 
          caption = 'Table 1: DEGs',
          options = list(keys = TRUE, searchHighlight = TRUE, pageLength = 10, lengthMenu = c("10", "25", "50", "100"))) %>%
  formatRound(columns=c(2:8), digits=2) # columns 2-8 are the 7 sample expression columns

0.13 Heatmaps and modules

Pearson correlation was used to cluster the 105 differentially expressed genes, which were then represented as a heatmap with the data scaled by Z-score for each row.

# modules and pairwise analysis ----
myheatcolors <- rev(brewer.pal(name="RdBu", n=11)) # reversed so red = high, blue = low
# 1 - correlation as a distance: co-expressed genes / similar samples cluster together
clustRows <- hclust(as.dist(1-cor(t(diffGenes), method="pearson")), method="complete") #cluster rows (genes) by pearson correlation
clustColumns <- hclust(as.dist(1-cor(diffGenes, method="spearman")), method="complete") #cluster columns (samples) by spearman correlation
module.assign <- cutree(clustRows, k=2) # cut the gene dendrogram into 2 modules
module.color <- rainbow(length(unique(module.assign)), start=0.1, end=0.9) 
module.color <- module.color[as.vector(module.assign)] # one color per gene, by module
heatmap.2(diffGenes, 
          Rowv=as.dendrogram(clustRows), 
          Colv=as.dendrogram(clustColumns),
          RowSideColors=module.color,
          col=myheatcolors, scale='row', labRow=NA, # scale='row' = Z-score each gene
          density.info="none", trace="none",  
          cexRow=1, cexCol=1, margins=c(8,20))

Next, we analyzed each module in the heat map to see if it contained genes in the SI tuft cell signature (generated from data in Nadjsombati, McGinty et al Immunity 2018 PMID: 30021144)

Module 1 - genes down in Chat+ vs. Chat-

#genes down in Chat+ vs Chat-
# NOTE(review): modulePick 2 corresponds to the module described as "Module 1"
# in the text above -- confirm numbering against the heatmap row-side colors
modulePick <- 2 
# subset the expression matrix to the genes assigned to this module
myModule_down <- diffGenes[names(module.assign[module.assign %in% modulePick]),] 
myModule_down <- as.data.frame(myModule_down)
myModule_down <- rownames_to_column(myModule_down, var = "geneID")

#any of these genes in the SI tuft signature? 
tuft <- read_tsv("tuft_DE.txt")
# plyr::match_df keeps rows of myModule_down whose geneID also appears in tuft
matches <- match_df(myModule_down, tuft, on = "geneID")
matches

Module 2 - genes up in Chat+ vs Chat-.

#genes up in Chat+ vs. Chat-
# NOTE(review): modulePick 1 corresponds to the module described as "Module 2"
# in the text above -- confirm numbering against the heatmap row-side colors
modulePick <- 1 
# subset the expression matrix to the genes assigned to this module
myModule_up <- diffGenes[names(module.assign[module.assign %in% modulePick]),] 
myModule_up <- as.data.frame(myModule_up)
myModule_up <- rownames_to_column(myModule_up, var = "geneID")

#any of these genes in the SI tuft signature? 
# plyr::match_df keeps rows of myModule_up whose geneID also appears in tuft
matches <- match_df(myModule_up, tuft, on = "geneID")
matches

0.14 Enrichment Analysis

Lastly, we checked enrichment of genes in Chat+ tuft cells against the Tuft1 and Tuft2 signatures generated by Haber et al (PMID: 29144463) and the distal SI tuft cell signature generated in a separate analysis associated with this Billipp et al manuscript.

# I want to compare Chat positive (and negative) gene signatures to the tuft-1 and tuft-2 signatures from 
# Haber et al. as well as dSI tuft cell signature from lab analysis

# Import tuft-1 and tuft-2 consensus signatures (from both plate and droplet RNAseq) from Haber dataset

# Build the ranked gene list for GSEA from the dataframe made in the step3 script:
# genes ranked by LogFC value between Chat_positive and Chat_negative tuft cells
mydata.gsea <- mydata.df$LogFC
names(mydata.gsea) <- as.character(mydata.df$symbol) # name each LogFC by its gene symbol
mydata.gsea <- sort(mydata.gsea, decreasing = TRUE) # GSEA requires a decreasing ranked list

# Import pooled tuft cell signatures and run GSEA
tuftall_signature.df <- read_csv("pooled_signature.csv") # TERM2GENE table: signature name -> gene
myGSEATuftall.res <- GSEA(mydata.gsea, TERM2GENE=tuftall_signature.df, verbose=FALSE, pvalueCutoff = .05)
myGSEATuftall.df <- as_tibble(myGSEATuftall.res@result)

# enrichment plot for TuftdSI signature
gseaplot2(myGSEATuftall.res,
          geneSetID = c(1,2,3), #can choose multiple signatures to overlay in this plot
          title = "Chat+ vs. tuft cell signatures", #can also turn off this title
          pvalue_table = FALSE) #can set this to FALSE for a cleaner plot

***

0.15 Conclusions

There were not very many transcriptional differences between Chat+ and Chat- cells. In fact, even Chat itself was only ~2.5-fold differentially expressed, suggesting some kind of regulation of translation by the Chat UTRs that are retained in the Chat-GFP reporter. The upregulated genes in Chat- cells had nothing to do with tuft cells, while the upregulated genes in the Chat+ cells showed enrichment for both the Tuft1 and Tuft2 signatures. They were more enriched for Tuft2, but the strongest enrichment was for genes from the distal SI tuft cells. Since Chat+ tuft cells are more frequent in the distal SI than the proximal SI, it is likely that our sort of Chat+ cells (which used cells from the entire SI as starting material) was distal SI biased.

0.16 Session info

The output from running ‘sessionInfo’ is shown below and details all packages and version necessary to reproduce the results in this report.

# Record the exact R and package versions used, for reproducibility
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GSEABase_1.56.0       graph_1.72.0          annotate_1.72.0      
##  [4] XML_3.99-0.10         AnnotationDbi_1.56.2  IRanges_2.28.0       
##  [7] S4Vectors_0.32.4      Biobase_2.54.0        BiocGenerics_0.40.0  
## [10] enrichplot_1.14.2     clusterProfiler_4.2.2 DT_0.24              
## [13] plyr_1.8.7            gplots_3.1.3          heatmaply_1.3.0      
## [16] viridis_0.6.2         viridisLite_0.4.0     RColorBrewer_1.1-3   
## [19] plotly_4.10.0         gt_0.6.0              cowplot_1.1.1        
## [22] matrixStats_0.62.0    edgeR_3.36.0          limma_3.50.3         
## [25] forcats_0.5.1         stringr_1.4.0         dplyr_1.0.9          
## [28] purrr_0.3.4           readr_2.1.2           tidyr_1.2.0          
## [31] tibble_3.1.8          ggplot2_3.3.6         tidyverse_1.3.2      
## [34] knitr_1.39            tinytex_0.40          rmarkdown_2.14       
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.1.2       readxl_1.4.0           backports_1.4.1       
##   [4] fastmatch_1.1-3        igraph_1.3.4           lazyeval_0.2.2        
##   [7] splines_4.1.2          crosstalk_1.2.0        BiocParallel_1.28.3   
##  [10] GenomeInfoDb_1.30.1    digest_0.6.29          yulab.utils_0.0.5     
##  [13] foreach_1.5.2          htmltools_0.5.3        GOSemSim_2.20.0       
##  [16] GO.db_3.14.0           fansi_1.0.3            magrittr_2.0.3        
##  [19] memoise_2.0.1          googlesheets4_1.0.1    tzdb_0.3.0            
##  [22] graphlayouts_0.8.1     Biostrings_2.62.0      modelr_0.1.8          
##  [25] vroom_1.5.7            colorspace_2.0-3       ggrepel_0.9.3         
##  [28] blob_1.2.3             rvest_1.0.2            haven_2.5.0           
##  [31] xfun_0.32              crayon_1.5.1           RCurl_1.98-1.8        
##  [34] jsonlite_1.8.0         scatterpie_0.1.7       ape_5.6-2             
##  [37] iterators_1.0.14       glue_1.6.2             polyclip_1.10-0       
##  [40] registry_0.5-1         gtable_0.3.0           gargle_1.2.0          
##  [43] zlibbioc_1.40.0        XVector_0.34.0         webshot_0.5.3         
##  [46] scales_1.2.0           DOSE_3.20.1            DBI_1.1.3             
##  [49] Rcpp_1.0.9             xtable_1.8-6           tidytree_0.4.0        
##  [52] gridGraphics_0.5-1     bit_4.0.4              htmlwidgets_1.5.4     
##  [55] httr_1.4.3             fgsea_1.20.0           ellipsis_0.3.2        
##  [58] farver_2.1.1           pkgconfig_2.0.3        sass_0.4.2            
##  [61] dbplyr_2.2.1           locfit_1.5-9.6         utf8_1.2.2            
##  [64] labeling_0.4.2         ggplotify_0.1.0        tidyselect_1.1.2      
##  [67] rlang_1.0.4            reshape2_1.4.4         munsell_0.5.0         
##  [70] cellranger_1.1.0       tools_4.1.2            cachem_1.0.6          
##  [73] downloader_0.4         cli_3.3.0              generics_0.1.3        
##  [76] RSQLite_2.2.15         broom_1.0.0            evaluate_0.16         
##  [79] fastmap_1.1.0          yaml_2.3.5             ggtree_3.2.1          
##  [82] bit64_4.0.5            fs_1.5.2               tidygraph_1.2.1       
##  [85] caTools_1.18.2         KEGGREST_1.34.0        dendextend_1.16.0     
##  [88] ggraph_2.0.6           nlme_3.1-159           aplot_0.1.6           
##  [91] DO.db_2.9              xml2_1.3.3             compiler_4.1.2        
##  [94] rstudioapi_0.13        png_0.1-7              treeio_1.18.1         
##  [97] reprex_2.0.1           tweenr_1.0.2           bslib_0.4.0           
## [100] stringi_1.7.8          highr_0.9              lattice_0.20-45       
## [103] Matrix_1.4-1           vctrs_0.4.1            pillar_1.8.0          
## [106] lifecycle_1.0.1        jquerylib_0.1.4        data.table_1.14.2     
## [109] bitops_1.0-7           seriation_1.3.6        patchwork_1.1.1       
## [112] qvalue_2.26.0          R6_2.5.1               TSP_1.2-1             
## [115] KernSmooth_2.23-20     gridExtra_2.3          codetools_0.2-18      
## [118] MASS_7.3-58.1          gtools_3.9.3           assertthat_0.2.1      
## [121] withr_2.5.0            GenomeInfoDbData_1.2.7 parallel_4.1.2        
## [124] hms_1.1.1              ggfun_0.0.6            grid_4.1.2            
## [127] googledrive_2.0.0      ggforce_0.3.3          lubridate_1.8.0