######################
#  Mapping Evaporation and rehydration chemistry onto SalinityII Mratio media landscape 
#######################


# Evap/Rehyd data 
data <- read.csv("/Volumes/~kbeer/BaligaLab/Salinity/Evaporation/EvapRehydIons.csv")
data$IonSalinityM <- rowSums(data[,4:8])

evap <- data[1:6,]
rehyd <- data[6:12,]

# GSL composition data
gsl.m <- read.csv("/Volumes/~kbeer/BaligaLab/Salinity/GSL_visit_052011/GSLchemAnalysis_M.csv", header=TRUE, row.names=NULL)
# Filter out South GSL samples since they won't be close at all anyway. 
gsl.m <- gsl.m[c(4:7,9:12),]

#  SalinityII Mratio ion composition data
alldat <- read.csv("/Volumes/~kbeer/BaligaLab/Salinity/Salinity scripts_plots/SalinityII_Mratios/SalinityII_Mratio_ALL_groupmeans_wPeptoneCitrate_v2.csv", header=TRUE, row.names=NULL)

setwd("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/")


# Combined comparison dataset (i.e. evap, rehyd and GSL salinity for comparison w/ SalinityII media landscape) 
gsltest <- rbind(gsl.m, data[,c(1,4:9)])

setwd("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/")
# write.csv(gsltest, file="GSLchem_gsltest.csv" )

# Hotelling's 2 sample multivariate T test
library(Hotelling)
mat1 <- as.matrix(gsltest[,2:6])
mat2 <- as.matrix(alldat[,12:16])
hotel <- hotelling.test(x=mat1, y=mat2, perm=TRUE)
# Duh, the group of GSL samples is different from the group of media

# Factor variables 
KCl.ratio <- as.factor(round(alldat$KCl/(alldat$KCl+alldat$MgSO4), digits=2))
salinityMF <- as.factor(alldat$salinityM)
NaCl.ratio <- cut(alldat$NaCl/alldat$salinityM, c(0.99, 0.85, 0.68, 0), labels=c("low", "med", "high"))

### SKIP TO LINE 89 NOW. I ABANDONED THE FOLLOWING CODE IN FAVOR OF CREATING DIFFERENCE MATRICES FOR EACH ION SEPARATELY.


##############################
# Calculate differences between each pair of evap and salinityII compositions. 
# Then, take the rowSums of those differences, and order them from least to greatest. 
# NOT USING diffs OBJECT BELOW

# Evap diffs 
#row.a <- rep(1:75, each=6) # SalinityII media
#row.b <- rep(1:6, times=75) # Evap concentrations
#rows <-paste(row.a, row.b, sep="_") # rownames for diffs object
#diffs <- data.frame(combo=rows, Na.diff=rep(0,450), Cl.diff=rep(0,450), Mg.diff=rep(0,450), SO4.diff=rep(0,450), K.diff=rep(0,450), IonSalinity.diff=rep(0,450))
#
#ind.evap <- rep(1:6, times=75)
#ind.sal <- rep(1:75, each=6)
#
#for(i in seq(1:450)) {
#  diffs[i, 2:7] <- abs(evap[ind.evap[i],4:9] - alldat[ind.sal[i], 12:17])
#  }



# #########
# # Try comparing ion ratios
# 
# #in evap/rehyd data
# K.Mg.ratio <- data$K/(data$K+data$Mg)  # Mean=0.300, sd=0.007
# Na.ratio <- data$Na/data$IonSalinityM  # Mean=0.367, sd=0.049
# Mg.ratio <- data$Mg/data$IonSalinityM
# 
# # in alldat SalinityII Mratio data
# K.Mg.ratio.all <- alldat$K/(alldat$K+alldat$Mg)
# Na.ratio.all <- alldat$Na/alldat$IonSalinityM
# Mg.ratio.all <- alldat$Mg/alldat$IonSalinityM
# 
# # Which are closest to these in the alldat set?
# 
# min(abs(K.Mg.ratio.all-0.300))
# abs(K.Mg.ratio.all-0.300)   # The media closest to these ratios are 2, 7 and 12 of each series (25% KCl/(KCl+Mg))
# 
# 
# min(abs(Na.ratio.all-0.367))
# abs(Na.ratio.all-0.367)  # Media closest to these are 6-10 (intermediate level Na:total)
# 


###############
# Compare multiple different aspects of salinity
# Define these aspects here: 

# Ion:total ratios in GSL measurements
ratios.gsl <- data.frame(SampleID=gsltest[,1], NaRatio=gsltest$Na/gsltest$IonSalinityM, ClRatio=gsltest$Cl/gsltest$IonSalinityM, MgRatio=gsltest$Mg/gsltest$IonSalinityM, SO4Ratio=gsltest$SO4/gsltest$IonSalinityM, KRatio=gsltest$K/gsltest$IonSalinityM, IonSalinityM=gsltest$IonSalinityM, KMgRatio=gsltest$K/gsltest$Mg, MgNaRatio=gsltest$Mg/gsltest$Na, KNaRatio=gsltest$K/gsltest$Na, KKMgRatio=gsltest$K/(gsltest$K+gsltest$Mg))  
# Ratios of ions:total and ions:ions for GSL comparison set

# Ion:total ratios in salinity media
ratios.sal <- data.frame(SampleID=paste(alldat[,1], alldat[,17]), NaRatio=alldat$Na/alldat$IonSalinityM, ClRatio=alldat$Cl/alldat$IonSalinityM, MgRatio=alldat$Mg/alldat$IonSalinityM, SO4Ratio=alldat$SO4/alldat$IonSalinityM, KRatio=alldat$K/alldat$IonSalinityM, IonSalinityM=alldat$IonSalinityM, KMgRatio=alldat$K/alldat$Mg, MgNaRatio=alldat$Mg/alldat$Na, KNaRatio=alldat$K/alldat$Na, KKMgRatio=alldat$K/(alldat$K+alldat$Mg)) # ratio of Na:total for SalinityII media landscape 
# Infinite values result from K/Mg in KMgRatio... Convert Inf to NA:
is.na(ratios.sal$KMgRatio) <- !is.finite(ratios.sal$KMgRatio)


##########
#  Now, ultimate goal is final table with rownames=gsltest[,1] and 2 columns for each test category (one for minimum difference and one for the name of the salII media that produced that minimum difference)
#############

####### Matrix of pairwise differences between gsl ion:total RATIO and salII ion:total RATIO 

# NaRatio matrix:  Creates gsl x salII matrix of abs(differences) between NaRatios. Repeat for all other columns of ratios.foo objects
NaDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  NaDiffs[i,] <- abs(ratios.gsl$NaRatio[i]-ratios.sal$NaRatio)
 }

# ClRatio matrix: (repeat above code)
ClDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  ClDiffs[i,] <- abs(ratios.gsl$ClRatio[i]-ratios.sal$ClRatio)
}
# MgRatio matrix: (repeat above code)
MgDiffs <-matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  MgDiffs[i,] <- abs(ratios.gsl$MgRatio[i]-ratios.sal$MgRatio)
}
# KRatio matrix: 
KDiffs <-matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  KDiffs[i,] <- abs(ratios.gsl$KRatio[i]-ratios.sal$KRatio)
}
# IonSalinityM matrix: 
TotDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  TotDiffs[i,] <- abs(ratios.gsl$IonSalinityM[i]-ratios.sal$IonSalinityM)
}
# KMg matrix: 
KMgDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  KMgDiffs[i,] <- abs(ratios.gsl$KMgRatio[i]-ratios.sal$KMgRatio)
}
# MgNa matrix: 
MgNaDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  MgNaDiffs[i,] <- abs(ratios.gsl$MgNaRatio[i]-ratios.sal$MgNaRatio)
}
# KNa matrix: 
KNaDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  KNaDiffs[i,] <- abs(ratios.gsl$KNaRatio[i]-ratios.sal$KNaRatio)
}

# K/K+Mg matrix
KKMgDiffs <- matrix(rep(0, 1500), 20, 75, dimnames=list(ratios.gsl[,1], ratios.sal[,1])) 
for(i in 1:20) {
  KKMgDiffs[i,] <- abs(ratios.gsl$KKMgRatio[i]-ratios.sal$KKMgRatio)
}


####### Matrix of pairwise differences between gsl ion CONCENTRATIONS and salII ion CONCENTRATIONS.  (This is different from the ratio differences above:  ion_gsl-ion_salII vs ion_gsl/Total_gsl - ion_sal/total_sal)


# Na matrix:  Creates gsl x salII matrix of abs(differences) between Na concentration. Repeat for all other columns of ratios.foo objects
NaDiffs2 <- matrix(rep(0, 1500), 20, 75, dimnames=list(gsltest[,1], paste(alldat[,1], alldat[,17]))) 
for(i in 1:20) {
  NaDiffs2[i,] <- abs(gsltest$Na[i]-alldat$Na)
}

# Cl matrix: (repeat above code)
ClDiffs2 <- matrix(rep(0, 1500), 20, 75, dimnames=list(gsltest[,1], paste(alldat[,1], alldat[,17]))) 
for(i in 1:20) {
  ClDiffs2[i,] <- abs(gsltest$Cl[i]-alldat$Cl)
}
# Mg matrix: (repeat above code)
MgDiffs2 <- matrix(rep(0, 1500), 20, 75, dimnames=list(gsltest[,1], paste(alldat[,1], alldat[,17]))) 
for(i in 1:20) {
  MgDiffs2[i,] <- abs(gsltest$Mg[i]-alldat$Mg)
}
# K matrix: (repeat above code)
KDiffs2 <- matrix(rep(0, 1500), 20, 75, dimnames=list(gsltest[,1], paste(alldat[,1], alldat[,17]))) 
for(i in 1:20) {
  KDiffs2[i,] <- abs(gsltest$K[i]-alldat$K)
}


## Example of how to find the smallest difference in each row (using potassium)

Kmins <- apply(KDiffs2,1, min) # media with smallest difference for each GSL sample (row)
Totmins <- apply(TotDiffs,1, min) # media with smallest difference for each GSL sample (row)
hist(Totmins, breaks=15)  # bi-modal distribution of differences in total salinity - justification for cutoff at 1.6M. 



KmediaMatch <- vector("list", 20)
for(i in 1:20) {
  KmediaMatch[[i]] <- which(KDiffs2[i,]==Kmins[i])
}  # KmediaMatch is a list giving the best matching media for each GSL site

# Look at each row of differences 
KDiffs2[1,c(60:75)]

##################################################
##### RATIO diffs: Are differences significant?  Estimate using empirical probability distributions - ratios in the smallest 5% of entire matrix (not just each row) get an asterisk in the heatmap
## Use these to identify the heatmap cells that need asterisks... which diffs are smallest.
## These values of fn.XX are based on RATIO differences, not CONCENTRATION differences. See additional code for concentration diffs ecdf.

# NaDiffs ecdf
fn.Na <- ecdf(NaDiffs)
fn.Na(0.0095)  # = 0.059, may need to test many
plot(fn.Na, verticals=TRUE, do.points=FALSE)
abline(v=0.01)
# Find value of NaDiff at which 5% of obs are lower 


# ClDiffs ecdf
fn.Cl <- ecdf(ClDiffs)
fn.Cl(0.0035)  # = 0.051
plot(fn.Cl, verticals=TRUE, do.points=FALSE)
abline(v=0.002)
# Find values at which 5% of obs are lower


# MgDiffs ecdf
fn.mg <- ecdf(MgDiffs)
fn.mg(0.0075)  # = 0.05
plot(fn.mg, verticals=TRUE, do.points=FALSE)
abline(v=0.007)
# Find values at which 5% of obs are lower

# KDiffs ecdf
fn.k <- ecdf(KDiffs)
fn.k(0.005)  # = 0.053
plot(fn.k, verticals=TRUE, do.points=FALSE)
abline(v=0.005)
# Find values at which 5% of obs are lower

# TotDiffs ecdf
fn.tot <- ecdf(TotDiffs)
fn.tot(0.3)  # = 0.05
plot(fn.tot, verticals=TRUE, do.points=FALSE)
abline(v=0.3)
# Find values at which 5% of obs are lower 

# KMgDiffs ecdf
fn.kmg <- ecdf(KMgDiffs)
fn.kmg(0.041)  # = 0.048
plot(fn.kmg, verticals=TRUE, do.points=FALSE)
abline(v=0.051)
# Find values at which 5% of obs are lower


# MgNaDiffs ecdf
fn.mgna <- ecdf(MgNaDiffs)
fn.mgna(0.029)  # = 0.048
plot(fn.mgna, verticals=TRUE, do.points=FALSE)
abline(v=0.029)
# Find values at which 5% of obs are lower

# KNaDiffs ecdf
fn.kna <- ecdf(KNaDiffs)
fn.kna(0.016)  # = 0.05
plot(fn.kna, verticals=TRUE, do.points=FALSE)
abline(v=0.016)
# Find values at which 5% of obs are lower



##################################################
##### CONCENTRATION diffs: Are differences significant?  Estimate using empirical probability distributions
## Use these to identify the heatmap cells that need asterisks... which diffs are smallest.

# NaDiffs2 ecdf
fn.Na2 <- ecdf(NaDiffs2)
fn.Na2(0.125)  # = 0.053, may need to test many
plot(fn.Na2, verticals=TRUE, do.points=FALSE)
abline(v=0.13)
# Find value of NaDiff at which 5% of obs are lower 


# ClDiffs ecdf
fn.Cl2 <- ecdf(ClDiffs2)
fn.Cl2(0.2)  # = 0.056
plot(fn.Cl2, verticals=TRUE, do.points=FALSE)
abline(v=0.2)
# Find values at which 5% of obs are lower


# MgDiffs ecdf
fn.mg2 <- ecdf(MgDiffs2)
fn.mg2(0.045)  # = 0.049
plot(fn.mg2, verticals=TRUE, do.points=FALSE)
abline(v=0.03)
# Find values at which 5% of obs are lower

# KDiffs ecdf
fn.k2 <- ecdf(KDiffs2)
fn.k2(0.025)  # = 0.052
plot(fn.k2, verticals=TRUE, do.points=FALSE)
abline(v=0.025)
# Find values at which 5% of obs are lower




#########################################
#  Find minimum differences between gsl and salII RATIOS. Use heatmaps to show which pairs have the smallest differences
####### RATIO HEATMAPS
library(gplots)  # for heatmap.2
library(ggplot2)
library(RColorBrewer)
library(colorRamps)
library(fBasics)
library(fPortfolio)

# To find the numeric boundaries of the color bins, assign heatmap to a variable and call hm$colorTable

# NaDiff
# matrix of asterisks for differences smaller than 5% by ecdf() (See ecdf fxns below)
 pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_NaDiffRatio.pdf", height=8, width=11)
sig.na <- replace(NaDiffs, NaDiffs<0.0095, "*") 
sig.na <- replace(sig.na, NaDiffs>0.0095, "")

heatmap.2(NaDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in Na:Total ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.na, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()


#ClDiff
# matrix of asterisks for differences smaller than 5% by ecdf() (See ecdf fxns below)
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_ClDiffRatio.pdf", height=8, width=11)
sig.cl <- replace(ClDiffs, ClDiffs<0.0035, "*") 
sig.cl <- replace(sig.cl, ClDiffs>0.0035, "")
heatmap.2(ClDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in Cl:Total ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.cl, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#MgDiff
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_MgDiffRatio.pdf", height=8, width=11)
sig.mg <- replace(MgDiffs, MgDiffs<0.0075, "*") 
sig.mg <- replace(sig.mg, MgDiffs>0.0075, "")
heatmap.2(MgDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in Mg:Total ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.mg, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#KDiff
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_KDiffRatio.pdf", height=8, width=11)
sig.k <- replace(KDiffs, KDiffs<0.005, "*") 
sig.k <- replace(sig.k, KDiffs>0.005, "")
heatmap.2(KDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in K:Total ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.k, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#TotDiff
sig.td <- replace(TotDiffs, TotDiffs<0.3, "*") 
sig.td <- replace(sig.td, TotDiffs>0.3, "")
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_totDiff.pdf", height=8, width=11)
hm <- heatmap.2(TotDiffs, Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20,  main="Differences in total salinity (M) between \nSalinity landscape and GSL conditions", cellnote=sig.td, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()


#KMg Diff
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_KMgDiffRatio.pdf", height=8, width=11)
sig.kmg <- replace(KMgDiffs, KMgDiffs<0.041, "*") 
sig.kmg <- replace(sig.kmg, KMgDiffs>0.041, "")
heatmap.2(KMgDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in K:Mg ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.kmg, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()
# Too small range among media/small diffs in GSL

#MgNa Diff
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_MgNaDiffRatio.pdf", height=8, width=11)
sig.mgna <- replace(MgNaDiffs, MgNaDiffs<0.029, "*") 
sig.mgna <- replace(sig.mgna, MgNaDiffs>0.029, "")
heatmap.2(MgNaDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in Mg:Na ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.mgna, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#KNa Diff
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_KNaDiffRatio.pdf", height=8, width=11)
sig.kna <- replace(KNaDiffs, KNaDiffs<0.016, "*") 
sig.kna <- replace(sig.kna, KNaDiffs>0.016, "")
heatmap.2(KNaDiffs[,1:15], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in K:Na ratio \nbetween Salinity landscape and GSL conditions", cellnote=sig.kna, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

### Can truncate any of these heatmaps to contain only the first 15 media (columns) using XDiffs[,15]



#########################################
#  Find minimum differences between gsl and salII CONCENTRATIONS. Use heatmaps to show which pairs have the smallest differences
####### CONCENTRATION HEATMAPS

# NaDiff2
# matrix of asterisks for differences smaller than 5% by ecdf() (See ecdf fxns below)
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_NaDiffConc.pdf", height=8, width=11)
quartz(height=8, width=11)
sig.na <- replace(NaDiffs2, NaDiffs2<0.125, "*") 
sig.na <- replace(sig.na, NaDiffs2>=0.125, "")
hm <- heatmap.2(NaDiffs2, Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in [Na] \nbetween Salinity landscape and GSL conditions", cellnote=sig.na, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()


#ClDiff2
# matrix of asterisks for differences smaller than 5% by ecdf() (See ecdf fxns below)
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_ClDiffConc.pdf", height=8, width=11)
sig.cl <- replace(ClDiffs2, ClDiffs2<0.2, "*") 
sig.cl <- replace(sig.cl, ClDiffs2>0.2, "")
heatmap.2(ClDiffs2, Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in [Cl] \nbetween Salinity landscape and GSL conditions", cellnote=sig.cl, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#MgDiff2
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_MgDiffConc.pdf", height=8, width=11)
sig.mg <- replace(MgDiffs2, MgDiffs2<0.045, "*") 
sig.mg <- replace(sig.mg, MgDiffs2>=0.045, "")
hm <- heatmap.2(MgDiffs2, Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in [Mg] \nbetween Salinity landscape and GSL conditions", cellnote=sig.mg, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

#KDiff2
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_KDiffConc.pdf", height=8, width=11)
sig.k <- replace(KDiffs2, KDiffs2<0.025, "*") 
sig.k <- replace(sig.k, KDiffs2>=0.025, "")
heatmap.2(KDiffs2, Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=19, name="RdYlBu"), breaks=20, main="Differences in [K] \nbetween Salinity landscape and GSL conditions", cellnote=sig.k, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()


##################
# Subset the smallest difference in Total, Na, Mg and K CONCENTRATIONS for manuscript table
###################
# Replace NaDiffs2 with any of the other 'Diffs2' variables. (The '2' indicates concentration, not ion:total ratio)

TotDiffs[1,c(71:75)]
TotDiffs[2,c(66:70)]
TotDiffs[3,c(46:60)]
TotDiffs[4,c(16:30)]
TotDiffs[5,c(1:15)]
TotDiffs[6,c(1:15)]
TotDiffs[7,c(1:15)]
TotDiffs[8,c(1:15)]
TotDiffs[18,c(61:75)]
TotDiffs[19,c(61:75)]
TotDiffs[20,c(1:15)]

NaDiffs2[1,c(71:75)]
NaDiffs2[2,c(66:70)]
NaDiffs2[3,c(46:60)]
NaDiffs2[4,c(16:30)]
NaDiffs2[5,c(1:15)]
NaDiffs2[6,c(1:15)]
NaDiffs2[7,c(1:15)]
NaDiffs2[8,c(1:15)]
NaDiffs2[18,c(61:75)]
NaDiffs2[19,c(61:75)]
NaDiffs2[20,c(1:15)]

MgDiffs2[1,c(71:75)]
MgDiffs2[2,c(66:70)]
MgDiffs2[3,c(46:60)]
MgDiffs2[4,c(16:30)]
MgDiffs2[5,c(1:15)]
MgDiffs2[6,c(1:15)]
MgDiffs2[7,c(1:15)]
MgDiffs2[8,c(1:15)]
MgDiffs2[18,c(61:75)]
MgDiffs2[19,c(61:75)]
MgDiffs2[20,c(1:15)]

KDiffs2[1,c(71:75)]
KDiffs2[2,c(66:70)]
KDiffs2[3,c(46:60)]
KDiffs2[4,c(16:30)]
KDiffs2[5,c(1:15)]
KDiffs2[6,c(1:15)]
KDiffs2[7,c(1:15)]
KDiffs2[8,c(1:15)]
KDiffs2[18,c(61:75)]
KDiffs2[19,c(61:75)]
KDiffs2[20,c(1:15)]

#Write CSVs of Diffs2 matrices for manuscript

write.csv(TotDiffs, file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/TotDiffs_differenceMatrix.csv")

write.csv(NaDiffs2, file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/NaDiffs2_differenceMatrix.csv")

write.csv(MgDiffs2, file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/MgDiffs2_differenceMatrix.csv")

write.csv(KDiffs2, file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/KDiffs2_differenceMatrix.csv")


#######################
# Distance matrix comparing media and GSL by using the absolute mean difference between media, for all ions.
# See: http://stackoverflow.com/questions/10707260/how-can-i-create-a-distance-matrix-containing-the-mean-absolute-scores-between-e
##############################

# set up data with rows = ion, columns = sample or media

# GSL
t.gsltest<- t(data.matrix(gsltest))
colnames(t.gsltest) <- gsltest[,1]
t.gsltest <- t.gsltest[-1,]

# media
alldat.sm <- alldat[,c(1, 12:17)]
t.alldat <- t(data.matrix(alldat.sm))
colnames(t.alldat) <- rownames(alldat.sm)
#Col names are V1-V75, representing media 1-15, repeated 5 times from 100 to 60%
t.alldat <- t.alldat[-1,]

# Combine both
t.gsl.alldat <- cbind(t.gsltest, t.alldat)

# transpose t.gsl.alldat
gsl.alldat <- t(t.gsl.alldat)

# Generate dist matrix using manhattan distance: abs(Na - Na)+abs(Cl-Cl)...+abs(K-K)/6 total salinity components
diffmatrix <- as.matrix(dist(gsl.alldat, "manhattan", diag=TRUE, upper=FALSE)/nrow(gsl.alldat))

print(diffmatrix, digits=3)

#Without dividing by nrow... in other words, total abs difference among all ions
diffmatrix2 <- as.matrix(dist(gsl.alldat, "manhattan", diag=TRUE, upper=FALSE))

#Other distance metrics:
diffmatrix3 <- as.matrix(dist(gsl.alldat, "maximum", diag=TRUE, upper=FALSE)/nrow(gsl.alldat))

# Save csv of this distance matrix
write.csv(diffmatrix[1:20, 21:95], file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/ManhattanDistMatrix.csv")


# Heatmap of manhattan distance matrix
pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_ManhattanDiffmatrix_allGSL.pdf", height=8, width=11)
heatmap.2(diffmatrix[1:20, 21:95], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=99, name="RdYlBu"), breaks=100,  main="Average manhattan distance between \nGSL chemistry and growth media", margins=c(8.5,7))
dev.off()

# Heatmap of manhattan dist matrix, omitting Evap 0 - Rehyd 3 (rows 9-17)
sig.diffmat <- replace(diffmatrix[c(1:8,18:20), 21:95], diffmatrix[c(1:8,18:20), 21:95]<0.0165, "*")  # Defines smallest 5% of differences using ecdf... code below.
sig.diffmat <- replace(sig.diffmat, diffmatrix[c(1:8,18:20), 21:95]>0.0165, "")

pdf("/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/Heatmap_ManhattanDiffmatrix_subsetGSL.pdf", height=8, width=11)
heatmap.2(diffmatrix[c(1:8,18:20), 21:95], Rowv=NA, Colv=NA, dendrogram="none", scale="none", trace="none", col=divPalette(n=99, name="RdYlBu"), breaks=100,  main="Average manhattan distance between \nGSL chemistry and growth media", cellnote=sig.diffmat, notecol="white", notecex=1.75, margins=c(8.5,7))
dev.off()

# create list of results: smalldiffs = smallest differences between each GSL sample and media
smalldiffs <- list(GSLsample=rep(0,20) , media=rep(0,20) ,diffVal=rep(0,20), media2=rep(0,20), diffVal2=rep(0,20))
for(i in 1:20) {
  smalldiffs$GSLsample[i] <- colnames(diffmatrix)[i] 
  smalldiffs$media[i] <- which.min(diffmatrix[-c(1:20), i]) # media with smallest difference, using averaged manhattan distance over all ions
  smalldiffs$diffVal[i] <- min(diffmatrix[-c(1:20), i]) # smallest difference, using avg
  smalldiffs$media2[i] <- which.min(diffmatrix2[-c(1:20), i]) # media w/ smallest difference using plain manhattan distance - sum of absolute differences across all ions, not divided by 6
  smalldiffs$diffVal2[i] <- min(diffmatrix2[-c(1:20), i]) # smallest difference using plain manhattan distance
}

# Histogram off differences, to show cutoff
h <- hist(smalldiffs$diffVal, breaks=20)

write.csv(smalldiffs, file="/Volumes/~kbeer/BaligaLab/Salinity/GSL_salinity/GSLvsMratioMedia/ManhattanDist_SmallestDiffs.csv")

# diffmatrix ecdf - find lowest 5% of differences
fn.diffmat <- ecdf(diffmatrix[c(1:8,18:20), 21:95])
fn.diffmat(0.0165)  # = 0.0509
plot(fn.diffmat, verticals=TRUE, do.points=FALSE)
abline(v=0.0165)
# Find values at which 5% of obs are lower



# Hypergeometric analysis - Is 9/11 (81%) a lot of peaks, given that 30 out of 75 produced peaks (40%)?
# Peaks = 'white balls' in ?phyper

# P value for manhattan dist
phyper(9, 30, 45, 11, lower.tail=FALSE) # = 0.000287

# P value for nested method, with Mg (8/11 media produced peaks)
phyper(8, 30, 45, 11, lower.tail=FALSE) # = 0.00318


###################
#  Stistical consulting 7/18/2012 and 7/26/2012
####################

# Datasets to give to Jan 
# GSL compositions
write.csv(gsltest, file="/users/karlyn/Desktop/GSLsalinities.csv")

#SalinityII Mratio media compositions
write.csv(alldat[,c(1, 12:17)], file="/users/karlyn/Desktop/MediaSalinities.csv")


####################################
#PCA:  Run PCA on GSL compositions and then use the linear combinations defined by the major components (axes) to overlay the Mratio media onto the PCA axes.  Ask Jan about how to do this... 

data1<- gsltest
data2<- alldat[,c(1, 12:17)]

### euclidean distance in 2D between the point (x1,y1) and the point (x2,y2)
dis <- function(x1,y1,x2,y2){return(sqrt((x1-x2)^2+(y1-y2)^2))}

### requires the x and y coordinates of a specific point and the x and y coordinates of a bunch of points
### returns the index of the closest point in the bunch to the specified point

findclosest<-function(x,y,xvec,yvec){
  distances<-NULL
  for(i in 1:length(xvec)){distances[i]<-dis(x,y,xvec[i],yvec[i])}
  index<-which.min(distances)
  return(index)
}

### gsl is a matrix of the x and y coordinates of each GSL solution in principal component space (i.e. the transformed ion concentrations multiplied by PC loadings (PC1 and PC2 for x and y, respectively))
### lab is a matrix of the x and y coordinates of each lab solution in principal component space
### function returns a vector containing the index of the lab point that is closest to each gsl point

closest.lab<-function(gsl,lab){
  returnme<-NULL
  for(i in 1:length(gsl[,1])){returnme<-rbind(returnme,findclosest(gsl[i,1],gsl[i,2],lab[,1],lab[,2]))}
  return(returnme)
}


#### setting up the data with principal components

gsl.data<-data1[,2:7]
lab.data<-data2[,2:7]

####
#### transform!
# Log transforming makes variances similar across columns 
gsl.data<-log(gsl.data+0.001)
lab.data<-log(lab.data+0.001)
####
####


gsl.prin.comp<-princomp(x = gsl.data)
gsl.loadings<-loadings(gsl.prin.comp)
# Loadings interpretation: 
  # PC1 is negatively correlated with all ions, most strongly with SO4, followed by total salinity, Na and Cl.
  # PC2 is positively correlated with Na, Cl and TS, and negatively correlated w/ Mg, SO4 and K

gsl<-cbind(as.matrix(gsl.data)%*%gsl.loadings[,1],as.matrix(gsl.data)%*%gsl.loadings[,2]) # GSL compositions in PC1xPC2 space
lab<-cbind(as.matrix(lab.data)%*%gsl.loadings[,1],as.matrix(lab.data)%*%gsl.loadings[,2]) # lab media compositions in PC1xPC2 space

### returns the index of the closest lab solution to each great salt lake sample 
### for a particular choice of principal components 
indices <- closest.lab(gsl,lab)

### Which media correspond to the indices listed in 'indices' object above? 
data2[indices, c(1,7)]

# Remember, we are omitting the following rows from the gsl dataset (data1) because they are 

### recreate the plots from before with indicies instead of circles and x's
plot(0,0,main="Great Salt lake Principal Components",xlab="Principal Component 1",ylab="Principal Component 2",ylim=c(1.7,7.8),xlim=c(-2.5,5.1))
text(gsl[,2]~gsl[,1],col='blue')
text(lab[,2]~lab[,1],col=2)
legend("topleft",legend=c("GSL","Lab"),col=c("blue","red"),pch=1)



### Plot with blue and red dots only, no numbers
pdf("/users/karlyn/Baliga lab local/Talks, posters, slides/LabMtg_20120904/PCA_dotsOnly.pdf")
par(cex=1.4)
plot(0,0,main="Great Salt lake Principal Components",xlab="Principal Component 1",ylab="Principal Component 2",ylim=c(1.7,7.8),xlim=c(-2.5,5.1))
points(gsl[,2]~gsl[,1],col=rgb(0,0,1,0.5), pch=19)
points(lab[,2]~lab[,1],col=rgb(1,0,0,0.5), pch=19)
legend("topleft",legend=c("GSL","Salinity landscape media"),col=c(rgb(0,0,1,0.5),rgb(1,0,0,0.5)),pch=19)
dev.off()

### Plot w/ blue and red dots, with the first dot in each series of 5 red dots changed to media composition number 1-15


pdf("/users/karlyn/Baliga lab local/Talks, posters, slides/LabMtg_20120904/PCA_dotsNums.pdf")
par(cex=1.4)
plot(0,0,main="Great Salt lake Principal Components",xlab="Principal Component 1",ylab="Principal Component 2",ylim=c(1.7,7.8),xlim=c(-2.5,5.1))
points(gsl[,2]~gsl[,1],col=rgb(0,0,1,0.5), pch=19)
points(lab[,2]~lab[,1],col=rgb(1,0,0,0.5), pch=19)
text(lab[1:15,2]~lab[1:15,1], col="black", pos=2, offset=0.4, cex=0.7)
legend("topleft",legend=c("GSL","Salinity landscape media"),col=c(rgb(0,0,1,0.5),rgb(1,0,0,0.5)),pch=19)
dev.off()
