## Skalanes Farm Mound 2022 aDNA Dataset
## Date of code: 3 Sept 2024

#################################################################################
########################  PHYLOSEQ ##############################################
#################################################################################


# Load the libraries

library("phyloseq"); packageVersion("phyloseq")
library ("ggplot2"); packageVersion("ggplot2")
library("plyr"); packageVersion("plyr")
library("RColorBrewer"); packageVersion("RColorBrewer")

#set working directory  

#create the phyloseq object
SFM <- import_biom("SFM_nt-10-bracken.biom")
#check what type of file it is -  inspect the result by asking the class of the object created and doing a close inspection of some of its content
class(SFM)

# You can examine each part of the phyloseq object:
View(SFM@tax_table@.Data)
View(SFM@otu_table@.Data)
View(SFM@sam_data@.Data)

## file names in the OTU table are long and terrible. Rename them.
## current name: 1_1_S1-report-0.10_bracken_species
## new name: 1_1 
library(biomformat)  # To work with .biom files
biom_data <- read_biom("SFM_nt-10-bracken.biom")
otu_table <- as.matrix(SFM@otu_table@.Data)
# Extract current column names
old_colnames <- colnames(otu_table)

# Use gsub to remove everything after the first two numbers 
## !! this worked fine for all the samples but it didn't work for the negative controls
# new_colnames <- sub("^([0-9]+_[0-9]+).*", "\\1", old_colnames)
# Modify the column names
## !! This didn't work either
# new_colnames <- sub("^(.*?)_.*?_.*", "\\1", colnames(otu_table))

new_colnames <- sub("(^[^_]+_[^_]+)_.+", "\\1", colnames(otu_table))
# Assign new column names to the OTU table
colnames(otu_table) <- new_colnames
SFM@otu_table@.Data <- otu_table
View(SFM@otu_table@.Data)
## write_biom(biom_data, "updated_file.biom") ## don't need this?? 

## No SAM file - aka no metadata - can we add this in?? YES WE CAN
metadata <- read.csv("SFM_2022_Metadata.csv", row.names = 1)
metadata_sample_data <- sample_data(metadata)
SFM <- merge_phyloseq(SFM, metadata_sample_data)

## Rename the Tax Table - the column names are currently "Rank 1, Rank 2 ..."
tax_table <- as.matrix(SFM@tax_table@.Data)
colnames(tax_table) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
SFM@tax_table@.Data <- tax_table
View(SFM@tax_table@.Data)

## Clean Up Environment!
remove(tax_table)
remove(otu_table)
remove(metadata)
remove(metadata_sample_data)
remove(biom_data)
remove(new_colnames)
remove(old_colnames)

# Check to make sure everything looks right! 
View(SFM@tax_table@.Data)
View(SFM@otu_table@.Data)
View(SFM@sam_data@.Data)

###############################################################################
####################### BASIC INFORMATION ###########################
###############################################################################
# from: https://www.nicholas-ollberding.com/post/introduction-to-phyloseq/

# how many samples are there? = 28
nsamples(SFM)

# What are the sample names?
sample_names(SFM)

# What are the sample variables?
sample_variables(SFM)

# Look at the beginning of the metadata file
head(sample_data(SFM))

# Look at only one column from the metadata file
sample_data(SFM)$era1

# Count how many items are in a category - the category is the $time
table(sample_data(SFM)$era1)
table(sample_data(SFM)$era2)

# Pull out the metadata
metadata <- data.frame(sample_data(SFM))

# Calculate the total # reads for each sample
sample_sums(SFM)

# Sort that table
sort(sample_sums(SFM))

# plot the read counts
hist(sample_sums(SFM), main = "Histogram of Read Counts", xlab = "Total Reads",
     border = "black", col = "grey", las=1, breaks=12)

# how many taxa are in the OTU table
ntaxa(SFM)

# look at the beginning of the OTU table
head(taxa_names(SFM))

# look at the beginning of an OUT sums table
head(taxa_sums(SFM))

# make a data frame from the OTU table - with this code below just get first 5 rows and columns
(asv_tab <- data.frame(otu_table(SFM)[1:5, 1:5]))

otu_tab <- data.frame(otu_table(SFM))

# What are the taxonomy names
rank_names(SFM)

# what does the tax_table look like
head(tax_table(SFM))

# look at the table but select just one column
head(tax_table(SFM)[,2])

head(tax_table(SFM)[,7])

# look at the table by one column??
table(tax_table(SFM)[,1])

write.table(table(tax_table(SFM)[,2])) # gets list of phylum

# write.table(table(tax_table(psS)[,3])) # gets list of class
 
# write.table(table(tax_table(psS)[,4])) # gets list of order

# write.table(table(tax_table(psS)[,5])) # gets list of family

# write.table(table(tax_table(psS)[,6])) # gets list of genus

# write.table(table(tax_table(psS)[,7])) # gets list of species

# Error in dimnames(x) <- dn : 
#  length of 'dimnames' [1] not equal to array extent

# export some or all of the tax_table
(tax_tab <- data.frame(tax_table(psS)[50:55,]))

tax_tab_psS <- data.frame(tax_table(psS))


###############################################################################
####################### ORDINATION PLOTS: Skalanes Farm Mound 2022 ############
###############################################################################


# Create a color palette
kd_colors <- brewer.pal(n = 4, name = "Set3") # Change the number and palette name as needed

# Plot the bar graph with custom colors
p <- plot_bar(SFM, fill = "Kingdom") + 
  scale_fill_manual(values = kd_colors) +
  ggtitle("Skalanes Farm Mound Kingdom") +
  theme_minimal() # Optionally, you can add more customization to the theme

print(p)

############## subset data, try again 
SFM_Bac <- subset_taxa(SFM, Kingdom=="k__Bacteria")
SFM_Euk <- subset_taxa(SFM, Kingdom=="k__Eukaryota")
SFM_Vir <- subset_taxa(SFM, Kingdom=="k__Viruses")
SFM_Arc <- subset_taxa(SFM, Kingdom=="k__Archaea")

pB <- plot_bar(SFM_Bac, fill = "Phylum") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Bacteria Phylum") +
  theme_minimal() # Optionally, you can add more customization to the theme

print(pB)

SFM_Bac_300 <- prune_taxa(names(sort(taxa_sums(SFM_Bac), TRUE)[1:300]), SFM_Bac)

pB300 <- plot_bar(SFM_Bac_300, fill = "Phylum") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Bacteria Phylum, top 300 reads") +
  facet_wrap(~Phylum) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pB300)

pE <- plot_bar(SFM_Euk, fill = "Phylum") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Phylum") +
  facet_wrap(~Phylum) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pE)

SFM_Euk_Chor <- subset_taxa(SFM_Euk, Phylum=="p__Chordata")


pEC <- plot_bar(SFM_Euk, fill = "Class") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Phylum=Chordata") +
  facet_wrap(~Class) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pEC)

SFM_Euk_Chor_Mamm <- subset_taxa(SFM_Euk, Class=="c__Mammalia")
SFM_Euk_Chor_Mamm_300 <- prune_taxa(names(sort(taxa_sums(SFM_Euk_Chor_Mamm), TRUE)[1:300]), SFM_Euk_Chor_Mamm)

pECM <- plot_bar(SFM_Euk_Chor_Mamm_300, fill = "Order") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Class=Mammalia, top 300 reads") +
  facet_wrap(~Order) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pECM)

SFM_Euk_Chor_Mamm_Arti <- subset_taxa(SFM_Euk, Order=="o__Artiodactyla")

pECMA <- plot_bar(SFM_Euk_Chor_Mamm_Arti, fill = "Family") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Order=Artiodactyla") +
  #facet_wrap(~Family) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pECMA)

SFM_Euk_Chor_Mamm_Arti_Bovi <- subset_taxa(SFM_Euk, Family=="f__Bovidae")

pECMAB <- plot_bar(SFM_Euk_Chor_Mamm_Arti, fill = "Genus") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Family=Bovidae") +
  facet_wrap(~Genus) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pECMAB)

SFM_Euk_Chor_Mamm_Arti_Bovi_Bos <- subset_taxa(SFM_Euk, Genus=="g__Bos")

pECMABB <- plot_bar(SFM_Euk_Chor_Mamm_Arti, fill = "Species") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Eukaryote Genus=Bos") +
  facet_wrap(~Species) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pECMABB)


pA <- plot_bar(SFM_Arc, fill = "Phylum") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Archaea Phylum") +
  facet_wrap(~Phylum) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pA)

pV <- plot_bar(SFM_Vir, fill = "Phylum") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Virus Phylum") +
  facet_wrap(~Phylum) + # can remove this to plot all in one
  theme_minimal() # Optionally, you can add more customization to the theme

print(pV)

################
plot_bar(SFM, fill = "Kingdom")

p + geom_bar(aes(color=Kingdom, fill=Kingdom), stat="identity", position="stack")

### WHY ARE THE COLORS SO TERRIBLE???
# Possible reason: a million black boxes. See https://github.com/joey711/phyloseq/issues/721

p_ = plot_bar(SFM, fill ="Kingdom") + 
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Kingdom")

print(p_)

phylumGlommed = tax_glom(SFM, "Kingdom")
plot_bar(phylumGlommed, x="time", fill ="Kingdom")

k = plot_bar(phylumGlommed, x="time", fill="Kingdom")+
  geom_bar(stat="identity") +
  ggtitle("Skalanes Farm Mound Kingdom") +
  facet_wrap(~Kingdom)

print(k)

## for some reason the above command won't work when I put "sample-id" in the x= section ... 

## facet the plot - TOO MANY PHYLUM - can I break this plot apart by Kingdom?

p + facet_wrap(~Phylum) + ggtitle("Skalanes Farm Mound Phylum")

p + facet_wrap(~Kingdom) + ggtitle("Skalanes Farm Mound Kingdom") #better, still terrible coloring

## to try in the future - make the axis a log scale

SFM.NMDS.bray <- ordinate(SFM, "NMDS", "bray")
SFM.PCoA.bray <- ordinate(SFM, "PCoA", "bray")
SFM.PCoA.u <- ordinate(SFM, "PCoA", "uunifrac") ## no phylogeny, won't work
SFM.PCoA.jac <- ordinate(SFM, "PCoA", "jaccard")


# Warning message:
# In metaMDS(veganifyOTU(physeq), distance, ...) :
#  stress is (nearly) zero: you may have insufficient data

p1 = plot_ordination(SFM, SFM.NMDS.bray, type="taxa", color="Phylum", title="Ordination NMDS Bray")
print(p1)

p1 + facet_wrap(~Phylum, 3) # looks WEIRD! 

p1.1 = plot_ordination(SFM, SFM.PCoA.bray, type="taxa", color="Phylum", title="Ordination PCoA Bray")
print(p1.1)

p1.2 = plot_ordination(SFM, SFM.PCoA.jac, type="taxa", color="Phylum", title="Ordination PCoA Jaccard")
print(p1.2)

p2 = plot_ordination(SFM, SFM.ord, type="era2", color="era2", shape="era2") + 
  geom_point(size=5) + ggtitle("Skalanes Farm Mound Bracken 0.10 NMDS-bray")
print(p2)

#Warning message:
#  In plot_ordination(SFM, SFM.ord, type = "large_categories", color = "large_categories",  :
#                       type argument not supported. `type` set to 'samples'.
#                     See `plot_ordination('list')`

################### LOOPING THROUGH ALL ORDINATION METHODS
# In this section I loop through different method parameter options to the 
# plot_ordination function, store the plot results in a list, and then plot 
# these results in a combined graphic using ggplot2.

dist = "bray"
ord_meths = c("DCA", "CCA", "RDA", "DPCoA", "NMDS", "MDS", "PCoA")
plist = llply(as.list(ord_meths), function(i, psS, dist){
  ordi = ordinate(psS, method=i, distance=dist)
  plot_ordination(psS, ordi, "era", color="era")
}, psS, dist)

#Warning messages:
#  1: In plot_ordination(psS, ordi, "era", color = "era") :
#  type argument not supported. `type` set to 'samples'.
#See `plot_ordination('list')`
#2: In scores(ordination, choices = axes, display = "species", physeq = physeq) :
#  restarting interrupted promise evaluation
#3: In plot_ordination(psS, ordi, "era", color = "era") :
#  Could not obtain coordinates from the provided `ordination`. 
#Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.

################### MDS ("PCoA") on Unifrac Distances
# Use the ordinate function to simultaneously perform weightd UniFrac and then 
# perform a Principal Coordinate Analysis on that distance matrix (first line). 
# Next pass that data and the ordination results to plot_ordination to create 
# the ggplot2 output graphic with default ggplot2 settings.

ordu = ordinate(psS, "PCoA", "unifrac", weighted=TRUE)
# Warning message:
#In matrix(tree$edge[order(tree$edge[, 1]), ][, 2], byrow = TRUE,  :
#            data length [104505] is not a sub-multiple or multiple of the number of rows [52253]

p = plot_ordination(psS, ordu, color="era", shape="era") + 
  geom_point(size=5) + 
  ggtitle("MDS/PCoA on weighted-UniFrac distance, Untrimmed SILVA")
print(p)

###############################################################################
################## ALPHA DIVERSITY GRAPHICS: Skalanes Farm Mound ##############
###############################################################################

# I know it is tempting to trim noise right away, but many richness estimates 
# are modeled on singletons and doubletons in the abundance data. You need to 
# leave them in the dataset if you want a meaningful estimate.

AD_all <- plot_richness(SFM, color="time", title="Skalanes Farm Mound Alpha Diversity Metrics")

print(AD_all)
## Warning message:
#In estimate_richness(physeq, split = TRUE, measures = measures) :
#  The data you have provided does not have
#any singletons. This is highly suspicious. Results of richness
#estimates (for example) are probably unreliable, or wrong, if you have already
#trimmed low-abundance taxa from the data.

#We recommended that you find the un-trimmed data and retry.

#But it seems people are using Bracken to calculate Alpha and Beta diversity?

# We can specify a sample variable on which to group/organize samples along the horizontal (x) axis. 

AD_by_time <- plot_richness(SFM, x="time", title="Skalanes Farm Mound Alpha Diversity", measures=c("Chao1", "Shannon", "Simpson"))
print(AD_by_time)
## WARNING: We recommended that you find the un-trimmed data and retry.

###############################################################################
################## HEATMAPS: Skalanes Farm Mound ##############################
###############################################################################

SFM_Bac <- subset_taxa(SFM, Kingdom=="k__Bacteria")
plot_heatmap(SFM_Bac, title = "Skalanes Farm Mound All Bacteria")
# can prune to the top X amount 
SFM_Bac_300 <- prune_taxa(names(sort(taxa_sums(SFM_Bac), TRUE)[1:300]), SFM_Bac)
plot_heatmap(SFM_Bac_300, title = "Skalanes Farm Mound Top 300 Bacteria")

SFM_Euk <- subset_taxa(SFM, Kingdom=="k__Eukaryota")
plot_heatmap(SFM_Euk, title = "Skalanes Farm Mound All Eukaryote")

SFM_Euk_300 <- prune_taxa(names(sort(taxa_sums(SFM_Euk), TRUE)[1:300]), SFM_Euk)
plot_heatmap(SFM_Euk_300, title = "Skalanes Farm Mound Top 300 Eukaryotes")

SFM_Euk_50 <- prune_taxa(names(sort(taxa_sums(SFM_Euk), TRUE)[1:50]), SFM_Euk)
plot_heatmap(SFM_Euk_50, title = "Skalanes Farm Mound Top 50 Eukaryotes")

SFM_Euk_mamm <- subset_taxa(SFM_Euk, Class=="c__Mammalia")
plot_heatmap(SFM_Euk_mamm, title = "Skalanes Farm Mound Mammals")

SFM_Vir <- subset_taxa(SFM, Kingdom=="k__Viruses")
plot_heatmap(SFM_Vir, title = "Skalanes Farm Mound All Viruses")

# the below code didn't work
SFM_Arc <- subset_taxa(SFM, Kingdom=="k__Archaea")
plot_heatmap(SFM_Arc, title = "Skalanes Farm Mound All Archaea")


###############################################################################
################## plot_net: Skalanes Farm Mound ##############################
###############################################################################

# There is a random aspect to some of the network layout methods. 
# For complete reproducibility of the images produced later in this tutorial, 
# it is possible to set the random number generator seed explicitly:

set.seed(711L)

plot_net(SFM, maxdist = 0.4, color = "era2")

plot_net(SFM_Euk, maxdist = 0.4, color = "era2")

plot_net(SFM_Bac, maxdist = 0.4, color = "era2")

plot_net(SFM_Arc, maxdist = 0.4, color = "era2")

plot_net(SFM_Vir, maxdist = 0.4, color = "era2")