-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTaxonomic-Binning.R
274 lines (204 loc) · 11 KB
/
Taxonomic-Binning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
##################################################################################
###### Main Script ######
##################################################################################
################### Read input table ####################
# Load the tab-delimited file containing the abundances and taxonomic information to be checked (rownames in the first column)
otu_table <- read.table (otu_file,
check.names = FALSE,
header = TRUE,
dec = ".",
sep = "\t",
row.names = 1,
comment.char = "")
# Clean table from empty lines
otu_table <- otu_table[!apply(is.na(otu_table) | otu_table=="",1,all),]
# Create a dataframe with a number of rows identical to the number of OTUs in the dataset
taxonomy <- otu_table[,dim(otu_table)[2]]
# Test if the taxonomy column is in the correct format (delimited by semicolon)
if(all(grepl("(?:[^;]*;){6}", taxonomy))==FALSE) {
#Send error message if taxonomy is not in the right format
stop("Wrong number of taxonomic classes\n
Taxonomic levels have to be separated by semicolons (six in total).
IMPORTANT: if taxonomic information at any level is missing, the semicolons are still needed:\n
e.g.Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;
e.g.Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Prevotellaceae;;")
} else {
# Delete the taxonomy row from the OTU-table
otuFile <- otu_table[,c(1:dim(otu_table)[2] - 1)]
# Initialize empty dataframe
taxonomy_new <- NULL
for (i in 1:dim(otu_table)[1]) {
# Split taxonomic information in its taxonomic classes
# Kingdom - Phylum - Class - Family - Order - Genus
splitTax <- strsplit(x = as.character(taxonomy[i]), ";")
# Save the position where the first empty string (sequence of characters) occurs
value <- which(splitTax[[1]] == "")[1]
# Save the last known taxa information
lastTaxa = splitTax[[1]][value - 1]
# Replace all empty values by the last taxa information and the prefix "unkown_"
splitTax <-replace(splitTax[[1]],splitTax[[1]] == "",paste("unknown_",lastTaxa))
# Write new taxonomic information in the dataframe
taxonomy_new[i] <- list(splitTax)
}
# Adjust dataframe with modified taxonomic information
taxonomy_new <- t(as.data.frame(taxonomy_new))
row.names(taxonomy_new) <- row.names(otuFile)
# Add level information to all taxonomies
# For taxonomies related to kingdom level
taxonomy_new[,1] <- sub("^","k__",taxonomy_new[,1])
# For taxonomies related to phylum level
taxonomy_new[,2] <- sub("^","p__",taxonomy_new[,2])
# For taxonomies related to class level
taxonomy_new[,3] <- sub("^","c__",taxonomy_new[,3])
# For taxonomies related to order level
taxonomy_new[,4] <- sub("^","o__",taxonomy_new[,4])
# For taxonomies related to family level
taxonomy_new[,5] <- sub("^","f__",taxonomy_new[,5])
# For taxonomies related to genus level
taxonomy_new[,6] <- sub("^","g__",taxonomy_new[,6])
#################################################################################
# Create list with taxonomic information for each taxonomy level
class_list <-
list(
unique(taxonomy_new[,1]),unique(taxonomy_new[,2]),
unique(taxonomy_new[,3]), unique(taxonomy_new[,4]),
unique(taxonomy_new[,5]),unique(taxonomy_new[,6])
)
# Clone the created list for further processing
sample_list <- class_list
list_length <- NULL
# Iterate through all six taxonomy levels
for (a in 1:6) {
lis <- lapply(class_list[a], lapply, length)
names(lis)<-lapply(class_list[a],length)
# Individual number of taxonomies for each taxonomic level
num_taxa <- as.integer(names(lis))
list_length[a] <- num_taxa
# Iterate through taxonomic class specific taxonomies
for (b in 1:num_taxa) {
# Initialize list with the value zero for all taxonomies
sample_list[[a]][[b]] <- list(rep.int(0,dim(otuFile)[2]))
}
}
#################################################################################
#################################################################################
# Save relative abundances of all samples for each taxonomy
# Iterate through all OTUs
for (i in 1:dim(otu_table)[1]) {
# Iterate through all taxonomic levels
for (m in 1:6) {
# List of m-th taxonomies of i-th taxonomic levels (e.g. m = Kingdom, i = 4th OTU -> Clostridiales)
taxa_in_list <- list(taxonomy_new[i,])[[1]][m]
# Record the current position in a list
position <- which(class_list[[m]] == taxa_in_list)
# All rows with taxonomic information of n-th sample
matrix <- data.matrix(otuFile)
sub_sample_tax <-(subset(matrix,taxonomy_new[,m] == taxa_in_list))
# Get the actual value out of the list (initialized with zero)
temp <- unlist(sample_list[[m]][[position]])
# Calculate the summed up relative abundances for the particular taxonomic class for n-th sample
temp <- colSums(sub_sample_tax)
# Replace values by new summed values
sample_list[[m]][[position]] <- list(temp)
}
}
#################################################################################
###### Write output ######
#################################################################################
# Generate tables for each taxonomic class
##Kingdom table
# Create table with taxonomic information (kingdom level)
kingdom <- matrix(unlist(sample_list[[1]]),nrow = dim(otuFile)[2],ncol = list_length[1],dimnames = list(names(otuFile),unlist(class_list[[1]])))
kingdom <- (t(kingdom))
##Phylum table
# Create table with taxonomic information (phylum level)
phyla <-matrix(unlist(sample_list[[2]]),nrow = dim(otuFile)[2],ncol = list_length[2],dimnames = list(names(otuFile),unlist(class_list[[2]])))
phyla <- (t(phyla))
# Order table according to taxonomic name (descending)
phyla <- phyla[order(row.names(phyla)),]
## Class table
# Create table with taxonomic information (class level)
classes <- matrix(unlist(sample_list[[3]]), nrow = dim(otuFile)[2], ncol = list_length[3], dimnames = list(names(otuFile),unlist(class_list[[3]])))
classes <- (t(classes))
# Order dataframe according to taxonomic name (descending)
classes <- classes[order(row.names(classes)),]
## Orders
# create table with taxonomic information (Order)
orders <-matrix(unlist(sample_list[[4]]),nrow = dim(otuFile)[2],ncol = list_length[4],dimnames = list(names(otuFile),unlist(class_list[[4]])))
orders <- (t(orders))
# Order dataframe according to taxonomic name (descending)
orders <- orders[order(row.names(orders)),]
## Family table
# Create table with taxonomic information (family level)
families <-matrix(unlist(sample_list[[5]]),nrow = dim(otuFile)[2],ncol = list_length[5],dimnames = list(names(otuFile),unlist(class_list[[5]])))
families <- (t(families))
# Order dataframe according to taxonomic name (descending)
families <- families[order(row.names(families)),]
## Genus level
# Create table with taxonomic information (generum level)
genera <- matrix(unlist(sample_list[[6]]),nrow = dim(otuFile)[2],ncol = list_length[6],dimnames = list(names(otuFile),unlist(class_list[[6]])))
genera <- (t(genera))
# Order dataframe according to taxonomic name (descending)
genera <- genera[order(row.names(genera)),]
# Merge all dataframes
tax_summary <-rbind.data.frame(kingdom,phyla,classes,orders,families,genera)
# Identify duplicates and remove them
tax_summary <- tax_summary[!duplicated(row.names(tax_summary)),]
################################################################################
###### Write Output Files ######
#################################################################################
# Create a directory
dir.create("Taxonomic-Binning")
# Set path for all outputs to the new directory
setwd("Taxonomic-Binning")
# Write output files for taxonomic composition of every sample
write.table(kingdom,"0.Kingdom.all.tab",sep = "\t",col.names = NA)
write.table(phyla,"1.Phyla.all.tab",sep = "\t",col.names = NA)
write.table(classes,"2.Classes.all.tab",sep = "\t",col.names = NA)
write.table(orders,"3.Orders.all.tab",sep = "\t",col.names = NA)
write.table(families,"4.Families.all.tab",sep = "\t",col.names = NA)
write.table(genera,"5.Genera.all.tab",sep = "\t",col.names = NA)
write.table(tax_summary,"tax.summary.all.tab",sep = "\t",col.names = NA)
suppressWarnings (try(write.table(tax_summary, "../../5.Serial-Group-Comparisons/tax.summary.all.tab", sep ="\t",col.names = NA, quote = FALSE), silent =TRUE))
#################################################################################
###### Write Graphical Output ######
#################################################################################
pdf("taxonomic-overview.pdf")
par(xpd=T, mar=par()$mar+c(0,0,0,9))
#Kingdom
#k_col=distinctColorPalette(dim(kingdom)[1])
k_col=rainbow(dim(kingdom)[1])
k_col=sample(k_col)
barplot(kingdom,col=k_col, cex.names=0.5, ylab="cumulative relative abundance (%)", las=2, main="Taxonomic binning at Kingdom level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(kingdom)),cex=0.7,col = rev(k_col),pch = 16,pt.cex = 1.2)
#Phyla
#p_col=distinctColorPalette(dim(phyla)[1])
p_col=rainbow(dim(phyla)[1])
p_col=sample(p_col)
barplot(phyla,col=p_col, cex.names=0.5,ylab="cumulative relative abundance (%)",las=2, main="Taxonomic binning at Phyla level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(phyla)),cex=0.7,col = rev(p_col),pch = 16,pt.cex = 1.2)
#Classes
c_col=rainbow(dim(classes)[1])
c_col=sample(c_col)
barplot(classes,col=c_col, cex.names=0.5,ylab="cumulative relative abundance (%)",las=2, main="Taxonomic binning at Class level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(classes)),cex=0.7,col = rev(c_col),pch = 16,pt.cex = 1.2)
#Orders
o_col=rainbow(dim(orders)[1])
o_col=sample(o_col)
barplot(orders,col=o_col, cex.names=0.5,ylab="cumulative relative abundance (%)",las=2, main="Taxonomic binning at Order level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(orders)),cex=0.7,col = rev(o_col),pch = 16,pt.cex = 1.2)
#Families
f_col=rainbow(dim(families)[1])
f_col=sample(f_col)
barplot(families,col=f_col, cex.names=0.5,ylab="cumulative relative abundance (%)",las=2, main="Taxonomic binning at Family level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(families)),cex=0.7,col = rev(f_col),pch = 16,pt.cex = 1.2)
#Genera
g_col=rainbow(dim(genera)[1])
g_col=sample(g_col)
barplot(genera,col=g_col, cex.names=0.5,ylab="cumulative relative abundance (%)",las=2, main="Taxonomic binning at Genus level")
legend(par('usr')[2], par('usr')[4], bty='n',rev(row.names(genera)),cex=0.7,col = rev(g_col),pch = 16,pt.cex = 1.2)
dev.off()
}
#################################################################################
###### End of Script ######
#################################################################################