### R code from vignette source 'ClusteringSequences.Rnw' ################################################### ### code chunk number 1: ClusteringSequences.Rnw:51-56 ################################################### options(continue=" ") options(width=80) options(SweaveHooks=list(fig=function() par(mar=c(4.1, 4.1, 0.3, 0.1)))) set.seed(123) ################################################### ### code chunk number 2: startup ################################################### library(DECIPHER) ################################################### ### code chunk number 3: expr1 ################################################### # specify the path to your file of sequences: fas <- "<>" # OR use the example DNA sequences: fas <- system.file("extdata", "50S_ribosomal_protein_L2.fas", package="DECIPHER") # read the sequences into memory dna <- readDNAStringSet(fas) dna ################################################### ### code chunk number 4: expr2 ################################################### aa <- translate(dna) aa seqs <- aa # could also cluster the nucleotides ################################################### ### code chunk number 5: expr3 ################################################### clusters <- Clusterize(seqs, cutoff=seq(0.5, 0, -0.1), processors=1) class(clusters) colnames(clusters) str(clusters) apply(clusters, 2, max) # number of clusters per cutoff ################################################### ### code chunk number 6: expr4 ################################################### o <- order(clusters[[2]], width(seqs), decreasing=TRUE) # 40% cutoff o <- o[!duplicated(clusters[[2]])] aa[o] dna[o] ################################################### ### code chunk number 7: expr5 ################################################### t <- table(clusters[[1]]) # select the clusters at a cutoff t <- sort(t, decreasing=TRUE) head(t) w <- which(clusters[[1]] == names(t[1])) AlignSeqs(seqs[w], verbose=FALSE) ################################################### ### code chunk number 8: expr6 ################################################### getOption("SweaveHooks")[["fig"]]() aligned_seqs <- AlignSeqs(seqs, verbose=FALSE) d <- DistanceMatrix(aligned_seqs, verbose=FALSE) tree <- TreeLine(myDistMatrix=d, method="UPGMA", verbose=FALSE) heatmap(as.matrix(clusters), scale="column", Colv=NA, Rowv=tree) ################################################### ### code chunk number 9: expr7 ################################################### c1 <- Clusterize(dna, cutoff=0.2, invertCenters=TRUE, processors=1) w <- which(c1 < 0 & !duplicated(c1)) dna[w] # select cluster representatives (negative cluster numbers) ################################################### ### code chunk number 10: expr8 ################################################### c2 <- Clusterize(dna, cutoff=0.2, singleLinkage=TRUE, processors=1) max(abs(c1)) # center-linkage max(c2) # single-linkage (fewer clusters, but broader clusters) ################################################### ### code chunk number 11: expr9 ################################################### getOption("SweaveHooks")[["fig"]]() genus <- sapply(strsplit(names(dna), " "), `[`, 1) t <- table(genus, c2[[1]]) heatmap(sqrt(t), scale="none", Rowv=NA, col=hcl.colors(100)) ################################################### ### code chunk number 12: expr10 ################################################### set.seed(123) # initialize the random number generator clusters <- Clusterize(seqs, cutoff=0.1, processors=1) set.seed(NULL) # reset the seed ################################################### ### code chunk number 13: sessinfo ################################################### toLatex(sessionInfo(), locale=FALSE)