-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpatients.R
executable file
·66 lines (53 loc) · 1.66 KB
/
patients.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env Rscript
args <- commandArgs(trailingOnly=TRUE)
if(length(args) != 3) {
stop("Must provide two bgzip-tabixed vcfs, annotated with dbSnp in ID field, followed by an output prefix")
}
print(args)
library('VariantAnnotation')
file <- args[[1]]
patientID <- gsub(".dbSnp.vcf.gz","",file)
#############################################
### READ VCF
vcf <- readVcf(
TabixFile(file),
"hg19",
ScanVcfParam(
info=NA,
geno=c("FREQ","DP")
)
)
#############################################
### extract allele frequencies for dbSnps
### where depth in all samples is greater than 10
### assumes 5 samples!!
freq <- geno(vcf)[["FREQ"]][grepl("rs",rownames(vcf)) & apply(geno(vcf)[["DP"]]>10,1, sum, na.rm=T)==5,]
colnames(freq) <- paste0(patientID, c("b","n1","n2","t1","t2"))
#############################################
file <- args[[2]]
patientID <- gsub(".dbSnp.vcf.gz","",file)
vcf <- readVcf(
TabixFile(file),
"hg19",
ScanVcfParam(
info=NA,
geno=c("FREQ","DP")
)
)
freq2 <- geno(vcf)[["FREQ"]][grepl("rs",rownames(vcf)) & apply(geno(vcf)[["DP"]]>10,1, sum, na.rm=T)==5,]
colnames(freq2) <- paste0(patientID, c("b","n1","n2","t1","t2"))
rm(vcf)
#############################################
### MERGE ALLELE FREQUENCY MATRICES
freqm <- merge(freq,freq2,by="row.names",all.x=T)
row.names(freqm) <- freqm$Row.names
freqm$Row.names <- NULL
freqm[is.na(freqm)] <- 0
#############################################
### HIERARCHICAL CLUSTERING
hc = hclust(as.dist(1-cor(freqm)))
output_prefix <- args[[3]]
write.table(cor(freqm), file=paste0(output_prefix,".txt"), sep="\t", quote=F)
pdf(paste0(output_prefix,".pdf"))
plot(hc, hang=-1)
dev.off()