-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake.job.segments.R
executable file
·72 lines (54 loc) · 1.86 KB
/
make.job.segments.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env Rscript
argv <- commandArgs(trailingOnly = TRUE)
if(length(argv) < 2) {
q()
}
gene.file <- argv[1] # e.g., gene.file = 'data/coding.genes.txt.gz'
out.file <- argv[2] # e.g., 'jobs.txt'
options(stringsAsFactors = FALSE)
source('Util.R')
library(dplyr)
library(readr)
gene.cols <- c('chr', 'lb', 'ub', 'strand', 'ensg', 'hgnc')
gene.types <- 'ciiccc_'
## 1. assign genes into blocks -- never change the order
genes <-
read_tsv(gene.file, col_names = gene.cols, col_types = gene.types) %>%
mutate(loc = if_else(strand == '+', lb, ub)) %>%
mutate(data.pos = 1:n())
block.size0 <- 1e8
max.size <- 50
genes.blk <-
genes %>%
mutate(block = floor(loc/block.size0), depth = 0) %>%
select(chr, loc, block, data.pos, depth)
big.blocks <- genes.blk %>%
group_by(chr, block) %>%
summarize(n.genes = n()) %>%
filter(n.genes > max.size) %>%
select(chr, block, n.genes)
## 2. break large blocks into smaller chunks
while(TRUE) {
big.blocks <- genes.blk %>%
group_by(chr, block) %>%
summarize(n.genes = n()) %>%
filter(n.genes > max.size) %>%
select(chr, block)
if(nrow(big.blocks) == 0) break
ret1 <- genes.blk %>%
anti_join(big.blocks)
ret2 <-
genes.blk %>% right_join(big.blocks) %>%
mutate(depth = depth + 1) %>%
mutate(block = floor(loc / block.size0 * 2^depth)/2^depth)
genes.blk <- bind_rows(ret1, ret2) %>%
arrange(data.pos)
}
ret <- genes %>%
left_join(genes.blk) %>%
group_by(chr, block) %>%
summarize(lb = min(data.pos), ub = max(data.pos)) %>%
arrange(lb) %>%
select(chr, lb, ub)
write.table(ret, file = out.file, quote = FALSE, sep = '\t',
col.names = FALSE, row.names = FALSE)