-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile_ChEMBL
83 lines (65 loc) · 2.05 KB
/
Snakefile_ChEMBL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from chembl_db import create_sdf
from snakemake.logging import logger
from tests import exec_tests_for_chunk
##############
# Parameters #
##############
# Total number of compounds in the ChEMBL database, obtained with:
#   SELECT COUNT(*) FROM compound_structures
NUM_MOLFILES = 2_304_875
# Upper bound on the number of compounds assigned to a single chunk
MAX_MOLFILES_PER_CHUNK = 10_000
# How many chunks the compounds are split into: floor division plus one, so
# a trailing remainder still gets its own chunk
CHUNKS = 1 + NUM_MOLFILES // MAX_MOLFILES_PER_CHUNK
# Compounds handled per chunk: the total spread evenly over CHUNKS, plus one
# so that CHUNKS * CHUNK_SIZE is never smaller than NUM_MOLFILES
CHUNK_SIZE = 1 + NUM_MOLFILES // CHUNKS
# Report the chunking parameters once, when the workflow starts
onstart:
    # One log record per line, in the same order as the parameters are defined
    for message in (
        "Parameters:",
        f"NUM_MOLFILES={NUM_MOLFILES}",
        f"MAX_MOLFILES_PER_CHUNK={MAX_MOLFILES_PER_CHUNK}",
        f"CHUNKS={CHUNKS}",
        f"CHUNK_SIZE={CHUNK_SIZE}",
    ):
        logger.info(message)
###################
# Snakemake rules #
###################
# Downloads the ChEMBL 31 SQLite archive from the EBI FTP server.
# --fail makes curl exit non-zero on an HTTP error (e.g. 404/503) instead of
# saving the server's error page as the tarball and reporting success, so a
# broken download fails this rule rather than the later extraction step.
# --location follows redirects should the server issue one.
rule download_chembl_sqlite:
    output:
        "chembl_31_sqlite.tar.gz"
    shell:
        "curl --fail --location https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_31/chembl_31_sqlite.tar.gz --output {output}"
# Unpacks the downloaded archive into the working directory; the SQLite
# database file is expected at the declared output path afterwards
rule extract_chembl_sqlite:
    input:
        archive = "chembl_31_sqlite.tar.gz"
    output:
        db = "chembl_31/chembl_31_sqlite/chembl_31.db"
    shell:
        "tar -xzf {input}"
# Writes one sdf file of V3000 molfiles for the chunk named by the {chunk}
# wildcard. Chunk k is assigned the range
#   (k - 1) * CHUNK_SIZE + 1  ..  k * CHUNK_SIZE
# NOTE(review): how create_sdf interprets these bounds (inclusive or not,
# row number or id) is not visible here; confirm against chembl_db.create_sdf
rule molfiles_from_db_for_chunk:
    input:
        db = "chembl_31/chembl_31_sqlite/chembl_31.db"
    output:
        sdf = "chembl_31/molfiles/{chunk}.sdf"
    params:
        # Chunk numbers are 1-based, hence the "- 1" before scaling
        chunk_start = lambda wildcards: CHUNK_SIZE * (int(wildcards.chunk) - 1) + 1,
        chunk_end = lambda wildcards: CHUNK_SIZE * int(wildcards.chunk)
    run:
        create_sdf(input.db, output.sdf, params.chunk_start, params.chunk_end)
# Runs the tests on one chunk's sdf file and writes the results to a
# per-chunk csv file
rule tests_for_chunk:
    input:
        sdf = "chembl_31/molfiles/{chunk}.sdf"
    output:
        csv = "chembl_31/csv/{chunk}.csv"
    run:
        # True only for chunk number 1.
        # NOTE(review): presumably tells the helper to emit the csv header
        # once; confirm against tests.exec_tests_for_chunk
        is_first_chunk = int(wildcards.chunk) == 1
        exec_tests_for_chunk(input.sdf, output.csv, is_first_chunk)
# Concatenates the per-chunk csv files into a single snapshot file.
# The command uses the declared inputs instead of globbing the csv directory,
# so only the files this rule depends on are merged: a stale csv left over
# from an earlier run with a different chunk count can no longer leak in.
# expand() over range(1, CHUNKS + 1) already lists the files in ascending
# numeric order (chunk 1 first), so no GNU-specific `ls -v` sorting is needed.
# The output is written with ">" so it always holds exactly the current inputs.
rule aggregate_logfiles:
    input:
        expand("chembl_31/csv/{chunk}.csv", chunk=range(1, CHUNKS + 1))
    output:
        "snapshot/chembl_snapshot.csv"
    shell:
        "cat {input} > {output}"
# Target rule with no action of its own: requesting it makes Snakemake
# build the aggregated snapshot csv file
rule snapshot:
    input:
        "snapshot/chembl_snapshot.csv"