# Snakefile_ChEMBL

from chembl_db import create_sdf
from snakemake.logging import logger
from tests import exec_tests_for_chunk

##############
# Parameters #
##############

# Total compound count in the database, obtained with the query
#   SELECT COUNT(*) FROM compound_structures
NUM_MOLFILES = 2_304_875

# Upper bound on the number of compounds in a single chunk
MAX_MOLFILES_PER_CHUNK = 10_000

# Chunk count: one more than the number of completely filled chunks
CHUNKS = 1 + NUM_MOLFILES // MAX_MOLFILES_PER_CHUNK

# Compounds processed iteratively per chunk; sized so that CHUNKS chunks
# of this size cover all NUM_MOLFILES compounds
CHUNK_SIZE = 1 + NUM_MOLFILES // CHUNKS

# Report the chunking parameters once, when the workflow starts
onstart:
  logger.info("Parameters:")
  for parameter_line in (
    f"NUM_MOLFILES={NUM_MOLFILES}",
    f"MAX_MOLFILES_PER_CHUNK={MAX_MOLFILES_PER_CHUNK}",
    f"CHUNKS={CHUNKS}",
    f"CHUNK_SIZE={CHUNK_SIZE}",
  ):
    logger.info(parameter_line)


###################
# Snakemake rules #
###################
# Downloads the ChEMBL 31 SQLite tarball from ftp.ebi.ac.uk.
# --fail makes curl exit non-zero on an HTTP error (instead of saving the
# server's error page as the tarball and reporting success), so the job is
# marked as failed; --location follows redirects.
rule download_chembl_sqlite:
  output:
    "chembl_31_sqlite.tar.gz"
  shell:
    "curl --fail --location https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_31/chembl_31_sqlite.tar.gz -o {output}"


# Unpacks the downloaded tarball with tar.
# No target directory is passed, so tar extracts relative to the directory
# the command runs in.
# NOTE(review): the declared output path is presumably the location of the
# database file inside the archive -- confirm against the tarball layout.
rule extract_chembl_sqlite:
  input:
    "chembl_31_sqlite.tar.gz"
  output:
    "chembl_31/chembl_31_sqlite/chembl_31.db"
  shell:
    "tar -xzf {input}"


# Creates a sdf file with V3000 molfiles for the given chunk number.
# Chunk numbers are 1-based: chunk k is passed the inclusive range
# [(k - 1) * CHUNK_SIZE + 1, k * CHUNK_SIZE] as chunk_start / chunk_end.
rule molfiles_from_db_for_chunk:
  input:
    "chembl_31/chembl_31_sqlite/chembl_31.db"
  output:
    "chembl_31/molfiles/{chunk}.sdf"
  # Only numeric chunk names may match this rule; otherwise any requested
  # "*.sdf" target (even one containing "/") would match and fail with a
  # ValueError in the int() conversions below.
  wildcard_constraints:
    chunk = r"\d+"
  params:
    chunk_start = lambda wildcards: (int(wildcards.chunk) - 1) * CHUNK_SIZE + 1,
    chunk_end = lambda wildcards: int(wildcards.chunk) * CHUNK_SIZE
  run:
    create_sdf(input[0], output[0], params.chunk_start, params.chunk_end)


# Calls exec_tests_for_chunk on the sdf file of one chunk, passing the csv
# output path and a flag that is True only for chunk number 1.
rule tests_for_chunk:
  input:
    "chembl_31/molfiles/{chunk}.sdf"
  output:
    "chembl_31/csv/{chunk}.csv"
  run:
    sdf_path, csv_path = input[0], output[0]
    is_first_chunk = int(wildcards.chunk) == 1
    exec_tests_for_chunk(sdf_path, csv_path, is_first_chunk)


# Concatenates the csv files of all chunks into one snapshot file.
# Only the declared inputs are concatenated (not whatever happens to be in
# the csv directory), in the order produced by expand(), i.e. ascending
# chunk number starting with chunk 1. The output is overwritten rather than
# appended to, so the result does not depend on a pre-existing file.
rule aggregate_logfiles:
  input:
    expand("chembl_31/csv/{chunk}.csv", chunk=range(1, CHUNKS + 1))
  output:
    "snapshot/chembl_snapshot.csv"
  shell:
    "cat {input} > {output}"


# Target rule: it declares no output and no action of its own and only
# requests the aggregated snapshot csv file as input.
rule snapshot:
  input:
    "snapshot/chembl_snapshot.csv"