From 4251f895e59272a768c4aa3a29864f9f133bf2d5 Mon Sep 17 00:00:00 2001
From: Narges Rezaie <nargesrezaie80@gmail.com>
Date: Wed, 14 Aug 2024 15:28:08 -0700
Subject: [PATCH] complete snakemake workflow

---
 setup.py                                      |  4 +-
 workflow/snakemake/README.md                  | 44 ++++++++------
 workflow/snakemake/config/config.yaml         | 10 ++--
 workflow/snakemake/workflow/Snakefile         | 23 ++++++--
 .../snakemake/workflow/rules/topModel.smk     | 59 +++++++++++--------
 workflow/snakemake/workflow/rules/train.smk   | 28 +++++----
 .../workflow/scripts/make_topmodel.py         |  4 +-
 .../snakemake/workflow/scripts/make_train.py  | 10 +++-
 8 files changed, 111 insertions(+), 71 deletions(-)

diff --git a/setup.py b/setup.py
index 0846336..9ebd290 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name='Topyfic',  # the name of your package
     packages=['Topyfic'],  # same as above
-    version='v0.4.13',  # version number
+    version='v0.4.15',  # version number
     license='MIT',  # license type
     description='Topyfic is a Python package designed to identify reproducible latent dirichlet allocation (LDA) '
                 'using leiden clustering and harmony for single cell epigenomics data',
@@ -11,7 +11,7 @@
     author='Narges Rezaie',  # your name
     author_email='nargesrezaie80@gmail.com',  # your email
     url='https://github.com/mortazavilab/Topyfic',  # url to your git repo
-    download_url='https://github.com/mortazavilab/Topyfic/archive/refs/tags/v0.4.13.tar.gz',  # link to the tar.gz file associated with this release
+    download_url='https://github.com/mortazavilab/Topyfic/archive/refs/tags/v0.4.15.tar.gz',  # link to the tar.gz file associated with this release
     keywords=['Cellular Programs', 'Latent Dirichlet allocation', 'single-cell multiome', 'single-cell RNA-seq',
               'gene regulatory network', 'Topic Modeling', 'single-nucleus RNA-seq'],
     # python_requires='>=3.9',
diff --git a/workflow/snakemake/README.md b/workflow/snakemake/README.md
index d473233..a7297b3 100644
--- a/workflow/snakemake/README.md
+++ b/workflow/snakemake/README.md
@@ -2,10 +2,11 @@
 
 This directory contains a Snakemake pipeline for running Topyfic automatically.
 
-The snakemake will run training and building model (topModel).
+The Snakemake pipeline will run training (Train) and model building (topModel, Analysis).
 
 **Note**: Please make sure to install the necessary packages and set up Snakemake appropriately.
-**Note**: pipeline is tested for >= 8.*
+
+**Note**: The pipeline is tested with Snakemake >= 8.x ([more info](https://snakemake.readthedocs.io/en/stable/index.html))
 
 ## Getting started
 
@@ -31,55 +32,64 @@ Modify the [config file](config/config.yaml) or create a new one with the same s
 
 3. **n_topics**
    - Contains the list of initial numbers of topics you wish to train models with
-   - list of int: [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
+   - list of int: `[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]`
 
 4. **organism**
    - Indicates the species which will be used for downstream analysis
    - Example: human or mouse
 
-5. **train**
+5. **workdir**
+   - Directory where the outputs will be written
+   - Make sure you have write access.
+   - One folder will be created per dataset.
+
+6. **train**
    - Most of the items are inputs of `train_model()`
    - n_runs: number of runs used to define the rLDA model (default: 100)
    - random_states: list of random states used to run the LDA models (default: range(n_runs))
-   - workdir: directory of train outputs. make sure to have write access.
 
-6. **top_model**
-   - workdir: directory of topModel outputs. make sure to have write access.
+7. **top_model**
   - n_top_genes (int): Number of highly variable genes to keep (default: 50)
   - resolution (int): A parameter controlling the coarseness of the clustering; higher values lead to more clusters (default: 1)
   - max_iter_harmony (int): Number of iterations for running Harmony (default: 10)
   - min_cell_participation (float): Minimum cell participation required for each topic to be kept; when `None`, topics with cell participation above 1% of the number of cells (#cells / 100) are kept
 
-7. **merging**
-   - workdir: directory of merged outputs. make sure to have write access.
-   - only if you have multiple adata input
+8. **merge**
+   - Indicates whether you also want a single merged model built from all datasets together.
 
### 3. Run snakemake
 
-First run it with -n to make sure the steps that it plans to run are reasonable.
-After it finishes, run the same command without the -n option.
+First run it with `-n` (dry run) to make sure the steps it plans to run are reasonable.
+Once that looks correct, run the same command without the `-n` option.
 
 `snakemake -n`
 
-for slurm:
+For SLURM:
 
 ```
 snakemake \
--j 200 \
---latency-wait 120 \
+-j 1000 \
+--latency-wait 300 \
 --use-conda \
 --rerun-triggers mtime \
 --executor cluster-generic \
 --cluster-generic-submit-cmd \
 "sbatch -A model-ad_lab \
-  --partition=standard \
+  --partition=highmem \
   --cpus-per-task 16 \
   --mail-user=nargesr@uci.edu \
   --mail-type=START,END,FAIL \
   --time=72:00:00" \
--n
+-n \
+-p \
+--verbose
 ```
+
+Development hint: if you run into any errors, `-p --verbose` will give you more detail about each run and help you debug your code.
 
 ### 4. Further downstream analysis
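The README's config walkthrough above maps one-to-one onto the targets that `rule all` requests. As a pre-flight check before submitting to SLURM, a sketch like the following (not part of the patch; it assumes PyYAML is installed and that the config path matches your checkout) prints every file the pipeline is expected to produce:

```python
# Sketch: enumerate the targets `rule all` will request, using only
# keys that appear in config.yaml (names, n_topics, workdir, train, merge).
import yaml

with open("workflow/snakemake/config/config.yaml") as f:
    config = yaml.safe_load(f)

workdir = config["workdir"].rstrip("/")
for name in config["names"]:
    for k in config["n_topics"]:
        for rs in config["train"]["random_states"]:
            print(f"{workdir}/{name}/{k}/train/train_{name}_{k}_{rs}.p")  # per-seed Train
        print(f"{workdir}/{name}/{k}/train/train_{name}_{k}.p")           # combined Train
        print(f"{workdir}/{name}/{k}/topmodel/topModel_{name}_{k}.p")     # topModel
if config.get("merge", False):
    print(f"{workdir}/topModel_{'_'.join(config['names'])}.p")            # merged topModel
```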
diff --git a/workflow/snakemake/config/config.yaml b/workflow/snakemake/config/config.yaml
index d5a1eab..07331d5 100644
--- a/workflow/snakemake/config/config.yaml
+++ b/workflow/snakemake/config/config.yaml
@@ -1,6 +1,6 @@
 n_jobs: 8 # Threads
 
-names: [parse, 10x]
+names: ['parse', '10x']
 
 count_adata:
     parse:
@@ -12,6 +12,8 @@ n_topics: [5, 10] #, 15, 20, 25, 30, 35, 40, 45, 50]
 
 organism: mouse # human or mouse
 
+workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/
+
 train:
     n_runs: 100
     random_states: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
@@ -24,15 +26,11 @@ train:
                     70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
                     80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
                     90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/{name}/{n_topic}/train
 
 top_model:
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/{name}/{n_topic}/topmodel
     n_top_genes: 50
     resolution: 1
     max_iter_harmony: 10
     min_cell_participation: None
 
-# only if you have multiple adata input
-merging:
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results
+merge: True
diff --git a/workflow/snakemake/workflow/Snakefile b/workflow/snakemake/workflow/Snakefile
index 5880104..2e3825c 100644
--- a/workflow/snakemake/workflow/Snakefile
+++ b/workflow/snakemake/workflow/Snakefile
@@ -9,19 +9,34 @@ n_topics = config['n_topics']
 
 include: "rules/train.smk"
 include: "rules/topModel.smk"
 
+
+# ##### make necessary directories #####
+# for name in config['names']:
+#     for n_topic in config['n_topics']:
+#         # Check if the directory exists
+#         if not os.path.exists(f"{config['workdir']}/{name}/{n_topic}/train"):
+#             os.makedirs(f"{config['workdir']}/{name}/{n_topic}/train", mode=0o777)
+#
+#         if not os.path.exists(f"{config['workdir']}/{name}/{n_topic}/topmodel"):
+#             os.makedirs(f"{config['workdir']}/{name}/{n_topic}/topmodel", mode=0o777)
+
 ##### check if merging is necessary #####
 merging = []
-if 'merging' in config:
-    merging = f"{config['merging']['workdir']}/topModel_{'_'.join(config['names'])}.p"
+if config.get('merge', False):
+    merging = f"{config['workdir']}/topModel_{'_'.join(config['names'])}.p"
 
 ##### target rules #####
 rule all:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+               name=config["names"],
+               n_topic=config["n_topics"],
+               random_state=config["train"]["random_states"]),
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
                name=config['names'],
                n_topic=config["n_topics"]),
-        expand(f"{config['top_model']['workdir']}/topModel_{{name}}_{{n_topic}}.p",
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
                name=config['names'],
                n_topic=config["n_topics"]),
         merging,
diff --git a/workflow/snakemake/workflow/rules/topModel.smk b/workflow/snakemake/workflow/rules/topModel.smk
index a0418da..15f2175 100644
--- a/workflow/snakemake/workflow/rules/topModel.smk
+++ b/workflow/snakemake/workflow/rules/topModel.smk
@@ -1,54 +1,61 @@
 import pandas as pd
+import Topyfic
+from scripts import make_topmodel
 
 configfile: 'config/config.yaml'
 
 # Rule to run topModel for one input adata
 rule run_top_model:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
-               name=config["names"],
-               n_topic=config["n_topics"])
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
     output:
-        f"{config['top_model']['workdir']}/topModel_{{name}}_{{n_topic}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics']
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
     run:
         # df, res = make_topmodel.find_best_n_topic(n_topics=config["n_topics"],
         #                                           names=params.name,
         #                                           topmodel_outputs=config['top_model']['workdir'])
        #
         # df.to_csv(f"{config['top_model']['workdir']}/k_N_{params.name}.csv", index=False)
+        train = Topyfic.read_train(f"{config['workdir']}/{params.name}/{params.n_topic}/train/train_{params.name}_{params.n_topic}.p")
 
-        make_topmodel.make_top_model(trains={input},
+        make_topmodel.make_top_model(trains=[train],
                                      adata_paths=config['count_adata'][params.name],
-                                     n_top_genes=config['top_model']['n_top_genes'],
-                                     resolution=config['top_model']['resolution'],
-                                     max_iter_harmony=config['top_model']['max_iter_harmony'],
-                                     min_cell_participation=config['top_model']['min_cell_participation'],
-                                     topmodel_output=config['top_model']['workdir'])
+                                     n_top_genes=int(config['top_model']['n_top_genes']),
+                                     resolution=float(config['top_model']['resolution']),
+                                     max_iter_harmony=int(config['top_model']['max_iter_harmony']),
+                                     min_cell_participation=None if config['top_model']['min_cell_participation'] == "None" else float(config['top_model']['min_cell_participation']),
+                                     topmodel_output=f"{config['workdir']}/{params.name}/{params.n_topic}/topmodel/")
 
 # Rule to run topModel for merging multiple input adatas
-if 'merging' in config:
+if config.get('merge', False):
     rule run_multiple_top_model:
         input:
-            expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
-                   name=config["names"],
-                   n_topic=config["n_topics"])
+            expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
+                   name=config["names"],
+                   n_topic=config["n_topics"]),
+            expand(f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
+                   name=config["names"],
+                   n_topic=config["n_topics"]),
         output:
-            f"{config['merging']['workdir']}/topModel_{'_'.join(config['names'])}.p",
-            f"{config['merging']['workdir']}/analysis_{'_'.join(config['names'])}.p",
-        params:
-            name=lambda wildcards: config['names'],
-            n_topic=lambda wildcards: config['n_topics']
+            f"{config['workdir']}/topModel_{'_'.join(config['names'])}.p",
+            f"{config['workdir']}/analysis_{'_'.join(config['names'])}.p",
         run:
             df, res = make_topmodel.find_best_n_topic(n_topics=config["n_topics"],
                                                       names=config["names"],
-                                                      topmodel_outputs=config['merging']['workdir'])
-            df.to_csv(f"{config['merging']['workdir']}/k_N.csv", index=False)
+                                                      topmodel_outputs=config['workdir'])
+            df.to_csv(f"{config['workdir']}/k_N.csv", index=False)
 
-            pd.DataFrame.from_dict(res, orient='index').to_csv(f"{config['merging']['workdir']}/best_k.csv")
+            pd.DataFrame.from_dict(res, orient='index').to_csv(f"{config['workdir']}/best_k.csv")
 
             adata_paths = []
             for name in config['names']:
@@ -57,12 +64,14 @@ if 'merging' in config:
             trains = []
             for name in res.keys():
                 n_topic = res[name]
-                trains.append(f"{config['train']['workdir']}/train_{name}_{n_topic}.p")
+                train = Topyfic.read_train(f"{config['workdir']}/{name}/{n_topic}/train/train_{name}_{n_topic}.p")
+                trains.append(train)
 
             make_topmodel.make_top_model(trains=trains,
                                          adata_paths=adata_paths,
-                                         n_top_genes=config['top_model']['n_top_genes'],
-                                         resolution=config['top_model']['resolution'],
-                                         max_iter_harmony=config['top_model']['max_iter_harmony'],
-                                         min_cell_participation=config['top_model']['min_cell_participation'],
-                                         topmodel_output=None)
\ No newline at end of file
+                                         n_top_genes=int(config['top_model']['n_top_genes']),
+                                         resolution=float(config['top_model']['resolution']),
+                                         max_iter_harmony=int(config['top_model']['max_iter_harmony']),
+                                         min_cell_participation=None if config['top_model']['min_cell_participation'] == "None" else float(config['top_model']['min_cell_participation']),
+                                         topmodel_output=f"{config['workdir']}")
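Both rules above repeat the same inline conversion of `min_cell_participation`, because an unquoted `None` in YAML is loaded as the string `"None"` rather than Python's `None`. Not part of the patch, but a small helper (hypothetical name, e.g. placed in make_topmodel.py) would keep the rules shorter:

```python
# Hypothetical helper, not in this patch: normalize the YAML value
# `min_cell_participation`, which arrives as the string "None" or a number.
def parse_min_cell_participation(value):
    if value is None or value == "None":
        return None
    return float(value)

# usage inside the rules:
# min_cell_participation=parse_min_cell_participation(config['top_model']['min_cell_participation'])
```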
diff --git a/workflow/snakemake/workflow/rules/train.smk b/workflow/snakemake/workflow/rules/train.smk
index 52bfab0..235d59e 100644
--- a/workflow/snakemake/workflow/rules/train.smk
+++ b/workflow/snakemake/workflow/rules/train.smk
@@ -1,36 +1,38 @@
+from scripts import make_train
+
 configfile: 'config/config.yaml'
 
 # Rule to run make_single_train_model
 rule run_single_train:
     output:
-        f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics'],
-        random_state=lambda wildcards: config["train"]["random_states"]
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
+        random_state=lambda wildcards: wildcards.random_state
     run:
         make_train.make_single_train_model(name=params.name,
                                            adata_path=config['count_adata'][params.name],
-                                           k=params.n_topic,
-                                           random_state=params.random_state,
-                                           train_output=config['train']['workdir'])
+                                           k=int(params.n_topic),
+                                           random_state=int(params.random_state),
+                                           train_output=f"{config['workdir']}/{params.name}/{params.n_topic}/train/")
 
 # Rule to run make_train_model
 rule run_train_model:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}_{{random_state}}.p",
-               name=config["names"],
-               n_topic=config["n_topics"],
-               random_state=config["train"]["random_states"])
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+               random_state=config["train"]["random_states"],
+               allow_missing=True)
     output:
-        f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics'],
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
     run:
         make_train.make_train_model(name=params.name,
                                     adata_path=config['count_adata'][params.name],
-                                    k=params.n_topic,
+                                    k=int(params.n_topic),
                                     n_runs=config['train']['n_runs'],
                                     random_state=config['train']['random_states'],
-                                    train_output=config['train']['workdir'])
\ No newline at end of file
+                                    train_output=f"{config['workdir']}/{params.name}/{params.n_topic}/train/")
diff --git a/workflow/snakemake/workflow/scripts/make_topmodel.py b/workflow/snakemake/workflow/scripts/make_topmodel.py
index d87d8c2..ef8e00d 100644
--- a/workflow/snakemake/workflow/scripts/make_topmodel.py
+++ b/workflow/snakemake/workflow/scripts/make_topmodel.py
@@ -33,9 +33,10 @@ def make_top_model(trains, adata_paths, n_top_genes=50, resolution=1, max_iter_h
 
     # Check if the directory exists
     if not os.path.exists(topmodel_output):
-        os.makedirs(topmodel_output)
+        os.makedirs(topmodel_output, mode=0o777)
 
     adata = None
+    print(adata_paths)
     if isinstance(adata_paths, str):
         adata = sc.read_h5ad(adata_paths)
     else:
@@ -53,6 +54,7 @@ def make_top_model(trains, adata_paths, n_top_genes=50, resolution=1, max_iter_h
                                   max_iter_harmony=max_iter_harmony,
                                   min_cell_participation=min_cell_participation)
 
+    print(top_model.name, topmodel_output)
     top_model.save_topModel(save_path=topmodel_output)
 
     adata_topmodel.write_h5ad(f"{topmodel_output}/topic_weight_umap.h5ad")
diff --git a/workflow/snakemake/workflow/scripts/make_train.py b/workflow/snakemake/workflow/scripts/make_train.py
index a052a76..0c8c0ae 100644
--- a/workflow/snakemake/workflow/scripts/make_train.py
+++ b/workflow/snakemake/workflow/scripts/make_train.py
@@ -25,7 +25,7 @@ def make_single_train_model(name, adata_path, k, random_state, train_output):
 
     # Check if the directory exists
     if not os.path.exists(train_output):
-        os.makedirs(train_output)
+        os.makedirs(train_output, mode=0o777)
 
     adata = sc.read_h5ad(adata_path)
 
@@ -61,7 +61,7 @@ def make_train_model(name, adata_path, k, n_runs, random_state, train_output):
 
     # Check if the directory exists
     if not os.path.exists(train_output):
-        os.makedirs(train_output)
+        os.makedirs(train_output, mode=0o777)
 
     adata = sc.read_h5ad(adata_path)
 
@@ -71,9 +71,13 @@ def make_train_model(name, adata_path, k, n_runs, random_state, train_output):
                               random_state_range=random_state)
     trains = []
     for i in random_state:
-        train = Topyfic.read_train(f"{name}_{k}_{i}.p")
+        print(f"{train_output}train_{name}_{k}_{i}.p")
+        train = Topyfic.read_train(f"{train_output}train_{name}_{k}_{i}.p")
+        print(train.random_state_range)
         trains.append(train)
+    print(main_train.random_state_range)
     main_train.combine_LDA_models(adata, single_trains=trains)
+    print(f"{train_output}{main_train.name}")
 
     main_train.save_train(save_path=train_output)
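For step 4 of the README (further downstream analysis), the pipeline outputs can be loaded back with Topyfic's readers. `Topyfic.read_train` is used by the workflow itself; the sketch below assumes `read_topModel` and `read_analysis` are its counterparts for the other pickles (check the Topyfic API if the names differ):

```python
# Minimal sketch, not part of the patch: load the pipeline outputs for
# downstream analysis. Paths follow the workdir layout used by the rules.
import Topyfic

workdir = "/Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results"

# per-dataset model, e.g. dataset "parse" trained with 5 initial topics
top_model = Topyfic.read_topModel(f"{workdir}/parse/5/topmodel/topModel_parse_5.p")

# merged model across all datasets (names joined with "_")
merged_top_model = Topyfic.read_topModel(f"{workdir}/topModel_parse_10x.p")
merged_analysis = Topyfic.read_analysis(f"{workdir}/analysis_parse_10x.p")
```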