From 4251f895e59272a768c4aa3a29864f9f133bf2d5 Mon Sep 17 00:00:00 2001
From: Narges Rezaie <nargesrezaie80@gmail.com>
Date: Wed, 14 Aug 2024 15:28:08 -0700
Subject: [PATCH] complete snakemake workflow

---
 setup.py                                      |  4 +-
 workflow/snakemake/README.md                  | 44 ++++++++------
 workflow/snakemake/config/config.yaml         | 10 ++--
 workflow/snakemake/workflow/Snakefile         | 23 ++++++--
 .../snakemake/workflow/rules/topModel.smk     | 59 +++++++++++--------
 workflow/snakemake/workflow/rules/train.smk   | 28 +++++----
 .../workflow/scripts/make_topmodel.py         |  4 +-
 .../snakemake/workflow/scripts/make_train.py  | 10 +++-
 8 files changed, 111 insertions(+), 71 deletions(-)

diff --git a/setup.py b/setup.py
index 0846336..9ebd290 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name='Topyfic',  # the name of your package
     packages=['Topyfic'],  # same as above
-    version='v0.4.13',  # version number
+    version='v0.4.15',  # version number
     license='MIT',  # license type
     description='Topyfic is a Python package designed to identify reproducible latent dirichlet allocation (LDA) '
                 'using leiden clustering and harmony for single cell epigenomics data',
@@ -11,7 +11,7 @@
     author='Narges Rezaie',  # your name
     author_email='nargesrezaie80@gmail.com',  # your email
     url='https://github.com/mortazavilab/Topyfic',  # url to your git repo
-    download_url='https://github.com/mortazavilab/Topyfic/archive/refs/tags/v0.4.13.tar.gz',  # link to the tar.gz file associated with this release
+    download_url='https://github.com/mortazavilab/Topyfic/archive/refs/tags/v0.4.15.tar.gz',  # link to the tar.gz file associated with this release
     keywords=['Cellular Programs', 'Latent Dirichlet allocation', 'single-cell multiome', 'single-cell RNA-seq',
               'gene regulatory network', 'Topic Modeling', 'single-nucleus RNA-seq'],
     # python_requires='>=3.9',
diff --git a/workflow/snakemake/README.md b/workflow/snakemake/README.md
index d473233..a7297b3 100644
--- a/workflow/snakemake/README.md
+++ b/workflow/snakemake/README.md
@@ -2,10 +2,11 @@
 
 This directory contains a Snakemake pipeline for running Topyfic automatically.
 
-The snakemake will run training and building model (topModel).
+The Snakemake pipeline will run training (Train) and model building (topModel, Analysis).
 
 **Note**: Please make sure to install the necessary packages and set up Snakemake appropriately.
-**Note**: pipeline is tested for >= 8.*
+
+**Note**: The pipeline is tested with Snakemake >= 8.x ([more info](https://snakemake.readthedocs.io/en/stable/index.html))
 
 ## Getting started
 
@@ -31,55 +32,64 @@ Modify the [config file](config/config.yaml) or create a new one with the same s
 
 3. **n_topics**
    - Contains the list of initial numbers of topics you wish to train models with
-   - list of int: [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
+   - list of int: `[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]`
 
 4. **organism**
    - Indicates the species which will be used for downstream analysis
    - Example: human or mouse
 
-5. **train**
+5. **workdir**
+   - Directory where the outputs will be written
+   - Make sure you have write access.
+   - One folder will be created per dataset.
+
+6. **train**
    - Most of the items are inputs of `train_model()`
    - n_runs: number of runs used to define the rLDA model (default: 100)
    - random_states: list of random states used to run the LDA models (default: range(n_runs))
-   - workdir: directory of train outputs. make sure to have write access.
 
-6. **top_model**
-   - workdir: directory of topModel outputs. make sure to have write access.
+7. **top_model**
   - n_top_genes (int): Number of highly variable genes to keep (default: 50)
   - resolution (int): A parameter controlling the coarseness of the clustering; higher values lead to more clusters (default: 1)
   - max_iter_harmony (int): Number of iterations for running Harmony (default: 10)
   - min_cell_participation (float): Minimum cell participation required for each topic to be kept; when `None`, topics with cell participation above 1% of the number of cells (#cells / 100) are kept
 
-7. **merging**
-   - workdir: directory of merged outputs. make sure to have write access.
-   - only if you have multiple adata input
+8. **merge**
+   - Indicates whether you also want a single merged model built from all datasets together.
 
### 3. Run snakemake
 
-First run it with -n to make sure the steps that it plans to run are reasonable.
-After it finishes, run the same command without the -n option.
+First run it with `-n` (dry run) to make sure the steps it plans to run are reasonable.
+Once that looks correct, run the same command without the `-n` option.
 
 `snakemake -n`
 
-for slurm:
+For SLURM:
 
 ```
 snakemake \
--j 200 \
---latency-wait 120 \
+-j 1000 \
+--latency-wait 300 \
 --use-conda \
 --rerun-triggers mtime \
 --executor cluster-generic \
 --cluster-generic-submit-cmd \
 "sbatch -A model-ad_lab \
-  --partition=standard \
+  --partition=highmem \
   --cpus-per-task 16 \
   --mail-user=nargesr@uci.edu \
   --mail-type=START,END,FAIL \
   --time=72:00:00" \
--n
+-n \
+-p \
+--verbose
 ```
+
+Development hint: if you run into any errors, `-p --verbose` will give you more detail about each run and help you debug your code.
 
 ### 4. Further downstream analysis
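The README's config walkthrough above maps one-to-one onto the targets that `rule all` requests. As a pre-flight check before submitting to SLURM, a sketch like the following (not part of the patch; it assumes PyYAML is installed and that the config path matches your checkout) prints every file the pipeline is expected to produce:

```python
# Sketch: enumerate the targets `rule all` will request, using only
# keys that appear in config.yaml (names, n_topics, workdir, train, merge).
import yaml

with open("workflow/snakemake/config/config.yaml") as f:
    config = yaml.safe_load(f)

workdir = config["workdir"].rstrip("/")
for name in config["names"]:
    for k in config["n_topics"]:
        for rs in config["train"]["random_states"]:
            print(f"{workdir}/{name}/{k}/train/train_{name}_{k}_{rs}.p")  # per-seed Train
        print(f"{workdir}/{name}/{k}/train/train_{name}_{k}.p")           # combined Train
        print(f"{workdir}/{name}/{k}/topmodel/topModel_{name}_{k}.p")     # topModel
if config.get("merge", False):
    print(f"{workdir}/topModel_{'_'.join(config['names'])}.p")            # merged topModel
```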
diff --git a/workflow/snakemake/config/config.yaml b/workflow/snakemake/config/config.yaml
index d5a1eab..07331d5 100644
--- a/workflow/snakemake/config/config.yaml
+++ b/workflow/snakemake/config/config.yaml
@@ -1,6 +1,6 @@
 n_jobs: 8 # Threads
 
-names: [parse, 10x]
+names: ['parse', '10x']
 
 count_adata:
     parse:
@@ -12,6 +12,8 @@ n_topics: [5, 10] #, 15, 20, 25, 30, 35, 40, 45, 50]
 
 organism: mouse # human or mouse
 
+workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/
+
 train:
     n_runs: 100
     random_states: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
@@ -24,15 +26,11 @@ train:
                     70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
                     80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
                     90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/{name}/{n_topic}/train
 
 top_model:
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results/{name}/{n_topic}/topmodel
     n_top_genes: 50
     resolution: 1
     max_iter_harmony: 10
     min_cell_participation: None
 
-# only if you have multiple adata input
-merging:
-    workdir: /Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results
+merge: True
diff --git a/workflow/snakemake/workflow/Snakefile b/workflow/snakemake/workflow/Snakefile
index 5880104..2e3825c 100644
--- a/workflow/snakemake/workflow/Snakefile
+++ b/workflow/snakemake/workflow/Snakefile
@@ -9,19 +9,34 @@ n_topics = config['n_topics']
 
 include: "rules/train.smk"
 include: "rules/topModel.smk"
 
+
+# ##### make necessary directories #####
+# for name in config['names']:
+#     for n_topic in config['n_topics']:
+#         # Check if the directory exists
+#         if not os.path.exists(f"{config['workdir']}/{name}/{n_topic}/train"):
+#             os.makedirs(f"{config['workdir']}/{name}/{n_topic}/train", mode=0o777)
+#
+#         if not os.path.exists(f"{config['workdir']}/{name}/{n_topic}/topmodel"):
+#             os.makedirs(f"{config['workdir']}/{name}/{n_topic}/topmodel", mode=0o777)
+
 ##### check if merging is necessary #####
 merging = []
-if 'merging' in config:
-    merging = f"{config['merging']['workdir']}/topModel_{'_'.join(config['names'])}.p"
+if config.get('merge', False):
+    merging = f"{config['workdir']}/topModel_{'_'.join(config['names'])}.p"
 
 ##### target rules #####
 rule all:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+               name=config["names"],
+               n_topic=config["n_topics"],
+               random_state=config["train"]["random_states"]),
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
                name=config['names'],
                n_topic=config["n_topics"]),
-        expand(f"{config['top_model']['workdir']}/topModel_{{name}}_{{n_topic}}.p",
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
                name=config['names'],
                n_topic=config["n_topics"]),
         merging,
diff --git a/workflow/snakemake/workflow/rules/topModel.smk b/workflow/snakemake/workflow/rules/topModel.smk
index a0418da..15f2175 100644
--- a/workflow/snakemake/workflow/rules/topModel.smk
+++ b/workflow/snakemake/workflow/rules/topModel.smk
@@ -1,54 +1,61 @@
 import pandas as pd
+import Topyfic
+from scripts import make_topmodel
 
 configfile: 'config/config.yaml'
 
 # Rule to run topModel for one input adata
 rule run_top_model:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
-               name=config["names"],
-               n_topic=config["n_topics"])
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
     output:
-        f"{config['top_model']['workdir']}/topModel_{{name}}_{{n_topic}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics']
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
     run:
         # df, res = make_topmodel.find_best_n_topic(n_topics=config["n_topics"],
         #                                           names=params.name,
         #                                           topmodel_outputs=config['top_model']['workdir'])
        #
         # df.to_csv(f"{config['top_model']['workdir']}/k_N_{params.name}.csv", index=False)
+        train = Topyfic.read_train(f"{config['workdir']}/{params.name}/{params.n_topic}/train/train_{params.name}_{params.n_topic}.p")
 
-        make_topmodel.make_top_model(trains={input},
+        make_topmodel.make_top_model(trains=[train],
                                      adata_paths=config['count_adata'][params.name],
-                                     n_top_genes=config['top_model']['n_top_genes'],
-                                     resolution=config['top_model']['resolution'],
-                                     max_iter_harmony=config['top_model']['max_iter_harmony'],
-                                     min_cell_participation=config['top_model']['min_cell_participation'],
-                                     topmodel_output=config['top_model']['workdir'])
+                                     n_top_genes=int(config['top_model']['n_top_genes']),
+                                     resolution=float(config['top_model']['resolution']),
+                                     max_iter_harmony=int(config['top_model']['max_iter_harmony']),
+                                     min_cell_participation=None if config['top_model']['min_cell_participation'] == "None" else float(config['top_model']['min_cell_participation']),
+                                     topmodel_output=f"{config['workdir']}/{params.name}/{params.n_topic}/topmodel/")
 
 # Rule to run topModel for merging multiple input adatas
-if 'merging' in config:
+if config.get('merge', False):
     rule run_multiple_top_model:
         input:
-            expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
-                   name=config["names"],
-                   n_topic=config["n_topics"])
+            expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
+                   name=config["names"],
+                   n_topic=config["n_topics"]),
+            expand(f"{config['workdir']}/{{name}}/{{n_topic}}/topmodel/topModel_{{name}}_{{n_topic}}.p",
+                   name=config["names"],
+                   n_topic=config["n_topics"]),
         output:
-            f"{config['merging']['workdir']}/topModel_{'_'.join(config['names'])}.p",
-            f"{config['merging']['workdir']}/analysis_{'_'.join(config['names'])}.p",
-        params:
-            name=lambda wildcards: config['names'],
-            n_topic=lambda wildcards: config['n_topics']
+            f"{config['workdir']}/topModel_{'_'.join(config['names'])}.p",
+            f"{config['workdir']}/analysis_{'_'.join(config['names'])}.p",
         run:
             df, res = make_topmodel.find_best_n_topic(n_topics=config["n_topics"],
                                                       names=config["names"],
-                                                      topmodel_outputs=config['merging']['workdir'])
-            df.to_csv(f"{config['merging']['workdir']}/k_N.csv", index=False)
+                                                      topmodel_outputs=config['workdir'])
+            df.to_csv(f"{config['workdir']}/k_N.csv", index=False)
 
-            pd.DataFrame.from_dict(res, orient='index').to_csv(f"{config['merging']['workdir']}/best_k.csv")
+            pd.DataFrame.from_dict(res, orient='index').to_csv(f"{config['workdir']}/best_k.csv")
 
             adata_paths = []
             for name in config['names']:
@@ -57,12 +64,14 @@ if 'merging' in config:
             trains = []
             for name in res.keys():
                 n_topic = res[name]
-                trains.append(f"{config['train']['workdir']}/train_{name}_{n_topic}.p")
+                train = Topyfic.read_train(f"{config['workdir']}/{name}/{n_topic}/train/train_{name}_{n_topic}.p")
+                trains.append(train)
 
             make_topmodel.make_top_model(trains=trains,
                                          adata_paths=adata_paths,
-                                         n_top_genes=config['top_model']['n_top_genes'],
-                                         resolution=config['top_model']['resolution'],
-                                         max_iter_harmony=config['top_model']['max_iter_harmony'],
-                                         min_cell_participation=config['top_model']['min_cell_participation'],
-                                         topmodel_output=None)
\ No newline at end of file
+                                         n_top_genes=int(config['top_model']['n_top_genes']),
+                                         resolution=float(config['top_model']['resolution']),
+                                         max_iter_harmony=int(config['top_model']['max_iter_harmony']),
+                                         min_cell_participation=None if config['top_model']['min_cell_participation'] == "None" else float(config['top_model']['min_cell_participation']),
+                                         topmodel_output=f"{config['workdir']}")
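Both rules above repeat the same inline conversion of `min_cell_participation`, because an unquoted `None` in YAML is loaded as the string `"None"` rather than Python's `None`. Not part of the patch, but a small helper (hypothetical name, e.g. placed in make_topmodel.py) would keep the rules shorter:

```python
# Hypothetical helper, not in this patch: normalize the YAML value
# `min_cell_participation`, which arrives as the string "None" or a number.
def parse_min_cell_participation(value):
    if value is None or value == "None":
        return None
    return float(value)

# usage inside the rules:
# min_cell_participation=parse_min_cell_participation(config['top_model']['min_cell_participation'])
```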
diff --git a/workflow/snakemake/workflow/rules/train.smk b/workflow/snakemake/workflow/rules/train.smk
index 52bfab0..235d59e 100644
--- a/workflow/snakemake/workflow/rules/train.smk
+++ b/workflow/snakemake/workflow/rules/train.smk
@@ -1,36 +1,38 @@
+from scripts import make_train
+
 configfile: 'config/config.yaml'
 
 # Rule to run make_single_train_model
 rule run_single_train:
     output:
-        f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics'],
-        random_state=lambda wildcards: config["train"]["random_states"]
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
+        random_state=lambda wildcards: wildcards.random_state
     run:
         make_train.make_single_train_model(name=params.name,
                                            adata_path=config['count_adata'][params.name],
-                                           k=params.n_topic,
-                                           random_state=params.random_state,
-                                           train_output=config['train']['workdir'])
+                                           k=int(params.n_topic),
+                                           random_state=int(params.random_state),
+                                           train_output=f"{config['workdir']}/{params.name}/{params.n_topic}/train/")
 
 # Rule to run make_train_model
 rule run_train_model:
     input:
-        expand(f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}_{{random_state}}.p",
-               name=config["names"],
-               n_topic=config["n_topics"],
-               random_state=config["train"]["random_states"])
+        expand(f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}_{{random_state}}.p",
+               random_state=config["train"]["random_states"],
+               allow_missing=True)
     output:
-        f"{config['train']['workdir']}/train_{{name}}_{{n_topic}}.p",
+        f"{config['workdir']}/{{name}}/{{n_topic}}/train/train_{{name}}_{{n_topic}}.p",
     params:
-        name=lambda wildcards: config['names'],
-        n_topic=lambda wildcards: config['n_topics'],
+        name=lambda wildcards: wildcards.name,
+        n_topic=lambda wildcards: wildcards.n_topic,
     run:
         make_train.make_train_model(name=params.name,
                                     adata_path=config['count_adata'][params.name],
-                                    k=params.n_topic,
+                                    k=int(params.n_topic),
                                     n_runs=config['train']['n_runs'],
                                     random_state=config['train']['random_states'],
-                                    train_output=config['train']['workdir'])
\ No newline at end of file
+                                    train_output=f"{config['workdir']}/{params.name}/{params.n_topic}/train/")
diff --git a/workflow/snakemake/workflow/scripts/make_topmodel.py b/workflow/snakemake/workflow/scripts/make_topmodel.py
index d87d8c2..ef8e00d 100644
--- a/workflow/snakemake/workflow/scripts/make_topmodel.py
+++ b/workflow/snakemake/workflow/scripts/make_topmodel.py
@@ -33,9 +33,10 @@ def make_top_model(trains, adata_paths, n_top_genes=50, resolution=1, max_iter_h
 
     # Check if the directory exists
     if not os.path.exists(topmodel_output):
-        os.makedirs(topmodel_output)
+        os.makedirs(topmodel_output, mode=0o777)
 
     adata = None
+    print(adata_paths)
     if isinstance(adata_paths, str):
         adata = sc.read_h5ad(adata_paths)
     else:
@@ -53,6 +54,7 @@ def make_top_model(trains, adata_paths, n_top_genes=50, resolution=1, max_iter_h
                                   max_iter_harmony=max_iter_harmony,
                                   min_cell_participation=min_cell_participation)
 
+    print(top_model.name, topmodel_output)
     top_model.save_topModel(save_path=topmodel_output)
 
     adata_topmodel.write_h5ad(f"{topmodel_output}/topic_weight_umap.h5ad")
diff --git a/workflow/snakemake/workflow/scripts/make_train.py b/workflow/snakemake/workflow/scripts/make_train.py
index a052a76..0c8c0ae 100644
--- a/workflow/snakemake/workflow/scripts/make_train.py
+++ b/workflow/snakemake/workflow/scripts/make_train.py
@@ -25,7 +25,7 @@ def make_single_train_model(name, adata_path, k, random_state, train_output):
 
     # Check if the directory exists
     if not os.path.exists(train_output):
-        os.makedirs(train_output)
+        os.makedirs(train_output, mode=0o777)
 
     adata = sc.read_h5ad(adata_path)
 
@@ -61,7 +61,7 @@ def make_train_model(name, adata_path, k, n_runs, random_state, train_output):
 
     # Check if the directory exists
     if not os.path.exists(train_output):
-        os.makedirs(train_output)
+        os.makedirs(train_output, mode=0o777)
 
     adata = sc.read_h5ad(adata_path)
 
@@ -71,9 +71,13 @@ def make_train_model(name, adata_path, k, n_runs, random_state, train_output):
                               random_state_range=random_state)
     trains = []
     for i in random_state:
-        train = Topyfic.read_train(f"{name}_{k}_{i}.p")
+        print(f"{train_output}train_{name}_{k}_{i}.p")
+        train = Topyfic.read_train(f"{train_output}train_{name}_{k}_{i}.p")
+        print(train.random_state_range)
         trains.append(train)
+    print(main_train.random_state_range)
     main_train.combine_LDA_models(adata, single_trains=trains)
+    print(f"{train_output}{main_train.name}")
 
     main_train.save_train(save_path=train_output)
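For step 4 of the README (further downstream analysis), the pipeline outputs can be loaded back with Topyfic's readers. `Topyfic.read_train` is used by the workflow itself; the sketch below assumes `read_topModel` and `read_analysis` are its counterparts for the other pickles (check the Topyfic API if the names differ):

```python
# Minimal sketch, not part of the patch: load the pipeline outputs for
# downstream analysis. Paths follow the workdir layout used by the rules.
import Topyfic

workdir = "/Users/nargesrezaie/Documents/MortazaviLab/Topyfic/workflow/snakemake/results"

# per-dataset model, e.g. dataset "parse" trained with 5 initial topics
top_model = Topyfic.read_topModel(f"{workdir}/parse/5/topmodel/topModel_parse_5.p")

# merged model across all datasets (names joined with "_")
merged_top_model = Topyfic.read_topModel(f"{workdir}/topModel_parse_10x.p")
merged_analysis = Topyfic.read_analysis(f"{workdir}/analysis_parse_10x.p")
```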