Merge pull request #42 from OpenKBC/engineering_dev
SVM AUC result using DEG and the feature extraction process
swiri021 authored Sep 30, 2021
2 parents ab09d05 + 077f80c commit 99a1a75
Showing 16 changed files with 937 additions and 453 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Testing
tester.py

# s3 entire data
data/
*.pem
365 changes: 365 additions & 0 deletions notebook/feature_test_with_DEG.ipynb

Large diffs are not rendered by default.

404 changes: 404 additions & 0 deletions notebook/feature_test_with_act.ipynb

Large diffs are not rendered by default.

430 changes: 0 additions & 430 deletions notebook/notebook_archive/Jun09262021/SVM_test.ipynb

This file was deleted.

@@ -0,0 +1 @@
import pandas as pd
19 changes: 19 additions & 0 deletions pipelines/deg_pipeline/README.md
@@ -0,0 +1,19 @@
## DEG pipeline (DESeq2) by Jun
* This workflow generates DEG results using DESeq2, and it only works with GEO-style datasets

#### Version history
* Known issue: it runs into memory problems in Docker
* v1.0.0 is on the pipeline workflow

#### Requirement
```shell
pip install -r requirements.txt
Rscript installer_Rpackage.R
```

#### Usage
* Please edit config.yaml for standalone usage (see the sketch after the command below)

```shell
snakemake --cores 3
```
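
When snakemake is invoked with `--configfile`, the values from config.yaml surface inside the Snakefile as the global `config` dict. A minimal sketch of that wiring, assuming a hypothetical `input_path` key (the real keys are not shown in this diff):

```python
# Snakefile fragment (Snakefiles are Python). 'input_path' is a hypothetical key;
# SAMPLES and the CD4_DEG.csv naming match the files elsewhere in this commit.
configfile: "config.yaml"

SAMPLES = ['CD4', 'CD8', 'CD14']

rule all:
    input:
        expand(config['input_path'] + "{sample}_DEG.csv", sample=SAMPLES)
```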
5 changes: 4 additions & 1 deletion pipelines/deg_pipeline/Snakefile
@@ -6,9 +6,12 @@ __email__ = "[email protected]"
# Base DEG pipeline using DESeq2; it could be expanded with more functions through this workflow

# For manual running, please use this one
#configfile: "config.yaml"
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
#

pipeline_path = '/pipelines/deg_pipeline/'

SAMPLES = ['CD4','CD8','CD14']

rule all:
23 changes: 20 additions & 3 deletions pipelines/deg_pipeline/import_utils/lib/externalHandler.py
@@ -2,14 +2,31 @@
import itertools

class handlers(object):
    def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
    def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t"):
        """
        filename_with_path = filepath + basename
        ext_value = column name of file
        annot = column name used as the index
        header_line = row index of the header line
        sep = separator
        """
        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
        return temp[[ext_value]]

        # Don't use pandas.read_csv because of memory usage
        index_list = []
        value_list = []
        with open(filename_with_path, 'r') as infile:
            for i, line in enumerate(infile):
                line = line.strip()
                if i==header_line: # found header
                    header_info = line.split(sep)
                    value_ext_location = header_info.index(ext_value) # location of value extraction point
                    index_ext_location = header_info.index(annot) # location of index extraction point

                elif i!=header_line:
                    line_list = line.split(sep)
                    index_list.append(str(line_list[index_ext_location])) # Index list
                    value_list.append(float(line_list[value_ext_location])) # Value list

        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
        return result_df

    def get_samplename(filelist):
        """
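For reference, a minimal usage sketch of the streamed get_column above. The import path and the 'padj' column are assumptions for illustration; CD4_DEG.csv matches the output file named in the R script further down.

```python
# Usage sketch; the import path and column names are assumptions, not committed code.
from lib.externalHandler import handlers

# Pull one column (DESeq2's adjusted p-value, 'padj') out of a large DEG table
# without the memory cost of pandas.read_csv on the whole file.
deg_padj = handlers.get_column('CD4_DEG.csv', ext_value='padj',
                               annot='gene_id', header_line=0, sep=',')
print(deg_padj.head())
```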
@@ -6,7 +6,7 @@
# metafile = "./sample_CD4_meta.csv"
# outputfile = "./CD4_DEG.csv"

library(tidyverse)
#library(tidyverse)
library(DESeq2)
library(tximport)

3 changes: 3 additions & 0 deletions pipelines/feature_extraction_pipeline/Snakefile
@@ -7,8 +7,11 @@ __email__ = "[email protected]"

# For manual running, please use this one
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/feature_extraction_pipeline/'
#

pipeline_path = '/pipelines/feature_extraction_pipeline/'

SAMPLES = ['CD4','CD8','CD14']

rule all:
@@ -6,7 +6,6 @@
"""
Description: Repetitive functions in notebook
"""
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
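The imports kept here (SVC, StratifiedKFold, RFECV) match the commit message's SVM AUC work. Below is a self-contained sketch of that style of evaluation; the data is synthetic and stands in for the DEG-derived feature matrix used in the notebooks.

```python
# Sketch of an SVM AUC evaluation with RFECV feature selection, mirroring the
# imports above. X and y are synthetic stand-ins for the real expression data.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFECV

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 50))    # 100 samples x 50 candidate features
y = rng.integers(0, 2, size=100)  # binary labels (e.g. case vs control)

# RFECV needs a linear kernel so feature weights (coef_) are available
svc = SVC(kernel='linear')
selector = RFECV(svc, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
selector.fit(X, y)
print("selected features:", selector.n_features_)

# Cross-validated AUC on the reduced feature set
auc = cross_val_score(svc, selector.transform(X), y,
                      cv=StratifiedKFold(5), scoring='roc_auc').mean()
print("mean AUC:", round(auc, 3))
```

Note that for a real analysis the feature selection should be nested inside the cross-validation folds to avoid an optimistic AUC.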
66 changes: 53 additions & 13 deletions pipelines/pipeline_controller/app.py
@@ -24,10 +24,33 @@
from flask_wtf import Form
from wtforms import TextField, SubmitField

# Celery running
import json
from celery import Celery, current_task
from celery.result import AsyncResult
from subprocess import Popen, PIPE


app = Flask(__name__)
app.config['SECRET_KEY'] = 'swiri021swiri021' # CSRF key
Bootstrap(app) # set Bootstrap

## Celery setting
app.config.update(
    CELERY_BROKER_URL='redis://localhost:6379', # Redis docker
    CELERY_RESULT_BACKEND='redis://localhost:6379'
)

def make_celery(app):
    celery = Celery(
        app.import_name,
        backend=app.config['CELERY_RESULT_BACKEND'],
        broker=app.config['CELERY_BROKER_URL']
    )
    celery.conf.update(app.config)
    return celery

celery = make_celery(app)

# set Bootstrap
Bootstrap(app)

# setting Navigation Bar
nav = Nav(app)
@@ -91,26 +114,43 @@ class SnakeMakeForm(Form):

    return render_template('config_yaml_creator.html', form=form)

@celery.task()
def workflow_running(pipeline_path, yaml_file):
    print(pipeline_path, yaml_file)

    proc = Popen(['snakemake', '--snakefile', pipeline_path+'Snakefile', '--cores', str(3), '--configfile', yaml_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    # This stdout streaming is not working with snakemake (snakemake writes its log to stderr)
    while True:
        line = proc.stdout.readline()
        if not line:
            break
        print(str(line))
        current_task.update_state(state='PROGRESS', meta={'msg': str(line)})
    return 999

@app.route("/workflow_progress")
def workflow_progress():
print("WORKFLOW RETURN")
jobid = request.values.get('jobid')
if jobid:
job = AsyncResult(jobid, app=celery)
print(job.state)
if job.state == 'PROGRESS':
return json.dumps(dict( state=job.state, msg=job.result['msg'],))
elif job.state == 'SUCCESS':
return json.dumps(dict( state=job.state, msg="done",))
return '{}'

@app.route("/status")
def workflow_status():

pipeline_path = session.get('selected_pipeline', None) # Pipeline path
yaml_file = session.get('yaml_output', None) # yaml file

## Running snakemake
cmd = 'snakemake --snakefile %s --cores 3 --configfile %s'%(pipeline_path+"Snakefile",yaml_file)
print(cmd)
try:
p = subprocess.check_output([cmd], shell=True)
msg = "Workflow has been completed"
except subprocess.CalledProcessError as e:
msg = "Error occur in snakemake, please check log files in pipelines folder"

return render_template('status.html', msg=msg)
job = workflow_running.delay(pipeline_path, yaml_file)
return render_template('progress.html', JOBID=job.id)

#########Route###########


# Parsing function for yaml data; it only works with 2-layer nested yaml files
def _parsing_yamlFile(workflow_path):
    """
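The comment in workflow_running above notes that streaming proc.stdout does not work with snakemake, which writes its progress log to stderr. Below is a minimal sketch of a variant that captures stderr instead; the task name workflow_running_stderr is hypothetical and not part of this commit.

```python
# Sketch only: assumes the same Celery app and imports as app.py above.
from subprocess import Popen, PIPE, STDOUT

@celery.task()
def workflow_running_stderr(pipeline_path, yaml_file):
    proc = Popen(
        ['snakemake', '--snakefile', pipeline_path + 'Snakefile',
         '--cores', '3', '--configfile', yaml_file],
        stdout=PIPE, stderr=STDOUT, text=True)  # fold snakemake's stderr log into stdout
    for line in proc.stdout:                    # now yields snakemake's log lines
        current_task.update_state(state='PROGRESS', meta={'msg': line.strip()})
    return proc.wait()
```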
4 changes: 3 additions & 1 deletion pipelines/pipeline_controller/requirements.txt
@@ -4,4 +4,6 @@ PyYAML==5.4.1
flask==2.0.1
Flask-WTF==0.15.1
Flask-Bootstrap==3.3.7.1
flask-nav==0.6
flask-nav==0.6
celery==5.1.2
redis==3.5.3
(One changed file could not be rendered.)
40 changes: 40 additions & 0 deletions pipelines/pipeline_controller/templates/progress.html
@@ -0,0 +1,40 @@
{% extends "bootstrap/base.html" %}
{% import "bootstrap/wtf.html" as wtf %}

{% block navbar %}
{{nav.mynavbar.render()}}
{% endblock %}

{% block content %}
<div class="container">
  <h3>Workflow controller</h3>
  <p>This controller generates a proper snakemake config file to run your samples</p>

  <div id="pct"></div>

  <script src="//code.jquery.com/jquery-2.1.1.min.js"></script>
  <script>
    function poll() {
      $.ajax("{{url_for('.workflow_progress', jobid=JOBID)}}", {
        dataType: "json",
        success: function(resp) {
          console.log(resp);
          if (resp.msg == 'done') {
            $("#pct").html("<b>Workflow has been completed</b>");
            return;
          }
          // still running: show the spinner and poll again in a second
          $("#pct").html("<img src='/static/spinning-loading.gif'>");
          setTimeout(poll, 1000);
        }
      });
    }
    $(function() {
      var JOBID = "{{ JOBID }}";
      poll();
    });
  </script>

  <p>Copyright 2021 <a href="https://github.com/OpenKBC/multiple_sclerosis_proj">OpenKBC repository</a></p>
</div>
{% endblock %}
24 changes: 21 additions & 3 deletions utils/lib/externalHandler.py
@@ -1,15 +1,33 @@
import pandas as pd
import numpy as np
import itertools

class handlers(object):
    def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
    def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t", opt=0):
        """
        filename_with_path = filepath + basename
        ext_value = column name of file
        annot = column name used as the index
        header_line = row index of the header line
        sep = separator
        """
        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
        return temp[[ext_value]]

        # Don't use pandas.read_csv because of memory usage
        index_list = []
        value_list = []
        with open(filename_with_path, 'r') as infile:
            for i, line in enumerate(infile):
                line = line.strip()
                if i==header_line: # found header
                    header_info = line.split(sep)
                    value_ext_location = header_info.index(ext_value) # location of value extraction point
                    index_ext_location = header_info.index(annot) # location of index extraction point

                elif i!=header_line:
                    line_list = line.split(sep)
                    index_list.append(str(line_list[index_ext_location])) # Index list
                    value_list.append(float(line_list[value_ext_location])) # Value list

        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
        return result_df

    def get_samplename(filelist):
        """
