Merge pull request #42 from OpenKBC/engineering_dev
SVM AUC result using DEG and the feature extraction process
swiri021 authored Sep 30, 2021
2 parents ab09d05 + 077f80c commit 99a1a75
Showing 16 changed files with 937 additions and 453 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Testing
tester.py

# s3 entire data
data/
*.pem
365 changes: 365 additions & 0 deletions notebook/feature_test_with_DEG.ipynb

Large diffs are not rendered by default.

404 changes: 404 additions & 0 deletions notebook/feature_test_with_act.ipynb

Large diffs are not rendered by default.

430 changes: 0 additions & 430 deletions notebook/notebook_archive/Jun09262021/SVM_test.ipynb

This file was deleted.

@@ -0,0 +1 @@
import pandas as pd
19 changes: 19 additions & 0 deletions pipelines/deg_pipeline/README.md
@@ -0,0 +1,19 @@
## DEG pipeline (DESeq2) by Jun
* This workflow generates DEG results using DESeq2, and it only works with GEO-style datasets

#### Version history
* Known issue: it runs into memory problems in Docker
* v1.0.0 is on the pipeline workflow

#### Requirement
```shell
pip install -r requirements.txt
Rscript installer_Rpackage.R
```

#### Usage
* Please edit config.yaml for standalone usage (see the sketch after the command below)

```shell
snakemake --cores 3
```
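
When snakemake is invoked with `--configfile`, the values from config.yaml surface inside the Snakefile as the global `config` dict. A minimal sketch of that wiring, assuming a hypothetical `input_path` key (the real keys are not shown in this diff):

```python
# Snakefile fragment (Snakefiles are Python). 'input_path' is a hypothetical key;
# SAMPLES and the CD4_DEG.csv naming match the files elsewhere in this commit.
configfile: "config.yaml"

SAMPLES = ['CD4', 'CD8', 'CD14']

rule all:
    input:
        expand(config['input_path'] + "{sample}_DEG.csv", sample=SAMPLES)
```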
5 changes: 4 additions & 1 deletion pipelines/deg_pipeline/Snakefile
@@ -6,9 +6,12 @@ __email__ = "[email protected]"
# Base DEG pipeline using DESeq2; it could be expanded with more functions through this workflow

# For manual running, please use this one
#configfile: "config.yaml"
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
#

pipeline_path = '/pipelines/deg_pipeline/'

SAMPLES = ['CD4','CD8','CD14']

rule all:
23 changes: 20 additions & 3 deletions pipelines/deg_pipeline/import_utils/lib/externalHandler.py
@@ -2,14 +2,31 @@
import itertools

class handlers(object):
    def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
    def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t"):
        """
        filename_with_path = filepath + basename
        ext_value = column name of file
        annot = column name used as the index
        header_line = row index of the header line
        sep = separator
        """
        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
        return temp[[ext_value]]

        # Don't use pandas.read_csv because of memory usage
        index_list = []
        value_list = []
        with open(filename_with_path, 'r') as infile:
            for i, line in enumerate(infile):
                line = line.strip()
                if i==header_line: # found header
                    header_info = line.split(sep)
                    value_ext_location = header_info.index(ext_value) # location of value extraction point
                    index_ext_location = header_info.index(annot) # location of index extraction point

                elif i!=header_line:
                    line_list = line.split(sep)
                    index_list.append(str(line_list[index_ext_location])) # Index list
                    value_list.append(float(line_list[value_ext_location])) # Value list

        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
        return result_df

    def get_samplename(filelist):
        """
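For reference, a minimal usage sketch of the streamed get_column above. The import path and the 'padj' column are assumptions for illustration; CD4_DEG.csv matches the output file named in the R script further down.

```python
# Usage sketch; the import path and column names are assumptions, not committed code.
from lib.externalHandler import handlers

# Pull one column (DESeq2's adjusted p-value, 'padj') out of a large DEG table
# without the memory cost of pandas.read_csv on the whole file.
deg_padj = handlers.get_column('CD4_DEG.csv', ext_value='padj',
                               annot='gene_id', header_line=0, sep=',')
print(deg_padj.head())
```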
@@ -6,7 +6,7 @@
# metafile = "./sample_CD4_meta.csv"
# outputfile = "./CD4_DEG.csv"

library(tidyverse)
#library(tidyverse)
library(DESeq2)
library(tximport)

3 changes: 3 additions & 0 deletions pipelines/feature_extraction_pipeline/Snakefile
@@ -7,8 +7,11 @@ __email__ = "[email protected]"

# For manual running, please use this one
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/feature_extraction_pipeline/'
#

pipeline_path = '/pipelines/feature_extraction_pipeline/'

SAMPLES = ['CD4','CD8','CD14']

rule all:
@@ -6,7 +6,6 @@
"""
Description: Repetitive functions in notebook
"""
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
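The imports kept here (SVC, StratifiedKFold, RFECV) match the commit message's SVM AUC work. Below is a self-contained sketch of that style of evaluation; the data is synthetic and stands in for the DEG-derived feature matrix used in the notebooks.

```python
# Sketch of an SVM AUC evaluation with RFECV feature selection, mirroring the
# imports above. X and y are synthetic stand-ins for the real expression data.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFECV

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 50))    # 100 samples x 50 candidate features
y = rng.integers(0, 2, size=100)  # binary labels (e.g. case vs control)

# RFECV needs a linear kernel so feature weights (coef_) are available
svc = SVC(kernel='linear')
selector = RFECV(svc, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
selector.fit(X, y)
print("selected features:", selector.n_features_)

# Cross-validated AUC on the reduced feature set
auc = cross_val_score(svc, selector.transform(X), y,
                      cv=StratifiedKFold(5), scoring='roc_auc').mean()
print("mean AUC:", round(auc, 3))
```

Note that for a real analysis the feature selection should be nested inside the cross-validation folds to avoid an optimistic AUC.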
66 changes: 53 additions & 13 deletions pipelines/pipeline_controller/app.py
@@ -24,10 +24,33 @@
from flask_wtf import Form
from wtforms import TextField, SubmitField

# Celery running
import json
from celery import Celery, current_task
from celery.result import AsyncResult
from subprocess import Popen, PIPE


app = Flask(__name__)
app.config['SECRET_KEY'] = 'swiri021swiri021' # CSRF key
Bootstrap(app) # set Bootstrap

## Celery setting
app.config.update(
    CELERY_BROKER_URL='redis://localhost:6379', # Redis docker
    CELERY_RESULT_BACKEND='redis://localhost:6379'
)

def make_celery(app):
    celery = Celery(
        app.import_name,
        backend=app.config['CELERY_RESULT_BACKEND'],
        broker=app.config['CELERY_BROKER_URL']
    )
    celery.conf.update(app.config)
    return celery

celery = make_celery(app)

# set Bootstrap
Bootstrap(app)

# setting Navigation Bar
nav = Nav(app)
@@ -91,26 +114,43 @@ class SnakeMakeForm(Form):

    return render_template('config_yaml_creator.html', form=form)

@celery.task()
def workflow_running(pipeline_path, yaml_file):
    print(pipeline_path, yaml_file)

    proc = Popen(['snakemake', '--snakefile', pipeline_path+'Snakefile', '--cores', str(3), '--configfile', yaml_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    # This stdout streaming is not working with snakemake (snakemake writes its log to stderr)
    while True:
        line = proc.stdout.readline()
        if not line:
            break
        print(str(line))
        current_task.update_state(state='PROGRESS', meta={'msg': str(line)})
    return 999

@app.route("/workflow_progress")
def workflow_progress():
print("WORKFLOW RETURN")
jobid = request.values.get('jobid')
if jobid:
job = AsyncResult(jobid, app=celery)
print(job.state)
if job.state == 'PROGRESS':
return json.dumps(dict( state=job.state, msg=job.result['msg'],))
elif job.state == 'SUCCESS':
return json.dumps(dict( state=job.state, msg="done",))
return '{}'

@app.route("/status")
def workflow_status():

pipeline_path = session.get('selected_pipeline', None) # Pipeline path
yaml_file = session.get('yaml_output', None) # yaml file

## Running snakemake
cmd = 'snakemake --snakefile %s --cores 3 --configfile %s'%(pipeline_path+"Snakefile",yaml_file)
print(cmd)
try:
p = subprocess.check_output([cmd], shell=True)
msg = "Workflow has been completed"
except subprocess.CalledProcessError as e:
msg = "Error occur in snakemake, please check log files in pipelines folder"

return render_template('status.html', msg=msg)
job = workflow_running.delay(pipeline_path, yaml_file)
return render_template('progress.html', JOBID=job.id)

#########Route###########


# Parsing function for yaml data; it only works with 2-layer nested yaml files
def _parsing_yamlFile(workflow_path):
    """
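The comment in workflow_running above notes that streaming proc.stdout does not work with snakemake, which writes its progress log to stderr. Below is a minimal sketch of a variant that captures stderr instead; the task name workflow_running_stderr is hypothetical and not part of this commit.

```python
# Sketch only: assumes the same Celery app and imports as app.py above.
from subprocess import Popen, PIPE, STDOUT

@celery.task()
def workflow_running_stderr(pipeline_path, yaml_file):
    proc = Popen(
        ['snakemake', '--snakefile', pipeline_path + 'Snakefile',
         '--cores', '3', '--configfile', yaml_file],
        stdout=PIPE, stderr=STDOUT, text=True)  # fold snakemake's stderr log into stdout
    for line in proc.stdout:                    # now yields snakemake's log lines
        current_task.update_state(state='PROGRESS', meta={'msg': line.strip()})
    return proc.wait()
```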
4 changes: 3 additions & 1 deletion pipelines/pipeline_controller/requirements.txt
@@ -4,4 +4,6 @@ PyYAML==5.4.1
flask==2.0.1
Flask-WTF==0.15.1
Flask-Bootstrap==3.3.7.1
flask-nav==0.6
flask-nav==0.6
celery==5.1.2
redis==3.5.3
(One changed file could not be rendered.)
40 changes: 40 additions & 0 deletions pipelines/pipeline_controller/templates/progress.html
@@ -0,0 +1,40 @@
{% extends "bootstrap/base.html" %}
{% import "bootstrap/wtf.html" as wtf %}

{% block navbar %}
{{nav.mynavbar.render()}}
{% endblock %}

{% block content %}
<div class="container">
  <h3>Workflow controller</h3>
  <p>This controller generates a proper snakemake config file to run your samples</p>

  <div id="pct"></div>

  <script src="//code.jquery.com/jquery-2.1.1.min.js"></script>
  <script>
    function poll() {
      $.ajax("{{url_for('.workflow_progress', jobid=JOBID)}}", {
        dataType: "json",
        success: function(resp) {
          console.log(resp);
          if (resp.msg == 'done') {
            $("#pct").html("<b>Workflow has been completed</b>");
            return;
          }
          // still running: show the spinner and poll again in a second
          $("#pct").html("<img src='/static/spinning-loading.gif'>");
          setTimeout(poll, 1000);
        }
      });
    }
    $(function() {
      var JOBID = "{{ JOBID }}";
      poll();
    });
  </script>

  <p>Copyright 2021 <a href="https://github.com/OpenKBC/multiple_sclerosis_proj">OpenKBC repository</a></p>
</div>
{% endblock %}
24 changes: 21 additions & 3 deletions utils/lib/externalHandler.py
@@ -1,15 +1,33 @@
import pandas as pd
import numpy as np
import itertools

class handlers(object):
    def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
    def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t", opt=0):
        """
        filename_with_path = filepath + basename
        ext_value = column name of file
        annot = column name used as the index
        header_line = row index of the header line
        sep = separator
        """
        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
        return temp[[ext_value]]

        # Don't use pandas.read_csv because of memory usage
        index_list = []
        value_list = []
        with open(filename_with_path, 'r') as infile:
            for i, line in enumerate(infile):
                line = line.strip()
                if i==header_line: # found header
                    header_info = line.split(sep)
                    value_ext_location = header_info.index(ext_value) # location of value extraction point
                    index_ext_location = header_info.index(annot) # location of index extraction point

                elif i!=header_line:
                    line_list = line.split(sep)
                    index_list.append(str(line_list[index_ext_location])) # Index list
                    value_list.append(float(line_list[value_ext_location])) # Value list

        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
        return result_df

    def get_samplename(filelist):
        """
