Skip to content

Commit

Permalink
Fixed minor bugs and data analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
swiri021 committed Sep 30, 2021
1 parent e3c313c commit 077f80c
Show file tree
Hide file tree
Showing 11 changed files with 823 additions and 441 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Testing
tester.py

# s3 entire data
data/
*.pem
Expand Down
365 changes: 365 additions & 0 deletions notebook/feature_test_with_DEG.ipynb

Large diffs are not rendered by default.

404 changes: 404 additions & 0 deletions notebook/feature_test_with_act.ipynb

Large diffs are not rendered by default.

430 changes: 0 additions & 430 deletions notebook/notebook_archive/Jun09262021/SVM_test.ipynb

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
import pandas as padj
8 changes: 5 additions & 3 deletions pipelines/deg_pipeline/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ __email__ = "[email protected]"
# Base DEG pipeline using DESeq2; this workflow can be extended with more functions

# For manual running, please use this one
#configfile: "config.yaml"
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
#

pipeline_path = '/pipelines/deg_pipeline/'

#pipeline_path = '/pipelines/deg_pipeline/'
pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
SAMPLES = ['CD4','CD8','CD14']

rule all:
Expand Down
23 changes: 20 additions & 3 deletions pipelines/deg_pipeline/import_utils/lib/externalHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,31 @@
import itertools

class handlers(object):
def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t"):
    """
    Read one numeric column of a delimited text file into a single-column DataFrame.

    filename_with_path = filepath + basename
    ext_value = column name of file to extract; values are converted to float
    annot = column name whose values become the (string) index of the result
    header_line = 0-based line number of the header row; earlier lines are skipped
    sep = separator

    Returns a DataFrame with one column named ext_value, indexed by the annot values.
    Raises ValueError if ext_value or annot is missing from the header or a value
    is not numeric, and IndexError if a data row has too few fields.
    """
    # Don't use pandas.read_csv because of memory usage
    index_list = []
    value_list = []
    value_ext_location = None
    index_ext_location = None
    with open(filename_with_path, 'r') as infile:
        for i, line in enumerate(infile):
            if i < header_line:
                continue  # preamble before the header carries no data

            # Strip only the line ending before splitting: stripping the whole
            # line would drop a leading/trailing empty field and shift every
            # column position relative to the header.
            fields = [field.strip() for field in line.rstrip('\r\n').split(sep)]

            if i == header_line:  # found header
                value_ext_location = fields.index(ext_value)  # position of the value column
                index_ext_location = fields.index(annot)  # position of the index column
                continue

            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at EOF)

            index_list.append(str(fields[index_ext_location]))  # Index list
            value_list.append(float(fields[value_ext_location]))  # Value list

    return pd.DataFrame(data={ext_value: value_list}, index=index_list)

def get_samplename(filelist):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# metafile = "./sample_CD4_meta.csv"
# outputfile = "./CD4_DEG.csv"

library(tidyverse)
#library(tidyverse)
library(DESeq2)
library(tximport)

Expand Down
3 changes: 3 additions & 0 deletions pipelines/feature_extraction_pipeline/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ __email__ = "[email protected]"

# For manual running, please use this one
# configfile: "config.yaml"
# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/feature_extraction_pipeline/'
#

pipeline_path = '/pipelines/feature_extraction_pipeline/'

SAMPLES = ['CD4','CD8','CD14']

rule all:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""
Description: Repetitive functions used in notebooks
"""
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
Expand Down
24 changes: 21 additions & 3 deletions utils/lib/externalHandler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,33 @@
import pandas as pd
import numpy as np
import itertools

class handlers(object):
def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t", opt=0):
    """
    Read one numeric column of a delimited text file into a single-column DataFrame.

    filename_with_path = filepath + basename
    ext_value = column name of file to extract; values are converted to float
    annot = column name whose values become the (string) index of the result
    header_line = 0-based line number of the header row; earlier lines are skipped
    sep = separator
    opt = accepted for call compatibility; not used by this function

    Returns a DataFrame with one column named ext_value, indexed by the annot values.
    Raises ValueError if ext_value or annot is missing from the header or a value
    is not numeric, and IndexError if a data row has too few fields.
    """
    # Don't use pandas.read_csv because of memory usage
    index_list = []
    value_list = []
    value_ext_location = None
    index_ext_location = None
    with open(filename_with_path, 'r') as infile:
        for i, line in enumerate(infile):
            if i < header_line:
                continue  # preamble before the header carries no data

            # Strip only the line ending before splitting: stripping the whole
            # line would drop a leading/trailing empty field and shift every
            # column position relative to the header.
            fields = [field.strip() for field in line.rstrip('\r\n').split(sep)]

            if i == header_line:  # found header
                value_ext_location = fields.index(ext_value)  # position of the value column
                index_ext_location = fields.index(annot)  # position of the index column
                continue

            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at EOF)

            index_list.append(str(fields[index_ext_location]))  # Index list
            value_list.append(float(fields[value_ext_location]))  # Value list

    return pd.DataFrame(data={ext_value: value_list}, index=index_list)

def get_samplename(filelist):
"""
Expand Down

0 comments on commit 077f80c

Please sign in to comment.