Fixed minor bugs and data analysis

OpenKBC · Sep 30, 2021 · 077f80c · 077f80c
1 parent e3c313c
commit 077f80c
Show file tree

Hide file tree

Showing 11 changed files with 823 additions and 441 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# Testing
+tester.py
+
 # s3 entire data
 data/
 *.pem

diff --git a/notebook/feature_test_with_DEG.ipynb b/notebook/feature_test_with_DEG.ipynb
diff --git a/notebook/feature_test_with_act.ipynb b/notebook/feature_test_with_act.ipynb
diff --git a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb b/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
diff --git a/notebook/notebook_archive/Jun09262021/feature_test_with_DEG.ipynb b/notebook/notebook_archive/Jun09262021/feature_test_with_DEG.ipynb
@@ -0,0 +1 @@
+import pandas as padj
diff --git a/pipelines/deg_pipeline/Snakefile b/pipelines/deg_pipeline/Snakefile
@@ -6,10 +6,12 @@ __email__ = "[email protected]"
 # Base DEG pipeline using DESeq2; this workflow can be extended with more functions
 
 # For manual running, please use this one
-#configfile: "config.yaml"
+# configfile: "config.yaml" 
+# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
+# 
+
+pipeline_path = '/pipelines/deg_pipeline/'
 
-#pipeline_path = '/pipelines/deg_pipeline/'
-pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
 SAMPLES = ['CD4','CD8','CD14']
 
 rule all:

diff --git a/pipelines/deg_pipeline/import_utils/lib/externalHandler.py b/pipelines/deg_pipeline/import_utils/lib/externalHandler.py
@@ -2,14 +2,31 @@
 import itertools
 
 class handlers(object):
-    def get_column(filename_with_path,  ext_value, annot='gene_id', sep="\t"):
+    def get_column(filename_with_path,  ext_value, annot='gene_id', header_line=0, sep="\t"):
         """
         filename_with_path = filepath + basename
         ext_value = column name of file
         sep = separator
         """
-        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
-        return temp[[ext_value]]
+
+        # Don't use pandas.read_csv because of memory usage
+        index_list = []
+        value_list = []
+        with open(filename_with_path, 'r') as infile:
+            for i, line in enumerate(infile):
+                line = line.strip()
+                if i==header_line: # found header
+                    header_info = line.split(sep)
+                    value_ext_location = header_info.index(ext_value) # location of value extraction point
+                    index_ext_location = header_info.index(annot) # location of index (annotation) extraction point
+
+                elif i!=header_line:
+                    line_list = line.split(sep)
+                    index_list.append(str(line_list[index_ext_location])) # Index list
+                    value_list.append(float(line_list[value_ext_location])) # Value list
+
+        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
+        return result_df
 
     def get_samplename(filelist):
         """

diff --git a/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R b/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R
@@ -6,7 +6,7 @@
 # metafile = "./sample_CD4_meta.csv"
 # outputfile = "./CD4_DEG.csv"
 
-library(tidyverse)
+#library(tidyverse)
 library(DESeq2)
 library(tximport)
 

diff --git a/pipelines/feature_extraction_pipeline/Snakefile b/pipelines/feature_extraction_pipeline/Snakefile
@@ -7,8 +7,11 @@ __email__ = "[email protected]"
 
 # For manual running, please use this one
 # configfile: "config.yaml"
+# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/feature_extraction_pipeline/'
+#
 
 pipeline_path = '/pipelines/feature_extraction_pipeline/'
+
 SAMPLES = ['CD4','CD8','CD14']
 
 rule all:

diff --git a/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py b/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py
@@ -6,7 +6,6 @@
 """
 Description: Repetitive functions in notebook
 """
-import matplotlib.pyplot as plt
 from sklearn.svm import SVC
 from sklearn.model_selection import StratifiedKFold
 from sklearn.feature_selection import RFECV

diff --git a/utils/lib/externalHandler.py b/utils/lib/externalHandler.py
@@ -1,15 +1,33 @@
 import pandas as pd
+import numpy as np
 import itertools
 
 class handlers(object):
-    def get_column(filename_with_path,  ext_value, annot='gene_id', sep="\t"):
+    def get_column(filename_with_path,  ext_value, annot='gene_id', header_line=0, sep="\t", opt=0):
         """
         filename_with_path = filepath + basename
         ext_value = column name of file
         sep = separator
         """
-        temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
-        return temp[[ext_value]]
+
+        # Don't use pandas.read_csv because of memory usage
+        index_list = []
+        value_list = []
+        with open(filename_with_path, 'r') as infile:
+            for i, line in enumerate(infile):
+                line = line.strip()
+                if i==header_line: # found header
+                    header_info = line.split(sep)
+                    value_ext_location = header_info.index(ext_value) # location of value extraction point
+                    index_ext_location = header_info.index(annot) # location of index (annotation) extraction point
+
+                elif i!=header_line:
+                    line_list = line.split(sep)
+                    index_list.append(str(line_list[index_ext_location])) # Index list
+                    value_list.append(float(line_list[value_ext_location])) # Value list
+
+        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
+        return result_df
 
     def get_samplename(filelist):
         """