diff --git a/.gitignore b/.gitignore
index 1e8f214..ff31749 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Testing
+tester.py
+
# s3 entire data
data/
*.pem
diff --git a/notebook/feature_test_with_DEG.ipynb b/notebook/feature_test_with_DEG.ipynb
new file mode 100644
index 0000000..6786a16
--- /dev/null
+++ b/notebook/feature_test_with_DEG.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
+ "\n",
+ "deg_path = \"resultFiles/DEG_RRvsCIS_by_Jun/\"\n",
+ "expr_path = \"../data/counts_normalized/rawFiles/\"\n",
+ "deg_df = pd.read_csv(deg_path+\"CD4_DEG.result\",sep=' ', index_col=0).dropna()\n",
+ "sig_df = deg_df.loc[(deg_df['pvalue']<0.05)]\n",
+ "sig_df = sig_df.loc[(sig_df['log2FoldChange'] > 1) | (sig_df['log2FoldChange'] < -1)]\n",
+ "\n",
+ "expr_df = pd.read_csv(expr_path+\"counts_norm_CD4.csv\", index_col=0)\n",
+ "expr_df.loc[sig_df.index.tolist()]\n",
+ "expr_df.columns = [x.split('.')[0] for x in expr_df.columns.tolist()]\n",
+ "expr_df = expr_df.applymap(lambda x : np.log2(x+1))\n",
+ "expr_df = expr_df.subtract(expr_df.median(axis=1), axis=0)\n",
+ "\n",
+ "meta_data = pd.read_csv('../data/annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv')"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "source": [
+ "sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
+ "sample_list[0] = list(set(expr_df.columns.tolist()).intersection(set(sample_list[0])))\n",
+ "sample_list[4] = list(set(expr_df.columns.tolist()).intersection(set(sample_list[4])))\n",
+ "ext_samples = sample_list[0] + sample_list[4] # RR + CIS\n",
+ "\n",
+ "ext_category = [0]*len(sample_list[0])+[1]*len(sample_list[4])\n",
+ "\n",
+ "expr_df = expr_df[ext_samples].loc[sig_df.index]\n",
+ "expr_df = expr_df.replace(0, np.nan).dropna(thresh=len(expr_df.columns)-2).replace(np.nan, 0)\n"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "source": [
+ "len(ext_samples)"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "119"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "source": [
+ "X = expr_df.T.values\n",
+ "y = ext_category"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "auc_arr = []\n",
+ "val_auc = []\n",
+ "\n",
+ "for t in list(range(0,100)):\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=t)\n",
+ " X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n",
+ "\n",
+ " #randomState = list(range(0,5))\n",
+ "\n",
+ " clf = SVC(kernel=\"linear\")\n",
+ " clf.fit(X_train, y_train)\n",
+ "\n",
+ " y_pred = clf.predict(X_test)\n",
+ " fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)\n",
+ " auc_arr.append([t, metrics.auc(fpr, tpr)])\n",
+ " \n",
+ " y_val_pred = clf.predict(X_val)\n",
+ " fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred, pos_label=1)\n",
+ " val_auc.append([t, metrics.auc(fpr, tpr)])\n",
+ "\n",
+ "auc_test_df = pd.DataFrame(data=auc_arr, columns=['state', 'auc']).set_index('state')\n",
+ "auc_val_df = pd.DataFrame(data=val_auc, columns=['state', 'auc']).set_index('state')"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "source": [
+ "auc_df = pd.concat([auc_test_df, auc_val_df], axis=1)\n",
+ "auc_df.columns = ['test_auc', 'val_auc']\n",
+ "auc_df['diff'] = auc_df['test_auc'] - auc_df['val_auc']\n",
+ "auc_df"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " test_auc | \n",
+ " val_auc | \n",
+ " diff | \n",
+ "
\n",
+ " \n",
+ " state | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.718750 | \n",
+ " 0.755556 | \n",
+ " -0.036806 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.708333 | \n",
+ " 0.888889 | \n",
+ " -0.180556 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.626050 | \n",
+ " 0.584034 | \n",
+ " 0.042017 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.685714 | \n",
+ " 0.773684 | \n",
+ " -0.087970 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.900000 | \n",
+ " 0.642857 | \n",
+ " 0.257143 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 95 | \n",
+ " 0.750000 | \n",
+ " 0.611888 | \n",
+ " 0.138112 | \n",
+ "
\n",
+ " \n",
+ " 96 | \n",
+ " 0.638655 | \n",
+ " 0.697479 | \n",
+ " -0.058824 | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " 0.688889 | \n",
+ " 0.687500 | \n",
+ " 0.001389 | \n",
+ "
\n",
+ " \n",
+ " 98 | \n",
+ " 0.677778 | \n",
+ " 0.750000 | \n",
+ " -0.072222 | \n",
+ "
\n",
+ " \n",
+ " 99 | \n",
+ " 0.742857 | \n",
+ " 0.888889 | \n",
+ " -0.146032 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " test_auc val_auc diff\n",
+ "state \n",
+ "0 0.718750 0.755556 -0.036806\n",
+ "1 0.708333 0.888889 -0.180556\n",
+ "2 0.626050 0.584034 0.042017\n",
+ "3 0.685714 0.773684 -0.087970\n",
+ "4 0.900000 0.642857 0.257143\n",
+ "... ... ... ...\n",
+ "95 0.750000 0.611888 0.138112\n",
+ "96 0.638655 0.697479 -0.058824\n",
+ "97 0.688889 0.687500 0.001389\n",
+ "98 0.677778 0.750000 -0.072222\n",
+ "99 0.742857 0.888889 -0.146032\n",
+ "\n",
+ "[100 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "source": [
+ "import seaborn as sns\n",
+ "sns.distplot(auc_test_df['auc'].values.tolist())\n",
+ "sns.distplot(auc_val_df['auc'].values.tolist())"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n",
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "source": [
+ "sns.distplot(auc_df['diff'].values.tolist())"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [],
+ "outputs": [],
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "orig_nbformat": 4,
+ "language_info": {
+ "name": "python",
+ "version": "3.8.2",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3.8.2 64-bit ('r-py-test': conda)"
+ },
+ "interpreter": {
+ "hash": "7508a6b53ffb04362d156591e4bfb20c197555e37f3cce3b1ec90fd899bbfe63"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/notebook/feature_test_with_act.ipynb b/notebook/feature_test_with_act.ipynb
new file mode 100644
index 0000000..851f954
--- /dev/null
+++ b/notebook/feature_test_with_act.ipynb
@@ -0,0 +1,404 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "source": [
+ "import pandas as pd\n",
+ "## Utils and Library for notebook\n",
+ "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
+ "\n",
+ "# Root data path\n",
+ "DATA_PATH = '../data/'\n",
+ "\n",
+ "#Data loading\n",
+ "df = pd.read_csv(\"resultFiles/featureExtractionV5_by_Jun/CD4.Ranksum.RFECV.act.csv\", engine='c', index_col=0)\n",
+ "meta_data = pd.read_csv(DATA_PATH+'annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv')\n"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "source": [
+ "## Utils and Library for notebook\n",
+ "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
+ "import itertools\n",
+ "def _LoadDiseaseDuration(df, meta_data, returntype='long'):\n",
+ " \"\"\"\n",
+ " df : Expression or activation score matrix\n",
+ " meta_data : meta data which contains duration and sample ID\n",
+ " output: two sample lists -- long DD and short DD samples ('long'), healthy and short DD samples ('healthy'), or the samples of the two comma-separated categories (e.g. 'RR,CIS')\n",
+ " \"\"\"\n",
+ " # checking multiple element for returntype\n",
+ " if returntype.count(',')>1: raise ValueError('No more than 2 elements for returntype')\n",
+ "\n",
+ " if returntype.find(',')==-1: # if returnType is single(long and healthy)\n",
+ " # Sample by disease category\n",
+ " sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
+ " \n",
+ " # Sort by disease category and exclude unknown samples\n",
+ " patient_samples = [] # patient samples\n",
+ " healthy_samples = [] # healthy samples\n",
+ " for samples, category in zip(sample_list, sample_category):\n",
+ " if category=='Healthy':\n",
+ " healthy_samples = samples\n",
+ " else:\n",
+ " if category!='Unknown':# Excluding unknown samples\n",
+ " patient_samples.append(samples)\n",
+ "\n",
+ " patient_samples = list(itertools.chain(*patient_samples)) # flatten\n",
+ " patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ " healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ " patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata\n",
+ "\n",
+ " longDD_samples, shortDD_samples = exttoolkit.get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 25)\n",
+ " longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ " shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ "\n",
+ " else: # if returnType is multiple(List)\n",
+ " # Sample by disease category\n",
+ " sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
+ " category1 = returntype.split(',')[0]\n",
+ " category2 = returntype.split(',')[1]\n",
+ " \n",
+ " # Sort by disease category and exclude unknown samples\n",
+ " patient_samples = [] # patient samples\n",
+ " healthy_samples = [] # healthy samples\n",
+ " for samples, category in zip(sample_list, sample_category):\n",
+ " if category==category1:\n",
+ " category1_samples = list(set(samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ " elif category==category2:\n",
+ " category2_samples = list(set(samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
+ "\n",
+ " # return result\n",
+ " if returntype=='long':\n",
+ " return longDD_samples, shortDD_samples\n",
+ " elif returntype=='healthy':\n",
+ " return healthy_samples, shortDD_samples\n",
+ " else:\n",
+ " return category1_samples, category2_samples\n"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "source": [
+ "df_cd4 = df.copy()\n",
+ "longDD_samples, shortDD_samples = _LoadDiseaseDuration(df_cd4, meta_data, 'RR,CIS')\n",
+ "df_cd4 = df_cd4[longDD_samples+shortDD_samples]\n",
+ "df_cd4 = df_cd4.subtract(df_cd4.median(axis=1), axis=0)\n",
+ "\n",
+ "\n",
+ "X = df_cd4.T.values # Training sample\n",
+ "y = [0]*len(longDD_samples)+[1]*len(shortDD_samples) # Training y\n",
+ "X.shape"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(119, 556)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "auc_arr = []\n",
+ "val_auc = []\n",
+ "\n",
+ "for t in list(range(0,100)):\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=t)\n",
+ " X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n",
+ "\n",
+ " #randomState = list(range(0,5))\n",
+ "\n",
+ " clf = SVC(kernel=\"linear\")\n",
+ " clf.fit(X_train, y_train)\n",
+ "\n",
+ " y_pred = clf.predict(X_test)\n",
+ " fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)\n",
+ " auc_arr.append([t, metrics.auc(fpr, tpr)])\n",
+ " \n",
+ " y_val_pred = clf.predict(X_val)\n",
+ " fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred, pos_label=1)\n",
+ " val_auc.append([t, metrics.auc(fpr, tpr)])\n",
+ "\n",
+ "auc_test_df = pd.DataFrame(data=auc_arr, columns=['state', 'auc']).set_index('state')\n",
+ "auc_val_df = pd.DataFrame(data=val_auc, columns=['state', 'auc']).set_index('state')"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "source": [
+ "auc_df = pd.concat([auc_test_df, auc_val_df], axis=1)\n",
+ "auc_df.columns = ['test_auc', 'val_auc']\n",
+ "auc_df['diff'] = auc_df['test_auc'] - auc_df['val_auc']\n",
+ "auc_df"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " test_auc | \n",
+ " val_auc | \n",
+ " diff | \n",
+ "
\n",
+ " \n",
+ " state | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.875000 | \n",
+ " 0.822222 | \n",
+ " 0.052778 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1.000000 | \n",
+ " 0.944444 | \n",
+ " 0.055556 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.857143 | \n",
+ " 0.970588 | \n",
+ " -0.113445 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.828571 | \n",
+ " 0.873684 | \n",
+ " -0.045113 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.900000 | \n",
+ " 0.900000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 95 | \n",
+ " 0.687500 | \n",
+ " 0.954545 | \n",
+ " -0.267045 | \n",
+ "
\n",
+ " \n",
+ " 96 | \n",
+ " 0.869748 | \n",
+ " 0.941176 | \n",
+ " -0.071429 | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " 0.911111 | \n",
+ " 0.937500 | \n",
+ " -0.026389 | \n",
+ "
\n",
+ " \n",
+ " 98 | \n",
+ " 1.000000 | \n",
+ " 0.888889 | \n",
+ " 0.111111 | \n",
+ "
\n",
+ " \n",
+ " 99 | \n",
+ " 0.950000 | \n",
+ " 0.916667 | \n",
+ " 0.033333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " test_auc val_auc diff\n",
+ "state \n",
+ "0 0.875000 0.822222 0.052778\n",
+ "1 1.000000 0.944444 0.055556\n",
+ "2 0.857143 0.970588 -0.113445\n",
+ "3 0.828571 0.873684 -0.045113\n",
+ "4 0.900000 0.900000 0.000000\n",
+ "... ... ... ...\n",
+ "95 0.687500 0.954545 -0.267045\n",
+ "96 0.869748 0.941176 -0.071429\n",
+ "97 0.911111 0.937500 -0.026389\n",
+ "98 1.000000 0.888889 0.111111\n",
+ "99 0.950000 0.916667 0.033333\n",
+ "\n",
+ "[100 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "source": [
+ "import seaborn as sns\n",
+ "sns.distplot(auc_test_df['auc'].values.tolist())\n",
+ "sns.distplot(auc_val_df['auc'].values.tolist())"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n",
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "source": [
+ "sns.distplot(auc_df['diff'].values.tolist())"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ }
+ }
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [],
+ "outputs": [],
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "orig_nbformat": 4,
+ "language_info": {
+ "name": "python",
+ "version": "3.8.2",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3.8.2 64-bit ('r-py-test': conda)"
+ },
+ "interpreter": {
+ "hash": "7508a6b53ffb04362d156591e4bfb20c197555e37f3cce3b1ec90fd899bbfe63"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb b/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
deleted file mode 100644
index 5cacee6..0000000
--- a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
+++ /dev/null
@@ -1,430 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 115,
- "source": [
- "import pandas as pd\n",
- "## Utils and Library for notebook\n",
- "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
- "\n",
- "# Root data path\n",
- "DATA_PATH = '../data/'\n",
- "\n",
- "#Data loading\n",
- "df = pd.read_csv(\"resultFiles/featureExtractionV2_by_Jun/LongDiseaseDuration/CD4.Ranksum.RFECV.act.csv\", engine='c', index_col=0)\n",
- "meta_data = pd.read_csv(DATA_PATH+'annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv')\n"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "source": [
- "## Utils and Library for notebook\n",
- "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
- "import itertools\n",
- "def _LoadDiseaseDuration(df, meta_data, returntype='long'):\n",
- " \"\"\"\n",
- " df : Expression or activation score matrix\n",
- " meta_data : meta data which contains duration and sample ID\n",
- " output: long DD samples and short DD samples by list, or healthy samples and short DD samples by list\n",
- " \"\"\"\n",
- " # Sample by disease category\n",
- " sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
- " \n",
- " # Sort by disease category and exclude uknown samples\n",
- " patient_samples = [] # patient samples\n",
- " healthy_samples = [] # healthy samples\n",
- " for samples, category in zip(sample_list, sample_category):\n",
- " if category=='Healthy':\n",
- " healthy_samples = samples\n",
- " else:\n",
- " if category!='Unknown':# Excluding unknown samples\n",
- " patient_samples.append(samples)\n",
- "\n",
- " patient_samples = list(itertools.chain(*patient_samples)) # flatten\n",
- " patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
- " healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix\n",
- " patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata\n",
- "\n",
- " longDD_samples, shortDD_samples = exttoolkit.get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 25)\n",
- " longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
- " shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix\n",
- "\n",
- " if returntype=='long':\n",
- " return longDD_samples, shortDD_samples\n",
- " elif returntype=='healthy':\n",
- " return healthy_samples, shortDD_samples"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "source": [
- "df_cd4 = df.copy()\n",
- "longDD_samples, shortDD_samples = _LoadDiseaseDuration(df_cd4, meta_data, 'long')\n",
- "df_cd4 = df_cd4[longDD_samples+shortDD_samples]\n",
- "df_cd4 = df_cd4.subtract(df_cd4.median(axis=1), axis=0)\n",
- "\n",
- "\n",
- "X = df_cd4.T.values # Training sample\n",
- "y = [0]*len(longDD_samples)+[1]*len(shortDD_samples) # Training y\n",
- "X.shape"
- ],
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "(86, 402)"
- ]
- },
- "metadata": {},
- "execution_count": 117
- }
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 118,
- "source": [
- "import matplotlib.pyplot as plt\n",
- "from sklearn.svm import SVC\n",
- "from sklearn.model_selection import StratifiedKFold\n",
- "from sklearn.feature_selection import RFECV\n",
- "\n",
- "## Reference: \n",
- "## https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html\n",
- "\n",
- "estimator = SVC(kernel=\"linear\") # linear\n",
- "min_features_to_select = 1\n",
- "rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(2),\\\n",
- " scoring='accuracy', min_features_to_select=min_features_to_select)\n",
- "rfecv.fit(X, y)\n",
- "\n",
- "print(\"Optimal number of features : %d\" % rfecv.n_features_)\n",
- "\n",
- "# Plot number of features VS. cross-validation scores\n",
- "plt.figure()\n",
- "plt.xlabel(\"Number of features selected\")\n",
- "plt.ylabel(\"Cross validation score (nb of correct classifications)\")\n",
- "plt.plot(range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select), rfecv.grid_scores_)\n",
- "plt.show()"
- ],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Optimal number of features : 259\n"
- ]
- },
- {
- "output_type": "display_data",
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- }
- }
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 119,
- "source": [
- "rfecv.n_features_"
- ],
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "259"
- ]
- },
- "metadata": {},
- "execution_count": 119
- }
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 120,
- "source": [
- "import numpy as np\n",
- "selected_features = df_cd4.index[np.where(rfecv.ranking_==1)] # Top100\n",
- "selected_df = df_cd4.loc[selected_features]\n",
- "\n",
- "fold_change = (selected_df[longDD_samples].mean(axis=1) - selected_df[shortDD_samples].mean(axis=1)).apply(abs)\n",
- "fold_change = fold_change.sort_values(ascending=False)[:200].index.tolist()\n",
- "selected_df = selected_df.loc[fold_change]\n",
- "\n",
- "X = selected_df.T.values\n",
- "y = [0]*len(longDD_samples)+[1]*len(shortDD_samples) # Training y\n",
- "y = np.array(y)"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 121,
- "source": [
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.svm import SVC\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn import metrics\n",
- "\n",
- "auc_arr = []\n",
- "val_auc = []\n",
- "\n",
- "for t in list(range(0,100)):\n",
- " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=t)\n",
- " X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n",
- "\n",
- " #randomState = list(range(0,5))\n",
- "\n",
- " clf = SVC(kernel=\"linear\")\n",
- " clf.fit(X_train, y_train)\n",
- "\n",
- " y_pred = clf.predict(X_test)\n",
- " fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)\n",
- " auc_arr.append([t, metrics.auc(fpr, tpr)])\n",
- " \n",
- " y_val_pred = clf.predict(X_val)\n",
- " fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred, pos_label=1)\n",
- " val_auc.append([t, metrics.auc(fpr, tpr)])\n",
- "\n",
- "auc_test_df = pd.DataFrame(data=auc_arr, columns=['state', 'auc']).set_index('state')\n",
- "auc_val_df = pd.DataFrame(data=val_auc, columns=['state', 'auc']).set_index('state')"
- ],
- "outputs": [],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 125,
- "source": [
- "auc_df = pd.concat([auc_test_df, auc_val_df], axis=1)\n",
- "auc_df.columns = ['test_auc', 'val_auc']\n",
- "auc_df['diff'] = auc_df['test_auc'] - auc_df['val_auc']\n",
- "auc_df"
- ],
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " test_auc | \n",
- " val_auc | \n",
- " diff | \n",
- "
\n",
- " \n",
- " state | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1.000000 | \n",
- " 0.916667 | \n",
- " 0.083333 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.928571 | \n",
- " 0.900000 | \n",
- " 0.028571 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1.000000 | \n",
- " 0.928571 | \n",
- " 0.071429 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1.000000 | \n",
- " 1.000000 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0.900000 | \n",
- " 0.750000 | \n",
- " 0.150000 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 95 | \n",
- " 0.875000 | \n",
- " 1.000000 | \n",
- " -0.125000 | \n",
- "
\n",
- " \n",
- " 96 | \n",
- " 1.000000 | \n",
- " 0.750000 | \n",
- " 0.250000 | \n",
- "
\n",
- " \n",
- " 97 | \n",
- " 1.000000 | \n",
- " 0.928571 | \n",
- " 0.071429 | \n",
- "
\n",
- " \n",
- " 98 | \n",
- " 0.875000 | \n",
- " 1.000000 | \n",
- " -0.125000 | \n",
- "
\n",
- " \n",
- " 99 | \n",
- " 0.833333 | \n",
- " 1.000000 | \n",
- " -0.166667 | \n",
- "
\n",
- " \n",
- "
\n",
- "
100 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " test_auc val_auc diff\n",
- "state \n",
- "0 1.000000 0.916667 0.083333\n",
- "1 0.928571 0.900000 0.028571\n",
- "2 1.000000 0.928571 0.071429\n",
- "3 1.000000 1.000000 0.000000\n",
- "4 0.900000 0.750000 0.150000\n",
- "... ... ... ...\n",
- "95 0.875000 1.000000 -0.125000\n",
- "96 1.000000 0.750000 0.250000\n",
- "97 1.000000 0.928571 0.071429\n",
- "98 0.875000 1.000000 -0.125000\n",
- "99 0.833333 1.000000 -0.166667\n",
- "\n",
- "[100 rows x 3 columns]"
- ]
- },
- "metadata": {},
- "execution_count": 125
- }
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 126,
- "source": [
- "sns.distplot(auc_test_df['auc'].values.tolist())\n",
- "sns.distplot(auc_val_df['auc'].values.tolist())"
- ],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
- " warnings.warn(msg, FutureWarning)\n",
- "/opt/miniconda3/envs/r-py-test/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
- " warnings.warn(msg, FutureWarning)\n"
- ]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "execution_count": 126
- },
- {
- "output_type": "display_data",
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- }
- }
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "source": [],
- "outputs": [],
- "metadata": {}
- }
- ],
- "metadata": {
- "orig_nbformat": 4,
- "language_info": {
- "name": "python",
- "version": "3.8.2",
- "mimetype": "text/x-python",
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "pygments_lexer": "ipython3",
- "nbconvert_exporter": "python",
- "file_extension": ".py"
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3.8.2 64-bit ('r-py-test': conda)"
- },
- "interpreter": {
- "hash": "7508a6b53ffb04362d156591e4bfb20c197555e37f3cce3b1ec90fd899bbfe63"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
diff --git a/notebook/notebook_archive/Jun09262021/feature_test_with_DEG.ipynb b/notebook/notebook_archive/Jun09262021/feature_test_with_DEG.ipynb
new file mode 100644
index 0000000..156d784
--- /dev/null
+++ b/notebook/notebook_archive/Jun09262021/feature_test_with_DEG.ipynb
@@ -0,0 +1 @@
+import pandas as padj
\ No newline at end of file
diff --git a/pipelines/deg_pipeline/README.md b/pipelines/deg_pipeline/README.md
new file mode 100644
index 0000000..1eb1fcc
--- /dev/null
+++ b/pipelines/deg_pipeline/README.md
@@ -0,0 +1,19 @@
+## DEG pipeline (DESeq2) by Jun
+* This workflow generates DEG results using DESeq2; it currently works only with GEO-style datasets
+
+#### Version history
+* Known issue: it has a memory issue when running in Docker
+* v1.0.0 is on the pipeline workflow
+
+#### Requirements
+```shell
+pip install -r requirements.txt
+Rscript installer_Rpackage.R
+```
+
+#### Usage
+* For standalone usage, please edit `config.yaml` before running
+
+```shell
+snakemake --cores 3
+```
\ No newline at end of file
diff --git a/pipelines/deg_pipeline/Snakefile b/pipelines/deg_pipeline/Snakefile
index de8d611..391429e 100644
--- a/pipelines/deg_pipeline/Snakefile
+++ b/pipelines/deg_pipeline/Snakefile
@@ -6,9 +6,12 @@ __email__ = "swiri021@gmail.com"
# Base DEG pipeline by using DESeq2, it could expand to more functions by using this workflow
# For manual running, please use this one
-#configfile: "config.yaml"
+# configfile: "config.yaml"
+# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/deg_pipeline/'
+#
pipeline_path = '/pipelines/deg_pipeline/'
+
SAMPLES = ['CD4','CD8','CD14']
rule all:
diff --git a/pipelines/deg_pipeline/import_utils/lib/externalHandler.py b/pipelines/deg_pipeline/import_utils/lib/externalHandler.py
index b0c73b1..939ea65 100644
--- a/pipelines/deg_pipeline/import_utils/lib/externalHandler.py
+++ b/pipelines/deg_pipeline/import_utils/lib/externalHandler.py
@@ -2,14 +2,31 @@
import itertools
class handlers(object):
- def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
+ def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t"):
"""
filename_with_path = filepath + basename
ext_value = column name of file
sep = separator
"""
- temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
- return temp[[ext_value]]
+
+ # Don't use pandas.read_csv because of memory usage
+ index_list = []
+ value_list = []
+ with open(filename_with_path, 'r') as infile:
+ for i, line in enumerate(infile):
+ line = line.strip()
+ if i==header_line: # found header
+ header_info = line.split(sep)
+ value_ext_location = header_info.index(ext_value) # location of value extraction point
+ index_ext_location = header_info.index(annot) # location of index (annotation column) extraction point
+
+ elif i!=header_line:
+ line_list = line.split(sep)
+ index_list.append(str(line_list[index_ext_location])) # Index list
+ value_list.append(float(line_list[value_ext_location])) # Value list
+
+ result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
+ return result_df
def get_samplename(filelist):
"""
diff --git a/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R b/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R
index c01651c..ee2bb11 100644
--- a/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R
+++ b/pipelines/deg_pipeline/import_utils/step2_DESeq2_calculator.R
@@ -6,7 +6,7 @@
# metafile = "./sample_CD4_meta.csv"
# outputfile = "./CD4_DEG.csv"
-library(tidyverse)
+#library(tidyverse)
library(DESeq2)
library(tximport)
diff --git a/pipelines/feature_extraction_pipeline/Snakefile b/pipelines/feature_extraction_pipeline/Snakefile
index db96798..2279726 100644
--- a/pipelines/feature_extraction_pipeline/Snakefile
+++ b/pipelines/feature_extraction_pipeline/Snakefile
@@ -7,8 +7,11 @@ __email__ = "swiri021@gmail.com"
# For manual running, please use this one
# configfile: "config.yaml"
+# pipeline_path = '/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines/feature_extraction_pipeline/'
+#
pipeline_path = '/pipelines/feature_extraction_pipeline/'
+
SAMPLES = ['CD4','CD8','CD14']
rule all:
diff --git a/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py b/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py
index a94f68a..7266fb8 100644
--- a/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py
+++ b/pipelines/feature_extraction_pipeline/import_ML/lib/statFunction.py
@@ -6,7 +6,6 @@
"""
Description: Repeative functions in notebook
"""
-import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
diff --git a/pipelines/pipeline_controller/app.py b/pipelines/pipeline_controller/app.py
index 9889382..e32ba38 100644
--- a/pipelines/pipeline_controller/app.py
+++ b/pipelines/pipeline_controller/app.py
@@ -24,10 +24,33 @@
from flask_wtf import Form
from wtforms import TextField, SubmitField
+# Celery running
+import json
+from celery import Celery, current_task
+from celery.result import AsyncResult
+from subprocess import Popen, PIPE
+
app = Flask(__name__)
app.config['SECRET_KEY'] = 'swiri021swiri021' # CSRF key
-Bootstrap(app) # set Bootstrap
+
+## Celery setting
+app.config.update(
+ CELERY_BROKER_URL='redis://localhost:6379', # Redis docker
+ CELERY_RESULT_BACKEND='redis://localhost:6379'
+)
+def make_celery(app):
+ celery = Celery(
+ app.import_name,
+ backend=app.config['CELERY_RESULT_BACKEND'],
+ broker=app.config['CELERY_BROKER_URL']
+ )
+ celery.conf.update(app.config)
+ return celery
+celery = make_celery(app)
+
+# set Bootstrap
+Bootstrap(app)
# setting Navigation Bar
nav = Nav(app)
@@ -91,26 +114,43 @@ class SnakeMakeForm(Form):
return render_template('config_yaml_creator.html', form=form)
+@celery.task()
+def workflow_running(pipeline_path, yaml_file):
+ print(pipeline_path, yaml_file)
+
+ proc = Popen(['snakemake', '--snakefile', pipeline_path+'Snakefile', '--cores', str(3), '--configfile', yaml_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
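+ # NOTE(review): this stdout polling does not work with snakemake - presumably because snakemake logs to stderr, which is never read here; TODO confirm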
+ while True:
+ line = proc.stdout.readline()
+ if not line:
+ break
+ print(str(line))
+ current_task.update_state(state='PROGRESS', meta={'msg': str(line)})
+ return 999
+
+@app.route("/workflow_progress")
+def workflow_progress():
+ print("WORKFLOW RETURN")
+ jobid = request.values.get('jobid')
+ if jobid:
+ job = AsyncResult(jobid, app=celery)
+ print(job.state)
+ if job.state == 'PROGRESS':
+ return json.dumps(dict( state=job.state, msg=job.result['msg'],))
+ elif job.state == 'SUCCESS':
+ return json.dumps(dict( state=job.state, msg="done",))
+ return '{}'
+
@app.route("/status")
def workflow_status():
-
pipeline_path = session.get('selected_pipeline', None) # Pipeline path
yaml_file = session.get('yaml_output', None) # yaml file
- ## Running snakemake
- cmd = 'snakemake --snakefile %s --cores 3 --configfile %s'%(pipeline_path+"Snakefile",yaml_file)
- print(cmd)
- try:
- p = subprocess.check_output([cmd], shell=True)
- msg = "Workflow has been completed"
- except subprocess.CalledProcessError as e:
- msg = "Error occur in snakemake, please check log files in pipelines folder"
-
- return render_template('status.html', msg=msg)
+ job = workflow_running.delay(pipeline_path, yaml_file)
+ return render_template('progress.html', JOBID=job.id)
#########Route###########
-
# Parsing function for yaml data, only work 2 layer nested yaml file
def _parsing_yamlFile(workflow_path):
"""
diff --git a/pipelines/pipeline_controller/requirements.txt b/pipelines/pipeline_controller/requirements.txt
index 32409a6..6032c06 100644
--- a/pipelines/pipeline_controller/requirements.txt
+++ b/pipelines/pipeline_controller/requirements.txt
@@ -4,4 +4,6 @@ PyYAML==5.4.1
flask==2.0.1
Flask-WTF==0.15.1
Flask-Bootstrap==3.3.7.1
-flask-nav==0.6
\ No newline at end of file
+flask-nav==0.6
+celery==5.1.2
+redis==3.5.3
\ No newline at end of file
diff --git a/pipelines/pipeline_controller/static/spinning-loading.gif b/pipelines/pipeline_controller/static/spinning-loading.gif
new file mode 100644
index 0000000..e3b78dd
Binary files /dev/null and b/pipelines/pipeline_controller/static/spinning-loading.gif differ
diff --git a/pipelines/pipeline_controller/templates/progress.html b/pipelines/pipeline_controller/templates/progress.html
new file mode 100644
index 0000000..3f99f8b
--- /dev/null
+++ b/pipelines/pipeline_controller/templates/progress.html
@@ -0,0 +1,40 @@
+{% extends "bootstrap/base.html" %}
+{% import "bootstrap/wtf.html" as wtf %}
+
+{% block navbar %}
+ {{nav.mynavbar.render()}}
+{% endblock %}
+
+{% block content %}
+
+
Workflow controller
+
This controller generates proper snakemake config file to run your samples
+
+
+
+
+
+
+
Copyright 2021 OpenKBC repository
+
+{% endblock %}
\ No newline at end of file
diff --git a/utils/lib/externalHandler.py b/utils/lib/externalHandler.py
index b0c73b1..3de4a3f 100644
--- a/utils/lib/externalHandler.py
+++ b/utils/lib/externalHandler.py
@@ -1,15 +1,33 @@
import pandas as pd
+import numpy as np
import itertools
class handlers(object):
- def get_column(filename_with_path, ext_value, annot='gene_id', sep="\t"):
+ def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t", opt=0):
"""
filename_with_path = filepath + basename
ext_value = column name of file
sep = separator
"""
- temp = pd.read_csv(filename_with_path, sep=sep).set_index(annot) # temp loading
- return temp[[ext_value]]
+
+ # Don't use pandas.read_csv because of memory usage
+ index_list = []
+ value_list = []
+ with open(filename_with_path, 'r') as infile:
+ for i, line in enumerate(infile):
+ line = line.strip()
+ if i==header_line: # found header
+ header_info = line.split(sep)
+ value_ext_location = header_info.index(ext_value) # location of value extraction point
+ index_ext_location = header_info.index(annot) # location of index (annotation column) extraction point
+
+ elif i!=header_line:
+ line_list = line.split(sep)
+ index_list.append(str(line_list[index_ext_location])) # Index list
+ value_list.append(float(line_list[value_ext_location])) # Value list
+
+ result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
+ return result_df
def get_samplename(filelist):
"""