-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep7_geneRFECV.py
51 lines (40 loc) · 2.03 KB
/
step7_geneRFECV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"
"""
Description: Mimic of notebook code for pipeline work, please see step2 in Jun notebook archive
Step4 is the same process with a different input (activation score -> gene expression)
"""
import pandas as pd
import argparse
import os
from libraries.metaHandler import metaExt
## call previous step for calling internal function
from libraries.statFunction import StatHandler
# Command-line interface. Parsing runs at module level, so `args` is a
# module-level name that the script body below reads.
parser = argparse.ArgumentParser(prog='step7_geneRFECV.py')
# Input data
# `type=int` makes argparse reject a non-integer threshold immediately with a
# usage message, instead of failing later when the value is converted.
parser.add_argument(
    '-r', '--rthresh', dest='rankingthresh', required=True, type=int,
    help='Threshold for RFECV (integer)')
parser.add_argument(
    '-t', '--type', dest='resultType', default='RR,CIS',
    help='Result type, ex: long, healthy, "RR,CIS" ')
args = parser.parse_args()
# Entry point for snakemake: file paths come from environment variables (not argparse)
if __name__ == "__main__":
    # File locations are supplied through environment variables; a missing
    # variable raises KeyError here.
    env = os.environ
    shared_dir = env['efspoint']  # Main data path, goes to EFS volume
    meta_filename = env['metafile']  # EPIC_HCvB_metadata_baseline_updated-share.csv
    # msigdb.v7.4.entrez.gmt -- read but not used further in this block; the
    # lookup is kept so a missing variable still fails the run.
    msig_path = env['msigDBPATH']
    start_file = env['startFile']  # counts_vst_CD4.csv

    # Input and output names are derived from the start file's base name by
    # swapping '.csv' for the step suffix; both are placed under shared_dir by
    # plain string concatenation.
    start_basename = os.path.basename(start_file)
    step6_path = shared_dir + start_basename.replace('.csv', '.step6.csv')
    step7_path = shared_dir + start_basename.replace('.csv', '.step7.csv')

    # Data loading: the step6 table (first column becomes the index) and the metadata.
    expr = pd.read_csv(step6_path, engine='c', index_col=0)
    metadata = pd.read_csv(shared_dir + meta_filename)

    # Two lists of sample (column) names selected for the requested result type.
    # NOTE(review): presumably long vs. short disease duration -- confirm in metaExt.
    long_ids, short_ids = metaExt._LoadDiseaseDuration(expr, metadata, args.resultType)

    # Keep only the selected sample columns, first group then second group,
    # and drop every row that contains a missing value.
    expr = expr[long_ids + short_ids].dropna()

    # Training data: one row per sample (transposed table). Labels follow the
    # same column order: 0 for the first group, 1 for the second group.
    features = expr.T.values
    n_long = len(long_ids)
    n_short = len(short_ids)
    labels = [0] * n_long + [1] * n_short

    # Feature selection with RFECV; the result is used below to index the table's rows.
    # NOTE(review): exact meaning of the threshold and return value lives in StatHandler.
    threshold = int(args.rankingthresh)
    selected = StatHandler.calculate_RFECV(expr, features, labels, threshold)

    # Write the selected rows to the step7 file.
    expr.loc[expr.index[selected]].to_csv(step7_path)