diff --git a/.gitignore b/.gitignore
index 5cabe15..e287f73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 # s3 entire data
 data/
+*.pem
+credentials
 
 # MAC
 .DS_Store
diff --git a/Dockerfile_jupyterNotebook b/Dockerfile_jupyterNotebook
index d073493..1480cd0 100644
--- a/Dockerfile_jupyterNotebook
+++ b/Dockerfile_jupyterNotebook
@@ -1,4 +1,4 @@
-FROM jupyter/datascience-notebook
+FROM jupyter/base-notebook
 
 COPY notebook/installers/installer_Rpackage.R /installer_Rpackage.R
 COPY notebook/installers/requirements.txt /requirements.txt
diff --git a/README.md b/README.md
index bd49c19..20b9724 100644
--- a/README.md
+++ b/README.md
@@ -34,9 +34,16 @@ docker-compose up # composing up
 # Access jupyter notebook
 http://localhost:8888/token_number
 ```
+
 * Workflow container
 ```shell
 # The controller is not prepared yet; please get into the container to run the snakemake workflow separately
 docker exec -it container_name_of_pipelines bash
 ```
+* AWS module
+```shell
+cd aws_module
+sh aws_module.sh t2.medium
+```
+
diff --git a/aws_module/README.md b/aws_module/README.md
new file mode 100644
index 0000000..d05a95c
--- /dev/null
+++ b/aws_module/README.md
@@ -0,0 +1,38 @@
+## AWS module for running the project
+* This module supports running the project code, pipelines and analysis by launching an AWS EC2 instance. Currently, it creates an EC2 instance from the GitHub code and the project S3 data (limit: below m5.4xlarge). The instance is launched with 200G of volumes in total by default.
+* Please contact the members to get credentials for using this service.
+* It contains AMI mapping JSON files for modifying the initial storage size.
+
+### AWS AMI description
+```
+aws ec2 describe-images --image-ids ami-0f6304b1dde9413d6 # ubuntu 18.04 LTS with Docker
+```
+
+### Requirements on local PC
+```
+apt-get install awscli
+apt-get install jq
+```
+
+### Usage on local PC
+```
+sh aws_module.sh t2.micro # with instance type (t2.micro for testing; maximum: m5.xlarge, m5.2xlarge)
+```
+
+### Requirements for docker
+* This version has a problem with the docker installation in AWS, so docker needs to be installed manually:
+```
+ssh -i MSplatform-key.pem ubuntu@IP_ADDRESS
+```
+
+### File information
+* InstanceLaunch-Info: This file contains the standard information of the EC2 instance you launched (IP address, AZ, etc.)
+* InstanceVolume-Info: This file contains the information of the volume you launched
+* PublicIP: This file contains the public IP address of the EC2 instance you launched
+* MSplatform-key.pem: This is the key for SSH access to the EC2 instance
+
+### Services
+```
+http://yourEC2URL/ # Pipeline Controller
+http://yourEC2URL:8888/?token= # Jupyter Notebook
+```
\ No newline at end of file
diff --git a/aws_module/aws_check_status.sh b/aws_module/aws_check_status.sh
new file mode 100755
index 0000000..09ed8bc
--- /dev/null
+++ b/aws_module/aws_check_status.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+## Purpose of this bash script: poll while the EC2 instance (or EBS volume) is getting ready; once it is ready, the loop exits automatically.
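+## Usage sketch (hypothetical IDs, for illustration only):
+##   sh aws_check_status.sh i-0123456789abcdef0 ec2    # block until the instance is running
+##   sh aws_check_status.sh vol-0123456789abcdef0 ebs  # block until the volume is available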
+objectID=$1
+objectState=dummy
+
+if [ "$2" == "ec2" ]
+then
+    while [ "$objectState" != "running" ];do # wait until the EC2 instance is running
+        sleep 1
+        objStatuses=$(aws ec2 describe-instance-status --instance-id $objectID)
+        objectState=$( jq --jsonargs '.InstanceStatuses | .[] | .InstanceState.Name' <<< "${objStatuses}" )
+        objectState="${objectState%\"}" # Remove trailing double quote from string
+        objectState="${objectState#\"}" # Remove leading double quote from string
+    done
+elif [ "$2" == "ebs" ]
+then
+    while [ "$objectState" != "available" ];do # wait until the EBS volume is available
+        sleep 1
+        objStatuses=$(aws ec2 describe-volumes --volume-ids $objectID)
+        objectState=$( jq --jsonargs '.Volumes | .[] | .State' <<< "${objStatuses}" )
+        objectState="${objectState%\"}" # Remove trailing double quote from string
+        objectState="${objectState#\"}" # Remove leading double quote from string
+    done
+fi
\ No newline at end of file
diff --git a/aws_module/aws_module.sh b/aws_module/aws_module.sh
new file mode 100644
index 0000000..523279e
--- /dev/null
+++ b/aws_module/aws_module.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+## Purpose of this temporary bash script is to handle the project's large volume of data;
+## the default data, code and docker images will be launched on an EC2 instance.
+## Let the members know if you want authorization (credentials) for using AWS; personal use of EC2 is strictly prohibited (it is monitored by the admin).
+## The parsing portion needs to be changed to use AWS tags.
+
+securityGroupID=sg-08946d1b26a30d376 # default security group for EC2 (flask)
+instanceType=$1 # e.g. t2.micro for testing, m5.4xlarge for normal use
+VolumeSize=100 # EBS volume size
+InstanceInfoFile=InstanceLaunch-Info # Instance launch information
+VolumeInfoFile=InstanceVolume-Info # Volume creation information
+PublicIPFile=PublicIP # Public IP information
+PemKeyName=MSplatform-key
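+## A possible simplification for the ID parsing below (untested sketch): instead of
+## grep/IFS string surgery, the CLI's built-in query support could be used, e.g.:
+##   InstanceID=$(aws ec2 run-instances ... --query 'Instances[0].InstanceId' --output text)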
+## Key generation for EC2: if a new key is needed, use these commands
+#aws ec2 create-key-pair --key-name $PemKeyName --query 'KeyMaterial' --output text > MSplatform-key.pem
+## Change the permissions of the pem key
+#chmod -R 400 MSplatform-key.pem
+
+## EC2 instance launch with a modified block-device-mapping (ami-030cd17b75425e48d is the plain ubuntu AMI)
+aws ec2 run-instances --image-id ami-0f6304b1dde9413d6 --block-device-mappings file://mapping_dockerAMI.json \
+--instance-type $instanceType --security-group-ids $securityGroupID --key-name $PemKeyName > $InstanceInfoFile
+
+InstanceIDLine=$(cat $InstanceInfoFile | grep 'InstanceId' | xargs) # Instance ID line from the info file, stripped
+IFS=': ' read -r -a array <<< "$InstanceIDLine" # Split string
+element=${array[1]} # extract the ID field
+InstanceID=$(echo ${element/,/} | xargs) # final cleanup of the instance ID string
+echo "Instance ID: $InstanceID"
+
+AZLine=$(cat $InstanceInfoFile | grep 'AvailabilityZone' | xargs) # Availability Zone line from the info file, stripped
+IFS=': ' read -r -a array <<< "$AZLine" # Split string
+element=${array[1]} # extract the AZ field
+AvailabilityZone=$(echo ${element/,/} | xargs) # final cleanup of the AZ string
+echo "Instance AZ: $AvailabilityZone"
+
+#echo "Check ec2 status before create volume"
+sh aws_check_status.sh $InstanceID ec2 # wait until the EC2 instance is running
+
+ip_addr=$(aws ec2 describe-instances --instance-ids $InstanceID --query 'Reservations[0].Instances[0].PublicIpAddress') # get the public IP of the EC2 instance
+ip_addr="${ip_addr%\"}" # Remove trailing double quote from string
+ip_addr="${ip_addr#\"}" # Remove leading double quote from string
+echo "PublicIP: $ip_addr"
+echo "PublicIP: $ip_addr" > $PublicIPFile
+
+## Volume creation (same AZ as the EC2 instance)
+aws ec2 create-volume --availability-zone $AvailabilityZone --volume-type gp2 --size $VolumeSize > $VolumeInfoFile
+
+VolumeIDLine=$(cat $VolumeInfoFile | grep 'VolumeId' | xargs) # Volume ID line from the info file, stripped
+IFS=': ' read -r -a array <<< "$VolumeIDLine" # Split string
+element=${array[1]} # extract the ID field
+VolumeID=$(echo ${element/,/} | xargs) # final cleanup of the volume ID string
+echo "Volume ID: $VolumeID"
+
+## Volume attach
+echo "Check ebs status before attach-volume"
+sh aws_check_status.sh $VolumeID ebs # wait until the EBS volume is available
+if [[ "$instanceType" == m5.* ]] # glob match on the instance type ($1); m5 instances expose EBS as NVMe devices
+then
+    echo "NVME volume"
+    aws ec2 attach-volume --volume-id $VolumeID --instance-id $InstanceID --device /dev/nvme1n1
+    storageType=nvme
+else
+    aws ec2 attach-volume --volume-id $VolumeID --instance-id $InstanceID --device /dev/sdf
+fi
+sleep 30 # wait while AWS finishes attaching the volume
+
+## Run the installer on the instance
+ssh -i MSplatform-key.pem ubuntu@$ip_addr 'bash -s' < installer.sh
+
+## Move the credentials to EC2 for the S3 connection
+scp -i MSplatform-key.pem credentials ubuntu@$ip_addr:/home/ubuntu/.aws
+
+## S3 sync from the project S3 bucket
+ssh -i MSplatform-key.pem ubuntu@$ip_addr 'bash -s' < s3Sync.sh
+
+#### Running something here
+
+#### Copy result files to S3 here
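+## A possible shape for the result-copy step (untested sketch; the destination prefix is an assumption):
+#ssh -i MSplatform-key.pem ubuntu@$ip_addr 'aws s3 sync /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/resultFiles s3://openkbc-ms-bucket/resultFiles'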
+#### Terminating EC2 here
+#aws ec2 stop-instances --instance-ids $InstanceID
+#aws ec2 detach-volume --volume-id $VolumeID
+#aws ec2 delete-volume --volume-id $VolumeID
+#aws ec2 terminate-instances --instance-ids $InstanceID
\ No newline at end of file
diff --git a/aws_module/docker_setup.sh b/aws_module/docker_setup.sh
new file mode 100644
index 0000000..2956a4e
--- /dev/null
+++ b/aws_module/docker_setup.sh
@@ -0,0 +1,7 @@
+# Install docker-compose
+sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+sudo chmod +x /usr/local/bin/docker-compose
+
+# docker-compose up for the containers
+cd /home/ubuntu/MSProject/multiple_sclerosis_proj
+sudo docker-compose -f docker-compose.AWS.yaml up --detach
\ No newline at end of file
diff --git a/aws_module/installer.sh b/aws_module/installer.sh
new file mode 100644
index 0000000..2c3a567
--- /dev/null
+++ b/aws_module/installer.sh
@@ -0,0 +1,22 @@
+storageType=$1 # storage type (currently not passed in by aws_module.sh)
+
+# Make a usable disk
+sudo mkdir /home/ubuntu/MSProject
+sudo apt update && sudo apt install -y git
+sudo apt-get install -y awscli
+
+sudo mkfs -t ext4 /dev/nvme1n1 # format the attached volume, NVMe device attempt
+sudo mkfs -t ext4 /dev/xvdf # format the attached volume (/dev/sdf), standard device attempt
+
+sudo mount /dev/nvme1n1 /home/ubuntu/MSProject # Mount to the project directory, NVMe attempt
+sudo mount /dev/xvdf /home/ubuntu/MSProject # Mount to the project directory, standard attempt
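+# Optional sanity check (untested sketch): confirm which device was actually formatted and mounted
+# lsblk -o NAME,SIZE,MOUNTPOINT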
+#sudo chown -R ubuntu:ubuntu /home/ubuntu/MSProject
+
+# Download all the code
+cd /home/ubuntu/MSProject # go to the working directory
+sudo git clone https://github.com/OpenKBC/multiple_sclerosis_proj.git # git clone the code
+
+# For the AWS S3 credential
+cd /home/ubuntu
+sudo mkdir .aws
+sudo chown -R ubuntu:ubuntu /home/ubuntu/.aws/
\ No newline at end of file
diff --git a/aws_module/mapping_dockerAMI.json b/aws_module/mapping_dockerAMI.json
new file mode 100644
index 0000000..e042151
--- /dev/null
+++ b/aws_module/mapping_dockerAMI.json
@@ -0,0 +1,20 @@
+[
+    {
+        "DeviceName": "/dev/sda1",
+        "Ebs": {
+            "DeleteOnTermination": true,
+            "SnapshotId": "snap-0a2ad3b40ffe12e08",
+            "VolumeSize": 100,
+            "VolumeType": "gp2",
+            "Encrypted": false
+        }
+    },
+    {
+        "DeviceName": "/dev/sdb",
+        "VirtualName": "ephemeral0"
+    },
+    {
+        "DeviceName": "/dev/sdc",
+        "VirtualName": "ephemeral1"
+    }
+]
\ No newline at end of file
diff --git a/aws_module/mapping_plainAMI.json b/aws_module/mapping_plainAMI.json
new file mode 100644
index 0000000..19b9f7f
--- /dev/null
+++ b/aws_module/mapping_plainAMI.json
@@ -0,0 +1,20 @@
+[
+    {
+        "DeviceName": "/dev/sda1",
+        "Ebs": {
+            "DeleteOnTermination": true,
+            "SnapshotId": "snap-04a6d5008d4a3d51f",
+            "VolumeSize": 100,
+            "VolumeType": "gp2",
+            "Encrypted": false
+        }
+    },
+    {
+        "DeviceName": "/dev/sdb",
+        "VirtualName": "ephemeral0"
+    },
+    {
+        "DeviceName": "/dev/sdc",
+        "VirtualName": "ephemeral1"
+    }
+]
\ No newline at end of file
diff --git a/aws_module/s3Sync.sh b/aws_module/s3Sync.sh
new file mode 100644
index 0000000..576aa92
--- /dev/null
+++ b/aws_module/s3Sync.sh
@@ -0,0 +1,4 @@
+cd /home/ubuntu/MSProject/multiple_sclerosis_proj # default project directory
+sudo mkdir data/
+sudo chown ubuntu:ubuntu data/
+aws s3 sync s3://openkbc-ms-bucket/ data/ # sync the bucket to EC2
\ No newline at end of file
diff --git a/docker-compose.AWS.yaml b/docker-compose.AWS.yaml
new file mode 100644
index 0000000..f11465a
--- /dev/null
+++ b/docker-compose.AWS.yaml
@@ -0,0 +1,26 @@
+version: "3"
+services:
+  notebook: # Notebook
+    build:
+      context: .
+      dockerfile: Dockerfile_jupyterNotebook
+    volumes:
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/notebook_lib:/home/jovyan/work/notebook_lib
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/notebook_utils:/home/jovyan/work/notebook_utils
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/notebook_archive:/home/jovyan/work/notebook_archive
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/resultFiles:/home/jovyan/work/resultFiles
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/home/jovyan/MainData
+    ports:
+      - 8888:8888
+    container_name: notebook-container
+
+  pipelines: # Pipelines
+    build:
+      context: .
+      dockerfile: Dockerfile_SnakemakePipeline
+    volumes:
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/MainData
+      - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/resultFiles:/Output
+    ports:
+      - 80:5000
+    container_name: pipeline-container
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index a660d4d..bdc91d6 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,6 +1,5 @@
 version: "3"
 services:
-
   notebook: # Notebook
     build:
       dockerfile: Dockerfile_jupyterNotebook
@@ -12,7 +11,7 @@ services:
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data:/home/jovyan/MainData
     ports:
       - 8888:8888
-    container_name: notebook-container:v1.0.0
+    container_name: notebookContainer
 
   pipelines: # Pipelines
     build:
@@ -22,4 +21,4 @@
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/resultFiles:/Output
     ports:
      - 80:5000
-    container_name: pipeline-container:v1.0.0
\ No newline at end of file
+    container_name: pipelineContainer
\ No newline at end of file
diff --git a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb b/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
index 2c84ef9..5cacee6 100644
--- a/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
+++ b/notebook/notebook_archive/Jun09262021/SVM_test.ipynb
@@ -194,7 +194,7 @@
     "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=t)\n",
     "    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=t)\n",
     "\n",
-    "    randomState = list(range(0,5))\n",
+    "    #randomState = list(range(0,5))\n",
     "\n",
     "    clf = SVC(kernel=\"linear\")\n",
     "    clf.fit(X_train, y_train)\n",
@@ -395,66 +395,6 @@
     ],
     "metadata": {}
    },
-   {
-    "cell_type": "code",
-    "execution_count": 129,
-    "source": [
-     "X_test.shape"
-    ],
-    "outputs": [
-     {
-      "output_type": "execute_result",
-      "data": {
-       "text/plain": [
-        "(17, 200)"
-       ]
-      },
-      "metadata": {},
-      "execution_count": 129
-     }
-    ],
-    "metadata": {}
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 130,
-    "source": [
-     "X_val.shape"
-    ],
-    "outputs": [
-     {
-      "output_type": "execute_result",
-      "data": {
-       "text/plain": [
-        "(18, 200)"
-       ]
-      },
-      "metadata": {},
-      "execution_count": 130
-     }
-    ],
-    "metadata": {}
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 131,
-    "source": [
-     "X_train.shape"
-    ],
-    "outputs": [
-     {
-      "output_type": "execute_result",
-      "data": {
-       "text/plain": [
-        "(51, 200)"
-       ]
-      },
-      "metadata": {},
-      "execution_count": 131
-     }
-    ],
-    "metadata": {}
-   },
    {
     "cell_type": "code",
     "execution_count": null,
diff --git a/pipelines/feature_extraction_pipeline/README.md b/pipelines/feature_extraction_pipeline/README.md
index c2ac1a3..dcb50cf 100644
--- a/pipelines/feature_extraction_pipeline/README.md
+++ b/pipelines/feature_extraction_pipeline/README.md
@@ -1,6 +1,10 @@
 ## Feature Extraction by Jun
 * This workflow generates feature-extracted gene expression data for CD4, CD8 and CD14 cells. It starts from vst (or DESeq2) normalized expression and automatically produces activation scores as an interim result. The activation scores are used for the first step of feature extraction, and the workflow generates a gene matrix with the final gene list.
+#### Version history
+* v1.0.1 adds more sample-splitting functions (step1: _LoadDiseaseDuration)
+* v1.0.0 is the initial pipeline workflow
+
 #### Requirement
 ```shell
 pip install -r requirements.txt
diff --git a/pipelines/feature_extraction_pipeline/import_ML/step1_actscoreDiff.py b/pipelines/feature_extraction_pipeline/import_ML/step1_actscoreDiff.py
index c9641ef..02b89c1 100644
--- a/pipelines/feature_extraction_pipeline/import_ML/step1_actscoreDiff.py
+++ b/pipelines/feature_extraction_pipeline/import_ML/step1_actscoreDiff.py
@@ -47,32 +47,54 @@ def _LoadDiseaseDuration(df, meta_data, returntype='long'):
     meta_data : meta data which contains duration and sample ID
     output: long DD samples and short DD samples by list, or healthy samples and short DD samples by list
     """
-    # Sample by disease category
-    sample_list, sample_category = _get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')
-
-    # Sort by disease category and exclude uknown samples
-    patient_samples = [] # patient samples
-    healthy_samples = [] # healthy samples
-    for samples, category in zip(sample_list, sample_category):
-        if category=='Healthy':
-            healthy_samples = samples
-        else:
-            if category!='Unknown':# Excluding unknown samples
-                patient_samples.append(samples)
+    # check that returntype has at most two comma-separated elements
+    if returntype.count(',')>1: raise ValueError('No more than 2 elements for returntype')
+
+    if returntype.find(',')==-1: # returntype is a single element ('long' or 'healthy')
+        # Sample by disease category
+        sample_list, sample_category = _get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')
+
+        # Sort by disease category and exclude unknown samples
+        patient_samples = [] # patient samples
+        healthy_samples = [] # healthy samples
+        for samples, category in zip(sample_list, sample_category):
+            if category=='Healthy':
+                healthy_samples = samples
+            else:
+                if category!='Unknown': # Excluding unknown samples
+                    patient_samples.append(samples)
+
+        patient_samples = list(itertools.chain(*patient_samples)) # flatten
+        patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix
+        healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix
+        patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata
 
-    patient_samples = list(itertools.chain(*patient_samples)) # flatten
-    patient_samples = list(set(patient_samples).intersection(df.columns.tolist())) # intersected with act score matrix
-    healthy_samples = list(set(healthy_samples).intersection(df.columns.tolist())) # intersected with act score matrix
-    patient_meta = meta_data.loc[meta_data['HCVB_ID'].isin(patient_samples)] # Make patient metadata
+        longDD_samples, shortDD_samples = _get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 25)
+        longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix
+        shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix
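+        # NOTE: 25 is the DiseaseDuration cutoff separating long from short duration samples (units assumed to be years)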
 
-    longDD_samples, shortDD_samples = _get_sample_name_by_contValues(patient_meta, 'HCVB_ID', 'DiseaseDuration', 25)
-    longDD_samples = list(set(longDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix
-    shortDD_samples = list(set(shortDD_samples.values.tolist()).intersection(df.columns.tolist())) # intersected with act score matrix
+    else: # returntype is a comma-separated pair of DiseaseCourse categories
+        # Sample by disease category
+        sample_list, sample_category = _get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')
+        category1 = returntype.split(',')[0]
+        category2 = returntype.split(',')[1]
+
+        # Sort by disease category and exclude unknown samples
+        category1_samples = [] # samples matching the first category
+        category2_samples = [] # samples matching the second category
+        for samples, category in zip(sample_list, sample_category):
+            if category==category1:
+                category1_samples = list(set(samples).intersection(df.columns.tolist())) # intersected with act score matrix
+            elif category==category2:
+                category2_samples = list(set(samples).intersection(df.columns.tolist())) # intersected with act score matrix
+
     # return result
     if returntype=='long':
         return longDD_samples, shortDD_samples
     elif returntype=='healthy':
         return healthy_samples, shortDD_samples
+    else:
+        return category1_samples, category2_samples
 
 # Simple control for snakemake (no argparse)
 actScoreInput=argv[1]