# Sec. Structure Predictor.py
# Predicts the secondary structure of a protein from Chou-Fasman propensity values.
import numpy as np
import requests
import plotext as plt
from tabulate import tabulate
import sys
from colorama import Fore, Style
# Base URLs of the remote services. WEBSITE_API is the UniProt REST root used
# for the FASTA search; PROTEINS_API is not referenced by the code visible in
# this file.
WEBSITE_API = "https://rest.uniprot.org/"
PROTEINS_API = "https://www.ebi.ac.uk/proteins/api"

print("On the basis of Propensity Value(Chou-Fasman Method) prediction of Protein Secondary Structure")

# One-letter amino-acid codes mapped to their full names.
amino_acids = {
    'A': 'Alanine',
    'R': 'Arginine',
    'N': 'Asparagine',
    'D': 'Aspartic Acid',
    'C': 'Cysteine',
    'E': 'Glutamic Acid',
    'Q': 'Glutamine',
    'G': 'Glycine',
    'H': 'Histidine',
    'I': 'Isoleucine',
    'L': 'Leucine',
    'K': 'Lysine',
    'M': 'Methionine',
    'F': 'Phenylalanine',
    'P': 'Proline',
    'S': 'Serine',
    'T': 'Threonine',
    'W': 'Tryptophan',
    'Y': 'Tyrosine',
    'V': 'Valine',
}

# Propensity value of every residue for each structure class. The residue with
# the largest value in a class is the one most strongly assigned to it by
# calculate_structure().
parameters = {
    'Alpha Helix': {
        'A': 1.42, 'R': 0.98, 'N': 0.67, 'D': 0.73, 'C': 0.70,
        'E': 1.51, 'Q': 1.11, 'G': 0.57, 'H': 1.00, 'I': 1.08,
        'L': 1.21, 'K': 1.16, 'M': 1.45, 'F': 1.13, 'P': 0.57,
        'S': 0.76, 'T': 0.82, 'W': 1.08, 'Y': 0.69, 'V': 1.06,
    },
    'Beta Sheet': {
        'A': 0.83, 'R': 0.93, 'N': 0.89, 'D': 1.01, 'C': 1.19,
        'E': 0.74, 'Q': 0.80, 'G': 0.81, 'H': 0.81, 'I': 1.60,
        'L': 1.22, 'K': 0.81, 'M': 1.05, 'F': 1.28, 'P': 0.62,
        'S': 1.13, 'T': 1.20, 'W': 1.14, 'Y': 1.47, 'V': 1.65,
    },
    'Coil': {
        'A': 0.75, 'R': 1.09, 'N': 1.44, 'D': 1.26, 'C': 0.11,
        'E': 0.75, 'Q': 1.09, 'G': 1.62, 'H': 1.19, 'I': 0.32,
        'L': 0.57, 'K': 1.03, 'M': 0.50, 'F': 0.59, 'P': 1.81,
        'S': 0.11, 'T': 0.98, 'W': 0.78, 'Y': 0.84, 'V': 0.29,
    },
    'Turn': {
        'A': 0.66, 'R': 0.95, 'N': 1.56, 'D': 1.46, 'C': 1.19,
        'E': 0.74, 'Q': 0.98, 'G': 1.56, 'H': 0.95, 'I': 0.47,
        'L': 0.59, 'K': 1.01, 'M': 0.60, 'F': 0.60, 'P': 1.52,
        'S': 1.43, 'T': 0.96, 'W': 0.96, 'Y': 1.14, 'V': 0.50,
    },
}
def calculate_structure(probability_matrix, sequence):
    """Predict one secondary-structure symbol per residue.

    Every residue is assigned the structure class for which
    ``probability_matrix`` holds its largest propensity value.

    Args:
        probability_matrix: Mapping with the keys ``'Alpha Helix'``,
            ``'Beta Sheet'``, ``'Coil'`` and ``'Turn'``; each value maps a
            one-letter residue code to that residue's propensity.
        sequence: Iterable of one-letter residue codes (typically a string).

    Returns:
        A string with one symbol per residue: ``'H'`` (alpha helix),
        ``'E'`` (beta sheet), ``'C'`` (coil) or ``'T'`` (turn).

    Raises:
        KeyError: If a structure class is missing from
            ``probability_matrix`` or a residue is missing from one of
            its tables.
    """
    # The order is significant: max() keeps the first candidate on a tie, so
    # equal propensities resolve as helix > sheet > coil > turn.
    candidates = [
        (probability_matrix['Alpha Helix'], 'H'),
        (probability_matrix['Beta Sheet'], 'E'),
        (probability_matrix['Coil'], 'C'),
        (probability_matrix['Turn'], 'T'),
    ]
    structure_prediction = []
    for amino_acid in sequence:
        _, symbol = max(candidates, key=lambda entry: entry[0][amino_acid])
        structure_prediction.append(symbol)
    return ''.join(structure_prediction)
def main():
    """Run the interactive prediction workflow.

    Asks how the sequence will be supplied (pasted FASTA text, or a UniProt
    search by protein name and taxonomy), predicts the secondary structure
    with ``calculate_structure`` and prints the prediction, the percentage
    of each structure class (table and bar chart) and the sequence coloured
    by predicted class.

    Invalid menu input, an empty search result or an empty sequence end the
    run with a message instead of a traceback.

    Raises:
        requests.HTTPError: If the UniProt search answers with an error
            status (the response body is printed first).
    """
    print("1. If you know/have the FASTA sequence, go for the option 1\n2. If you don't know/have the FASTA sequence, go for the option 2")
    try:
        ch = int(input("Enter the choice for opting the way to predict the secondary structure of protein: "))
    except ValueError:
        ch = None

    if ch == 1:
        sequence = _read_pasted_sequence()
    elif ch == 2:
        sequence = _choose_sequence_from_uniprot()
    else:
        print("Invalid choice: please enter 1 or 2.")
        return
    if sequence is None:
        return

    sequence, unknown = _split_known_residues(sequence.upper())
    if unknown:
        # calculate_structure() raises KeyError for symbols it has no
        # propensity for, so they are left out and reported.
        print("Skipped symbols without Chou-Fasman parameters: ", ", ".join(unknown))
    if not sequence:
        print("No amino acids to analyse.")
        return

    print("No. of Amino acids : ", len(sequence))
    prediction = calculate_structure(parameters, sequence)
    print(Fore.CYAN + "Predicted Secondary Structure: ", prediction)
    _print_composition(prediction)
    _print_coloured_sequence(sequence, prediction)


def _read_pasted_sequence():
    """Read stdin until EOF and return the pasted residues as one string."""
    print("Paste the FASTA sequence. Press ENTER, then Ctrl+D to save it.")
    chunks = []
    while True:
        try:
            line = input()
        except EOFError:
            break
        # A '>' line is the FASTA description, not part of the sequence.
        if line.startswith(">"):
            continue
        chunks.append(line)
    # Drop blanks and tabs that may come along with the pasted text.
    return "".join("".join(chunks).split())


def _split_known_residues(sequence):
    """Return (residues present in every parameter table, sorted other symbols)."""
    known = set.intersection(*(set(table) for table in parameters.values()))
    kept = "".join(residue for residue in sequence if residue in known)
    return kept, sorted(set(sequence) - known)


def _fetch_fasta(protein_name, taxonomy_type):
    """Return the FASTA text of the UniProt search for the given name/taxonomy."""
    response = requests.get(
        # rstrip avoids a double slash when the base URL ends with '/'.
        f"{WEBSITE_API.rstrip('/')}/uniprotkb/search",
        # Passing the query through ``params`` lets requests escape characters
        # such as '&' or '#' that the user may type in the protein name.
        params={
            "query": f"{protein_name} AND (taxonomy_name={taxonomy_type})",
            "format": "fasta",
        },
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    if not response.ok:
        print(response.text)
        response.raise_for_status()
    return response.text


def _parse_fasta(text):
    """Split FASTA text into (header, sequence) pairs, in input order."""
    records = []
    header = None
    parts = []
    for line in text.splitlines():
        if line.startswith(">"):
            if header is not None:
                records.append((header, "".join(parts)))
            header = line
            parts = []
        elif header is not None and line.isupper():
            parts.append(line)
    if header is not None:
        records.append((header, "".join(parts)))
    return records


def _existence_level(header):
    """Return the integer of the last 'PE=' field of a header, or None."""
    for field in reversed(header.split()):
        if field.startswith("PE="):
            try:
                return int(field[3:])
            except ValueError:
                return None
    return None


def _choose_sequence_from_uniprot():
    """Search UniProt, list the hits and return the sequence the user picks.

    Returns None when nothing was found or the serial number is invalid.
    """
    protein_name = input("Enter the name of the protein: ")
    print("The types of taxonomy")
    print(
        "1. BOVIN for Bovine\n2. CHICK for Chicken\n3. ECOLI for Escherichia coli\n4. HORSE for Horse\n5. HUMAN for Homo sapiens\n6. MAIZE for Maize (Zea mays)\n7. MOUSE for Mouse\n8. PEA for Garden pea (Pisum sativum)\n9. PIG for Pig\n10. RABIT for Rabbit\n11. RAT for Rat\n12. SHEEP for Sheep\n13. SOYBN for Soybean (Glycine max)\n14. TOBAC for Common tobacco (Nicotina tabacum)\n15. WHEAT for Wheat (Triticum aestivum)\n16. YEAST for Baker's yeast (Saccharomyces cerevisiae)\n17. * for ALL(It will have all types of structure)")
    taxonomy_type = input("Enter the taxonomy of whose secondary protein structure is required: ").upper()

    records = _parse_fasta(_fetch_fasta(protein_name, taxonomy_type))
    if not records:
        print("No protein was found for this search.")
        return None

    # UniProt's protein-existence (PE) scale runs from 1 (evidence at protein
    # level) to 5 (uncertain), so the best-supported entries have the LOWEST
    # value. sorted() is stable: equal levels keep the order of the response.
    # Headers without a readable PE field go last.
    def rank(record):
        level = _existence_level(record[0])
        return (level is None, level if level is not None else 0)

    ranked = sorted(records, key=rank)
    for number, (header, residues) in enumerate(ranked, start=1):
        print(Fore.YELLOW + "(", number, ") ", header)
        print(Style.RESET_ALL, end='')
        print(residues)
    print(Fore.LIGHTRED_EX+"Note:- Lower the PE value, stronger is the evidence for the Existence of the Protein (1 = protein level, 5 = uncertain)\n\t The list has been ordered as Lowest PE value at the top, and increases as we go down.")
    print(Style.RESET_ALL, end='')

    try:
        serial = int(input("Enter the serial number of the protein: "))
    except ValueError:
        serial = 0
    # Reject 0 and negatives explicitly: they would index from the end.
    if not 1 <= serial <= len(ranked):
        print("Invalid serial number.")
        return None
    return ranked[serial - 1][1]


def _print_composition(prediction):
    """Print the percentage of each structure class as a table and a bar chart."""
    total = len(prediction)
    labels = ["alpha-helix(H)", "beta-sheet(E)", "coil(C)", "turn(T)"]
    percentages = [(prediction.count(symbol) / total) * 100 for symbol in "HECT"]
    data = [[label, percent] for label, percent in zip(labels, percentages)]
    print(Fore.YELLOW + tabulate(data, headers=["Types of Bond", "Percentage"]))
    plt.simple_bar(labels, percentages, width=100, title='The Graphical representation of % of helix, sheet, turn, coil')
    plt.show()


def _print_coloured_sequence(sequence, prediction):
    """Print the legend and the sequence, each residue coloured by its class."""
    print(Fore.LIGHTRED_EX+"\t1. Red - Alpha-Helix(H)")
    print(Fore.LIGHTGREEN_EX + "\t2. Green - Beta-Sheet(E)")
    print(Fore.LIGHTBLUE_EX + "\t3. Blue - Coil(C)")
    print(Fore.LIGHTYELLOW_EX + "\t4. Yellow - Turn(T)")
    print(Style.BRIGHT+Fore.LIGHTWHITE_EX+"\tTherefore, the Actual Secondary Structure is:-")
    colours = {
        "H": Fore.LIGHTRED_EX,
        "E": Fore.LIGHTGREEN_EX,
        "C": Fore.LIGHTBLUE_EX,
        "T": Fore.LIGHTYELLOW_EX,
    }
    for position, (residue, state) in enumerate(zip(sequence, prediction), start=1):
        colour = colours.get(state)
        if colour is not None:
            print(colour + residue, end='')
        # Wrap the output every 100 residues.
        if position % 100 == 0:
            print()
    if len(prediction) % 100:
        print()
    # Give the terminal its default colours back.
    print(Style.RESET_ALL, end='')
# Start the interactive workflow only when the file is executed as a script,
# so importing the module does not prompt for input.
if __name__ == "__main__":
    main()