-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_orf_spike_protein.py
50 lines (38 loc) · 1.7 KB
/
find_orf_spike_protein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from Bio import SeqIO
# Open the FASTA file and parse the nucleotide sequence
### find orf and
def find_orf(scaffolds, start_pos = 21562, end_pos = 25384):
with open(scaffolds, "r") as handle:
record = SeqIO.read(handle, "fasta")
nucleotide_seq = record.seq
s_gene_start = start_pos # Python uses 0-based indexing
s_gene_end = end_pos
# Find the closest ATG near the annotated start position
start_codon = "ATG"
search_start = max(0, s_gene_start - 50)
search_end = s_gene_start + 50
closest_atg_pos = -1
min_distance = float('inf')
for pos in range(search_start, search_end):
if record.seq[pos:pos+3] == start_codon:
distance = abs(s_gene_start - pos)
if distance < min_distance:
closest_atg_pos = pos
min_distance = distance
if closest_atg_pos != -1:
adjusted_s_gene_start = closest_atg_pos
adjusted_s_gene_sequence = record.seq[adjusted_s_gene_start:s_gene_end]
adjusted_spike_protein_sequence = adjusted_s_gene_sequence.translate()
print("Adjusted spike glycoprotein sequence:")
print(adjusted_spike_protein_sequence)
else:
print("No start codon found near the annotated S gene start position.")
return adjusted_spike_protein_sequence
def save_protein_fasta(protein_sequence, output_file, header=">protein"):
with open(output_file, "w") as f:
f.write(header + "\n")
f.write(str(protein_sequence) + "\n")
# Replace with the path where you want to save the file
output_file = "spike_glycoprotein.fasta"
adjusted_spike_protein_sequence = find_orf(scaffolds)
save_protein_fasta(adjusted_spike_protein_sequence, output_file)