NCBI.asm.summary.py
#!/usr/bin/env python3
import os
import sys
from argparse import ArgumentParser
from csv import reader
from shutil import rmtree
from tempfile import mkdtemp
import sqlite3
def parseArgs():
    parser = ArgumentParser(description='Creates (or updates) a SQLite3 '
        'database of genome assembly information from NCBI\'s FTP site '
        '(or a given summary file)', add_help=False,
        epilog='The output database file is especially useful for retrieving '
        'genome assembly files with the aid of NCBI.asm.fetch.py')
    req = parser.add_argument_group('Required')
    req.add_argument('-o', '--output', required=True, metavar='FILE',
        help='output SQLite3 database file')
    opt = parser.add_argument_group('Optional')
    opt.add_argument('-c', '--compress', action='store_true', default=False,
        help='compress the SQLite3 output database; especially useful when '
        'the infile database entries overwrite the fetched or provided '
        'summary data or add many new entries [off]')
    opt.add_argument('-d', '--db', choices=['genbank', 'refseq'],
        default='refseq', help='database to summarize from NCBI [refseq]')
    opt.add_argument('-f', '--force', action='store_true', default=False,
        help='when an input SQL entry has an assembly_accession that '
        'conflicts with the fetched or provided summary data, '
        'overwrite/replace the summary data entry with the infile database '
        'entry [off]')
    opt.add_argument('-g', '--group', choices=['all', 'archaea', 'bacteria',
        'fungi', 'invertebrate', 'metagenomes', 'other', 'plant', 'protozoa',
        'vertebrate_mammalian', 'vertebrate_other', 'viral'],
        metavar='{archaea, bacteria, viral, ...}',
        default='bacteria', help='group of assembly data to fetch [bacteria]')
    opt.add_argument('-h', '--help', action='help',
        help='show this help message and exit')
    opt.add_argument('-i', '--infile', metavar='FILE',
        help='input SQLite3 database file which is appended to the summary '
        'data output; table name must be \'asm_summary\'; see --force for '
        'handling entry conflicts')
    opt.add_argument('-q', '--quiet', action='store_true', default=False,
        help='hide messages when entries are updated or added [off]')
    opt.add_argument('-r', '--rows', metavar='INT', type=int, default=400000,
        help='number of data summary rows to add to the SQLite database at '
        'once; useful for speed optimization; the default takes 5 sec for '
        'refseq whereas naive insertion of 1 row at a time takes 7 min '
        '[400000]')
    opt.add_argument('-s', '--summary-data', metavar='FILE',
        help='tab-delimited local file of assembly summary data (38-column '
        'format as NCBI specifies in ftp://ftp.ncbi.nlm.nih.gov/genomes/'
        'ASSEMBLY_REPORTS/README_assembly_summary.txt) rather than directly '
        'fetching from NCBI\'s FTP site; especially useful if FTP is '
        'blocked or when creating a custom database')
    opt.add_argument('-u', '--url', metavar='STR', type=str, default=None,
        help='web address of an assembly summary data file (38-column); '
        'for bypassing NCBI access (e.g., EMBL-EBI) without locally storing '
        'the file')
    return parser.parse_args()
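
# Illustrative invocations (a sketch based on the options defined above; the
# database file names are placeholders, not part of the script):
#   ./NCBI.asm.summary.py -o refseq_bacteria.db                  # defaults
#   ./NCBI.asm.summary.py -o genbank_viral.db -d genbank -g viral -c
#   ./NCBI.asm.summary.py -o custom.db -s assembly_summary.txt   # local file
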
def sql_open(sql_out):
    try:
        con = sqlite3.connect(sql_out)
        cur = con.cursor()
        return con, cur
    except sqlite3.Error as e:
        sys.stderr.write('ERROR: {}\n'.format(e))
        sys.exit(1)

def sql_compress(cur):
    cur.execute('VACUUM')

def sql_close(con, cur):
    cur.close()
    con.close()
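
# Note on --compress: the VACUUM run by sql_compress() rebuilds the database
# file and reclaims free pages, which can shrink the output considerably
# after many REPLACE operations.
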
def sql_create_table(cur, tbl):
    # NOTE: the 23rd column 'asm_not_live_date' was added some time between
    # 2019 and 2021; before that only the first 22 columns existed.
    # NOTE: as of May 23, 2023 there are 38 columns. The new "group" column
    # had to be renamed "group_name" here to avoid
    # 'sqlite3.OperationalError: near "group": syntax error', because GROUP
    # is a reserved SQL keyword (as in GROUP BY) and cannot be used unquoted
    # as a column name.
    cur.execute('''
CREATE TABLE IF NOT EXISTS {} (
assembly_accession TEXT UNIQUE NOT NULL,
bioproject TEXT,
biosample TEXT,
wgs_master TEXT,
refseq_category TEXT,
taxid INT,
species_taxid INT,
organism_name TEXT,
infraspecific_name TEXT,
isolate TEXT,
version_status TEXT,
assembly_level TEXT,
release_type TEXT,
genome_rep TEXT,
seq_rel_date TEXT,
asm_name TEXT,
submitter TEXT,
gbrs_paired_asm TEXT,
paired_asm_comp TEXT,
ftp_path TEXT NOT NULL,
excluded_from_refseq TEXT,
relation_to_type_material TEXT,
asm_not_live_date TEXT,
assembly_type TEXT,
group_name TEXT,
genome_size INT,
genome_size_ungapped INT,
gc_percent REAL,
replicon_count INT,
scaffold_count INT,
contig_count INT,
annotation_provider TEXT,
annotation_name TEXT,
annotation_date TEXT,
total_gene_count INT,
protein_coding_gene_count INT,
non_coding_gene_count INT,
pubmed_id TEXT
)'''.format(tbl))
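
# Example downstream query (a sketch, assuming a database built by this
# script with its 'asm_summary' table; the .db file name is a placeholder):
#   sqlite3 refseq_bacteria.db "SELECT assembly_accession, organism_name,
#     ftp_path FROM asm_summary WHERE assembly_level='Complete Genome'
#     LIMIT 5;"
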
def sql_data_entry(con, cur, tbl, dat, bulk=False, overwrite=False):
    # 38 '?' placeholders, one per column of the assembly summary table
    placeholders = ('?,' * 38)[:-1]
    verb = 'REPLACE' if overwrite else 'INSERT'
    stmt = '{} INTO {} VALUES({})'.format(verb, tbl, placeholders)
    if bulk:
        cur.executemany(stmt, dat)
    else:
        cur.execute(stmt, dat)
    con.commit()
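
# Note: SQLite '?' parameter substitution only applies to values, so the
# table name above is interpolated with str.format(); 'tbl' is set
# internally (never taken from user input), so that is safe here.
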
def sql_query(cur, tbl, query, quiet):
    cur.execute('SELECT * FROM {} WHERE assembly_accession=?'.format(
        tbl), (query,))
    fnd = cur.fetchall()
    if len(fnd) >= 1:
        if not quiet:
            sys.stderr.write('INFO: {} already found in db\n'.format(query))
        if len(fnd) == 1:
            return fnd[0]
        elif len(fnd) > 1:
            if not quiet:
                sys.stderr.write('WARNING: expected 1 but found {} entries '
                    'for {}. Using first value {}...\n'.format(
                    len(fnd), query, fnd[0]))
            return fnd[0]
    else:
        return False
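
# Example call (hypothetical; the accession is shown only for illustration):
#   hit = sql_query(cur, 'asm_summary', 'GCF_000005845.2', quiet=False)
#   # -> the matching row tuple, or False if the accession is absent
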
def chunk_data(data, rows_per_chunk=5000):
    # Accumulate rows and yield them as lists of rows_per_chunk rows so they
    # can be bulk-inserted with executemany
    chunk = []
    for i, row in enumerate(data):
        if i % rows_per_chunk == 0 and i > 0:
            yield chunk
            chunk = []
        chunk.append([str(s) for s in row])
    yield chunk
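
# For example, with rows_per_chunk=3, ten input rows are yielded as three
# lists of three rows followed by a final list holding the one remaining row.
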
def main():
    opt = parseArgs()
    ofh = os.path.abspath(os.path.expanduser(opt.output))
    if opt.infile:
        ifh = os.path.abspath(os.path.expanduser(opt.infile))
        if ifh == ofh:
            sys.stderr.write('ERROR: in and out files cannot be the same\n')
            sys.exit(1)

    # Prepare or fetch summary file of assembly data
    if opt.summary_data:
        summary_file = os.path.abspath(os.path.expanduser(opt.summary_data))
        with open(summary_file) as f:
            ln = next(f)
            num_col = len(ln.split('\t'))
            if num_col != 38:
                sys.stderr.write('ERROR: expected 38 data columns in the '
                    'input summary file; found {} columns\n'.format(num_col))
                sys.exit(1)
    else:
        tmp = mkdtemp()
        tfh = os.path.join(tmp, 'asm-summary.tab')
        if opt.url is not None:
            url = opt.url
        elif opt.group == 'all':
            url = ('ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/'
                'assembly_summary_{}.txt'.format(opt.db))
        else:
            # e.g., ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
            url = ('ftp://ftp.ncbi.nlm.nih.gov/genomes/{}/{}/'
                'assembly_summary.txt'.format(opt.db, opt.group))
        os.system('curl --max-time 300 --silent --fail --show-error '
            '{} -o {}'.format(url, tfh))
        if os.path.exists(tfh) and os.path.getsize(tfh) > 0:
            summary_file = tfh
        else:
            sys.stderr.write('ERROR: unable to fetch assembly summary file '
                'from NCBI\'s FTP site\n')
            rmtree(tmp)
            sys.exit(1)

    # Output sqlite3 db from fetched or provided assembly data
    tbl = 'asm_summary'
    con, cur = sql_open(ofh)
    sql_create_table(cur, tbl)
    with open(summary_file) as f:
        r = reader((row for row in f if not row.startswith('#')),
            delimiter='\t')
        for chunk in chunk_data(r, rows_per_chunk=opt.rows):
            sql_data_entry(con, cur, tbl, chunk, bulk=True)

    # Handle comparisons of input to output sqlite3 dbs
    if opt.infile:
        sql_close(con, cur)
        con, cur = sql_open(ifh)
        cur.execute('SELECT * FROM {}'.format(tbl))
        input_data = cur.fetchall()
        sql_close(con, cur)
        con, cur = sql_open(ofh)
        if opt.force:
            for chunk in chunk_data(input_data, rows_per_chunk=opt.rows):
                sql_data_entry(con, cur, tbl, chunk, bulk=True,
                    overwrite=True)
        else:
            for entry in input_data:
                a = sql_query(cur, tbl, str(entry[0]), opt.quiet)
                if not a:
                    sql_data_entry(con, cur, tbl, [str(s) for s in entry])

    # Finish and cleanup
    if opt.compress:
        sql_compress(cur)
    sql_close(con, cur)
    if not opt.summary_data:
        rmtree(tmp)

if __name__ == '__main__':
    main()