-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbowfin_code.sh
360 lines (313 loc) · 9.09 KB
/
bowfin_code.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# Assign names to each forward and reverse sequence reads file
fwd1="bowfin_1.fq.gz"
rev1="bowfin_2.fq.gz"
name1="bowfin"
# Trim PE reads
fastp \
-i ${fwd1} \
-I ${rev1} \
-o ${name1}_F.trimmed.fq.gz \
-O ${name1}_R.trimmed.fq.gz \
--detect_adapter_for_pe \
--cut_front \
--cut_tail \
--cut_window_size=4 \
--cut_mean_quality=20 \
--qualified_quality_phred=20 \
--unqualified_percent_limit=30 \
--n_base_limit=5 \
--length_required=50 \
--low_complexity_filter \
--complexity_threshold=30 \
--overrepresentation_analysis \
--json=${name1}.json \
--html=${name1}.html \
--report_title="$name1" \
--thread=8
# Create MP reads using in silico method
cross-mates AHAT01.fasta bowfin_F.trimmed.fq bowfin_R.trimmed.fq
#Overlap paired-end reads with flash
flash bowfin_F.trimmed.fq bowfin_R.trimmed.fq -M 180
# Remove identical read pairs with clumpify
#PE reads
echo "Running Clumpify PE mode..."
clumpify.sh \
-in=bowfin_F.trimmed.fq \
-in2=bowfin_R.trimmed.fq \
-out=bowfin_F.trimmed.deduped.fq.gz \
-out2=bowfin_R.trimmed.deduped.fq.gz \
dedupe=t \
containment=f \
optical=t \
dupedist=12000
echo "Finished Clumpify PE mode..."
#SE reads
echo "Running Clumpify SE mode..."
clumpify.sh \
-in=bowfin.extendedFrags.fastq \
-out=bowfin.extendedFrags.deduped.fq.gz \
dedupe=t \
containment=f \
optical=t \
dupedist=12000
echo "Finished Clumpify SE mode..."
#Remove mitochondrial reads with bowtie2
#PE reads
name="bowfin"
bowtie2-build bowfin_mito.fasta bowfin_mito
# Do Mapping for bowfin reads
bowtie2 \
--phred33 \
-q \
--very-sensitive \
--minins 0 \
--maxins 1000 \
--fr \
--threads 20 \
--reorder \
-x bowfin_mito \
-1 ${name}_F.trimmed.deduped.fq.gz \
-2 ${name}_R.trimmed.deduped.fq.gz | \
samtools view -b -F 2 | \
samtools sort -T ${name}.tmp -n -O bam | \
bedtools bamtofastq -i - -fq ${name}_F.trimmed.deduped.noMito.fq -fq2 ${name}_R.trimmed.deduped.noMito.fq
#SE reads
name="bowfin"
bowtie2-build bowfin_mito.fasta bowfin_mito
# Do Mapping for bowfin reads
bowtie2 \
--phred33 \
-q \
--very-sensitive \
--minins 0 \
--maxins 1000 \
--threads 20 \
--reorder \
-x bowfin_mito \
-U ${name}.extendedFrags.deduped.fq.gz | \
samtools view -b -F 2 | \
samtools sort -T ${name}.tmp -n -O bam | \
bedtools bamtofastq -i - -fq ${name}.extendedFrags.deduped.noMito.fq
#Error correct with musket
#PE reads
musket -p 64 bowfin_F.trimmed.deduped.noMito.fq bowfin_R.trimmed.deduped.noMito.fq -omulti trout.trimmed.deduped.noMito.corrected -inorder
#SE reads
musket -p 64 bowfin.extendedFrags.deduped.noMito.fq -omulti bowfin.extendedFrags.deduped.noMito.corrected -inorder
#Assemble with abyss
abyss-pe k=104 name=bowfin lib='pea peb' mp='mpc mpd mpe mpf mpg mph mpi' \
pea='bowfin.trimmed.deduped.noMito.corrected_1.fq bowfin.trimmed.deduped.noMito.corrected_2.fq' \
peb='bowfin_F.trimmed-pe-500_1.fq bowfin_F.trimmed-pe-500_2.fq' \
se='bowfin.extendedFrags.deduped.noMito.corrected.fq'\
mpc='bowfin_F.trimmed-mp-1000_1.fq bowfin_F.trimmed-mp-1000_2.fq' \
mpd='bowfin_F.trimmed-mp-1500_1.fq bowfin_F.trimmed-mp-1500_2.fq' \
mpe='bowfin_F.trimmed-mp-2000_1.fq bowfin_F.trimmed-mp-2000_2.fq' \
mpf='bowfin_F.trimmed-mp-5000_1.fq bowfin_F.trimmed-mp-5000_2.fq' \
mpg='bowfin_F.trimmed-mp-10000_1.fq bowfin_F.trimmed-mp-10000_2.fq' \
mph='bowfin_F.trimmed-mp-20000_1.fq bowfin_F.trimmed-mp-20000_2.fq' \
mpi='bowfin_F.trimmed-mp-50000_1.fq bowfin_F.trimmed-mp-50000_2.fq'
#Mapping with bwa, fastp, samtools and seqtk
# Set the input file:
sample=$(sed -n "$SLURM_ARRAY_TASK_ID"p samples)
fq=$(grep "$sample" seq_files.list)
# Run SE mapping
./Mapper_SE \
-r bowfin-8.fa \
-i $fq \
-t 4 \
-o $sample
#Mapper_SE executable script
# V3.2 4/02/2020 by Bob Fitak - SE reads
# This script will take an input fastq file of raw
# reads and clean, map, and process them. The
# reference genome will be indexed using BWA if
# no index is found.
# You may need to adjust other parameters for
# fastp, bwa and samtools as you see fit.
# The program assumes paired-end sequencing
# libraries
# REQUIREMENTS (must be in your PATH):
# bwa
# fastp
# samtools (v1.3 or greater)
# seqtk (Heng Li's github page)
scriptName="${0##*/}"
function printUsage() {
cat<<EOF
Synopsis
$scriptName [-h | --help] [-i forward.fastq{.gz}] [-j reverse.fastq{.gz}] [-r reference.fa] [-X integer] [-t integer] [-o output_prefix] [-u]
Description
Clean, trim, and map a sequencing dataset to
a reference genome, keeping the mapped or
unmapped reads file (SAM format).
-i
The forward reads file in fastq format (gzipped accepted)
-j
The reverse reads file in fastq format (gzipped accepted)
-r
the reference genome fasta file for mapping
-X
include only X number of reads (for testing purposes)
Default: include all reads
-t
the number of threads (default = 1)
-o
prefix for output files (default = "I-should-have-named-these-files")
-u
if set (T), then only unmapped reads are kept and the resulting BAM files deleted to save space
Default: F, don't keep unmapped reads in separate files
-h, --help
Brings up this help page
Author
Robert Fitak, Department of Biology; Genomics and Bioinformatics Cluster, University of Central Florida
EOF
}
if [ $# -lt 1 ]
then printUsage
exit 1
fi
# Set default number of threads
threads=1
# Set default for not keeping unmapped reads
unmapped=F
# Set default output files name
out="I-should-have-named-these-files"
# Read input parameters
while getopts ':hi:j:r:X:t:o:u' option
do
case "${option}"
in
i) fwd=${OPTARG};;
j) rev=${OPTARG};;
r) ref=${OPTARG};;
X) reads=${OPTARG};;
t) threads=${OPTARG};;
o) out=${OPTARG};;
u) unmapped=T;;
h) printUsage
exit
;;
help) printUsage
exit
;;
esac
done
# Check for reference genome index, or index if none is found
if [[ -f $ref.bwt ]]
then
echo "Reference index exists, skipping reference index build"
else
echo "Making bwa index"
bwa index $ref
fi
# Process all sequences
if [[ -z $reads ]]
then
fastp \
--in1 $fwd \
--stdout \
--detect_adapter_for_pe \
--adapter_fasta /network/rit/home/bobfitak/.bin/adapters.fa \
--cut_front \
--cut_tail \
--cut_window_size=4 \
--cut_mean_quality=20 \
--qualified_quality_phred=20 \
--unqualified_percent_limit=30 \
--n_base_limit=5 \
--length_required=50 \
--low_complexity_filter \
--complexity_threshold=30 \
--overrepresentation_analysis \
--html=${out}.html \
--json=${out}.json \
--report_title="$out" \
--thread=${threads} | \
bwa mem \
-M \
-p \
-t ${threads} \
${ref} \
- | \
samtools sort \
-T ${out}.tmp \
-O bam \
-o ${out}.sorted.bam -
#${bowtie2} \
# -x ${ref} \
# --interleaved - \
# -q \
# --very-sensitive-local \
# --no-unal \
# --threads ${threads} \
# -S ${SRR}.sam \
# --un-conc-gz ${SRR}.unmapped.fq.gz
# Or process only X reads
elif [[ ! -z $reads ]]
then
# multiply 4 by the number of reads desired
n=$(( $reads * 4 ))
seqtk seq -l0 $fwd | head -n ${n} | \
fastp \
--stdin \
--stdout \
--detect_adapter_for_pe \
--adapter_fasta /network/rit/home/bobfitak/.bin/adapters.fa \
--cut_front \
--cut_tail \
--cut_window_size=4 \
--cut_mean_quality=20 \
--qualified_quality_phred=20 \
--unqualified_percent_limit=30 \
--n_base_limit=5 \
--length_required=50 \
--low_complexity_filter \
--complexity_threshold=30 \
--overrepresentation_analysis \
--html=${out}.html \
--json=${out}.json \
--report_title="$out" \
--thread=${threads} | \
bwa mem \
-M \
-p \
-t ${threads} \
${ref} \
- | \
samtools sort \
-T ${out}.tmp \
-O bam \
-o ${out}.sorted.bam -
fi
# Delete extra files
rm -rf ${out}.tmp* ${out}.json
# BAM file, get stats, index.
samtools index ${out}.sorted.bam
samtools stats ${out}.sorted.bam > ${out}.bamstats
rm -rf ${out}.tmp*
# If -u is set, get unmapped reads
if [ "$unmapped" = "T" ]
then
echo "Keeping unmapped reads and deleting BAM files"
samtools fastq \
-1 ${out}.unmapped.F.fq.gz \
-2 ${out}.unmapped.R.fq.gz \
-0 ${out}.unmapped.0.fq.gz \
-s ${out}.unmapped.s.fq.gz \
-c 6 \
-n \
-f 4 \
${out}.sorted.bam
rm ${out}.sorted.bam ${out}.sorted.bam.bai
fi
#echo "input = $SRR, X = $reads, reference = $ref, threads = $threads"
echo "Mapping analysis complete. All results are saved with the prefix: $out"
#SNP calling with STACKS
/network/rit/home/sb939359/turnerlab/bin/stacks-2.53/bin/ref_map.pl -T 16 \
--popmap /network/rit/home/sb939359/turnerlab/spencer/stacks/names.txt \
-X "populations:--vcf" \
-X "populations:--structure" \
-X "populations:--phylip" \
-o /network/rit/home/sb939359/turnerlab/spencer/stacks/output \
--samples /network/rit/home/sb939359/turnerlab/spencer/stacks \
vcftools --vcf filteredSNPs.vcf --remove-filtered-all --maf 0.10 --remove-indels --max-missing 0.5 --recode --recode-INFO-all --out SNPs_filtered_maf0.10_no_indels_50percent