-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastq_to_fasta_qual.pl
executable file
·154 lines (127 loc) · 4.47 KB
/
fastq_to_fasta_qual.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/perl
use strict;
my $usage = "Use: $0 input.fastq\n";
if (scalar @ARGV != 1) { die $usage; }
if (! (open (IN, "<$ARGV[0]"))) { die "Can't open $ARGV[0]: $!\n"; }
if (-e "$ARGV[0].fasta") { die "$ARGV[0].fasta extists - please remove or rename\n"; }
if (-e "$ARGV[0].qual") { die "$ARGV[0].qual extists - please remove or rename\n"; }
if (! (open (FASTA, ">$ARGV[0].fasta"))) { die "Can't write to $ARGV[0].fasta: $!\n"; }
if (! (open (QUAL, ">$ARGV[0].qual"))) { die "Can't write to $ARGV[0].qual: $!\n"; }
my $id = "";
my $seq = "";
my $qual = "";
my $line = <IN>;
if ($line !~ /\@/) { die "$ARGV[0] does not look like a FASTQ file (it should start with \@)\n"; }
while ($line =~ s/^\@//o) {
chomp ($line);
my $id = $line;
$line = <IN>;
while ($line !~ s/^\+//o) {
chomp ($line);
$seq .= $line;
$line = <IN>; }
chomp ($line);
if (($line =~ /\S/o) && ($line ne $id)) { die "ID of $id not followed by same identifier for quality scores\n"; }
$line = <IN>;
while (length ($qual) < length ($seq)) {
chomp ($line);
$qual .= $line;
$line = <IN>; }
print FASTA ">$id\n$seq\n";
print QUAL ">$id\n";
my @a = split //o, $qual;
foreach my $i (@a) {
my $q = 10 * log (1 + 10 ** ((ord ($i) - 64) / 10)) / log (10);
printf QUAL "%1.0f ", $q; }
print QUAL "\n";
$id = "";
$seq = "";
$qual = "";
}
# #!/usr/bin/env perl
# # Author: Lee Katz
# # Convert a fastq to a fasta/qual combo using BioPerl, with some Linux commands
# use Bio::Perl;
# use Data::Dumper;
# use strict;
# use warnings;
# use threads;
# use Thread::Queue;
# use Getopt::Long;
# my $settings={};
# $|=1;
# my %numSequences; # static for a subroutine
# exit(main());
# sub main{
# die("Usage: $0 -i inputFastqFile [-n numCpus -q outputQualfile -f outputFastaFile]") if(@ARGV<1);
# GetOptions($settings,('numCpus=s','input=s','qualOut=s','fastaOut=s'));
# my $file=$$settings{input}||die("input parameter missing");
# my $outfasta=$$settings{fastaOut}||"$file.fasta";
# my $outqual=$$settings{qualOut}||"$file.qual";
# my $numCpus=$$settings{numCpus}||1;
# my @subfile=splitFastq($file,$numCpus);
# for my $f(@subfile){
# threads->create(\&convert,$f,"$f.fasta","$f.qual");
# }
# $_->join for (threads->list);
# # join the sub files together
# joinFastqFiles(\@subfile,$file);
# return 0;
# }
# sub convert{
# my($file,$outfasta,$outqual)=@_;
# my $numSequences=numSequences($file);
# my $reportEvery=int($numSequences/100) || 1;
# print "$numSequences sequences to convert in $file\n";
# my $in=Bio::SeqIO->new(-file=>$file,-format=>"fastq-illumina");
# my $seqOut=Bio::SeqIO->new(-file=>">$outfasta",-format=>"fasta");
# my $qualOut=Bio::SeqIO->new(-file=>">$outqual",-format=>"qual");
# my $seqCount=0;
# my $percentDone=0;
# while(my $seq=$in->next_seq){
# $seqOut->write_seq($seq);
# $qualOut->write_seq($seq);
# $seqCount++;
# if($seqCount%$reportEvery == 0){
# $percentDone++;
# print "$percentDone%..";
# }
# }
# print "Done with subfile $file.\n";
# return 1;
# }
# sub joinFastqFiles{
# my($subfile,$outfileBasename)=@_;
# my($command,$subfasta,$subqual);
# # fasta
# $subfasta.="$_.fasta " for(@$subfile);
# $command="cat $subfasta > $outfileBasename.fasta";
# system($command);
# # qual
# $subqual.="$_.qual " for (@$subfile);
# $command="cat $subqual > $outfileBasename.qual";
# system($command);
# return 1;
# }
# sub splitFastq{
# my($file,$numCpus)=@_;
# my $prefix="FQ"; # for fastq
# my $numSequences=numSequences($file);
# my $numSequencesPerFile=int($numSequences/$numCpus);
# my $numSequencesPerFileRemainder=$numSequences % $numCpus;
# my $numLinesPerFile=$numSequencesPerFile*4; # four lines per read; this could become incorrect if there is a really long read (not currently likely)
# system("rm -rf tmp;mkdir tmp;");
# system("split -l $numLinesPerFile $file 'tmp/FQ'");
# return glob("tmp/FQ*");
# }
# # use Linux to find the number of sequences quickly, but cache the value because it is still a slow process
# # This should probably changed to c -l4 but I don't have time to test the change
# # TODO for anyone reading this: please change this method to wc -l divided by 4.
# sub numSequences{
# my $file=shift;
# return $numSequences{$file} if($numSequences{$file});
# my $num=`grep -c '^\@' $file`;
# chomp($num);
# $numSequences{$file}=$num;
# return $num;
# }