-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparallel-examples.sh
executable file
·252 lines (200 loc) · 9.62 KB
/
parallel-examples.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/local/bin/bash
# Using GNU parallel painlessly
# Jul 18, 2022
# https://omgenomics.com/parallel/
# https://www.gnu.org/software/parallel/
# https://yewtu.be/watch?v=qypUdm-IE9c
## 1) Basic examples
## --
parallel echo ::: A B C
# Get inputs from a command:
parallel echo ::: $(seq 10)
# Makes a sample file with numbers 1 to 100.
seq 100 > file.txt
# Get inputs from a file:
parallel echo ::: $(cat file.txt)
# same as (using 4 colons):
parallel echo :::: file.txt
# You can get inputs from a pipe:
cat file.txt | grep "2" | parallel echo "hello"
# Every possible combination:
parallel echo ::: A B C ::: 1 2 3
# A 1
# A 2
# B 1
# A 3
# B 2
# B 3
# C 1
# C 2
# C 3
# Linked inputs:
parallel --link echo ::: A B C ::: 1 2 3
# A 1
# B 2
# C 3
## 2) Run a script in parallel
## --
echo '
JOB_ID=${1}
sleep 2
echo "Finished job #${JOB_ID}"
' > do_something.sh
# make it executable
chmod +x do_something.sh
# run this:
parallel ./do_something.sh ::: 1 2 3
# output: after 2 seconds have passed, all appear at once:
# Finished job #1
# Finished job #3
# Finished job #2
# You can also try adding –verbose to see when each job starts running.
parallel --verbose ./do_something.sh ::: 1 2 3
# ./do_something.sh 1
# ./do_something.sh 2
# ./do_something.sh 3
# Finished job #1
# Finished job #3
# Finished job #2
# And you can control how many jobs run at once with --jobs:
parallel --jobs 2 --verbose ./do_something.sh ::: 1 2 3
# Using --jobs 1 is essentially the same as a basic for loop.
# Try the same tricks with this script, which sleeps for the specified number of seconds:
echo '
sleep ${1}
echo "slept ${1} seconds."
' > do_something2.sh
chmod +x do_something2.sh
parallel ./do_something2.sh ::: 1 2 3
# output, now waiting 1 second in between each line:
# slept 1 seconds.
# slept 2 seconds.
# slept 3 seconds.
## 3) Multiple arguments:
## --
echo '
sleep ${1}
echo "${2} slept ${1} seconds"
' > do_something3.sh
chmod +x do_something3.sh
# parallel ./do_something3.sh ::: A B C ::: 1 2 3
# Whoops, teachable mistake here:
# 1 slept A seconds
# usage: sleep seconds
# 2 slept A seconds
# usage: sleep seconds
# 3 slept A seconds
# usage: sleep seconds
# 1 slept B seconds
# usage: sleep seconds
# 2 slept B seconds
# usage: sleep seconds
# 3 slept B seconds
# usage: sleep seconds
# 1 slept C seconds
# usage: sleep seconds
# 2 slept C seconds
# usage: sleep seconds
# 3 slept C seconds
# usage: sleep seconds
# This ran, for example, ./do_something3.sh A 1, so the script ran sleep A which
# is wrong, we wanted sleep 1. You can fix this by changing the order of
# arguments to ::: 1 2 3 ::: A B C, or you can specify where to insert the
# arguments – this is more flexible.
parallel ./do_something3.sh {2} {1} ::: A B C ::: 1 2 3
# A slept 1 seconds
# B slept 1 seconds
# C slept 1 seconds
# A slept 2 seconds
# B slept 2 seconds
# C slept 2 seconds
# A slept 3 seconds
# B slept 3 seconds
# C slept 3 seconds
# Much better.
## 4) Running parallel with a function
## --
# This is nice when you want something more complex than the small examples above
# but without writing a script.
# Try this: (This doesn't work for me in the ZSH shell (new Mac) though.)
function make_a_file_exported() {
seq ${2} > numbers_${1}.txt
}
export -f make_a_file_exported
parallel --link make_a_file_exported {1} {2} ::: A B C ::: 10 20 30
# Otherwise you can use env_parallel:
# follow instructions the first time you run this to make it work.
source env_parallel.bash
function make_a_file_linked() {
seq ${2} > numbers_env_${1}.txt
}
env_parallel --link make_a_file_linked {1} {2} ::: A B C ::: 10 20 30
# If neither of those work for you, google around for answers or just use scripts instead.
## 5) Practical tips for launching bioinformatics jobs
## --
# You can use parallel to orchestrate jobs for you on the cloud or on your lab/
# institution’s cluster. As an example, I’ve been using this a lot at work where
# we have a script that launches one stage at a time on Google Cloud, and then
# when a step finishes, it will take that output as the input for the next stage.
# It’s nice to use parallel to orchestrate this so I don’t have to open each of
# these orchestration scripts in different terminal tabs (or screen/tmx2 tabs).
# In this example, you have a runner_script.sh that might launch a job or a
# series of interdependent jobs on a cloud or cluster. We give it some input
# files and in this case I want to see the result at a few different filter
# values, so I get that combination of each input file with each of the 3 values
# of quality filter.
# you can make some fake input files for practice like this:
parallel touch "input_file_{1}.bam" ::: $(seq 10)
# Use a function to specify carefully how the runner script should be run:
function launch_runner_script() {
INPUT=${1}
Q_FILTER=${2}
OUTPUT=${INPUT%.bam}.output.q${Q_FILTER}
echo "runner_script.sh --input ${1} --output ${OUTPUT}.bam --filter ${Q_FILTER} 2>&1 > ${OUTPUT}.log"
}
echo "env parallel example 2"
env_parallel launch_runner_script {1} {2} ::: $(ls input_file*.bam) ::: 10 20 30
# You’ll notice a few things above:
# 1. We use the input name to determine the output name, removing the .bam
# prefix and attaching that Q_FILTER value to differentiate the outputs. I
# try to make this as clean as possible
# 2. We write a log file next to the output. ` 2>&1 >` means we’re capturing
# both stderr and stdout and writing them to the same log file. I always make
# sure to capture logs for each run individually so I can check on them along
# the way and so I have a hope of debugging any problems that arise.
# 3. We echo the runner_script.sh command, this is always a good idea to do
# first to check the output commands are what you expected before you proceed
# (then you can remove the echo and quotes to run it for real). See the
# output below:
# Output:
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q30.bam --filter 30 2>&1 > input_file_1.output.q30.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q10.bam --filter 10 2>&1 > input_file_3.output.q10.log
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q20.bam --filter 20 2>&1 > input_file_1.output.q20.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q20.bam --filter 20 2>&1 > input_file_2.output.q20.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q30.bam --filter 30 2>&1 > input_file_10.output.q30.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q10.bam --filter 10 2>&1 > input_file_2.output.q10.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q10.bam --filter 10 2>&1 > input_file_10.output.q10.log
# runner_script.sh --input input_file_2.bam --output input_file_2.output.q30.bam --filter 30 2>&1 > input_file_2.output.q30.log
# runner_script.sh --input input_file_1.bam --output input_file_1.output.q10.bam --filter 10 2>&1 > input_file_1.output.q10.log
# runner_script.sh --input input_file_10.bam --output input_file_10.output.q20.bam --filter 20 2>&1 > input_file_10.output.q20.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q20.bam --filter 20 2>&1 > input_file_4.output.q20.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q20.bam --filter 20 2>&1 > input_file_3.output.q20.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q10.bam --filter 10 2>&1 > input_file_4.output.q10.log
# runner_script.sh --input input_file_3.bam --output input_file_3.output.q30.bam --filter 30 2>&1 > input_file_3.output.q30.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q20.bam --filter 20 2>&1 > input_file_6.output.q20.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q10.bam --filter 10 2>&1 > input_file_6.output.q10.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q30.bam --filter 30 2>&1 > input_file_5.output.q30.log
# runner_script.sh --input input_file_4.bam --output input_file_4.output.q30.bam --filter 30 2>&1 > input_file_4.output.q30.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q20.bam --filter 20 2>&1 > input_file_5.output.q20.log
# runner_script.sh --input input_file_5.bam --output input_file_5.output.q10.bam --filter 10 2>&1 > input_file_5.output.q10.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q10.bam --filter 10 2>&1 > input_file_7.output.q10.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q20.bam --filter 20 2>&1 > input_file_7.output.q20.log
# runner_script.sh --input input_file_7.bam --output input_file_7.output.q30.bam --filter 30 2>&1 > input_file_7.output.q30.log
# runner_script.sh --input input_file_6.bam --output input_file_6.output.q30.bam --filter 30 2>&1 > input_file_6.output.q30.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q30.bam --filter 30 2>&1 > input_file_8.output.q30.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q10.bam --filter 10 2>&1 > input_file_8.output.q10.log
# runner_script.sh --input input_file_8.bam --output input_file_8.output.q20.bam --filter 20 2>&1 > input_file_8.output.q20.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q30.bam --filter 30 2>&1 > input_file_9.output.q30.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q10.bam --filter 10 2>&1 > input_file_9.output.q10.log
# runner_script.sh --input input_file_9.bam --output input_file_9.output.q20.bam --filter 20 2>&1 > input_file_9.output.q20.log