apps/io_tester: add some tests for the IO scheduler
Add some test cases useful for presenting improvement ideas for the IO scheduler.
Three of the four tests added in this patch illustrate goals which weren't met
before this series but are met after it. The `tau_nemesis` test illustrates a
problem which is present both before and after this series.
michoecho committed Jan 14, 2025
1 parent dbc19d2 commit 2070713
Showing 4 changed files with 213 additions and 0 deletions.
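
Each of the added scripts takes the path to the io_tester binary as its only argument (see the usage check in the scripts). A typical invocation might look like the following; the binary path shown assumes a standard Seastar release build layout and is only illustrative:

./apps/io_tester/test_cases/scylla_tablet_migration.sh build/release/apps/io_tester/io_tester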
@@ -0,0 +1,39 @@
#!/usr/bin/env bash

# Test scenario:
# A single CPU-starved shard has a batch IO job.
# Goal: it should be able to utilize the entire bandwidth of the disk,
# despite its rare polls.
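#
# (Rough arithmetic behind this goal, assuming the i4i.2xlarge properties below:
# the job keeps 50 x 128 kB = ~6.5 MB of reads in flight, and the disk reads
# ~1.54 GB/s, so the in-flight window drains in ~4 ms. The shard therefore only
# has to poll and refill the queue every few milliseconds to keep the disk
# saturated, which should be feasible even with a 550 us cpuhog task between polls.)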

if [ $# -ne 1 ]; then
echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2
exit 1
fi

"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
- name: tablet-streaming
  data_size: 1GB
  shards: [0]
  type: seqread
  shard_info:
    parallelism: 50
    reqsize: 128kB
    shares: 200
- name: cpuhog
  type: cpu
  shards: [0]
  shard_info:
    parallelism: 1
    execution_time: 550us
EOF
) --io-properties-file=<(cat <<'EOF'
# i4i.2xlarge
disks:
  - mountpoint: /dev
    read_bandwidth: 1542559872
    read_iops: 218786
    write_bandwidth: 1130867072
    write_iops: 121499
EOF
)
@@ -0,0 +1,39 @@
#!/usr/bin/env bash

# Test scenario:
# All shards contend for IO, but one shard is additionally CPU-starved
# and polls rarely.
# Goal: it should still get a reasonably fair share of the disk bandwidth.
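#
# (As a rough yardstick, assuming the i4i.2xlarge properties below: a fair
# share for one of the 7 shards is on the order of 1.54 GB/s / 7 = ~220 MB/s
# of reads.)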

if [ $# -ne 1 ]; then
echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2
exit 1
fi

"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
- name: tablet-streaming
  data_size: 1GB
  shards: all
  type: seqread
  shard_info:
    parallelism: 50
    reqsize: 128kB
    shares: 200
- name: cpuhog
  type: cpu
  shards: [0]
  shard_info:
    parallelism: 1
    execution_time: 550us
EOF
) --io-properties-file=<(cat <<'EOF'
# i4i.2xlarge
disks:
  - mountpoint: /dev
    read_bandwidth: 1542559872
    read_iops: 218786
    write_bandwidth: 1130867072
    write_iops: 121499
EOF
) --duration=2
77 changes: 77 additions & 0 deletions apps/io_tester/test_cases/scylla_tablet_migration.sh
@@ -0,0 +1,77 @@
#!/usr/bin/env bash

# Test scenario:
# Simulation of a ScyllaDB workload which prompted some changes to the IO scheduler:
# database queries concurrent with tablet streaming.
#
# All 7 shards are running a low-priority (200 shares) batch IO workload
# and a high-priority (1000 shares), moderate-bandwidth, interactive workload.
#
# The interactive workload requires about 30% of the node's
# total bandwidth (as measured in tokens), in small random reads.
# The batch workload does large sequential reads and wants to utilize all
# spare bandwidth.
#
# The workload is almost symmetric across shards, but slightly skewed:
# shard 0 is a bit more loaded than the others. Even on that shard, though,
# the workload needs no more than 35% of the shard's fair share of bandwidth.
#
# Given the distribution of shares across IO classes, the user expects the
# interactive workload to be guaranteed 1000 / (1000 + 200) == ~84% of the
# disk bandwidth on each shard. Since it asks for less than 35%, the
# lower-priority job shouldn't disturb it.
#
# But before the relevant IO scheduler changes, this goal wasn't met,
# and the interactive workload on shard 0 was instead starved for IO
# by the low-priority workloads on other shards.
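#
# (Rough token arithmetic behind the figures above, assuming a request's token
# cost is approximately 1/read_iops + reqsize/read_bandwidth of the disk:
# each shard issues 100 * 75 = 7500 random 1536-byte reads per second, so all
# 7 shards together use about 52500/218786 + 52500*1536/1542559872 = ~29% of
# the disk's capacity. On shard 0, the extra 100 * 10 = 1000 reads/s bring that
# shard's usage to roughly 8500/(218786/7) + 8500*1536/(1542559872/7) = ~33%
# of its 1/7 share, i.e. still under 35%.)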

if [ $# -ne 1 ]; then
echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2
exit 1
fi

"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
- name: tablet-streaming
  data_size: 1GB
  shards: all
  type: seqread
  shard_info:
    parallelism: 50
    reqsize: 128kB
    shares: 200
- name: cassandra-stress
  shards: all
  type: randread
  data_size: 1GB
  shard_info:
    parallelism: 100
    reqsize: 1536
    shares: 1000
    rps: 75
  options:
    pause_distribution: poisson
    sleep_type: steady
- name: cassandra-stress-slight-imbalance
  shards: [0]
  type: randread
  data_size: 1GB
  shard_info:
    parallelism: 100
    reqsize: 1536
    class: cassandra-stress
    rps: 10
  options:
    pause_distribution: poisson
    sleep_type: steady
EOF
) --io-properties-file=<(cat <<'EOF'
# i4i.2xlarge
disks:
  - mountpoint: /dev
    read_bandwidth: 1542559872
    read_iops: 218786
    write_bandwidth: 1130867072
    write_iops: 121499
EOF
)
58 changes: 58 additions & 0 deletions apps/io_tester/test_cases/tau_nemesis.sh
@@ -0,0 +1,58 @@
#!/usr/bin/env bash

# There is a `tau` mechanism in `fair_queue` which lets newly-activated
# IO classes monopolize the shard's IO queue for a while.
#
# This isn't very useful and can result in major performance problems,
# as this test illustrates. The `highprio` workload could have a tail latency
# of about 2 milliseconds, but `tau` allows `bursty_lowprio` to butt in
# periodically and preempt `highprio` for ~30 ms, raising its tail latency
# to that level.
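#
# (Where the ~30 ms comes from, roughly: each `bursty_lowprio` burst is
# batch 50 x 128 kB = ~6.5 MB of reads, and a shard's 1/7 share of the
# ~1.54 GB/s read bandwidth below is ~220 MB/s, so draining one burst ahead
# of `highprio` takes on the order of 6.5 MB / 220 MB/s = ~30 ms. This is a
# rough estimate, not an exact model of the scheduler.)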

if [ $# -ne 1 ]; then
echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2
exit 1
fi

"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
- name: filler
  data_size: 1GB
  shards: all
  type: seqread
  shard_info:
    parallelism: 10
    reqsize: 128kB
    shares: 10
- name: bursty_lowprio
  data_size: 1GB
  shards: all
  type: seqread
  shard_info:
    parallelism: 1
    reqsize: 128kB
    shares: 100
    batch: 50
    rps: 8
- name: highprio
  shards: all
  type: randread
  data_size: 1GB
  shard_info:
    parallelism: 100
    reqsize: 1536
    shares: 1000
    rps: 50
  options:
    pause_distribution: poisson
    sleep_type: steady
EOF
) --io-properties-file=<(cat <<'EOF'
# i4i.2xlarge
disks:
  - mountpoint: /dev
    read_bandwidth: 1542559872
    read_iops: 218786
    write_bandwidth: 1130867072
    write_iops: 121499
EOF
)
