apps/io_tester: add some tests for the IO scheduler

Add some test cases useful for presenting improvement ideas for the IO scheduler. Three of the four tests added in this patch illustrate goals that weren't met before this series but are met after it. The `tau_nemesis` test illustrates a problem which is present both before and after this series.
scylladb · Jan 14, 2025 · 2070713 · 2070713
1 parent dbc19d2
commit 2070713
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 0 deletions.
diff --git a/apps/io_tester/test_cases/one_cpu_starved_shard_can_still_saturate_io.sh b/apps/io_tester/test_cases/one_cpu_starved_shard_can_still_saturate_io.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# Test scenario:
+# A single CPU-starved shard has a batch IO job.
+# Goal: it should be able to utilize the entire bandwidth of the disk,
+# despite the rare polls.
+
+if [ $# -ne 1 ]; then  # exactly one argument is required: the io_tester executable to run
+    echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2  # usage text is written to stderr
+    exit 1
+fi
+
+"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
+- name: tablet-streaming
+  data_size: 1GB
+  shards: [0]
+  type: seqread
+  shard_info:
+    parallelism: 50
+    reqsize: 128kB
+    shares: 200
+- name: cpuhog
+  type: cpu
+  shards: [0]
+  shard_info:
+    parallelism: 1
+    execution_time: 550us
+
+EOF
+) --io-properties-file=<(cat <<'EOF'
+# i4i.2xlarge
+disks:
+- mountpoint: /dev
+  read_bandwidth: 1542559872
+  read_iops: 218786
+  write_bandwidth: 1130867072
+  write_iops: 121499
+EOF
+)  # --conf and --io-properties-file are both fed by process substitution over quoted heredocs, so the YAML is passed without shell expansion; both jobs are restricted to shard 0
diff --git a/apps/io_tester/test_cases/one_cpu_starved_shard_has_reasonable_fairness.sh b/apps/io_tester/test_cases/one_cpu_starved_shard_has_reasonable_fairness.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# Test scenario:
+# all shards contend for IO, but one shard is additionally CPU-starved
+# and polls rarely.
+# Goal: it should still be getting a reasonably fair share of disk bandwidth.
+
+if [ $# -ne 1 ]; then  # exactly one argument is required: the io_tester executable to run
+    echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2  # usage text is written to stderr
+    exit 1
+fi
+
+"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
+- name: tablet-streaming
+  data_size: 1GB
+  shards: all
+  type: seqread
+  shard_info:
+    parallelism: 50
+    reqsize: 128kB
+    shares: 200
+- name: cpuhog
+  type: cpu
+  shards: [0]
+  shard_info:
+    parallelism: 1
+    execution_time: 550us
+
+EOF
+) --io-properties-file=<(cat <<'EOF'
+# i4i.2xlarge
+disks:
+- mountpoint: /dev
+  read_bandwidth: 1542559872
+  read_iops: 218786
+  write_bandwidth: 1130867072
+  write_iops: 121499
+EOF
+) --duration=2  # the seqread job runs on all shards while the cpu job is restricted to shard 0; both YAML inputs arrive via process substitution over quoted heredocs, so nothing in them is shell-expanded
diff --git a/apps/io_tester/test_cases/scylla_tablet_migration.sh b/apps/io_tester/test_cases/scylla_tablet_migration.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+# Test scenario:
+# Simulation of a ScyllaDB workload which prompted some changes to the IO scheduler:
+# database queries concurrent with tablet streaming.
+# 
+# All 7 shards are running a low-priority (200 shares) batch IO workload
+# and a high-priority (1000 shares), moderate-bandwidth, interactive workload.
+#
+# The interactive workload requires about 30% of the node's
+# total bandwidth (as measured in tokens), in small random reads.
+# The batch workload does large sequential reads and wants to utilize all
+# spare bandwidth.
+#
+# This workload is almost symmetric across shards, but is slightly skewed
+# and shard 0 is slightly more loaded. But even on this shard, the workload
+# doesn't need more than 35% of the fair bandwidth of this shard.
+#
+# Due to the distribution of shares across IO classes, the user expects that
+# the interactive workload should be guaranteed (1000 / (1000 + 200)) == ~83% of
+# the disk bandwidth on each shard. So if it's only asking for less than 35%,
+# the lower-priority job shouldn't disturb it.
+#
+# But before the relevant IO scheduler changes, this goal wasn't met,
+# and the interactive workload on shard 0 was instead starved for IO
+# by the low-priority workloads on other shards.
+
+if [ $# -ne 1 ]; then  # exactly one argument is required: the io_tester executable to run
+    echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2  # usage text is written to stderr
+    exit 1
+fi
+
+"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
+- name: tablet-streaming
+  data_size: 1GB
+  shards: all
+  type: seqread
+  shard_info:
+    parallelism: 50
+    reqsize: 128kB
+    shares: 200
+- name: cassandra-stress
+  shards: all
+  type: randread
+  data_size: 1GB
+  shard_info:
+    parallelism: 100
+    reqsize: 1536
+    shares: 1000
+    rps: 75
+  options:
+    pause_distribution: poisson
+    sleep_type: steady
+- name: cassandra-stress-slight-imbalance
+  shards: [0]
+  type: randread
+  data_size: 1GB
+  shard_info:
+    parallelism: 100
+    reqsize: 1536
+    class: cassandra-stress
+    rps: 10
+  options:
+    pause_distribution: poisson
+    sleep_type: steady
+
+EOF
+) --io-properties-file=<(cat <<'EOF'
+# i4i.2xlarge
+disks:
+- mountpoint: /dev
+  read_bandwidth: 1542559872
+  read_iops: 218786
+  write_bandwidth: 1130867072
+  write_iops: 121499
+EOF
+)  # the third job names the cassandra-stress class instead of declaring its own shares and is restricted to shard 0; both YAML inputs arrive via process substitution over quoted heredocs, so nothing in them is shell-expanded
diff --git a/apps/io_tester/test_cases/tau_nemesis.sh b/apps/io_tester/test_cases/tau_nemesis.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# There is a `tau` mechanism in `fair_queue` which lets newly-activated
+# IO classes monopolize the shard's IO queue for a while.
+#
+# This isn't very useful and can result in major performance problems,
+# as this test illustrates. The `highprio` workload could have tail latency
+# of about 2 milliseconds, but the `bursty_lowprio` is allowed by `tau` to butt in
+# periodically and preempt `highprio` for ~30ms, bringing its tail latency
+# to that threshold.
+
+if [ $# -ne 1 ]; then  # exactly one argument is required: the io_tester executable to run
+    echo "Usage: $0 IO_TESTER_EXECUTABLE" >&2  # usage text is written to stderr
+    exit 1
+fi
+
+"$1" --smp=7 --storage=/dev/null --conf=<(cat <<'EOF'
+- name: filler
+  data_size: 1GB
+  shards: all
+  type: seqread
+  shard_info:
+    parallelism: 10
+    reqsize: 128kB
+    shares: 10
+- name: bursty_lowprio
+  data_size: 1GB
+  shards: all
+  type: seqread
+  shard_info:
+    parallelism: 1
+    reqsize: 128kB
+    shares: 100
+    batch: 50
+    rps: 8
+- name: highprio
+  shards: all
+  type: randread
+  data_size: 1GB
+  shard_info:
+    parallelism: 100
+    reqsize: 1536
+    shares: 1000
+    rps: 50
+  options:
+    pause_distribution: poisson
+    sleep_type: steady
+EOF
+) --io-properties-file=<(cat <<'EOF'
+# i4i.2xlarge
+disks:
+- mountpoint: /dev
+  read_bandwidth: 1542559872
+  read_iops: 218786
+  write_bandwidth: 1130867072
+  write_iops: 121499
+EOF
+)  # all three jobs run on all shards with shares 10, 100 and 1000; both YAML inputs arrive via process substitution over quoted heredocs, so nothing in them is shell-expanded