Skip to content

Commit

Permalink
Merge pull request #231 from DataDog/anmarchenko/refactor_test_retry_…
Browse files Browse the repository at this point in the history
…strategies

[SDTEST-173] Early flake detection support for Cucumber
  • Loading branch information
anmarchenko authored Sep 17, 2024
2 parents c842a90 + cdd105b commit 1052c9c
Show file tree
Hide file tree
Showing 36 changed files with 873 additions and 478 deletions.
1 change: 1 addition & 0 deletions lib/datadog/ci.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
require_relative "ci/ext/app_types"
require_relative "ci/ext/telemetry"

require "datadog"
require "datadog/core"

module Datadog
Expand Down
37 changes: 0 additions & 37 deletions lib/datadog/ci/contrib/cucumber/configuration_override.rb

This file was deleted.

40 changes: 40 additions & 0 deletions lib/datadog/ci/contrib/cucumber/filter.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# frozen_string_literal: true

module Datadog
  module CI
    module Contrib
      module Cucumber
        # Cucumber filter that feeds a test case back to the receiver while the
        # test retries component reports that another attempt is required.
        class Filter < ::Cucumber::Core::Filter.new(:configuration)
          # Called for every test case passing through this filter.
          #
          # The first time a given test case is seen the retry state is reset.
          # A :test_case_finished handler is then registered which describes the
          # test case to the receiver again whenever a retry is required.
          def test_case(test_case)
            first_encounter = !test_case_seen[test_case]
            test_retries_component.reset_retries! if first_encounter
            test_case_seen[test_case] = true

            configuration.on_event(:test_case_finished) do |event|
              test_case.describe_to(receiver) if retry_required?(test_case, event)
            end

            super
          end

          private

          # A retry is only considered when the finished event belongs to the
          # given test case; the retries component then has the final word.
          def retry_required?(test_case, event)
            (event.test_case == test_case) ? test_retries_component.should_retry? : false
          end

          # Lookup table of test cases that already went through #test_case;
          # unknown keys are stored as false on first access.
          def test_case_seen
            @test_case_seen ||= Hash.new { |seen, key| seen[key] = false }
          end

          # Memoized handle to the test retries component.
          def test_retries_component
            @test_retries_component ||= Datadog.send(:components).test_retries
          end
        end
      end
    end
  end
end
16 changes: 15 additions & 1 deletion lib/datadog/ci/contrib/cucumber/instrumentation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,24 @@ module InstanceMethods

# Returns the formatters provided by the parent implementation with the
# Datadog formatter placed in front of them.
#
# The Datadog formatter is built once from @configuration and memoized, so
# repeated calls reuse the same instance.
def formatters
  existing_formatters = super
  @datadog_formatter ||= CI::Contrib::Cucumber::Formatter.new(@configuration)
  [@datadog_formatter] + existing_formatters
end

# Returns the filter chain from the parent implementation, extended with
# the Datadog filter unless Cucumber is running in dry-run mode.
def filters
  require_relative "filter"

  chain = super
  retry_filter = Filter.new(@configuration)
  return chain if @configuration.dry_run?

  # Cucumber::Filters::PrepareWorld must remain the last filter, so ours goes
  # into the second-to-last slot; see:
  # https://github.com/cucumber/cucumber-ruby/blob/58dd8f12c0ac5f4e607335ff2e7d385c1ed25899/lib/cucumber/runtime.rb#L266
  chain.insert(-2, retry_filter)
end

def begin_scenario(test_case)
if Datadog::CI.active_test&.skipped_by_itr?
raise ::Cucumber::Core::Test::Result::Skipped, CI::Ext::Test::ITR_TEST_SKIP_REASON
Expand Down
2 changes: 0 additions & 2 deletions lib/datadog/ci/contrib/cucumber/patcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
require "datadog/tracing/contrib/patcher"

require_relative "instrumentation"
require_relative "configuration_override"

module Datadog
module CI
Expand All @@ -21,7 +20,6 @@ def target_version

def patch
::Cucumber::Runtime.include(Instrumentation)
::Cucumber::Configuration.include(ConfigurationOverride)
end
end
end
Expand Down
191 changes: 48 additions & 143 deletions lib/datadog/ci/test_retries/component.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# frozen_string_literal: true

require_relative "driver/no_retry"
require_relative "driver/retry_failed"
require_relative "driver/retry_new"

require_relative "strategy/no_retry"
require_relative "strategy/retry_failed"
require_relative "strategy/retry_new"
Expand All @@ -14,16 +18,7 @@ module TestRetries
# - retrying failed tests - improve success rate of CI pipelines
# - retrying new tests - detect flaky tests as early as possible to prevent them from being merged
class Component
FIBER_LOCAL_CURRENT_RETRY_STRATEGY_KEY = :__dd_current_retry_strategy

DEFAULT_TOTAL_TESTS_COUNT = 100

# there are clearly 2 different concepts mixed here, we should split them into separate components
# (high level strategies?) in the subsequent PR
attr_reader :retry_failed_tests_enabled, :retry_failed_tests_max_attempts,
:retry_failed_tests_total_limit, :retry_failed_tests_count,
:retry_new_tests_enabled, :retry_new_tests_duration_thresholds, :retry_new_tests_unique_tests_set,
:retry_new_tests_total_limit, :retry_new_tests_count
FIBER_LOCAL_CURRENT_RETRY_DRIVER_KEY = :__dd_current_retry_driver

def initialize(
retry_failed_tests_enabled:,
Expand All @@ -32,175 +27,85 @@ def initialize(
retry_new_tests_enabled:,
unique_tests_client:
)
@retry_failed_tests_enabled = retry_failed_tests_enabled
@retry_failed_tests_max_attempts = retry_failed_tests_max_attempts
@retry_failed_tests_total_limit = retry_failed_tests_total_limit
# counter that stores the current number of failed tests retried
@retry_failed_tests_count = 0

@retry_new_tests_enabled = retry_new_tests_enabled
@retry_new_tests_duration_thresholds = nil
@retry_new_tests_unique_tests_set = Set.new
@unique_tests_client = unique_tests_client
# total maximum number of new tests to retry (will be set based on the total number of tests in the session)
@retry_new_tests_total_limit = 0
# counter that stores the current number of new tests retried
@retry_new_tests_count = 0
no_retries_strategy = Strategy::NoRetry.new

retry_failed_strategy = Strategy::RetryFailed.new(
enabled: retry_failed_tests_enabled,
max_attempts: retry_failed_tests_max_attempts,
total_limit: retry_failed_tests_total_limit
)

retry_new_strategy = Strategy::RetryNew.new(
enabled: retry_new_tests_enabled,
unique_tests_client: unique_tests_client
)

# order is important, we should try to retry new tests first
@retry_strategies = [retry_new_strategy, retry_failed_strategy, no_retries_strategy]
@mutex = Mutex.new
end

def configure(library_settings, test_session)
@retry_failed_tests_enabled &&= library_settings.flaky_test_retries_enabled?
@retry_new_tests_enabled &&= library_settings.early_flake_detection_enabled?

return unless @retry_new_tests_enabled

# mark early flake detection enabled for test session
test_session.set_tag(Ext::Test::TAG_EARLY_FLAKE_ENABLED, "true")

# configure retrying new tests
@retry_new_tests_duration_thresholds = library_settings.slow_test_retries
Datadog.logger.debug do
"Slow test retries thresholds: #{@retry_new_tests_duration_thresholds.entries}"
end

@retry_new_tests_unique_tests_set = @unique_tests_client.fetch_unique_tests(test_session)

percentage_limit = library_settings.faulty_session_threshold
tests_count = test_session.total_tests_count.to_i
if tests_count.zero?
Datadog.logger.debug do
"Total tests count is zero, using default value for the total number of tests: [#{DEFAULT_TOTAL_TESTS_COUNT}]"
end

tests_count = DEFAULT_TOTAL_TESTS_COUNT
end

@retry_new_tests_total_limit = (tests_count * percentage_limit / 100.0).ceil
Datadog.logger.debug do
"Retry new tests total limit is [#{@retry_new_tests_total_limit}] (#{percentage_limit}%) of #{tests_count}"
end

if @retry_new_tests_unique_tests_set.empty?
@retry_new_tests_enabled = false
mark_test_session_faulty(test_session)

Datadog.logger.warn(
"Disabling early flake detection because there is no known tests (possible reason: no test runs in default branch)"
)
end

Datadog.logger.debug do
"Found [#{@retry_new_tests_unique_tests_set.size}] known unique tests"
# let all strategies configure themselves
@retry_strategies.each do |strategy|
strategy.configure(library_settings, test_session)
end
Utils::Telemetry.distribution(
Ext::Telemetry::METRIC_EFD_UNIQUE_TESTS_RESPONSE_TESTS,
@retry_new_tests_unique_tests_set.size.to_f
)
end

def with_retries(&block)
self.current_retry_strategy = nil
reset_retries!

loop do
yield

break unless current_retry_strategy&.should_retry?
break unless should_retry?
end
ensure
self.current_retry_strategy = nil
reset_retries!
end

def build_strategy(test_span)
def build_driver(test_span)
@mutex.synchronize do
if should_retry_new_test?(test_span)
Datadog.logger.debug do
"#{test_span.name} is new, will be retried"
end
@retry_new_tests_count += 1

Strategy::RetryNew.new(test_span, duration_thresholds: @retry_new_tests_duration_thresholds)
elsif should_retry_failed_test?(test_span)
Datadog.logger.debug do
"#{test_span.name} failed, will be retried"
end
@retry_failed_tests_count += 1

Strategy::RetryFailed.new(max_attempts: @retry_failed_tests_max_attempts)
else
Strategy::NoRetry.new
end
# find the first strategy that covers the test span and let it build the driver
strategy = @retry_strategies.find { |strategy| strategy.covers?(test_span) }

raise "No retry strategy found for test span: #{test_span.name}" if strategy.nil?

strategy.build_driver(test_span)
end
end

def record_test_finished(test_span)
if current_retry_strategy.nil?
# we always run test at least once and after the first pass create a correct retry strategy
self.current_retry_strategy = build_strategy(test_span)
if current_retry_driver.nil?
# we always run test at least once and after the first pass create a correct retry driver
self.current_retry_driver = build_driver(test_span)
else
# after each retry we record the result, strategy will decide if we should retry again
current_retry_strategy&.record_retry(test_span)
# after each retry we record the result, the driver will decide if we should retry again
current_retry_driver&.record_retry(test_span)
end
end

def record_test_span_duration(tracer_span)
current_retry_strategy&.record_duration(tracer_span.duration)
end

private

def current_retry_strategy
Thread.current[FIBER_LOCAL_CURRENT_RETRY_STRATEGY_KEY]
end

def current_retry_strategy=(strategy)
Thread.current[FIBER_LOCAL_CURRENT_RETRY_STRATEGY_KEY] = strategy
current_retry_driver&.record_duration(tracer_span.duration)
end

def should_retry_failed_test?(test_span)
if @retry_failed_tests_count >= @retry_failed_tests_total_limit
Datadog.logger.debug do
"Retry failed tests limit reached: [#{@retry_failed_tests_count}] out of [#{@retry_new_tests_total_limit}]"
end
@retry_failed_tests_enabled = false
end

@retry_failed_tests_enabled && !!test_span&.failed?
end

def should_retry_new_test?(test_span)
if @retry_new_tests_count >= @retry_new_tests_total_limit
Datadog.logger.debug do
"Retry new tests limit reached: [#{@retry_new_tests_count}] out of [#{@retry_new_tests_total_limit}]"
end
@retry_new_tests_enabled = false
mark_test_session_faulty(Datadog::CI.active_test_session)
end

@retry_new_tests_enabled && !test_span.skipped? && is_new_test?(test_span)
# Clears the current retry driver, so the next call to #record_test_finished
# builds a new driver instead of recording a retry.
#
# This API is aimed at the Cucumber instrumentation, or any other
# instrumentation that cannot leverage the #with_retries method.
def reset_retries!
self.current_retry_driver = nil
end

def test_visibility_component
Datadog.send(:components).test_visibility
# Asks the current retry driver whether another attempt is needed.
# Always returns a strict boolean: false when no driver is set, otherwise
# the driver's #should_retry? answer coerced with !!.
def should_retry?
!!current_retry_driver&.should_retry?
end

def is_new_test?(test_span)
test_id = Utils::TestRun.datadog_test_id(test_span.name, test_span.test_suite_name)

result = !@retry_new_tests_unique_tests_set.include?(test_id)

if result
Datadog.logger.debug do
"#{test_id} is not found in the unique tests set, it is a new test"
end
end
private

result
# Reads the retry driver stored in Thread.current under
# FIBER_LOCAL_CURRENT_RETRY_DRIVER_KEY.
def current_retry_driver
Thread.current[FIBER_LOCAL_CURRENT_RETRY_DRIVER_KEY]
end

def mark_test_session_faulty(test_session)
test_session&.set_tag(Ext::Test::TAG_EARLY_FLAKE_ABORT_REASON, Ext::Test::EARLY_FLAKE_FAULTY)
# Stores the given retry driver (or nil to clear it) in Thread.current under
# FIBER_LOCAL_CURRENT_RETRY_DRIVER_KEY.
def current_retry_driver=(driver)
Thread.current[FIBER_LOCAL_CURRENT_RETRY_DRIVER_KEY] = driver
end
end
end
Expand Down
Loading

0 comments on commit 1052c9c

Please sign in to comment.