Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-50639][SQL] Improve warning logging in CacheManager #49276

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,9 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
if (storageLevel == StorageLevel.NONE) {
// Do nothing for StorageLevel.NONE since it will not actually cache any data.
} else if (lookupCachedDataInternal(normalizedPlan).nonEmpty) {
logWarning("Asked to cache already cached data.")
logWarning(log"An attempt was made to cache data even though the data had already been " +
log"cached. Please un-cache data or clear cache first.\nLogical plan:\n" +
log"${MDC(QUERY_PLAN, normalizedPlan)}")
} else {
val sessionWithConfigsOff = getOrCloneSessionWithConfigsOff(spark)
val inMemoryRelation = sessionWithConfigsOff.withActive {
Expand All @@ -140,7 +142,8 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {

this.synchronized {
if (lookupCachedDataInternal(normalizedPlan).nonEmpty) {
logWarning("Data has already been cached.")
logWarning(log"Data has already been cached. No new data is cached.\nLogical plan:\n" +
log"${MDC(QUERY_PLAN, normalizedPlan)}")
} else {
// the cache key is the normalized plan
val cd = CachedData(normalizedPlan, inMemoryRelation)
Expand Down Expand Up @@ -206,7 +209,10 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
plan: LogicalPlan,
cascade: Boolean,
blocking: Boolean): Unit = {
uncacheByCondition(spark, _.sameResult(plan), cascade, blocking)
if (!uncacheByCondition(spark, _.sameResult(plan), cascade, blocking)) {
logWarning(log"Data has not been previously cached or it was removed from the " +
log"cache already.\nLogical plan:\n${MDC(QUERY_PLAN, plan)}")
}
}

def uncacheTableOrView(spark: SparkSession, name: Seq[String], cascade: Boolean): Unit = {
Expand Down Expand Up @@ -241,16 +247,19 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
spark: SparkSession,
isMatchedPlan: LogicalPlan => Boolean,
cascade: Boolean,
blocking: Boolean): Unit = {
blocking: Boolean): Boolean = {
val shouldRemove: LogicalPlan => Boolean =
if (cascade) {
_.exists(isMatchedPlan)
} else {
isMatchedPlan
}
val plansToUncache = cachedData.filter(cd => shouldRemove(cd.plan))
var plansToUncache: IndexedSeq[CachedData] = null
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vrozov why do we need the code change here if this PR is to improve the logging?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gengliangwang logging relies on the plansToUncache.nonEmpty added on line 288 to correctly log warnings, so the change is necessary to prevent race condition (cacheData should be accessed under synchronized).

this.synchronized {
cachedData = cachedData.filterNot(cd => plansToUncache.exists(_ eq cd))
plansToUncache = cachedData.filter(cd => shouldRemove(cd.plan))
if (plansToUncache.nonEmpty) {
cachedData = cachedData.filterNot(cd => plansToUncache.exists(_ eq cd))
}
}
plansToUncache.foreach { _.cachedRepresentation.cacheBuilder.clearCache(blocking) }
CacheManager.logCacheOperation(log"Removed ${MDC(SIZE, plansToUncache.size)} Dataframe " +
Expand All @@ -276,6 +285,7 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
cd.plan.exists(isMatchedPlan) && !cacheAlreadyLoaded
})
}
plansToUncache.nonEmpty
}

// Analyzes column statistics in the given cache data
Expand Down
Loading