diff --git a/core/src/main/scala/org/apache/spark/util/Clock.scala b/core/src/main/scala/org/apache/spark/util/Clock.scala
index d2674d4f47224..226f15d3d38c2 100644
--- a/core/src/main/scala/org/apache/spark/util/Clock.scala
+++ b/core/src/main/scala/org/apache/spark/util/Clock.scala
@@ -42,7 +42,7 @@ private[spark] trait Clock {
  *
  * TL;DR: on modern (2.6.32+) Linux kernels with modern (AMD K8+) CPUs, the values returned by
  * `System.nanoTime()` are consistent across CPU cores *and* packages, and provide always
- * increasing values (although it may not be completely monotonic when the the system clock is
+ * increasing values (although it may not be completely monotonic when the system clock is
  * adjusted by NTP daemons using time slew).
  */
 // scalastyle:on line.size.limit
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
index 7221623f89e1b..a0da3ca5b5f3b 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
@@ -83,7 +83,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) }

     // Five messages should be marked as received and queued, but no messages should be posted to
-    // listeners yet because the the listener bus hasn't been started.
+    // listeners yet because the listener bus hasn't been started.
     assert(bus.metrics.numEventsPosted.getCount === 5)
     assert(bus.queuedEvents.size === 5)
@@ -206,7 +206,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     assert(sharedQueueSize(bus) === 1)
     assert(numDroppedEvents(bus) === 1)

-    // Allow the the remaining events to be processed so we can stop the listener bus:
+    // Allow the remaining events to be processed so we can stop the listener bus:
     listenerWait.release(2)
     bus.stop()
   }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index 2b5993a352cb0..0b4e1494bf300 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -436,7 +436,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite
     val it = map.iterator
     assert(it.isInstanceOf[CompletionIterator[_, _]])
     // org.apache.spark.util.collection.AppendOnlyMap.destructiveSortedIterator returns
-    // an instance of an annonymous Iterator class.
+    // an instance of an anonymous Iterator class.
     val underlyingMapRef = WeakReference(map.currentMap)
diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml
index 36e0b99a07ffd..1149e4704be2e 100644
--- a/docs/_data/menu-sql.yaml
+++ b/docs/_data/menu-sql.yaml
@@ -233,5 +233,5 @@
     url: sql-ref-functions-udf-scalar.html
   - text: Aggregate functions
     url: sql-ref-functions-udf-aggregate.html
-  - text: Arthmetic operations
+  - text: Arithmetic operations
     url: sql-ref-arithmetic-ops.html
diff --git a/docs/configuration.md b/docs/configuration.md
index 497a2ad36b67c..a02733fdbee89 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2423,7 +2423,7 @@ showDF(properties, numRows = 200, truncate = FALSE)
     Interval at which data received by Spark Streaming receivers is chunked
     into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance
-    tuning section in the Spark Streaming programing guide for more details.
+    tuning section in the Spark Streaming programming guide for more details.
  None | Optional Avro schema (in JSON format) that was used to serialize the data. This should be set if the schema provided for deserialization is compatible with - but not the same as - the one used to originally convert the data to Avro.
- For more information on Avro's schema evolution and compatability, please refer to the [documentation of Confluent](https://docs.confluent.io/current/schema-registry/avro.html).
+ For more information on Avro's schema evolution and compatibility, please refer to the [documentation of Confluent](https://docs.confluent.io/current/schema-registry/avro.html).
  | function from_avro |
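For context on the hunk above, a minimal spark-shell sketch of the `to_avro`/`from_avro` round trip that this option feeds into. The option name and table header are not shown in this hunk, so the sketch only demonstrates the basic functions; the record schema, column names, and package version below are illustrative, and the external `spark-avro` module must be on the classpath.

```scala
// e.g. spark-shell --packages org.apache.spark:spark-avro_2.12:3.0.0 (version is illustrative)
import org.apache.spark.sql.avro.functions.{from_avro, to_avro}
import org.apache.spark.sql.functions.struct

// Hypothetical record schema used as the deserialization schema. In the schema-evolution
// scenario described above, this would be compatible with, but not identical to, the
// schema the data was originally written with.
val jsonFormatSchema =
  """{"type":"record","name":"user","fields":[
    |  {"name":"name","type":"string"},
    |  {"name":"age","type":"int"}
    |]}""".stripMargin

val df = Seq(("Alice", 30), ("Bob", 25)).toDF("name", "age")  // relies on spark-shell implicits
val serialized = df.select(to_avro(struct(df("name"), df("age"))).as("value"))
serialized.select(from_avro(serialized("value"), jsonFormatSchema).as("user"))
  .select("user.name", "user.age")
  .show()
```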
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 1db2a7d41082b..674621f3fdfaf 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -220,7 +220,7 @@ license: |
- Since Spark 3.0, when casting interval values to string type, there is no "interval" prefix, e.g. `1 days 2 hours`. In Spark version 2.4 and earlier, the string contains the "interval" prefix like `interval 1 days 2 hours`.
- - Since Spark 3.0, when casting string value to integral types(tinyint, smallint, int and bigint), datetime types(date, timestamp and interval) and boolean type, the leading and trailing whitespaces(<= ACSII 32) will be trimmed before converted to these type values, e.g. `cast(' 1\t' as int)` results `1`, `cast(' 1\t' as boolean)` results `true`, `cast('2019-10-10\t as date)` results the date value `2019-10-10`. In Spark version 2.4 and earlier, while casting string to integrals and booleans, it will not trim the whitespaces from both ends, the foregoing results will be `null`, while to datetimes, only the trailing spaces(= ASCII 32) will be removed.
+ - Since Spark 3.0, when casting string values to integral types (tinyint, smallint, int and bigint), datetime types (date, timestamp and interval) and the boolean type, the leading and trailing whitespace (<= ASCII 32) is trimmed before the value is converted, e.g. `cast(' 1\t' as int)` results in `1`, `cast(' 1\t' as boolean)` results in `true`, and `cast('2019-10-10\t' as date)` results in the date value `2019-10-10`. In Spark version 2.4 and earlier, casting a string to integrals and booleans does not trim the whitespace from both ends, so the foregoing results are `null`, while for datetimes only the trailing spaces (= ASCII 32) are removed.
- Since Spark 3.0, numbers written in scientific notation(e.g. `1E2`) would be parsed as Double. In Spark version 2.4 and earlier, they're parsed as Decimal. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.exponentLiteralAsDecimal.enabled` to `true`.
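A short spark-shell sketch of the two behavior changes described in the notes above; the expected values restate the migration guide's own examples (Spark 2.4 returns `null` for the first two casts and parses `1E2` as a decimal). It assumes a running `SparkSession` named `spark`, as in spark-shell.

```scala
// The \t below is a real tab character inside the SQL string literal.
spark.sql("SELECT CAST(' 1\t' AS INT)").show()           // 1      (null in Spark 2.4)
spark.sql("SELECT CAST(' 1\t' AS BOOLEAN)").show()       // true   (null in Spark 2.4)
spark.sql("SELECT CAST('2019-10-10\t' AS DATE)").show()  // 2019-10-10
spark.sql("SELECT 1E2").printSchema()                    // double (decimal in Spark 2.4)
```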
diff --git a/docs/sql-pyspark-pandas-with-arrow.md b/docs/sql-pyspark-pandas-with-arrow.md
index d638278b42355..7eb8a74547f70 100644
--- a/docs/sql-pyspark-pandas-with-arrow.md
+++ b/docs/sql-pyspark-pandas-with-arrow.md
@@ -255,7 +255,7 @@ different than a Pandas timestamp. It is recommended to use Pandas time series f
working with timestamps in `pandas_udf`s to get the best performance, see
[here](https://pandas.pydata.org/pandas-docs/stable/timeseries.html) for details.
-### Compatibiliy Setting for PyArrow >= 0.15.0 and Spark 2.3.x, 2.4.x
+### Compatibility Setting for PyArrow >= 0.15.0 and Spark 2.3.x, 2.4.x
Since Arrow 0.15.0, a change in the binary IPC format requires an environment variable to be
compatible with previous versions of Arrow <= 0.14.1. This is only necessary to do for PySpark
diff --git a/docs/sql-ref-null-semantics.md b/docs/sql-ref-null-semantics.md
index fd467d224ffd5..3cbc15c600cee 100644
--- a/docs/sql-ref-null-semantics.md
+++ b/docs/sql-ref-null-semantics.md
@@ -25,14 +25,14 @@ A column is associated with a data type and represents
a specific attribute of an entity (for example, `age` is a column of an
entity called `person`). Sometimes, the value of a column
specific to a row is not known at the time the row comes into existence.
-In `SQL`, such values are represnted as `NULL`. This section details the
+In `SQL`, such values are represented as `NULL`. This section details the
semantics of `NULL` values handling in various operators, expressions and
other `SQL` constructs.
1. [Null handling in comparison operators](#comp-operators)
2. [Null handling in Logical operators](#logical-operators)
3. [Null handling in Expressions](#expressions)
- 1. [Null handling in null-in-tolerant expressions](#null-in-tolerant)
+ 1. [Null handling in null-intolerant expressions](#null-intolerant)
2. [Null handling Expressions that can process null value operands](#can-process-null)
3. [Null handling in built-in aggregate expressions](#built-in-aggregate)
4. [Null handling in WHERE, HAVING and JOIN conditions](#condition-expressions)
@@ -61,10 +61,10 @@ the `age` column and this table will be used in various examples in the sections
700 | Dan | 50 |
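A minimal spark-shell sketch (assuming a `SparkSession` named `spark`) of the comparison-operator behavior listed first in the contents above: any comparison with `NULL` evaluates to `NULL` (unknown) rather than `true` or `false`, which is why the `IS [NOT] NULL` predicates exist.

```scala
// Comparing NULL with anything, including NULL itself, yields NULL; IS NULL yields true/false.
spark.sql("SELECT NULL = NULL AS eq, NULL IS NULL AS is_null").show()
// `eq` is null (unknown), `is_null` is true
```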