Removal logic for fuzzy / exact (no class abstraction) #509

Merged
33 commits
89b9005
fc
praateekmahajan Jan 28, 2025
37f6bee
add shuffle/ tests
praateekmahajan Jan 30, 2025
69c8955
more test for class
praateekmahajan Jan 30, 2025
de25476
pre-commit
praateekmahajan Jan 30, 2025
a698bf0
remove class abstractions
praateekmahajan Jan 31, 2025
a2e0c42
remove unused import
praateekmahajan Jan 31, 2025
845cae3
add __call__ methods back
praateekmahajan Jan 31, 2025
2a1da6b
change from modules / update docs
praateekmahajan Feb 3, 2025
48bef03
add tests
praateekmahajan Feb 4, 2025
958161d
update blocksize to 1024 in exact
praateekmahajan Feb 4, 2025
7275609
pr suggestions
praateekmahajan Feb 5, 2025
cba7fcd
warning
praateekmahajan Feb 5, 2025
bcb7cea
Update docs/user-guide/gpudeduplication.rst
praateekmahajan Feb 6, 2025
c929927
Update docs/user-guide/gpudeduplication.rst
praateekmahajan Feb 6, 2025
0afd1a1
Update docs/user-guide/gpudeduplication.rst
praateekmahajan Feb 6, 2025
6f1e4d9
Update examples/exact_deduplication.py
praateekmahajan Feb 6, 2025
1347e37
Update examples/exact_deduplication.py
praateekmahajan Feb 6, 2025
2e3c908
Update examples/fuzzy_deduplication.py
praateekmahajan Feb 6, 2025
bc20a5d
Update examples/fuzzy_deduplication.py
praateekmahajan Feb 6, 2025
6e26edb
Update examples/fuzzy_deduplication.py
praateekmahajan Feb 6, 2025
8ba196a
Update nemo_curator/modules/config.py
praateekmahajan Feb 6, 2025
8936ac9
Update nemo_curator/modules/config.py
praateekmahajan Feb 6, 2025
e41c5fa
Update nemo_curator/modules/exact_dedup.py
praateekmahajan Feb 6, 2025
9c7f4bf
add file back
praateekmahajan Feb 6, 2025
fe6f018
merge
praateekmahajan Feb 6, 2025
7f0da3e
pre-commit
praateekmahajan Feb 6, 2025
b438c80
forgot to rename back to identify_duplicates after merge
praateekmahajan Feb 6, 2025
f8040b5
renamed func in call
praateekmahajan Feb 6, 2025
82f0c6c
split code / read fpp=1
praateekmahajan Feb 7, 2025
bf5498f
Update docs/user-guide/gpudeduplication.rst
praateekmahajan Feb 7, 2025
f172c72
Update nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py
praateekmahajan Feb 7, 2025
2beca67
Update nemo_curator/modules/exact_dedup.py
praateekmahajan Feb 7, 2025
f8d89da
Merge branch 'main' into praateek/removal-code-no-abstraction
praateekmahajan Feb 8, 2025
20 changes: 12 additions & 8 deletions docs/user-guide/gpudeduplication.rst
@@ -63,14 +63,14 @@ After ensuring your dataset has a unique ID field (or creating one with the code
from nemo_curator.datasets import DocumentDataset

# Initialize the deduplication object
ExactDups = ExactDuplicates(id_field="my_id", text_field="text")
exact_duplicates = ExactDuplicates(id_field="my_id", text_field="text")

dataset = DocumentDataset.read_parquet(
input_files="/path/to/parquet/data",
backend="cudf", # or "pandas" for CPU
)

duplicate_docs = ExactDups(dataset)
duplicate_docs = exact_duplicates(dataset)

"""
Sample output:
@@ -82,9 +82,11 @@ After ensuring your dataset has a unique ID field (or creating one with the code
107 doc_prefix-52271 0f763a2937d57b9d96bf9f220e55f2bd
"""

deduplicated_dataset = exact_duplicates.remove(dataset, duplicate_docs)
Collaborator: Should also include the perform_removal option above?
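For reference, a one-step sketch using the ``perform_removal`` flag that this PR adds to ``__call__``, reusing the ``exact_duplicates`` object and ``dataset`` from the snippet above (``perform_removal`` defaults to ``False``):

# Sketch: identify and remove exact duplicates in a single call.
# With perform_removal=True the call returns the deduplicated dataset;
# with the default False it returns the duplicate IDs as before.
deduplicated_dataset = exact_duplicates(dataset, perform_removal=True)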



.. tip::
A more comprehensive example, including how to remove documents from a corpus using the list of
duplicate IDs generated from the exact deduplication step above, can be found in `examples/exact_deduplication.py <https://github.com/NVIDIA/NeMo-Curator/blob/main/examples/exact_deduplication.py>`_.
A more comprehensive example can be found in `examples/exact_deduplication.py <https://github.com/NVIDIA/NeMo-Curator/blob/main/examples/exact_deduplication.py>`_.

""""""""""""
CLI Utility
@@ -226,14 +228,14 @@ Python API
from nemo_curator.datasets import DocumentDataset

# Initialize the deduplication object
FuzzyDups = FuzzyDuplicates(config=config, logger="./")
fuzzy_duplicates = FuzzyDuplicates(config=config, logger="./")

dataset = DocumentDataset.read_json(
input_files="/path/to/jsonl/data",
backend="cudf", # FuzzyDuplicates only supports datasets with the cuDF backend.
)

duplicate_docs = FuzzyDups(dataset)
duplicate_docs = fuzzy_duplicates(dataset)
"""
Sample output:
my_id group
@@ -244,10 +246,12 @@ Python API
4 doc_prefix-42050 154
"""

deduplicated_dataset = fuzzy_duplicates.remove(dataset, duplicate_docs)
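As with exact deduplication, the ``perform_removal`` flag added to ``__call__`` in this PR collapses the two steps; a minimal sketch reusing the ``fuzzy_duplicates`` object and ``dataset`` from above:

# Sketch: one-step fuzzy dedup using the perform_removal flag added in this PR.
deduplicated_dataset = fuzzy_duplicates(dataset, perform_removal=True)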


.. tip::

- A more comprehensive example for the above, including how to remove documents from a corpus using the list of
duplicate IDs generated from fuzzy deduplication, can be found in `examples/fuzzy_deduplication.py <https://github.com/NVIDIA/NeMo-Curator/blob/main/examples/fuzzy_deduplication.py>`_.
- A comprehensive example can be found in `examples/fuzzy_deduplication.py <https://github.com/NVIDIA/NeMo-Curator/blob/main/examples/fuzzy_deduplication.py>`_.
- The default values of ``num_buckets`` and ``hashes_per_bucket`` are set to find documents with an approximately Jaccard similarity of 0.8 or above.
- Higher ``buckets_per_shuffle`` values can lead to better performance but might lead to out of memory errors.
- Setting the ``false_positive_check`` flag to ``False`` is ideal for optimal performance.
15 changes: 1 addition & 14 deletions examples/exact_deduplication.py
@@ -56,20 +56,7 @@ def main(args):
duplicates = DocumentDataset.read_parquet(duplicates, backend=backend)

# It's easy to apply dataframe operations to the dataset by using the underlying df.

# By default all duplicate id's are included in the result
# keep 1 document from each group of duplcates and mark the others to remove
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
docs_to_remove = duplicates.df.map_partitions(
lambda x: x[x._hashes.duplicated(keep="first")]
)

# When there are few duplicates we can compute the results to a list and use `isin`.
result = input_dataset.df[
~input_dataset.df[dataset_id_field].isin(
docs_to_remove[dataset_id_field].compute()
)
]
result = exact_dup.remove(input_dataset, duplicates)
write_to_disk(result, output_dir, output_type="parquet")
print(time.time() - t0)

14 changes: 1 addition & 13 deletions examples/fuzzy_deduplication.py
@@ -84,19 +84,7 @@ def main(args):
print(f"Time taken:{time.time() - t0}s")
return

# By default all duplicate id's and the group they belong to are included in the result
# keep 1 document from each group of duplcates and mark the others to remove
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
docs_to_remove = duplicates.df.map_partitions(
lambda x: x[x.group.duplicated(keep="first")]
)

# When there are few duplicates we can compute the results to a list and use `isin`.
result = input_dataset.df[
~input_dataset.df[dataset_id_field].isin(
docs_to_remove[dataset_id_field].compute()
)
]
result = fuzzy_dup.remove(input_dataset, duplicates)
write_to_disk(result, output_dir, output_type=filetype)
print(f"Time taken:{time.time() - t0}s")

49 changes: 41 additions & 8 deletions nemo_curator/modules/exact_dedup.py
@@ -18,7 +18,6 @@
import time
import warnings
from contextlib import nullcontext
from datetime import datetime
from hashlib import md5
from typing import Optional, Union

@@ -31,6 +30,7 @@
from nemo_curator.log import create_logger
from nemo_curator.utils.distributed_utils import performance_report_if_with_ts_suffix
from nemo_curator.utils.gpu_utils import is_cudf_type
from nemo_curator.utils.removal import remove_duplicates


class ExactDuplicates:
@@ -64,6 +64,7 @@ def __init__(
raise ValueError(
f"{hash_method} not in supported hash_methods. Choose a hash_method from {self.SUPPORTED_HASHES}"
)

self.hash_method = hash_method
self.id_field = id_field
self.text_field = text_field
@@ -135,7 +136,7 @@ def hash_documents(
# TODO: Generalize ty using self.hash_method
return df.apply(lambda x: md5(x.encode()).hexdigest())

def __call__(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]:
def identify(self, dataset: DocumentDataset) -> DocumentDataset:
Collaborator (suggested change):
def identify(self, dataset: DocumentDataset) -> DocumentDataset:
def _identify(self, dataset: DocumentDataset) -> DocumentDataset:
Nit, but maybe call them _identify and _remove if they are not intended to be accessed by the user directly.

Collaborator (Author): Let's keep them exposed, especially since remove won't work at scales where the size of the duplicates >> host memory, in which case the user will need to break down identify and remove.

Collaborator: Yes, that makes sense to me. What about calling it identify_duplicates?

Collaborator (Author): Sounds good. Initially I thought it's slightly verbose, but another argument in favor of identify_duplicates is that in the future we might want to expose identify_documents_to_keep, in which case the distinction might be necessary.

cc @ayushdg / @ryantwolf / @VibhuJawa
"""
Find document ID's for exact duplicates in a given DocumentDataset
Parameters
@@ -166,10 +167,42 @@ def __call__(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]:
self._logger.info(
f"Time taken for Exact Dedup Computation = {time.time() - t0}s and output written at {write_path}"
)
if is_cudf_type(result):
import dask_cudf
backend = "cudf" if is_cudf_type(result) else "pandas"
return DocumentDataset.read_parquet(
write_path,
backend=backend,
blocksize="1024MiB",
files_per_partition=None,
split_row_groups=False,
)

result_dataset = dask_cudf.read_parquet(write_path, split_row_groups=False)
else:
result_dataset = dd.read_parquet(write_path)
return DocumentDataset(result_dataset)
def remove(
self, dataset: DocumentDataset, duplicates_to_remove: Optional[DocumentDataset]
) -> DocumentDataset:
"""
Remove exact duplicates from a given DocumentDataset
Parameters
----------
dataset: DocumentDataset
The input dataset from which to remove exact duplicates
Returns
-------
DocumentDataset containing only non-duplicate documents
"""
if not duplicates_to_remove:
return None
result = remove_duplicates(
left=dataset.df,
duplicates=duplicates_to_remove.df,
id_field=self.id_field,
group_field="_hashes",
)
return DocumentDataset(result)

def __call__(
self, dataset: DocumentDataset, perform_removal: bool = False
) -> DocumentDataset:
duplicates = self.identify(dataset)
if duplicates and perform_removal:
return self.remove(dataset, duplicates)
return duplicates
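Per the thread above, identify and remove stay public so the two steps can be run independently when needed; a minimal usage sketch, with the field names taken from the user guide and the input path purely illustrative:

# Two-step usage of the methods added in this diff (hypothetical path).
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules.exact_dedup import ExactDuplicates

exact = ExactDuplicates(id_field="my_id", text_field="text")
dataset = DocumentDataset.read_parquet("/path/to/parquet/data", backend="cudf")

duplicate_ids = exact.identify(dataset)  # one row per duplicate document (my_id + _hashes)
deduplicated = exact.remove(dataset, duplicate_ids)  # keeps one document per hash group

Calling exact(dataset, perform_removal=True) is equivalent to running these two calls back to back.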
2 changes: 0 additions & 2 deletions nemo_curator/modules/fuzzy_dedup/connectedcomponents.py
@@ -125,8 +125,6 @@ def _run_connected_components(
f"# rows in labels_df = {len(labels_df)}"
)
assert num_nodes == len(labels_df)
# Ensure all docs in the same group are in the same partition
labels_df = labels_df.shuffle(on=["group"], ignore_index=True)
Collaborator (Author): @ayushdg we're doing this here (the shuffle on "group" now happens inside the new removal utility).
labels_df.to_parquet(output_path, write_index=False, overwrite=True)
Comms.destroy()
self._logger.info(
47 changes: 42 additions & 5 deletions nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py
@@ -17,9 +17,7 @@
import logging
import os
import time
from typing import Union

import dask_cudf
from typing import Optional, Union

from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
@@ -33,6 +31,7 @@
from nemo_curator.modules.fuzzy_dedup.minhash import MinHash
from nemo_curator.modules.meta import Sequential
from nemo_curator.utils.distributed_utils import performance_report_if_with_ts_suffix
from nemo_curator.utils.removal import remove_duplicates


class FuzzyDuplicates:
@@ -63,6 +62,7 @@ def __init__(
self._logger = logger

self.config = config

self.minhash = MinHash(
seed=self.config.seed,
num_hashes=self.config.num_hashes,
@@ -129,7 +129,7 @@ def __init__(
profile_dir=self.config.profile_dir,
)

def __call__(self, dataset: DocumentDataset):
def identify(self, dataset: DocumentDataset) -> Optional[DocumentDataset]:
"""
Parameters
----------
@@ -243,4 +243,41 @@ def __call__(self, dataset: DocumentDataset):
print(f"Stage {stage_num}: Connected Components across buckets complete!")
stage_num += 1

return DocumentDataset(dask_cudf.read_parquet(cc_path, split_row_groups=False))
return DocumentDataset.read_parquet(
cc_path,
backend="cudf",
blocksize="1024MiB",
files_per_partition=None,
split_row_groups=False,
)

def remove(
self, dataset: DocumentDataset, duplicates_to_remove: Optional[DocumentDataset]
) -> Optional[DocumentDataset]:
"""
Remove fuzzy duplicates from a given DocumentDataset
Parameters
----------
dataset: DocumentDataset
The input dataset from which to remove fuzzy duplicates
Returns
-------
DocumentDataset containing only non-duplicate documents
"""
if not duplicates_to_remove:
return None
result = remove_duplicates(
left=dataset.df,
duplicates=duplicates_to_remove.df,
id_field=self.config.id_field,
group_field="group",
)
return DocumentDataset(result)

def __call__(
self, dataset: DocumentDataset, perform_removal: bool = False
) -> DocumentDataset:
duplicates = self.identify(dataset)
if perform_removal:
return self.remove(dataset, duplicates)
return duplicates
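A brief sketch of how identify and remove could be split across runs when the set of duplicate groups is large; write_to_disk is the helper already used in the examples above, and the intermediate path is purely illustrative:

# Run 1: identify only (perform_removal defaults to False), then persist the groups.
fuzzy = FuzzyDuplicates(config=config, logger="./")
duplicate_groups = fuzzy(dataset, perform_removal=False)
write_to_disk(duplicate_groups, "/path/to/duplicate_groups", output_type="parquet")

# Run 2: read the groups back and drop the duplicates from the original dataset.
duplicate_groups = DocumentDataset.read_parquet("/path/to/duplicate_groups", backend="cudf")
deduplicated = fuzzy.remove(dataset, duplicate_groups)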
4 changes: 1 addition & 3 deletions nemo_curator/modules/semantic_dedup/clusteringmodel.py
@@ -195,9 +195,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
id_col=self.id_col,
kmeans_centroids_file=kmeans_centroids_file,
nearest_cent_dir=clustering_output_dir,
output_sorted_clusters_dir=os.path.join(
self.clustering_output_dir, "sorted"
),
vdb=os.path.join(self.clustering_output_dir, "sorted"),
embedding_col=self.embedding_col,
sim_metric=self.sim_metric,
keep_hard=self.keep_hard,
42 changes: 42 additions & 0 deletions nemo_curator/utils/removal.py
Collaborator: Maybe call it "duplicates_removal" or something similar?
@@ -0,0 +1,42 @@
import dask.dataframe as dd


def remove_duplicates(
left: dd.DataFrame,
duplicates: dd.DataFrame,
id_field: str,
group_field: str,
) -> dd.DataFrame:
if left.npartitions < duplicates.npartitions:
msg = (
"The number of partitions in `left` is less than the number of partitions in the duplicates dataset. "
"This may lead to a shuffle join. Please re-read left and right with different partition sizes, or repartition left / right."
)
raise ValueError(msg)

# Create a new column name for temporary ID storage during merge
new_id_field = f"{id_field}_new"

duplicates_to_remove = (
duplicates
# Redistribute data across partitions so that all duplicates are in same partition
.shuffle(on=[group_field], ignore_index=True)
# For each partition, keep only the duplicated rows (excluding first occurrence)
.map_partitions(lambda x: x[x[group_field].duplicated(keep="first")]).drop(
columns=group_field
)
# Rename the ID field to avoid conflicts in the upcoming merge
.rename(columns={id_field: new_id_field})[[new_id_field]]
)

merge = left.merge(
right=duplicates_to_remove,
how="left",
broadcast=True, # Broadcast smaller DataFrame to all partitions
left_on=id_field,
right_on=new_id_field,
)

# Keep only rows with no match, i.e., drop every row whose ID appears in duplicates_to_remove
removed_result = merge[merge[new_id_field].isna()].drop(columns=[new_id_field])
return removed_result
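A small self-contained sketch of how this helper behaves on toy data (pandas-backed dask frames with illustrative values):

import dask.dataframe as dd
import pandas as pd

from nemo_curator.utils.removal import remove_duplicates

# Four documents; 0 and 1 are duplicates of each other (same group in the identify output).
docs = dd.from_pandas(
    pd.DataFrame({"my_id": [0, 1, 2, 3], "text": ["a", "a", "b", "c"]}), npartitions=2
)
dupes = dd.from_pandas(
    pd.DataFrame({"my_id": [0, 1], "group": [7, 7]}), npartitions=1
)

# One member of group 7 is kept and the other dropped; documents 2 and 3 are untouched.
deduped = remove_duplicates(left=docs, duplicates=dupes, id_field="my_id", group_field="group")
print(deduped.compute())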