From 39db13d6ce495ffcc4a641387930521d60059b87 Mon Sep 17 00:00:00 2001 From: Praateek Mahajan Date: Fri, 7 Feb 2025 16:10:07 -0800 Subject: [PATCH] Removal logic for fuzzy / exact (no class abstraction) (#509) Signed-off-by: Phillip Mobley --- nemo_curator/modules/exact_dedup.py | 2 +- nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py index f06497c5..4bdd93b3 100644 --- a/nemo_curator/modules/exact_dedup.py +++ b/nemo_curator/modules/exact_dedup.py @@ -146,7 +146,7 @@ def hash_documents( # TODO: Generalize ty using self.hash_method return df.apply(lambda x: md5(x.encode()).hexdigest()) - def call(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]: + def identify_duplicates(self, dataset: DocumentDataset) -> DocumentDataset: """ Find document ID's for exact duplicates in a given DocumentDataset Parameters diff --git a/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py b/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py index 895ddeca..cdc98e6d 100644 --- a/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py +++ b/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py @@ -131,7 +131,9 @@ def __init__( profile_dir=self.config.profile_dir, ) - def call(self, dataset: DocumentDataset): + def identify_duplicates( + self, dataset: DocumentDataset + ) -> Optional[DocumentDataset]: """ Parameters ----------