Skip to content

Commit

Permalink
Removal logic for fuzzy / exact (no class abstraction) (NVIDIA#509)
Browse files Browse the repository at this point in the history
Signed-off-by: Phillip Mobley <[email protected]>
  • Loading branch information
praateekmahajan authored and philm001 committed Feb 10, 2025
1 parent 9638f2c commit 39db13d
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
2 changes: 1 addition & 1 deletion nemo_curator/modules/exact_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def hash_documents(
# TODO: Generalize by using self.hash_method
return df.apply(lambda x: md5(x.encode()).hexdigest())

def call(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]:
def identify_duplicates(self, dataset: DocumentDataset) -> DocumentDataset:
"""
Find document IDs for exact duplicates in a given DocumentDataset
Parameters
Expand Down
4 changes: 3 additions & 1 deletion nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ def __init__(
profile_dir=self.config.profile_dir,
)

def call(self, dataset: DocumentDataset):
def identify_duplicates(
self, dataset: DocumentDataset
) -> Optional[DocumentDataset]:
"""
Parameters
----------
Expand Down

0 comments on commit 39db13d

Please sign in to comment.