Skip to content

Commit

Permalink
Expose more configurations to test long string support
Browse files Browse the repository at this point in the history
Signed-off-by: Ayush Dattagupta <[email protected]>
  • Loading branch information
ayushdg committed Jul 10, 2024
1 parent 9a3bbbd commit ba7e8a1
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
3 changes: 3 additions & 0 deletions nemo_curator/modules/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ class FuzzyDuplicatesConfig(BaseConfig):
# Only required for fp check
num_anchors: int = 2
jaccard_threshold: float = 0.8
+ bucket_mapping_blocksize: int = 256
+ parts_per_worker: int = 1
+ bucket_parts_per_worker: int = 8

def __post_init__(self):
self.num_hashes = self.num_buckets * self.hashes_per_bucket
Expand Down
8 changes: 4 additions & 4 deletions nemo_curator/modules/fuzzy_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,9 @@ def __call__(self, dataset: DocumentDataset):
documents_df=dataset.df,
bucket_w_anchors_path=mapped_buckets_w_anchors_path,
output_shuffled_docs_path=shuffled_docs_path,
- bucket_mapping_df_blocksize=256,
- parts_per_worker=1,
- bucket_parts_per_worker=8,
+ bucket_mapping_df_blocksize=self.config.bucket_mapping_blocksize,
+ parts_per_worker=self.config.parts_per_worker,
+ bucket_parts_per_worker=self.config.bucket_parts_per_worker,
)
print("Stage3 (False Postive Check): Shuffle docs complete!")

Expand Down Expand Up @@ -602,7 +602,7 @@ def _get_output_map_from_text_bytes_per_bucket(
):
# String bytes limit for cuDF
# https://github.com/rapidsai/cudf/issues/13733
- max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2)
+ max_text_bytes_per_part = int(np.iinfo(np.int32).max * 1.2)

self._logger.info(f"max_text_bytes_per_part = {max_text_bytes_per_part}")
# Increasing in an attempt to prevent hitting
Expand Down

0 comments on commit ba7e8a1

Please sign in to comment.