Default to using larger batches
Signed-off-by: Ayush Dattagupta <[email protected]>
ayushdg committed Jul 10, 2024
commit a892490 (1 parent: 19777e7)
Showing 4 changed files with 7 additions and 2 deletions.
nemo_curator/modules/fuzzy_dedup.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -602,7 +602,7 @@ def _get_output_map_from_text_bytes_per_bucket(
     ):
         # String bytes limit for cuDF
         # https://github.com/rapidsai/cudf/issues/13733
-        max_text_bytes_per_part = int(np.iinfo(np.int32).max * 1.2)
+        max_text_bytes_per_part = int(np.iinfo(np.int32).max * 3)
 
         self._logger.info(f"max_text_bytes_per_part = {max_text_bytes_per_part}")
         # Increasing in an attempt to prevent hitting
```
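Context for this change: cuDF string columns use 32-bit character offsets, so a single column is capped near `np.iinfo(np.int32).max` bytes (the linked issue). `max_text_bytes_per_part` bounds how much text gets packed into one output partition, so raising the multiplier from 1.2 to 3 yields fewer, larger batches — hence the commit title. A rough sketch of the effect on partition counts (the corpus size is hypothetical; this is not NeMo Curator code):

```python
# Sketch: how the new constant changes the number of output partitions
# produced for a given volume of text bytes.
import numpy as np

old_limit = int(np.iinfo(np.int32).max * 1.2)  # ~2.58 GB per partition
new_limit = int(np.iinfo(np.int32).max * 3)    # ~6.44 GB per partition

total_text_bytes = 500 * 2**30  # hypothetical 500 GiB of text

for name, limit in [("old", old_limit), ("new", new_limit)]:
    n_parts = int(np.ceil(total_text_bytes / limit))
    print(f"{name}: limit={limit:,} bytes -> {n_parts} output partitions")
```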
nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py (3 changes: 3 additions & 0 deletions)

```diff
@@ -70,6 +70,7 @@ def main(args):
             print(f"Processed {args.num_files}... quitting")
             break
 
+        print(args.input_meta)
         files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
         files = [f for f in files if f.endswith(".jsonl")]
         df = read_data(
@@ -78,6 +79,7 @@ def main(args):
             backend="cudf",
             files_per_partition=args.files_per_partition,
             add_filename=False,
+            input_meta=args.input_meta,
         )[[id_field, text_field]]
 
         if num_files is not None:
@@ -120,6 +122,7 @@ def attach_args(parser=None):
         help="Random seed used for intializing the hash "
         "functions used to compute the MinHashes"
     )
+    argumentHelper.add_arg_input_meta()
    parser.add_argument(
        "--char-ngram",
        type=int,
```
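The new `--input-meta` flag (registered via `argumentHelper.add_arg_input_meta()`) lets the script pass a schema down through `read_data` to the JSON reader. A hedged sketch of the resulting call, assuming `input_meta` accepts a string-encoded dict of field names to dtypes (file paths and column names here are hypothetical):

```python
# Sketch of the updated read path, not the script itself.
from nemo_curator.utils.distributed_utils import read_data

files = ["data/part_000.jsonl", "data/part_001.jsonl"]  # hypothetical paths

df = read_data(
    files,
    file_type="jsonl",
    backend="cudf",
    files_per_partition=2,
    add_filename=False,
    # Assumed format: a dict (or its string form) mapping column -> dtype.
    input_meta='{"id": "str", "text": "str"}',
)[["id", "text"]]
```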
nemo_curator/utils/distributed_utils.py (2 changes: 2 additions & 0 deletions)

```diff
@@ -223,6 +223,8 @@ def read_single_partition(
     read_kwargs = {"lines": filetype == "jsonl"}
     if backend == "cudf":
         read_f = cudf.read_json
+        if input_meta is not None:
+            read_kwargs["prune_columns"] = True
     else:
         read_kwargs["dtype"] = False
         read_f = pd.read_json
```
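`prune_columns` is a `cudf.read_json` option: when a `dtype` mapping is supplied, columns not listed in it are dropped at read time rather than parsed and materialized. A small sketch of the behavior this enables (assuming a cuDF version that ships the flag; the sample data is made up):

```python
# Sketch: with a dtype mapping and prune_columns=True, fields absent
# from the mapping (here "metadata") never become columns.
import cudf

with open("sample.jsonl", "w") as f:
    f.write('{"id": "a", "text": "hello", "metadata": {"url": "x"}}\n')

df = cudf.read_json(
    "sample.jsonl",
    lines=True,
    dtype={"id": "str", "text": "str"},
    prune_columns=True,
)
print(df.columns)  # only id and text
```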
nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -83,7 +83,7 @@ def get_shuffle_part_ids_df(
     num_workers=0,
 ):
     sizes = agg_df[size_col].values
-    max_text_bytes_per_part = int(np.iinfo(np.int32).max // 1.2)
+    max_text_bytes_per_part = int(np.iinfo(np.int32).max * 3)
 
     # Adjust max_text_bytes_per_part if the number of output
     # partitions is small compared to the number of workers.
```
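Note that the two call sites previously disagreed (`* 1.2` in fuzzy_dedup.py versus `// 1.2` here); both now use the same `* 3` cap. Conceptually, the cap drives a greedy packing of per-bucket byte sizes into output partitions. A simplified sketch of that idea (not the actual library function):

```python
# Sketch: greedily pack bucket sizes into partitions, starting a new
# partition once adding the next bucket would exceed the byte cap.
import numpy as np

def assign_part_ids(sizes, max_bytes):
    part_ids = np.empty(len(sizes), dtype=np.int64)
    current, used = 0, 0
    for i, s in enumerate(sizes):
        if used + s > max_bytes and used > 0:
            current, used = current + 1, 0
        part_ids[i] = current
        used += s
    return part_ids

sizes = [1_000_000_000, 5_000_000_000, 3_000_000_000, 2_000_000_000]
cap = int(np.iinfo(np.int32).max * 3)  # ~6.44 GB, the new default
print(assign_part_ids(sizes, cap))     # [0 0 1 1]
```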
