diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py
index 4645d55ef..43375a127 100644
--- a/nemo_curator/__init__.py
+++ b/nemo_curator/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+
 import dask
 
 from .modules import *
@@ -21,3 +23,6 @@
 # See https://github.com/NVIDIA/NeMo-Curator/issues/33
 # This also happens when reading and writing to files
 dask.config.set({"dataframe.convert-string": False})
+
+# Enable libcudf large string support
+os.environ["LIBCUDF_LARGE_STRINGS_ENABLED"] = "1"