Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Nemotron CC SDG Pipelines and Pre-processing/Post-Processing Stages #527

Merged
merged 23 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/user-guide/api/filters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,14 @@ Heuristic Filters
:members:
:member-order: bysource

.. autoclass:: nemo_curator.filters.TokenCountFilter
:members:
:member-order: bysource

.. autoclass:: nemo_curator.filters.SubstringFilter
:members:
:member-order: bysource

------------------------------
Code Filters
------------------------------
Expand Down
6 changes: 6 additions & 0 deletions docs/user-guide/api/misc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ Miscellaneous

.. autoclass:: nemo_curator.Shuffle
:members:

.. autoclass:: nemo_curator.DocumentSplitter
:members:

.. autoclass:: nemo_curator.DocumentJoiner
:members:
19 changes: 19 additions & 0 deletions docs/user-guide/api/modifiers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,22 @@ Modifiers

.. autoclass:: nemo_curator.modifiers.PiiModifier
:members:

.. autoclass:: nemo_curator.modifiers.LineRemover
:members:

.. autoclass:: nemo_curator.modifiers.MarkdownRemover
:members:

.. autoclass:: nemo_curator.modifiers.NewlineNormalizer
:members:

.. autoclass:: nemo_curator.modifiers.UrlRemover
:members:

.. autoclass:: nemo_curator.modifiers.Slicer
:members:

.. autoclass:: nemo_curator.modifiers.QuotationRemover
:members:

4 changes: 4 additions & 0 deletions nemo_curator/filters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@
RepeatedParagraphsFilter,
RepeatingDuplicateNGramsFilter,
RepeatingTopNGramsFilter,
SubstringFilter,
SymbolsToWordsFilter,
TokenCountFilter,
UrlsFilter,
WhiteSpaceFilter,
WordCountFilter,
Expand Down Expand Up @@ -98,4 +100,6 @@
"QualityEstimationFilter",
"AnswerabilityFilter",
"EasinessFilter",
"TokenCountFilter",
"SubstringFilter",
]
62 changes: 62 additions & 0 deletions nemo_curator/filters/heuristic_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@

import os.path
import tarfile
from typing import Literal

import requests
from platformdirs import user_cache_dir
from transformers import AutoTokenizer

from nemo_curator.filters.bitext_filter import BitextFilter
from nemo_curator.filters.doc_filter import DocumentFilter, import_filter
Expand Down Expand Up @@ -671,6 +673,66 @@ def keep_document(self, score):
return score != 1


class TokenCountFilter(DocumentFilter):
    """
    Keeps documents whose token count lies inside an inclusive range.

    A document's score is the number of tokens the supplied tokenizer produces
    for it. Documents scoring below ``min_tokens`` or above ``max_tokens`` are
    discarded.
    """

    def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf")):
        """
        Args:
            tokenizer (AutoTokenizer): Tokenizer whose ``encode`` method is used to
                tokenize each document.
            min_tokens (int): Smallest token count a kept document may have.
                The default of 0 disables the lower bound.
            max_tokens (int): Largest token count a kept document may have.
                The default of infinity disables the upper bound.
        """
        super().__init__()
        # Assigned after the base initializer has run.
        self._name = "token_count"
        self._tokenizer = tokenizer
        self._min_tokens, self._max_tokens = min_tokens, max_tokens

    def score_document(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self._tokenizer.encode(text))

    def keep_document(self, score: int) -> bool:
        """Return True when ``score`` is within ``[min_tokens, max_tokens]``."""
        meets_minimum = self._min_tokens <= score
        return meets_minimum and score <= self._max_tokens


class SubstringFilter(DocumentFilter):
    """
    Retains documents in which a substring occurs at a required position.

    A document scores 1 when the substring is present at the configured
    position and 0 otherwise; only documents scoring 1 are kept.
    """

    def __init__(self, substring: str, position: Literal["prefix", "suffix", "any"]):
        """
        Args:
            substring (str): The text to look for.
            position (Literal["prefix", "suffix", "any"]): Where the substring must
                appear: at the start of the document ("prefix"), at the end
                ("suffix"), or anywhere inside it ("any").

        Raises:
            ValueError: If ``position`` is not one of the three accepted values.
        """
        super().__init__()
        self._substring = substring
        allowed_positions = ("prefix", "suffix", "any")
        if position not in allowed_positions:
            raise ValueError(
                f"Invalid position: {position}. Must be one of: prefix, suffix, any."
            )
        self._position = position

    def score_document(self, text: str) -> int:
        """Return 1 if the substring is found at the configured position, else 0."""
        needle = self._substring
        where = self._position
        if where == "prefix":
            return 1 if text.startswith(needle) else 0
        if where == "suffix":
            return 1 if text.endswith(needle) else 0
        if where == "any":
            return 1 if needle in text else 0
        # No other position passes the check in __init__; falling through
        # yields None, as before.

    def keep_document(self, score: int) -> bool:
        """Keep only documents whose score is exactly 1."""
        return score == 1


class HistogramFilter(DocumentFilter):
"""Histogram filter used by the NLLB paper (https://arxiv.org/pdf/2207.04672). See p30 for details.

Expand Down
8 changes: 8 additions & 0 deletions nemo_curator/modifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@
from .c4 import BoilerPlateStringModifier
from .doc_modifier import DocumentModifier
from .fasttext import FastTextLabelModifier
from .line_remover import LineRemover
from .markdown_remover import MarkdownRemover
from .newline_normalizer import NewlineNormalizer
from .pii_modifier import PiiModifier
from .quotation_remover import QuotationRemover
from .slicer import Slicer
from .unicode_reformatter import UnicodeReformatter
from .url_remover import UrlRemover

Expand All @@ -25,7 +29,11 @@
"BoilerPlateStringModifier",
"FastTextLabelModifier",
"UnicodeReformatter",
"QuotationRemover",
"LineRemover",
"MarkdownRemover",
"PiiModifier",
"NewlineNormalizer",
"UrlRemover",
"Slicer",
]
36 changes: 36 additions & 0 deletions nemo_curator/modifiers/line_remover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from nemo_curator.modifiers import DocumentModifier


class LineRemover(DocumentModifier):
    """
    Drops every line of a document that is a member of a given collection of strings.

    A line is tested with ``line not in patterns`` after splitting on "\\n", so for a
    list of strings the whole line must equal one of the entries; the entries are
    not treated as regular expressions.
    """

    def __init__(self, patterns: List[str]):
        """
        Args:
            patterns (List[str]): Lines found in this collection are removed.
        """
        super().__init__()
        self._patterns = patterns

    def modify_document(self, text: str) -> str:
        """Return ``text`` without the lines that appear in ``patterns``."""
        kept = []
        for candidate in text.split("\n"):
            if candidate in self._patterns:
                continue
            kept.append(candidate)
        return "\n".join(kept)
43 changes: 43 additions & 0 deletions nemo_curator/modifiers/markdown_remover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from nemo_curator.modifiers import DocumentModifier

# Each pattern has one capture group holding the text between the markers.
# All quantifiers are non-greedy so each match ends at the nearest closing marker.
MARKDOWN_BOLD_REGEX = r"\*\*(.*?)\*\*"  # **text** -> group 1 is "text"
MARKDOWN_ITALIC_REGEX = r"\*(.*?)\*"  # *text* -> group 1 is "text"
MARKDOWN_UNDERLINE_REGEX = r"_(.*?)_"  # _text_ -> group 1 is "text"
MARKDOWN_LINK_REGEX = r"\[.*?\]\((.*?)\)"  # [text](url) -> group 1 is the url, not the link text


class MarkdownRemover(DocumentModifier):
    """
    Strips Markdown bold, italic and underline markers from a document and
    replaces Markdown links of the form ``[text](url)`` with the captured url.
    """

    def __init__(self):
        super().__init__()

    def modify_document(self, text: str) -> str:
        """Return ``text`` with the Markdown patterns rewritten line by line."""
        # Order matters: bold (**text**) is handled before italic (*text*).
        ordered_patterns = (
            MARKDOWN_BOLD_REGEX,  # **text**
            MARKDOWN_ITALIC_REGEX,  # *text*
            MARKDOWN_UNDERLINE_REGEX,  # _text_
            MARKDOWN_LINK_REGEX,  # [text](url)
        )

        def _clean(line: str) -> str:
            # Replace each match with its first capture group.
            for pattern in ordered_patterns:
                line = re.sub(pattern, r"\1", line)
            return line

        return "\n".join(_clean(raw_line) for raw_line in text.split("\n"))
38 changes: 38 additions & 0 deletions nemo_curator/modifiers/quotation_remover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_curator.modifiers import DocumentModifier


class QuotationRemover(DocumentModifier):
"""
Removes quotations from a document following a few rules:
- If the document, ignoring leading and trailing whitespace, is 2 characters or fewer, it is returned unchanged.
- If the document starts and ends with a quotation mark and there are
no newlines in the document, the quotation marks are removed.
- If the document starts and ends with a quotation mark and there are
newlines in the document, the quotation marks are removed only if
the first line does not end with a quotation mark.
Comment on lines +24 to +26
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't this lead to a weird result?

"""
'hello my name is bla,
what's your name?'
is your name 'xyz'?
"""
Here the quotations are quoting 'hello my name is bla, \n what's your name?' and xyz

But the removal logic will output quotations such that the quoted phrase is "is your name"

Copy link
Collaborator Author

@ryantwolf ryantwolf Feb 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry the formatting in your example is a bit weird and I'm having trouble understanding your question. Is your example meant to be one document or multiple documents? The ending ? would cause no modifications to be made on this document. From how I am reading it I see:

example="""""hello my name is bla,
what's your name?"
is your name "xyz"?
"""

Please let me know if I have misread this.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad on the formatting, I mistakenly also added the question mark

example=""""hello my name is bla,
what's your name?"
is your name "xyz"
"""

In this example first line first char and last line last char are quotation marks, so the output will be

example="""hello my name is bla,
what's your name?"
is your name "xyz

i.e initially the quoted phrase was "hello my name is bla, \n what's your name?" and "xyz", but once we remove the punctuation as per the algorithm, the output will be "is your name "

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha thanks for clarifying. You're correct that this probably will happen, but I don't think that format of response is common in the Nemotron-CC SDG pipelines this filter was applied in. A lot of these filters have weird data edgecases, but so long as folks selectively apply them and look at their data to see the results it should be fine.

"""

def __init__(self):
    """Takes no arguments; only runs the base-class initializer."""
    super().__init__()

def modify_document(self, text: str) -> str:
    """
    Strip one pair of enclosing double quotes from ``text`` when the rules allow it.

    The text is returned unchanged unless its whitespace-stripped length exceeds 2
    and its first and last characters are both '"'. When that holds, the outer
    quotes are removed if the text has no newline, or if it has a newline and its
    first line does not end with '"'.
    """
    trimmed = text.strip()
    if len(trimmed) <= 2:
        return text
    if not (text.startswith('"') and text.endswith('"')):
        return text

    # text starts with '"', so the first line is never empty here.
    first_line = text.partition("\n")[0]
    if "\n" not in trimmed or not first_line.endswith('"'):
        return text[1:-1]
    return text
85 changes: 85 additions & 0 deletions nemo_curator/modifiers/slicer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union

from nemo_curator.modifiers import DocumentModifier


class Slicer(DocumentModifier):
    """
    Extracts a contiguous slice of a document, bounded by integer indices or
    by marker substrings.
    """

    def __init__(
        self,
        left: Optional[Union[int, str]] = None,
        right: Optional[Union[int, str]] = None,
        include_left: bool = True,
        include_right: bool = True,
        strip: bool = True,
    ):
        """
        Args:
            left (Union[int, str], optional): Start of the slice. An int is used as the
                starting index (inclusive). A str starts the slice at the first
                occurrence of that substring. If None, the slice starts at index 0.
            right (Union[int, str], optional): End of the slice. An int is used as the
                ending index (exclusive). A str ends the slice at the last occurrence
                of that substring. If None, the slice runs to the end of the document.
            include_left (bool): Only used if `left` is a string. If True, the `left`
                substring itself is part of the result. Defaults to True.
            include_right (bool): Only used if `right` is a string. If True, the `right`
                substring itself is part of the result. Defaults to True.
            strip (bool): If True, strip whitespace from the resulting string.
                Defaults to True.
        """
        super().__init__()
        self._left, self._right = left, right
        self._include_left, self._include_right = include_left, include_right
        self._strip = strip

    def modify_document(self, text: str) -> str:
        """
        Return the configured slice of ``text``.

        An empty string is returned when `left` or `right` is a string that does
        not occur in ``text``.
        """
        # Resolve the start of the slice; 0 when `left` is neither int nor str.
        start = 0
        if isinstance(self._left, int):
            start = self._left
        elif isinstance(self._left, str):
            first_hit = text.find(self._left)
            if first_hit == -1:
                return ""
            start = first_hit if self._include_left else first_hit + len(self._left)

        # Resolve the end of the slice; len(text) when `right` is neither int nor str.
        end = len(text)
        if isinstance(self._right, int):
            end = self._right
        elif isinstance(self._right, str):
            last_hit = text.rfind(self._right)
            if last_hit == -1:
                return ""
            end = last_hit + len(self._right) if self._include_right else last_hit

        sliced = text[start:end]
        return sliced.strip() if self._strip else sliced
3 changes: 3 additions & 0 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .exact_dedup import ExactDuplicates
from .meta import Sequential
from .modify import Modify
from .splitter import DocumentSplitter, DocumentJoiner
from .task import TaskDecontamination
from .to_backend import ToBackend

Expand Down Expand Up @@ -92,4 +93,6 @@
"SemDedup",
"BaseModule",
"ToBackend",
"DocumentSplitter",
"DocumentJoiner",
]
Loading