NVIDIA · ryantwolf · Feb 12, 2025 · Feb 6, 2025 · Feb 6, 2025 · Feb 6, 2025
diff --git a/docs/user-guide/api/filters.rst b/docs/user-guide/api/filters.rst
@@ -152,6 +152,14 @@ Heuristic Filters
     :members:
     :member-order: bysource
 
+.. autoclass:: nemo_curator.filters.TokenCountFilter
+    :members:
+    :member-order: bysource
+
+.. autoclass:: nemo_curator.filters.SubstringFilter
+    :members:
+    :member-order: bysource
+
 ------------------------------
 Code Filters
 ------------------------------

diff --git a/docs/user-guide/api/misc.rst b/docs/user-guide/api/misc.rst
@@ -15,3 +15,9 @@ Miscellaneous
 
 .. autoclass:: nemo_curator.Shuffle
     :members:
+
+.. autoclass:: nemo_curator.DocumentSplitter
+    :members:
+
+.. autoclass:: nemo_curator.DocumentJoiner
+    :members:
diff --git a/docs/user-guide/api/modifiers.rst b/docs/user-guide/api/modifiers.rst
@@ -32,3 +32,22 @@ Modifiers
 
 .. autoclass:: nemo_curator.modifiers.PiiModifier
     :members:
+
+.. autoclass:: nemo_curator.modifiers.LineRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.MarkdownRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.NewlineNormalizer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.UrlRemover
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.Slicer
+    :members:
+
+.. autoclass:: nemo_curator.modifiers.QuotationRemover
+    :members:
+
diff --git a/nemo_curator/filters/__init__.py b/nemo_curator/filters/__init__.py
@@ -49,7 +49,9 @@
     RepeatedParagraphsFilter,
     RepeatingDuplicateNGramsFilter,
     RepeatingTopNGramsFilter,
+    SubstringFilter,
     SymbolsToWordsFilter,
+    TokenCountFilter,
     UrlsFilter,
     WhiteSpaceFilter,
     WordCountFilter,
@@ -98,4 +100,6 @@
     "QualityEstimationFilter",
     "AnswerabilityFilter",
     "EasinessFilter",
+    "TokenCountFilter",
+    "SubstringFilter",
 ]
diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py
@@ -14,9 +14,11 @@
 
 import os.path
 import tarfile
+from typing import Literal
 
 import requests
 from platformdirs import user_cache_dir
+from transformers import AutoTokenizer
 
 from nemo_curator.filters.bitext_filter import BitextFilter
 from nemo_curator.filters.doc_filter import DocumentFilter, import_filter
@@ -671,6 +673,66 @@ def keep_document(self, score):
         return score != 1
 
 
+class TokenCountFilter(DocumentFilter):
+    """
+    Keeps only documents whose token count lies within [min_tokens, max_tokens].
+    """
+
+    def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf")):
+        """
+        Args:
+            tokenizer (AutoTokenizer): Tokenizer whose ``encode`` output is counted.
+            min_tokens (int): Smallest token count (inclusive) a kept document may have.
+                The default of 0 turns the lower bound off.
+            max_tokens (int): Largest token count (inclusive) a kept document may have.
+                The default of infinity turns the upper bound off.
+        """
+        super().__init__()
+        self._tokenizer = tokenizer
+        self._min_tokens, self._max_tokens = min_tokens, max_tokens
+        self._name = "token_count"
+
+    def score_document(self, text: str) -> int:
+        # The score is the length of whatever sequence ``encode`` returns.
+        return len(self._tokenizer.encode(text))
+
+    def keep_document(self, score: int) -> bool:
+        meets_minimum = self._min_tokens <= score
+        return meets_minimum and score <= self._max_tokens
+
+
+class SubstringFilter(DocumentFilter):
+    """
+    Keeps documents scoring 1: substring found at the configured position (else 0).
+    """
+
+    def __init__(self, substring: str, position: Literal["prefix", "suffix", "any"]):
+        """
+        Args:
+            substring (str): The text to look for in each document.
+            position (Literal["prefix", "suffix", "any"]): Where the text must occur.
+        """
+        super().__init__()
+        self._substring = substring
+        if position not in ("prefix", "suffix", "any"):
+            raise ValueError(
+                f"Invalid position: {position}. Must be one of: prefix, suffix, any."
+            )
+        self._position = position
+
+    def score_document(self, text: str) -> int:
+        where, needle = self._position, self._substring
+        if where == "prefix":
+            return int(text.startswith(needle))
+        if where == "suffix":
+            return int(text.endswith(needle))
+        if where == "any":
+            return int(needle in text)
+
+    def keep_document(self, score: int) -> bool:
+        return score == 1
+
+
 class HistogramFilter(DocumentFilter):
     """Histogram filter used by the NLLB paper (https://arxiv.org/pdf/2207.04672). See p30 for details.
 

diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py
@@ -15,8 +15,12 @@
 from .c4 import BoilerPlateStringModifier
 from .doc_modifier import DocumentModifier
 from .fasttext import FastTextLabelModifier
+from .line_remover import LineRemover
+from .markdown_remover import MarkdownRemover
 from .newline_normalizer import NewlineNormalizer
 from .pii_modifier import PiiModifier
+from .quotation_remover import QuotationRemover
+from .slicer import Slicer
 from .unicode_reformatter import UnicodeReformatter
 from .url_remover import UrlRemover
 
@@ -25,7 +29,11 @@
     "BoilerPlateStringModifier",
     "FastTextLabelModifier",
     "UnicodeReformatter",
+    "QuotationRemover",
+    "LineRemover",
+    "MarkdownRemover",
     "PiiModifier",
     "NewlineNormalizer",
     "UrlRemover",
+    "Slicer",
 ]
diff --git a/nemo_curator/modifiers/line_remover.py b/nemo_curator/modifiers/line_remover.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+from nemo_curator.modifiers import DocumentModifier
+
+
+class LineRemover(DocumentModifier):
+    """
+    Drops every line of a document whose full content is one of the given strings.
+    """
+
+    def __init__(self, patterns: List[str]):
+        """
+        Args:
+            patterns (List[str]): Lines to drop; a line is dropped when it is in this list.
+        """
+        super().__init__()
+        self._patterns = patterns
+
+    def modify_document(self, text: str) -> str:
+        # Split on "\n" only, filter by membership in the patterns, then re-join.
+        kept = (row for row in text.split("\n") if row not in self._patterns)
+        return "\n".join(kept)
diff --git a/nemo_curator/modifiers/markdown_remover.py b/nemo_curator/modifiers/markdown_remover.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+from nemo_curator.modifiers import DocumentModifier
+
+MARKDOWN_BOLD_REGEX = r"\*\*(.*?)\*\*"  # **text**; group 1 is the enclosed text
+MARKDOWN_ITALIC_REGEX = r"\*(.*?)\*"  # *text*; group 1 is the enclosed text
+MARKDOWN_UNDERLINE_REGEX = r"_(.*?)_"  # _text_; group 1 is the enclosed text
+MARKDOWN_LINK_REGEX = r"\[.*?\]\((.*?)\)"  # [text](url); group 1 is the url
+
+
+class MarkdownRemover(DocumentModifier):
+    """
+    Strips bold, italic and underline markers; replaces [text](url) links with the url.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def modify_document(self, text: str) -> str:
+        # Applied per line, in order: **text**, *text*, _text_, then [text](url).
+        patterns = (MARKDOWN_BOLD_REGEX, MARKDOWN_ITALIC_REGEX)
+        patterns += (MARKDOWN_UNDERLINE_REGEX, MARKDOWN_LINK_REGEX)
+        cleaned = []
+        for row in text.split("\n"):
+            for pattern in patterns:
+                row = re.sub(pattern, r"\1", row)
+            cleaned.append(row)
+
+        return "\n".join(cleaned)
diff --git a/nemo_curator/modifiers/quotation_remover.py b/nemo_curator/modifiers/quotation_remover.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_curator.modifiers import DocumentModifier
+
+
+class QuotationRemover(DocumentModifier):
+    """
+    Strips one pair of enclosing double quotes from a document. Rules:
+    - A document whose whitespace-stripped length is 2 or less is returned unchanged.
+    - Otherwise nothing is removed unless the first and last characters
+        are both double quotation marks.
+    - A single-line document then loses its first and last character.
+    - A multi-line document loses them only when its first line does not
+        end with a quotation mark.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def modify_document(self, text: str) -> str:
+        wrapped = len(text.strip()) > 2 and text[0] == '"' and text[-1] == '"'
+        if not wrapped:
+            return text
+        # With a quote at each end strip() is a no-op, so `text` is tested directly.
+        opening_line = text.split("\n")[0]
+        return text[1:-1] if "\n" not in text or opening_line[-1] != '"' else text
diff --git a/nemo_curator/modifiers/slicer.py b/nemo_curator/modifiers/slicer.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+from nemo_curator.modifiers import DocumentModifier
+
+
+class Slicer(DocumentModifier):
+    """
+    Slices a document based on indices or strings.
+
+    Integer bounds are used directly as Python slice indices. String bounds are
+    searched for in the document; if a string bound is not found, the result is
+    the empty string.
+    """
+
+    def __init__(
+        self,
+        left: Optional[Union[int, str]] = None,
+        right: Optional[Union[int, str]] = None,
+        include_left: bool = True,
+        include_right: bool = True,
+        strip: bool = True,
+    ):
+        """
+        Args:
+            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive).
+                If the provided value is a str, slice the string from the first occurrence of this substring.
+                If None, slice from the start of the string.
+            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive).
+                If the provided value is a str, slice the string to the last occurrence of this substring.
+                If None, slice to the end of the string.
+            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the
+                slicing result. Defaults to True.
+            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the
+                slicing result. Defaults to True.
+            strip (bool): If True, strip the resulting string.
+        """
+        super().__init__()
+        self._left = left
+        self._right = right
+        self._include_left = include_left
+        self._include_right = include_right
+        self._strip = strip
+
+    def modify_document(self, text: str) -> str:
+        """Return the slice of ``text`` selected by the configured bounds."""
+        # Resolve the start index; a string bound missing from `text` yields "".
+        left, right = self._left, self._right
+        if isinstance(left, int):
+            start = left
+        elif isinstance(left, str):
+            found_at = text.find(left)  # first occurrence
+            if found_at == -1:
+                return ""
+            start = found_at if self._include_left else found_at + len(left)
+        else:
+            start = 0  # neither int nor str: begin at the start of the text
+
+        # Resolve the end index the same way, searching from the right.
+        if isinstance(right, int):
+            stop = right
+        elif isinstance(right, str):
+            found_at = text.rfind(right)  # last occurrence
+            if found_at == -1:
+                return ""
+            stop = found_at + len(right) if self._include_right else found_at
+        else:
+            stop = len(text)  # neither int nor str: run to the end of the text
+
+        result = text[start:stop]
+        if self._strip:
+            result = result.strip()
+        return result
diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py
@@ -28,6 +28,7 @@
 from .exact_dedup import ExactDuplicates
 from .meta import Sequential
 from .modify import Modify
+from .splitter import DocumentSplitter, DocumentJoiner
 from .task import TaskDecontamination
 from .to_backend import ToBackend
 
@@ -92,4 +93,6 @@
     "SemDedup",
     "BaseModule",
     "ToBackend",
+    "DocumentSplitter",
+    "DocumentJoiner",
 ]