From 632df995104ebdb11b371c563373f5af40aae2e9 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 13 Dec 2024 14:46:42 -0800
Subject: [PATCH 1/3] first commit

Signed-off-by: Sarah Yurick
---
 docs/user-guide/api/download.rst     |  3 +++
 docs/user-guide/download.rst         |  8 +++++--
 nemo_curator/download/__init__.py    |  2 ++
 nemo_curator/download/commoncrawl.py | 31 ++++++++++++++++++++++++++--
 pyproject.toml                       |  1 +
 tests/test_download.py               | 11 +++++++++-
 6 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/docs/user-guide/api/download.rst b/docs/user-guide/api/download.rst
index e4182a587..56c0a90c1 100644
--- a/docs/user-guide/api/download.rst
+++ b/docs/user-guide/api/download.rst
@@ -55,6 +55,9 @@ Common Crawl
 .. autoclass:: nemo_curator.download.ResiliparseExtractor
     :members:
 
+.. autoclass:: nemo_curator.download.TrafilaturaExtractor
+    :members:
+
 ------------------------------
 Wikipedia
 ------------------------------
diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index d4b854e46..2a7b1da76 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -18,7 +18,7 @@ the extraction step to limit the amount of documents that undergo this heavy com
 NeMo Curator provides example utilities for downloading and extracting Common Crawl, ArXiv, and Wikipedia data.
 In addition, it provides a flexible interface to extend the utility to other datasets.
 Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
-and extracting the relevant text with jusText or Resiliparse to output :code:`.jsonl` files.
+and extracting the relevant text with jusText, Resiliparse, or Trafilatura to output :code:`.jsonl` files.
 
 NeMo Curator currently does not provide out-of-the-box support for web-crawling or web-scraping.
 It provides utilities for downloading and extracting data from the preexisting online sources given above.
@@ -53,11 +53,15 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
 
         from nemo_curator.download import (
             ResiliparseExtractor,
+            TrafilaturaExtractor,
             download_common_crawl,
         )
 
         # Change the extraction algorithm
         extraction_algorithm = ResiliparseExtractor()
+        # Alternatively
+        # extraction_algorithm = TrafilaturaExtractor()
+
         common_crawl = download_common_crawl(
             "/extracted/output/folder",
             "2020-50",
@@ -74,7 +78,7 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
 
     1. Decode the HTML within the record from binary to text.
     2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
-    3. Finally, the extract the relevant text with `jusText `_ or `Resiliparse `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
+    3. Finally, extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
 * ``download_wikipedia`` will download and extract the latest wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per-ip address.
  .. code-block:: python

diff --git a/nemo_curator/download/__init__.py b/nemo_curator/download/__init__.py
index cfa1811e4..1de9a080e 100644
--- a/nemo_curator/download/__init__.py
+++ b/nemo_curator/download/__init__.py
@@ -20,6 +20,7 @@
     CommonCrawlWARCIterator,
     JusTextExtractor,
     ResiliparseExtractor,
+    TrafilaturaExtractor,
     download_common_crawl,
 )
 from .doc_builder import (
@@ -54,6 +55,7 @@
     "CommonCrawlWARCDownloaderExtractOnly",
     "JusTextExtractor",
     "ResiliparseExtractor",
+    "TrafilaturaExtractor",
     "download_wikipedia",
     "WikipediaDownloader",
     "WikipediaIterator",
diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index 53deffd91..f76587eee 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -199,6 +199,33 @@ def extract_text(self, html, stop_words):
 
         return result
 
 
+class TrafilaturaExtractor(HTMLExtractorAlgorithm):
+    def __init__(
+        self,
+        required_stopword_density=0.32,
+        main_content=True,
+        alt_texts=False,
+    ):
+        """
+        Initialize the Trafilatura text extraction algorithm with specified parameters.
+
+        Args:
+            required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
+                Studies on stopword lists and their distribution in various text corpora often
+                suggest that around 30-40% of a typical English text consists of stopwords.
+            main_content: Whether to apply simple heuristics for extracting only "main-content" elements.
+            alt_texts: Whether to preserve alternative text descriptions (e.g., for images).
+
+        """
+        self.required_stopword_density = required_stopword_density
+        self.main_content = main_content
+        self.alt_texts = alt_texts
+
+    def extract_text(self, html, stop_words):
+        # TODO
+        return html
+
+
 def get_stop_list_dict(languages=[]):
 
     # Name mapping for language names from CLD2 (values)
@@ -372,7 +399,7 @@ def download_common_crawl(
     url_limit=None,
 ) -> DocumentDataset:
     """
-    Downloads Common Crawl WARC snapshots and extracts them using jusText or Resiliparse
+    Downloads Common Crawl WARC snapshots and extracts them using jusText, Resiliparse, or Trafilatura
 
     Args:
       output_path: The path to the root directory of the files
@@ -382,7 +409,7 @@ def download_common_crawl(
       end_snapshot: The last common crawl snapshot to include. Must be chronologically
         after the starting snapshot.
      output_type: The file type to save the data as.
-      algorithm: A JusTextExtractor or ResiliparseExtractor object.
+      algorithm: A JusTextExtractor, ResiliparseExtractor, or TrafilaturaExtractor object.
      news: If True, gets WARC URLs for the CC-NEWS dataset instead of the CC-MAIN datasets.
        Also assumes that the format for the start and end snapshots is 'YYYY-MM' (Year-Month).
      aws: Whether to download from Common Crawl's S3 bucket. If True, uses s5cmd to download.
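
For orientation, the class added above is meant to be passed to download_common_crawl in the same way as the existing extractors. A minimal usage sketch, assuming the import path introduced in this patch; the output folder and snapshot range are placeholder values, and output_type="jsonl" follows the existing documentation examples rather than anything in this diff:

    from nemo_curator.download import TrafilaturaExtractor, download_common_crawl

    # Select the Trafilatura-based extractor (0.32 is the default stopword density).
    extraction_algorithm = TrafilaturaExtractor(required_stopword_density=0.32)

    common_crawl = download_common_crawl(
        "/extracted/output/folder",  # placeholder output path
        "2020-50",                   # starting snapshot, as in the docs example
        "2021-04",                   # placeholder ending snapshot
        output_type="jsonl",
        algorithm=extraction_algorithm,
    )
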
diff --git a/pyproject.toml b/pyproject.toml
index a12f3ef08..4ab3d3640 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
   "resiliparse",
   "sentencepiece",
   "spacy>=3.6.0, <3.8.0",
+  "trafilatura",
   "unidic-lite==1.0.8",
   "usaddress==0.5.10",
   "warcio==1.7.4",
diff --git a/tests/test_download.py b/tests/test_download.py
index e2a69cb1a..cd903b969 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -2,7 +2,11 @@
 
 import pytest
 
-from nemo_curator.download import ResiliparseExtractor, download_and_extract
+from nemo_curator.download import (
+    ResiliparseExtractor,
+    TrafilaturaExtractor,
+    download_and_extract,
+)
 from nemo_curator.download.commoncrawl import (
     CommonCrawlWARCDownloader,
     CommonCrawlWARCExtractor,
@@ -17,6 +21,7 @@ def test_imports(self):
         from nemo_curator.download import (
             JusTextExtractor,
             ResiliparseExtractor,
+            TrafilaturaExtractor,
             download_arxiv,
             download_common_crawl,
             download_wikipedia,
@@ -82,6 +87,10 @@ def test_resiliparse_extract_text(self):
 
         assert result == expected
 
+    def test_trafilatura_extract_text(self):
+        # TODO
+        pass
+
     def test_common_crawl_urls(self):
         start_snapshot = "2021-04"
         end_snapshot = "2021-10"

From 51b11450e74bf5513e8eeba1d6903eff223bfb59 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 13 Dec 2024 15:44:33 -0800
Subject: [PATCH 2/3] add implementation and pytest

Signed-off-by: Sarah Yurick
---
 nemo_curator/download/commoncrawl.py |  35 +++++++--
 tests/test_download.py               | 113 +++++++++++++++------------
 2 files changed, 89 insertions(+), 59 deletions(-)

diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index f76587eee..7ef628924 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -24,6 +24,7 @@
 import pycld2 as cld2
 from charset_normalizer import detect
 from resiliparse.extract.html2text import extract_plain_text
+from trafilatura import extract as extract_with_trafilatura
 from warcio.archiveiterator import ArchiveIterator
 
 from nemo_curator.datasets import DocumentDataset
@@ -203,8 +204,7 @@ class TrafilaturaExtractor(HTMLExtractorAlgorithm):
     def __init__(
         self,
         required_stopword_density=0.32,
-        main_content=True,
-        alt_texts=False,
+        **extract_kwargs,
     ):
         """
         Initialize the Trafilatura text extraction algorithm with specified parameters.
@@ -213,17 +213,36 @@ def __init__(
             required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
                 Studies on stopword lists and their distribution in various text corpora often
                 suggest that around 30-40% of a typical English text consists of stopwords.
-            main_content: Whether to apply simple heuristics for extracting only "main-content" elements.
-            alt_texts: Whether to preserve alternative text descriptions (e.g., for images).
+            extract_kwargs: Additional keyword arguments for the Trafilatura extract function.
+                See the API documentation https://trafilatura.readthedocs.io/en/latest/usage-python.html#choice-of-html-elements
+                for a list of possible parameters.
""" self.required_stopword_density = required_stopword_density - self.main_content = main_content - self.alt_texts = alt_texts + self.extract_kwargs = extract_kwargs def extract_text(self, html, stop_words): - # TODO - return html + text = extract_with_trafilatura(html, **self.extract_kwargs) + + if text is not None: + paragraphs = list(filter(None, text.split("\n"))) + result = [] + for paragraph in paragraphs: + words = paragraph.split() + length = len(words) + if length == 0: + continue + stopwords = [word for word in words if word in stop_words] + stopword_density = len(stopwords) / length + + if stopword_density >= self.required_stopword_density: + result.append(paragraph) + else: + return None + + if len(result) == 0: + return None + return result def get_stop_list_dict(languages=[]): diff --git a/tests/test_download.py b/tests/test_download.py index cd903b969..2a98c8041 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -16,6 +16,56 @@ ) +@pytest.fixture +def html_string(): + # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py + html = """ + + My Title + + + + +
+ +
+ This is a sample paragraph. In it we write words. + These are stopwords: because did than has near we almost while what still. + + +

+ This paragraph doesn't have many stopwords. Remove it. +
Let's keep this paragraph: either came does last new took taken making became from. +

+ + + + + + Some image + +
+ + +
+ Copyright (C) 2021 Foo Bar +
+
+
+    """
+    return html
+
+
 class TestDownload:
     def test_imports(self):
         from nemo_curator.download import (
@@ -29,56 +79,10 @@ def test_imports(self):
 
         assert True
 
-    def test_resiliparse_extract_text(self):
-        # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py
-        html = """ - - My Title - - - - -
- -
- This is a sample paragraph. In it we write words. - These are stopwords: because did than has near we almost while what still. - - -

- This paragraph doesn't have many stopwords. Remove it. -
Let's keep this paragraph: either came does last new took taken making became from. -

- - - - - - Some image - -
- - -
- Copyright (C) 2021 Foo Bar -
-
-
-        """
-
+    def test_resiliparse_extract_text(self, html_string):
         algorithm = ResiliparseExtractor()
         stop_words = get_stop_list_dict()
-        result = algorithm.extract_text(html, stop_words["ENGLISH"])
+        result = algorithm.extract_text(html_string, stop_words["ENGLISH"])
 
         expected = [
             "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.",
@@ -87,9 +91,16 @@ def test_resiliparse_extract_text(self):
 
         assert result == expected
 
-    def test_trafilatura_extract_text(self):
-        # TODO
-        pass
+    def test_trafilatura_extract_text(self, html_string):
+        algorithm = TrafilaturaExtractor()
+        stop_words = get_stop_list_dict()
+        result = algorithm.extract_text(html_string, stop_words["ENGLISH"])
+
+        expected = [
+            "Let's keep this paragraph: either came does last new took taken making became from.",
+        ]
+
+        assert result == expected
 
     def test_common_crawl_urls(self):
         start_snapshot = "2021-04"

From db8c5b2c90866247e6c3ba0f09bbd0f9b858a37f Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Mon, 23 Dec 2024 10:01:15 -0800
Subject: [PATCH 3/3] allow editing trafilatura config params

Signed-off-by: Sarah Yurick
---
 docs/user-guide/download.rst         |  2 +-
 nemo_curator/download/commoncrawl.py | 51 +++++++++++++++++++++++++++-
 tests/test_download.py               |  7 +++-
 3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index 2a7b1da76..465e2e4f5 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -70,7 +70,7 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
             algorithm=extraction_algorithm,
         )
 
-    Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
+    Above, we changed the extraction algorithm from the default ``JusTextExtractor``. **Note:** Please see the Trafilatura documentation `here `_ and `here `_ for more information about custom Trafilatura extraction parameters.
 
     The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
 
diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index 7ef628924..cfb5c58a0 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -17,6 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from urllib.parse import urlparse
 
 import justext
@@ -25,6 +26,7 @@
 from charset_normalizer import detect
 from resiliparse.extract.html2text import extract_plain_text
 from trafilatura import extract as extract_with_trafilatura
+from trafilatura.settings import DEFAULT_CONFIG as TRAFILATURA_DEFAULT_CONFIG
 from warcio.archiveiterator import ArchiveIterator
 
 from nemo_curator.datasets import DocumentDataset
@@ -204,6 +206,13 @@ class TrafilaturaExtractor(HTMLExtractorAlgorithm):
     def __init__(
         self,
         required_stopword_density=0.32,
+        min_extracted_size=250,
+        min_extracted_comm_size=1,
+        min_output_size=1,
+        min_output_comm_size=1,
+        max_tree_size=None,
+        min_duplcheck_size=100,
+        max_repetitions=2,
         **extract_kwargs,
     ):
         """
         Initialize the Trafilatura text extraction algorithm with specified parameters.
 
         Args:
             required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
                 Studies on stopword lists and their distribution in various text corpora often
                 suggest that around 30-40% of a typical English text consists of stopwords.
+            min_extracted_size: Acceptable size in characters (used to trigger fallbacks).
+                Defaults to 250. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_extracted_comm_size: Works the same as min_extracted_size, but for comment extraction.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_output_size: Absolute acceptable minimum for main text output.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_output_comm_size: Works the same as min_output_size, but for comment extraction.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            max_tree_size: Used to discard documents with too many elements. Defaults to None.
+            min_duplcheck_size: Minimum size in characters to run deduplication on.
+                Defaults to 100. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            max_repetitions: Maximum number of duplicates allowed.
+                Defaults to 2. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
             extract_kwargs: Additional keyword arguments for the Trafilatura extract function.
                 See the API documentation https://trafilatura.readthedocs.io/en/latest/usage-python.html#choice-of-html-elements
                 for a list of possible parameters.
 
         """
         self.required_stopword_density = required_stopword_density
+        self.min_extracted_size = min_extracted_size
+        self.min_extracted_comm_size = min_extracted_comm_size
+        self.min_output_size = min_output_size
+        self.min_output_comm_size = min_output_comm_size
+        self.max_tree_size = max_tree_size
+        self.min_duplcheck_size = min_duplcheck_size
+        self.max_repetitions = max_repetitions
         self.extract_kwargs = extract_kwargs
 
     def extract_text(self, html, stop_words):
-        text = extract_with_trafilatura(html, **self.extract_kwargs)
+        trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG)
+        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str(
+            self.min_extracted_size
+        )
+        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_COMM_SIZE"] = str(
+            self.min_extracted_comm_size
+        )
+        trafilatura_config["DEFAULT"]["MIN_OUTPUT_SIZE"] = str(self.min_output_size)
+        trafilatura_config["DEFAULT"]["MIN_OUTPUT_COMM_SIZE"] = str(
+            self.min_output_comm_size
+        )
+        if self.max_tree_size:
+            trafilatura_config["DEFAULT"]["MAX_TREE_SIZE"] = str(self.max_tree_size)
+        trafilatura_config["DEFAULT"]["MIN_DUPLCHECK_SIZE"] = str(
+            self.min_duplcheck_size
+        )
+        trafilatura_config["DEFAULT"]["MAX_REPETITIONS"] = str(self.max_repetitions)
+
+        text = extract_with_trafilatura(
+            html, config=trafilatura_config, **self.extract_kwargs
+        )
 
         if text is not None:
             paragraphs = list(filter(None, text.split("\n")))
diff --git a/tests/test_download.py b/tests/test_download.py
index 2a98c8041..b4dd1be53 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -92,7 +92,12 @@ def test_resiliparse_extract_text(self, html_string):
 
         assert result == expected
 
     def test_trafilatura_extract_text(self, html_string):
-        algorithm = TrafilaturaExtractor()
+        algorithm = TrafilaturaExtractor(
+            min_extracted_size=10,
+            min_duplcheck_size=10,
+            max_repetitions=1,
+            deduplicate=True,
+        )
         stop_words = get_stop_list_dict()
         result = algorithm.extract_text(html_string, stop_words["ENGLISH"])
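
Taken together, the three commits let callers tune both the stopword-density filter and Trafilatura's own settings. A minimal sketch of using the extractor directly, mirroring the final test; the HTML literal here is a stand-in and the import paths are assumed from this patch's test file:

    from nemo_curator.download import TrafilaturaExtractor
    from nemo_curator.download.commoncrawl import get_stop_list_dict

    # Override Trafilatura's size and deduplication settings; deduplicate=True is
    # forwarded to trafilatura.extract through **extract_kwargs.
    algorithm = TrafilaturaExtractor(
        min_extracted_size=10,
        min_duplcheck_size=10,
        max_repetitions=1,
        deduplicate=True,
    )

    stop_words = get_stop_list_dict()
    html = "<html><body><p>These are stopwords: because did than has near we almost while what still.</p></body></html>"
    paragraphs = algorithm.extract_text(html, stop_words["ENGLISH"])
    # extract_text returns a list of paragraphs that meet the required stopword
    # density, or None if Trafilatura extracts nothing or every paragraph is filtered out.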