fix: Format markdown to HTML to avoid Chainlit reformatting it (#90)

navapbc · Oct 8, 2024 · c627043 · c627043
1 parent c3ff81f
commit c627043
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 8 deletions.
diff --git a/app/poetry.lock b/app/poetry.lock
diff --git a/app/pyproject.toml b/app/pyproject.toml
@@ -29,6 +29,8 @@ pdfminer-six = "^20240706"
 
 unstructured = {extras = ["pdf", "docx", "pptx"], version = "^0.15.8"}
 nltk = "^3.9.1"
+markdown = "^3.7"
+
 [tool.poetry.group.dev.dependencies]
 black = "^24.8.0"
 flake8 = "^6.1.0"
@@ -38,6 +40,7 @@ isort = "^5.12.0"
 mypy = "^1.5.1"
 moto = {extras = ["s3"], version = "^4.0.2"}
 types-pytz = "^2023.3.1.1"
+types-markdown = "^3.7.0.20240822"
 coverage = "^7.3.2"
 Faker = "^19.8.0"
 factory-boy = "^3.3.0"

diff --git a/app/src/format.py b/app/src/format.py
@@ -3,6 +3,8 @@
 import re
 from typing import OrderedDict, Sequence
 
+import markdown
+
 from src.citations import dereference_citations, reify_citations_with_scores, split_into_subsections
 from src.db.models.document import Chunk, ChunkWithScore, Document
 from src.util.bem_util import get_bem_url, replace_bem_with_link
@@ -66,6 +68,12 @@ def _get_bem_documents_to_show(
     return documents
 
 
+def to_html(text: str) -> str:
+    """Convert Markdown text to HTML using ``markdown.markdown``.
+
+    The Markdown parser only recognizes a bullet list when it is separated
+    from the preceding text by a blank line, so a blank line is inserted
+    before every ``- `` list that directly follows a line of ordinary text
+    (every such list in the text, not just the first one).
+
+    Args:
+        text: Markdown-formatted text.
+
+    Returns:
+        The HTML rendering of the corrected text.
+    """
+    # Match a non-blank line that is not itself a list item and is immediately
+    # followed by a "- " item, then put a blank line between the two.
+    # Consecutive items and items already preceded by a blank line are left
+    # untouched, so the items of one list are never split apart.
+    corrected_text = re.sub(
+        r"^(?![ \t]*(?:[-*+]|\d+\.) )(.*\S.*)\n(?=- )",
+        r"\1\n\n",
+        text,
+        flags=re.MULTILINE,
+    )
+    return markdown.markdown(corrected_text)
+
+
 def format_bem_subsections(
     chunks_shown_max_num: int,
     chunks_shown_min_score: float,
@@ -74,7 +82,7 @@ def format_bem_subsections(
 ) -> str:
     global _accordion_id
 
-    response_with_citations = reify_citations_with_scores(raw_response, chunks_with_scores)
+    response_with_citations = to_html(reify_citations_with_scores(raw_response, chunks_with_scores))
 
     chunks = [c.chunk for c in chunks_with_scores]
     context = split_into_subsections(chunks)
@@ -86,7 +94,7 @@ def format_bem_subsections(
         chunk = citation.chunk
         subsection = citation.subsection
 
-        formatted_subsection = replace_bem_with_link(subsection)
+        formatted_subsection = to_html(replace_bem_with_link(subsection))
         bem_url_for_page = get_bem_url(chunk.document.name)
         if chunk.page_number:
             bem_url_for_page += "#page=" + str(chunk.page_number)
@@ -119,8 +127,14 @@ def format_bem_subsections(
     # This heading is important to prevent Chainlit from embedding citations_html
     # as the next part of a list in response_with_citations
     if citations_html:
-        return response_with_citations + "<h3>Source(s)</h3>" + citations_html
-    return response_with_citations
+        return (
+            "<div>"
+            + response_with_citations
+            + "</div><h3>Source(s)</h3><div>"
+            + citations_html
+            + "</div>"
+        )
+    return "<div>" + response_with_citations + "</div>"
 
 
 def format_bem_documents(

diff --git a/app/tests/src/test_format.py b/app/tests/src/test_format.py
@@ -141,10 +141,15 @@ def test__add_ellipses():
 
 
 def test_format_bem_subsections(chunks_with_scores):
-    assert format_bem_subsections(0, 0, chunks_with_scores, "") == ""
+    assert format_bem_subsections(0, 0, chunks_with_scores, "") == "<div></div>"
     assert (
         format_bem_subsections(0, 0, [], "Non-existant citation: (citation-0)")
-        == "Non-existant citation: (citation-0)"
+        == "<div><p>Non-existant citation: (citation-0)</p></div>"
+    )
+
+    assert (
+        format_bem_subsections(0, 0, [], "List intro sentence: \n- item 1\n- item 2")
+        == "<div><p>List intro sentence: </p>\n<ul>\n<li>item 1</li>\n<li>item 2</li>\n</ul></div>"
     )
 
     chunks_with_scores[0].chunk.document.name = "BEM 100: Intro"