Skip to content

Commit

Permalink
fix: Format markdown to HTML to avoid Chainlit reformatting it (#90)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoomlam authored Oct 8, 2024
1 parent c3ff81f commit c627043
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 8 deletions.
30 changes: 28 additions & 2 deletions app/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions app/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pdfminer-six = "^20240706"

unstructured = {extras = ["pdf", "docx", "pptx"], version = "^0.15.8"}
nltk = "^3.9.1"
markdown = "^3.7"

[tool.poetry.group.dev.dependencies]
black = "^24.8.0"
flake8 = "^6.1.0"
Expand All @@ -38,6 +40,7 @@ isort = "^5.12.0"
mypy = "^1.5.1"
moto = {extras = ["s3"], version = "^4.0.2"}
types-pytz = "^2023.3.1.1"
types-markdown = "^3.7.0.20240822"
coverage = "^7.3.2"
Faker = "^19.8.0"
factory-boy = "^3.3.0"
Expand Down
22 changes: 18 additions & 4 deletions app/src/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import re
from typing import OrderedDict, Sequence

import markdown

from src.citations import dereference_citations, reify_citations_with_scores, split_into_subsections
from src.db.models.document import Chunk, ChunkWithScore, Document
from src.util.bem_util import get_bem_url, replace_bem_with_link
Expand Down Expand Up @@ -66,6 +68,12 @@ def _get_bem_documents_to_show(
return documents


def _separate_lists_from_paragraphs(text: str) -> str:
    """Insert a blank line before every "- " list that directly follows other text.

    Lines that already belong to a list (further items, indented or lazy
    continuation lines) are left untouched so existing lists keep their shape.
    """
    fixed_lines = []
    in_list = False
    prev_blank = True  # the start of the text behaves like a preceding blank line
    for line in text.split("\n"):
        is_blank = not line.strip()
        if line.startswith("- "):
            if not (in_list or prev_blank):
                # List item glued to the preceding paragraph: separate them
                fixed_lines.append("")
            in_list = True
        elif not is_blank and prev_blank and not line[0].isspace():
            # An unindented line after a blank line starts a new paragraph,
            # which ends any list that was open
            in_list = False
        fixed_lines.append(line)
        prev_blank = is_blank
    return "\n".join(fixed_lines)


def to_html(text: str) -> str:
    """Convert markdown-formatted `text` to an HTML string.

    markdown expects '\\n' (a blank line) before the start of a list; without it
    the list items are folded into the preceding paragraph. The blank line is
    added before every list in the text, not just the first one, so responses
    containing several lists are rendered correctly.
    """
    return markdown.markdown(_separate_lists_from_paragraphs(text))


def format_bem_subsections(
chunks_shown_max_num: int,
chunks_shown_min_score: float,
Expand All @@ -74,7 +82,7 @@ def format_bem_subsections(
) -> str:
global _accordion_id

response_with_citations = reify_citations_with_scores(raw_response, chunks_with_scores)
response_with_citations = to_html(reify_citations_with_scores(raw_response, chunks_with_scores))

chunks = [c.chunk for c in chunks_with_scores]
context = split_into_subsections(chunks)
Expand All @@ -86,7 +94,7 @@ def format_bem_subsections(
chunk = citation.chunk
subsection = citation.subsection

formatted_subsection = replace_bem_with_link(subsection)
formatted_subsection = to_html(replace_bem_with_link(subsection))
bem_url_for_page = get_bem_url(chunk.document.name)
if chunk.page_number:
bem_url_for_page += "#page=" + str(chunk.page_number)
Expand Down Expand Up @@ -119,8 +127,14 @@ def format_bem_subsections(
# This heading is important to prevent Chainlit from embedding citations_html
# as the next part of a list in response_with_citations
if citations_html:
return response_with_citations + "<h3>Source(s)</h3>" + citations_html
return response_with_citations
return (
"<div>"
+ response_with_citations
+ "</div><h3>Source(s)</h3><div>"
+ citations_html
+ "</div>"
)
return "<div>" + response_with_citations + "</div>"


def format_bem_documents(
Expand Down
9 changes: 7 additions & 2 deletions app/tests/src/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,10 +141,15 @@ def test__add_ellipses():


def test_format_bem_subsections(chunks_with_scores):
assert format_bem_subsections(0, 0, chunks_with_scores, "") == ""
assert format_bem_subsections(0, 0, chunks_with_scores, "") == "<div></div>"
assert (
format_bem_subsections(0, 0, [], "Non-existant citation: (citation-0)")
== "Non-existant citation: (citation-0)"
== "<div><p>Non-existant citation: (citation-0)</p></div>"
)

assert (
format_bem_subsections(0, 0, [], "List intro sentence: \n- item 1\n- item 2")
== "<div><p>List intro sentence: </p>\n<ul>\n<li>item 1</li>\n<li>item 2</li>\n</ul></div>"
)

chunks_with_scores[0].chunk.document.name = "BEM 100: Intro"
Expand Down

0 comments on commit c627043

Please sign in to comment.