forked from Unstructured-IO/unstructured
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
build: wolfi base image for Dockerfile (Unstructured-IO#3016)
### Summary Updates the `Dockerfile` to use the Chainguard `wolfi-base` image to reduce CVEs. Also adds a step in the docker publish job that scans the images and checks for CVEs before publishing. The job will fail if there are high or critical vulnerabilities. ### Testing Run `make docker-run-dev` and then `python3.11` once you're in. At that point, you can try: ```python from unstructured.partition.auto import partition elements = partition(filename="example-docs/DA-1p.pdf", skip_infer_table_types=["pdf"]) elements ``` Stop the container once you're done.
- Loading branch information
1 parent
094e354
commit 612905e
Showing
10 changed files
with
104 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -204,3 +204,6 @@ examples/**/output/ | |
|
||
outputdiff.txt | ||
metricsdiff.txt | ||
|
||
# APK packages for the docker build | ||
docker-packages/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,55 @@ | ||
# syntax=docker/dockerfile:experimental | ||
FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base | ||
FROM cgr.dev/chainguard/wolfi-base:latest | ||
|
||
# NOTE(crag): NB_USER ARG for mybinder.org compat: | ||
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html | ||
ARG NB_USER=notebook-user | ||
ARG NB_UID=1000 | ||
ARG PIP_VERSION | ||
WORKDIR /app | ||
|
||
# Set up environment | ||
ENV HOME /home/${NB_USER} | ||
ENV PYTHONPATH="${PYTHONPATH}:${HOME}" | ||
ENV PATH="/home/usr/.local/bin:${PATH}" | ||
USER root | ||
|
||
RUN groupadd --gid ${NB_UID} ${NB_USER} | ||
RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} | ||
WORKDIR ${HOME} | ||
|
||
FROM base as deps | ||
# Copy and install Unstructured | ||
COPY requirements requirements | ||
|
||
RUN python3.10 -m pip install pip==${PIP_VERSION} && \ | ||
dnf -y groupinstall "Development Tools" && \ | ||
find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ | ||
dnf -y groupremove "Development Tools" && \ | ||
dnf clean all | ||
|
||
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ | ||
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" | ||
|
||
FROM deps as code | ||
|
||
USER ${NB_USER} | ||
|
||
COPY example-docs example-docs | ||
COPY ./docker-packages/*.apk packages/ | ||
COPY ./requirements/*.txt requirements/ | ||
COPY unstructured unstructured | ||
COPY test_unstructured test_unstructured | ||
COPY example-docs example-docs | ||
|
||
RUN python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" | ||
# Install the OS-level dependencies in a single layer:
#   * py3.11-pip, mesa-gl, glib, cmake, bash and libmagic via plain `apk add`;
#   * pandoc, poppler, leptonica, tesseract and libreoffice from the version-pinned
#     .apk files under packages/ (`--allow-untrusted` lets apk install package
#     files whose signature it does not trust).
# Afterwards the tessdata `configs` and `tessconfigs` directories are moved from
# /share/tessdata to /usr/local/share/tessdata/ (the directory TESSDATA_PREFIX is
# set to further down), and soffice.bin is symlinked into /usr/local/bin as both
# `libreoffice` and `soffice` and marked executable.
# NOTE(review): `apk update` + `apk add` without `--no-cache` leaves the apk index
# in this layer; consider `apk add --no-cache` -- confirm nothing relies on the index.
# NOTE(review): `--allow-untrusted` skips signature verification for the local
# packages; verify their provenance/checksums before the build.
RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \ | |
apk add --allow-untrusted packages/pandoc-3.1.8-r0.apk && \ | |
apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \ | |
apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \ | |
apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \ | |
apk add --allow-untrusted packages/libreoffice-7.6.5-r0.apk && \ | |
apk add bash && \ | |
apk add libmagic && \ | |
mv /share/tessdata/configs /usr/local/share/tessdata/ && \ | |
mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \ | |
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \ | |
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \ | |
chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \ | |
chmod +x /usr/local/bin/libreoffice && \ | |
chmod +x /usr/local/bin/soffice | |
|
||
RUN chown -R nonroot:nonroot /app | ||
|
||
USER nonroot | ||
|
||
# Install the Python dependencies with pip3.11, one requirements file per
# invocation (base, test, each extra-* set, then huggingface). `--user` installs
# into the current user's site directory and `--no-cache-dir` disables pip's
# download cache so it is not stored in the layer.
# NOTE(review): the final `unstructured.paddlepaddle` install has no version pin
# and omits both `--no-cache-dir` and `--user`, unlike the lines above -- confirm
# whether that is intentional.
RUN pip3.11 install --no-cache-dir --user -r requirements/base.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/test.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-csv.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-docx.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-epub.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-markdown.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-msg.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-odt.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-pdf-image.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-pptx.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/extra-xlsx.txt && \ | |
pip3.11 install --no-cache-dir --user -r requirements/huggingface.txt && \ | |
pip3.11 install unstructured.paddlepaddle | |
|
||
# Run four one-off Python commands during the build:
#   1-2. nltk.download() for the 'punkt' and 'averaged_perceptron_tagger' resources;
#   3.   unstructured.partition.model_init.initialize();
#   4.   construct an UnstructuredTableTransformerModel and initialize it with
#        'microsoft/table-transformer-structure-recognition'.
# NOTE(review): presumably this bakes the NLTK data and model weights into the
# image so containers do not fetch them at runtime -- confirm. The step needs
# network access at build time.
RUN python3.11 -c "import nltk; nltk.download('punkt')" && \ | |
python3.11 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \ | |
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \ | |
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" | |
|
||
ENV PATH="${PATH}:/home/nonroot/.local/bin" | ||
ENV TESSDATA_PREFIX=/usr/local/share/tessdata | ||
|
||
CMD ["/bin/bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash | ||
|
||
# Names of the artifacts to download; each entry is appended to the bucket URL
# in the download loop below.
# NOTE(review): "nltk_data.tgz" is not an .apk, so the Dockerfile's
# `COPY ./docker-packages/*.apk packages/` glob will not pick it up -- confirm
# where it is consumed.
files=( | |
"libreoffice-7.6.5-r0.apk" | |
"openjpeg-2.5.0-r0.apk" | |
"poppler-23.09.0-r0.apk" | |
"leptonica-1.83.0-r0.apk" | |
"pandoc-3.1.8-r0.apk" | |
"tesseract-5.3.2-r0.apk" | |
"nltk_data.tgz" | |
|
||
) | |
|
||
# Destination directory for the downloads; `mkdir -p` creates it if missing and
# is a no-op when it already exists.
directory="docker-packages" | |
mkdir -p "${directory}" | |
|
||
# Fetch every entry of `files` from the public bucket into $directory
# (`wget -P` sets the directory prefix for the saved file).
# NOTE(review): no `set -e` or exit-status check is visible, so a failed wget
# does not stop the loop and the final "Downloads complete." message is still
# printed -- consider failing fast.
for file in "${files[@]}"; do | |
echo "Downloading ${file}" | |
wget "https://utic-public-cf.s3.amazonaws.com/$file" -P "$directory" | |
done | |
|
||
echo "Downloads complete." |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters