From d4985f9ef873db52c0792cb0a36b1fe97ad04a8b Mon Sep 17 00:00:00 2001
From: maen08 <2001stany@gmail.com>
Date: Wed, 1 May 2024 15:45:48 +0300
Subject: [PATCH 1/2] add: docker files and config

---
 Dockerfile       | 22 ++++++++++++++++++++++
 README.md        | 24 ++++++++++++++++++++++++
 requirements.txt |  7 +++++++
 3 files changed, 53 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2448f2c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.12.3-alpine
+
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+RUN apk update && apk add python3-dev \
+                          gcc \
+                          musl-dev
+                          
+
+WORKDIR /app
+
+COPY extract_text.py  /app/
+COPY pdftext /app/
+COPY models /app/
+COPY scripts  /app/
+COPY requirements.txt /app/
+                                                   
+RUN pip install --upgrade pip
+RUN pip install -r /app/requirements.txt
+
+CMD [ "python3", "extract_text.py" ] 
\ No newline at end of file
diff --git a/README.md b/README.md
index d5ce939..306f83f 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,30 @@ text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional argumen
 
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
+# Run on Docker
+Clone the repository
+```
+git clone <repository-url>
+
+```
+
+Build a docker image
+```
+cd pdftext
+docker build -t pdftext .
+
+```
+
+Running with docker
+```
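+# write out a text file (the current directory is mounted at /data so the container can read the PDF and write the output)
+docker run -v "$(pwd)":/data pdftext python3 extract_text.py /data/PDF_PATH --out_path /data/output.txt
+
+# write out a json file
+docker run -v "$(pwd)":/data pdftext python3 extract_text.py /data/PDF_PATH --out_path /data/output.txt --json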
+
+```
+
 # Benchmarks
 
 I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext.  I chose pymupdf because it extracts blocks and lines.  Pdfplumber extracts words and bboxes.  I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8ea5bed
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+joblib==1.4.0
+numpy==1.26.4
+pydantic==2.7.1
+pydantic-settings==2.2.1
+pypdfium2==4.29.0
+scikit-learn==1.4.2
+

From c054c716e46b10aae8bd6d20d180150a14a7517c Mon Sep 17 00:00:00 2001
From: maen08 <2001stany@gmail.com>
Date: Wed, 1 May 2024 18:18:46 +0300
Subject: [PATCH 2/2] update: install dependencies with poetry in Dockerfile

---
 Dockerfile     | 22 +++++++++++-----------
 poetry.lock    |  2 +-
 pyproject.toml |  2 +-
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2448f2c..afdbb1c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,20 +3,20 @@ FROM python:3.12.3-alpine
 ENV PYTHONDONTWRITEBYTECODE 1
 ENV PYTHONUNBUFFERED 1
 
-RUN apk update && apk add python3-dev \
-                          gcc \
-                          musl-dev
-                          
+# Compiler toolchain and development headers (presumably for building numpy/scikit-learn wheels on musl - confirm).
+RUN apk add --no-cache build-base \
+    gfortran \
+    lapack-dev \
+    libffi-dev \
+    py3-pip \
+    python3-dev
 
 WORKDIR /app
 
-COPY extract_text.py  /app/
-COPY pdftext /app/
-COPY models /app/
-COPY scripts  /app/
-COPY requirements.txt /app/
+# Copy the whole build context into the image; pyproject.toml is part of it, so no separate COPY is needed.
+COPY . /app/
                                                    
-RUN pip install --upgrade pip
-RUN pip install -r /app/requirements.txt
+# Install dependencies into the image's own interpreter: CMD runs plain python3, which cannot import from a Poetry-created virtualenv.
+RUN pip install --no-cache-dir poetry && poetry config virtualenvs.create false && poetry install --no-root
 
 CMD [ "python3", "extract_text.py" ] 
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index eb9abc9..3c80cad 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1911,4 +1911,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294"
+content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a71f9de..9a94791 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,4 +35,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-pdftext = "extract_text:main"
+pdftext = "extract_text:main"
\ No newline at end of file