From d4985f9ef873db52c0792cb0a36b1fe97ad04a8b Mon Sep 17 00:00:00 2001 From: maen08 <2001stany@gmail.com> Date: Wed, 1 May 2024 15:45:48 +0300 Subject: [PATCH 1/2] add: docker files and config --- Dockerfile | 22 ++++++++++++++++++++++ README.md | 24 ++++++++++++++++++++++++ requirements.txt | 7 +++++++ 3 files changed, 53 insertions(+) create mode 100644 Dockerfile create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2448f2c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.12.3-alpine + +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 + +RUN apk update && apk add python3-dev \ + gcc \ + musl-dev + + +WORKDIR /app + +COPY extract_text.py /app/ +COPY pdftext /app/ +COPY models /app/ +COPY scripts /app/ +COPY requirements.txt /app/ + +RUN pip install --upgrade pip +RUN pip install -r /app/requirements.txt + +CMD [ "python3", "extract_text.py" ] \ No newline at end of file diff --git a/README.md b/README.md index d5ce939..306f83f 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,30 @@ text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional argumen If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well. +# Run on Docker +Clone a project +``` +git clone repository + +``` + +Build a docker image +``` +cd pdftext +docker build -t pdftext . + +``` + +Running with docker +``` +# write out a text file +docker run pdftext PDF_PATH --out_path output.txt + +# write out a json file +docker run pdftext PDF_PATH --out_path output.txt --json + +``` + # Benchmarks I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext. I chose pymupdf because it extracts blocks and lines. Pdfplumber extracts words and bboxes. I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8ea5bed --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +joblib==1.4.0 +numpy==1.26.4 +pydantic==2.7.1 +pydantic-settings==2.2.1 +pypdfium2==4.29.0 +scikit-learn==1.4.2 + From c054c716e46b10aae8bd6d20d180150a14a7517c Mon Sep 17 00:00:00 2001 From: maen08 <2001stany@gmail.com> Date: Wed, 1 May 2024 18:18:46 +0300 Subject: [PATCH 2/2] update: readme file --- Dockerfile | 22 +++++++++++----------- poetry.lock | 2 +- pyproject.toml | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2448f2c..afdbb1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,20 +3,20 @@ FROM python:3.12.3-alpine ENV PYTHONDONTWRITEBYTECODE 1 ENV PYTHONUNBUFFERED 1 -RUN apk update && apk add python3-dev \ - gcc \ - musl-dev - +RUN apk add --no-cache \ + build-base \ + python3-dev \ + py3-pip \ + lapack-dev \ + gfortran \ + libffi-dev WORKDIR /app -COPY extract_text.py /app/ -COPY pdftext /app/ -COPY models /app/ -COPY scripts /app/ -COPY requirements.txt /app/ +COPY . /app/ +COPY pyproject.toml /app/ -RUN pip install --upgrade pip -RUN pip install -r /app/requirements.txt +RUN pip install poetry +RUN poetry install --no-root CMD [ "python3", "extract_text.py" ] \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index eb9abc9..3c80cad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1911,4 +1911,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13,!=3.9.7" -content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294" +content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a71f9de..9a94791 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,4 +35,4 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -pdftext = "extract_text:main" +pdftext = "extract_text:main" \ No newline at end of file