diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py index b0b0d14..2a06a1d 100644 --- a/pdftext/pdf/chars.py +++ b/pdftext/pdf/chars.py @@ -85,8 +85,10 @@ def word_break(): word_break() continue - # we break on any change in font info - if any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight']): + # we break on any change in font info - optimized comparison + char_font = char['font'] + word_font = word['font'] + if any(char_font[k] != word_font[k] for k in ['name', 'flags', 'size', 'weight']): word_break() continue @@ -99,17 +101,19 @@ def word_break(): word['bbox'] = word['bbox'].merge(char['bbox']) word['chars'].append(char) - # deduplicate words - seen = {} + # deduplicate words - use tuple keys instead of strings + seen = set() deduped = [] for word in words: # Round the bbox coordinates bbox = word['bbox'].bbox - bbox = [round(x, 0) for x in bbox] + bbox_rounded = tuple(round(x, 0) for x in bbox) - key = f"{bbox}-{word['text']}-{word['rotation']}-{word['font']['name']}-{word['font']['flags']}-{word['font']['size']}-{word['font']['weight']}" + key = (bbox_rounded, word['text'], word['rotation'], + word['font']['name'], word['font']['flags'], + word['font']['size'], word['font']['weight']) if key not in seen: - seen[key] = True + seen.add(key) deduped.append(word) return [char for word in deduped for char in word['chars']] diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py index f650c71..c478df1 100644 --- a/pdftext/pdf/pages.py +++ b/pdftext/pdf/pages.py @@ -144,12 +144,13 @@ def line_break(): line_break() continue - # we break if the previous span ends with a linebreak or hyphenation - if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]): + # we break if the previous span ends with a linebreak or hyphenation marker + last_text = line["spans"][-1]["text"] + if any(last_text.endswith(suffix) for suffix in ["\n", "\x02"]): line_break() continue - if span["rotation"] != line["rotation"]: + if span["rotation"] != 
line["rotation"] and abs(span["rotation"] - line["rotation"]) >= 45: line_break() continue diff --git a/pdftext/schema.py b/pdftext/schema.py index b13456b..736b856 100644 --- a/pdftext/schema.py +++ b/pdftext/schema.py @@ -56,12 +56,14 @@ def y_end(self): return self.bbox[3] def merge(self, other: Bbox) -> Bbox: - x_start = self.x_start if self.x_start < other.x_start else other.x_start - y_start = self.y_start if self.y_start < other.y_start else other.y_start - x_end = self.x_end if self.x_end > other.x_end else other.x_end - y_end = self.y_end if self.y_end > other.y_end else other.y_end - - return Bbox([x_start, y_start, x_end, y_end]) + self_bbox = self.bbox + other_bbox = other.bbox + return Bbox([ + min(self_bbox[0], other_bbox[0]), + min(self_bbox[1], other_bbox[1]), + max(self_bbox[2], other_bbox[2]), + max(self_bbox[3], other_bbox[3]) + ]) def overlap_x(self, other: Bbox): return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0])) diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py index af6db1a..cb8dd38 100644 --- a/pdftext/scripts/extract_text.py +++ b/pdftext/scripts/extract_text.py @@ -52,7 +52,7 @@ def extract_text_cli( workers=kwargs["workers"], disable_links=True ) - text = json.dumps(text) + text = json.dumps(text, ensure_ascii=False, indent=2) else: text = plain_text_output( pdf_path, diff --git a/pyproject.toml b/pyproject.toml index 1f179df..e7eb36a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.6.2" +version = "0.6.3" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0" diff --git a/tests/conftest.py b/tests/conftest.py index 4eefc39..dc09f49 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,10 @@ def pdf_path(): return "tests/data/adversarial.pdf" +@pytest.fixture(scope="session") +def pdf_path2(): + return "tests/data/communication.pdf" + @pytest.fixture() 
def pdf_doc(pdf_path): doc = pdfium.PdfDocument(pdf_path) diff --git a/tests/data/communication.pdf b/tests/data/communication.pdf new file mode 100644 index 0000000..f44667c Binary files /dev/null and b/tests/data/communication.pdf differ diff --git a/tests/test_extraction.py b/tests/test_extraction.py index f913b84..dc8fdde 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -35,3 +35,10 @@ def test_superscripts(pdf_path): if span["text"] == "∞": assert span["superscript"] is True return True + + +def test_line_joining(pdf_path2): + pages = [11] + text = plain_text_output(pdf_path2, page_range=pages).lower() + assert "the axis media control viewer toolbar" in text + assert "axismediacontrolviewertoolbar" not in text