datalab-to · VikParuchuri · Jun 11, 2025 · May 30, 2025 · Jun 11, 2025 · Jun 11, 2025
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -85,8 +85,10 @@ def word_break():
             word_break()
             continue
 
-        # we break on any change in font info
-        if any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight']):
+        # we break on any change in font info (font dicts bound to locals before comparing)
+        char_font = char['font']
+        word_font = word['font']
+        if any(char_font[k] != word_font[k] for k in ['name', 'flags', 'size', 'weight']):
             word_break()
             continue
 
@@ -99,17 +101,19 @@ def word_break():
         word['bbox'] = word['bbox'].merge(char['bbox'])
         word['chars'].append(char)
 
-    # deduplicate words
-    seen = {}
+    # deduplicate words - use tuple keys instead of strings
+    seen = set()
     deduped = []
     for word in words:
         # Round the bbox coordinates
         bbox = word['bbox'].bbox
-        bbox = [round(x, 0) for x in bbox]
+        bbox_rounded = tuple(round(x, 0) for x in bbox)
 
-        key = f"{bbox}-{word['text']}-{word['rotation']}-{word['font']['name']}-{word['font']['flags']}-{word['font']['size']}-{word['font']['weight']}"
+        key = (bbox_rounded, word['text'], word['rotation'], 
+               word['font']['name'], word['font']['flags'], 
+               word['font']['size'], word['font']['weight'])
         if key not in seen:
-            seen[key] = True
+            seen.add(key)
             deduped.append(word)
 
     return [char for word in deduped for char in word['chars']]
diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
@@ -144,12 +144,13 @@ def line_break():
             line_break()
             continue
 
-        # we break if the previous span ends with a linebreak or hyphenation
-        if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]):
+        # we break if the previous span ends with a linebreak or hyphenation
+        last_text = line["spans"][-1]["text"]
+        if any(last_text.endswith(suffix) for suffix in ["\n", "\x02"]):
             line_break()
             continue
 
-        if span["rotation"] != line["rotation"]:
+        if span["rotation"] != line["rotation"] and abs(span["rotation"] - line["rotation"]) >= 45:
             line_break()
             continue
 

diff --git a/pdftext/schema.py b/pdftext/schema.py
@@ -56,12 +56,14 @@ def y_end(self):
         return self.bbox[3]
 
     def merge(self, other: Bbox) -> Bbox:
-        x_start = self.x_start if self.x_start < other.x_start else other.x_start
-        y_start = self.y_start if self.y_start < other.y_start else other.y_start
-        x_end = self.x_end if self.x_end > other.x_end else other.x_end
-        y_end = self.y_end if self.y_end > other.y_end else other.y_end
-
-        return Bbox([x_start, y_start, x_end, y_end])
+        self_bbox = self.bbox
+        other_bbox = other.bbox
+        return Bbox([
+            min(self_bbox[0], other_bbox[0]),
+            min(self_bbox[1], other_bbox[1]),
+            max(self_bbox[2], other_bbox[2]),
+            max(self_bbox[3], other_bbox[3])
+        ])
 
     def overlap_x(self, other: Bbox):
         return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))

diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py
@@ -52,7 +52,7 @@ def extract_text_cli(
             workers=kwargs["workers"],
             disable_links=True
         )
-        text = json.dumps(text)
+        text = json.dumps(text, ensure_ascii=False, indent=2)
     else:
         text = plain_text_output(
             pdf_path,

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.6.2"
+version = "0.6.3"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <[email protected]>"]
 license = "Apache-2.0"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,6 +5,10 @@
 def pdf_path():
     return "tests/data/adversarial.pdf"
 
+@pytest.fixture(scope="session")
+def pdf_path2():
+    return "tests/data/communication.pdf"
+
 @pytest.fixture()
 def pdf_doc(pdf_path):
     doc = pdfium.PdfDocument(pdf_path)

diff --git a/tests/data/communication.pdf b/tests/data/communication.pdf
diff --git a/tests/test_extraction.py b/tests/test_extraction.py
@@ -35,3 +35,10 @@ def test_superscripts(pdf_path):
                     if span["text"] == "∞":
                         assert span["superscript"] is True
                         return True
+
+
+def test_line_joining(pdf_path2):
+    pages = [11]
+    text = plain_text_output(pdf_path2, page_range=pages).lower()
+    assert "the axis media control viewer toolbar" in text
+    assert "axismediacontrolviewertoolbar" not in text