Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,10 @@ def word_break():
word_break()
continue

# we break on any change in font info
if any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight']):
# we break on any change in font info - optimized comparison
char_font = char['font']
word_font = word['font']
if any(char_font[k] != word_font[k] for k in ['name', 'flags', 'size', 'weight']):
word_break()
continue

Expand All @@ -99,17 +101,19 @@ def word_break():
word['bbox'] = word['bbox'].merge(char['bbox'])
word['chars'].append(char)

# deduplicate words
seen = {}
# deduplicate words - use tuple keys instead of strings
seen = set()
deduped = []
for word in words:
# Round the bbox coordinates
bbox = word['bbox'].bbox
bbox = [round(x, 0) for x in bbox]
bbox_rounded = tuple(round(x, 0) for x in bbox)

key = f"{bbox}-{word['text']}-{word['rotation']}-{word['font']['name']}-{word['font']['flags']}-{word['font']['size']}-{word['font']['weight']}"
key = (bbox_rounded, word['text'], word['rotation'],
word['font']['name'], word['font']['flags'],
word['font']['size'], word['font']['weight'])
if key not in seen:
seen[key] = True
seen.add(key)
deduped.append(word)

return [char for word in deduped for char in word['chars']]
7 changes: 4 additions & 3 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,13 @@ def line_break():
line_break()
continue

# we break if the previous span ends with a linebreak or hyphenation
if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]):
# we break if the previous span ends with a linebreak or a hyphenation marker ("\x02")
last_text = line["spans"][-1]["text"]
if any(last_text.endswith(suffix) for suffix in ["\n", "\x02"]):
line_break()
continue

if span["rotation"] != line["rotation"]:
if span["rotation"] != line["rotation"] and abs(span["rotation"] - line["rotation"]) >= 45:
line_break()
continue

Expand Down
14 changes: 8 additions & 6 deletions pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,14 @@ def y_end(self):
return self.bbox[3]

def merge(self, other: Bbox) -> Bbox:
    """Return a new Bbox that encloses both this box and *other*.

    The result takes the smaller of the two values at indices 0 and 1
    (the start coordinates) and the larger of the two at indices 2 and 3
    (the end coordinates). Neither input box is modified.

    Args:
        other: The box to combine with this one.

    Returns:
        A freshly constructed Bbox covering both inputs.
    """
    # Index the underlying coordinate sequences directly, the same way
    # overlap_x does, instead of going through the per-coordinate accessors.
    mine = self.bbox
    theirs = other.bbox
    return Bbox([
        min(mine[0], theirs[0]),
        min(mine[1], theirs[1]),
        max(mine[2], theirs[2]),
        max(mine[3], theirs[3]),
    ])

def overlap_x(self, other: Bbox):
return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))
Expand Down
2 changes: 1 addition & 1 deletion pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def extract_text_cli(
workers=kwargs["workers"],
disable_links=True
)
text = json.dumps(text)
text = json.dumps(text, ensure_ascii=False, indent=2)
else:
text = plain_text_output(
pdf_path,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.6.2"
version = "0.6.3"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <[email protected]>"]
license = "Apache-2.0"
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
def pdf_path():
return "tests/data/adversarial.pdf"

@pytest.fixture(scope="session")
def pdf_path2():
    """Provide the relative path of the second sample PDF used by the tests."""
    sample_pdf = "tests/data/communication.pdf"
    return sample_pdf

@pytest.fixture()
def pdf_doc(pdf_path):
doc = pdfium.PdfDocument(pdf_path)
Expand Down
Binary file added tests/data/communication.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,10 @@ def test_superscripts(pdf_path):
if span["text"] == "∞":
assert span["superscript"] is True
return True


def test_line_joining(pdf_path2):
    """Check that extracted words keep their separating spaces.

    Extracts plain text for the page selection ``[11]`` of the second
    sample PDF and compares case-insensitively.
    """
    extracted = plain_text_output(pdf_path2, page_range=[11])
    lowered = extracted.lower()

    # The phrase must come out with its spaces intact ...
    assert "the axis media control viewer toolbar" in lowered
    # ... and must not appear as one run-together token.
    assert "axismediacontrolviewertoolbar" not in lowered