-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_tables_from_pdf_PiPlumber.py
More file actions
28 lines (23 loc) · 1.38 KB
/
extract_tables_from_pdf_PiPlumber.py
File metadata and controls
28 lines (23 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pdfplumber
from tabulate import tabulate
# Extract every table from each page of the PDF and save each one to its own
# text file, rendered as a bordered grid with the first row as the header.
with pdfplumber.open("OOP_10122018.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        tables = page.extract_tables()
        for table_num, table in enumerate(tables, start=1):
            # Clean the table data: falsy cells (e.g. None) become empty
            # strings and surrounding whitespace is stripped from the rest.
            clean_table = [
                [str(cell).strip() if cell else "" for cell in row]
                for row in table
            ]

            # Render before opening the output file so that a formatting
            # failure never leaves a truncated or empty file behind.
            try:
                # Grid format with borders, first row as headers, left-aligned
                # text. The cleaned rows are passed here (not the raw table)
                # so the normalisation above actually takes effect.
                rendered = tabulate(
                    clean_table,
                    headers="firstrow",
                    tablefmt="grid",
                    stralign="left",
                )
            except Exception as e:
                # Best-effort: report the problem and fall back to a plain
                # pipe-separated layout rather than losing the table.
                print(f"Tabulate formatting failed: {e}")
                rendered = "".join(
                    " | ".join(row) + "\n" for row in clean_table
                )

            # Explicit UTF-8 so non-ASCII text from the PDF is written the
            # same way on every platform instead of using the locale default.
            with open(
                f"table-{table_num}-page-{page_num}", "w", encoding="utf-8"
            ) as table_file:
                table_file.write(rendered)