PDF-Data-Extraction-Python/extract_tables_from_pdf_PiPlumber.py at main · muqaddasazahra/PDF-Data-Extraction-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pdfplumber
from tabulate import tabulate

# Extract every table from each page of the PDF and write each one to its own
# text file named "table-<table number>-page-<page number>" (both 1-based).
with pdfplumber.open("OOP_10122018.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        for table_num, table in enumerate(page.extract_tables(), start=1):
            # Normalise cells: falsy cells (None / "") become "", everything
            # else is converted to a string with surrounding whitespace removed.
            clean_table = [
                [str(cell).strip() if cell else "" for cell in row]
                for row in table
            ]

            try:
                # Render the *cleaned* rows with grid borders, treating the
                # first row as the header and left-aligning text.
                rendered = tabulate(
                    clean_table,
                    headers="firstrow",
                    tablefmt="grid",
                    stralign="left",
                )
            except Exception as e:
                # Best-effort: report the failure and still produce output by
                # writing each row as a " | "-separated line.
                print(f"Tabulate formatting failed: {e}")
                rendered = "".join(
                    " | ".join(row) + "\n" for row in clean_table
                )

            # Explicit UTF-8 so non-ASCII text from the PDF does not depend on
            # the platform's default encoding.
            with open(
                f"table-{(table_num)}-page-{page_num}", "w", encoding="utf-8"
            ) as table_file:
                table_file.write(rendered)