-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreader17.py
More file actions
65 lines (52 loc) · 2.15 KB
/
reader17.py
File metadata and controls
65 lines (52 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import io

import fitz  # PyMuPDF
from PyPDF2 import PdfWriter, PdfReader
# Lower-cased phrases that mark schema metadata following a table. The first
# block containing any of them ends block collection for that page (this check
# runs on every page, not only the last one).
_METADATA_KEYWORDS = (
    "primary key", "foreign key", "references", "constraint",
    "currencies", "data_sources", "job_id",
)

# Blocks whose bottom edge lies below this fraction of the page height are
# treated as footer content and ignored.
_FOOTER_FRACTION = 0.90

# The bottom of the crop is kept at least this many points above the footer
# threshold.
_BOTTOM_MARGIN = 10


def _block_text(block):
    """Return all span texts of a text block, space-joined and stripped."""
    return " ".join(
        span["text"] for line in block["lines"] for span in line["spans"]
    ).strip()


def _collect_table_blocks(blocks, footer_threshold):
    """Return the text blocks running from the table header to the metadata.

    Blocks are visited top to bottom. Collection starts at the first block
    whose text contains both "column name" and "data type" (that header block
    is included) and stops at the first block containing a metadata keyword.
    Blocks ending below ``footer_threshold`` are skipped.
    """
    # Image blocks carry no "lines" key; only text blocks are considered.
    text_blocks = sorted(
        (b for b in blocks if "lines" in b),
        key=lambda b: b["bbox"][1],
    )

    table_blocks = []
    in_table = False
    for block in text_blocks:
        if block["bbox"][3] > footer_threshold:
            continue  # footer block

        lower_text = _block_text(block).lower()
        if any(kw in lower_text for kw in _METADATA_KEYWORDS):
            break  # everything below is metadata, not table content

        if not in_table:
            if "column name" in lower_text and "data type" in lower_text:
                in_table = True
            else:
                continue  # still above the table header
        table_blocks.append(block)
    return table_blocks


def extract_table_regions(input_pdf, output_pdf):
    """Write a PDF containing only the table region of each page.

    For every page of ``input_pdf`` the text blocks between the
    "Column Name ... Data Type" header and the first metadata/footer block are
    located, and the page is cropped to their bounding box. Pages on which no
    table header is found (or whose crop rectangle would be empty) are left
    out of the result. The cropped pages are written to ``output_pdf`` in
    their original order.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write (overwritten if it exists).
    """
    kept_pages = []
    with fitz.open(input_pdf) as doc:
        for page_num, page in enumerate(doc):
            footer_threshold = float(page.mediabox.height) * _FOOTER_FRACTION
            table_blocks = _collect_table_blocks(
                page.get_text("dict")["blocks"], footer_threshold
            )
            if not table_blocks:
                continue

            # Bounding box of all table blocks, with the bottom edge kept
            # clear of the footer area.
            x0 = min(b["bbox"][0] for b in table_blocks)
            y0 = min(b["bbox"][1] for b in table_blocks)
            x1 = max(b["bbox"][2] for b in table_blocks)
            y1 = min(
                max(b["bbox"][3] for b in table_blocks),
                footer_threshold - _BOTTOM_MARGIN,
            )
            # Clamping the bottom edge can collapse the rectangle when the
            # table starts right above the footer; set_cropbox rejects an
            # empty rectangle, so such a page is skipped instead.
            if x1 <= x0 or y1 <= y0:
                continue

            page.set_cropbox(fitz.Rect(x0, y0, x1, y1))
            kept_pages.append(page_num)

        # The crop boxes exist only in the in-memory document. Serialise it
        # so the pages copied below actually carry them; reading the pages
        # from the original file again would discard every crop.
        cropped_pdf = doc.tobytes() if kept_pages else b""

    writer = PdfWriter()
    if kept_pages:
        # Parse the cropped document once rather than once per page.
        reader = PdfReader(io.BytesIO(cropped_pdf))
        for page_num in kept_pages:
            writer.add_page(reader.pages[page_num])

    # Save output
    with open(output_pdf, "wb") as f:
        writer.write(f)
# Example usage: runs only when this file is executed as a script, so that
# importing the module does not try to process "input.pdf".
if __name__ == "__main__":
    extract_table_regions("input.pdf", "output_cleaned.pdf")