# reader22.py - extract a "TRANSACTIONS" table from a PDF with pdfplumber and save it as JSON.
import json
import re
from typing import Any, Dict, List, Optional

import pdfplumber
# Case-insensitive match for the section header text.
_HEADER_PATTERN = re.compile(r"transactions", re.IGNORECASE)
# The header word must end beyond this fraction of the page width ("right side").
_RIGHT_SIDE_FRACTION = 0.70
# A line counts as a horizontal rule when it is longer than this many points...
_MIN_RULE_LENGTH = 100
# ...and its two end points differ vertically by less than this many points.
_MAX_RULE_SLANT = 2


def _is_blue(color: Any) -> bool:
    """Return True if *color* is an RGB triple that is predominantly blue.

    Accepts both tuples and lists, and both the 0-1 component scale that
    pdfplumber reports and a 0-255 scale.  Anything that is not a three
    component numeric sequence (grayscale, CMYK, None, booleans) is not blue.
    """
    if not isinstance(color, (list, tuple)) or len(color) != 3:
        return False
    if not all(
        isinstance(part, (int, float)) and not isinstance(part, bool)
        for part in color
    ):
        return False
    # Components above 1 can only come from a 0-255 scale; normalise to 0-1.
    scale = 255.0 if max(color) > 1 else 1.0
    red, green, blue = (part / scale for part in color)
    return red < 50 / 255 and green < 50 / 255 and blue > 200 / 255


def _find_blue_line_top(lines: List[Dict[str, Any]]) -> Optional[float]:
    """Return the `top` coordinate of the first long, horizontal, blue line.

    `top` is measured from the top of the page, the same origin used by the
    `top`/`bottom` keys of extracted words, so the two can be compared directly.
    Returns None when no such line exists.
    """
    for line in lines:
        is_long = abs(line["x0"] - line["x1"]) > _MIN_RULE_LENGTH
        is_flat = abs(line["y0"] - line["y1"]) < _MAX_RULE_SLANT
        if not (is_long and is_flat):
            continue
        # pdfplumber stores the stroke colour under "stroking_color"; "stroke"
        # is still consulted in case a caller supplies the colour there.
        if _is_blue(line.get("stroking_color")) or _is_blue(line.get("stroke")):
            return line["top"]
    return None


def _find_header_bottom(
    words: List[Dict[str, Any]], line_top: float, page_width: float
) -> Optional[float]:
    """Return the `bottom` of the header word located above the line, on the right.

    The word must match the header pattern, start above *line_top* and end
    beyond 70% of *page_width*.  Returns None when no word qualifies.
    """
    for word in words:
        if not _HEADER_PATTERN.search(word["text"]):
            continue
        if word["top"] < line_top and word["x1"] > page_width * _RIGHT_SIDE_FRACTION:
            return word["bottom"]
    return None


def _unique_headers(header_row: List[Optional[str]]) -> List[str]:
    """Return stripped header names, renaming repeats so every name is unique.

    The first occurrence of a name (including the empty string) is kept as-is;
    later repeats become ``<name>_<column number>`` (``column_<n>`` for blanks)
    so that two columns never share one output key.
    """
    seen = set()
    names = []
    for position, cell in enumerate(header_row, start=1):
        name = cell.strip() if cell else ""
        if name in seen:
            name = f"{name or 'column'}_{position}"
            while name in seen:
                name += "_"
        seen.add(name)
        names.append(name)
    return names


def _rows_to_columns(rows: Optional[List[List[Optional[str]]]]) -> Optional[Dict[str, List[str]]]:
    """Convert extracted table rows into a ``{header: [values...]}`` mapping.

    The first row supplies the headers.  Returns None when there is no data
    row or every header cell is blank.  Rows whose cell count differs from the
    header's are skipped.
    """
    if not rows or len(rows) < 2:
        return None
    header_row, data_rows = rows[0], rows[1:]
    if not any(cell.strip() if cell else "" for cell in header_row):
        return None

    headers = _unique_headers(header_row)
    columns: Dict[str, List[str]] = {name: [] for name in headers}
    for row in data_rows:
        if len(row) != len(headers):
            continue
        for name, value in zip(headers, row):
            columns[name].append(value.strip() if value else "")
    return columns


def extract_transaction_table_with_plumber(input_pdf_path: str, output_json_path: str) -> None:
    """
    Extract the table that follows a 'TRANSACTIONS' header and save it as JSON.

    On every page the function looks for a long, horizontal, blue line, then
    for a word matching "transactions" (case-insensitive) that sits above that
    line on the right-hand side of the page (ending beyond 70% of the page
    width).  The table closest below that header is converted to a mapping of
    column header -> list of cell values.

    A single matching table is written as one JSON object; several matching
    tables are written as a JSON list of objects.  Nothing is written when no
    table matches.  Progress and errors are reported with ``print``; PDF
    reading errors are reported and swallowed, so the function always returns
    None in that case.

    Args:
        input_pdf_path (str): The path to the input PDF file.
        output_json_path (str): The path to the output JSON file.
    """
    all_table_data: List[Dict[str, List[str]]] = []

    try:
        with pdfplumber.open(input_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                print(f"🔎 Analyzing page {page_num}...")

                blue_line_top = _find_blue_line_top(page.lines)
                if blue_line_top is None:
                    continue
                print(f"✅ Found a blue line at y-coordinate: {blue_line_top}")

                header_bottom = _find_header_bottom(
                    page.extract_words(), blue_line_top, page.width
                )
                if header_bottom is None:
                    print(f"⚠️ No 'TRANSACTIONS' header found matching position criteria on page {page_num}.")
                    continue
                print("✅ Found 'TRANSACTIONS' header in the correct position.")

                # Table bbox is (x0, top, x1, bottom); keep tables starting
                # below the header and take the one nearest to it, regardless
                # of the order in which the tables were detected.
                tables_below = [
                    table for table in page.find_tables()
                    if table.bbox[1] > header_bottom
                ]
                if not tables_below:
                    continue
                target_table = min(tables_below, key=lambda table: table.bbox[1])

                table_data = _rows_to_columns(target_table.extract())
                if table_data is not None:
                    all_table_data.append(table_data)
    except FileNotFoundError:
        print(f"❌ Error: The file at '{input_pdf_path}' was not found.")
        return
    except Exception as e:
        # Top-level boundary of a best-effort script: report and stop.
        print(f"❌ An error occurred during PDF processing: {e}")
        return

    if not all_table_data:
        print("⚠️ No matching tables found in the PDF.")
        return

    # A lone table is saved as a plain object rather than a one-element list.
    final_data = all_table_data[0] if len(all_table_data) == 1 else all_table_data
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
    print(f"✅ Table data successfully saved to: {output_json_path}")
# 🔧 Replace with your actual paths
input_pdf = r"C:\Users\Suren\Downloads\input.pdf"
output_json = r"C:\Users\Suren\Downloads\transactions_table_data.json"

# Run the extraction only when this file is executed as a script, so that
# importing the module (e.g. to reuse the function) does not process a PDF.
if __name__ == "__main__":
    extract_transaction_table_with_plumber(input_pdf, output_json)