-
Notifications
You must be signed in to change notification settings - Fork 218
Description
pip install pytesseract pillow opencv-python
import pytesseract
from PIL import Image
import cv2
import json
import os
# Optional: if Tesseract is not on PATH, point pytesseract at the binary.
# The path below is a Windows example; it is only applied when that file
# actually exists, so machines that have Tesseract on PATH keep working.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
def ocr_to_json(image_path, output_json="output.json"):
    """Run word-level OCR on an image and write the result to a JSON file.

    The image is loaded with OpenCV, converted to grayscale and binarised
    with a fixed threshold of 150 before being handed to pytesseract.

    Args:
        image_path: Path of the image file to read.
        output_json: Path of the JSON file to write (overwritten if present).

    Returns:
        A dict with the keys "file_name", "total_words" and
        "extracted_data". Each entry of "extracted_data" holds the word's
        "text", its "confidence" and a "bounding_box" with "x", "y",
        "width" and "height".

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque assertion error inside cvtColor.
    if image is None:
        raise ValueError(
            f"Could not read image (missing, unreadable or unsupported): {image_path}"
        )

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Optional binarisation step intended to improve OCR accuracy.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # Word-level OCR data, returned as parallel lists keyed by field name.
    data = pytesseract.image_to_data(thresh, output_type=pytesseract.Output.DICT)

    results = []
    fields = zip(
        data['text'], data['conf'],
        data['left'], data['top'], data['width'], data['height'],
    )
    for text, conf, left, top, width, height in fields:
        word = text.strip()
        if not word:
            # Skip entries that carry no recognised text.
            continue
        results.append({
            "text": word,
            "confidence": conf,
            "bounding_box": {
                "x": left,
                "y": top,
                "width": width,
                "height": height,
            },
        })

    final_output = {
        "file_name": os.path.basename(image_path),
        "total_words": len(results),
        "extracted_data": results,
    }

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=4)

    return final_output
# Example usage: run OCR on a sample image and pretty-print the result.
if __name__ == "__main__":
    result = ocr_to_json("sample_image.jpg")
    print(json.dumps(result, indent=4))
{
"file_name": "sample_image.jpg",
"total_words": 1,
"extracted_data": [
{
"text": "Invoice",
"confidence": "96",
"bounding_box": {
"x": 100,
"y": 50,
"width": 120,
"height": 30
}
}
]
}
def simple_ocr_json(image_path):
    """Extract the plain text of an image into a small dict.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with "file" (the path as given), "text" (the recognised text
        with surrounding whitespace stripped) and "lines" (that text split
        on newline characters).
    """
    # Open the image in a context manager so the underlying file handle is
    # closed as soon as OCR has finished.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img)

    stripped = text.strip()
    return {
        "file": image_path,
        "text": stripped,
        "lines": stripped.split("\n"),
    }
# Standalone snippet: OCR with two languages requested at once via the
# '+'-joined `lang` string.
# NOTE(review): `img` is not defined anywhere in the visible code — presumably
# an image loaded beforehand; the return value is also discarded here.
pytesseract.image_to_string(img, lang='eng+ben')
pip install pytesseract pillow opencv-python easyocr paddleocr flask fastapi uvicorn python-multipart
import pytesseract
import cv2
import json
from pytesseract import Output
def basic_ocr_to_json(image_path):
    """Run word-level OCR on a grayscale copy of an image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with "total_words" and "words". Each word holds its "text"
        (as returned by pytesseract, not stripped), its "confidence" and a
        "box" with "x", "y", "w" and "h".

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure rather than raising.
    if img is None:
        raise ValueError(
            f"Could not read image (missing, unreadable or unsupported): {image_path}"
        )

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(gray, output_type=Output.DICT)

    fields = zip(
        data['text'], data['conf'],
        data['left'], data['top'], data['width'], data['height'],
    )
    words = [
        {
            "text": text,
            "confidence": conf,
            "box": {"x": left, "y": top, "w": width, "h": height},
        }
        for text, conf, left, top, width, height in fields
        if text.strip()  # drop entries that carry no recognised text
    ]

    return {"total_words": len(words), "words": words}
def advanced_preprocess(image_path):
    """OCR an image after blurring and adaptive thresholding.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur
    and binarised with a Gaussian adaptive threshold (block size 11,
    constant 2) before being passed to pytesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the single key "clean_text" holding the recognised text.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure rather than raising.
    if img is None:
        raise ValueError(
            f"Could not read image (missing, unreadable or unsupported): {image_path}"
        )

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11, 2,
    )

    text = pytesseract.image_to_string(thresh)
    return {"clean_text": text}
def bengali_english_ocr(image_path):
    """OCR an image with the 'eng+ben' language setting.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the single key "text" holding the recognised text.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure; fail here with a clear message
    # instead of handing None to pytesseract.
    if img is None:
        raise ValueError(
            f"Could not read image (missing, unreadable or unsupported): {image_path}"
        )

    text = pytesseract.image_to_string(img, lang='eng+ben')
    return {"text": text}
import easyocr
def easyocr_json(image_path):
    """Read text from an image with an EasyOCR reader for 'en' and 'bn'.

    Args:
        image_path: Image location passed straight to ``Reader.readtext``.

    Returns:
        A dict with the single key "results": one entry per detection,
        holding its "text", "confidence" and "bounding_box".
    """
    detections = easyocr.Reader(['en', 'bn']).readtext(image_path)
    # Each detection is unpacked as (bounding box, text, confidence).
    return {
        "results": [
            {"text": text, "confidence": conf, "bounding_box": bbox}
            for bbox, text, conf in detections
        ]
    }
from paddleocr import PaddleOCR
def paddle_ocr_json(image_path):
    """Read text from an image with PaddleOCR (English model).

    Args:
        image_path: Image location passed straight to ``PaddleOCR.ocr``.

    Returns:
        A dict with the single key "results": one entry per detected line,
        holding its "text", "confidence" and "bounding_box". The list is
        empty when nothing was detected.
    """
    ocr = PaddleOCR(lang='en')
    result = ocr.ocr(image_path)

    # result[0] holds the detections for the single input image. Some
    # PaddleOCR releases return [None] (or an empty result) when no text is
    # found, so normalise that to an empty list instead of iterating None.
    lines = (result[0] if result else None) or []

    data = [
        {
            "text": line[1][0],
            "confidence": line[1][1],
            "bounding_box": line[0],
        }
        for line in lines
    ]
    return {"results": data}
from flask import Flask, request, jsonify
import os
# Flask takes the module's import name so it can locate resources relative
# to this file.
app = Flask(__name__)
@app.route("/ocr", methods=["POST"])
def ocr_api():