-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathig.py
More file actions
75 lines (64 loc) · 2.19 KB
/
ig.py
File metadata and controls
75 lines (64 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# /// script
# dependencies = [
# "pytesseract",
# "Pillow",
# "pillow-heif",
# ]
# ///
import os
import json
import re
import shutil
from pathlib import Path
import pytesseract
from PIL import Image
import pillow_heif
# Register pillow-heif's opener with PIL as an import-time side effect.
# Presumably this is what lets Image.open() read the .heic files this script
# looks for — confirm against the pillow-heif documentation.
pillow_heif.register_heif_opener()
# Handle pattern: "@" followed by letters, digits, periods and underscores.
# The lookbehind rejects an "@" glued to a preceding name character, so the
# domain part of an e-mail address ("jane@example.com") is not reported as a
# handle. The trailing \b drops sentence punctuation ("@name." -> "@name").
_HANDLE_PATTERN = re.compile(r'(?<![A-Za-z0-9._])@[A-Za-z0-9._]+\b')


def extract_instagram_handles(text):
    """Extract Instagram handles from text using regex.

    Args:
        text: Text to scan (for example OCR output). ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of handle strings, each including the leading "@", in the
        order they appear in ``text``. Duplicates are preserved.
    """
    if not text:
        return []
    return _HANDLE_PATTERN.findall(text)
def process_image(image_path):
    """Extract text from image using OCR.

    Args:
        image_path: Path of the image file; passed straight to ``Image.open``.

    Returns:
        The string returned by ``pytesseract.image_to_string``, or ``""`` if
        opening, converting or OCR-ing the image raises (the error is printed
        rather than propagated).
    """
    try:
        # The context manager closes the image's file handle even when the
        # conversion or OCR step raises, so the file can be moved afterwards.
        with Image.open(image_path) as img:
            # Convert image to RGB if necessary
            rgb = img if img.mode == 'RGB' else img.convert('RGB')
            return pytesseract.image_to_string(rgb)
    except Exception as e:
        # Best-effort: report the failure and return no text so the caller
        # can carry on with the remaining files.
        print(f"Error processing {image_path}: {e}")
        return ""
def _find_images(directory, extensions):
    """Return the regular files directly in ``directory`` whose suffix,
    compared case-insensitively, is in ``extensions``; sorted by path."""
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in extensions
    )


def main():
    """OCR every image in the current directory and record Instagram handles.

    For each image, one JSON line ``{"file": ..., "ig": ...}`` per handle is
    written to ``instagram_handles.jsonl``. Images that yielded at least one
    handle are then moved into the ``processed`` directory; images without
    handles are left in place.
    """
    # Create processed directory if it doesn't exist
    processed_dir = Path('processed')
    processed_dir.mkdir(exist_ok=True)

    # Get all image files in current directory. Matching on the lower-cased
    # suffix finds mixed-case extensions (".Jpg") and lists each file once,
    # even on case-insensitive filesystems. Only direct children that are
    # regular files are returned, so nothing inside ``processed`` is picked up
    # and no name-based "processed" filter is needed.
    image_files = _find_images(Path('.'), {'.heic', '.jpg', '.jpeg', '.png'})

    # NOTE(review): mode 'w' truncates the results of earlier runs, although
    # the images that produced them have already been moved to ``processed``.
    # Confirm whether append mode is wanted instead.
    with open('instagram_handles.jsonl', 'w', encoding='utf-8') as f:
        for image_path in image_files:
            print(f"Processing {image_path}")
            text = process_image(image_path)
            handles = extract_instagram_handles(text)
            if not handles:
                continue

            # Write results to JSONL file, one line per handle
            f.writelines(
                json.dumps({'file': image_path.name, 'ig': handle}) + '\n'
                for handle in handles
            )

            # Move file to processed directory
            shutil.move(str(image_path), str(processed_dir / image_path.name))
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()