-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakeImageRecords.py
More file actions
146 lines (117 loc) · 3.45 KB
/
MakeImageRecords.py
File metadata and controls
146 lines (117 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# generate the csv file to join to the specimen records
# add views from filenames
import os
import csv
import re
import pandas as pd
from view_codes import make_view

### SETTINGS
datafile_dir = r''  # folder containing the specimen data file
datafile = r''  # filename of the specimen CSV read with pandas below
image_catnum_field = '' # the field in the dataset that has catalog numbers formatted to match those in the file name - the joining field.
# dataset fields whose values are copied onto each image record as keywords
keyword_fields = [
    'country',
    'species',
    'typestatus'
]
image_dir = r'G:\PRE' # the folder with the images; does not include subfolders
outputfile = r'Iziko-mammals-types-2023.csv' # written into image_dir
fileext = '.tif'  # only files ending with this extension (case-insensitive) are processed
# THE SCRIPT

# List the image directory; abort with a readable message on any OS-level
# failure (missing drive, bad path, permissions).
print('reading image directory')
try:
    dir_contents = os.listdir(image_dir)
except OSError as ex:
    print('Oops! ' + str(ex))
    raise SystemExit

# Keep only files with the configured extension, case-insensitively.
images = [name for name in dir_contents if name.lower().endswith(fileext.lower())]

# Load the specimen dataset; empty strings and single spaces are treated as
# missing values (pandas' default NA tokens are disabled).
print('reading data file...')
fullpath = os.path.join(datafile_dir, datafile)
try:
    df = pd.read_csv(fullpath, na_values=['', ' '], keep_default_na=False)
except Exception as e:
    # read_csv can fail many ways (missing file, parse error); report and stop
    print(e)
    print('Quitting...')
    raise SystemExit
if df.empty:
    print('No records found in', datafile)
    print('Quitting...')
    raise SystemExit
print(df.shape[0], 'records read from', datafile)
# Index the dataset by catalog number so each image can be joined with a
# direct .loc lookup; duplicate catalog numbers would make the join
# ambiguous, so verify_integrity makes set_index raise and we abort.
try:
    df = df.set_index(image_catnum_field, drop=False, verify_integrity=True)
except ValueError:
    # fixed: the original mixed a print() comma with string concatenation,
    # producing a stray space inside the quoted field name
    print("There are duplicate values in the field '" + image_catnum_field + "'")
    print('Please remove the duplicates and try again.')
    print('Quitting...')
    raise SystemExit

# Check the dataset contains every configured keyword field before the
# main loop starts, so a typo fails fast instead of mid-run.
missingfields = [field for field in keyword_fields if field not in df]
if missingfields:
    print('The following keyword fields are not in the dataset. Check spelling, case, and fix the dataset if needed:')
    print(' | '.join(missingfields))
    raise SystemExit
print('making image tags dataset')
rows = []            # one output record per successfully matched image
missing = set()      # catalog numbers with no row in the dataset
code_errors = set()  # image filenames containing an unrecognized view code

for image in images:
    # Filenames look like CATNUM_code1_code2.ext: catalog number first,
    # then one or more view codes, separated by underscores.
    catalognumber = image.split('_')[0]
    row = {
        "filename": image,
        "views": []
    }
    parts = re.split(r"[_\.]", image)
    parts.pop()        # drop the file extension
    parts = parts[1:]  # drop the catalog number
    has_code_errors = False
    for part in parts:
        try:
            view = make_view(part)
        except Exception:
            # narrowed from a bare except: make_view rejected this code;
            # record the filename and keep scanning the remaining parts
            code_errors.add(image)
            has_code_errors = True
            continue
        if view:
            row["views"].append(view)
    if has_code_errors:
        # skip images with any unrecognized code; they are reported later
        continue
    row["views"] = ','.join(row['views'])

    # Join to the specimen record; .loc on the catalog-number index raises
    # KeyError when the specimen is absent (the original comment already
    # noted "it can only be a key error", so catch exactly that).
    try:
        data_record = df.loc[catalognumber]
    except KeyError:
        missing.add(catalognumber)
        continue
    # Copy the configured keyword values onto the image record, normalizing
    # pandas missing values to None so the CSV writer emits empty cells.
    for keyword_field in keyword_fields:
        keyword_value = data_record[keyword_field]
        if pd.isna(keyword_value):
            keyword_value = None
        row[keyword_field] = keyword_value
    rows.append(row)
# Report results and save. The code-errors report was duplicated verbatim in
# both branches of the original if/else; it is emitted once here, with the
# same output in every combination of rows / missing / code_errors.
if rows:
    print('saving image dataset file with', len(rows), 'records')
    with open(os.path.join(image_dir, outputfile), 'w', encoding='UTF8', newline='', errors='ignore') as f:
        fields = ['filename', 'views', *keyword_fields]
        dict_writer = csv.DictWriter(f, fields)
        dict_writer.writeheader()
        dict_writer.writerows(rows)
    if missing:
        print("The following specimens are not in the data file:")
        for num in missing:
            print(num)
elif not code_errors:
    # no rows saved and no code errors: nothing matched at all
    print("no images matched records in the dataset")
if code_errors:
    print("The following images have code errors:")
    for image in code_errors:
        print(image)
print('All done, bye bye now...')