-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakeImageRecords.py
More file actions
146 lines (117 loc) · 3.45 KB
/
MakeImageRecords.py
File metadata and controls
146 lines (117 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# generate the csv file to join to the specimen records
# add views from filenames
import os
import csv
import re
import pandas as pd
from view_codes import make_view

### SETTINGS
datafile_dir = r''  # folder containing the specimen data file
datafile = r''  # filename of the specimen CSV read with pandas below
image_catnum_field = '' # the field in the dataset that has catalog numbers formatted to match those in the file name - the joining field.
# dataset fields whose values are copied onto each image record as keywords
keyword_fields = [
    'country',
    'species',
    'typestatus'
]
image_dir = r'G:\PRE' # the folder with the images; does not include subfolders
outputfile = r'Iziko-mammals-types-2023.csv' # written into image_dir
fileext = '.tif'  # only files ending with this extension (case-insensitive) are processed
# THE SCRIPT

# List the image directory; abort with a readable message on any OS-level
# failure (missing drive, bad path, permissions).
print('reading image directory')
try:
    dir_contents = os.listdir(image_dir)
except OSError as ex:
    print('Oops! ' + str(ex))
    raise SystemExit

# Keep only files with the configured extension, case-insensitively.
images = [name for name in dir_contents if name.lower().endswith(fileext.lower())]

# Load the specimen dataset; empty strings and single spaces are treated as
# missing values (pandas' default NA tokens are disabled).
print('reading data file...')
fullpath = os.path.join(datafile_dir, datafile)
try:
    df = pd.read_csv(fullpath, na_values=['', ' '], keep_default_na=False)
except Exception as e:
    # read_csv can fail many ways (missing file, parse error); report and stop
    print(e)
    print('Quitting...')
    raise SystemExit
if df.empty:
    print('No records found in', datafile)
    print('Quitting...')
    raise SystemExit
print(df.shape[0], 'records read from', datafile)
# Index the dataset by catalog number so each image can be joined with a
# direct .loc lookup; duplicate catalog numbers would make the join
# ambiguous, so verify_integrity makes set_index raise and we abort.
try:
    df = df.set_index(image_catnum_field, drop=False, verify_integrity=True)
except ValueError:
    # fixed: the original mixed a print() comma with string concatenation,
    # producing a stray space inside the quoted field name
    print("There are duplicate values in the field '" + image_catnum_field + "'")
    print('Please remove the duplicates and try again.')
    print('Quitting...')
    raise SystemExit

# Check the dataset contains every configured keyword field before the
# main loop starts, so a typo fails fast instead of mid-run.
missingfields = [field for field in keyword_fields if field not in df]
if missingfields:
    print('The following keyword fields are not in the dataset. Check spelling, case, and fix the dataset if needed:')
    print(' | '.join(missingfields))
    raise SystemExit
print('making image tags dataset')
rows = []            # one output record per successfully matched image
missing = set()      # catalog numbers with no row in the dataset
code_errors = set()  # image filenames containing an unrecognized view code

for image in images:
    # Filenames look like CATNUM_code1_code2.ext: catalog number first,
    # then one or more view codes, separated by underscores.
    catalognumber = image.split('_')[0]
    row = {
        "filename": image,
        "views": []
    }
    parts = re.split(r"[_\.]", image)
    parts.pop()        # drop the file extension
    parts = parts[1:]  # drop the catalog number
    has_code_errors = False
    for part in parts:
        try:
            view = make_view(part)
        except Exception:
            # narrowed from a bare except: make_view rejected this code;
            # record the filename and keep scanning the remaining parts
            code_errors.add(image)
            has_code_errors = True
            continue
        if view:
            row["views"].append(view)
    if has_code_errors:
        # skip images with any unrecognized code; they are reported later
        continue
    row["views"] = ','.join(row['views'])

    # Join to the specimen record; .loc on the catalog-number index raises
    # KeyError when the specimen is absent (the original comment already
    # noted "it can only be a key error", so catch exactly that).
    try:
        data_record = df.loc[catalognumber]
    except KeyError:
        missing.add(catalognumber)
        continue
    # Copy the configured keyword values onto the image record, normalizing
    # pandas missing values to None so the CSV writer emits empty cells.
    for keyword_field in keyword_fields:
        keyword_value = data_record[keyword_field]
        if pd.isna(keyword_value):
            keyword_value = None
        row[keyword_field] = keyword_value
    rows.append(row)
# Report results and save. The code-errors report was duplicated verbatim in
# both branches of the original if/else; it is emitted once here, with the
# same output in every combination of rows / missing / code_errors.
if rows:
    print('saving image dataset file with', len(rows), 'records')
    with open(os.path.join(image_dir, outputfile), 'w', encoding='UTF8', newline='', errors='ignore') as f:
        fields = ['filename', 'views', *keyword_fields]
        dict_writer = csv.DictWriter(f, fields)
        dict_writer.writeheader()
        dict_writer.writerows(rows)
    if missing:
        print("The following specimens are not in the data file:")
        for num in missing:
            print(num)
elif not code_errors:
    # no rows saved and no code errors: nothing matched at all
    print("no images matched records in the dataset")
if code_errors:
    print("The following images have code errors:")
    for image in code_errors:
        print(image)
print('All done, bye bye now...')