forked from iconclass/data
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_sqlite.py
More file actions
180 lines (161 loc) · 5.61 KB
/
make_sqlite.py
File metadata and controls
180 lines (161 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import sqlite3
import os
import traceback
import sys
SCHEMA = [
"""CREATE TABLE IF NOT EXISTS "notations" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"notation" TEXT,
"children" TEXT,
"refs" TEXT,
"key" TEXT
)""",
"""CREATE TABLE IF NOT EXISTS "keys" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"code" TEXT,
"suffix" TEXT
)""",
"""CREATE TABLE IF NOT EXISTS "texts" (
"ref" INTEGER,
"type" INTEGER,
"language" TEXT,
"text" TEXT
)""",
"""CREATE INDEX IF NOT EXISTS "texts_ref" ON "texts" ("ref")""",
"""CREATE INDEX IF NOT EXISTS "notations_notation" ON "notations" ("notation")""",
"""CREATE INDEX IF NOT EXISTS "keys_code" ON "keys" ("code")""",
]
def parse_dbtxt(data):
obj = {}
buf = []
last_field = None
for line in data.split("\n"):
if line.startswith("#"):
continue
data = line.split(" ")
if len(data) < 2:
continue
field = data[0].lower()
data = " ".join(data[1:])
if field == ";":
buf.append(data)
elif field != last_field:
if buf:
obj[last_field] = buf
buf = [data]
last_field = field
if field in ("n", "k", "code", "suffix"):
buf = buf[0]
if buf:
obj[last_field] = buf
for k, v in obj.copy().items():
if k.startswith("txt_"):
obj.setdefault("txt", {})[k[4:]] = v
del obj[k]
if k.startswith("kwd_"):
obj.setdefault("kw", {})[k[4:]] = v
del obj[k]
return obj
def read_notations(filename, cursor):
print("Reading notations")
rowid = 1
INSERT_SQL = "INSERT INTO notations VALUES (?, ?, ?, ?, ?)"
notation_ids = {}
with open(filename, "rt", encoding="utf8") as input_file:
for lineno, chunk in enumerate(input_file.read().split("\n$")):
try:
obj = parse_dbtxt(chunk)
except:
print(
f"Problem with notations in {filename} on line {lineno}: {repr(chunk)}"
)
return None
notation = obj.get("n")
children = "|".join(obj.get("c", [])) or None
refs = "|".join(obj.get("r", [])) or None
key = obj.get("k")
if notation:
data = (rowid, notation, children, refs, key)
cursor.execute(INSERT_SQL, data)
notation_ids[notation] = rowid
rowid += 1
return notation_ids
def read_texts(txt_type, notation_ids, filename, language, cursor):
print("Reading %s texts from %s" % (language, filename))
INSERT_SQL = (
"INSERT INTO texts (ref, type, language, text) VALUES (?, %s, ?, ?)" % txt_type
)
with open(filename, "rt", encoding="utf8") as input_file:
for line in input_file.read().split("\n"):
if line.startswith("#"):
continue
line = line.strip()
tmp = line.split("|")
if len(tmp) != 2:
continue
notation, txt = tmp
ref = notation_ids.get(notation)
if not ref:
continue
data = (ref, language, txt)
cursor.execute(INSERT_SQL, data)
def read_keys(notation_ids, filename, cursor):
print("Reading keys")
# get the maximum row id for notations
# the keys id should start from the maximum notation ids plus 1
# as the texts table have infor on both notations and keys
row_id = max(notation_ids.values())
INSERT_SQL1 = "INSERT INTO keys (id, code, suffix) VALUES (?, ?, ?)"
INSERT_SQL2 = "INSERT INTO texts (ref, type, language, text) VALUES (?, ?, ?, ?)"
with open(filename, "rt", encoding="utf8") as input_file:
for chunk in input_file.read().split("\n$"):
row_id += 1
obj = parse_dbtxt(chunk)
code = obj.get("code")
suffix = obj.get("suffix")
if not code and suffix:
continue
data = (row_id, code, suffix)
cursor.execute(INSERT_SQL1, data)
notation_ids[code] = row_id
# And also insert the texts and keywords
for language, v in obj.get("txt", {}).items():
for vv in v:
data = (row_id, 0, language, vv)
cursor.execute(INSERT_SQL2, data)
for language, v in obj.get("kw", {}).items():
for vv in v:
data = (row_id, 1, language, vv)
cursor.execute(INSERT_SQL2, data)
if __name__ == "__main__":
db = sqlite3.connect("iconclass.sqlite")
cursor = db.cursor()
for statement in SCHEMA:
try:
cursor.execute(statement)
except sqlite3.OperationalError:
traceback.print_exc()
print("Problem with ---> [", end="")
print(statement, end="")
print("]")
sys.exit(1)
# Read the structure
notation_ids = read_notations("notations.txt", cursor)
if not notation_ids:
sys.exit(1)
# Read the texts
for dirpath, dirs, files in os.walk("."):
for filename in files:
if filename.startswith("kw_"):
language = filename[3:5]
read_texts(
1, notation_ids, os.path.join(dirpath, filename), language, cursor
)
elif filename.startswith("txt_"):
language = filename[4:6]
read_texts(
0, notation_ids, os.path.join(dirpath, filename), language, cursor
)
# read the keys
read_keys(notation_ids, "keys.txt", cursor)
db.commit()