-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathuscxmlparse.py
More file actions
executable file
·98 lines (74 loc) · 3.14 KB
/
uscxmlparse.py
File metadata and controls
executable file
·98 lines (74 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding=utf-8 -*-
__author__ = 'Clive'
import os
from os import walk
import sqlite3
import xmlhelper
import dbhelper
import gzip
import shutil
# Directory (relative to this script) that holds the USC XML files.
xmlpath = r"./xml/"

# Index table: one row per title, keyed on the title's number text.
titleNameTable = "titleNames"
titleNameTableKeyCol = "titleNumText"
titleNameTableFormat = (
    "(_id INTEGER PRIMARY KEY AUTOINCREMENT, "
    "titleNumText TEXT, "
    "titleName TEXT)"
)

# Per-title tables are named <prefix><docNumber> and share this column layout.
titleTableNamePrefix = "t"
titleTableFormat = (
    "(_id INTEGER PRIMARY KEY AUTOINCREMENT, "
    "pid INTEGER, "
    "Level TEXT, "
    "Heading TEXT, "
    "Content TEXT)"
)
# find the path of where the xml files are stored.
# Resolve it against the script's absolute location *before* changing
# directory: __file__ may be a relative path, in which case dirname() is ''
# and the old "'' + '/' + xmlpath" expression pointed at the filesystem root.
# The trailing separator of xmlpath is kept on purpose, in case helpers build
# file names by plain concatenation with mypath.
mypath = os.path.join(os.path.dirname(os.path.abspath(__file__)), xmlpath)
# change the current directory to the xml directory, so that the relative
# file names used later (database files, renames) refer to the same
# directory that is scanned for xml files
os.chdir(mypath)
# exclude for now; need to either fix code to handle special case or do by hand
# Entries may be given with or without the ".xml" extension.
excludeFiles = [] # ["usc05A.xml", "usc11A.xml", "usc18A.xml", "usc28a.xml", "usc50a.xml"]
# get all the xml files in the directory (sorted by file name)
xmlFiles = []
for (dirpath, dirnames, filenames) in walk(mypath):
    filenames.sort()
    for f in filenames:
        froot, fext = os.path.splitext(f)
        # compare both the full name and the extension-less root, so the
        # documented "uscNN.xml" style entries actually exclude the file
        if fext == ".xml" and f not in excludeFiles and froot not in excludeFiles:
            xmlFiles.append(f)
    # Only the top-level directory is used: the bare file names stored above
    # are later resolved against mypath / the working directory, so files
    # found in sub-directories could not be located.
    break
# File name of the title-name index database (created in the working directory).
titleNameTabbleDb = titleNameTable.lower() + ".db"
# start from a fresh index: remove the database left over from a previous run
if os.path.isfile(titleNameTabbleDb):
    os.remove(titleNameTabbleDb)
# isolation_level=None puts the connection in autocommit mode, see
# https://github.com/ghaering/pysqlite/issues/109
# The options are passed by keyword: positional use of everything after the
# database argument is deprecated in recent Python versions.
dbcon = sqlite3.connect(
    titleNameTabbleDb,
    timeout=5.0,
    detect_types=0,
    isolation_level=None,
)
dbcon.text_factory = str
# Parse every collected xml file: record its number/heading in the title-name
# index and write its content into a per-title SQLite database named
# <titleTableNamePrefix><docNumber>.db.
# "with dbcon" only scopes a transaction and does not close the connection,
# so the connection is closed explicitly once all files are processed.
try:
    with dbcon:
        dbcur = dbcon.cursor()
        # create the table of titles if it doesn't exist
        dbhelper.check_table(dbcur, titleNameTable, titleNameTableFormat)
        for f in xmlFiles:
            print("Parsing " + f + "...")
            root = xmlhelper.extract_xml(f, mypath)
            # get the title
            titleNum = root.find('./meta/docNumber').text
            titleTable = titleTableNamePrefix + titleNum
            mainroot = root.find('./main/title')
            if mainroot is None:
                # no <main>/<title> element: fall back to the appendix root
                mainroot = root.find('./appendix')
            titleNumText = mainroot.find('./num').text
            # if titleNumText and titleNumText.endswith(u'—'):
            #     titleNumText = titleNumText[:-1]
            titleHeading = mainroot.find('./heading').text
            dbhelper.insert_if_not_in_table(dbcur, titleNameTable, titleNameTableKeyCol, titleNumText, titleHeading)
            dbconTitleDB = sqlite3.connect(titleTable + ".db")
            try:
                dbconTitleDB.text_factory = str
                dbcurTitleCur = dbconTitleDB.cursor()
                # rebuild the title table from scratch on every run
                dbcurTitleCur.execute("DROP TABLE IF EXISTS '" + titleTable + "'")
                dbcurTitleCur.execute("CREATE TABLE '" + titleTable + "' " + titleTableFormat)
                xmlhelper.parse_title(mainroot, dbcurTitleCur, titleTable, xmlhelper.aboveSectionLevels, 0)
                # VACUUM cannot run inside a transaction: commit the pending
                # work explicitly, then switch to autocommit mode
                # https://github.com/ghaering/pysqlite/issues/109
                dbconTitleDB.commit()
                dbconTitleDB.isolation_level = None
                dbcurTitleCur.execute("VACUUM")
            finally:
                # release the per-title database even if parsing failed
                dbconTitleDB.close()
            dbcon.commit()
            print(f + " parsed")
            # mark the source file as processed so a re-run skips it
            os.rename(f, f + "-bak")
finally:
    dbcon.close()
# NOTE: dbFiles is not used below; kept so the module's globals are unchanged.
dbFiles = []
# gzip every SQLite database found under mypath, writing <name>.db.gz next to
# the original file
for (dirpath2, dirnames2, filenames) in walk(mypath):
    for f in filenames:
        froot, fext = os.path.splitext(f)
        if fext == ".db":
            # walk() yields bare names; join with the directory they were
            # found in instead of relying on the current working directory
            dbpath = os.path.join(dirpath2, f)
            with open(dbpath, "rb") as f_in, gzip.open(dbpath + '.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)