-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.py
More file actions
185 lines (155 loc) · 4.37 KB
/
query.py
File metadata and controls
185 lines (155 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from utils import *
from expression_parser import query_to_tree
from urllib.parse import urlparse
import time
# Sentinel (comment_score, comment_id) pair. token_iterator() yields it after
# hitting its limit, score_iterator() yields it when its rows run out, and
# query() strips it from the tail of its result list.
# NOTE(review): despite the name, (-inf, 0) compares *less* than any row with
# a finite score, so in an ascending merge it sorts first — confirm intended.
kMaxVal = (float('-inf'), 0)
# Default cap on how many values the iterators/merges below yield.
kDefaultLimit = 1000
# Default number of rows requested per SQL query (LIMIT) by the row iterators.
kDefaultChunkSize = 1000
class Hash64:
    """Callable that maps a string to an unsigned 64-bit integer hash."""

    def __call__(self, x):
        """Return the trailing 64 bits of SHA-256(``x``) as a non-negative int.

        ``x`` must be a ``str``; it is encoded with ``str.encode()``'s default
        codec (UTF-8) before hashing.
        """
        digest = hashlib.sha256(x.encode()).digest()
        # The last 8 digest bytes are exactly the last 16 hex digits of the
        # hex digest, read as a big-endian integer.
        return int.from_bytes(digest[-8:], 'big')
# Shared hasher instance; token_iterator() uses it to turn a token string into
# the value it looks up in the tokens.token_hash column.
hashfn = Hash64()
def intersect(*iters, limit=kDefaultLimit):
    """Yield the values that every one of ``iters`` is currently agreeing on.

    Thin wrapper around ``atleast`` with ``k`` set to the number of inputs,
    so ordering requirements and termination behaviour are those of
    ``atleast``. At most ``limit`` values are produced (see ``atleast`` for
    the exact cut-off rule).
    """
    required = len(iters)
    return atleast(*iters, k=required, limit=limit)
def union(*iters, limit=kDefaultLimit):
    """Merge ``iters`` by delegating to ``atleast`` with ``k=1``.

    A value needs to be at the head of only one input to be yielded.
    Ordering requirements and termination behaviour are those of
    ``atleast``; ``limit`` is passed through unchanged.
    """
    merged = atleast(*iters, k=1, limit=limit)
    return merged
def atleast(*iters, k=2, limit=kDefaultLimit):
    """Merge sorted inputs, yielding each value found in at least ``k`` of them.

    This is a k-way merge: the smallest current head value is examined, it is
    yielded if at least ``k`` inputs are positioned on it, and every input
    positioned on it is then advanced. The result is only meaningful when each
    input yields its values in ascending order without repeats.

    Unlike a merge that stops at the first exhausted input, an input that runs
    out is simply dropped and the merge carries on with the others. It ends
    once fewer than ``k`` inputs remain, because no further value could reach
    ``k`` matches. For ``k == len(iters)`` that is the same as stopping at the
    first exhausted input.

    Sentinel values (such as ``kMaxVal``) get no special treatment here; they
    are compared and yielded like any other value.

    Args:
        *iters: Iterators or iterables of mutually comparable values.
        k: Minimum number of inputs that must contain a value for it to be
            yielded. Must satisfy ``1 <= k <= len(iters)``.
        limit: Stop after this many values have been yielded. The check runs
            after each yield, so at least one value is produced if any
            qualifies, even for ``limit <= 0``.

    Yields:
        Qualifying values in ascending order.

    Raises:
        AssertionError: On first iteration, if ``k`` is out of range.
    """
    assert 1 <= k <= len(iters)

    # Prime each input with its first value. Inputs that are empty from the
    # start never join the merge instead of aborting it.
    live = []   # inputs that still have values
    heads = []  # heads[i] is the current, not yet consumed value of live[i]
    for source in iters:
        source = iter(source)
        try:
            first = next(source)
        except StopIteration:
            continue
        live.append(source)
        heads.append(first)

    num_returned = 0
    while len(live) >= k:
        minval = min(heads)
        at_min = [i for i, v in enumerate(heads) if v == minval]
        if len(at_min) >= k:
            yield minval
            num_returned += 1
            if num_returned >= limit:
                return
        # Advance every input positioned on the minimum. Indices are walked
        # from the back so that deleting an exhausted input does not shift
        # the positions still to be processed.
        for i in reversed(at_min):
            try:
                heads[i] = next(live[i])
            except StopIteration:
                del live[i]
                del heads[i]
def token_iterator(token, chunksize=kDefaultChunkSize, limit=kDefaultLimit):
    """Yield ``(comment_score, comment_id)`` rows from ``tokens`` for one token.

    Rows are read through the module-level cursor ``c`` in pages of
    ``chunksize`` using LIMIT/OFFSET, ordered by ``comment_score`` then
    ``comment_id``.

    Args:
        token: Token string to look up, or ``None`` to read the rows stored
            under ``token_hash = 0``.
        chunksize: Number of rows requested per SQL query.
        limit: After this many rows have been yielded, ``kMaxVal`` is yielded
            once and the generator ends. If the rows run out first, the
            generator ends without yielding ``kMaxVal``.

    Yields:
        Row tuples as returned by the cursor, possibly followed by ``kMaxVal``.
    """
    # hashfn() returns a value in [0, 2**64); subtracting 2**63 shifts it into
    # the signed 64-bit range (presumably to fit SQLite's INTEGER — confirm).
    h = 0 if token is None else hashfn(token) - (1 << 63)

    emitted = 0
    offset = 0
    while True:
        rows = c.execute(f"""
SELECT comment_score, comment_id
FROM tokens
WHERE token_hash={h}
ORDER BY comment_score, comment_id
LIMIT {chunksize}
OFFSET {offset}""").fetchall()
        if not rows:
            # An empty page means the previous page was the last one.
            return
        for row in rows:
            yield row
            emitted += 1
            if emitted >= limit:
                yield kMaxVal
                return
        offset += len(rows)
def score_iterator(score, chunksize=kDefaultChunkSize, limit=kDefaultLimit, op='>'):
    """Yield ``(comment_score, comment_id)`` rows filtered by ``comment_score``.

    Reads the rows stored under ``token_hash = 0`` whose ``comment_score``
    satisfies ``comment_score <op> <score>``, through the module-level cursor
    ``c``, in pages of ``chunksize`` using LIMIT/OFFSET and ordered by
    ``comment_score`` then ``comment_id``.

    Args:
        score: Value compared against the ``comment_score`` column.
        chunksize: Number of rows requested per SQL query.
        limit: Maximum number of rows to yield. Reaching it ends the generator
            without a trailing ``kMaxVal``.
        op: SQL comparison operator placed between the column and ``score``.

    Yields:
        Row tuples as returned by the cursor. When the rows run out before
        ``limit`` is reached, ``kMaxVal`` is yielded once as the final value.

    Note:
        ``op`` and ``score`` are interpolated verbatim into the SQL text, so
        callers must only pass trusted values.
    """
    num_returned = 0
    offset = 0
    while True:
        r = c.execute(f"""
SELECT comment_score, comment_id
FROM tokens
WHERE comment_score{op}{score}
AND token_hash=0
ORDER BY comment_score, comment_id
LIMIT {chunksize}
OFFSET {offset}""").fetchall()
        # Debug output: first rows of every fetched page.
        print('score', r[:10])
        if not r:
            yield kMaxVal
            return
        for row in r:
            yield row
            num_returned += 1
            # Honour the caller's limit. Comparing against kDefaultLimit here
            # would cap callers that ask for limit=float('inf') at the default.
            if num_returned >= limit:
                return
        offset += len(r)
# NOTE(review): MyHTMLParser is not defined in the code visible here;
# presumably it is provided by `from utils import *` — confirm. `parser` is
# not referenced by any other code visible in this file.
parser = MyHTMLParser()
# Import-time side effect: opens (creating if absent) the SQLite file 'new.db'
# relative to the current working directory.
conn = sqlite3.connect('new.db')
# Module-level cursor that token_iterator(), score_iterator() and query() use
# for all of their SQL.
c = conn.cursor()
def tree_to_iter(tree, limit=kDefaultLimit):
    """Turn a parsed query tree into an iterator of matching rows.

    Node handling, keyed on ``tree.op``:

    * ``'*'``: ``intersect`` of all children.
    * ``'+'``: ``union`` of all children.
    * ``'>'``: first child must be a ``'+'`` node and the second child's
      ``op`` an integer N; yields values present in at least N + 1 of the
      ``'+'`` node's children (via ``atleast``).
    * ``'score>N'`` / ``'score<N'`` / ``'score=N'``: ``score_iterator`` with
      the threshold negated and the inequality flipped (consistent with
      scores being stored negated — TODO confirm against the indexer).
    * anything else: ``token_iterator`` for ``tree.op`` as a token.

    Child iterators are always built with ``limit=float('inf')``; only the
    iterator returned for this node is given ``limit``.

    Prints ``limit`` and ``tree`` on every call (debug output).
    """
    print(limit, tree)
    node_op = tree.op

    def unbounded_children(node):
        # Sub-iterators must not truncate themselves; the cap belongs to the
        # merge that consumes them.
        return [tree_to_iter(child, limit=float('inf')) for child in node.children]

    if node_op == '*':
        return intersect(*unbounded_children(tree), limit=limit)
    if node_op == '+':
        return union(*unbounded_children(tree), limit=limit)
    if node_op == '>':
        candidates, threshold = tree.children[0], tree.children[1]
        assert candidates.op == '+'
        thresh = int(threshold.op) + 1
        return atleast(*unbounded_children(candidates), limit=limit, k=thresh)

    prefix = node_op[:6]
    for marker, sql_op in (('score>', '<'), ('score<', '>'), ('score=', '=')):
        if prefix == marker:
            return score_iterator(-int(node_op[6:]), limit=limit, op=sql_op)

    return token_iterator(node_op, limit=limit)
"""
TODO: if one (or more) of the query tokens is very common it can
take a very long time (e.g. 2 seconds!) to execute because an
overwhelming proportion of the common token's documents do not
contain the rare tokens.
A simple solution is to only use rare tokens for merging and use
random access to check the common tokens.
"""
def query(sql_cursor, user_query, max_results=100):
    """Return the comments that contain every token of ``user_query``.

    The query is lower-cased and split on whitespace; each token is looked up
    with ``token_iterator`` and the per-token streams are intersected with
    ``atleast(k=len(tokens))``.

    Args:
        sql_cursor: Currently unused. All SQL in this function and in
            ``token_iterator`` goes through the module-level cursor ``c``.
        user_query: Whitespace-separated tokens that must all be present.
        max_results: Maximum number of comments to return.

    Returns:
        A dict with keys ``"comments"`` (decoded JSON objects, each with its
        ``"tokens"`` string re-ordered alphabetically), ``"tokens"`` (the
        parsed query tokens) and ``"num_excluded"`` (always 0).
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(' ') would turn "a  b" into
    # ['a', '', 'b'] and make the empty string a required token.
    tokens = user_query.lower().split()
    if not tokens:
        # Nothing to search for; atleast() also requires at least one input.
        return {
            "comments": [],
            "tokens": tokens,
            "num_excluded": 0
        }

    it = atleast(
        *[token_iterator(t, limit=float('inf')) for t in tokens],
        k=len(tokens),
        limit=max_results
    )

    # zip() asks range() first, so the merge is never advanced past
    # max_results values.
    hits = [hit for _, hit in zip(range(max_results), it)]
    if hits and hits[-1] == kMaxVal:
        hits.pop()

    comments = []
    for _, comment_id in hits:
        # Bind the id as a parameter instead of formatting it into the SQL.
        row = c.execute(
            "SELECT json FROM comments WHERE comment_id=?", (comment_id,)
        ).fetchone()
        comments.append(json.loads(row[0]))

    for comment in comments:
        comment["tokens"] = ' '.join(sorted(comment["tokens"].split(' ')))

    return {
        "comments": comments,
        "tokens": tokens,
        "num_excluded": 0
    }
if __name__ == '__main__':
    # Manual smoke run. These assignments rebind the module-level `conn` and
    # `c`, opening 'new.db' a second time.
    conn = sqlite3.connect('new.db')
    c = conn.cursor()
    # The result is assigned but not printed or otherwise used here.
    R = query(c, 'year:2020 author:you-get-an-upvote many')
    # Another sample query: graces point year:2020 author:HlynkaCG