forked from hackintoshrao/sqlglot
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtesting.py
More file actions
201 lines (161 loc) · 6.41 KB
/
testing.py
File metadata and controls
201 lines (161 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from typing import Dict, List, Any
from sqlglot import parse_one, parse
from sqlglot import exp
def parse_sql(sql: str, dialect: str = "snowflake") -> List[Dict[str, Any]]:
"""
Parses the given SQL string and returns the structured `infos` list.
Args:
sql (str): The SQL query string to parse.
dialect (str): The SQL dialect to use for parsing (default is "snowflake").
Returns:
List[Dict[str, Any]]: The structured information extracted from the SQL query.
"""
# Parse the SQL into a list of expressions (handles multiple statements)
parsed = parse(sql, read=dialect)
infos = []
for statement in parsed:
if isinstance(statement, exp.Select):
infos.extend(parse_select(statement))
else:
# Handle other SQL statement types if necessary
pass
return infos
def parse_select(select_expr: exp.Select) -> List[Dict[str, Any]]:
"""
Parses a SELECT expression and extracts the relevant information.
Args:
select_expr (exp.Select): The SELECT expression to parse.
Returns:
List[Dict[str, Any]]: The extracted information from the SELECT expression.
"""
infos = []
main_info = {"tables": [], "columns": [], "where_columns": [], "limits": []}
# Extract tables from FROM clause
from_expr = select_expr.args.get("from")
if from_expr:
for table in from_expr.find_all(exp.Table):
main_info["tables"].append(table.name)
# Extract columns from SELECT expressions
for select_exp in select_expr.expressions:
column = extract_column(select_exp)
if column:
main_info["columns"].append(column)
# Extract WHERE clause
where_expr = select_expr.args.get("where")
if where_expr:
main_info["where_columns"] = process_where(where_expr.this)
# Extract LIMIT clause
limit_expr = select_expr.args.get("limit")
if limit_expr:
main_info["limits"].append(limit_expr.sql())
# Append main_info to infos
infos.append(main_info)
# Process JOINs
joins = select_expr.args.get("joins", [])
for join in joins:
join_info = {"tables": [], "columns": [], "where_columns": [], "limits": []}
join_table = join.this
if isinstance(join_table, exp.Table):
join_info["tables"].append(join_table.name)
elif isinstance(join_table, exp.Subquery):
# Recursively process the subquery in the JOIN
sub_infos = parse_select(join_table.this)
if sub_infos:
# Assuming the first dictionary corresponds to the subquery
sub_info = sub_infos[0]
join_info["tables"].extend(sub_info["tables"])
join_info["columns"].extend(sub_info["columns"])
join_info["where_columns"].extend(sub_info["where_columns"])
join_info["limits"].extend(sub_info["limits"])
# Process the JOIN condition (ON clause)
on_expr = join.args.get("on")
if on_expr:
join_info["where_columns"] = process_where(on_expr.this)
# Append join_info to infos
infos.append(join_info)
return infos
def extract_column(select_exp: exp.Expression) -> str:
"""
Extracts the column name from a SELECT expression.
Args:
select_exp (exp.Expression): The SELECT expression.
Returns:
str: The column name or expression.
"""
if isinstance(select_exp, exp.Column):
return select_exp.name
elif isinstance(select_exp, exp.Star):
return "*"
else:
# Handle expressions like functions or aliases
return select_exp.sql()
def process_where(where_condition: exp.Expression) -> List[Dict[str, Any]]:
"""
Processes the WHERE condition to extract subqueries and their information.
Args:
where_condition (exp.Expression): The WHERE condition expression.
Returns:
List[Dict[str, Any]]: A list of condition dictionaries extracted from the WHERE clause.
"""
where_columns = []
# Handle logical operators (AND, OR)
if isinstance(where_condition, (exp.And, exp.Or)):
# Recursively process each part of the logical condition
for arg in where_condition.args.values():
if isinstance(arg, list):
for expr in arg:
where_columns.extend(process_where(expr))
else:
where_columns.extend(process_where(arg))
else:
# Check for subqueries within the condition
subqueries = where_condition.find_all(exp.Subquery)
for subquery in subqueries:
sub_select = subquery.this
if isinstance(sub_select, exp.Select):
sub_info = {"tables": [], "columns": [], "where_columns": [], "limits": []}
# Extract tables from the subquery's FROM clause
from_expr = sub_select.args.get("from")
if from_expr:
for table in from_expr.find_all(exp.Table):
sub_info["tables"].append(table.name)
# Extract columns from the subquery's SELECT expressions
for select_exp in sub_select.expressions:
column = extract_column(select_exp)
if column:
sub_info["columns"].append(column)
# Extract WHERE clause from the subquery
where_expr = sub_select.args.get("where")
if where_expr:
sub_info["where_columns"] = process_where(where_expr.this)
# Extract LIMIT clause from the subquery
limit_expr = sub_select.args.get("limit")
if limit_expr:
sub_info["limits"].append(limit_expr.sql())
# Append the subquery info to where_columns
where_columns.append(sub_info)
return where_columns
# Example Usage
if __name__ == "__main__":
test_sql = """
SELECT a, b
FROM table
WHERE colc > (
SELECT d
FROM t2
);
SELECT a, b
FROM table2;
SELECT x, y
FROM table3
WHERE x>10 and colx IN (
SELECT *
FROM t4
);
"""
print(repr(parse_one(test_sql, read="snowflake")))
# Parse the SQL and extract infos
infos = parse_sql(test_sql, dialect="snowflake")
# Print the resulting infos
import pprint
pprint.pprint(infos)