Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 41 additions & 29 deletions azure-kusto-data/azure/kusto/data/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
from .exceptions import KustoServiceError


HAS_PANDAS = True

try:
import pandas
from .helpers import to_pandas_datetime, to_pandas_timedelta
except ImportError:
HAS_PANDAS = False


class WellKnownDataSet(Enum):
"""Categorizes data tables according to the role they play in the data set that a Kusto query returns."""

Expand All @@ -26,48 +35,46 @@ class KustoResultRow(object):
def __init__(self, columns, row):
self._value_by_name = {}
self._value_by_index = []
self._seventh_digit = {}
self._hidden_values = []

for i, value in enumerate(row):
column = columns[i]
try:
lower_column_type = column.column_type.lower()
column_type = column.column_type.lower()
except AttributeError:
self._value_by_index.append(value)
self._value_by_name[columns[i]] = value
if HAS_PANDAS:
self._hidden_values.append(value)
continue

if lower_column_type in ["datetime", "timespan"]:
if column_type in ["datetime", "timespan"]:
if value is None:
typed_value = None
if HAS_PANDAS:
self._hidden_values.append(None)
else:
try:
# If you are here to read this, you probably hit some datetime/timedelta inconsistencies.
# Azure-Data-Explorer(Kusto) supports 7 decimal digits, while the corresponding python types supports only 6.
# What we do here, is remove the 7th digit, if exists, and create a datetime/timedelta
# from whats left. The reason we are keeping the 7th digit, is to allow users to work with
# this precision in case they want it. One example why one might want this precision, is when
# working with pandas. In that case, use azure.kusto.data.helpers.dataframe_from_result_table
# which takes into account the 7th digit.
char = value.split(":")[2].split(".")[1][6]
if char.isdigit():
tick = int(char)
last = value[-1] if value[-1].isalpha() else ""
typed_value = KustoResultRow.convertion_funcs[lower_column_type](value[:-2] + last)
if tick:
if lower_column_type == "datetime":
self._seventh_digit[column.column_name] = tick
else:
self._seventh_digit[column.column_name] = (
tick if abs(typed_value) == typed_value else -tick
)
else:
typed_value = KustoResultRow.convertion_funcs[lower_column_type](value)
except (IndexError, AttributeError):
typed_value = KustoResultRow.convertion_funcs[lower_column_type](value)
elif lower_column_type in KustoResultRow.convertion_funcs:
typed_value = KustoResultRow.convertion_funcs[lower_column_type](value)
# If you are reading this, you probably hit some datetime/timedelta inconsistencies.
# Azure Data Explorer (Kusto) supports 7 fractional-second digits, while the corresponding Python types support only 6.
# One case where the extra precision matters is when working with pandas.
# In that case, use azure.kusto.data.helpers.dataframe_from_result_table, which takes the original value into account.
typed_value = KustoResultRow.convertion_funcs[column_type](value)

# this is a special case where plain python will lose precision, so we keep the precise value hidden
# when transforming to pandas, we can use the hidden value to convert to precise pandas/numpy types
if HAS_PANDAS:
if column_type == "datetime":
self._hidden_values.append(to_pandas_datetime(value))
if column_type == "timespan":
self._hidden_values.append(to_pandas_timedelta(value, typed_value))
elif column_type in KustoResultRow.convertion_funcs:
typed_value = KustoResultRow.convertion_funcs[column_type](value)
if HAS_PANDAS:
self._hidden_values.append(value)
else:
typed_value = value
if HAS_PANDAS:
self._hidden_values.append(value)

self._value_by_index.append(typed_value)
self._value_by_name[column.column_name] = typed_value
Expand Down Expand Up @@ -129,6 +136,11 @@ def __init__(self, json_table):

self.rows = [KustoResultRow(self.columns, row) for row in json_table["Rows"]]

@property
def _rows(self):
    """Lazily yield the ``_hidden_values`` list of every row in ``self.rows``.

    Rows are produced in the same order as ``self.rows``.
    """
    for result_row in self.rows:
        yield result_row._hidden_values

@property
def rows_count(self):
    """Return how many rows ``self.rows`` currently holds."""
    row_total = len(self.rows)
    return row_total
Expand Down
53 changes: 39 additions & 14 deletions azure-kusto-data/azure/kusto/data/helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,41 @@
"""Kusto helper functions"""
import six

import pandas
from ._models import KustoResultTable

def to_pandas_datetime(raw_value):
    """Convert a raw datetime value into its pandas representation.

    The value is handed unchanged to ``pandas.to_datetime``; pandas is imported
    lazily so that importing this module does not require it.

    :param raw_value: Value to convert, passed through as-is.
    :return: Whatever ``pandas.to_datetime`` returns for ``raw_value``.
    """
    from pandas import to_datetime

    pandas_value = to_datetime(raw_value)
    return pandas_value


def to_pandas_timedelta(raw_value, timedelta_value):
    """Build a ``pandas.Timedelta`` that keeps the sub-microsecond precision of ``raw_value``.

    :param raw_value: The unparsed timespan. Either a number of ticks
        (int/float, 1 tick == 100 nanoseconds) or a string whose text after the
        last ``"."`` holds the fractional-second digits.
    :param datetime.timedelta timedelta_value: The value already parsed from
        ``raw_value``; it supplies the whole-second part (and the result itself
        when ``raw_value`` carries no usable fraction).
    :return: The timespan as a pandas timedelta.
    :rtype: pandas.Timedelta
    """
    import pandas as pd

    if isinstance(raw_value, (six.integer_types, float)):
        # https://docs.microsoft.com/en-us/dotnet/api/system.datetime.ticks
        # kusto saves up to ticks, 1 tick == 100 nanoseconds
        return pd.Timedelta(raw_value * 100, unit="ns")

    if isinstance(raw_value, six.string_types):
        # rpartition leaves `separator` empty when there is no "." at all, so a
        # plain digit string is never mistaken for a fraction.
        _, separator, fraction = raw_value.rpartition(".")
        if separator and fraction.isdigit():
            # Work in integer nanoseconds: a float cannot hold 7 fractional
            # digits exactly once the whole-second part grows large.
            magnitude = abs(timedelta_value)
            whole_seconds = magnitude.days * 86400 + magnitude.seconds
            # Right-pad (or truncate) the digits to exactly nanosecond resolution.
            fraction_ns = int(fraction[:9].ljust(9, "0"))
            total_ns = whole_seconds * 10 ** 9 + fraction_ns

            # Apply the sign to the whole value; truncating the seconds first
            # would drop it for magnitudes below one second.
            is_negative = raw_value.lstrip().startswith("-") or timedelta_value.total_seconds() < 0
            if is_negative:
                total_ns = -total_ns
            return pd.Timedelta(total_ns, unit="ns")

    # No exact fraction available: convert the parsed timedelta directly
    # (not its seconds count reinterpreted as nanoseconds).
    return pd.Timedelta(timedelta_value)


def dataframe_from_result_table(table):
import pandas as pd
from ._models import KustoResultTable
from dateutil.tz import UTC

"""Converts Kusto tables into pandas DataFrame.
:param azure.kusto.data._models.KustoResultTable table: Table recieved from the response.
:param azure.kusto.data._models.KustoResultTable table: Table received from the response.
:return: pandas DataFrame.
:rtype: pandas.DataFrame
"""
Expand All @@ -16,16 +45,12 @@ def dataframe_from_result_table(table):
if not isinstance(table, KustoResultTable):
raise TypeError("Expected KustoResultTable got {}".format(type(table).__name__))

frame = pandas.DataFrame.from_records(
[row.to_list() for row in table.rows], columns=[col.column_name for col in table.columns]
)
bool_columns = [col.column_name for col in table.columns if col.column_type == "bool"]
for col in bool_columns:
frame[col] = frame[col].astype(bool)

for i in range(len(table.rows)):
seventh = table.rows[i]._seventh_digit
for name in seventh.keys():
frame.loc[i, name] += pandas.Timedelta(seventh[name] * 100, unit="ns")
columns = [col.column_name for col in table.columns]
frame = pd.DataFrame(table._rows, columns=columns)

# fix types
for col in table.columns:
if col.column_type == "bool":
frame[col.column_name] = frame[col.column_name].astype(bool)

return frame
2 changes: 1 addition & 1 deletion azure-kusto-data/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@
namespace_packages=["azure"],
keywords="kusto wrapper client library",
packages=find_packages(exclude=["azure", "tests"]),
install_requires=["adal>=1.0.0", "python-dateutil>=2.7.0", "requests>=2.13.0", "six>=1.10.0"],
install_requires=["adal>=1.0.0", "python-dateutil>=2.8.0", "requests>=2.13.0", "six>=1.10.0"],
extras_require={"pandas": ["pandas==0.24.1"], ":python_version<'3.0'": ["azure-nspkg"]},
)
16 changes: 11 additions & 5 deletions azure-kusto-data/tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,10 @@ def test_valid_response(self):
self.assertEqual(type(row[4]), bool if row[4] is not None else type(None))
self.assertEqual(type(row[5]), timedelta if row[5] is not None else type(None))

for i in range(0, len(primary_table)):
row = primary_table[i]
expected_row = expected_table[i]
for j in range(0, len(row)):
self.assertEqual(row[j], expected_row[j])
for row_index, row in enumerate(primary_table):
expected_row = expected_table[row_index]
for col_index, value in enumerate(row):
self.assertEqual(value, expected_row[col_index])

def test_invalid_table(self):
"""Tests calling of table with index that doesn't exists."""
Expand All @@ -236,3 +235,10 @@ def test_iterating_after_end(self):
"""Tests StopIteration is raised when the response ends."""
response = KustoResponseDataSetV2(json.loads(RESPONSE_TEXT))
self.assertEqual(sum(1 for _ in response.primary_results[0]), 3)

def test_row_equality(self):
    """Tests that indexing the table yields rows equal to the ones produced by iterating it."""
    primary = KustoResponseDataSetV2(json.loads(RESPONSE_TEXT)).primary_results[0]
    for position, iterated_row in enumerate(primary):
        self.assertEqual(primary[position], iterated_row)
29 changes: 18 additions & 11 deletions azure-kusto-data/tests/test_kusto_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,19 @@ def test_sanity_query(self, mock_post):
expected["xsmalltext"] = DIGIT_WORDS[int(expected["xint16"])]
expected["xtext"] = DIGIT_WORDS[int(expected["xint16"])]
expected["xnumberAsText"] = text_type(expected["xint16"])
expected["xtime"] = (

next_time = (
timedelta()
if expected["xtime"] is None
else (abs(expected["xtime"]) + timedelta(days=1, seconds=1, microseconds=1000))
* (-1) ** (expected["rownumber"] + 1)
)

# Hacky: the time here is relative to the previous row, so once we pass a value with more than 500 nanoseconds,
# one more microsecond is needed.
if expected["rownumber"] + 1 == 6:
next_time += timedelta(microseconds=1)
expected["xtime"] = next_time
if expected["xint16"] > 0:
expected["xdynamicWithNulls"] = {"rowId": expected["xint16"], "arr": [0, expected["xint16"]]}

Expand Down Expand Up @@ -241,16 +248,16 @@ def test_sanity_data_frame(self, mock_post):
"xdate": Series(
[
pandas.to_datetime(None),
pandas.to_datetime("2014-01-01T01:01:01.0000000Z").tz_convert(UTC),
pandas.to_datetime("2015-01-01T01:01:01.0000001Z").tz_convert(UTC),
pandas.to_datetime("2016-01-01T01:01:01.0000002Z").tz_convert(UTC),
pandas.to_datetime("2017-01-01T01:01:01.0000003Z").tz_convert(UTC),
pandas.to_datetime("2018-01-01T01:01:01.0000004Z").tz_convert(UTC),
pandas.to_datetime("2019-01-01T01:01:01.0000005Z").tz_convert(UTC),
pandas.to_datetime("2020-01-01T01:01:01.0000006Z").tz_convert(UTC),
pandas.to_datetime("2021-01-01T01:01:01.0000007Z").tz_convert(UTC),
pandas.to_datetime("2022-01-01T01:01:01.0000008Z").tz_convert(UTC),
pandas.to_datetime("2023-01-01T01:01:01.0000009Z").tz_convert(UTC),
pandas.to_datetime("2014-01-01T01:01:01.0000000Z"),
pandas.to_datetime("2015-01-01T01:01:01.0000001Z"),
pandas.to_datetime("2016-01-01T01:01:01.0000002Z"),
pandas.to_datetime("2017-01-01T01:01:01.0000003Z"),
pandas.to_datetime("2018-01-01T01:01:01.0000004Z"),
pandas.to_datetime("2019-01-01T01:01:01.0000005Z"),
pandas.to_datetime("2020-01-01T01:01:01.0000006Z"),
pandas.to_datetime("2021-01-01T01:01:01.0000007Z"),
pandas.to_datetime("2022-01-01T01:01:01.0000008Z"),
pandas.to_datetime("2023-01-01T01:01:01.0000009Z"),
]
),
"xsmalltext": Series(
Expand Down