diff --git a/azure-kusto-data/azure/kusto/data/_models.py b/azure-kusto-data/azure/kusto/data/_models.py index 25404e32..6846911d 100644 --- a/azure-kusto-data/azure/kusto/data/_models.py +++ b/azure-kusto-data/azure/kusto/data/_models.py @@ -9,6 +9,15 @@ from .exceptions import KustoServiceError +HAS_PANDAS = True + +try: + import pandas + from .helpers import to_pandas_datetime, to_pandas_timedelta +except ImportError: + HAS_PANDAS = False + + class WellKnownDataSet(Enum): """Categorizes data tables according to the role they play in the data set that a Kusto query returns.""" @@ -26,48 +35,46 @@ class KustoResultRow(object): def __init__(self, columns, row): self._value_by_name = {} self._value_by_index = [] - self._seventh_digit = {} + self._hidden_values = [] + for i, value in enumerate(row): column = columns[i] try: - lower_column_type = column.column_type.lower() + column_type = column.column_type.lower() except AttributeError: self._value_by_index.append(value) self._value_by_name[columns[i]] = value + if HAS_PANDAS: + self._hidden_values.append(value) continue - if lower_column_type in ["datetime", "timespan"]: + if column_type in ["datetime", "timespan"]: if value is None: typed_value = None + if HAS_PANDAS: + self._hidden_values.append(None) else: - try: - # If you are here to read this, you probably hit some datetime/timedelta inconsistencies. - # Azure-Data-Explorer(Kusto) supports 7 decimal digits, while the corresponding python types supports only 6. - # What we do here, is remove the 7th digit, if exists, and create a datetime/timedelta - # from whats left. The reason we are keeping the 7th digit, is to allow users to work with - # this precision in case they want it. One example why one might want this precision, is when - # working with pandas. In that case, use azure.kusto.data.helpers.dataframe_from_result_table - # which takes into account the 7th digit. - char = value.split(":")[2].split(".")[1][6] - if char.isdigit(): - tick = int(char) - last = value[-1] if value[-1].isalpha() else "" - typed_value = KustoResultRow.convertion_funcs[lower_column_type](value[:-2] + last) - if tick: - if lower_column_type == "datetime": - self._seventh_digit[column.column_name] = tick - else: - self._seventh_digit[column.column_name] = ( - tick if abs(typed_value) == typed_value else -tick - ) - else: - typed_value = KustoResultRow.convertion_funcs[lower_column_type](value) - except (IndexError, AttributeError): - typed_value = KustoResultRow.convertion_funcs[lower_column_type](value) - elif lower_column_type in KustoResultRow.convertion_funcs: - typed_value = KustoResultRow.convertion_funcs[lower_column_type](value) + # If you are here to read this, you probably hit some datetime/timedelta inconsistencies. + # Azure-Data-Explorer(Kusto) supports 7 decimal digits, while the corresponding python types supports only 6. + # One example why one might want this precision, is when working with pandas. + # In that case, use azure.kusto.data.helpers.dataframe_from_result_table which takes into account the original value. + typed_value = KustoResultRow.convertion_funcs[column_type](value) + + # this is a special case where plain python will lose precision, so we keep the precise value hidden + # when transforming to pandas, we can use the hidden value to convert to precise pandas/numpy types + if HAS_PANDAS: + if column_type == "datetime": + self._hidden_values.append(to_pandas_datetime(value)) + if column_type == "timespan": + self._hidden_values.append(to_pandas_timedelta(value, typed_value)) + elif column_type in KustoResultRow.convertion_funcs: + typed_value = KustoResultRow.convertion_funcs[column_type](value) + if HAS_PANDAS: + self._hidden_values.append(value) else: typed_value = value + if HAS_PANDAS: + self._hidden_values.append(value) self._value_by_index.append(typed_value) self._value_by_name[column.column_name] = typed_value @@ -129,6 +136,11 @@ def __init__(self, json_table): self.rows = [KustoResultRow(self.columns, row) for row in json_table["Rows"]] + @property + def _rows(self): + for row in self.rows: + yield row._hidden_values + @property def rows_count(self): return len(self.rows) diff --git a/azure-kusto-data/azure/kusto/data/helpers.py b/azure-kusto-data/azure/kusto/data/helpers.py index c9138eac..29e9368e 100644 --- a/azure-kusto-data/azure/kusto/data/helpers.py +++ b/azure-kusto-data/azure/kusto/data/helpers.py @@ -1,12 +1,41 @@ """Kusto helper functions""" +import six -import pandas -from ._models import KustoResultTable + +def to_pandas_datetime(raw_value): + import pandas as pd + + return pd.to_datetime(raw_value) + + +def to_pandas_timedelta(raw_value, timedelta_value): + import pandas as pd + + if isinstance(raw_value, (six.integer_types, float)): + # https://docs.microsoft.com/en-us/dotnet/api/system.datetime.ticks + # kusto saves up to ticks, 1 tick == 100 nanoseconds + return pd.Timedelta(raw_value * 100, unit="ns") + if isinstance(raw_value, six.string_types): + fraction = raw_value.split(".")[-1] + if fraction.isdigit(): + whole_part = int(timedelta_value.total_seconds()) + time_with_exact_fraction = str(whole_part) + "." + fraction + total_seconds = float(time_with_exact_fraction) + + return pd.Timedelta(total_seconds, unit="s") + else: + return pd.Timedelta(timedelta_value) + + return pd.Timedelta(timedelta_value.total_seconds(), unit="ns") def dataframe_from_result_table(table): + import pandas as pd + from ._models import KustoResultTable + from dateutil.tz import UTC + """Converts Kusto tables into pandas DataFrame. - :param azure.kusto.data._models.KustoResultTable table: Table recieved from the response. + :param azure.kusto.data._models.KustoResultTable table: Table received from the response. :return: pandas DataFrame. :rtype: pandas.DataFrame """ @@ -16,16 +45,12 @@ def dataframe_from_result_table(table): if not isinstance(table, KustoResultTable): raise TypeError("Expected KustoResultTable got {}".format(type(table).__name__)) - frame = pandas.DataFrame.from_records( - [row.to_list() for row in table.rows], columns=[col.column_name for col in table.columns] - ) - bool_columns = [col.column_name for col in table.columns if col.column_type == "bool"] - for col in bool_columns: - frame[col] = frame[col].astype(bool) - - for i in range(len(table.rows)): - seventh = table.rows[i]._seventh_digit - for name in seventh.keys(): - frame.loc[i, name] += pandas.Timedelta(seventh[name] * 100, unit="ns") + columns = [col.column_name for col in table.columns] + frame = pd.DataFrame(table._rows, columns=columns) + + # fix types + for col in table.columns: + if col.column_type == "bool": + frame[col.column_name] = frame[col.column_name].astype(bool) return frame diff --git a/azure-kusto-data/setup.py b/azure-kusto-data/setup.py index 36d7ad20..fcbdb6cd 100644 --- a/azure-kusto-data/setup.py +++ b/azure-kusto-data/setup.py @@ -44,6 +44,6 @@ namespace_packages=["azure"], keywords="kusto wrapper client library", packages=find_packages(exclude=["azure", "tests"]), - install_requires=["adal>=1.0.0", "python-dateutil>=2.7.0", "requests>=2.13.0", "six>=1.10.0"], + install_requires=["adal>=1.0.0", "python-dateutil>=2.8.0", "requests>=2.13.0", "six>=1.10.0"], extras_require={"pandas": ["pandas==0.24.1"], ":python_version<'3.0'": ["azure-nspkg"]}, ) diff --git a/azure-kusto-data/tests/test_functional.py b/azure-kusto-data/tests/test_functional.py index 78fadd4f..48fc21e4 100644 --- a/azure-kusto-data/tests/test_functional.py +++ b/azure-kusto-data/tests/test_functional.py @@ -213,11 +213,10 @@ def test_valid_response(self): self.assertEqual(type(row[4]), bool if row[4] is not None else type(None)) self.assertEqual(type(row[5]), timedelta if row[5] is not None else type(None)) - for i in range(0, len(primary_table)): - row = primary_table[i] - expected_row = expected_table[i] - for j in range(0, len(row)): - self.assertEqual(row[j], expected_row[j]) + for row_index, row in enumerate(primary_table): + expected_row = expected_table[row_index] + for col_index, value in enumerate(row): + self.assertEqual(value, expected_row[col_index]) def test_invalid_table(self): """Tests calling of table with index that doesn't exists.""" @@ -236,3 +235,10 @@ def test_iterating_after_end(self): """Tests StopIteration is raised when the response ends.""" response = KustoResponseDataSetV2(json.loads(RESPONSE_TEXT)) self.assertEqual(sum(1 for _ in response.primary_results[0]), 3) + + def test_row_equality(self): + """Tests the rows are idempotent.""" + response = KustoResponseDataSetV2(json.loads(RESPONSE_TEXT)) + table = response.primary_results[0] + for row_index, row in enumerate(table): + self.assertEqual(table[row_index], row) diff --git a/azure-kusto-data/tests/test_kusto_client.py b/azure-kusto-data/tests/test_kusto_client.py index 38cd7639..c5b0ab62 100644 --- a/azure-kusto-data/tests/test_kusto_client.py +++ b/azure-kusto-data/tests/test_kusto_client.py @@ -171,12 +171,19 @@ def test_sanity_query(self, mock_post): expected["xsmalltext"] = DIGIT_WORDS[int(expected["xint16"])] expected["xtext"] = DIGIT_WORDS[int(expected["xint16"])] expected["xnumberAsText"] = text_type(expected["xint16"]) - expected["xtime"] = ( + + next_time = ( timedelta() if expected["xtime"] is None else (abs(expected["xtime"]) + timedelta(days=1, seconds=1, microseconds=1000)) * (-1) ** (expected["rownumber"] + 1) ) + + # hacky tests - because time here is relative to previous row, after we pass a time where we have > 500 nanoseconds, + # another microseconds digit is needed + if expected["rownumber"] + 1 == 6: + next_time += timedelta(microseconds=1) + expected["xtime"] = next_time if expected["xint16"] > 0: expected["xdynamicWithNulls"] = {"rowId": expected["xint16"], "arr": [0, expected["xint16"]]} @@ -241,16 +248,16 @@ def test_sanity_data_frame(self, mock_post): "xdate": Series( [ pandas.to_datetime(None), - pandas.to_datetime("2014-01-01T01:01:01.0000000Z").tz_convert(UTC), - pandas.to_datetime("2015-01-01T01:01:01.0000001Z").tz_convert(UTC), - pandas.to_datetime("2016-01-01T01:01:01.0000002Z").tz_convert(UTC), - pandas.to_datetime("2017-01-01T01:01:01.0000003Z").tz_convert(UTC), - pandas.to_datetime("2018-01-01T01:01:01.0000004Z").tz_convert(UTC), - pandas.to_datetime("2019-01-01T01:01:01.0000005Z").tz_convert(UTC), - pandas.to_datetime("2020-01-01T01:01:01.0000006Z").tz_convert(UTC), - pandas.to_datetime("2021-01-01T01:01:01.0000007Z").tz_convert(UTC), - pandas.to_datetime("2022-01-01T01:01:01.0000008Z").tz_convert(UTC), - pandas.to_datetime("2023-01-01T01:01:01.0000009Z").tz_convert(UTC), + pandas.to_datetime("2014-01-01T01:01:01.0000000Z"), + pandas.to_datetime("2015-01-01T01:01:01.0000001Z"), + pandas.to_datetime("2016-01-01T01:01:01.0000002Z"), + pandas.to_datetime("2017-01-01T01:01:01.0000003Z"), + pandas.to_datetime("2018-01-01T01:01:01.0000004Z"), + pandas.to_datetime("2019-01-01T01:01:01.0000005Z"), + pandas.to_datetime("2020-01-01T01:01:01.0000006Z"), + pandas.to_datetime("2021-01-01T01:01:01.0000007Z"), + pandas.to_datetime("2022-01-01T01:01:01.0000008Z"), + pandas.to_datetime("2023-01-01T01:01:01.0000009Z"), ] ), "xsmalltext": Series(