From f77e40a301f7ad1e7015d9961fdae54250f3c4b7 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 19 Jul 2016 18:06:55 +0100 Subject: [PATCH] ENH: Enable automatic writing of dates to Stata files Automatically select type %tc for datetime64[ns] columns Change ValueErrors to NotImplementedError for unsupported types Add tests for select exceptions Improve to_stata and StataWriter docstrings closes #12259 --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/frame.py | 39 ++++++++++------ pandas/io/stata.py | 83 ++++++++++++++++++++------------- pandas/io/tests/test_stata.py | 50 +++++++++++++++++++- 4 files changed, 124 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index efa6e5575fa79..d1ee506ba294c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -251,6 +251,7 @@ Other enhancements - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) - ``.to_stata()`` and ```StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) +- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc`` rather than raising a ``ValueError`` (:issue:`12259`) .. _whatsnew_0190.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fe7b318b3a18..a59668320de3d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1473,31 +1473,42 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Parameters ---------- - fname : file path or buffer - Where to save the dta file. 
+ fname : str or buffer + String path or file-like object convert_dates : dict - Dictionary mapping column of datetime types to the stata internal - format that you want to use for the dates. Options are - 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a - number or a name. + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information write_index : bool Write the index to Stata dataset. encoding : str - Default is latin-1. Note that Stata does not support unicode. + Default is latin-1. Unicode is not supported byteorder : str - Can be ">", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". Default is `sys.byteorder` time_stamp : datetime - A date time to use when writing the file. Can be None, in which - case the current time is used. + A datetime to use as file creation date. Default is the current time dataset_label : str - A label for the data set. Should be 80 characters or smaller. + A label for the data set. Must be 80 characters or smaller. .. versionadded:: 0.19.0 variable_labels : dict - Dictionary containing columns as keys and variable labels as - values. Each label must be 80 characters or smaller. + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. 
+ + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + nor datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters Examples -------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d35466e8896ba..5528b2803eb21 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -432,7 +432,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError("fmt %s not understood" % fmt) + raise ValueError("Format %s is not a known Stata date format" % fmt) conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack('", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". Default is `sys.byteorder` time_stamp : datetime - A date time to use when writing the file. Can be None, in which - case the current time is used. + A datetime to use as file creation date. Default is the current time dataset_label : str - A label for the data set. Should be 80 characters or smaller. + A label for the data set. Must be 80 characters or smaller. .. versionadded:: 0.19.0 @@ -1843,6 +1842,17 @@ class StataWriter(StataParser): The StataWriter instance has a write_file method, which will write the file to the given `fname`. 
+ Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + nor datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + Examples -------- >>> import pandas as pd @@ -1861,7 +1871,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): super(StataWriter, self).__init__(encoding) - self._convert_dates = convert_dates + self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index self._time_stamp = time_stamp self._data_label = data_label @@ -2041,15 +2051,22 @@ def _prepare_pandas(self, data): self.varlist = data.columns.tolist() dtypes = data.dtypes - if self._convert_dates is not None: - self._convert_dates = _maybe_convert_to_int_keys( - self._convert_dates, self.varlist + + # Ensure all date columns are converted + for col in data: + if col in self._convert_dates: + continue + if is_datetime64_dtype(data[col]): + self._convert_dates[col] = 'tc' + + self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, + self.varlist) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type( + self._convert_dates[key] ) - for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type( - self._convert_dates[key] - ) - dtypes[key] = np.dtype(new_type) + dtypes[key] = np.dtype(new_type) + self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.iteritems(): diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 91850e6ffe9b9..009e40c84f94b 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -11,17 +11,17 @@ import nose import numpy as np +from pandas.tslib import NaT import pandas as pd import 
pandas.util.testing as tm from pandas import compat from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series -from pandas.types.common import is_categorical_dtype -from pandas.tslib import NaT from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) +from pandas.types.common import is_categorical_dtype class TestStata(tm.TestCase): @@ -1165,6 +1165,52 @@ def test_write_variable_label_errors(self): with tm.ensure_clean() as path: original.to_stata(path, variable_labels=variable_labels_long) + def test_default_date_conversion(self): + # GH 12259 + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + reread = read_stata(path, convert_dates=True) + tm.assert_frame_equal(original, reread) + + original.to_stata(path, + write_index=False, + convert_dates={'dates': 'tc'}) + direct = read_stata(path, convert_dates=True) + tm.assert_frame_equal(reread, direct) + + def test_unsupported_type(self): + original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) + + def test_unsupported_datetype(self): + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path, convert_dates={'dates': 'tC'}) + + dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong') + original = 
pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],