apache · wesm · Mar 12, 2017 · Mar 13, 2017 · Mar 13, 2017 · xhochy
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
@@ -108,6 +108,38 @@ std::string Date32Type::ToString() const {
   return std::string("date32");
 }
 
+// Appends the abbreviation for `unit` ("s", "ms", "us" or "ns") to `*stream`.
+// A value matching none of the four cases appends nothing.
+static inline void print_time_unit(TimeUnit unit, std::ostream* stream) {
+  const char* abbrev = nullptr;
+  switch (unit) {
+    case TimeUnit::SECOND:
+      abbrev = "s";
+      break;
+    case TimeUnit::MILLI:
+      abbrev = "ms";
+      break;
+    case TimeUnit::MICRO:
+      abbrev = "us";
+      break;
+    case TimeUnit::NANO:
+      abbrev = "ns";
+      break;
+  }
+  if (abbrev != nullptr) {
+    (*stream) << abbrev;
+  }
+}
+
+// Renders the type as "timestamp[<unit abbreviation>]".
+std::string TimestampType::ToString() const {
+  std::stringstream out;
+  out << "timestamp[";
+  print_time_unit(this->unit, &out);
+  out << "]";
+  return out.str();
+}
+
 // ----------------------------------------------------------------------
 // Union type
 

diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
@@ -495,7 +495,7 @@ struct ARROW_EXPORT TimestampType : public FixedWidthType {
   TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
 
   Status Accept(TypeVisitor* visitor) const override;
-  std::string ToString() const override { return name(); }
+  std::string ToString() const override;
   static std::string name() { return "timestamp"; }
 
   TimeUnit unit;

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
@@ -56,6 +56,8 @@
                             FloatValue, DoubleValue, ListValue,
                             BinaryValue, StringValue)
 
+import pyarrow.schema as _schema
+
 from pyarrow.schema import (null, bool_,
                             int8, int16, int32, int64,
                             uint8, uint16, uint32, uint64,
@@ -64,6 +66,7 @@
                             list_, struct, dictionary, field,
                             DataType, Field, Schema, schema)
 
+
 from pyarrow.table import Column, RecordBatch, Table, concat_tables
 
 

diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
@@ -34,7 +34,8 @@ from pyarrow.memory cimport MemoryPool, maybe_unbox_memory_pool
 cimport pyarrow.scalar as scalar
 from pyarrow.scalar import NA
 
-from pyarrow.schema cimport Field, Schema, DictionaryType
+from pyarrow.schema cimport (DataType, Field, Schema, DictionaryType,
+                             box_data_type)
 import pyarrow.schema as schema
 
 cimport cpython
@@ -45,16 +46,60 @@ cdef _pandas():
     return pd
 
 
+cdef maybe_coerce_datetime64(values, dtype, DataType type,
+                             timestamps_to_ms=False):
+    """
+    Determine the Arrow timestamp type for datetime64 values, casting the
+    values to millisecond resolution first when requested.
+
+    Parameters
+    ----------
+    values : object with a NumPy-style ``dtype``
+        Returned as-is unless a cast to 'datetime64[ms]' is performed
+    dtype : object
+        dtype of the source object; when it is a DatetimeTZDtype its ``tz``
+        (and, without millisecond coercion, its ``unit``) define the type
+    type : DataType
+        Passed through unchanged when ``values`` is not datetime64, otherwise
+        replaced by the computed timestamp type
+    timestamps_to_ms : bool, default False
+        Cast datetime64 values that are not already 'datetime64[ms]'
+
+    Returns
+    -------
+    (values, type) : tuple
+    """
+    from pyarrow.compat import DatetimeTZDtype
+
+    if values.dtype.type != np.datetime64:
+        return values, type
+
+    if timestamps_to_ms and values.dtype != 'datetime64[ms]':
+        values = values.astype('datetime64[ms]')
+
+    if isinstance(dtype, DatetimeTZDtype):
+        # With timestamps_to_ms set the data is millisecond-resolution at this
+        # point whether or not a cast was needed, so the unit must be 'ms'
+        # even when the values arrived already in datetime64[ms].
+        unit = 'ms' if timestamps_to_ms else dtype.unit
+        type = schema.timestamp(unit, dtype.tz)
+    else:
+        # Trust the NumPy dtype
+        type = schema.type_from_numpy_dtype(values.dtype)
+
+    return values, type
+
+
 cdef class Array:
 
     cdef init(self, const shared_ptr[CArray]& sp_array):
         self.sp_array = sp_array
         self.ap = sp_array.get()
-        self.type = DataType()
-        self.type.init(self.sp_array.get().type())
+        self.type = box_data_type(self.sp_array.get().type())
 
     @staticmethod
-    def from_pandas(obj, mask=None, timestamps_to_ms=False, Field field=None,
+    def from_pandas(obj, mask=None, DataType type=None,
+                    timestamps_to_ms=False,
                     MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array.
@@ -66,6 +91,9 @@ cdef class Array:
         mask : pandas.Series or numpy.ndarray, optional
             boolean mask if the object is valid or null
 
+        type : pyarrow.DataType, optional
+            Explicit type to attempt to coerce to
+
         timestamps_to_ms : bool, optional
             Convert datetime columns to ms resolution. This is needed for
             compatibility with other functionality like Parquet I/O which
@@ -107,33 +135,48 @@ cdef class Array:
         """
         cdef:
             shared_ptr[CArray] out
-            shared_ptr[CField] c_field
+            shared_ptr[CDataType] c_type
             CMemoryPool* pool
 
         pd = _pandas()
 
-        if field is not None:
-            c_field = field.sp_field
-
         if mask is not None:
             mask = get_series_values(mask)
 
-        series_values = get_series_values(obj)
+        values = get_series_values(obj)
+        pool = maybe_unbox_memory_pool(memory_pool)
 
-        if isinstance(series_values, pd.Categorical):
+        if isinstance(values, pd.Categorical):
             return DictionaryArray.from_arrays(
-                series_values.codes, series_values.categories.values,
+                values.codes, values.categories.values,
                 mask=mask, memory_pool=memory_pool)
+        elif values.dtype == object:
+            # Object dtype undergoes a different conversion path as more type
+            # inference may be needed
+            if type is not None:
+                c_type = type.sp_type
+            with nogil:
+                check_status(pyarrow.PandasObjectsToArrow(
+                    pool, values, mask, c_type, &out))
         else:
-            if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
-                series_values = series_values.astype('datetime64[ms]')
+            # The source dtype is consulted only to detect a timezone-aware
+            # dtype. obj is normalised through get_series_values above, so it
+            # may not expose ``dtype`` itself (NOTE(review): confirm which
+            # inputs that helper accepts); fall back to the values' dtype.
+            source_dtype = getattr(obj, 'dtype', values.dtype)
+            values, type = maybe_coerce_datetime64(
+                values, source_dtype, type, timestamps_to_ms=timestamps_to_ms)
+
+            if type is None:
+                check_status(pyarrow.PandasDtypeToArrow(values.dtype, &c_type))
+            else:
+                c_type = type.sp_type
 
-            pool = maybe_unbox_memory_pool(memory_pool)
             with nogil:
                 check_status(pyarrow.PandasToArrow(
-                    pool, series_values, mask, c_field, &out))
+                    pool, values, mask, c_type, &out))
 
-            return box_array(out)
+        return box_array(out)
 
     @staticmethod
     def from_list(object list_obj, DataType type=None,
@@ -338,6 +376,11 @@ cdef class DateArray(NumericArray):
     pass
 
 
+# Array class that _array_classes maps to Type_TIMESTAMP.
+cdef class TimestampArray(NumericArray):
+    pass
+
+
 cdef class FloatArray(FloatingPointArray):
     pass
 
@@ -423,7 +465,7 @@ cdef dict _array_classes = {
     Type_LIST: ListArray,
     Type_BINARY: BinaryArray,
     Type_STRING: StringArray,
-    Type_TIMESTAMP: Int64Array,
+    Type_TIMESTAMP: TimestampArray,
     Type_DICTIONARY: DictionaryArray
 }
 

diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
@@ -17,9 +17,11 @@
 
 # flake8: noqa
 
+from distutils.version import LooseVersion
 import itertools
 
 import numpy as np
+import pandas as pd
 
 import sys
 import six
@@ -115,6 +117,17 @@ def encode_file_path(path):
     return encoded_path
 
 
+# Pick the pandas locations by installed version: before 0.19.0 ``pdapi`` is
+# bound to pd.core.common and DatetimeTZDtype is imported from
+# pandas.core.dtypes; from 0.19.0 on ``pdapi`` is bound to pd.api.types and
+# DatetimeTZDtype is imported from pandas.types.dtypes.
+if LooseVersion(pd.__version__) < '0.19.0':
+    pdapi = pd.core.common
+    from pandas.core.dtypes import DatetimeTZDtype
+else:
+    from pandas.types.dtypes import DatetimeTZDtype
+    pdapi = pd.api.types
+
 integer_types = six.integer_types + (np.integer,)
 
 __all__ = []
diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx
@@ -17,10 +17,10 @@
 cdef extern from 'pyarrow/do_import_numpy.h':
     pass
 
-cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow':
+cdef extern from 'pyarrow/numpy_interop.h' namespace 'arrow::py':
     int import_numpy()
 
-cdef extern from 'pyarrow/config.h' namespace 'pyarrow':
+cdef extern from 'pyarrow/config.h' namespace 'arrow::py':
     void pyarrow_init()
     void pyarrow_set_numpy_nan(object o)
 

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
@@ -19,6 +19,7 @@
 from distutils.version import LooseVersion
 import pandas as pd
 
+from pyarrow.compat import pdapi
 from pyarrow._feather import FeatherError  # noqa
 from pyarrow.table import Table
 import pyarrow._feather as ext
@@ -27,11 +28,6 @@
 if LooseVersion(pd.__version__) < '0.17.0':
     raise ImportError("feather requires pandas >= 0.17.0")
 
-if LooseVersion(pd.__version__) < '0.19.0':
-    pdapi = pd.core.common
-else:
-    pdapi = pd.api.types
-
 
 class FeatherReader(ext.FeatherReader):
 

diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
@@ -84,6 +84,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] indices()
         shared_ptr[CArray] dictionary()
 
+    cdef cppclass CTimestampType" arrow::TimestampType"(CFixedWidthType):
+        TimeUnit unit
+        c_string timezone
+
+    cdef cppclass CTimeType" arrow::TimeType"(CFixedWidthType):
+        TimeUnit unit
+
     cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType):
         CDictionaryType(const shared_ptr[CDataType]& index_type,
                         const shared_ptr[CArray]& dictionary)
@@ -92,6 +99,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] dictionary()
 
     shared_ptr[CDataType] timestamp(TimeUnit unit)
+    shared_ptr[CDataType] timestamp(const c_string& timezone, TimeUnit unit)
 
     cdef cppclass CMemoryPool" arrow::MemoryPool":
         int64_t bytes_allocated()
@@ -117,9 +125,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CStringType" arrow::StringType"(CDataType):
         pass
 
-    cdef cppclass CTimestampType" arrow::TimestampType"(CDataType):
-        TimeUnit unit
-
     cdef cppclass CField" arrow::Field":
         c_string name
         shared_ptr[CDataType] type

diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
@@ -18,22 +18,29 @@
 # distutils: language = c++
 
 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CField,
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn,
                                         CTable, CDataType, CStatus, Type,
                                         CMemoryPool, TimeUnit)
 
 cimport pyarrow.includes.libarrow_io as arrow_io
 
 
-cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
+cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
     shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
-    CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out)
+    CStatus ConvertPySequence(object obj, CMemoryPool* pool,
+                              shared_ptr[CArray]* out)
+
+    CStatus PandasDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
 
     CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo,
-                          shared_ptr[CField] field,
+                          const shared_ptr[CDataType]& type,
                           shared_ptr[CArray]* out)
 
+    CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo,
+                                 const shared_ptr[CDataType]& type,
+                                 shared_ptr[CArray]* out)
+
     CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr,
                                  PyObject* py_ref, PyObject** out)
 
@@ -47,12 +54,12 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
     CMemoryPool* get_memory_pool()
 
 
-cdef extern from "pyarrow/common.h" namespace "pyarrow" nogil:
+cdef extern from "pyarrow/common.h" namespace "arrow::py" nogil:
     cdef cppclass PyBytesBuffer(CBuffer):
         PyBytesBuffer(object o)
 
 
-cdef extern from "pyarrow/io.h" namespace "pyarrow" nogil:
+cdef extern from "pyarrow/io.h" namespace "arrow::py" nogil:
     cdef cppclass PyReadableFile(arrow_io.ReadableFileInterface):
         PyReadableFile(object fo)
 

diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
@@ -16,7 +16,9 @@
 # under the License.
 
 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CDataType, CDictionaryType,
+from pyarrow.includes.libarrow cimport (CDataType,
+                                        CDictionaryType,
+                                        CTimestampType,
                                         CField, CSchema)
 
 cdef class DataType:
@@ -31,6 +33,12 @@ cdef class DictionaryType(DataType):
     cdef:
         const CDictionaryType* dict_type
 
+
+cdef class TimestampType(DataType):
+    cdef:
+        const CTimestampType* ts_type
+
+
 cdef class Field:
     cdef:
         shared_ptr[CField] sp_field