55
66import numpy as np
77from pandas import _hash , Series , factorize , Categorical , Index , MultiIndex
8+ import pandas .core .algorithms as algos
89from pandas .lib import is_bool_array
910from pandas .types .generic import ABCIndexClass , ABCSeries , ABCDataFrame
1011from pandas .types .common import (is_categorical_dtype , is_numeric_dtype ,
@@ -58,15 +59,16 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
5859 hash_key = _default_hash_key
5960
6061 if isinstance (obj , MultiIndex ):
61- return _hash_tuples (obj , encoding , hash_key )
62+ return Series (hash_tuples (obj , encoding , hash_key ),
63+ dtype = 'uint64' , copy = False )
6264
6365 if isinstance (obj , ABCIndexClass ):
6466 h = hash_array (obj .values , encoding , hash_key ,
65- categorize ).astype ('uint64' )
66- h = Series (h , index = obj , dtype = 'uint64' )
67+ categorize ).astype ('uint64' , copy = False )
68+ h = Series (h , index = obj , dtype = 'uint64' , copy = False )
6769 elif isinstance (obj , ABCSeries ):
6870 h = hash_array (obj .values , encoding , hash_key ,
69- categorize ).astype ('uint64' )
71+ categorize ).astype ('uint64' , copy = False )
7072 if index :
7173 h = _combine_hash_arrays (iter ([
7274 h ,
@@ -76,7 +78,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
7678 hash_key = hash_key ,
7779 categorize = categorize ).values ]),
7880 2 )
79- h = Series (h , index = obj .index , dtype = 'uint64' )
81+ h = Series (h , index = obj .index , dtype = 'uint64' , copy = False )
8082 elif isinstance (obj , ABCDataFrame ):
8183 hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
8284 num_items = len (obj .columns )
@@ -91,34 +93,81 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
9193 hashes = itertools .chain (hashes , index_hash_generator )
9294 h = _combine_hash_arrays (hashes , num_items )
9395
94- h = Series (h , index = obj .index , dtype = 'uint64' )
96+ h = Series (h , index = obj .index , dtype = 'uint64' , copy = False )
9597 else :
9698 raise TypeError ("Unexpected type for hashing %s" % type (obj ))
9799 return h
98100
99101
100- def _hash_tuples (vals , encoding , hash_key ):
def _hash_lists(vals, encoding='utf8', hash_key=None):
    """
    Hash a list of ndarrays into a single array of combined hash values.

    Every array in ``vals`` is hashed with ``hash_array`` and the
    per-array results are merged by ``_combine_hash_arrays``.

    Parameters
    ----------
    vals : list of ndarrays
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals[0]

    Raises
    ------
    TypeError
        if ``vals`` is not a list
    ValueError
        if ``vals`` is empty, or if its first element is not an ndarray
    """
    if not isinstance(vals, list):
        raise TypeError("only can accept lists")

    num_arrays = len(vals)
    if num_arrays == 0:
        raise ValueError("must pass a non-zero length vals")

    # only the first element is type-checked; the remaining elements are
    # handed to hash_array as-is
    first = vals[0]
    if not isinstance(first, np.ndarray):
        raise ValueError("must pass a ndarray")

    # lazily hash each array; the combiner consumes the generator
    per_array_hashes = (
        hash_array(arr, encoding=encoding, hash_key=hash_key)
        for arr in vals
    )
    return _combine_hash_arrays(per_array_hashes, num_arrays)
130+
131+
def hash_tuples(vals, encoding='utf8', hash_key=None):
    """
    Hash a MultiIndex / array of tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex, ndarray of tuples, or single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    ndarray of hashed values array; when a single tuple was passed, the
    first element of that array is returned instead
    """
    # a lone tuple is wrapped in a list so it goes through the same path,
    # and is unwrapped again just before returning
    single_tuple = isinstance(vals, tuple)
    if single_tuple:
        vals = [vals]

    if isinstance(vals, MultiIndex):
        mi = vals
    else:
        mi = MultiIndex.from_tuples(vals)

    # build one ndarray per level: take the level's values by its labels,
    # passing the level's own NA value as the fill value
    level_arrays = []
    for level_index, level_labels in zip(mi.levels, mi.labels):
        level_arrays.append(
            algos.take_1d(level_index.values, level_labels,
                          fill_value=level_index._na_value))

    hashed = _hash_lists(level_arrays, encoding=encoding, hash_key=hash_key)
    return hashed[0] if single_tuple else hashed
122171
123172
124173def _hash_categorical (c , encoding , hash_key ):
@@ -138,7 +187,7 @@ def _hash_categorical(c, encoding, hash_key):
138187 """
139188 cat_hashed = hash_array (c .categories .values , encoding , hash_key ,
140189 categorize = False ).astype (np .uint64 , copy = False )
141- return c .rename_categories (cat_hashed ).astype (np .uint64 )
190+ return c .rename_categories (cat_hashed ).astype (np .uint64 , copy = False )
142191
143192
144193def hash_array (vals , encoding = 'utf8' , hash_key = None , categorize = True ):
@@ -168,10 +217,6 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
168217 if hash_key is None :
169218 hash_key = _default_hash_key
170219
171- if isinstance (vals , list ) and len (vals ) and isinstance (vals [0 ], tuple ):
172- # we hash an list of tuples similar to a MultiIndex
173- return _hash_tuples (vals , encoding , hash_key ).values
174-
175220 # For categoricals, we hash the categories, then remap the codes to the
176221 # hash values. (This check is above the complex check so that we don't ask
177222 # numpy if categorical is a subdtype of complex, as it will choke.
@@ -187,9 +232,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
187232 # manage it.
188233 if is_bool_array (vals ):
189234 vals = vals .astype ('u8' )
190- elif ((is_datetime64_dtype (vals ) or
191- is_timedelta64_dtype (vals ) or
192- is_numeric_dtype (vals )) and vals .dtype .itemsize <= 8 ):
235+ elif (is_datetime64_dtype (vals ) or
236+ is_timedelta64_dtype (vals )):
237+ vals = vals .view ('i8' ).astype ('u8' , copy = False )
238+ elif (is_numeric_dtype (vals ) and vals .dtype .itemsize <= 8 ):
193239 vals = vals .view ('u{}' .format (vals .dtype .itemsize )).astype ('u8' )
194240 else :
195241 # With repeated values, its MUCH faster to categorize object dtypes,
0 commit comments