11"""
22data hash pandas / numpy objects
33"""
4+ import itertools
45
56import numpy as np
6- from pandas import _hash , Series , factorize , Categorical , Index
7+ from pandas import _hash , Series , factorize , Categorical , Index , MultiIndex
8+ import pandas .core .algorithms as algos
79from pandas .lib import is_bool_array
810from pandas .types .generic import ABCIndexClass , ABCSeries , ABCDataFrame
911from pandas .types .common import (is_categorical_dtype , is_numeric_dtype ,
10- is_datetime64_dtype , is_timedelta64_dtype )
12+ is_datetime64_dtype , is_timedelta64_dtype ,
13+ is_list_like )
1114
1215# 16 byte long hashing key
1316_default_hash_key = '0123456789123456'
1417
1518
19+ def _combine_hash_arrays (arrays , num_items ):
20+ """
21+ Parameters
22+ ----------
23+ arrays : generator
24+ num_items : int
25+
26+ Should be the same as CPython's tupleobject.c
27+ """
28+ try :
29+ first = next (arrays )
30+ except StopIteration :
31+ return np .array ([], dtype = np .uint64 )
32+
33+ arrays = itertools .chain ([first ], arrays )
34+
35+ mult = np .uint64 (1000003 )
36+ out = np .zeros_like (first ) + np .uint64 (0x345678 )
37+ for i , a in enumerate (arrays ):
38+ inverse_i = num_items - i
39+ out ^= a
40+ out *= mult
41+ mult += np .uint64 (82520 + inverse_i + inverse_i )
42+ assert i + 1 == num_items , 'Fed in wrong num_items'
43+ out += np .uint64 (97531 )
44+ return out
45+
46+
1647def hash_pandas_object (obj , index = True , encoding = 'utf8' , hash_key = None ,
1748 categorize = True ):
1849 """
@@ -41,45 +72,97 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4172 if hash_key is None :
4273 hash_key = _default_hash_key
4374
44- def adder ( h , hashed_to_add ):
45- h = np . multiply ( h , np . uint ( 3 ), h )
46- return np . add ( h , hashed_to_add , h )
75+ if isinstance ( obj , MultiIndex ):
76+ return Series ( hash_tuples ( obj , encoding , hash_key ),
77+ dtype = 'uint64' , copy = False )
4778
4879 if isinstance (obj , ABCIndexClass ):
4980 h = hash_array (obj .values , encoding , hash_key ,
50- categorize ).astype ('uint64' )
51- h = Series (h , index = obj , dtype = 'uint64' )
81+ categorize ).astype ('uint64' , copy = False )
82+ h = Series (h , index = obj , dtype = 'uint64' , copy = False )
5283 elif isinstance (obj , ABCSeries ):
5384 h = hash_array (obj .values , encoding , hash_key ,
54- categorize ).astype ('uint64' )
85+ categorize ).astype ('uint64' , copy = False )
5586 if index :
56- h = adder (h , hash_pandas_object (obj .index ,
57- index = False ,
58- encoding = encoding ,
59- hash_key = hash_key ,
60- categorize = categorize ).values )
61- h = Series (h , index = obj .index , dtype = 'uint64' )
87+ index_iter = (hash_pandas_object (obj .index ,
88+ index = False ,
89+ encoding = encoding ,
90+ hash_key = hash_key ,
91+ categorize = categorize ).values
92+ for _ in [None ])
93+ arrays = itertools .chain ([h ], index_iter )
94+ h = _combine_hash_arrays (arrays , 2 )
95+
96+ h = Series (h , index = obj .index , dtype = 'uint64' , copy = False )
97+
6298 elif isinstance (obj , ABCDataFrame ):
63- cols = obj .iteritems ()
64- first_series = next (cols )[1 ]
65- h = hash_array (first_series .values , encoding ,
66- hash_key , categorize ).astype ('uint64' )
67- for _ , col in cols :
68- h = adder (h , hash_array (col .values , encoding , hash_key ,
69- categorize ))
99+ hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
100+ num_items = len (obj .columns )
70101 if index :
71- h = adder (h , hash_pandas_object (obj .index ,
72- index = False ,
73- encoding = encoding ,
74- hash_key = hash_key ,
75- categorize = categorize ).values )
102+ index_hash_generator = (hash_pandas_object (obj .index ,
103+ index = False ,
104+ encoding = encoding ,
105+ hash_key = hash_key ,
106+ categorize = categorize ).values # noqa
107+ for _ in [None ])
108+ num_items += 1
109+ hashes = itertools .chain (hashes , index_hash_generator )
110+ h = _combine_hash_arrays (hashes , num_items )
76111
77- h = Series (h , index = obj .index , dtype = 'uint64' )
112+ h = Series (h , index = obj .index , dtype = 'uint64' , copy = False )
78113 else :
79114 raise TypeError ("Unexpected type for hashing %s" % type (obj ))
80115 return h
81116
82117
def hash_tuples(vals, encoding='utf8', hash_key=None):
    """
    Hash a MultiIndex / list-of-tuples efficiently

    .. versionadded:: 0.20.0

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    ndarray of hashed values, or the single hashed value when ``vals``
    was passed as one tuple
    """

    # A lone tuple is wrapped into a one-element list and unwrapped again
    # at the end.
    single = isinstance(vals, tuple)
    if single:
        vals = [vals]
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    mi = vals if isinstance(vals, MultiIndex) else MultiIndex.from_tuples(vals)

    # Materialise each level as an array of its values, one entry per row;
    # the level's NA value is used as the fill value for the take.
    level_arrays = []
    for uniques, codes in zip(mi.levels, mi.labels):
        level_arrays.append(algos.take_1d(uniques._values, codes,
                                          fill_value=uniques._na_value))

    # Hash every level lazily and fold the per-level hashes together.
    per_level = (hash_array(arr, encoding=encoding, hash_key=hash_key)
                 for arr in level_arrays)
    combined = _combine_hash_arrays(per_level, len(level_arrays))

    return combined[0] if single else combined
164+
165+
83166def _hash_categorical (c , encoding , hash_key ):
84167 """
85168 Hash a Categorical by hashing its categories, and then mapping the codes
@@ -97,7 +180,7 @@ def _hash_categorical(c, encoding, hash_key):
97180 """
98181 cat_hashed = hash_array (c .categories .values , encoding , hash_key ,
99182 categorize = False ).astype (np .uint64 , copy = False )
100- return c .rename_categories (cat_hashed ).astype (np .uint64 )
183+ return c .rename_categories (cat_hashed ).astype (np .uint64 , copy = False )
101184
102185
103186def hash_array (vals , encoding = 'utf8' , hash_key = None , categorize = True ):
@@ -108,7 +191,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
108191
109192 Parameters
110193 ----------
111- vals : ndarray
194+ vals : ndarray, Categorical
112195 encoding : string, default 'utf8'
113196 encoding for data & key when strings
114197 hash_key : string key to encode, default to _default_hash_key
@@ -124,6 +207,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
124207
125208 """
126209
210+ if not hasattr (vals , 'dtype' ):
211+ raise TypeError ("must pass a ndarray-like" )
212+
127213 if hash_key is None :
128214 hash_key = _default_hash_key
129215
@@ -142,9 +228,10 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
142228 # manage it.
143229 if is_bool_array (vals ):
144230 vals = vals .astype ('u8' )
145- elif ((is_datetime64_dtype (vals ) or
146- is_timedelta64_dtype (vals ) or
147- is_numeric_dtype (vals )) and vals .dtype .itemsize <= 8 ):
231+ elif (is_datetime64_dtype (vals ) or
232+ is_timedelta64_dtype (vals )):
233+ vals = vals .view ('i8' ).astype ('u8' , copy = False )
234+ elif (is_numeric_dtype (vals ) and vals .dtype .itemsize <= 8 ):
148235 vals = vals .view ('u{}' .format (vals .dtype .itemsize )).astype ('u8' )
149236 else :
150237 # With repeated values, its MUCH faster to categorize object dtypes,
@@ -156,7 +243,12 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
156243 ordered = False , fastpath = True )
157244 return _hash_categorical (cat , encoding , hash_key )
158245
159- vals = _hash .hash_object_array (vals , hash_key , encoding )
246+ try :
247+ vals = _hash .hash_object_array (vals , hash_key , encoding )
248+ except TypeError :
249+ # we have mixed types
250+ vals = _hash .hash_object_array (vals .astype (str ).astype (object ),
251+ hash_key , encoding )
160252
161253 # Then, redistribute these 64-bit ints within the space of 64-bit ints
162254 vals ^= vals >> 30
0 commit comments