File tree Expand file tree Collapse file tree 2 files changed +10
-2
lines changed
Expand file tree Collapse file tree 2 files changed +10
-2
lines changed Original file line number Diff line number Diff line change 1616from pandas .util .decorators import (Appender , cache_readonly ,
1717 deprecate_kwarg , Substitution )
1818from pandas .core .common import AbstractMethodError
19- from pandas .tools .hashing import hash_pandas_object
2019from pandas .formats .printing import pprint_thing
2120
2221_shared_docs = dict ()
@@ -838,6 +837,7 @@ def hash(self, index=True):
838837 9751253963311919054], dtype=uint64)
839838
840839 """
840+ from pandas .tools .hashing import hash_pandas_object
841841 return hash_pandas_object (self , index = index )
842842
843843
Original file line number Diff line number Diff line change 22data hashing utilities for pandas / numpy objects
33"""
44
5+ from hashlib import md5
56import numpy as np
7+ from pandas import Series
68from pandas .types .generic import ABCIndexClass , ABCSeries , ABCDataFrame
79from pandas .types .common import is_categorical_dtype
810
@@ -71,7 +73,13 @@ def hash_array(vals):
7173
7274 vals = vals .view ('u{}' .format (vals .dtype .itemsize )).astype ('u8' )
7375 else :
74- vals = np .array ([hash (x ) for x in vals ], dtype = np .uint64 )
76+
77 # stringify each value and encode it to utf8 bytes,
78 # then hash the bytes with md5, a consistent hashing scheme
79+ def f (v ):
80+ return int (md5 (v ).hexdigest (), 16 ) % (10 ** 8 )
81+ vals = Series (vals ).astype (str ).str .encode ('utf8' ).values
82+ vals = np .array ([f (v ) for v in vals ], dtype = 'uint64' )
7583
7684 # Then, redistribute these 64-bit ints within the space of 64-bit ints
7785 vals ^= vals >> 30
You can’t perform that action at this time.
0 commit comments