@@ -377,12 +377,13 @@ cdef class Int64HashTable(HashTable):
377377
378378 def factorize (self , ndarray[object] values ):
379379 reverse = {}
380- labels = self .get_labels(values, reverse, 0 )
380+ labels = self .get_labels(values, reverse, 0 , 0 )
381381 return reverse, labels
382382
383383 @ cython.boundscheck (False )
384384 def get_labels (self , int64_t[:] values , Int64Vector uniques ,
385- Py_ssize_t count_prior , Py_ssize_t na_sentinel ):
385+ Py_ssize_t count_prior , Py_ssize_t na_sentinel ,
386+ bint check_null = True ):
386387 cdef:
387388 Py_ssize_t i, n = len (values)
388389 int64_t[:] labels
@@ -399,6 +400,11 @@ cdef class Int64HashTable(HashTable):
399400 for i in range (n):
400401 val = values[i]
401402 k = kh_get_int64(self .table, val)
403+
404+ if check_null and val == iNaT:
405+ labels[i] = na_sentinel
406+ continue
407+
402408 if k != self .table.n_buckets:
403409 idx = self .table.vals[k]
404410 labels[i] = idx
@@ -525,13 +531,14 @@ cdef class Float64HashTable(HashTable):
525531
526532 def factorize (self , float64_t[:] values ):
527533 uniques = Float64Vector()
528- labels = self .get_labels(values, uniques, 0 , - 1 )
534+ labels = self .get_labels(values, uniques, 0 , - 1 , check_null = True )  # explicit bint keyword instead of a bare 1; NaN values get the -1 sentinel
529535 return uniques.to_array(), labels
530536
531537 @ cython.boundscheck (False )
532538 def get_labels (self , float64_t[:] values ,
533- Float64Vector uniques ,
534- Py_ssize_t count_prior , int64_t na_sentinel ):
539+ Float64Vector uniques ,
540+ Py_ssize_t count_prior , int64_t na_sentinel ,
541+ bint check_null = True ):
535542 cdef:
536543 Py_ssize_t i, n = len (values)
537544 int64_t[:] labels
@@ -548,7 +555,7 @@ cdef class Float64HashTable(HashTable):
548555 for i in range (n):
549556 val = values[i]
550557
551- if val != val:
558+ if check_null and val != val:
552559 labels[i] = na_sentinel
553560 continue
554561
@@ -762,7 +769,8 @@ cdef class PyObjectHashTable(HashTable):
762769 return uniques.to_array()
763770
764771 def get_labels (self , ndarray[object] values , ObjectVector uniques ,
765- Py_ssize_t count_prior , int64_t na_sentinel ):
772+ Py_ssize_t count_prior , int64_t na_sentinel ,
773+ bint check_null = True ):
766774 cdef:
767775 Py_ssize_t i, n = len (values)
768776 int64_t[:] labels
@@ -777,7 +785,7 @@ cdef class PyObjectHashTable(HashTable):
777785 val = values[i]
778786 hash (val)
779787
780- if val != val or val is None :
788+ if check_null and (val != val or val is None ):  # parenthesised: "and" binds tighter than "or", so without this None bypasses check_null
781789 labels[i] = na_sentinel
782790 continue
783791
@@ -808,14 +816,15 @@ cdef class Factorizer:
808816 def get_count (self ):
809817 return self .count
810818
811- def factorize (self , ndarray[object] values , sort = False , na_sentinel = - 1 ):
819+ def factorize (self , ndarray[object] values , sort = False , na_sentinel = - 1 ,
820+ check_null = True ):
812821 """
813822 Factorize values with nans replaced by na_sentinel
814823 >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
815824 array([ 0, 1, 20])
816825 """
817826 labels = self .table.get_labels(values, self .uniques,
818- self .count, na_sentinel)
827+ self .count, na_sentinel, check_null )
819828 mask = (labels == na_sentinel)
820829 # sort on
821830 if sort:
@@ -848,9 +857,10 @@ cdef class Int64Factorizer:
848857 return self .count
849858
850859 def factorize (self , int64_t[:] values , sort = False ,
851- na_sentinel = - 1 ):
860+ na_sentinel = - 1 , check_null = True ):
852861 labels = self .table.get_labels(values, self .uniques,
853- self .count, na_sentinel)
862+ self .count, na_sentinel,
863+ check_null)
854864
855865 # sort on
856866 if sort:
0 commit comments