@@ -211,22 +211,23 @@ def convert_categorical(x):
211211 return Categorical (concatted , rawcats )
212212
213213
def union_categoricals(to_union, sort_categories=False):
    """
    Combine list-like of Categoricals, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - the categories of the inputs do not all have the same dtype
        - the inputs do not all have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True is passed with ordered Categoricals
    ValueError
        Empty list of Categoricals passed
    """
    # Local import keeps this function self-contained.
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and ordered:
            # Lexsorting would silently replace the user-defined order of
            # an ordered Categorical, so refuse instead.
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Map each *old* code to its position in the sorted categories:
            # indexer[old_code] == new_code. The lookup must therefore be
            # done on the sorted index, not on the original one.
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                # Position of each of c's categories in the unioned index;
                # -1 codes (missing values) stay -1 via fill_value.
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
283290
284291
0 commit comments