@@ -187,7 +187,6 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
187187 else :
188188 f = lambda x : pat in x
189189 return _na_map (f , arr , na )
190-
191190
192191
193192def str_startswith (arr , pat , na = np .nan ):
@@ -460,6 +459,46 @@ def f(x):
460459 return result
461460
462461
def str_get_dummies(arr, sep='|'):
    """
    Split each string by sep and return a frame of dummy/indicator variables.

    Parameters
    ----------
    arr : Series
        Values to split. Missing values are treated as empty strings and
        every other value is converted with ``str`` before splitting.
    sep : string, default '|'
        Separator between tags. It is always matched literally, never as a
        regular expression.

    Returns
    -------
    dummies : DataFrame
        Integer frame indexed like ``arr`` with one column per distinct
        non-empty tag, in sorted order. A cell is 1 when the row contains
        the tag and 0 otherwise.

    Raises
    ------
    ValueError
        If ``sep`` is the empty string and ``arr`` is not empty.

    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    Missing values produce a row of zeros:

    >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1

    See also ``pd.get_dummies``.

    """
    # Wrap every value in separators so that each tag, including the first
    # and the last one, shows up as ``sep + tag + sep``. This lets the
    # membership test below match whole tags only ('a' must not match 'ab').
    arr = sep + arr.fillna('').astype(str) + sep

    # Collect the distinct tags with a literal split so that tag discovery
    # agrees with the literal ``pat in value`` test used to fill the frame.
    tags = set()
    for value in arr:
        tags.update(value.split(sep))
    # The padding separators (and empty inputs) yield '' which is not a tag.
    tags = sorted(tags - set(['']))

    dummies = np.empty((len(arr), len(tags)), dtype=int)
    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = [pat in value for value in arr]
    return DataFrame(dummies, arr.index, tags)
501+
463502
464503def str_join (arr , sep ):
465504 """
@@ -843,7 +882,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
843882 result = str_contains (self .series , pat , case = case , flags = flags ,
844883 na = na , regex = regex )
845884 return self ._wrap_result (result )
846-
885+
847886 @copy (str_replace )
848887 def replace (self , pat , repl , n = - 1 , case = True , flags = 0 ):
849888 result = str_replace (self .series , pat , repl , n = n , case = case ,
@@ -899,6 +938,11 @@ def rstrip(self, to_strip=None):
899938 result = str_rstrip (self .series , to_strip )
900939 return self ._wrap_result (result )
901940
@copy(str_get_dummies)
def get_dummies(self, sep='|'):
    # Thin accessor wrapper: build the indicator frame from the wrapped
    # series with str_get_dummies, then pass it through _wrap_result.
    # NOTE(review): @copy presumably reuses str_get_dummies' docstring, so
    # no docstring is written here -- confirm against the decorator.
    dummies = str_get_dummies(self.series, sep=sep)
    return self._wrap_result(dummies)
945+
902946 count = _pat_wrapper (str_count , flags = True )
903947 startswith = _pat_wrapper (str_startswith , na = True )
904948 endswith = _pat_wrapper (str_endswith , na = True )
0 commit comments