@@ -332,3 +332,97 @@ using something similar to the following:
332332 See `the NumPy documentation on byte order
333333<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
334334details.
335+
336+
337+ Alternative to storing lists in pandas DataFrame cells
338+ ------------------------------------------------------
339+ Storing nested lists or arrays inside a pandas object should be avoided for performance and memory-use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure.
340+
341+ Example of exploding nested lists into a DataFrame:
342+
343+ .. ipython:: python
344+
345+ from collections import OrderedDict
346+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
347+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
348+ (' attribute x' , [' A' ,' B' ,' C' ])
349+ ])
350+ ))
351+ df
352+
353+ nn = [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3
354+ nn
355+
356+ # Step 1: Concatenate the "parent" columns with the nested lists, expanded so that each list element gets its own column
357+ df2 = pd.concat([df[[' name' ,' opponent' ]], pd.DataFrame(nn)], axis = 1 )
358+ df2
359+
360+ # Step 2: Move the "parent" columns into the index, leaving only the list elements as columns.
361+ # Note that only the "parent" columns selected in Step 1 are retained -
362+ # any other columns in the original df are not part of the new df
363+ df3 = df2.set_index([' name' , ' opponent' ])
364+ df3
365+
366+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
367+ # Note that at this point we have a Series, not a DataFrame
368+ ser = df3.stack()
369+ ser
370+
371+ # Step 4: Drop the extraneous index level created by the stack
372+ ser.reset_index(level = 2 , drop = True , inplace = True )
373+ ser
374+
375+ # Step 5: Create a DataFrame from the Series
376+ df4 = ser.to_frame(' nearest_neighbors' )
377+ df4
378+
379+ # All steps combined into a single chained expression
380+ df4 = (df2.set_index([' name' , ' opponent' ])
381+ .stack()
382+ .reset_index(level = 2 , drop = True )
383+ .to_frame(' nearest_neighbors' ))
384+ df4
385+
386+ Example of exploding a column of lists embedded in a DataFrame:
387+
388+ .. ipython:: python
389+
390+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
391+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
392+ (' attribute x' , [' A' ,' B' ,' C' ]),
393+ (' nearest_neighbors' , [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3 )
394+ ])
395+ ))
396+
397+ df
398+
399+ # Step 1: Move the "parent" columns to be included in the final DataFrame into the index
400+ df2 = df.set_index([' name' , ' opponent' ])
401+ df2
402+
403+ # Step 2: Expand each list in the column into a Series; the list elements become the columns of a new DataFrame.
404+ # Note that only the index from the original df is retained -
405+ # any other columns in the original df are not part of the new df
406+ df3 = df2.nearest_neighbors.apply(pd.Series)
407+ df3
408+
409+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
410+ # Note that at this point we have a Series, not a DataFrame
411+ ser = df3.stack()
412+ ser
413+
414+ # Step 4: Drop the extraneous index level created by the stack
415+ ser.reset_index(level = 2 , drop = True , inplace = True )
416+ ser
417+
418+ # Step 5: Create a DataFrame from the Series
419+ df4 = ser.to_frame(' nearest_neighbors' )
420+ df4
421+
422+ # All steps combined into a single chained expression
423+ df4 = (df.set_index([' name' , ' opponent' ])
424+ .nearest_neighbors.apply(pd.Series)
425+ .stack()
426+ .reset_index(level = 2 , drop = True )
427+ .to_frame(' nearest_neighbors' ))
428+ df4
0 commit comments