@@ -83,98 +83,15 @@ def describe_ndframe(
8383 raise ValueError ("percentiles cannot contain duplicates" )
8484 percentiles = unique_pcts
8585
86- formatted_percentiles = format_percentiles (percentiles )
87-
88- def describe_numeric_1d (series ) -> "Series" :
89- from pandas import Series
90-
91- stat_index = ["count" , "mean" , "std" , "min" ] + formatted_percentiles + ["max" ]
92- d = (
93- [series .count (), series .mean (), series .std (), series .min ()]
94- + series .quantile (percentiles ).tolist ()
95- + [series .max ()]
96- )
97- return Series (d , index = stat_index , name = series .name )
98-
99- def describe_categorical_1d (data ) -> "Series" :
100- names = ["count" , "unique" ]
101- objcounts = data .value_counts ()
102- count_unique = len (objcounts [objcounts != 0 ])
103- result = [data .count (), count_unique ]
104- dtype = None
105- if result [1 ] > 0 :
106- top , freq = objcounts .index [0 ], objcounts .iloc [0 ]
107- if is_datetime64_any_dtype (data .dtype ):
108- if obj .ndim == 1 :
109- stacklevel = 5
110- else :
111- stacklevel = 6
112- warnings .warn (
113- "Treating datetime data as categorical rather than numeric in "
114- "`.describe` is deprecated and will be removed in a future "
115- "version of pandas. Specify `datetime_is_numeric=True` to "
116- "silence this warning and adopt the future behavior now." ,
117- FutureWarning ,
118- stacklevel = stacklevel ,
119- )
120- tz = data .dt .tz
121- asint = data .dropna ().values .view ("i8" )
122- top = Timestamp (top )
123- if top .tzinfo is not None and tz is not None :
124- # Don't tz_localize(None) if key is already tz-aware
125- top = top .tz_convert (tz )
126- else :
127- top = top .tz_localize (tz )
128- names += ["top" , "freq" , "first" , "last" ]
129- result += [
130- top ,
131- freq ,
132- Timestamp (asint .min (), tz = tz ),
133- Timestamp (asint .max (), tz = tz ),
134- ]
135- else :
136- names += ["top" , "freq" ]
137- result += [top , freq ]
138-
139- # If the DataFrame is empty, set 'top' and 'freq' to None
140- # to maintain output shape consistency
141- else :
142- names += ["top" , "freq" ]
143- result += [np .nan , np .nan ]
144- dtype = "object"
145-
146- from pandas import Series
147-
148- return Series (result , index = names , name = data .name , dtype = dtype )
149-
150- def describe_timestamp_1d (data ) -> "Series" :
151- # GH-30164
152- from pandas import Series
153-
154- stat_index = ["count" , "mean" , "min" ] + formatted_percentiles + ["max" ]
155- d = (
156- [data .count (), data .mean (), data .min ()]
157- + data .quantile (percentiles ).tolist ()
158- + [data .max ()]
159- )
160- return Series (d , index = stat_index , name = data .name )
161-
162- def describe_1d (data ) -> "Series" :
163- if is_bool_dtype (data .dtype ):
164- return describe_categorical_1d (data )
165- elif is_numeric_dtype (data ):
166- return describe_numeric_1d (data )
167- elif is_datetime64_any_dtype (data .dtype ) and datetime_is_numeric :
168- return describe_timestamp_1d (data )
169- elif is_timedelta64_dtype (data .dtype ):
170- return describe_numeric_1d (data )
171- else :
172- return describe_categorical_1d (data )
173-
17486 if obj .ndim == 1 :
17587 # Incompatible return value type
17688 # (got "Series", expected "FrameOrSeries") [return-value]
177- return describe_1d (obj ) # type:ignore[return-value]
89+ return describe_1d (
90+ obj ,
91+ percentiles ,
92+ datetime_is_numeric ,
93+ is_series = True ,
94+ ) # type:ignore[return-value]
17895 elif (include is None ) and (exclude is None ):
17996 # when some numerics are found, keep only numerics
18097 default_include = [np .number ]
@@ -191,7 +108,10 @@ def describe_1d(data) -> "Series":
191108 else :
192109 data = obj .select_dtypes (include = include , exclude = exclude )
193110
194- ldesc = [describe_1d (s ) for _ , s in data .items ()]
111+ ldesc = [
112+ describe_1d (s , percentiles , datetime_is_numeric , is_series = False )
113+ for _ , s in data .items ()
114+ ]
195115 # set a convenient order for rows
196116 names : List [Hashable ] = []
197117 ldesc_indexes = sorted ((x .index for x in ldesc ), key = len )
@@ -203,3 +123,143 @@ def describe_1d(data) -> "Series":
203123 d = concat ([x .reindex (names , copy = False ) for x in ldesc ], axis = 1 , sort = False )
204124 d .columns = data .columns .copy ()
205125 return d
126+
127+
def describe_numeric_1d(series, percentiles) -> "Series":
    """Summarise a numeric series.

    Parameters
    ----------
    series : Series
        Data to summarise.
    percentiles : list-like of numbers
        Quantile levels passed to ``series.quantile``; their formatted
        labels are placed between ``"min"`` and ``"max"`` in the result.

    Returns
    -------
    Series
        Values for count, mean, std, min, each requested percentile and
        max, labelled accordingly and named after ``series``.
    """
    from pandas import Series

    # Labels are built first, then the statistics in the same order.
    labels = [
        "count",
        "mean",
        "std",
        "min",
        *format_percentiles(percentiles),
        "max",
    ]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]
    return Series(values, index=labels, name=series.name)
149+
150+
def describe_categorical_1d(data, is_series) -> "Series":
    """Summarise a series by counting its distinct values.

    Parameters
    ----------
    data : Series
        Data to summarise.
    is_series : bool
        True if the object originally being described is a Series, False
        if ``data`` is one column of a DataFrame. It is only used to pick
        the ``stacklevel`` of the datetime deprecation warning.

    Returns
    -------
    Series
        Entries ``count``, ``unique``, ``top`` and ``freq``; datetime data
        additionally gets ``first`` and ``last``.
    """
    from pandas import Series

    value_freqs = data.value_counts()
    n_unique = len(value_freqs[value_freqs != 0])
    n_valid = data.count()

    if n_unique == 0:
        # Nothing to rank: keep the usual four rows, fill 'top' and 'freq'
        # with NaN and force object dtype so the output shape is stable.
        return Series(
            [n_valid, n_unique, np.nan, np.nan],
            index=["count", "unique", "top", "freq"],
            name=data.name,
            dtype="object",
        )

    # value_counts() lists the most frequent value first.
    top = value_freqs.index[0]
    freq = value_freqs.iloc[0]

    if not is_datetime64_any_dtype(data.dtype):
        return Series(
            [n_valid, n_unique, top, freq],
            index=["count", "unique", "top", "freq"],
            name=data.name,
        )

    # Must be raised from this frame: the stacklevel values count frames
    # from here (presumably up to the user's `.describe` call -- the
    # DataFrame path is one frame deeper than the Series path).
    warnings.warn(
        "Treating datetime data as categorical rather than numeric in "
        "`.describe` is deprecated and will be removed in a future "
        "version of pandas. Specify `datetime_is_numeric=True` to "
        "silence this warning and adopt the future behavior now.",
        FutureWarning,
        stacklevel=5 if is_series else 6,
    )

    tz = data.dt.tz
    as_i8 = data.dropna().values.view("i8")
    top = Timestamp(top)
    if top.tzinfo is None or tz is None:
        top = top.tz_localize(tz)
    else:
        # Already tz-aware: convert instead of localizing.
        top = top.tz_convert(tz)

    return Series(
        [
            n_valid,
            n_unique,
            top,
            freq,
            Timestamp(as_i8.min(), tz=tz),
            Timestamp(as_i8.max(), tz=tz),
        ],
        index=["count", "unique", "top", "freq", "first", "last"],
        name=data.name,
    )
211+
212+
def describe_timestamp_1d(data, percentiles) -> "Series":
    """Summarise a datetime64 series as if it were numeric (GH-30164).

    Parameters
    ----------
    data : Series
        Data to summarise.
    percentiles : list-like of numbers
        Quantile levels passed to ``data.quantile``; their formatted
        labels are placed between ``"min"`` and ``"max"`` in the result.

    Returns
    -------
    Series
        Values for count, mean, min, each requested percentile and max
        (no ``std`` entry), named after ``data``.
    """
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=labels, name=data.name)
235+
236+
def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
    """Pick the summary routine that matches the dtype of ``data``.

    Parameters
    ----------
    data : Series
        Data to summarise.
    percentiles : list-like of numbers
        Percentiles forwarded to the numeric and timestamp routines.
    datetime_is_numeric : bool
        When true, datetime64 data goes to the timestamp routine instead
        of the categorical one.
    is_series : bool
        True if the object originally being described is a Series, False
        if ``data`` is one column of a DataFrame. Forwarded to the
        categorical routine.

    Returns
    -------
    Series
    """
    dtype = data.dtype

    # Booleans are tested first so they are described as categorical
    # rather than reaching the numeric test.
    if is_bool_dtype(dtype):
        return describe_categorical_1d(data, is_series)

    # Numeric and timedelta data share the same summary.
    if is_numeric_dtype(data) or is_timedelta64_dtype(dtype):
        return describe_numeric_1d(data, percentiles)

    if datetime_is_numeric and is_datetime64_any_dtype(dtype):
        return describe_timestamp_1d(data, percentiles)

    # Everything else, including datetimes when datetime_is_numeric is
    # false, is described as categorical.
    return describe_categorical_1d(data, is_series)
0 commit comments