ENH: pd.DataFrame.info() to show line numbers GH17304

pratapvardhan · pratapvardhan · commit 89a6a012ab7a · 2018-01-06T22:27:31.000+05:30
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -429,3 +429,4 @@ Other
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
 - :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`)
+- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1809,50 +1809,59 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
         lines.append(self.index.summary())
 
         if len(self.columns) == 0:
-            lines.append('Empty %s' % type(self).__name__)
+            lines.append('Empty {name}'.format(name=type(self).__name__))
             _put_lines(buf, lines)
             return
 
         cols = self.columns
+        cols_count = len(cols)
 
         # hack
         if max_cols is None:
-            max_cols = get_option('display.max_info_columns',
-                                  len(self.columns) + 1)
+            max_cols = get_option('display.max_info_columns', cols_count + 1)
 
         max_rows = get_option('display.max_info_rows', len(self) + 1)
 
         if null_counts is None:
-            show_counts = ((len(self.columns) <= max_cols) and
+            show_counts = ((cols_count <= max_cols) and
                            (len(self) < max_rows))
         else:
             show_counts = null_counts
-        exceeds_info_cols = len(self.columns) > max_cols
+        exceeds_info_cols = cols_count > max_cols
 
         def _verbose_repr():
-            lines.append('Data columns (total %d columns):' %
-                         len(self.columns))
-            space = max(len(pprint_thing(k)) for k in self.columns) + 4
+            lines.append('Data columns (total '
+                         '{count} columns):'.format(count=cols_count))
+            space = max([len(pprint_thing(k)) for k in cols])
+            space = max(space, len(pprint_thing('Column'))) + 4
+            space_num = len(pprint_thing(cols_count))
+            space_num = max(space_num, len(pprint_thing('Index'))) + 2
             counts = None
 
-            tmpl = "%s%s"
+            header = _put_str('Index', space_num) + _put_str('Column', space)
+            tmpl = '{count}{dtype}'
             if show_counts:
                 counts = self.count()
                 if len(cols) != len(counts):  # pragma: no cover
-                    raise AssertionError('Columns must equal counts (%d != %d)'
-                                         % (len(cols), len(counts)))
-                tmpl = "%s non-null %s"
-
+                    raise AssertionError(
+                        'Columns must equal counts '
+                        '({cols_count} != {count})'.format(
+                            cols_count=cols_count, count=len(counts)))
+                header += 'Non-Null Count'
+                tmpl = '{count} non-null {dtype}'
+
+            lines.append(header)
             dtypes = self.dtypes
-            for i, col in enumerate(self.columns):
+            for i, col in enumerate(cols):
                 dtype = dtypes.iloc[i]
                 col = pprint_thing(col)
-
-                count = ""
+                line_no = _put_str(' {num}'.format(num=i), space_num)
+                count = ''
                 if show_counts:
                     count = counts.iloc[i]
 
-                lines.append(_put_str(col, space) + tmpl % (count, dtype))
+                lines.append(line_no + _put_str(col, space) +
+                             tmpl.format(count=count, dtype=dtype))
 
         def _non_verbose_repr():
             lines.append(self.columns.summary(name='Columns'))
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
@@ -239,8 +239,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self):
         frame.info(buf=io)
         io.seek(0)
         lines = io.readlines()
-        assert 'a    1 non-null int64\n' == lines[3]
-        assert 'a    1 non-null float64\n' == lines[4]
+        assert ' 0     a         1 non-null int64\n' == lines[4]
+        assert ' 1     a         1 non-null float64\n' == lines[5]
 
     def test_info_shows_column_dtypes(self):
         dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
@@ -254,20 +254,21 @@ def test_info_shows_column_dtypes(self):
         df.info(buf=buf)
         res = buf.getvalue()
         for i, dtype in enumerate(dtypes):
-            name = '%d    %d non-null %s' % (i, n, dtype)
+            name = '%s         %d non-null %s' % (i, n, dtype)
+
             assert name in res
 
     def test_info_max_cols(self):
         df = DataFrame(np.random.randn(10, 5))
-        for len_, verbose in [(5, None), (5, False), (10, True)]:
+        for len_, verbose in [(5, None), (5, False), (11, True)]:
             # For verbose always      ^ setting  ^ summarize ^ full output
             with option_context('max_info_columns', 4):
                 buf = StringIO()
                 df.info(buf=buf, verbose=verbose)
                 res = buf.getvalue()
                 assert len(res.strip().split('\n')) == len_
 
-        for len_, verbose in [(10, None), (5, False), (10, True)]:
+        for len_, verbose in [(11, None), (5, False), (11, True)]:
 
             # max_cols no exceeded
             with option_context('max_info_columns', 5):
@@ -276,7 +277,7 @@ def test_info_max_cols(self):
                 res = buf.getvalue()
                 assert len(res.strip().split('\n')) == len_
 
-        for len_, max_cols in [(10, 5), (5, 4)]:
+        for len_, max_cols in [(11, 5), (5, 4)]:
             # setting truncates
             with option_context('max_info_columns', 4):
                 buf = StringIO()

Original file line number	Diff line number	Diff line change
`@@ -429,3 +429,4 @@ Other`
`429`	`429`
`430`	`430`	- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
`431`	`431`	- :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`)
	`432`	+- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`)