311311 fields of each line as half-open intervals (i.e., [from, to[ ).
312312 String value 'infer' can be used to instruct the parser to try
313313 detecting the column specifications from the first 100 rows of
314- the data (default='infer').
314+ the data which are not being skipped via skiprows (default='infer').
315315widths : list of ints, optional
316316 A list of field widths which can be used instead of 'colspecs' if
317317 the intervals are contiguous.
@@ -2852,13 +2852,15 @@ class FixedWidthReader(BaseIterator):
28522852 A reader of fixed-width lines.
28532853 """
28542854
2855- def __init__ (self , f , colspecs , delimiter , comment ):
2855+ def __init__ (self , f , colspecs , delimiter , comment , skiprows = None ):
28562856 self .f = f
28572857 self .buffer = None
28582858 self .delimiter = '\r \n ' + delimiter if delimiter else '\n \r \t '
28592859 self .comment = comment
2860+ if skiprows is None :
2861+ skiprows = set ()
28602862 if colspecs == 'infer' :
2861- self .colspecs = self .detect_colspecs ()
2863+ self .colspecs = self .detect_colspecs (skiprows = skiprows )
28622864 else :
28632865 self .colspecs = colspecs
28642866
@@ -2875,20 +2877,34 @@ def __init__(self, f, colspecs, delimiter, comment):
28752877 raise TypeError ('Each column specification must be '
28762878 '2 element tuple or list of integers' )
28772879
2878- def get_rows (self , n ):
2879- rows = []
2880- for i , row in enumerate (self .f , 1 ):
2881- rows .append (row )
2882- if i >= n :
2880+ def get_rows (self , n , skiprows = None ):
2881+ """
2882+ We distinguish buffer_rows (every line read, skipped or not)
2883+ from the rows returned to detect_colspecs (the first <= n
2884+ non-skipped lines) because it's simpler to leave the other
2885+ locations with skiprows logic alone than to modify them to
2886+ deal with the fact that we skipped some rows here as well.
2887+ """
2888+ if skiprows is None :
2889+ skiprows = set ()
2890+ buffer_rows = []
2891+ detect_rows = []
2892+ for i , row in enumerate (self .f ):
2893+ if i not in skiprows :
2894+ detect_rows .append (row )
2895+ buffer_rows .append (row )
2896+ if len (detect_rows ) >= n :
28832897 break
2884- self .buffer = iter (rows )
2885- return rows
2898+ self .buffer = iter (buffer_rows )
2899+ return detect_rows
28862900
2887- def detect_colspecs (self , n = 100 ):
2901+ def detect_colspecs (self , n = 100 , skiprows = None ):
28882902 # Regex escape the delimiters
28892903 delimiters = '' .join ([r'\%s' % x for x in self .delimiter ])
28902904 pattern = re .compile ('([^%s]+)' % delimiters )
2891- rows = self .get_rows (n )
2905+ rows = self .get_rows (n , skiprows )
2906+ if not rows :
2907+ raise EmptyDataError ("No rows from which to infer column width" )
28922908 max_len = max (map (len , rows ))
28932909 mask = np .zeros (max_len + 1 , dtype = int )
28942910 if self .comment is not None :
@@ -2899,7 +2915,8 @@ def detect_colspecs(self, n=100):
28992915 shifted = np .roll (mask , 1 )
29002916 shifted [0 ] = 0
29012917 edges = np .where ((mask ^ shifted ) == 1 )[0 ]
2902- return list (zip (edges [::2 ], edges [1 ::2 ]))
2918+ edge_pairs = list (zip (edges [::2 ], edges [1 ::2 ]))
2919+ return edge_pairs
29032920
29042921 def __next__ (self ):
29052922 if self .buffer is not None :
@@ -2924,9 +2941,8 @@ class FixedWidthFieldParser(PythonParser):
29242941 def __init__ (self , f , ** kwds ):
29252942 # Support iterators, convert to a list.
29262943 self .colspecs = kwds .pop ('colspecs' )
2927-
29282944 PythonParser .__init__ (self , f , ** kwds )
29292945
29302946 def _make_reader (self , f ):
29312947 self .data = FixedWidthReader (f , self .colspecs , self .delimiter ,
2932- self .comment )
2948+ self .comment , self . skiprows )
0 commit comments