From 4d3a609bc619dda21880dee2477aaebff1b3364d Mon Sep 17 00:00:00 2001 From: Adam Jones <10088591+adamroyjones@users.noreply.github.com> Date: Wed, 17 Nov 2021 21:41:32 +0000 Subject: [PATCH 1/2] Describe `strip` as a parsing option instead of a generating option --- lib/csv.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/csv.rb b/lib/csv.rb index 72f339ac..5af23e8f 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -341,6 +341,7 @@ # liberal_parsing: false, # nil_value: nil, # empty_value: "", +# strip: false, # # For generating. # write_headers: nil, # quote_empty: true, @@ -348,7 +349,6 @@ # write_converters: nil, # write_nil_value: nil, # write_empty_value: "", -# strip: false, # } # # ==== Options for Parsing @@ -367,7 +367,7 @@ # - +skip_blanks+: Specifies whether blanks lines are to be ignored. # - +skip_lines+: Specifies how comments lines are to be recognized. # - +strip+: Specifies whether leading and trailing whitespace are -# to be stripped from fields.. +# to be stripped from fields. # - +liberal_parsing+: Specifies whether \CSV should attempt to parse # non-compliant data. # - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field. @@ -946,6 +946,7 @@ def initialize(message, line_number) liberal_parsing: false, nil_value: nil, empty_value: "", + strip: false, # For generating. write_headers: nil, quote_empty: true, @@ -953,7 +954,6 @@ def initialize(message, line_number) write_converters: nil, write_nil_value: nil, write_empty_value: "", - strip: false, }.freeze class << self @@ -1879,11 +1879,11 @@ def initialize(data, encoding: nil, nil_value: nil, empty_value: "", + strip: false, quote_empty: true, write_converters: nil, write_nil_value: nil, - write_empty_value: "", - strip: false) + write_empty_value: "") raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? if data.is_a?(String) From b86b3f47426eeaf546dfcbd9ea580c59c756747d Mon Sep 17 00:00:00 2001 From: Adam Jones <10088591+adamroyjones@users.noreply.github.com> Date: Wed, 17 Nov 2021 23:17:21 +0000 Subject: [PATCH 2/2] Handle ambiguities between col_sep and strip parsing options (#225) With Ruby 3.0.2 and csv 3.2.1, the file ```ruby require "csv" File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") } CSV.read("example.tsv", col_sep: "\t", strip: true) ``` produces the error ``` lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful message in line 1. (CSV::MalformedCSVError) ``` However, the CSV in this example is not malformed; instead, ambiguous options were provided to the parser. It is not obvious (to me) whether the string should be parsed as - `["foo\t\tbar"]`, - `["foo", "bar"]`, - `["foo", "", "bar"]`, or - `["foo", nil, "bar"]`. This commit adds code that raises an exception when this situation is encountered. Specifically, it checks if the column separator either ends with or starts with the characters that would be stripped away. This commit also adds unit tests and updates the documentation. --- lib/csv.rb | 5 +++-- lib/csv/parser.rb | 23 +++++++++++++++++++++++ test/csv/parse/test_strip.rb | 29 +++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/lib/csv.rb b/lib/csv.rb index 5af23e8f..2c47eadd 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -366,8 +366,9 @@ # - +header_converters+: Specifies the header converters to be used. # - +skip_blanks+: Specifies whether blanks lines are to be ignored. # - +skip_lines+: Specifies how comments lines are to be recognized. -# - +strip+: Specifies whether leading and trailing whitespace are -# to be stripped from fields. +# - +strip+: Specifies whether leading and trailing whitespace are to be +# stripped from fields. This must be compatible with +col_sep+; if it is not, +# then an +ArgumentError+ exception will be raised. # - +liberal_parsing+: Specifies whether \CSV should attempt to parse # non-compliant data. # - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field. diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb index 3334acfb..f87db3bb 100644 --- a/lib/csv/parser.rb +++ b/lib/csv/parser.rb @@ -361,6 +361,7 @@ def prepare prepare_skip_lines prepare_strip prepare_separators + validate_strip_and_col_sep_options prepare_quoted prepare_unquoted prepare_line @@ -531,6 +532,28 @@ def prepare_separators @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding)) end + # This method verifies that there are no (obvious) ambiguities with the + # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+ + # and +strip+ were both equal to +\t+, then there would be no clear way to + # parse the input. + def validate_strip_and_col_sep_options + return unless @strip + + if @strip.is_a?(String) + if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip) + raise ArgumentError, + "The provided strip (#{@escaped_strip}) and " \ + "col_sep (#{@escaped_column_separator}) options are incompatible." + end + else + if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator) + raise ArgumentError, + "The provided strip (true) and " \ + "col_sep (#{@escaped_column_separator}) options are incompatible." + end + end + end + def prepare_quoted if @quote_character @quotes = Regexp.new(@escaped_quote_character + diff --git a/test/csv/parse/test_strip.rb b/test/csv/parse/test_strip.rb index 3564fcb3..c5e35209 100644 --- a/test/csv/parse/test_strip.rb +++ b/test/csv/parse/test_strip.rb @@ -80,4 +80,33 @@ def test_do_not_strip_crlf %Q{"a" ,"b " \r\n}, strip: true)) end + + def test_col_sep_incompatible_true + message = "The provided strip (true) and " \ + "col_sep (\\t) options are incompatible." + assert_raise_with_message(ArgumentError, message) do + CSV.parse_line(%Q{"a"\t"b"\n}, + col_sep: "\t", + strip: true) + end + end + + def test_col_sep_incompatible_string + message = "The provided strip (\\t) and " \ + "col_sep (\\t) options are incompatible." + assert_raise_with_message(ArgumentError, message) do + CSV.parse_line(%Q{"a"\t"b"\n}, + col_sep: "\t", + strip: "\t") + end + end + + def test_col_sep_compatible_string + assert_equal( + ["a", "b"], + CSV.parse_line(%Q{\va\tb\v\n}, + col_sep: "\t", + strip: "\v") + ) + end end