diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e58f33bd..fb84ed66 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -114,6 +114,17 @@ jobs: version: 16 build_type: Debug name: ubuntu-debug-clang + # Linux arm + - os: ubuntu-24.04-arm + compiler: clang + version: 16 + build_type: Release + name: ubuntu-arm64-release-clang + - os: ubuntu-24.04-arm + compiler: clang + version: 16 + build_type: Debug + name: ubuntu-arm64-debug-clang # macOS - os: macos-14 compiler: apple-clang @@ -270,17 +281,19 @@ jobs: strategy: fail-fast: false matrix: - target: - - ubuntu-release-gcc - - ubuntu-release-clang - - ubuntu-debug-gcc - - ubuntu-debug-clang - - macos-release-appleclang - - macos-debug-appleclang - - asan - - ubsan - - tsan - runs-on: "${{ contains(matrix.target, 'mac') && 'macos-14' || 'ubuntu-latest' }}" + include: + - { target: ubuntu-release-gcc, os: ubuntu-latest } + - { target: ubuntu-release-clang, os: ubuntu-latest } + - { target: ubuntu-debug-gcc, os: ubuntu-latest } + - { target: ubuntu-debug-clang, os: ubuntu-latest } + - { target: ubuntu-arm64-release-clang, os: ubuntu-24.04-arm } + - { target: ubuntu-arm64-debug-clang, os: ubuntu-24.04-arm } + - { target: macos-release-appleclang, os: macos-14 } + - { target: macos-debug-appleclang, os: macos-14 } + - { target: asan, os: ubuntu-latest } + - { target: ubsan, os: ubuntu-latest } + - { target: tsan, os: ubuntu-latest } + runs-on: ${{ matrix.os }} steps: - uses: actions/download-artifact@v4 @@ -301,17 +314,19 @@ jobs: strategy: fail-fast: false matrix: - target: - - ubuntu-release-gcc - - ubuntu-release-clang - - ubuntu-debug-gcc - - ubuntu-debug-clang - - macos-release-appleclang - - macos-debug-appleclang - - asan - - ubsan - - tsan - runs-on: "${{ contains(matrix.target, 'mac') && 'macos-14' || 'ubuntu-latest' }}" + include: + - { target: ubuntu-release-gcc, os: ubuntu-latest } + - { target: ubuntu-release-clang, os: ubuntu-latest } + - { target: ubuntu-debug-gcc, os: ubuntu-latest } + - { target: ubuntu-debug-clang, os: ubuntu-latest } + - { target: ubuntu-arm64-release-clang, os: ubuntu-24.04-arm } + - { target: ubuntu-arm64-debug-clang, os: ubuntu-24.04-arm } + - { target: macos-release-appleclang, os: macos-14 } + - { target: macos-debug-appleclang, os: macos-14 } + - { target: asan, os: ubuntu-latest } + - { target: ubsan, os: ubuntu-latest } + - { target: tsan, os: ubuntu-latest } + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index d2c553f8..dfbe0066 100644 --- a/.gitignore +++ b/.gitignore @@ -101,4 +101,4 @@ Makefile # Ignore Python virtual environment venv -.venv +.venv* diff --git a/CMakeLists.txt b/CMakeLists.txt index f21e90e3..0391c969 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,8 @@ find_package(argparse REQUIRED CONFIG) find_package(vincentlaucsb-csv-parser REQUIRED CONFIG) find_package(indicators REQUIRED CONFIG) find_package(PkgConfig REQUIRED) +find_package(jsoncons REQUIRED) + pkg_search_module(FFTW REQUIRED IMPORTED_TARGET fftw3) # ----------------------------- Cartogram Library --------------------------- @@ -36,6 +38,7 @@ target_link_libraries(cartogram_lib vincentlaucsb-csv-parser::vincentlaucsb-csv-parser indicators::indicators PkgConfig::FFTW + jsoncons ) if(CMAKE_BUILD_TYPE STREQUAL Release) diff --git a/conanfile.txt b/conanfile.txt index eeadb375..7be813c4 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -7,6 +7,7 @@ nlohmann_json/3.11.3 boost/1.83.0 fftw/3.3.10 pkgconf/2.2.0 +jsoncons/1.3.0 [tool_requires] pkgconf/2.2.0 diff --git a/src/cartogram_info/read_csv.cpp b/src/cartogram_info/read_csv.cpp index 3a175ff3..9d40c5b1 100644 --- a/src/cartogram_info/read_csv.cpp +++ b/src/cartogram_info/read_csv.cpp @@ -1,32 +1,64 @@ #include "cartogram_info.hpp" -#include "csv.hpp" #include "string_to_decimal_converter.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ltrim_ascii(std::string &s) +{ + size_t i = 0; + while (i < s.size()) { + unsigned char ch = static_cast(s[i]); + if ( + ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\v' || + ch == '\f') + ++i; + else + break; + } + if (i) + s.erase(0, i); +} -static int extract_color_col_index( - const csv::CSVReader &reader, - const std::string color_col_name) +static inline void rtrim_ascii(std::string &s) { - // Find index of column with color specifiers. If no color column header was - // passed with the command-line flag --color, the header is assumed to be - // "Color". - int color_col = reader.index_of(color_col_name); - - // If the default "Color" header cannot be found, try again using the British - // spelling "Colour" - if (color_col == csv::CSV_NOT_FOUND && color_col_name == "Color") { - color_col = reader.index_of("Colour"); + while (!s.empty()) { + unsigned char ch = static_cast(s.back()); + if ( + ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\v' || + ch == '\f') + s.pop_back(); + else + break; } +} - return color_col; +static inline std::string normalize_id_token(std::string s) +{ + // Strip UTF-8 BOM if present + if ( + s.size() >= 3 && static_cast(s[0]) == 0xEF && + static_cast(s[1]) == 0xBB && + static_cast(s[2]) == 0xBF) { + s.erase(0, 3); + } + ltrim_ascii(s); + rtrim_ascii(s); + return s; } static void check_validity_of_area_str(const std::string &area_as_str) { - std::string area_process_str = area_as_str; - - if (area_process_str.empty()) { - area_process_str = "NA"; - } + std::string area_process_str = area_as_str.empty() ? "NA" : area_as_str; if (!StringToDecimalConverter::is_str_valid_characters(area_process_str)) { std::cerr @@ -37,7 +69,7 @@ static void check_validity_of_area_str(const std::string &area_as_str) } if ( - !StringToDecimalConverter::is_str_NA(area_process_str) and + !StringToDecimalConverter::is_str_NA(area_process_str) && !StringToDecimalConverter::is_str_correct_format(area_process_str)) { std::cerr << "ERROR: Invalid area string format: " << area_process_str << std::endl; @@ -52,20 +84,13 @@ static void check_validity_of_area_str(const std::string &area_as_str) static std::string process_inset_pos_str(const std::string &inset_pos_as_str) { - std::string inset_pos = inset_pos_as_str; - - if (inset_pos.empty()) { - inset_pos = "C"; - } - - inset_pos = std::toupper(inset_pos[0], std::locale()); - - if (inset_pos == "U") { + std::string inset_pos = inset_pos_as_str.empty() ? "C" : inset_pos_as_str; + inset_pos[0] = + static_cast(std::toupper(static_cast(inset_pos[0]))); + if (inset_pos == "U") inset_pos = "T"; - } - if (inset_pos == "D") { + if (inset_pos == "D") inset_pos = "B"; - } return inset_pos; } @@ -82,79 +107,143 @@ static void check_validity_of_inset_pos( } } +using jsoncons::ojson; +namespace jcc = jsoncons::csv; + +struct HeaderMap { + std::vector raw; // as in file + std::vector norm; // trimmed/BOM-stripped view + + int index_of_norm(const std::string &name) const + { + std::string target = name; + ltrim_ascii(target); + rtrim_ascii(target); + // Do not strip BOM from 'name'; caller passes canonical tokens like + // "Color" + auto it = std::find(norm.begin(), norm.end(), target); + return it == norm.end() ? -1 : static_cast(it - norm.begin()); + } + + const std::string &raw_at(size_t i) const + { + return raw[i]; + } + + const std::string &norm_at(size_t i) const + { + return norm[i]; + } +}; + +static ojson load_csv_rows(const std::string &path) +{ + std::ifstream is(path); + if (!is) { + std::cerr << "ERROR: Cannot open CSV file: " << path << std::endl; + std::exit(17); + } + jcc::csv_options opts; + opts.assume_header(true).ignore_empty_lines(true).trim(true).infer_types( + false); // keep all fields as strings for stable behavior + try { + return jcc::decode_csv(is, opts); // array of row-objects + } catch (const std::exception &e) { + std::cerr << "ERROR: Failed to parse CSV: " << e.what() << std::endl; + std::exit(17); + } +} + +static HeaderMap headers_from(const ojson &rows) +{ + HeaderMap hm; + if (!rows.is_array() || rows.empty()) + return hm; + + const ojson &first = rows.at(0); + for (const auto &kv : first.object_range()) { + hm.raw.emplace_back(std::string(kv.key())); + hm.norm.emplace_back(normalize_id_token(std::string(kv.key()))); + } + return hm; +} + +static int extract_color_col_index( + const HeaderMap &hm, + const std::string &color_col_name) +{ + int idx = hm.index_of_norm(color_col_name); + if (idx < 0 && color_col_name == "Color") + idx = hm.index_of_norm("Colour"); + return idx; +} + // Find the matching ID columns in both the CSV and GeoJSON file // Returns the header name of the matching ID column in the GeoJSON file std::string CartogramInfo::match_id_columns( const std::optional &id_col) { - csv::CSVReader reader(args_.visual_file_name); - std::string csv_id_header; + ojson rows = load_csv_rows(args_.visual_file_name); + const HeaderMap hm = headers_from(rows); + std::string csv_id_header_norm; - // Copies what unique_properties_map_ does except it stores the values in a - // set instead. Should find a more efficient way to do this. + // Build normalized sets from GeoJSON unique properties std::map> geojson_properties_info; for (auto &[key, properties_vec] : unique_properties_map_) { - std::set properties_set( - properties_vec.begin(), - properties_vec.end()); - geojson_properties_info[key] = properties_set; + std::set s; + for (const auto &v : properties_vec) + s.insert(normalize_id_token(v)); + geojson_properties_info[key] = std::move(s); } + std::string matching_id_header; - // If the user has specified a header as the ID column, check that one first. - if (id_col) { - std::set csv_id_set; - for (auto row = reader.begin(); row != reader.end(); row++) - csv_id_set.insert((*row)[*id_col].get()); + auto try_match = [&](const std::string &header_norm) -> bool { + int idx = hm.index_of_norm(header_norm); + if (idx < 0) + return false; + const std::string &raw_key = hm.raw_at(static_cast(idx)); + + std::set data_set; + size_t row_count = rows.size(); + for (const auto &r : rows.array_range()) { + std::string cell; + if (r.contains(raw_key)) { + const ojson &v = r.at(raw_key); + // Values are strings because infer_types(false) + cell = v.is_string() ? v.as_string() : std::string(); + } + data_set.insert(normalize_id_token(cell)); + } + + if (data_set.size() != row_count) + return false; - // Check through each of the GEOJSON properties, see if any exactly match - // the given CSV ID column data for (auto &[key, value_set] : geojson_properties_info) { - if (csv_id_set == value_set) { + if (data_set == value_set) { matching_id_header = key; - csv_id_header = *id_col; - break; + csv_id_header_norm = header_norm; + return true; } } - if (matching_id_header.empty()) + return false; + }; + + if (id_col) { + if (!try_match(*id_col)) { std::cerr << "Given ID header " << *id_col << " does not match with any GeoJSON properties. " "Finding next best matching ID column..." << std::endl; + } } - // If there is no user given ID header or the header does not match with any - // GEOJSON properties, iterate through each of the CSV columns to find the - // matching ID header. if (matching_id_header.empty()) { - std::vector column_headers = reader.get_col_names(); - - for (std::string &header : column_headers) { - // The begin() iterator for CSVReader seems to not be working correctly - // (issue here: https://github.com/vincentlaucsb/csv-parser/issues/261). - // As such, a new reader has to be declared in each loop in order to - // properly iterate through the rows. - csv::CSVReader loop_reader(args_.visual_file_name); - std::set data_set; - for (auto row = loop_reader.begin(); row != loop_reader.end(); row++) { - data_set.insert((*row)[header].get()); - } - - // If the set size is less than the number of rows then skip the column - // as it cannot be the ID column. - if (data_set.size() < loop_reader.n_rows()) - continue; - - for (auto &[key, value_set] : geojson_properties_info) { - if (data_set == value_set) { - matching_id_header = key; - csv_id_header = header; - break; - } - } - - if (!matching_id_header.empty()) + for (const auto &h : hm.norm) { + if (try_match(h)) { + std::cerr << "Matched ID column: " << h << std::endl; break; + } } } @@ -165,7 +254,13 @@ std::string CartogramInfo::match_id_columns( std::exit(16); } - id_col_ = reader.index_of(csv_id_header); + int idx = hm.index_of_norm(csv_id_header_norm); + if (idx < 0) { + std::cerr << "ERROR: Internal error determining ID column index." + << std::endl; + std::exit(16); + } + id_col_ = idx; // index into normalized header list return matching_id_header; } @@ -191,7 +286,8 @@ void CartogramInfo::update_id_header_info( const std::string csv_id = geojson_id_to_csv_id.at(geojson_id); new_gd_to_inset[csv_id] = inset_pos; } - gd_to_inset_ = new_gd_to_inset; + + gd_to_inset_ = std::move(new_gd_to_inset); for (InsetState &inset_state : inset_states_) { inset_state.update_gd_ids(geojson_id_to_csv_id); @@ -205,9 +301,9 @@ static void check_validity_of_csv_ids( const std::vector &initial_id_order) { std::vector csv_ids; - for (const auto &[id, data] : csv_data) { + csv_ids.reserve(csv_data.size()); + for (const auto &[id, _] : csv_data) csv_ids.push_back(id); - } for (const auto &id : csv_ids) { if ( @@ -225,7 +321,6 @@ static void check_validity_of_csv_ids( << std::endl; csv_data[id] = {{"area", "NA"}, {"color", ""}, {"label", ""}, {"inset_pos", "C"}}; - // std::exit(22); } } } @@ -259,13 +354,12 @@ void CartogramInfo::relocate_geodivs_based_on_inset_pos( // Add color and label info, if present std::string color = gd_info.at("color"); - if (!color.empty()) { + if (!color.empty()) new_inset_state.insert_color(id, color); - } + const std::string &label = gd_info.at("label"); - if (!label.empty()) { + if (!label.empty()) new_inset_state.insert_label(id, label); - } } new_inset_states.emplace_back(std::move(new_inset_state)); } @@ -280,26 +374,20 @@ static bool is_point_as_separator( const std::map> &csv_data) { std::vector area_strs; - for (const auto &[id, data] : csv_data) { + area_strs.reserve(csv_data.size()); + for (const auto &[_, data] : csv_data) area_strs.push_back(data.at("area")); - } - - if (StringToDecimalConverter::is_comma_as_separator(area_strs)) { - return false; - } - - return true; + return !StringToDecimalConverter::is_comma_as_separator(area_strs); } static void process_area_strs( std::map> &csv_data) { const bool uses_point_separator = is_point_as_separator(csv_data); - for (auto &[id, data] : csv_data) { + for (auto &[_, data] : csv_data) { std::string &area_as_str = data.at("area"); - if (area_as_str.empty()) { + if (area_as_str.empty()) area_as_str = "NA"; - } area_as_str = StringToDecimalConverter::parse_str(area_as_str, uses_point_separator); } @@ -307,24 +395,55 @@ static void process_area_strs( void CartogramInfo::read_csv() { - csv::CSVReader reader(args_.visual_file_name); + ojson rows = load_csv_rows(args_.visual_file_name); + const HeaderMap hm = headers_from(rows); + + if (hm.norm.size() < 2) { + std::cerr + << "ERROR: CSV with >= 2 columns (IDs, target areas) required. Some " + "rows in your CSV may not have values for all columns" + << std::endl; + std::exit(17); + } const std::string new_id_header = match_id_columns(args_.id_col); - const int id_col = id_col_; - // Unless named through command-line argument, - // 2nd column is assumed to be target areas - const int area_col = args_.area_col ? reader.index_of(*args_.area_col) : 1; + const int id_col = id_col_; // index into hm.norm - // Defaults set in parse_arguments.cpp - const int inset_col = reader.index_of(args_.inset_col); // default: "Inset" - const int label_col = reader.index_of(args_.label_col); // default: "Label" + auto col_index = [&](const std::string &name) -> int { + return hm.index_of_norm(name); + }; - // default: "Color" | "Colour" - const int color_col = extract_color_col_index(reader, args_.color_col); + const int area_col = args_.area_col ? col_index(*args_.area_col) : 1; + if (area_col < 0) { + std::cerr + << "ERROR: CSV with >= 2 columns (IDs, target areas) required. Some " + "rows in your CSV may not have values for all columns" + << std::endl; + std::exit(17); + } + + const int inset_col = col_index(args_.inset_col); // default: "Inset" + const int label_col = col_index(args_.label_col); // default: "Label" + const int color_col = extract_color_col_index( + hm, + args_.color_col); // default: "Color" | "Colour" + + const std::string &id_key = hm.raw_at(static_cast(id_col)); + const std::string &area_key = hm.raw_at(static_cast(area_col)); + const std::string color_key = (color_col >= 0) + ? hm.raw_at(static_cast(color_col)) + : std::string(); + const std::string label_key = (label_col >= 0) + ? hm.raw_at(static_cast(label_col)) + : std::string(); + const std::string inset_key = (inset_col >= 0) + ? hm.raw_at(static_cast(inset_col)) + : std::string(); std::map> csv_data; - for (auto &row : reader) { - if (row.size() < 2) { + + for (const auto &r : rows.array_range()) { + if (!r.contains(id_key) || !r.contains(area_key)) { std::cerr << "ERROR: CSV with >= 2 columns (IDs, target areas) required. Some " "rows in your CSV may not have values for all columns" @@ -332,22 +451,21 @@ void CartogramInfo::read_csv() std::exit(17); } - const std::string id = row[static_cast(id_col)].get(); - const std::string area_as_str = row[static_cast(area_col)].get(); + std::string id = normalize_id_token(r.at(id_key).as_string()); + std::string area_as_str = r.at(area_key).as_string(); check_validity_of_area_str(area_as_str); - const std::string color = (color_col != csv::CSV_NOT_FOUND) - ? row[static_cast(color_col)].get() - : ""; + std::string color; + if (!color_key.empty() && r.contains(color_key)) + color = r.at(color_key).as_string(); - const std::string label = (label_col != csv::CSV_NOT_FOUND) - ? row[static_cast(label_col)].get() - : ""; + std::string label; + if (!label_key.empty() && r.contains(label_key)) + label = r.at(label_key).as_string(); - const std::string inset_pos_as_str = - (inset_col != csv::CSV_NOT_FOUND) - ? row[static_cast(inset_col)].get() - : "C"; + std::string inset_pos_as_str = "C"; + if (!inset_key.empty() && r.contains(inset_key)) + inset_pos_as_str = r.at(inset_key).as_string(); const std::string inset_pos = process_inset_pos_str(inset_pos_as_str); check_validity_of_inset_pos(inset_pos, id);