diff --git a/setup.py b/setup.py index 62ba0e7d8..cd1637561 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ tests_require=["pytest"], entry_points={ "console_scripts": [ + "csvMergeEnsembles = subscript.csvMergeEnsembles:main", "presentvalue = subscript.presentvalue:main", "sunsch = subscript.sunsch:main", ] diff --git a/subscript/csvMergeEnsembles.py b/subscript/csvMergeEnsembles.py new file mode 100755 index 000000000..5c6755411 --- /dev/null +++ b/subscript/csvMergeEnsembles.py @@ -0,0 +1,163 @@ +""" +Merge multiple CSV files. +""" + +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division + +import sys +import argparse +import re + +import pandas + + +class CustomFormatter( + argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter +): + """ + Multiple inheritance used for argparse to get both + defaults and raw description formatter + """ + + pass + + +def get_parser(): + """Construct parser object for csvMergeEnsembles""" + parser = argparse.ArgumentParser( + formatter_class=CustomFormatter, + description=""" +Merge multiple CSV files into one. Each row will be tagged by the filename +it came from in the column 'ensemble'. + +The columns in the ensembles need not be the same. Similar column names +will be merged, differing column names will be padded (with NaN) in the +ensemble where they don't exist. + +Note that the ordering of all columns becomes alphabetical after this merging. +""", + epilog="""If realization-*/iter-* is present in the filename, that numerical information +is attempted extracted and put into the columns Realization and Iteration +""", + ) + parser.add_argument("csvfiles", nargs="+", help="input csv files") + parser.add_argument( + "-o", + "--output", + type=str, + help="name of output csv file. Use - or stdout to dump output to stdout.", + default="merged.csv", + ) + parser.add_argument( + "--keepconstantcolumns", + action="store_true", + help="Keep constant columns", + default=False, + ) + parser.add_argument( + "--filecolumn", + type=str, + help="Name of column containing original filename", + default="ensemble", + ) + parser.add_argument( + "-q", + "--quiet", + action="store_true", + help="Suppress non-critical output", + default=False, + ) + return parser + + +def main(): + """Entry point from command line""" + parser = get_parser() + args = parser.parse_args() + quiet = args.output == "-" or args.output == "stdout" or args.quiet + + ens = pandas.DataFrame() + for csvfile in args.csvfiles: + if not quiet: + print(" ** Loading " + csvfile + "...") + try: + ensnew = pandas.read_csv(csvfile) + if not quiet: + print(ensnew.info()) + + ensnew[args.filecolumn] = pandas.Series( + csvfile.replace(".csv", ""), index=ensnew.index + ) + realregex = r".*realization-(\d*)/" + iterregex = r".*iter-(\d*)/" + + if re.match(realregex, csvfile): + # We don't use the column name "Realization" yet, + # because it might exist in some of the + # input files, but later on, we will copy it to "Realization" + # if it doesn't exist in the end + ensnew[args.filecolumn + "-realization"] = re.match( + realregex, csvfile + ).group(1) + if re.match(iterregex, csvfile): + ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group( + 1 + ) + + # Concatenation is done one frame at at a time. + # This makes concatenation slower, but more memory efficient. + ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True) + # (the indices in these csv files are just the row number, + # which doesn't mean anything + # in our data, therefore we should "ignore_index".) + if not quiet: + print(" ------------------ ") + except IOError: + if not quiet: + print("WARNING: " + csvfile + " not found.") + except pandas.errors.EmptyDataError: + if not quiet: + print("WARNING: " + csvfile + " seems empty, no data found.") + + if not args.keepconstantcolumns: + columnstodelete = [] + for col in ens.columns: + if len(ens[col].unique()) == 1: + columnstodelete.append(col) + if not quiet: + print(" Dropping constant columns " + str(columnstodelete)) + ens.drop(columnstodelete, inplace=True, axis=1) + + # Copy realization column if its only source is the filename. + if ( + "Realization" not in ens.columns + and args.filecolumn + "-realization" in ens.columns + ): + ens["Realization"] = ens[args.filecolumn + "-realization"] + # Ditto for iteration + if "Iter" not in ens.columns and args.filecolumn + "-iter" in ens.columns: + ens["Iter"] = ens[args.filecolumn + "-iter"] + + if ens.empty: + print("ERROR: No data to output.") + sys.exit(1) + + if not quiet: + print(" ** Merged ensemble data:") + print(ens.info()) + + print(" ** Exporting csv data to " + args.output) + + if args.output == "-" or args.output == "stdout": + ens.to_csv(sys.stdout, index=False) + else: + ens.to_csv(path_or_buf=args.output, index=False) + + if not quiet: + print(" - Finished writing to " + args.output) + + +if __name__ == "__main__": + main() diff --git a/subscript/tests/test_csvMergeEnsembles.py b/subscript/tests/test_csvMergeEnsembles.py new file mode 100644 index 000000000..0d0dd64fa --- /dev/null +++ b/subscript/tests/test_csvMergeEnsembles.py @@ -0,0 +1,59 @@ +"""Test csvMergeEnsembles""" +from __future__ import absolute_import + +import os +import sys + +import pandas as pd + +from .. import csvMergeEnsembles + + +def test_main_merge(): + """Test command line interface for csvMergeEnsembles""" + + test_csv_1 = "foo.csv" + test_csv_2 = "bar.csv" + merged_csv = "merged.csv" + + # Dump test data to disk as CSV first: + pd.DataFrame( + columns=["Realization", "FOO", "CONST"], data=[[0, 10, 1], [1, 20, 1]] + ).to_csv(test_csv_1, index=False) + pd.DataFrame( + columns=["Realization", "BAR", "CONST"], data=[[0, 30, 1], [1, 40, 1]] + ).to_csv(test_csv_2, index=False) + + sys.argv = ["csvMergeEnsembles", test_csv_1, test_csv_2, "-q", "-o", merged_csv] + csvMergeEnsembles.main() + merged = pd.read_csv(merged_csv) + + assert len(merged) == 4 + assert len(merged.columns) == 4 # 3 unique in input, and 1 extra. + assert test_csv_1.replace(".csv", "") in merged.ensemble.unique() + assert test_csv_2.replace(".csv", "") in merged.ensemble.unique() + assert len(merged.ensemble.unique()) == 2 + + # Test --keepconstantcolumns + sys.argv = [ + "csvMergeEnsembles", + test_csv_1, + test_csv_2, + "--keepconstantcolumns", + "-q", + "-o", + merged_csv, + ] + csvMergeEnsembles.main() + merged = pd.read_csv(merged_csv) + + assert len(merged) == 4 + assert len(merged.columns) == 5 # Also the constant column + + # Cleanup + if os.path.exists(merged_csv): + os.unlink(merged_csv) + if os.path.exists(test_csv_1): + os.unlink(test_csv_1) + if os.path.exists(test_csv_2): + os.unlink(test_csv_2)