From 89a3be89cc941e418ae604436e18c149f2b64fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Berland?= Date: Thu, 3 Oct 2019 10:54:11 +0200 Subject: [PATCH 1/2] Add original csvMergeEnsembles from resscript --- subscript/csvMergeEnsembles.py | 120 +++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100755 subscript/csvMergeEnsembles.py diff --git a/subscript/csvMergeEnsembles.py b/subscript/csvMergeEnsembles.py new file mode 100755 index 000000000..fc07f7585 --- /dev/null +++ b/subscript/csvMergeEnsembles.py @@ -0,0 +1,120 @@ +#!/bin/env python +# +# Usage: +# csvMergeEnsembles.py ensemble1.csv ensemble2.csv [ensemble3.csv [...]] +# +# Given csv files (typically ensembles produced by ERT), it will +# append all the data rows of the second ensemble to the first ensemble. +# The data will be exported to the file "merged.csv" - rename it afterwards +# +# A new column is added called 'ensemble', which will contain the name of the +# ensemble (taken from the filename you provide). +# +# The columns in the ensembles need not be the same. Similar column names +# will be merged, differing column names will be padded (with NaN) in the +# ensemble where they don't exist. +# +# Note that the ordering of all columns becomes alphabetical after this merging. +# +# Author: Haavard Berland, OSE PTC RP, Sept/Oct 2015, havb@statoil.com + +import sys +import pandas +import argparse +import re +import resscript.header as header + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("csvfiles", nargs="+", help="input csv files") +parser.add_argument("-o", "--output", type=str, + help="name of output csv file. 
Use - or stdout to dump output to stdout.", default="merged.csv") +parser.add_argument("--keepconstantcolumns", action='store_true', help="Keep constant columns", default=False) +parser.add_argument("--filecolumn", type=str, help="Name of column containing original filename", default="ensemble") +parser.add_argument("-q", "--quiet", action='store_true', help="Suppress non-critical output", default=False) + +args = parser.parse_args() + +if args.output == "-" or args.output == "stdout": + quiet = True +else: + quiet = False + +if args.quiet: + quiet = True + + +if not quiet: + header.compose("csvMergeEnsembles.py", + "01.04.2015", + ["Haavard Berland"], + ["havb@statoil.com"], + ["-h for help, or check wiki"], + "Merge multiple CSV exports from ERT into one CSV file") + + +ens = pandas.DataFrame() +for csvfile in args.csvfiles: + if not quiet: + print " ** Loading "+ csvfile + "..." + try: + ensnew = pandas.read_csv(csvfile) + if not quiet: + print ensnew.info() + + ensnew[args.filecolumn] = pandas.Series(csvfile.replace(".csv",""), index=ensnew.index) + realregex = ".*realization-(\d*)/" + iterregex = ".*iter-(\d*)/" + + if re.match(realregex, csvfile): + # We don't use the column name "Realization" yet, because it might exist in some of the + # input files, but later on, we will copy it to "Realization" if it doesn't exist in the end + ensnew[args.filecolumn + "-realization"] = re.match(realregex, csvfile).group(1) + if re.match(iterregex, csvfile): + ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group(1) + + ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True) + # (the indices in these csv files are just the row number, which doesn't mean anything + # in our data, therefore we should "ignore_index".) + if not quiet: + print " ------------------ " + except IOError: + if not quiet: + print "WARNING: " + csvfile + " not found." 
+ except pandas.errors.EmptyDataError: + if not quiet: + print "WARNING: " + csvfile + " seems empty, no data found." + +if not args.keepconstantcolumns: + columnstodelete = [] + for col in ens.columns: + if len(ens[col].unique()) == 1: + columnstodelete.append(col) + if not quiet: + print " Dropping constant columns " + str(columnstodelete) + ens.drop(columnstodelete, inplace=True, axis=1) + +# Copy realization column if its only source is the filename. +if not "Realization" in ens.columns and args.filecolumn+"-realization" in ens.columns: + ens["Realization"] = ens[args.filecolumn + "-realization"] +# Ditto for iteration +if not "Iter" in ens.columns and args.filecolumn+"-iter" in ens.columns: + ens["Iter"] = ens[args.filecolumn + "-iter"] + + +if len(ens.index) == 0: + print "ERROR: No data to output." + sys.exit(1) + +if not quiet: + print " ** Merged ensemble data:" + print ens.info() + + print " ** Exporting csv data to " + args.output + +if args.output == "-" or args.output == "stdout": + ens.to_csv(sys.stdout, index=False) +else: + ens.to_csv(path_or_buf=args.output, index=False) + +if not quiet: + print " - Finished writing to " + args.output From 486694183fc672d9a151ad114197928d5866feef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Berland?= Date: Thu, 3 Oct 2019 11:19:09 +0200 Subject: [PATCH 2/2] Port csvMergeEnsembles to subscript, add simple test --- setup.py | 1 + subscript/csvMergeEnsembles.py | 259 +++++++++++++--------- subscript/tests/test_csvMergeEnsembles.py | 59 +++++ 3 files changed, 211 insertions(+), 108 deletions(-) create mode 100644 subscript/tests/test_csvMergeEnsembles.py diff --git a/setup.py b/setup.py index 62ba0e7d8..cd1637561 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ tests_require=["pytest"], entry_points={ "console_scripts": [ + "csvMergeEnsembles = subscript.csvMergeEnsembles:main", "presentvalue = subscript.presentvalue:main", "sunsch = subscript.sunsch:main", ] diff --git a/subscript/csvMergeEnsembles.py 
b/subscript/csvMergeEnsembles.py index fc07f7585..5c6755411 100755 --- a/subscript/csvMergeEnsembles.py +++ b/subscript/csvMergeEnsembles.py @@ -1,120 +1,163 @@ -#!/bin/env python -# -# Usage: -# csvMergeEnsembles.py ensemble1.csv ensemble2.csv [ensemble3.csv [...]] -# -# Given csv files (typically ensembles produced by ERT), it will -# append all the data rows of the second ensemble to the first ensemble. -# The data will be exported to the file "merged.csv" - rename it afterwards -# -# A new column is added called 'ensemble', which will contain the name of the -# ensemble (taken from the filename you provide). -# -# The columns in the ensembles need not be the same. Similar column names -# will be merged, differing column names will be padded (with NaN) in the -# ensemble where they don't exist. -# -# Note that the ordering of all columns becomes alphabetical after this merging. -# -# Author: Haavard Berland, OSE PTC RP, Sept/Oct 2015, havb@statoil.com +""" +Merge multiple CSV files. +""" + +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division import sys -import pandas import argparse import re -import resscript.header as header -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("csvfiles", nargs="+", help="input csv files") -parser.add_argument("-o", "--output", type=str, - help="name of output csv file. 
Use - or stdout to dump output to stdout.", default="merged.csv") -parser.add_argument("--keepconstantcolumns", action='store_true', help="Keep constant columns", default=False) -parser.add_argument("--filecolumn", type=str, help="Name of column containing original filename", default="ensemble") -parser.add_argument("-q", "--quiet", action='store_true', help="Suppress non-critical output", default=False) - -args = parser.parse_args() +import pandas -if args.output == "-" or args.output == "stdout": - quiet = True -else: - quiet = False -if args.quiet: - quiet = True +class CustomFormatter( + argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter +): + """ + Multiple inheritance used for argparse to get both + defaults and raw description formatter + """ + + pass + + +def get_parser(): + """Construct parser object for csvMergeEnsembles""" + parser = argparse.ArgumentParser( + formatter_class=CustomFormatter, + description=""" +Merge multiple CSV files into one. Each row will be tagged by the filename +it came from in the column 'ensemble'. + +The columns in the ensembles need not be the same. Similar column names +will be merged, differing column names will be padded (with NaN) in the +ensemble where they don't exist. + +Note that the ordering of all columns becomes alphabetical after this merging. +""", + epilog="""If realization-*/iter-* is present in the filename, that numerical information +is attempted extracted and put into the columns Realization and Iteration +""", + ) + parser.add_argument("csvfiles", nargs="+", help="input csv files") + parser.add_argument( + "-o", + "--output", + type=str, + help="name of output csv file. 
Use - or stdout to dump output to stdout.", + default="merged.csv", + ) + parser.add_argument( + "--keepconstantcolumns", + action="store_true", + help="Keep constant columns", + default=False, + ) + parser.add_argument( + "--filecolumn", + type=str, + help="Name of column containing original filename", + default="ensemble", + ) + parser.add_argument( + "-q", + "--quiet", + action="store_true", + help="Suppress non-critical output", + default=False, + ) + return parser + + +def main(): + """Entry point from command line""" + parser = get_parser() + args = parser.parse_args() + quiet = args.output == "-" or args.output == "stdout" or args.quiet + + ens = pandas.DataFrame() + for csvfile in args.csvfiles: + if not quiet: + print(" ** Loading " + csvfile + "...") + try: + ensnew = pandas.read_csv(csvfile) + if not quiet: + print(ensnew.info()) + + ensnew[args.filecolumn] = pandas.Series( + csvfile.replace(".csv", ""), index=ensnew.index + ) + realregex = r".*realization-(\d*)/" + iterregex = r".*iter-(\d*)/" + + if re.match(realregex, csvfile): + # We don't use the column name "Realization" yet, + # because it might exist in some of the + # input files, but later on, we will copy it to "Realization" + # if it doesn't exist in the end + ensnew[args.filecolumn + "-realization"] = re.match( + realregex, csvfile + ).group(1) + if re.match(iterregex, csvfile): + ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group( + 1 + ) + + # Concatenation is done one frame at a time. + # This makes concatenation slower, but more memory efficient. + ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True) + # (the indices in these csv files are just the row number, + # which doesn't mean anything + # in our data, therefore we should "ignore_index".) 
+ if not quiet: + print(" ------------------ ") + except IOError: + if not quiet: + print("WARNING: " + csvfile + " not found.") + except pandas.errors.EmptyDataError: + if not quiet: + print("WARNING: " + csvfile + " seems empty, no data found.") + + if not args.keepconstantcolumns: + columnstodelete = [] + for col in ens.columns: + if len(ens[col].unique()) == 1: + columnstodelete.append(col) + if not quiet: + print(" Dropping constant columns " + str(columnstodelete)) + ens.drop(columnstodelete, inplace=True, axis=1) + + # Copy realization column if its only source is the filename. + if ( + "Realization" not in ens.columns + and args.filecolumn + "-realization" in ens.columns + ): + ens["Realization"] = ens[args.filecolumn + "-realization"] + # Ditto for iteration + if "Iter" not in ens.columns and args.filecolumn + "-iter" in ens.columns: + ens["Iter"] = ens[args.filecolumn + "-iter"] + + if ens.empty: + print("ERROR: No data to output.") + sys.exit(1) + if not quiet: + print(" ** Merged ensemble data:") + print(ens.info()) -if not quiet: - header.compose("csvMergeEnsembles.py", - "01.04.2015", - ["Haavard Berland"], - ["havb@statoil.com"], - ["-h for help, or check wiki"], - "Merge multiple CSV exports from ERT into one CSV file") + print(" ** Exporting csv data to " + args.output) + if args.output == "-" or args.output == "stdout": + ens.to_csv(sys.stdout, index=False) + else: + ens.to_csv(path_or_buf=args.output, index=False) -ens = pandas.DataFrame() -for csvfile in args.csvfiles: if not quiet: - print " ** Loading "+ csvfile + "..." 
- try: - ensnew = pandas.read_csv(csvfile) - if not quiet: - print ensnew.info() - - ensnew[args.filecolumn] = pandas.Series(csvfile.replace(".csv",""), index=ensnew.index) - realregex = ".*realization-(\d*)/" - iterregex = ".*iter-(\d*)/" - - if re.match(realregex, csvfile): - # We don't use the column name "Realization" yet, because it might exist in some of the - # input files, but later on, we will copy it to "Realization" if it doesn't exist in the end - ensnew[args.filecolumn + "-realization"] = re.match(realregex, csvfile).group(1) - if re.match(iterregex, csvfile): - ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group(1) - - ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True) - # (the indices in these csv files are just the row number, which doesn't mean anything - # in our data, therefore we should "ignore_index".) - if not quiet: - print " ------------------ " - except IOError: - if not quiet: - print "WARNING: " + csvfile + " not found." - except pandas.errors.EmptyDataError: - if not quiet: - print "WARNING: " + csvfile + " seems empty, no data found." - -if not args.keepconstantcolumns: - columnstodelete = [] - for col in ens.columns: - if len(ens[col].unique()) == 1: - columnstodelete.append(col) - if not quiet: - print " Dropping constant columns " + str(columnstodelete) - ens.drop(columnstodelete, inplace=True, axis=1) - -# Copy realization column if its only source is the filename. -if not "Realization" in ens.columns and args.filecolumn+"-realization" in ens.columns: - ens["Realization"] = ens[args.filecolumn + "-realization"] -# Ditto for iteration -if not "Iter" in ens.columns and args.filecolumn+"-iter" in ens.columns: - ens["Iter"] = ens[args.filecolumn + "-iter"] - - -if len(ens.index) == 0: - print "ERROR: No data to output." 
- sys.exit(1) - -if not quiet: - print " ** Merged ensemble data:" - print ens.info() - - print " ** Exporting csv data to " + args.output - -if args.output == "-" or args.output == "stdout": - ens.to_csv(sys.stdout, index=False) -else: - ens.to_csv(path_or_buf=args.output, index=False) - -if not quiet: - print " - Finished writing to " + args.output + print(" - Finished writing to " + args.output) + + +if __name__ == "__main__": + main() diff --git a/subscript/tests/test_csvMergeEnsembles.py b/subscript/tests/test_csvMergeEnsembles.py new file mode 100644 index 000000000..0d0dd64fa --- /dev/null +++ b/subscript/tests/test_csvMergeEnsembles.py @@ -0,0 +1,59 @@ +"""Test csvMergeEnsembles""" +from __future__ import absolute_import + +import os +import sys + +import pandas as pd + +from .. import csvMergeEnsembles + + +def test_main_merge(): + """Test command line interface for csvMergeEnsembles""" + + test_csv_1 = "foo.csv" + test_csv_2 = "bar.csv" + merged_csv = "merged.csv" + + # Dump test data to disk as CSV first: + pd.DataFrame( + columns=["Realization", "FOO", "CONST"], data=[[0, 10, 1], [1, 20, 1]] + ).to_csv(test_csv_1, index=False) + pd.DataFrame( + columns=["Realization", "BAR", "CONST"], data=[[0, 30, 1], [1, 40, 1]] + ).to_csv(test_csv_2, index=False) + + sys.argv = ["csvMergeEnsembles", test_csv_1, test_csv_2, "-q", "-o", merged_csv] + csvMergeEnsembles.main() + merged = pd.read_csv(merged_csv) + + assert len(merged) == 4 + assert len(merged.columns) == 4 # 3 unique in input, and 1 extra. 
+ assert test_csv_1.replace(".csv", "") in merged.ensemble.unique() + assert test_csv_2.replace(".csv", "") in merged.ensemble.unique() + assert len(merged.ensemble.unique()) == 2 + + # Test --keepconstantcolumns + sys.argv = [ + "csvMergeEnsembles", + test_csv_1, + test_csv_2, + "--keepconstantcolumns", + "-q", + "-o", + merged_csv, + ] + csvMergeEnsembles.main() + merged = pd.read_csv(merged_csv) + + assert len(merged) == 4 + assert len(merged.columns) == 5 # Also the constant column + + # Cleanup + if os.path.exists(merged_csv): + os.unlink(merged_csv) + if os.path.exists(test_csv_1): + os.unlink(test_csv_1) + if os.path.exists(test_csv_2): + os.unlink(test_csv_2)