From 89a3be89cc941e418ae604436e18c149f2b64fac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5vard=20Berland?= <havb@equinor.com>
Date: Thu, 3 Oct 2019 10:54:11 +0200
Subject: [PATCH 1/2] Add original csvMergeEnsembles from resscript

---
 subscript/csvMergeEnsembles.py | 120 +++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100755 subscript/csvMergeEnsembles.py

diff --git a/subscript/csvMergeEnsembles.py b/subscript/csvMergeEnsembles.py
new file mode 100755
index 000000000..fc07f7585
--- /dev/null
+++ b/subscript/csvMergeEnsembles.py
@@ -0,0 +1,120 @@
+#!/bin/env python
+#
+# Usage:
+#        csvMergeEnsembles.py ensemble1.csv ensemble2.csv [ensemble3.csv [...]]
+#
+# Given csv files (typically ensembles produced by ERT), it will 
+# append all the data rows of the second ensemble to the first ensemble.
+# The data will be exported to the file "merged.csv" - rename it afterwards
+#
+# A new column is added called 'ensemble', which will contain the name of the 
+# ensemble (taken from the filename you provide).
+#
+# The columns in the ensembles need not be the same. Similar column names 
+# will be merged, differing column names will be padded (with NaN) in the 
+# ensemble where they don't exist.
+#
+# Note that the ordering of all columns becomes alphabetical after this merging.
+#
+# Author: Haavard Berland, OSE PTC RP, Sept/Oct 2015, havb@statoil.com
+
+import sys
+import pandas
+import argparse
+import re
+import resscript.header as header
+
+parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument("csvfiles", nargs="+", help="input csv files")
+parser.add_argument("-o", "--output", type=str, 
+                    help="name of output csv file. Use - or stdout to dump output to stdout.", default="merged.csv")
+parser.add_argument("--keepconstantcolumns", action='store_true', help="Keep constant columns", default=False)
+parser.add_argument("--filecolumn", type=str, help="Name of column containing original filename", default="ensemble")
+parser.add_argument("-q", "--quiet", action='store_true', help="Suppress non-critical output", default=False)
+
+args = parser.parse_args()
+
+if args.output == "-" or args.output == "stdout":
+    quiet = True
+else:
+    quiet = False
+
+if args.quiet:
+    quiet = True
+
+
+if not quiet:
+    header.compose("csvMergeEnsembles.py", 
+                   "01.04.2015", 
+                   ["Haavard Berland"], 
+                   ["havb@statoil.com"], 
+                   ["-h for help, or check wiki"], 
+                   "Merge multiple CSV exports from ERT into one CSV file")
+
+
+ens = pandas.DataFrame()
+for csvfile in args.csvfiles:
+    if not quiet:
+        print " ** Loading "+  csvfile + "..."
+    try:
+        ensnew = pandas.read_csv(csvfile)
+        if not quiet:
+            print ensnew.info()
+
+        ensnew[args.filecolumn] = pandas.Series(csvfile.replace(".csv",""), index=ensnew.index)
+        realregex = ".*realization-(\d*)/"
+        iterregex = ".*iter-(\d*)/"
+
+        if re.match(realregex, csvfile):
+            # We don't use the column name "Realization" yet, because it might exist in some of the 
+            # input files, but later on, we will copy it to "Realization" if it doesn't exist in the end
+            ensnew[args.filecolumn + "-realization"] = re.match(realregex, csvfile).group(1)
+        if re.match(iterregex, csvfile):
+            ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group(1)
+
+        ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True)
+        # (the indices in these csv files are just the row number, which doesn't mean anything 
+        #  in our data, therefore we should "ignore_index".)
+        if not quiet:
+            print "         ------------------  "
+    except IOError:
+        if not quiet:
+            print "WARNING: " + csvfile + " not found."
+    except pandas.errors.EmptyDataError:
+        if not quiet:
+            print "WARNING: " + csvfile + " seems empty, no data found."
+
+if not args.keepconstantcolumns:
+    columnstodelete = []
+    for col in ens.columns:
+        if len(ens[col].unique()) == 1:
+            columnstodelete.append(col)
+    if not quiet: 
+        print "  Dropping constant columns " + str(columnstodelete) 
+    ens.drop(columnstodelete, inplace=True, axis=1)
+
+# Copy realization column if its only source is the filename.
+if not "Realization" in ens.columns and args.filecolumn+"-realization" in ens.columns:
+    ens["Realization"] = ens[args.filecolumn + "-realization"]
+# Ditto for iteration
+if not "Iter" in ens.columns and args.filecolumn+"-iter" in ens.columns:
+    ens["Iter"] = ens[args.filecolumn + "-iter"]
+
+
+if len(ens.index) == 0:
+    print "ERROR: No data to output."
+    sys.exit(1)
+
+if not quiet:
+    print " ** Merged ensemble data:"
+    print ens.info()
+    
+    print " ** Exporting csv data to " + args.output    
+
+if args.output == "-" or args.output == "stdout":
+    ens.to_csv(sys.stdout, index=False)
+else:
+    ens.to_csv(path_or_buf=args.output, index=False)
+
+if not quiet:
+    print " - Finished writing to " + args.output

From 486694183fc672d9a151ad114197928d5866feef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5vard=20Berland?= <havb@equinor.com>
Date: Thu, 3 Oct 2019 11:19:09 +0200
Subject: [PATCH 2/2] Port csvMergeEnsembles to subscript, add simple test

---
 setup.py                                  |   1 +
 subscript/csvMergeEnsembles.py            | 259 +++++++++++++---------
 subscript/tests/test_csvMergeEnsembles.py |  59 +++++
 3 files changed, 211 insertions(+), 108 deletions(-)
 create mode 100644 subscript/tests/test_csvMergeEnsembles.py

diff --git a/setup.py b/setup.py
index 62ba0e7d8..cd1637561 100644
--- a/setup.py
+++ b/setup.py
@@ -21,6 +21,7 @@
     tests_require=["pytest"],
     entry_points={
         "console_scripts": [
+            "csvMergeEnsembles = subscript.csvMergeEnsembles:main",
             "presentvalue = subscript.presentvalue:main",
             "sunsch = subscript.sunsch:main",
         ]
diff --git a/subscript/csvMergeEnsembles.py b/subscript/csvMergeEnsembles.py
index fc07f7585..5c6755411 100755
--- a/subscript/csvMergeEnsembles.py
+++ b/subscript/csvMergeEnsembles.py
@@ -1,120 +1,163 @@
-#!/bin/env python
-#
-# Usage:
-#        csvMergeEnsembles.py ensemble1.csv ensemble2.csv [ensemble3.csv [...]]
-#
-# Given csv files (typically ensembles produced by ERT), it will 
-# append all the data rows of the second ensemble to the first ensemble.
-# The data will be exported to the file "merged.csv" - rename it afterwards
-#
-# A new column is added called 'ensemble', which will contain the name of the 
-# ensemble (taken from the filename you provide).
-#
-# The columns in the ensembles need not be the same. Similar column names 
-# will be merged, differing column names will be padded (with NaN) in the 
-# ensemble where they don't exist.
-#
-# Note that the ordering of all columns becomes alphabetical after this merging.
-#
-# Author: Haavard Berland, OSE PTC RP, Sept/Oct 2015, havb@statoil.com
+"""
+Merge multiple CSV files.
+"""
+
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
 
 import sys
-import pandas
 import argparse
 import re
-import resscript.header as header
 
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("csvfiles", nargs="+", help="input csv files")
-parser.add_argument("-o", "--output", type=str, 
-                    help="name of output csv file. Use - or stdout to dump output to stdout.", default="merged.csv")
-parser.add_argument("--keepconstantcolumns", action='store_true', help="Keep constant columns", default=False)
-parser.add_argument("--filecolumn", type=str, help="Name of column containing original filename", default="ensemble")
-parser.add_argument("-q", "--quiet", action='store_true', help="Suppress non-critical output", default=False)
-
-args = parser.parse_args()
+import pandas
 
-if args.output == "-" or args.output == "stdout":
-    quiet = True
-else:
-    quiet = False
 
-if args.quiet:
-    quiet = True
+class CustomFormatter(
+    argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
+):
+    """
+    Argparse help formatter that inherits from both the defaults formatter
+    and the raw description formatter, so help output gets both behaviours.
+    """
+
+    pass
+
+
+def get_parser():
+    """Build and return the argparse.ArgumentParser for csvMergeEnsembles"""
+    parser = argparse.ArgumentParser(
+        formatter_class=CustomFormatter,
+        description="""
+Merge multiple CSV files into one. Each row will be tagged by the filename
+it came from in the column 'ensemble' (see --filecolumn).
+
+The columns in the ensembles need not be the same. Similar column names
+will be merged, differing column names will be padded (with NaN) in the
+ensemble where they don't exist.
+
+Note that the ordering of all columns becomes alphabetical after this merging.
+""",
+        epilog="""If realization-*/iter-* is present in the filename, that numerical information
+is extracted (if possible) and put into the columns Realization and Iter
+""",
+    )
+    parser.add_argument("csvfiles", nargs="+", help="input csv files")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        help="name of output csv file. Use - or stdout to dump output to stdout.",
+        default="merged.csv",
+    )
+    parser.add_argument(
+        "--keepconstantcolumns",
+        action="store_true",
+        help="Keep constant columns",
+        default=False,
+    )
+    parser.add_argument(
+        "--filecolumn",
+        type=str,
+        help="Name of column containing original filename",
+        default="ensemble",
+    )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        action="store_true",
+        help="Suppress non-critical output",
+        default=False,
+    )
+    return parser
+
+
+def main():
+    """Entry point from command line"""
+    parser = get_parser()
+    args = parser.parse_args()
+    quiet = args.output == "-" or args.output == "stdout" or args.quiet
+
+    ens = pandas.DataFrame()
+    for csvfile in args.csvfiles:
+        if not quiet:
+            print(" ** Loading " + csvfile + "...")
+        try:
+            ensnew = pandas.read_csv(csvfile)
+            if not quiet:
+                print(ensnew.info())
+
+            ensnew[args.filecolumn] = pandas.Series(
+                csvfile.replace(".csv", ""), index=ensnew.index
+            )
+            realregex = r".*realization-(\d*)/"
+            iterregex = r".*iter-(\d*)/"
+
+            if re.match(realregex, csvfile):
+                # We don't use the column name "Realization" yet,
+                # because it might exist in some of the
+                # input files, but later on, we will copy it to "Realization"
+                # if it doesn't exist in the end
+                ensnew[args.filecolumn + "-realization"] = re.match(
+                    realregex, csvfile
+                ).group(1)
+            if re.match(iterregex, csvfile):
+                ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group(
+                    1
+                )
+
+            # Concatenation is done one frame at a time.
+            # This makes concatenation slower, but more memory efficient.
+            ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True)
+            # (the indices in these csv files are just the row number,
+            # which doesn't mean anything
+            # in our data, therefore we should "ignore_index".)
+            if not quiet:
+                print("         ------------------  ")
+        except IOError:
+            if not quiet:
+                print("WARNING: " + csvfile + " not found.")
+        except pandas.errors.EmptyDataError:
+            if not quiet:
+                print("WARNING: " + csvfile + " seems empty, no data found.")
+
+    if not args.keepconstantcolumns:
+        columnstodelete = []
+        for col in ens.columns:
+            if len(ens[col].unique()) == 1:
+                columnstodelete.append(col)
+        if not quiet:
+            print("  Dropping constant columns " + str(columnstodelete))
+        ens.drop(columnstodelete, inplace=True, axis=1)
+
+    # Copy realization column if its only source is the filename.
+    if (
+        "Realization" not in ens.columns
+        and args.filecolumn + "-realization" in ens.columns
+    ):
+        ens["Realization"] = ens[args.filecolumn + "-realization"]
+    # Ditto for iteration
+    if "Iter" not in ens.columns and args.filecolumn + "-iter" in ens.columns:
+        ens["Iter"] = ens[args.filecolumn + "-iter"]
+
+    if ens.empty:
+        print("ERROR: No data to output.")
+        sys.exit(1)
 
+    if not quiet:
+        print(" ** Merged ensemble data:")
+        print(ens.info())
 
-if not quiet:
-    header.compose("csvMergeEnsembles.py", 
-                   "01.04.2015", 
-                   ["Haavard Berland"], 
-                   ["havb@statoil.com"], 
-                   ["-h for help, or check wiki"], 
-                   "Merge multiple CSV exports from ERT into one CSV file")
+        print(" ** Exporting csv data to " + args.output)
 
+    if args.output == "-" or args.output == "stdout":
+        ens.to_csv(sys.stdout, index=False)
+    else:
+        ens.to_csv(path_or_buf=args.output, index=False)
 
-ens = pandas.DataFrame()
-for csvfile in args.csvfiles:
     if not quiet:
-        print " ** Loading "+  csvfile + "..."
-    try:
-        ensnew = pandas.read_csv(csvfile)
-        if not quiet:
-            print ensnew.info()
-
-        ensnew[args.filecolumn] = pandas.Series(csvfile.replace(".csv",""), index=ensnew.index)
-        realregex = ".*realization-(\d*)/"
-        iterregex = ".*iter-(\d*)/"
-
-        if re.match(realregex, csvfile):
-            # We don't use the column name "Realization" yet, because it might exist in some of the 
-            # input files, but later on, we will copy it to "Realization" if it doesn't exist in the end
-            ensnew[args.filecolumn + "-realization"] = re.match(realregex, csvfile).group(1)
-        if re.match(iterregex, csvfile):
-            ensnew[args.filecolumn + "-iter"] = re.match(iterregex, csvfile).group(1)
-
-        ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True)
-        # (the indices in these csv files are just the row number, which doesn't mean anything 
-        #  in our data, therefore we should "ignore_index".)
-        if not quiet:
-            print "         ------------------  "
-    except IOError:
-        if not quiet:
-            print "WARNING: " + csvfile + " not found."
-    except pandas.errors.EmptyDataError:
-        if not quiet:
-            print "WARNING: " + csvfile + " seems empty, no data found."
-
-if not args.keepconstantcolumns:
-    columnstodelete = []
-    for col in ens.columns:
-        if len(ens[col].unique()) == 1:
-            columnstodelete.append(col)
-    if not quiet: 
-        print "  Dropping constant columns " + str(columnstodelete) 
-    ens.drop(columnstodelete, inplace=True, axis=1)
-
-# Copy realization column if its only source is the filename.
-if not "Realization" in ens.columns and args.filecolumn+"-realization" in ens.columns:
-    ens["Realization"] = ens[args.filecolumn + "-realization"]
-# Ditto for iteration
-if not "Iter" in ens.columns and args.filecolumn+"-iter" in ens.columns:
-    ens["Iter"] = ens[args.filecolumn + "-iter"]
-
-
-if len(ens.index) == 0:
-    print "ERROR: No data to output."
-    sys.exit(1)
-
-if not quiet:
-    print " ** Merged ensemble data:"
-    print ens.info()
-    
-    print " ** Exporting csv data to " + args.output    
-
-if args.output == "-" or args.output == "stdout":
-    ens.to_csv(sys.stdout, index=False)
-else:
-    ens.to_csv(path_or_buf=args.output, index=False)
-
-if not quiet:
-    print " - Finished writing to " + args.output
+        print(" - Finished writing to " + args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/subscript/tests/test_csvMergeEnsembles.py b/subscript/tests/test_csvMergeEnsembles.py
new file mode 100644
index 000000000..0d0dd64fa
--- /dev/null
+++ b/subscript/tests/test_csvMergeEnsembles.py
@@ -0,0 +1,59 @@
+"""Test csvMergeEnsembles"""
+from __future__ import absolute_import
+
+import os
+import sys
+
+import pandas as pd
+
+from .. import csvMergeEnsembles
+
+
+def test_main_merge():
+    """Test the csvMergeEnsembles command line interface through main()"""
+
+    test_csv_1 = "foo.csv"
+    test_csv_2 = "bar.csv"
+    merged_csv = "merged.csv"
+
+    # Dump test data to disk (relative paths, i.e. current directory) as CSV first:
+    pd.DataFrame(
+        columns=["Realization", "FOO", "CONST"], data=[[0, 10, 1], [1, 20, 1]]
+    ).to_csv(test_csv_1, index=False)
+    pd.DataFrame(
+        columns=["Realization", "BAR", "CONST"], data=[[0, 30, 1], [1, 40, 1]]
+    ).to_csv(test_csv_2, index=False)
+
+    sys.argv = ["csvMergeEnsembles", test_csv_1, test_csv_2, "-q", "-o", merged_csv]
+    csvMergeEnsembles.main()
+    merged = pd.read_csv(merged_csv)
+
+    assert len(merged) == 4
+    assert len(merged.columns) == 4  # Realization, FOO, BAR, ensemble; CONST dropped
+    assert test_csv_1.replace(".csv", "") in merged.ensemble.unique()
+    assert test_csv_2.replace(".csv", "") in merged.ensemble.unique()
+    assert len(merged.ensemble.unique()) == 2
+
+    # Test --keepconstantcolumns
+    sys.argv = [
+        "csvMergeEnsembles",
+        test_csv_1,
+        test_csv_2,
+        "--keepconstantcolumns",
+        "-q",
+        "-o",
+        merged_csv,
+    ]
+    csvMergeEnsembles.main()
+    merged = pd.read_csv(merged_csv)
+
+    assert len(merged) == 4
+    assert len(merged.columns) == 5  # CONST is kept as well
+
+    # Cleanup (not reached if an assert above fails)
+    if os.path.exists(merged_csv):
+        os.unlink(merged_csv)
+    if os.path.exists(test_csv_1):
+        os.unlink(test_csv_1)
+    if os.path.exists(test_csv_2):
+        os.unlink(test_csv_2)