Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
tests_require=["pytest"],
entry_points={
"console_scripts": [
"csvMergeEnsembles = subscript.csvMergeEnsembles:main",
"presentvalue = subscript.presentvalue:main",
"sunsch = subscript.sunsch:main",
]
Expand Down
163 changes: 163 additions & 0 deletions subscript/csvMergeEnsembles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
Merge multiple CSV files.
"""

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import sys
import argparse
import re

import pandas


class CustomFormatter(
    argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
):
    """Argparse help formatter combining two stock formatters.

    Multiple inheritance gives us both behaviours at once: argument
    defaults are appended to the help text, and the description/epilog
    strings are rendered raw (no re-wrapping).
    """


def get_parser():
    """Construct the argparse parser for csvMergeEnsembles.

    Returns:
        argparse.ArgumentParser: configured parser; call parse_args() on it.
    """
    parser = argparse.ArgumentParser(
        formatter_class=CustomFormatter,
        description="""
Merge multiple CSV files into one. Each row will be tagged by the filename
it came from in the column 'ensemble'.

The columns in the ensembles need not be the same. Similar column names
will be merged, differing column names will be padded (with NaN) in the
ensemble where they don't exist.

Note that the ordering of all columns becomes alphabetical after this merging.
""",
        # NB: the columns actually produced by main() are "Realization" and
        # "Iter" (not "Iteration"), so the help text must say so.
        epilog="""If realization-*/iter-* is present in the filename, that numerical
information is extracted if possible and put into the columns Realization and Iter
""",
    )
    parser.add_argument("csvfiles", nargs="+", help="input csv files")
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="name of output csv file. Use - or stdout to dump output to stdout.",
        default="merged.csv",
    )
    parser.add_argument(
        "--keepconstantcolumns",
        action="store_true",
        help="Keep constant columns",
        default=False,
    )
    parser.add_argument(
        "--filecolumn",
        type=str,
        help="Name of column containing original filename",
        default="ensemble",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress non-critical output",
        default=False,
    )
    return parser


def main():
    """Entry point from command line.

    Parses sys.argv, loads every input CSV, tags each row with its source
    filename (and realization/iter numbers when present in the path), and
    writes the concatenated frame to the requested output.
    """
    parser = get_parser()
    args = parser.parse_args()
    # When the CSV itself goes to stdout, all status messages must be
    # suppressed or they would corrupt the CSV stream.
    quiet = args.output == "-" or args.output == "stdout" or args.quiet

    # Compile once, outside the file loop.
    realregex = re.compile(r".*realization-(\d*)/")
    iterregex = re.compile(r".*iter-(\d*)/")

    ens = pandas.DataFrame()
    for csvfile in args.csvfiles:
        if not quiet:
            print(" ** Loading " + csvfile + "...")
        try:
            ensnew = pandas.read_csv(csvfile)
            if not quiet:
                # DataFrame.info() prints to stdout itself and returns None,
                # so it must not be wrapped in print() (that emits "None").
                ensnew.info()

            ensnew[args.filecolumn] = pandas.Series(
                csvfile.replace(".csv", ""), index=ensnew.index
            )

            realmatch = realregex.match(csvfile)
            if realmatch:
                # We don't use the column name "Realization" yet,
                # because it might exist in some of the
                # input files, but later on, we will copy it to "Realization"
                # if it doesn't exist in the end
                ensnew[args.filecolumn + "-realization"] = realmatch.group(1)
            itermatch = iterregex.match(csvfile)
            if itermatch:
                ensnew[args.filecolumn + "-iter"] = itermatch.group(1)

            # Concatenation is done one frame at a time.
            # This makes concatenation slower, but more memory efficient.
            ens = pandas.concat([ens, ensnew], ignore_index=True, sort=True)
            # (the indices in these csv files are just the row number,
            # which doesn't mean anything
            # in our data, therefore we should "ignore_index".)
            if not quiet:
                print(" ------------------ ")
        except IOError:
            if not quiet:
                print("WARNING: " + csvfile + " not found.")
        except pandas.errors.EmptyDataError:
            if not quiet:
                print("WARNING: " + csvfile + " seems empty, no data found.")

    if not args.keepconstantcolumns:
        columnstodelete = []
        for col in ens.columns:
            if len(ens[col].unique()) == 1:
                columnstodelete.append(col)
        if not quiet:
            print(" Dropping constant columns " + str(columnstodelete))
        ens.drop(columnstodelete, inplace=True, axis=1)

    # Copy realization column if its only source is the filename.
    if (
        "Realization" not in ens.columns
        and args.filecolumn + "-realization" in ens.columns
    ):
        ens["Realization"] = ens[args.filecolumn + "-realization"]
    # Ditto for iteration
    if "Iter" not in ens.columns and args.filecolumn + "-iter" in ens.columns:
        ens["Iter"] = ens[args.filecolumn + "-iter"]

    if ens.empty:
        print("ERROR: No data to output.")
        sys.exit(1)

    if not quiet:
        print(" ** Merged ensemble data:")
        # info() prints itself; wrapping in print() would add a stray "None".
        ens.info()
        # Must be guarded by quiet: an unguarded print here would be
        # interleaved with the CSV data when writing to stdout.
        print(" ** Exporting csv data to " + args.output)

    if args.output == "-" or args.output == "stdout":
        ens.to_csv(sys.stdout, index=False)
    else:
        ens.to_csv(path_or_buf=args.output, index=False)

    if not quiet:
        print(" - Finished writing to " + args.output)


if __name__ == "__main__":
    main()
59 changes: 59 additions & 0 deletions subscript/tests/test_csvMergeEnsembles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Test csvMergeEnsembles"""
from __future__ import absolute_import

import os
import sys

import pandas as pd

from .. import csvMergeEnsembles


def test_main_merge():
    """Test command line interface for csvMergeEnsembles.

    Runs the merge twice (with and without --keepconstantcolumns) on two
    small CSV fixtures and checks row/column counts and the ensemble tags.
    """

    test_csv_1 = "foo.csv"
    test_csv_2 = "bar.csv"
    merged_csv = "merged.csv"

    try:
        # Dump test data to disk as CSV first:
        pd.DataFrame(
            columns=["Realization", "FOO", "CONST"], data=[[0, 10, 1], [1, 20, 1]]
        ).to_csv(test_csv_1, index=False)
        pd.DataFrame(
            columns=["Realization", "BAR", "CONST"], data=[[0, 30, 1], [1, 40, 1]]
        ).to_csv(test_csv_2, index=False)

        sys.argv = ["csvMergeEnsembles", test_csv_1, test_csv_2, "-q", "-o", merged_csv]
        csvMergeEnsembles.main()
        merged = pd.read_csv(merged_csv)

        assert len(merged) == 4
        assert len(merged.columns) == 4  # 3 unique in input, and 1 extra.
        assert test_csv_1.replace(".csv", "") in merged.ensemble.unique()
        assert test_csv_2.replace(".csv", "") in merged.ensemble.unique()
        assert len(merged.ensemble.unique()) == 2

        # Test --keepconstantcolumns
        sys.argv = [
            "csvMergeEnsembles",
            test_csv_1,
            test_csv_2,
            "--keepconstantcolumns",
            "-q",
            "-o",
            merged_csv,
        ]
        csvMergeEnsembles.main()
        merged = pd.read_csv(merged_csv)

        assert len(merged) == 4
        assert len(merged.columns) == 5  # Also the constant column
    finally:
        # Always clean up scratch files, even when an assertion above fails,
        # so repeated test runs start from a clean working directory.
        for scratchfile in (merged_csv, test_csv_1, test_csv_2):
            if os.path.exists(scratchfile):
                os.unlink(scratchfile)