From 22a7310b9f15a7336455c75d56e6f7dd2724ffff Mon Sep 17 00:00:00 2001 From: Richin Jain Date: Thu, 21 Feb 2019 13:21:20 -0500 Subject: [PATCH 1/3] Added UnitTests: Modified the existing data tests to follow pytest schema, previously they were a bunch of if else and it would not have failed the build even if they failed. Also moved it under a new folder called tests/unit as that is the more recommended way --- azure-pipelines.yml | 2 +- {code/testing => tests/unit}/data_test.py | 106 +++++++++++----------- 2 files changed, 54 insertions(+), 54 deletions(-) rename {code/testing => tests/unit}/data_test.py (56%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 99f3642e..96262a50 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -28,7 +28,7 @@ steps: displayName: 'replace subscription value' -- script: 'python code/testing/data_test.py data/diabetes.csv && python code/testing/data_test.py data/diabetes_bad_dist.csv && python code/testing/data_test.py data/diabetes_bad_schema.csv && python code/testing/data_test.py data/diabetes_missing_values.csv' +- script: 'pytest tests/unit/data_test.py' displayName: 'Data Quality Check' - script: 'python aml_service/00-WorkSpace.py' diff --git a/code/testing/data_test.py b/tests/unit/data_test.py similarity index 56% rename from code/testing/data_test.py rename to tests/unit/data_test.py index 206cc8a5..ecfb34e8 100644 --- a/code/testing/data_test.py +++ b/tests/unit/data_test.py @@ -16,7 +16,9 @@ THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, + +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER @@ -24,14 +26,22 @@ ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ - -import sys import os import numpy as np import pandas as pd + +# get absolute path of csv files from data folder +def get_absPath(filename): + """Returns the absolute path of the given file in the data folder""" + path = os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, + os.path.pardir, "data", filename)) + return path + + # number of features -n_columns = 10 +expected_columns = 10 # distribution of features in the training set historical_mean = np.array( @@ -65,60 +75,50 @@ ] ) -# maximal relative change in feature mean or standrd deviation that we can tolerate +# maximal relative change in feature mean or standard deviation +# that we can tolerate shift_tolerance = 3 -def check_schema(X): - n_actual_columns = X.shape[1] - if n_actual_columns != n_columns: - print( - "Error: found {} feature columns. 
The data should have {} feature columns.".format( - n_actual_columns, n_columns - ) - ) - return False - - return True - - -def check_missing_values(dataset): +def test_check_schema(): + datafile = get_absPath("diabetes.csv") + # check that file exists + assert(os.path.exists(datafile)) + dataset = pd.read_csv(datafile) + header = dataset[dataset.columns[:-1]] + actual_columns = header.shape[1] + # check header has expected number of columns + assert(actual_columns == expected_columns) + + +def test_check_bad_schema(): + datafile = get_absPath("diabetes_bad_schema.csv") + # check that file exists + assert(os.path.exists(datafile)) + dataset = pd.read_csv(datafile) + header = dataset[dataset.columns[:-1]] + actual_columns = header.shape[1] + # check header has expected number of columns + assert(actual_columns != expected_columns) + + +def test_check_missing_values(): + datafile = get_absPath("diabetes_missing_values.csv") + # check that file exists + assert(os.path.exists(datafile)) + dataset = pd.read_csv(datafile) n_nan = np.sum(np.isnan(dataset.values)) - if n_nan > 0: - print("Warning: the data has {} missing values".format(n_nan)) - return False - return True + assert(n_nan > 0) -def check_distribution(dataset): +def test_check_distribution(): + datafile = get_absPath("diabetes_bad_dist.csv") + # check that file exists + assert(os.path.exists(datafile)) + dataset = pd.read_csv(datafile) mean = np.mean(dataset.values, axis=0) std = np.mean(dataset.values, axis=0) - if ( - np.sum(abs(mean - historical_mean) > shift_tolerance * abs(historical_mean)) > 0 - or np.sum(abs(std - historical_std) > shift_tolerance * abs(historical_std)) > 0 - ): - print("Warning: new data has different distribution than the training data") - return False - return True - - -def main(): - filename = sys.argv[1] - if not os.path.exists(filename): - print("Error: The file {} does not exist".format(filename)) - return - - dataset = pd.read_csv(filename) - if 
check_schema(dataset[dataset.columns[:-1]]): - print("Data schema test succeeded") - if check_missing_values(dataset) and check_distribution(dataset): - print("Missing values test passed") - print("Data distribution test passed") - else: - print( - "There might be some issues with the data. Please check warning messages." - ) - - -if __name__ == "__main__": - main() + assert(np.sum(abs(mean - historical_mean) > shift_tolerance * + abs(historical_mean)) or + np.sum(abs(std - historical_std) > shift_tolerance * + abs(historical_std)) > 0) From 5dd8fc1a064f5d250a241291903893d7b6bf01e7 Mon Sep 17 00:00:00 2001 From: Richin Jain Date: Thu, 21 Feb 2019 13:24:18 -0500 Subject: [PATCH 2/3] Removing extra line break --- tests/unit/data_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit/data_test.py b/tests/unit/data_test.py index ecfb34e8..74f03336 100644 --- a/tests/unit/data_test.py +++ b/tests/unit/data_test.py @@ -16,9 +16,7 @@ THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, - -INDIRECT, INCIDENTAL, +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER From addf7a7f562f1aa1648d3fa903f80d8d87b647ce Mon Sep 17 00:00:00 2001 From: Richin Jain Date: Thu, 21 Feb 2019 13:34:59 -0500 Subject: [PATCH 3/3] Adding pytest as required package to run test, missed adding it earlier --- environment_setup/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt index c69c9074..b3c2a14c 100644 --- a/environment_setup/requirements.txt +++ b/environment_setup/requirements.txt @@ -1,4 +1,5 @@ scipy==1.0.0 scikit-learn==0.19.1 numpy==1.14.5 -pandas==0.23.1 \ No newline at end of file +pandas==0.23.1 +pytest==4.3.0 \ No newline at end of file