From 7873c22ae4311854a815924f103f26bc0fce299e Mon Sep 17 00:00:00 2001
From: Andrew Holmes
Date: Mon, 24 Oct 2016 15:35:27 -0400
Subject: [PATCH] Initial commit

---
 learning_curve.py | 14 ++++++++++----
 questions.txt     |  9 +++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)
 create mode 100644 questions.txt

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..686d393 100755
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,16 +7,22 @@ from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
+#print data.DESCR
 num_trials = 10
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
-
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-
-# TODO: your code here
+for i, x in enumerate(train_percentages):
+    # average the test accuracy over num_trials random splits at each training percentage
+    total = 0
+    for trial in range(num_trials):
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=x/100.0)
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        total += model.score(X_test, y_test)  # accumulate this trial's test accuracy
+    test_accuracies[i] = total / num_trials  # mean accuracy for this training percentage
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..9046c53
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,9 @@
+Andrew Holmes - Machine Learning Toolbox
+
+1. Generally, the curve trends upward, approaching an accuracy of 1 as the percentage of data used for training increases.
+
+2. There is more noise in some parts of the curve than others, but it is noisy across the board. This is likely because the number of trials per data point is small (only 10) and there are large discrepancies between trials at the same training percentage. Averaging more trials at each training percentage would make the noise less apparent.
+
+3. I increased the number of trials per data point to 100 and got a much smoother curve. Although the script takes longer to run, increasing the number of trials further would smooth the curve even more.
+
+4. When I set C = 10**-1, the curve looked much less linear: it rises very quickly and then approaches 1 at a slower rate, so accuracy is higher even at low training percentages. When I set C = 10, the accuracy climbs even faster. When I set C = 10**10, the accuracy appears to increase in steps and there is much more noise.
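
[Editor's note, not part of the patch] Questions 2 and 3 argue that averaging more trials per training percentage smooths the curve. A minimal sketch of why, assuming only numpy: the spread of a mean of n independent runs shrinks roughly as 1/sqrt(n). The accuracy distribution below is simulated, not measured from learning_curve.py.

# Editor's sketch with simulated single-trial accuracies; the 0.8 mean and
# 0.05 spread are made-up stand-ins, not measurements from the real experiment.
import numpy

rng = numpy.random.RandomState(0)
single_runs = rng.normal(0.8, 0.05, size=10000)  # hypothetical single-trial accuracies

for n in (1, 10, 100):
    # group the runs into averages of n and measure the remaining spread
    means = single_runs[:(10000 // n) * n].reshape(-1, n).mean(axis=1)
    print('n = %3d trials -> spread of averaged accuracy = %.4f' % (n, means.std()))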
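
[Editor's note, not part of the patch] Question 4 compares several values of C. One way that comparison might be reproduced is to overlay one learning curve per C value. The imports are assumptions: train_test_split lives in sklearn.model_selection from scikit-learn 0.18 onward (sklearn.cross_validation before that), and the axis labels are placeholders.

# Editor's sketch: overlay learning curves for the C values discussed in question 4.
import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split  # assumption: scikit-learn >= 0.18
from sklearn.linear_model import LogisticRegression

data = load_digits()
num_trials = 10
train_percentages = range(5, 95, 5)

for C in (10**-10, 10**-1, 10, 10**10):
    test_accuracies = numpy.zeros(len(train_percentages))
    for i, x in enumerate(train_percentages):
        total = 0
        for trial in range(num_trials):
            # fresh random split and freshly fit model for every trial
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=x / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            total += model.score(X_test, y_test)
        test_accuracies[i] = total / num_trials  # mean accuracy at this percentage
    plt.plot(train_percentages, test_accuracies, label='C = %g' % C)

plt.xlabel('Percentage of data used for training')
plt.ylabel('Mean accuracy on held-out data')
plt.legend(loc='lower right')
plt.show()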