From 4cc366e0d41b3c3b9a82669135b62b9566483d2d Mon Sep 17 00:00:00 2001
From: Prabha Dublish
Date: Sat, 12 Nov 2016 15:53:38 -0500
Subject: [PATCH 1/2] Turning in last toolbox

---
 learning_curve.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..d9166fa 100755
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,7 +1,7 @@
 """ Exploring learning curves for classification of handwritten digits """
 
 import matplotlib.pyplot as plt
-import numpy
+import numpy as np
 from sklearn.datasets import *
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
@@ -10,7 +10,7 @@
 print data.DESCR
 num_trials = 10
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = np.zeros(len(train_percentages))
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
@@ -18,5 +18,17 @@
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 # TODO: your code here
+for i, train_percent in enumerate(train_percentages):
+    trial_accuracies = np.zeros(num_trials)
+
+    for n in range(num_trials):
+        # re-split the data on every trial so the mean smooths out sampling noise
+        x_train, x_test, y_train, y_test = train_test_split(
+            data.data, data.target, train_size=train_percent/100.0)
+        model = LogisticRegression(C=10**-10)
+        model.fit(x_train, y_train)
+        trial_accuracies[n] = model.score(x_test, y_test)
+    # average over trials to reduce run-to-run variability
+    test_accuracies[i] = trial_accuracies.mean()
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)

From 1b1eff96c6f68a4c7e6bab3eb4030d95c367990b Mon Sep 17 00:00:00 2001
From: Prabha Dublish
Date: Sat, 12 Nov 2016 15:53:57 -0500
Subject: [PATCH 2/2] Questions

---
 questions.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..c24e6af
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,13 @@
+Question 1:
+As the training percentage increases, the accuracy on the test set increases.
+
+Question 2:
+Training percentages between 30% and 70% seem to produce the noisiest test accuracies.
+It's possible the data used to train the model differs substantially from the held-out test data, leading to trial-to-trial variability in accuracy.
+
+Question 3:
+Increasing num_trials to 100 noticeably smooths out the curve.
+
+Question 4:
+Changing C to 20**-10 made the graph noisier overall, but with smaller swings than before.
+Previously the variability was concentrated between 30% and 70%; now the whole curve shows some noise, but not to the same extent.
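
Review note on the toolbox code: the surrounding file uses `sklearn.cross_validation` and the Python 2 `print` statement, both of which are gone from current scikit-learn and Python (train_test_split lives in `sklearn.model_selection` since scikit-learn 0.18). Below is a minimal, self-contained sketch of the same experiment against the modern API, assuming Python 3 and scikit-learn >= 0.18; it is an illustration of the technique, not part of the submitted diff:

    # Sketch of the learning-curve experiment (assumes Python 3, scikit-learn >= 0.18).
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    data = load_digits()
    num_trials = 10
    train_percentages = range(5, 95, 5)
    test_accuracies = np.zeros(len(train_percentages))

    for i, train_percent in enumerate(train_percentages):
        trial_accuracies = np.zeros(num_trials)
        for n in range(num_trials):
            # each trial draws a fresh random split of the digits data
            x_train, x_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=train_percent / 100.0)
            model = LogisticRegression(C=10**-10)
            model.fit(x_train, y_train)
            trial_accuracies[n] = model.score(x_test, y_test)
        # each plotted point is the mean accuracy over num_trials random splits
        test_accuracies[i] = trial_accuracies.mean()

    plt.plot(train_percentages, test_accuracies)
    plt.xlabel('percentage of data used for training')
    plt.ylabel('accuracy on test set')
    plt.show()

Run as-is, this should show the same qualitative behavior described in questions.txt: accuracy rises with the training percentage, and raising num_trials flattens the noise in the curve.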
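
Review note on Question 4: in scikit-learn's LogisticRegression, C is the inverse of the regularization strength, and 20**-10 (about 9.8e-14) is roughly a thousand times smaller than 10**-10, so the change regularizes the model even harder. A quick way to see the effect, reusing the names from the sketch above; the fixed 50/50 split and the random_state are illustrative choices, not taken from the patch:

    # one fixed split, so the two C values are compared on identical data
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, train_size=0.5, random_state=0)
    for C in (10**-10, 20**-10):
        model = LogisticRegression(C=C).fit(x_train, y_train)
        print('C = %.3g -> test accuracy %.3f' % (C, model.score(x_test, y_test)))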