15 changes: 13 additions & 2 deletions learning_curve.py
@@ -1,7 +1,7 @@
""" Exploring learning curves for classification of handwritten digits """

import matplotlib.pyplot as plt
-import numpy
+import numpy as npy
from sklearn.datasets import *
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
@@ -10,14 +10,25 @@
print data.DESCR
num_trials = 10
train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = npy.zeros(len(train_percentages))


# Train a model with training percentages between 5 and 90 (see train_percentages)
# and evaluate the resulting accuracy.
# Repeat each training percentage num_trials times to smooth out variability.
# For consistency with the previous example, use model = LogisticRegression(C=10**-10).

for (i, train_percent) in enumerate(train_percentages):
    testdataaccuracy = npy.zeros(num_trials)

    for n in range(num_trials):
        x_train, x_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_percent / 100.0)
        model = LogisticRegression(C=10**-10)
        model.fit(x_train, y_train)
        testdataaccuracy[n] = model.score(x_test, y_test)
    # Average the trial accuracies for this training percentage.
    test_accuracies[i] = testdataaccuracy.mean()

fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
13 changes: 13 additions & 0 deletions questions.txt
@@ -0,0 +1,13 @@
Question 1:
As the training percentage increases, accuracy on the test set also increases.

Question 2:
The curve seems noisiest when the training percentage is between 30% and 70%.
It's possible that the data used to fit the model differs substantially from the data in the test portion at those splits, leading to run-to-run variability in accuracy.

Question 3:
Running 100 trials really smooths out the curve.
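
That smoothing is expected: the spread of an average of num_trials independent runs shrinks roughly as 1/sqrt(num_trials). A numpy-only sketch with synthetic per-trial accuracies (the 0.9 mean and 0.05 noise level are made-up stand-ins, not measurements from the digits data):

```python
import numpy as np

rng = np.random.default_rng(0)

def spread_of_mean(num_trials, reps=2000):
    """Std dev of the num_trials-averaged accuracy across repeated experiments."""
    # Synthetic per-trial accuracies: mean 0.9, noise 0.05 (made-up values).
    samples = 0.9 + 0.05 * rng.standard_normal((reps, num_trials))
    return samples.mean(axis=1).std()

spread_10 = spread_of_mean(10)    # roughly 0.05 / sqrt(10)
spread_100 = spread_of_mean(100)  # roughly 0.05 / sqrt(100)
```

Averaging 100 trials cuts the point-to-point scatter of the plotted curve by about a factor of three relative to 10 trials, which is why the curve looks so much smoother.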

Question 4:
Changing C to 20**-10 made the graph noisier overall, but the variation was not as large as it was before.
Previously the variability was concentrated between 30% and 70%; now the whole curve has some noise, but not to the same extent.
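
The C comparison above can be reproduced with a small sketch (this assumes a modern scikit-learn where train_test_split lives in sklearn.model_selection rather than the deprecated sklearn.cross_validation used in the diff; mean_accuracy is a hypothetical helper, not part of the assignment code):

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def mean_accuracy(C, train_percent, num_trials=10):
    """Average test accuracy over num_trials random train/test splits."""
    data = load_digits()
    scores = np.zeros(num_trials)
    for n in range(num_trials):
        x_train, x_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_percent / 100.0)
        model = LogisticRegression(C=C, max_iter=200)
        model.fit(x_train, y_train)
        scores[n] = model.score(x_test, y_test)
    return scores.mean()

acc_base = mean_accuracy(C=10**-10, train_percent=50)
acc_alt = mean_accuracy(C=20**-10, train_percent=50)
```

Since C is the inverse regularization strength in LogisticRegression, both values are extremely strong regularizers (20**-10 is even smaller than 10**-10), so differences between the two runs show up mostly as noise, consistent with the observation above.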