From 4cc366e0d41b3c3b9a82669135b62b9566483d2d Mon Sep 17 00:00:00 2001
From: Prabha Dublish
Date: Sat, 12 Nov 2016 15:53:38 -0500
Subject: [PATCH 1/2] Turning in last toolbox

---
 learning_curve.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..d9166fa 100755
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,7 +1,7 @@
 """ Exploring learning curves for classification of handwritten digits """
 
 import matplotlib.pyplot as plt
-import numpy
+import numpy as np
 from sklearn.datasets import *
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
@@ -10,7 +10,7 @@
 print data.DESCR
 num_trials = 10
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = np.zeros(len(train_percentages))
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
@@ -18,5 +18,17 @@
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 # TODO: your code here
+for i, train_percent in enumerate(train_percentages):
+    trial_accuracies = np.zeros(num_trials)
+
+    for n in range(num_trials):
+        # re-split the data on every trial so the mean smooths out sampling noise
+        x_train, x_test, y_train, y_test = train_test_split(
+            data.data, data.target, train_size=train_percent/100.0)
+        model = LogisticRegression(C=10**-10)
+        model.fit(x_train, y_train)
+        trial_accuracies[n] = model.score(x_test, y_test)
+    # average over trials to reduce run-to-run variability
+    test_accuracies[i] = trial_accuracies.mean()
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)

From 1b1eff96c6f68a4c7e6bab3eb4030d95c367990b Mon Sep 17 00:00:00 2001
From: Prabha Dublish
Date: Sat, 12 Nov 2016 15:53:57 -0500
Subject: [PATCH 2/2] Questions

---
 questions.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..c24e6af
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,13 @@
+Question 1:
+As the training percentage increases, the accuracy on the test set increases.
+
+Question 2:
+Training percentages between 30% and 70% seem to produce the noisiest test accuracies.
+It's possible the data used to train the model differs substantially from the held-out test data, leading to trial-to-trial variability in accuracy.
+
+Question 3:
+Increasing num_trials to 100 noticeably smooths out the curve.
+
+Question 4:
+Changing C to 20**-10 made the graph noisier overall, but with smaller swings than before.
+Previously the variability was concentrated between 30% and 70%; now the whole curve shows some noise, but not to the same extent.
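
Review note on the toolbox code: the surrounding file uses `sklearn.cross_validation` and the Python 2 `print` statement, both of which are gone from current scikit-learn and Python (train_test_split lives in `sklearn.model_selection` since scikit-learn 0.18). Below is a minimal, self-contained sketch of the same experiment against the modern API, assuming Python 3 and scikit-learn >= 0.18; it is an illustration of the technique, not part of the submitted diff:

    # Sketch of the learning-curve experiment (assumes Python 3, scikit-learn >= 0.18).
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    data = load_digits()
    num_trials = 10
    train_percentages = range(5, 95, 5)
    test_accuracies = np.zeros(len(train_percentages))

    for i, train_percent in enumerate(train_percentages):
        trial_accuracies = np.zeros(num_trials)
        for n in range(num_trials):
            # each trial draws a fresh random split of the digits data
            x_train, x_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=train_percent / 100.0)
            model = LogisticRegression(C=10**-10)
            model.fit(x_train, y_train)
            trial_accuracies[n] = model.score(x_test, y_test)
        # each plotted point is the mean accuracy over num_trials random splits
        test_accuracies[i] = trial_accuracies.mean()

    plt.plot(train_percentages, test_accuracies)
    plt.xlabel('percentage of data used for training')
    plt.ylabel('accuracy on test set')
    plt.show()

Run as-is, this should show the same qualitative behavior described in questions.txt: accuracy rises with the training percentage, and raising num_trials flattens the noise in the curve.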
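
Review note on Question 4: in scikit-learn's LogisticRegression, C is the inverse of the regularization strength, and 20**-10 (about 9.8e-14) is roughly a thousand times smaller than 10**-10, so the change regularizes the model even harder. A quick way to see the effect, reusing the names from the sketch above; the fixed 50/50 split and the random_state are illustrative choices, not taken from the patch:

    # one fixed split, so the two C values are compared on identical data
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, train_size=0.5, random_state=0)
    for C in (10**-10, 20**-10):
        model = LogisticRegression(C=C).fit(x_train, y_train)
        print('C = %.3g -> test accuracy %.3f' % (C, model.score(x_test, y_test)))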