Binary file added ML_fig1.png
Binary file added ML_fig2.png
38 changes: 38 additions & 0 deletions machine_learning_code.py
@@ -0,0 +1,38 @@
""" Exploring learning curves for classification of handwritten digits """

import matplotlib.pyplot as plt
import numpy
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
print(data.DESCR)
num_trials = 10
#num_trials = 300
train_percentages = range(5,95,5)
test_accuracies = numpy.zeros(len(train_percentages))

# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
# the resultant accuracy.
# You should repeat each training percentage num_trials times to smooth out variability
# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner

# TODO: your code here
for i in range(len(train_percentages)):
    for n in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_percentages[i] / 100.0)
        model = LogisticRegression(C=10**-10)
        #model = LogisticRegression(C=10**-20)
        #model = LogisticRegression(C=1**-10)
        model.fit(X_train, y_train)
        print("Train accuracy %f" % model.score(X_train, y_train))
        print("Test accuracy %f" % model.score(X_test, y_test))
        test_accuracies[i] += model.score(X_test, y_test)  # accumulate this trial's test score
    test_accuracies[i] /= num_trials  # average over all trials for this training percentage

fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()
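For comparison, scikit-learn ships a helper that runs essentially this experiment in one call. A minimal sketch, assuming a scikit-learn version where `learning_curve` lives in `sklearn.model_selection` (`max_iter=1000` is an added assumption to help the solver converge):

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

digits = load_digits()
# Fractions of the data to train on, mirroring train_percentages above
sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(C=10**-10, max_iter=1000),
    digits.data, digits.target,
    train_sizes=np.linspace(0.05, 0.9, 5), cv=5)

# Averaging over the cross-validation folds plays the role of num_trials
mean_test = test_scores.mean(axis=1)
print(mean_test)
```

Here the cross-validation folds, rather than repeated random splits, supply the averaging that smooths the curve.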
15 changes: 15 additions & 0 deletions questions.txt
@@ -0,0 +1,15 @@
1. What is the general trend in the curve?
The general trend is that accuracy on the test set rises as the percentage of data used for training increases. In other words, the more examples the model sees during training, the better it generalizes.

2. Are there parts of the curve that appear to be noisier than others? Why?
The curve is noisiest roughly between 5% and 50% training data. With a small training set, each random split produces a substantially different model, so the measured accuracy varies more from trial to trial.

3. How many trials do you need to get a smooth curve?
I tested trial counts of 50, 100, and 200. The curve finally became fairly smooth, with very little noise, at 300 trials.
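The smoothing effect of more trials follows from averaging: the standard deviation of a mean of n independent measurements shrinks like 1/sqrt(n). A hypothetical simulation (true_accuracy and noise_sd are made-up numbers, not measurements from this assignment):

```python
import numpy as np

rng = np.random.default_rng(0)
true_accuracy = 0.85   # hypothetical underlying accuracy
noise_sd = 0.05        # hypothetical spread of a single trial

def averaged_estimate(num_trials):
    # Average num_trials simulated noisy accuracy measurements
    trials = true_accuracy + noise_sd * rng.normal(size=num_trials)
    return trials.mean()

spread = {}
for n in (10, 100, 300):
    estimates = [averaged_estimate(n) for _ in range(1000)]
    spread[n] = float(np.std(estimates))
    print(n, spread[n])
```

The spread at 300 trials is roughly sqrt(30) times smaller than at 10 trials, which matches the visibly smoother curve.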

4. Try different values for C (by changing LogisticRegression(C=10**-10)). What happens?
In scikit-learn, C is the inverse of the regularization strength. The higher the C value, the higher the accuracy even at low training percentages; I verified this with C = 1. The lower the C value, the noisier the curve becomes, with wilder variation; I tested this with C = 10**-20.
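A quick sketch of that sweep on a single fixed split, using the C values tried above plus the default C = 1 (the 30% training size, random_state, and max_iter=1000 are illustrative assumptions):

```python
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, train_size=0.3, random_state=0)

scores = {}
for C in (10**-20, 10**-10, 1.0):
    # Smaller C means stronger regularization in scikit-learn
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    scores[C] = model.score(X_test, y_test)
    print(C, scores[C])
```

Fixing the split isolates the effect of C from the trial-to-trial noise discussed in question 2.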