From 0a3d848a41a67487b8f36c3465b2e9fb847e7760 Mon Sep 17 00:00:00 2001
From: Saurabh Ghanekar <ghanekarsaurabh8@gmail.com>
Date: Wed, 19 Dec 2018 16:15:46 +0530
Subject: [PATCH 01/15] Create LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..3a102b1
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Saurabh Ghanekar
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From 8c05a1e554348ecb18454de0d3afab0b69c0b40c Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 19:27:41 -0800
Subject: [PATCH 02/15] Create .gitignore and a PyCharm project

Hey there, @cshubhamrao here.
---
 .gitignore           | 80 ++++++++++++++++++++++++++++++++++++++++++++
 .idea/Colbert-AI.iml | 11 ++++++
 .idea/encodings.xml  |  4 +++
 .idea/misc.xml       |  7 ++++
 .idea/modules.xml    |  8 +++++
 .idea/vcs.xml        |  6 ++++
 6 files changed, 116 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .idea/Colbert-AI.iml
 create mode 100644 .idea/encodings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c44232f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,80 @@
+
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml
new file mode 100644
index 0000000..6711606
--- /dev/null
+++ b/.idea/Colbert-AI.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..14980cb
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (Colbert-AI)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..5f1f9ec
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Colbert-AI.iml" filepath="$PROJECT_DIR$/.idea/Colbert-AI.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file

From 4e86e3a88246aee4b825a2551d3d0ff6f1e499e0 Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 19:34:22 -0800
Subject: [PATCH 03/15] Add requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e0d091f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+Unidecode==1.0.23
+youtube-dl==2019.2.8

From 873578aabfbf2610494cf6e628e2621e17d3c3ca Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 19:36:28 -0800
Subject: [PATCH 04/15] Update .gitignore

To ignore youtube-dl created files and folders
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index c44232f..b0f3a96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,8 @@
 [Ll]ib64
 [Ll]ocal
 [Ss]cripts
+[Ee]tc
+[Ss]hare
 pyvenv.cfg
 .venv
 pip-selfcheck.json

From 3d740a92bd7928f8876c153418925dfbd725328f Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 19:39:07 -0800
Subject: [PATCH 05/15] Modify chain.py

Let's see if it works
---
 chain.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/chain.py b/chain.py
index 521c63f..4af0e0c 100644
--- a/chain.py
+++ b/chain.py
@@ -6,10 +6,7 @@
 from unidecode import unidecode
 from splitters import split_into_sentences
 
-try:  # pragma: no cover
-    basestring
-except NameError:  # pragma: no cover
-    basestring = str
+basestring = str
 
 BEGIN = "__BEGIN__"
 END = "__END__"

From fcd1eedda4be1e00096842f2bf72992cea34d308 Mon Sep 17 00:00:00 2001
From: Saurabh Ghanekar <ghanekarsaurabh8@gmail.com>
Date: Sat, 9 Feb 2019 20:46:15 -0800
Subject: [PATCH 06/15] Modify chain.py

'basestring' was used to check if a variable is a string or not.
---
 chain.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/chain.py b/chain.py
index 4af0e0c..8bbb32c 100644
--- a/chain.py
+++ b/chain.py
@@ -6,8 +6,6 @@
 from unidecode import unidecode
 from splitters import split_into_sentences
 
-basestring = str
-
 BEGIN = "__BEGIN__"
 END = "__END__"
 

From e863a2f2727a0bcb057026cec5f27b06260c5295 Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 21:27:06 -0800
Subject: [PATCH 07/15] Create download.py file

Will scrape and download subtitles from the playlist

Signed-off-by: Shubham Rao <shubham_rao@berkeley.edu>
---
 download.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 download.py

diff --git a/download.py b/download.py
new file mode 100644
index 0000000..c0c31a9
--- /dev/null
+++ b/download.py
@@ -0,0 +1,22 @@
+# MIT License
+#
+# Copyright (c) 2019 Shubham Rao
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+

From 686c241bce5ceb7a831d34fb5058a6310071d70a Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 22:27:32 -0800
Subject: [PATCH 08/15] Formatting fixes

* Make PEP8 conformant
* remove unnecessary lines
* break long lines

Signed-off-by: Shubham Rao <shubham_rao@berkeley.edu>
---
 chain.py     | 53 ++++++++++++++++++++++++++++++----------------------
 splitters.py | 52 ++++++++++++++++++++++++++++++---------------------
 test.py      |  8 ++++----
 3 files changed, 66 insertions(+), 47 deletions(-)

diff --git a/chain.py b/chain.py
index 4af0e0c..9cc5605 100644
--- a/chain.py
+++ b/chain.py
@@ -31,7 +31,8 @@ class MarkovChain(object):
 
     def __init__(self, corpus, state_size):
         """
-        corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
+        corpus: A list of lists, where each inner list represents one sentence and contains the
+        words that make up that sentence.
 
         state_size: items used to represent the state of the model.
         """
@@ -41,15 +42,17 @@ def __init__(self, corpus, state_size):
 
     def build(self, corpus, state_size):
         """
-        Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears.
+        Returns a dict of dicts where the keys of the outer dict represent all possible states, and
+        point to the inner dicts. The inner dicts represent all possibilities for the "next" item in
+        the chain, along with the count of times it appears.
         """
         model = {}
 
         for run in corpus:
             items = ([BEGIN] * state_size) + run + [END]
-            for i in range(len(run)+1):
-                state = tuple(items[i:i+state_size])
-                follow = items[i+state_size]
+            for i in range(len(run) + 1):
+                state = tuple(items[i:i + state_size])
+                follow = items[i + state_size]
                 if state not in model:
                     model[state] = {}
 
@@ -72,7 +75,8 @@ def move(self, state):
 
     def gen(self, init_state=None):
         """
-        Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state.
+        Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items
+        until the chain reaches the "END" state.
         """
         state = init_state or (BEGIN,) * self.state_size
         while True:
@@ -95,19 +99,19 @@ def to_json(self):
         return json.dumps(list(self.model.items()))
 
 
-"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
-
 DEFAULT_MAX_OVERLAP_RATIO = 0.7
 DEFAULT_MAX_OVERLAP_TOTAL = 20
 DEFAULT_TRIES = 8
 
 
 class Text(object):
-    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
+    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None,
+                 retain_original=True):
         """
         input_text: A string.
         state_size: An integer, indicating the number of words in the model's state.
-        parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
+        parsed_sentences: A list of lists, where each inner list represents one sentence and
+        contains the words that make up that sentence.
         """
 
         can_make_sentences = parsed_sentences is not None or input_text is not None
@@ -208,7 +212,10 @@ def generate_corpus(self, text):
 
     def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
         """
-        Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15).
+        Given a generated list of words, accept or reject it. This one rejects sentences that too
+        closely match the original text, namely those that contain any identical sequence of words
+        of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the
+        total number of words, and (b) `max_overlap_total` (default: 20).
         """
         # Rejects chunk that is similar
 
@@ -217,7 +224,7 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
         overlap_over = overlap_max + 1
 
         gram_count = max((len(words) - overlap_max), 1)
-        grams = [words[i:i+overlap_over] for i in range(gram_count)]
+        grams = [words[i:i + overlap_over] for i in range(gram_count)]
 
         for gm in grams:
             gram_joined = self.word_join(gm)
@@ -228,15 +235,19 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
 
     def make_sentences(self, init_state=None, **kwargs):
         """
-        Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output".
+        Attempts "tries" (default: 8) times to generate a valid sentence, based on the model and
+        "text_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to
+        "text_sentences_output".
 
         If successful, returns the sentence as a string. If not, returns None.
 
-        If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model.
+        If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method
+        chooses a sentence-start at random, in accordance with the model.
 
         If "test_output" is set as False then the "text_sentences_output" check will be skipped.
 
-        If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit.
+        If "max_words" is specified, the word count for the sentence will be evaluated against the
+        provided limit.
         """
 
         tries = kwargs.get("tries", DEFAULT_TRIES)
@@ -245,7 +256,7 @@ def make_sentences(self, init_state=None, **kwargs):
         test_output = kwargs.get("test_output", True)
         max_words = kwargs.get("max_words", None)
 
-        if init_state != None:
+        if init_state is not None:
             prefix = list(init_state)
             for word in prefix:
                 if word == BEGIN:
@@ -258,7 +269,7 @@ def make_sentences(self, init_state=None, **kwargs):
 
         for _ in range(tries):
             words = prefix + self.chain.walk(init_state)
-            if max_words != None and len(words) > max_words:
+            if max_words is not None and len(words) > max_words:
                 continue
             if test_output and hasattr(self, "rejoined_text"):
                 if self.text_sentences_output(words, mor, mot):
@@ -271,14 +282,12 @@ def make_sentences(self, init_state=None, **kwargs):
 
     def make_short_sentence(self, max_chars, min_chars=0, **kwargs):
         """
-        Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence".
+        Tries making a sentence of no more than "max_chars" characters and optionally no less than
+        "min_chars" characters, passing **kwargs to "self.make_sentences".
         """
         tries = kwargs.get("tries", DEFAULT_TRIES)
 
         for _ in range(tries):
             sentence = self.make_sentences(**kwargs)
-            if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars:
+            if sentence and max_chars >= len(sentence) >= min_chars:
                 return sentence
-
-
-"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
diff --git a/splitters.py b/splitters.py
index e9804eb..62d2b66 100644
--- a/splitters.py
+++ b/splitters.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*- 
+# -*- coding: utf-8 -*-
 import re
 
 ascii_lowercase = "abcdefghijklmnopqrstuvwxyz"
@@ -7,30 +7,39 @@
 # States w/ with thanks to https://github.com/unitedstates/python-us
 # Titles w/ thanks to https://github.com/nytimes/emphasis and @donohoe
 abbr_capped = "|".join([
-    "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States
+    "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|"
+    "neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo",  # States
     "u.s",
-    "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|snr|atty|supt", # Titles
-    "ave|blvd|st|rd|hwy", # Streets
-    "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months
-    "|".join(ascii_lowercase) # Initials
+    "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|"
+    "snr|atty|supt",  # Titles
+    "ave|blvd|st|rd|hwy",  # Streets
+    "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec",  # Months
+    "|".join(ascii_lowercase)  # Initials
 ]).split("|")
 
 abbr_lowercase = "etc|v|vs|viz|al|pct"
 
 exceptions = "U.S.|U.N.|E.U.|F.B.I.|C.I.A.".split("|")
 
+
 def is_abbreviation(dotted_word):
     clipped = dotted_word[:-1]
     if clipped[0] in ascii_uppercase:
-        if clipped.lower() in abbr_capped: return True
-        else: return False
+        if clipped.lower() in abbr_capped:
+            return True
+        else:
+            return False
     else:
-        if clipped in abbr_lowercase: return True
-        else: return False
+        if clipped in abbr_lowercase:
+            return True
+        else:
+            return False
+
 
 def is_sentence_ender(word):
-    if word in exceptions: return False
-    if word[-1] in [ "?", "!" ]:
+    if word in exceptions:
+        return False
+    if word[-1] in ["?", "!"]:
         return True
     if len(re.sub(r"[^A-Z]", "", word)) > 1:
         return True
@@ -38,16 +47,17 @@ def is_sentence_ender(word):
         return True
     return False
 
+
 def split_into_sentences(text):
     potential_end_pat = re.compile(r"".join([
-        r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation
-        r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc
-        r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash)
-        ]), re.U)
+        r"([\w\.'’&\]\)]+[\.\?!])",  # A word that ends with punctuation
+        r"([‘’“”'\"\)\]]*)",  # Followed by optional quote/parens/etc
+        r"(\s+(?![a-z\-–—]))",  # Followed by whitespace + non-(lowercase or dash)
+    ]), re.U)
     dot_iter = re.finditer(potential_end_pat, text)
-    end_indices = [ (x.start() + len(x.group(1)) + len(x.group(2)))
-        for x in dot_iter
-        if is_sentence_ender(x.group(1)) ]
+    end_indices = [(x.start() + len(x.group(1)) + len(x.group(2)))
+                   for x in dot_iter
+                   if is_sentence_ender(x.group(1))]
     spans = zip([None] + end_indices, end_indices + [None])
-    sentences = [ text[start:end].strip() for start, end in spans ]
-    return sentences
\ No newline at end of file
+    sentences = [text[start:end].strip() for start, end in spans]
+    return sentences
diff --git a/test.py b/test.py
index 2f1e251..b3d1514 100644
--- a/test.py
+++ b/test.py
@@ -5,14 +5,14 @@
 
 text_model = chain.Text(text, state_size=3)
 
-file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt","w")
+file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt", "w")
 
 for j in range(200):
     print(" ")
     for i in range(1):
-        output=text_model.make_short_sentence(max_chars=230, min_chars=70)
+        output = text_model.make_short_sentence(max_chars=230, min_chars=70)
         txt = output + "\n"
         print(txt)
-        
+
         file.write(txt)
-file.close()
\ No newline at end of file
+file.close()

From 8c4b33e7532cb5274f98c97bb10f5260b1271c31 Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 22:38:38 -0800
Subject: [PATCH 09/15] Set PyCharm folders

* Set Excluded and Source folders

Signed-off-by: Shubham Rao <shubham_rao@berkeley.edu>
---
 .idea/Colbert-AI.iml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml
index 6711606..faa0db1 100644
--- a/.idea/Colbert-AI.iml
+++ b/.idea/Colbert-AI.iml
@@ -1,7 +1,14 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
+      <excludeFolder url="file://$MODULE_DIR$/bin" />
+      <excludeFolder url="file://$MODULE_DIR$/etc" />
+      <excludeFolder url="file://$MODULE_DIR$/include" />
+      <excludeFolder url="file://$MODULE_DIR$/lib" />
+      <excludeFolder url="file://$MODULE_DIR$/share" />
+    </content>
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>

From a7e051afb553abab39b0d0908e214caab5d29d0e Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 22:40:19 -0800
Subject: [PATCH 10/15] download: Initial commit

NOT RECOMMENDED FOR USE (yet)
Downloads all subtitles (~1052) into the current directory

Signed-off-by: Shubham Rao <shubham_rao@berkeley.edu>
---
 download.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/download.py b/download.py
index c0c31a9..fa62b6d 100644
--- a/download.py
+++ b/download.py
@@ -20,3 +20,34 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import youtube_dl
+
+PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j'
+
+DATA_DIR = "data/"
+opts = {
+
+    # Don't download video
+    'skip_download': True,
+
+    # Subtitle Options
+    'writesubtitles': True,
+    'subtitleslangs': ['en'],  # youtube-dl reads 'subtitleslangs' and expects a list
+    'subtitlesformat': 'vtt',  # youtube-dl reads 'subtitlesformat'
+
+    # File Options
+    'restrictfilenames': True,
+
+    # Misc. Options
+    'playlistrandom': True,
+    'ignoreerrors': True,
+}
+
+
+def main():
+    with youtube_dl.YoutubeDL(opts) as ydl:
+        ydl.download([PLAYLIST_URL])
+
+
+if __name__ == '__main__':
+    main()

From fadd90f79cbaa78f9e14051f9cd1bd7cd25ae4fa Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 23:31:34 -0800
Subject: [PATCH 11/15] download.py: Download to data/ folder

* simple file names (number.en.vtt)

Signed-off-by: Shubham Rao <shubham_rao@berkeley.edu>
---
 download.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/download.py b/download.py
index fa62b6d..45999e3 100644
--- a/download.py
+++ b/download.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 import youtube_dl
+import os.path
 
 PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j'
 
@@ -29,6 +30,7 @@
 
     # Don't download video
     'skip_download': True,
+    'download_archive': os.path.join(DATA_DIR, "archive"),  # key youtube-dl actually reads
 
     # Subtitle Options
     'writesubtitles': True,
@@ -37,10 +39,14 @@
 
     # File Options
     'restrictfilenames': True,
+    'nooverwrites': True,
+    'outtmpl': os.path.join(DATA_DIR, "%(playlist_index)s.%(ext)s"),
 
     # Misc. Options
     'playlistrandom': True,
     'ignoreerrors': True,
+    'quiet': True,
+    'forcefilename': True,
 }
 
 

From 25ae7d17f665c5a8d48c4a6db21d403e14e4e3e3 Mon Sep 17 00:00:00 2001
From: Shubham Rao <shubham_rao@berkeley.edu>
Date: Sat, 9 Feb 2019 23:40:32 -0800
Subject: [PATCH 12/15] reformat: move .vtt to data/captions folder

---
 .gitignore  | 1 +
 data/readme | 1 +
 download.py | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100644 data/readme

diff --git a/.gitignore b/.gitignore
index b0f3a96..e240b09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.vtt
 
 # Virtualenv
 # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
diff --git a/data/readme b/data/readme
new file mode 100644
index 0000000..95f2b16
--- /dev/null
+++ b/data/readme
@@ -0,0 +1 @@
+Folder for storing downloaded captions
diff --git a/download.py b/download.py
index 45999e3..9e218f7 100644
--- a/download.py
+++ b/download.py
@@ -40,7 +40,7 @@
     # File Options
     'restrictfilenames': True,
     'nooverwrites': True,
-    'outtmpl': os.path.join(DATA_DIR, "%(playlist_index)s.%(ext)s"),
+    'outtmpl': os.path.join(DATA_DIR, "captions", "%(playlist_index)s.%(ext)s"),
 
     # Misc. Options
     'playlistrandom': True,

From 4a896dee66339c4f90e450a2fe32286f4ac79984 Mon Sep 17 00:00:00 2001
From: Shubham Rao <cshubhamrao@gmail.com>
Date: Thu, 21 Nov 2019 13:36:03 +0530
Subject: [PATCH 13/15] Update Copyright text in LICENSE

Signed-off-by: Shubham Rao <cshubhamrao@gmail.com>
---
 LICENSE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/LICENSE b/LICENSE
index 3a102b1..cec0a50 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
 Copyright (c) 2018 Saurabh Ghanekar
+Copyright (c) 2019 Shubham Rao
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From bb86c14c8dc3a55b1580ef15f7f53432e727380d Mon Sep 17 00:00:00 2001
From: Shubham Rao <cshubhamrao@gmail.com>
Date: Thu, 21 Nov 2019 13:36:34 +0530
Subject: [PATCH 14/15] PyCharm: Use included virtualenv

Signed-off-by: Shubham Rao <cshubhamrao@gmail.com>
---
 .idea/Colbert-AI.iml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml
index faa0db1..5c14f74 100644
--- a/.idea/Colbert-AI.iml
+++ b/.idea/Colbert-AI.iml
@@ -9,7 +9,7 @@
       <excludeFolder url="file://$MODULE_DIR$/lib" />
       <excludeFolder url="file://$MODULE_DIR$/share" />
     </content>
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (Colbert-AI)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="TestRunnerService">

From a13607bf27b306053cf0bfe253beab75573da605 Mon Sep 17 00:00:00 2001
From: Shubham Rao <cshubhamrao@gmail.com>
Date: Thu, 21 Nov 2019 13:36:58 +0530
Subject: [PATCH 15/15] youtube-dl: use latest package versions

Signed-off-by: Shubham Rao <cshubhamrao@gmail.com>
---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e0d091f..afa9a77 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-Unidecode==1.0.23
-youtube-dl==2019.2.8
+Unidecode>=1.1.1
+youtube-dl>=2019.11.5