NextTechLabAP · cshubhamrao · Nov 21, 2019 · Dec 19, 2018 · Feb 10, 2019 · Feb 10, 2019
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,83 @@
+*.vtt
+
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+[Ee]tc
+[Ss]hare
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml
diff --git a/.idea/encodings.xml b/.idea/encodings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2018 Saurabh Ghanekar
+Copyright (c) 2019 Shubham Rao
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/chain.py b/chain.py
@@ -6,10 +6,7 @@
 from unidecode import unidecode
 from splitters import split_into_sentences
 
-try:  # pragma: no cover
-    basestring
-except NameError:  # pragma: no cover
-    basestring = str
+basestring = str
 
 BEGIN = "__BEGIN__"
 END = "__END__"
@@ -34,7 +31,8 @@ class MarkovChain(object):
 
     def __init__(self, corpus, state_size):
         """
-        corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
+        corpus: A list of lists, where each inner list is one run (like a sentence) and
+        contains the words that make up that sentence.
 
         state_size: items used to represent the state of the model.
         """
@@ -44,15 +42,17 @@ def __init__(self, corpus, state_size):
 
     def build(self, corpus, state_size):
         """
-        Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears.
+        Returns a dict of dicts where the keys of the outer dict represent all possible states, and
+        point to the inner dicts. The inner dicts represent all possibilities for the "next" item in
+        the chain, along with the count of times it appears.
         """
         model = {}
 
         for run in corpus:
             items = ([BEGIN] * state_size) + run + [END]
-            for i in range(len(run)+1):
-                state = tuple(items[i:i+state_size])
-                follow = items[i+state_size]
+            for i in range(len(run) + 1):
+                state = tuple(items[i:i + state_size])
+                follow = items[i + state_size]
                 if state not in model:
                     model[state] = {}
 
@@ -75,7 +75,8 @@ def move(self, state):
 
     def gen(self, init_state=None):
         """
-        Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state.
+        Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items
+        until the chain reaches the "END" state.
         """
         state = init_state or (BEGIN,) * self.state_size
         while True:
@@ -98,19 +99,19 @@ def to_json(self):
         return json.dumps(list(self.model.items()))
 
 
-"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
-
 DEFAULT_MAX_OVERLAP_RATIO = 0.7
 DEFAULT_MAX_OVERLAP_TOTAL = 20
 DEFAULT_TRIES = 8
 
 
 class Text(object):
-    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
+    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None,
+                 retain_original=True):
         """
         input_text: A string.
         state_size: An integer, indicating the number of words in the model's state.
-        parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
+        parsed_sentences: A list of lists, where each inner list is one sentence and contains
+        the words that make up that sentence.
         """
 
         can_make_sentences = parsed_sentences is not None or input_text is not None
@@ -211,7 +212,10 @@ def generate_corpus(self, text):
 
     def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
         """
-        Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15).
+        Given a generated list of words, accept or reject it. This one rejects sentences that too
+        closely match the original text, namely those that contain any identical sequence of words
+        of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the
+        total number of words, and (b) `max_overlap_total` (default: 20).
         """
         # Rejects chunk that is similar
 
@@ -220,7 +224,7 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
         overlap_over = overlap_max + 1
 
         gram_count = max((len(words) - overlap_max), 1)
-        grams = [words[i:i+overlap_over] for i in range(gram_count)]
+        grams = [words[i:i + overlap_over] for i in range(gram_count)]
 
         for gm in grams:
             gram_joined = self.word_join(gm)
@@ -231,15 +235,19 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
 
     def make_sentences(self, init_state=None, **kwargs):
         """
-        Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output".
+        Attempts "tries" (default: 8) times to generate a valid sentence, based on the model and
+        "text_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to
+        "text_sentences_output".
 
         If successful, returns the sentence as a string. If not, returns None.
 
-        If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model.
+        If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method
+        chooses a sentence-start at random, in accordance with the model.
 
         If "test_output" is set as False then the "text_sentences_output" check will be skipped.
 
-        If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit.
+        If "max_words" is specified, the word count for the sentence will be evaluated against the
+        provided limit.
         """
 
         tries = kwargs.get("tries", DEFAULT_TRIES)
@@ -248,7 +256,7 @@ def make_sentences(self, init_state=None, **kwargs):
         test_output = kwargs.get("test_output", True)
         max_words = kwargs.get("max_words", None)
 
-        if init_state != None:
+        if init_state is not None:
             prefix = list(init_state)
             for word in prefix:
                 if word == BEGIN:
@@ -261,7 +269,7 @@ def make_sentences(self, init_state=None, **kwargs):
 
         for _ in range(tries):
             words = prefix + self.chain.walk(init_state)
-            if max_words != None and len(words) > max_words:
+            if max_words is not None and len(words) > max_words:
                 continue
             if test_output and hasattr(self, "rejoined_text"):
                 if self.text_sentences_output(words, mor, mot):
@@ -274,14 +282,12 @@ def make_sentences(self, init_state=None, **kwargs):
 
     def make_short_sentence(self, max_chars, min_chars=0, **kwargs):
         """
-        Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence".
+        Tries making a sentence of no more than "max_chars" characters and optionally no less than
+        "min_chars" characters, passing **kwargs to "self.make_sentences".
         """
         tries = kwargs.get("tries", DEFAULT_TRIES)
 
         for _ in range(tries):
             sentence = self.make_sentences(**kwargs)
-            if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars:
+            if sentence and max_chars >= len(sentence) >= min_chars:
                 return sentence
-
-
-"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
diff --git a/data/readme b/data/readme
@@ -0,0 +1 @@
+Folder for storing downloaded captions
diff --git a/download.py b/download.py
@@ -0,0 +1,59 @@
+# MIT License
+#
+# Copyright (c) 2019 Shubham Rao
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import youtube_dl
+import os.path
+
+PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j'
+
+DATA_DIR = "data/"  # root folder for the captions and the download archive file
+opts = {
+    # Options for youtube_dl.YoutubeDL; it looks keys up by exact name, so spelling matters.
+    # Don't download video; remember handled videos in an archive file
+    'skip_download': True,
+    'download_archive': os.path.join(DATA_DIR, "archive"),
+
+    # Subtitle Options ('subtitleslangs' takes a list of language codes, not a string)
+    'writesubtitles': True,
+    'subtitleslangs': ['en'],
+    'subtitlesformat': 'vtt',
+
+    # File Options
+    'restrictfilenames': True,
+    'nooverwrites': True,
+    'outtmpl': os.path.join(DATA_DIR, "captions", "%(playlist_index)s.%(ext)s"),
+
+    # Misc. Options
+    'playlistrandom': True,
+    'ignoreerrors': True,
+    'quiet': True,
+    'forcefilename': True,
+}
+
+
+def main():  # Run youtube-dl over PLAYLIST_URL using the module-level opts.
+    with youtube_dl.YoutubeDL(opts) as downloader:
+        downloader.download([PLAYLIST_URL])
+
+
+if __name__ == '__main__':  # start the download only when run as a script, not on import
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+Unidecode>=1.1.1
+youtube-dl>=2019.11.5