From 0a3d848a41a67487b8f36c3465b2e9fb847e7760 Mon Sep 17 00:00:00 2001 From: Saurabh Ghanekar Date: Wed, 19 Dec 2018 16:15:46 +0530 Subject: [PATCH 01/15] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3a102b1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Saurabh Ghanekar + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 8c05a1e554348ecb18454de0d3afab0b69c0b40c Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 19:27:41 -0800 Subject: [PATCH 02/15] Create .gitignore and a PyCharm project Hey there, @cshubhamrao here. --- .gitignore | 80 ++++++++++++++++++++++++++++++++++++++++++++ .idea/Colbert-AI.iml | 11 ++++++ .idea/encodings.xml | 4 +++ .idea/misc.xml | 7 ++++ .idea/modules.xml | 8 +++++ .idea/vcs.xml | 6 ++++ 6 files changed, 116 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/Colbert-AI.iml create mode 100644 .idea/encodings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c44232f --- /dev/null +++ b/.gitignore @@ -0,0 +1,80 @@ + +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json + +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml new file mode 100644 index 0000000..6711606 --- /dev/null +++ b/.idea/Colbert-AI.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..15a15b2 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..14980cb --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5f1f9ec --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 4e86e3a88246aee4b825a2551d3d0ff6f1e499e0 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 19:34:22 -0800 Subject: [PATCH 03/15] Add requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e0d091f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +Unidecode==1.0.23 +youtube-dl==2019.2.8 From 873578aabfbf2610494cf6e628e2621e17d3c3ca Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 19:36:28 -0800 Subject: [PATCH 04/15] Update .gitignore To ignore youtube-dl created files and folders --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index c44232f..b0f3a96 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ [Ll]ib64 [Ll]ocal [Ss]cripts +[Ee]tc +[Ss]hare pyvenv.cfg .venv pip-selfcheck.json From 3d740a92bd7928f8876c153418925dfbd725328f Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 19:39:07 -0800 Subject: [PATCH 05/15] Modify chain.py Let's see if it works --- chain.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/chain.py b/chain.py index 521c63f..4af0e0c 100644 --- a/chain.py +++ b/chain.py @@ -6,10 +6,7 @@ from unidecode import unidecode from splitters import split_into_sentences -try: # pragma: no cover - basestring -except NameError: # pragma: no cover - basestring = str +basestring = str BEGIN = "__BEGIN__" END = "__END__" From fcd1eedda4be1e00096842f2bf72992cea34d308 Mon Sep 17 00:00:00 2001 From: Saurabh Ghanekar Date: Sat, 9 Feb 2019 20:46:15 -0800 Subject: [PATCH 06/15] Modify chain.py 'basestring' was used to check if a variable is a string or not. --- chain.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/chain.py b/chain.py index 4af0e0c..8bbb32c 100644 --- a/chain.py +++ b/chain.py @@ -6,8 +6,6 @@ from unidecode import unidecode from splitters import split_into_sentences -basestring = str - BEGIN = "__BEGIN__" END = "__END__" From e863a2f2727a0bcb057026cec5f27b06260c5295 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 21:27:06 -0800 Subject: [PATCH 07/15] Create download.py file Will scrape and download subtitles from the playlist Signed-off-by: Shubham Rao --- download.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 download.py diff --git a/download.py b/download.py new file mode 100644 index 0000000..c0c31a9 --- /dev/null +++ b/download.py @@ -0,0 +1,22 @@ +# MIT License +# +# Copyright (c) 2019 Shubham Rao +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + From 686c241bce5ceb7a831d34fb5058a6310071d70a Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 22:27:32 -0800 Subject: [PATCH 08/15] Formatting fixes * Make PEP8 conformant * remove unnecessary lines * break long lines Signed-off-by: Shubham Rao --- chain.py | 53 ++++++++++++++++++++++++++++++---------------------- splitters.py | 52 ++++++++++++++++++++++++++++++--------------------- test.py | 8 ++++---- 3 files changed, 66 insertions(+), 47 deletions(-) diff --git a/chain.py b/chain.py index 4af0e0c..9cc5605 100644 --- a/chain.py +++ b/chain.py @@ -31,7 +31,8 @@ class MarkovChain(object): def __init__(self, corpus, state_size): """ - corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. + corpus: It is a list of lists where the outer list like a sentence and the inner list is + contains the words that make the sentence. state_size: items used to represent the state of the model. """ @@ -41,15 +42,17 @@ def __init__(self, corpus, state_size): def build(self, corpus, state_size): """ - Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears. + Returns a dict of dicts where the keys of the outer dict represent all possible states, and + point to the inner dicts. The inner dicts represent all possibilities for the "next" item in + the chain, along with the count of times it appears. """ model = {} for run in corpus: items = ([BEGIN] * state_size) + run + [END] - for i in range(len(run)+1): - state = tuple(items[i:i+state_size]) - follow = items[i+state_size] + for i in range(len(run) + 1): + state = tuple(items[i:i + state_size]) + follow = items[i + state_size] if state not in model: model[state] = {} @@ -72,7 +75,8 @@ def move(self, state): def gen(self, init_state=None): """ - Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state. + Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items + until the chain reaches the "END" state. """ state = init_state or (BEGIN,) * self.state_size while True: @@ -95,19 +99,19 @@ def to_json(self): return json.dumps(list(self.model.items())) -"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" - DEFAULT_MAX_OVERLAP_RATIO = 0.7 DEFAULT_MAX_OVERLAP_TOTAL = 20 DEFAULT_TRIES = 8 class Text(object): - def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True): + def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, + retain_original=True): """ input_text: A string. state_size: An integer, indicating the number of words in the model's state. - parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. + parsed_sentences: It is a list of lists where the outer list like a sentence and the inner + list is contains the words that make the sentence. """ can_make_sentences = parsed_sentences is not None or input_text is not None @@ -208,7 +212,10 @@ def generate_corpus(self, text): def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): """ - Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15). + Given a generated list of words, accept or reject it. This one rejects sentences that too + closely match the original text, namely those that contain any identical sequence of words + of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the + total number of words, and (b) `max_overlap_total` (default: 15). """ # Rejects chunk that is similar @@ -217,7 +224,7 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): overlap_over = overlap_max + 1 gram_count = max((len(words) - overlap_max), 1) - grams = [words[i:i+overlap_over] for i in range(gram_count)] + grams = [words[i:i + overlap_over] for i in range(gram_count)] for gm in grams: gram_joined = self.word_join(gm) @@ -228,15 +235,19 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): def make_sentences(self, init_state=None, **kwargs): """ - Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output". + Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and + "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to + "test_sentences_output". If successful, returns the sentence as a string. If not, returns None. - If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model. + If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method + chooses a sentence-start at random, in accordance with the model. If "test_output" is set as False then the "text_sentences_output" check will be skipped. - If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit. + If "max_words" is specified, the word count for the sentence will be evaluated against the + provided limit. """ tries = kwargs.get("tries", DEFAULT_TRIES) @@ -245,7 +256,7 @@ def make_sentences(self, init_state=None, **kwargs): test_output = kwargs.get("test_output", True) max_words = kwargs.get("max_words", None) - if init_state != None: + if init_state is not None: prefix = list(init_state) for word in prefix: if word == BEGIN: @@ -258,7 +269,7 @@ def make_sentences(self, init_state=None, **kwargs): for _ in range(tries): words = prefix + self.chain.walk(init_state) - if max_words != None and len(words) > max_words: + if max_words is not None and len(words) > max_words: continue if test_output and hasattr(self, "rejoined_text"): if self.text_sentences_output(words, mor, mot): @@ -271,14 +282,12 @@ def make_sentences(self, init_state=None, **kwargs): def make_short_sentence(self, max_chars, min_chars=0, **kwargs): """ - Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence". + Tries making a sentence of no more than "max_chars" characters and optionally no less than + "min_chars" charcaters, passing **kwargs to "self.make_sentence". """ tries = kwargs.get("tries", DEFAULT_TRIES) for _ in range(tries): sentence = self.make_sentences(**kwargs) - if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars: + if sentence and max_chars >= len(sentence) >= min_chars: return sentence - - -"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" diff --git a/splitters.py b/splitters.py index e9804eb..62d2b66 100644 --- a/splitters.py +++ b/splitters.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import re ascii_lowercase = "abcdefghijklmnopqrstuvwxyz" @@ -7,30 +7,39 @@ # States w/ with thanks to https://github.com/unitedstates/python-us # Titles w/ thanks to https://github.com/nytimes/emphasis and @donohoe abbr_capped = "|".join([ - "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States + "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|" + "neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States "u.s", - "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|snr|atty|supt", # Titles - "ave|blvd|st|rd|hwy", # Streets - "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months - "|".join(ascii_lowercase) # Initials + "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|" + "snr|atty|supt", # Titles + "ave|blvd|st|rd|hwy", # Streets + "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months + "|".join(ascii_lowercase) # Initials ]).split("|") abbr_lowercase = "etc|v|vs|viz|al|pct" exceptions = "U.S.|U.N.|E.U.|F.B.I.|C.I.A.".split("|") + def is_abbreviation(dotted_word): clipped = dotted_word[:-1] if clipped[0] in ascii_uppercase: - if clipped.lower() in abbr_capped: return True - else: return False + if clipped.lower() in abbr_capped: + return True + else: + return False else: - if clipped in abbr_lowercase: return True - else: return False + if clipped in abbr_lowercase: + return True + else: + return False + def is_sentence_ender(word): - if word in exceptions: return False - if word[-1] in [ "?", "!" ]: + if word in exceptions: + return False + if word[-1] in ["?", "!"]: return True if len(re.sub(r"[^A-Z]", "", word)) > 1: return True @@ -38,16 +47,17 @@ def is_sentence_ender(word): return True return False + def split_into_sentences(text): potential_end_pat = re.compile(r"".join([ - r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation - r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc - r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash) - ]), re.U) + r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation + r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc + r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash) + ]), re.U) dot_iter = re.finditer(potential_end_pat, text) - end_indices = [ (x.start() + len(x.group(1)) + len(x.group(2))) - for x in dot_iter - if is_sentence_ender(x.group(1)) ] + end_indices = [(x.start() + len(x.group(1)) + len(x.group(2))) + for x in dot_iter + if is_sentence_ender(x.group(1))] spans = zip([None] + end_indices, end_indices + [None]) - sentences = [ text[start:end].strip() for start, end in spans ] - return sentences \ No newline at end of file + sentences = [text[start:end].strip() for start, end in spans] + return sentences diff --git a/test.py b/test.py index 2f1e251..b3d1514 100644 --- a/test.py +++ b/test.py @@ -5,14 +5,14 @@ text_model = chain.Text(text, state_size=3) -file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt","w") +file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt", "w") for j in range(200): print(" ") for i in range(1): - output=text_model.make_short_sentence(max_chars=230, min_chars=70) + output = text_model.make_short_sentence(max_chars=230, min_chars=70) txt = output + "\n" print(txt) - + file.write(txt) -file.close() \ No newline at end of file +file.close() From 8c4b33e7532cb5274f98c97bb10f5260b1271c31 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 22:38:38 -0800 Subject: [PATCH 09/15] Set PyCharm folders * Set Excluded and Source folders Signed-off-by: Shubham Rao --- .idea/Colbert-AI.iml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml index 6711606..faa0db1 100644 --- a/.idea/Colbert-AI.iml +++ b/.idea/Colbert-AI.iml @@ -1,7 +1,14 @@ - + + + + + + + + From a7e051afb553abab39b0d0908e214caab5d29d0e Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 22:40:19 -0800 Subject: [PATCH 10/15] download: Initial commit NOT RECOMMENDED FOR USE (yet) Downloads all subtitles (~1052) into the current directory Signed-off-by: Shubham Rao --- download.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/download.py b/download.py index c0c31a9..fa62b6d 100644 --- a/download.py +++ b/download.py @@ -20,3 +20,34 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import youtube_dl + +PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j' + +DATA_DIR = "data/" +opts = { + + # Don't download video + 'skip_download': True, + + # Subtitle Options + 'writesubtitles': True, + 'subtitlelangs': 'en', + 'subtitleformat': 'vtt', + + # File Options + 'restrictfilenames': True, + + # Misc. Options + 'playlistrandom': True, + 'ignoreerrors': True, +} + + +def main(): + with youtube_dl.YoutubeDL(opts) as ydl: + ydl.download([PLAYLIST_URL]) + + +if __name__ == '__main__': + main() From fadd90f79cbaa78f9e14051f9cd1bd7cd25ae4fa Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 23:31:34 -0800 Subject: [PATCH 11/15] download.py: Download to data/ folder * simple file names (number.en.vtt) Signed-off-by: Shubham Rao --- download.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/download.py b/download.py index fa62b6d..45999e3 100644 --- a/download.py +++ b/download.py @@ -21,6 +21,7 @@ # SOFTWARE. import youtube_dl +import os.path PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j' @@ -29,6 +30,7 @@ # Don't download video 'skip_download': True, + 'downloadarchive': os.path.join(DATA_DIR, "archive"), # Subtitle Options 'writesubtitles': True, @@ -37,10 +39,14 @@ # File Options 'restrictfilenames': True, + 'nooverwrites': True, + 'outtmpl': os.path.join(DATA_DIR, "%(playlist_index)s.%(ext)s"), # Misc. Options 'playlistrandom': True, 'ignoreerrors': True, + 'quiet': True, + 'forcefilename': True, } From 25ae7d17f665c5a8d48c4a6db21d403e14e4e3e3 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Sat, 9 Feb 2019 23:40:32 -0800 Subject: [PATCH 12/15] reformat: move .vtt to data/captions folder --- .gitignore | 1 + data/readme | 1 + download.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 data/readme diff --git a/.gitignore b/.gitignore index b0f3a96..e240b09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.vtt # Virtualenv # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ diff --git a/data/readme b/data/readme new file mode 100644 index 0000000..95f2b16 --- /dev/null +++ b/data/readme @@ -0,0 +1 @@ +Folder for storing downloaded captions diff --git a/download.py b/download.py index 45999e3..9e218f7 100644 --- a/download.py +++ b/download.py @@ -40,7 +40,7 @@ # File Options 'restrictfilenames': True, 'nooverwrites': True, - 'outtmpl': os.path.join(DATA_DIR, "%(playlist_index)s.%(ext)s"), + 'outtmpl': os.path.join(DATA_DIR, "captions", "%(playlist_index)s.%(ext)s"), # Misc. Options 'playlistrandom': True, From 4a896dee66339c4f90e450a2fe32286f4ac79984 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Thu, 21 Nov 2019 13:36:03 +0530 Subject: [PATCH 13/15] Update Copyright text in LICENSE Signed-off-by: Shubham Rao --- LICENSE | 1 + 1 file changed, 1 insertion(+) diff --git a/LICENSE b/LICENSE index 3a102b1..cec0a50 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2018 Saurabh Ghanekar +Copyright (c) 2019 Shubham Rao Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From bb86c14c8dc3a55b1580ef15f7f53432e727380d Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Thu, 21 Nov 2019 13:36:34 +0530 Subject: [PATCH 14/15] PyCharm: Use included virtualenv Signed-off-by: Shubham Rao --- .idea/Colbert-AI.iml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml index faa0db1..5c14f74 100644 --- a/.idea/Colbert-AI.iml +++ b/.idea/Colbert-AI.iml @@ -9,7 +9,7 @@ - + From a13607bf27b306053cf0bfe253beab75573da605 Mon Sep 17 00:00:00 2001 From: Shubham Rao Date: Thu, 21 Nov 2019 13:36:58 +0530 Subject: [PATCH 15/15] youtube-dl: use latest package versions Signed-off-by: Shubham Rao --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e0d091f..afa9a77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -Unidecode==1.0.23 -youtube-dl==2019.2.8 +Unidecode>=1.1.1 +youtube-dl>=2019.11.5