diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e240b09 --- /dev/null +++ b/.gitignore @@ -0,0 +1,83 @@ +*.vtt + +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +[Ee]tc +[Ss]hare +pyvenv.cfg +.venv +pip-selfcheck.json + +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser diff --git a/.idea/Colbert-AI.iml b/.idea/Colbert-AI.iml new file mode 100644 index 0000000..5c14f74 --- /dev/null +++ b/.idea/Colbert-AI.iml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..15a15b2 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..14980cb --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5f1f9ec --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cec0a50 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2018 Saurabh Ghanekar +Copyright (c) 2019 Shubham Rao + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, 
including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/chain.py b/chain.py index 521c63f..9cc5605 100644 --- a/chain.py +++ b/chain.py @@ -6,10 +6,7 @@ from unidecode import unidecode from splitters import split_into_sentences -try: # pragma: no cover - basestring -except NameError: # pragma: no cover - basestring = str +basestring = str BEGIN = "__BEGIN__" END = "__END__" @@ -34,7 +31,8 @@ class MarkovChain(object): def __init__(self, corpus, state_size): """ - corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. + corpus: It is a list of lists where the outer list like a sentence and the inner list is + contains the words that make the sentence. state_size: items used to represent the state of the model. """ @@ -44,15 +42,17 @@ def __init__(self, corpus, state_size): def build(self, corpus, state_size): """ - Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears. 
+ Returns a dict of dicts where the keys of the outer dict represent all possible states, and + point to the inner dicts. The inner dicts represent all possibilities for the "next" item in + the chain, along with the count of times it appears. """ model = {} for run in corpus: items = ([BEGIN] * state_size) + run + [END] - for i in range(len(run)+1): - state = tuple(items[i:i+state_size]) - follow = items[i+state_size] + for i in range(len(run) + 1): + state = tuple(items[i:i + state_size]) + follow = items[i + state_size] if state not in model: model[state] = {} @@ -75,7 +75,8 @@ def move(self, state): def gen(self, init_state=None): """ - Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state. + Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items + until the chain reaches the "END" state. """ state = init_state or (BEGIN,) * self.state_size while True: @@ -98,19 +99,19 @@ def to_json(self): return json.dumps(list(self.model.items())) -"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" - DEFAULT_MAX_OVERLAP_RATIO = 0.7 DEFAULT_MAX_OVERLAP_TOTAL = 20 DEFAULT_TRIES = 8 class Text(object): - def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True): + def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, + retain_original=True): """ input_text: A string. state_size: An integer, indicating the number of words in the model's state. - parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. + parsed_sentences: It is a list of lists where the outer list like a sentence and the inner + list is contains the words that make the sentence. 
""" can_make_sentences = parsed_sentences is not None or input_text is not None @@ -211,7 +212,10 @@ def generate_corpus(self, text): def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): """ - Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15). + Given a generated list of words, accept or reject it. This one rejects sentences that too + closely match the original text, namely those that contain any identical sequence of words + of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the + total number of words, and (b) `max_overlap_total` (default: 20). """ # Rejects chunk that is similar @@ -220,7 +224,7 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): overlap_over = overlap_max + 1 gram_count = max((len(words) - overlap_max), 1) - grams = [words[i:i+overlap_over] for i in range(gram_count)] + grams = [words[i:i + overlap_over] for i in range(gram_count)] for gm in grams: gram_joined = self.word_join(gm) @@ -231,15 +235,19 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): def make_sentences(self, init_state=None, **kwargs): """ - Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output". + Attempts "tries" (default: 8) times to generate a valid sentence, based on the model and + "text_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to + "text_sentences_output". If successful, returns the sentence as a string. If not, returns None. 
- If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model. + If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method + chooses a sentence-start at random, in accordance with the model. If "test_output" is set as False then the "text_sentences_output" check will be skipped. - If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit. + If "max_words" is specified, the word count for the sentence will be evaluated against the + provided limit. """ tries = kwargs.get("tries", DEFAULT_TRIES) @@ -248,7 +256,7 @@ def make_sentences(self, init_state=None, **kwargs): test_output = kwargs.get("test_output", True) max_words = kwargs.get("max_words", None) - if init_state != None: + if init_state is not None: prefix = list(init_state) for word in prefix: if word == BEGIN: @@ -261,7 +269,7 @@ def make_sentences(self, init_state=None, **kwargs): for _ in range(tries): words = prefix + self.chain.walk(init_state) - if max_words != None and len(words) > max_words: + if max_words is not None and len(words) > max_words: continue if test_output and hasattr(self, "rejoined_text"): if self.text_sentences_output(words, mor, mot): @@ -274,14 +282,12 @@ def make_sentences(self, init_state=None, **kwargs): def make_short_sentence(self, max_chars, min_chars=0, **kwargs): """ - Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence". + Tries making a sentence of no more than "max_chars" characters and optionally no less than + "min_chars" characters, passing **kwargs to "self.make_sentences". 
""" tries = kwargs.get("tries", DEFAULT_TRIES) for _ in range(tries): sentence = self.make_sentences(**kwargs) - if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars: + if sentence and max_chars >= len(sentence) >= min_chars: return sentence - - -"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" diff --git a/data/readme b/data/readme new file mode 100644 index 0000000..95f2b16 --- /dev/null +++ b/data/readme @@ -0,0 +1 @@ +Folder for storing downloaded captions diff --git a/download.py b/download.py new file mode 100644 index 0000000..9e218f7 --- /dev/null +++ b/download.py @@ -0,0 +1,59 @@ +# MIT License +# +# Copyright (c) 2019 Shubham Rao +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import youtube_dl +import os.path + +PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j' + +DATA_DIR = "data/" +opts = { + + # Don't download video + 'skip_download': True, + 'downloadarchive': os.path.join(DATA_DIR, "archive"), + + # Subtitle Options + 'writesubtitles': True, + 'subtitlelangs': 'en', + 'subtitleformat': 'vtt', + + # File Options + 'restrictfilenames': True, + 'nooverwrites': True, + 'outtmpl': os.path.join(DATA_DIR, "captions", "%(playlist_index)s.%(ext)s"), + + # Misc. Options + 'playlistrandom': True, + 'ignoreerrors': True, + 'quiet': True, + 'forcefilename': True, +} + + +def main(): + with youtube_dl.YoutubeDL(opts) as ydl: + ydl.download([PLAYLIST_URL]) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..afa9a77 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +Unidecode>=1.1.1 +youtube-dl>=2019.11.5 diff --git a/splitters.py b/splitters.py index e9804eb..62d2b66 100644 --- a/splitters.py +++ b/splitters.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import re ascii_lowercase = "abcdefghijklmnopqrstuvwxyz" @@ -7,30 +7,39 @@ # States w/ with thanks to https://github.com/unitedstates/python-us # Titles w/ thanks to https://github.com/nytimes/emphasis and @donohoe abbr_capped = "|".join([ - "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States + "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|" + "neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States "u.s", - "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|snr|atty|supt", # Titles - "ave|blvd|st|rd|hwy", # Streets - "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months - "|".join(ascii_lowercase) # Initials + 
"mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|" + "snr|atty|supt", # Titles + "ave|blvd|st|rd|hwy", # Streets + "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months + "|".join(ascii_lowercase) # Initials ]).split("|") abbr_lowercase = "etc|v|vs|viz|al|pct" exceptions = "U.S.|U.N.|E.U.|F.B.I.|C.I.A.".split("|") + def is_abbreviation(dotted_word): clipped = dotted_word[:-1] if clipped[0] in ascii_uppercase: - if clipped.lower() in abbr_capped: return True - else: return False + if clipped.lower() in abbr_capped: + return True + else: + return False else: - if clipped in abbr_lowercase: return True - else: return False + if clipped in abbr_lowercase: + return True + else: + return False + def is_sentence_ender(word): - if word in exceptions: return False - if word[-1] in [ "?", "!" ]: + if word in exceptions: + return False + if word[-1] in ["?", "!"]: return True if len(re.sub(r"[^A-Z]", "", word)) > 1: return True @@ -38,16 +47,17 @@ def is_sentence_ender(word): return True return False + def split_into_sentences(text): potential_end_pat = re.compile(r"".join([ - r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation - r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc - r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash) - ]), re.U) + r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation + r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc + r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash) + ]), re.U) dot_iter = re.finditer(potential_end_pat, text) - end_indices = [ (x.start() + len(x.group(1)) + len(x.group(2))) - for x in dot_iter - if is_sentence_ender(x.group(1)) ] + end_indices = [(x.start() + len(x.group(1)) + len(x.group(2))) + for x in dot_iter + if is_sentence_ender(x.group(1))] spans = zip([None] + end_indices, end_indices + [None]) - sentences = [ text[start:end].strip() for start, end in spans ] - return sentences \ No 
newline at end of file + sentences = [text[start:end].strip() for start, end in spans] + return sentences diff --git a/test.py b/test.py index 2f1e251..b3d1514 100644 --- a/test.py +++ b/test.py @@ -5,14 +5,14 @@ text_model = chain.Text(text, state_size=3) -file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt","w") +file = open("/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt", "w") for j in range(200): print(" ") for i in range(1): - output=text_model.make_short_sentence(max_chars=230, min_chars=70) + output = text_model.make_short_sentence(max_chars=230, min_chars=70) txt = output + "\n" print(txt) - + file.write(txt) -file.close() \ No newline at end of file +file.close()