Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
*.vtt

# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
[Ee]tc
[Ss]hare
pyvenv.cfg
.venv
pip-selfcheck.json

# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
18 changes: 18 additions & 0 deletions .idea/Colbert-AI.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2018 Saurabh Ghanekar
Copyright (c) 2019 Shubham Rao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
58 changes: 32 additions & 26 deletions chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
from unidecode import unidecode
from splitters import split_into_sentences

try: # pragma: no cover
basestring
except NameError: # pragma: no cover
basestring = str
basestring = str

BEGIN = "__BEGIN__"
END = "__END__"
Expand All @@ -34,7 +31,8 @@ class MarkovChain(object):

def __init__(self, corpus, state_size):
"""
corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
corpus: A list of lists, where each inner list represents one sentence and contains the
words that make up that sentence.

state_size: items used to represent the state of the model.
"""
Expand All @@ -44,15 +42,17 @@ def __init__(self, corpus, state_size):

def build(self, corpus, state_size):
"""
Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears.
Returns a dict of dicts where the keys of the outer dict represent all possible states, and
point to the inner dicts. The inner dicts represent all possibilities for the "next" item in
the chain, along with the count of times it appears.
"""
model = {}

for run in corpus:
items = ([BEGIN] * state_size) + run + [END]
for i in range(len(run)+1):
state = tuple(items[i:i+state_size])
follow = items[i+state_size]
for i in range(len(run) + 1):
state = tuple(items[i:i + state_size])
follow = items[i + state_size]
if state not in model:
model[state] = {}

Expand All @@ -75,7 +75,8 @@ def move(self, state):

def gen(self, init_state=None):
"""
Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state.
Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items
until the chain reaches the "END" state.
"""
state = init_state or (BEGIN,) * self.state_size
while True:
Expand All @@ -98,19 +99,19 @@ def to_json(self):
return json.dumps(list(self.model.items()))


"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""

DEFAULT_MAX_OVERLAP_RATIO = 0.7
DEFAULT_MAX_OVERLAP_TOTAL = 20
DEFAULT_TRIES = 8


class Text(object):
def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None,
retain_original=True):
"""
input_text: A string.
state_size: An integer, indicating the number of words in the model's state.
parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence.
parsed_sentences: A list of lists, where each inner list represents one sentence and
contains the words that make up that sentence.
"""

can_make_sentences = parsed_sentences is not None or input_text is not None
Expand Down Expand Up @@ -211,7 +212,10 @@ def generate_corpus(self, text):

def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
"""
Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15).
Given a generated list of words, accept or reject it. This one rejects sentences that too
closely match the original text, namely those that contain any identical sequence of words
of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the
total number of words, and (b) `max_overlap_total` (default: 15).
"""
# Rejects chunk that is similar

Expand All @@ -220,7 +224,7 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
overlap_over = overlap_max + 1

gram_count = max((len(words) - overlap_max), 1)
grams = [words[i:i+overlap_over] for i in range(gram_count)]
grams = [words[i:i + overlap_over] for i in range(gram_count)]

for gm in grams:
gram_joined = self.word_join(gm)
Expand All @@ -231,15 +235,19 @@ def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):

def make_sentences(self, init_state=None, **kwargs):
"""
Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output".
Attempts "tries" (default: 8) times to generate a valid sentence, based on the model and
"text_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to
"text_sentences_output".

If successful, returns the sentence as a string. If not, returns None.

If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model.
If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method
chooses a sentence-start at random, in accordance with the model.

If "test_output" is set as False then the "text_sentences_output" check will be skipped.

If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit.
If "max_words" is specified, the word count for the sentence will be evaluated against the
provided limit.
"""

tries = kwargs.get("tries", DEFAULT_TRIES)
Expand All @@ -248,7 +256,7 @@ def make_sentences(self, init_state=None, **kwargs):
test_output = kwargs.get("test_output", True)
max_words = kwargs.get("max_words", None)

if init_state != None:
if init_state is not None:
prefix = list(init_state)
for word in prefix:
if word == BEGIN:
Expand All @@ -261,7 +269,7 @@ def make_sentences(self, init_state=None, **kwargs):

for _ in range(tries):
words = prefix + self.chain.walk(init_state)
if max_words != None and len(words) > max_words:
if max_words is not None and len(words) > max_words:
continue
if test_output and hasattr(self, "rejoined_text"):
if self.text_sentences_output(words, mor, mot):
Expand All @@ -274,14 +282,12 @@ def make_sentences(self, init_state=None, **kwargs):

def make_short_sentence(self, max_chars, min_chars=0, **kwargs):
"""
Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence".
Tries making a sentence of no more than "max_chars" characters and optionally no less than
"min_chars" characters, passing **kwargs to "self.make_sentences".
"""
tries = kwargs.get("tries", DEFAULT_TRIES)

for _ in range(tries):
sentence = self.make_sentences(**kwargs)
if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars:
if sentence and max_chars >= len(sentence) >= min_chars:
return sentence


"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
1 change: 1 addition & 0 deletions data/readme
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Folder for storing downloaded captions
59 changes: 59 additions & 0 deletions download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# MIT License
#
# Copyright (c) 2019 Shubham Rao
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import youtube_dl
import os.path

# Playlist whose captions this script fetches.
PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLiZxWe0ejyv8CSMylrxb6Nx4Ii2RHbu_j'

# Root folder for everything this script writes (archive file and captions).
DATA_DIR = "data/"

# Options handed to youtube_dl.YoutubeDL in main().
# NOTE(review): the key names below follow the option names documented on the
# YoutubeDL class; a misspelt key is presumably ignored without any error, so
# double-check spelling against that documentation when adding options.
opts = {

    # Don't download video
    'skip_download': True,
    # File used to remember which videos were already handled
    # (documented key is 'download_archive', with an underscore).
    'download_archive': os.path.join(DATA_DIR, "archive"),

    # Subtitle Options
    'writesubtitles': True,
    # Documented key is 'subtitleslangs' and its value is a list of languages.
    'subtitleslangs': ['en'],
    # Documented key is 'subtitlesformat'.
    'subtitlesformat': 'vtt',

    # File Options
    'restrictfilenames': True,
    'nooverwrites': True,
    'outtmpl': os.path.join(DATA_DIR, "captions", "%(playlist_index)s.%(ext)s"),

    # Misc. Options
    'playlistrandom': True,
    'ignoreerrors': True,
    'quiet': True,
    'forcefilename': True,
}


def main():
    """Download the playlist at PLAYLIST_URL using the module-level ``opts``."""
    downloader = youtube_dl.YoutubeDL(opts)
    # Use the downloader as a context manager for the duration of the download.
    with downloader:
        downloader.download([PLAYLIST_URL])


# Only start downloading when this file is run as a script.
if __name__ == '__main__':
    main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Unidecode>=1.1.1
youtube-dl>=2019.11.5
Loading