Skip to content
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ python:
- '3.6'
install:
- pip install ".[testing]"
- pip install ".[builtins]"
- python -m nltk.downloader punkt stopwords wordnet
script: pytest
deploy:
provider: pypi
Expand Down
128 changes: 128 additions & 0 deletions quantgov/corpora/builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,19 @@
"""
import re
import collections
import math

import quantgov

import nltk.corpus
import textblob

# Lazily fetch the WordNet corpus at import time: ensure_loaded() raises
# LookupError when the NLTK data is not installed, in which case we download
# it once and retry. Subsequent imports find the data and skip the download.
try:
    nltk.corpus.wordnet.ensure_loaded()
except LookupError:
    nltk.download('wordnet')
    nltk.corpus.wordnet.ensure_loaded()

commands = {}


Expand Down Expand Up @@ -93,3 +103,121 @@ def process_document(doc, terms, pattern, total_label):


commands['count_occurrences'] = OccurrenceCounter


class ShannonEntropy():
    """Corpus command measuring the Shannon entropy (bits) of each document.

    Words are matched with a configurable regex, lemmatized (with a shared
    memoization cache), filtered against a stopword list, and the entropy of
    the resulting lemma frequency distribution is reported.
    """

    # word -> lemma cache shared across documents to avoid re-lemmatizing
    lemmas = {}
    cli = quantgov.utils.CLISpec(
        help='Shannon Entropy',
        arguments=[
            quantgov.utils.CLIArg(
                flags=('--word_pattern', '-wp'),
                kwargs={
                    'help': 'regular expression defining a "word"',
                    'type': re.compile,
                    'default': re.compile(r'\b\w+\b')
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--stopwords', '-sw'),
                kwargs={
                    'help': 'stopwords to ignore',
                    'default': nltk.corpus.stopwords.words('english')
                }
            ),
            quantgov.utils.CLIArg(
                # Fix: was ('--precision') — a bare string, not a tuple.
                # The one-element tuple matches the flags convention used
                # by every other CLIArg in this module.
                flags=('--precision',),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('shannon_entropy',)

    @staticmethod
    def process_document(doc, word_pattern, stopwords, precision):
        """Return doc.index extended with the document's Shannon entropy.

        Entropy is -sum(p * log2(p)) over lemma frequencies, rounded to
        *precision* decimal places. A document with no in-vocabulary
        lemmas yields 0 (the sum over an empty Counter is 0, so no
        division occurs).
        """
        words = word_pattern.findall(doc.text)
        lemmas = [
            lemma for lemma in (
                ShannonEntropy.lemmatize(word) for word in words
            )
            if lemma not in stopwords
        ]
        counts = collections.Counter(lemmas)
        return doc.index + (round(sum(
            -(count / len(lemmas) * math.log(count / len(lemmas), 2))
            for count in counts.values()
        ), int(precision)),)

    # Fix: decorated as @staticmethod — it is invoked through the class as
    # ShannonEntropy.lemmatize(word) and takes no self/cls parameter.
    @staticmethod
    def lemmatize(word):
        """Lemmatize *word*, memoizing results in ShannonEntropy.lemmas."""
        if word in ShannonEntropy.lemmas:
            lemma = ShannonEntropy.lemmas[word]
        else:
            lemma = textblob.Word(word).lemmatize()
            ShannonEntropy.lemmas[word] = lemma
        return lemma


commands['shannon_entropy'] = ShannonEntropy


class ConditionalCounter():
    """Corpus command counting conditional words and phrases per document."""

    cli = quantgov.utils.CLISpec(
        help=('Count conditional words and phrases. Included terms are: '
              ' "if", "but", "except", "provided", "when", "where", '
              '"whenever", "unless", "notwithstanding", "in the event", '
              'and "in no event"'),
        arguments=[]
    )
    pattern = re.compile(
        r'\b(if|but|except|provided|when|where'
        r'|whenever|unless|notwithstanding'
        r'|in\s+the\s+event|in\s+no\s+event)\b'
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('conditionals',)

    @staticmethod
    def process_document(doc):
        """Return doc.index extended with the count of conditional terms."""
        # Collapse line breaks into single spaces so that multi-word
        # phrases split across lines are still matched by the pattern.
        flattened = ' '.join(doc.text.splitlines())
        matches = ConditionalCounter.pattern.findall(flattened)
        return doc.index + (len(matches),)


commands['count_conditionals'] = ConditionalCounter


class SentenceLength():
    """Corpus command reporting mean sentence length (in words) per document."""

    cli = quantgov.utils.CLISpec(
        help='Sentence Length',
        arguments=[
            quantgov.utils.CLIArg(
                # Fix: was ('--precision') — a bare string, not a tuple.
                # The one-element tuple matches the flags convention used
                # by every other CLIArg in this module.
                flags=('--precision',),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('sentence_length',)

    @staticmethod
    def process_document(doc, precision):
        """Return doc.index extended with the mean words per sentence.

        Sentences are segmented with TextBlob and the mean word count is
        rounded to *precision* decimal places. A document in which no
        sentences are detected yields 0 instead of raising
        ZeroDivisionError.
        """
        sentences = textblob.TextBlob(doc.text).sentences
        if not sentences:  # guard: empty / unsegmentable document
            return doc.index + (0,)
        total_words = sum(len(sentence.words) for sentence in sentences)
        return doc.index + (round(total_words / len(sentences),
                                  int(precision)),)


commands['sentence_length'] = SentenceLength
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def find_version(*file_paths):
'snakemake',
],
extras_require={
'testing': ['pytest-flake8']
'testing': ['pytest-flake8'],
'builtins': [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either we should move these requirements to install_requires or we should make it so that qg can still run without them installed (and we throw an error if they aren't installed and someone tries to use that builtin)

'textblob',
'nltk'
]
},
entry_points={
'console_scripts': [
Expand Down
45 changes: 45 additions & 0 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,48 @@ def test_termcount_multiple_with_label():
'lorem', 'dolor sit', '--total_label', 'bothofem'],
)
assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n'


def test_shannon_entropy():
    """Default invocation rounds entropy to two decimal places."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,shannon_entropy\n1,7.14\n2,8.13\n'
    assert check_output(cmd) == expected

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get tests for the options as well?


def test_shannon_entropy_no_stopwords():
    """Entropy changes when the stopword filter is overridden."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--stopwords', 'None']
    expected = 'file,shannon_entropy\n1,7.18\n2,8.09\n'
    assert check_output(cmd) == expected


def test_shannon_entropy_4decimals():
    """--precision controls the number of decimal places in the output."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    expected = 'file,shannon_entropy\n1,7.1413\n2,8.1252\n'
    assert check_output(cmd) == expected


def test_conditionalcount():
    """The pseudo corpus contains no conditional terms."""
    cmd = ['quantgov', 'corpus', 'count_conditionals', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,conditionals\n1,0\n2,0\n'
    assert check_output(cmd) == expected


def test_sentencelength():
    """Default invocation rounds sentence length to two decimal places."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,sentence_length\n1,9.54\n2,8.16\n'
    assert check_output(cmd) == expected


def test_sentencelength_4decimals():
    """--precision controls the number of decimal places in the output."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    expected = 'file,sentence_length\n1,9.5385\n2,8.1633\n'
    assert check_output(cmd) == expected