Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ python:
- '3.6'
install:
- pip install ".[testing]"
- pip install ".[nlp]"
- python -m nltk.downloader punkt stopwords wordnet
script: pytest
deploy:
provider: pypi
Expand Down
212 changes: 212 additions & 0 deletions quantgov/corpora/builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,46 @@
"""
import re
import collections
import math

from decorator import decorator
import quantgov

try:
import nltk.corpus
NLTK = True
except ImportError:
NLTK = None

try:
import textblob
except ImportError:
textblob = None

if NLTK:
try:
nltk.corpus.wordnet.ensure_loaded()
except LookupError:
nltk.download('wordnet')
nltk.corpus.wordnet.ensure_loaded()

commands = {}


@decorator
def check_nltk(func, *args, **kwargs):
    """Decorator: raise a clear error if NLTK is not installed.

    Wraps commands that require the optional ``nltk`` dependency so the
    user gets an actionable message instead of an AttributeError.
    """
    if NLTK is None:
        # Fix: report the wrapped function's name rather than its repr
        # (the old message rendered as "<function ... at 0x...>").
        raise RuntimeError(
            'Must install NLTK to use {}'.format(func.__name__))
    return func(*args, **kwargs)


@decorator
def check_textblob(func, *args, **kwargs):
    """Decorator: raise a clear error if textblob is not installed.

    Mirrors ``check_nltk`` for commands that require the optional
    ``textblob`` dependency.
    """
    if textblob is None:
        # Fix: report the wrapped function's name rather than its repr.
        raise RuntimeError(
            'Must install textblob to use {}'.format(func.__name__))
    return func(*args, **kwargs)


class WordCounter():

cli = quantgov.utils.CLISpec(
Expand Down Expand Up @@ -93,3 +127,181 @@ def process_document(doc, terms, pattern, total_label):


commands['count_occurrences'] = OccurrenceCounter


class ShannonEntropy():
    """Corpus driver that emits the Shannon entropy of each document.

    Entropy is computed in bits over the frequency distribution of
    lemmatized words, optionally excluding stopwords, and rounded to
    the requested precision.
    """
    # word -> lemma memoization cache, shared across all documents
    lemmas = {}
    cli = quantgov.utils.CLISpec(
        help='Shannon Entropy',
        arguments=[
            quantgov.utils.CLIArg(
                flags=('--word_pattern', '-wp'),
                kwargs={
                    'help': 'regular expression defining a "word"',
                    'type': re.compile,
                    'default': re.compile(r'\b\w+\b')
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--stopwords', '-sw'),
                kwargs={
                    'help': 'stopwords to ignore',
                    # Falls back to None when NLTK is absent; the
                    # check_nltk guard on process_document fires first.
                    'default': (nltk.corpus.stopwords.words('english')
                                if NLTK else None)
                }
            ),
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('shannon_entropy',)

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, word_pattern, precision, stopwords,
                         textblob=textblob, nltk=NLTK):
        """Return ``doc.index`` extended with the document's entropy.

        Words are extracted with ``word_pattern``, lemmatized, filtered
        against ``stopwords``, and the entropy (base 2) of the lemma
        distribution is rounded to ``precision`` decimal places.
        """
        words = word_pattern.findall(doc.text)
        lemmas = [
            lemma for lemma in (
                ShannonEntropy.lemmatize(word) for word in words
            )
            if lemma not in stopwords
        ]
        counts = collections.Counter(lemmas)
        # sum over an empty counts is 0, so an empty document yields 0.0
        return doc.index + (round(sum(
            -(count / len(lemmas) * math.log(count / len(lemmas), 2))
            for count in counts.values()
        ), int(precision)),)

    @staticmethod
    def lemmatize(word):
        """Lemmatize ``word`` via textblob, memoizing in ``lemmas``.

        Fix: was a bare function in the class body; calling it through
        an instance would have passed the instance as ``word``. Marking
        it ``@staticmethod`` keeps class-level calls working and makes
        instance access safe.
        """
        if word in ShannonEntropy.lemmas:
            lemma = ShannonEntropy.lemmas[word]
        else:
            lemma = textblob.Word(word).lemmatize()
            ShannonEntropy.lemmas[word] = lemma
        return lemma


commands['shannon_entropy'] = ShannonEntropy


class ConditionalCounter():
    """Corpus driver counting conditional words and phrases."""

    cli = quantgov.utils.CLISpec(
        help=('Count conditional words and phrases. Included terms are: '
              ' "if", "but", "except", "provided", "when", "where", '
              '"whenever", "unless", "notwithstanding", "in the event", '
              'and "in no event"'),
        arguments=[]
    )
    pattern = re.compile(
        r'\b(if|but|except|provided|when|where'
        r'|whenever|unless|notwithstanding'
        r'|in\s+the\s+event|in\s+no\s+event)\b'
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('conditionals',)

    @staticmethod
    def process_document(doc):
        """Return ``doc.index`` extended with the conditional count."""
        # Collapse line breaks into spaces so multi-word phrases such
        # as "in the event" still match when split across lines.
        flattened = ' '.join(doc.text.splitlines())
        matches = ConditionalCounter.pattern.findall(flattened)
        return doc.index + (len(matches),)


commands['count_conditionals'] = ConditionalCounter


class SentenceLength():
    """Corpus driver that emits the mean sentence length in words."""

    cli = quantgov.utils.CLISpec(
        help='Sentence Length',
        arguments=[
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('sentence_length',)

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, precision):
        """Return ``doc.index`` extended with mean words per sentence.

        A falsy ``precision`` (e.g. 0) leaves the mean unrounded,
        matching the original behavior.
        """
        sentences = textblob.TextBlob(doc.text).sentences
        # Fix: guard against documents with no sentences, which
        # previously raised ZeroDivisionError; report 0 instead.
        if not sentences:
            return doc.index + (0,)
        mean = sum(len(sentence.words)
                   for sentence in sentences) / len(sentences)
        if precision:
            mean = round(mean, int(precision))
        return doc.index + (mean,)


commands['sentence_length'] = SentenceLength


class SentimentAnalysis():
    """Corpus driver that emits sentiment polarity and subjectivity."""

    cli = quantgov.utils.CLISpec(
        help='Performs sentiment analysis on the text',
        arguments=[
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--backend'),
                kwargs={
                    'help': 'which program to use for the analysis',
                    'default': 'textblob'
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for the chosen backend."""
        if args['backend'] == 'textblob':
            return ('sentiment_polarity', 'sentiment_subjectivity',)
        else:
            raise NotImplementedError

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, backend, precision):
        """Return ``doc.index`` extended with polarity/subjectivity."""
        if backend != 'textblob':
            # Fix: previously an unknown backend fell through and
            # silently returned None; raise to match get_columns.
            raise NotImplementedError
        sentiment = textblob.TextBlob(doc.text)
        # Allows for rounding to a specified number of decimals
        if precision:
            return (doc.index +
                    (round(sentiment.polarity, int(precision)),
                     round(sentiment.subjectivity, int(precision)),))
        return (doc.index +
                (sentiment.polarity, sentiment.subjectivity,))


commands['sentiment_analysis'] = SentimentAnalysis
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def find_version(*file_paths):
packages=find_packages(
exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=[
'decorator',
'joblib',
'pandas',
'requests',
Expand All @@ -60,7 +61,11 @@ def find_version(*file_paths):
'snakemake',
],
extras_require={
'testing': ['pytest-flake8']
'testing': ['pytest-flake8'],
'nlp': [
'textblob',
'nltk',
]
},
entry_points={
'console_scripts': [
Expand Down
62 changes: 62 additions & 0 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,65 @@ def test_termcount_multiple_with_label():
'lorem', 'dolor sit', '--total_label', 'bothofem'],
)
assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n'


def test_shannon_entropy():
    """Default shannon_entropy rounds to 2 decimal places."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.14\n2,8.13\n'


def test_shannon_entropy_no_stopwords():
    """shannon_entropy with stopword filtering disabled."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--stopwords', 'None']
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.18\n2,8.09\n'


def test_shannon_entropy_4decimals():
    """shannon_entropy honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.1413\n2,8.1252\n'


def test_conditionalcount():
    """count_conditionals reports zero matches for the pseudo corpus."""
    cmd = ['quantgov', 'corpus', 'count_conditionals',
           str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,conditionals\n1,0\n2,0\n'


def test_sentencelength():
    """Default sentence_length rounds to 2 decimal places."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,sentence_length\n1,9.54\n2,8.16\n'


def test_sentencelength_4decimals():
    """sentence_length honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    assert check_output(cmd) == 'file,sentence_length\n1,9.5385\n2,8.1633\n'


def test_sentiment_analysis():
    """sentiment_analysis emits neutral polarity for the pseudo corpus."""
    cmd = ['quantgov', 'corpus', 'sentiment_analysis',
           str(PSEUDO_CORPUS_PATH)]
    expected = ('file,sentiment_polarity,sentiment_subjectivity'
                '\n1,0.0,0.0\n2,0.0,0.0\n')
    assert check_output(cmd) == expected


def test_sentiment_analysis_4decimals():
    """sentiment_analysis honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'sentiment_analysis',
           str(PSEUDO_CORPUS_PATH), '--precision', '4']
    expected = ('file,sentiment_polarity,sentiment_subjectivity'
                '\n1,0.0,0.0\n2,0.0,0.0\n')
    assert check_output(cmd) == expected