diff --git a/.travis.yml b/.travis.yml
index 0173ffb..75cc0ab 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,8 @@ python:
 - '3.6'
 install:
 - pip install ".[testing]"
+- pip install ".[nlp]"
+- python -m nltk.downloader punkt stopwords wordnet
 script: pytest
 deploy:
   provider: pypi
diff --git a/quantgov/corpora/builtins.py b/quantgov/corpora/builtins.py
index 5dbf1d8..bb9b9b6 100644
--- a/quantgov/corpora/builtins.py
+++ b/quantgov/corpora/builtins.py
@@ -3,12 +3,48 @@
 """
 import re
 import collections
+import math
+from decorator import decorator
 
 import quantgov
 
+try:
+    import nltk.corpus
+    NLTK = True
+except ImportError:
+    NLTK = None
+
+try:
+    import textblob
+except ImportError:
+    textblob = None
+
+if NLTK:
+    try:
+        nltk.corpus.wordnet.ensure_loaded()
+    except LookupError:
+        nltk.download('wordnet')
+        nltk.corpus.wordnet.ensure_loaded()
+
 commands = {}
 
 
+@decorator
+def check_nltk(func, *args, **kwargs):
+    """Call func, raising RuntimeError if NLTK is not installed."""
+    if NLTK is None:
+        raise RuntimeError('Must install NLTK to use {}'.format(func))
+    return func(*args, **kwargs)
+
+
+@decorator
+def check_textblob(func, *args, **kwargs):
+    """Call func, raising RuntimeError if textblob is not installed."""
+    if textblob is None:
+        raise RuntimeError('Must install textblob to use {}'.format(func))
+    return func(*args, **kwargs)
+
+
 class WordCounter():
 
     cli = quantgov.utils.CLISpec(
@@ -93,3 +129,215 @@ def process_document(doc, terms, pattern, total_label):
 
 
 commands['count_occurrences'] = OccurrenceCounter
+
+
+class ShannonEntropy():
+    """Shannon entropy (in bits) of a document's lemmatized words."""
+    # Cache mapping word -> lemma, filled in by lemmatize().
+    lemmas = {}
+    cli = quantgov.utils.CLISpec(
+        help='Shannon Entropy',
+        arguments=[
+            quantgov.utils.CLIArg(
+                flags=('--word_pattern', '-wp'),
+                kwargs={
+                    'help': 'regular expression defining a "word"',
+                    'type': re.compile,
+                    'default': re.compile(r'\b\w+\b')
+                }
+            ),
+            quantgov.utils.CLIArg(
+                flags=('--stopwords', '-sw'),
+                kwargs={
+                    'help': 'stopwords to ignore',
+                    'default': (nltk.corpus.stopwords.words('english')
+                                if NLTK else None)
+                }
+            ),
+            quantgov.utils.CLIArg(
+                flags=('--precision',),
+                kwargs={
+                    'help': 'decimal places to round',
+                    'default': 2
+                }
+            )
+        ]
+    )
+
+    @staticmethod
+    def get_columns(args):
+        """Return the output column names for this command."""
+        return ('shannon_entropy',)
+
+    @staticmethod
+    @check_nltk
+    @check_textblob
+    def process_document(doc, word_pattern, precision, stopwords,
+                         textblob=textblob, nltk=NLTK):
+        """Return doc.index plus the rounded entropy of the document.
+
+        Words are found with word_pattern and lemmatized; lemmas found
+        in stopwords are dropped. The entropy (log base 2) is taken
+        over the counts of the remaining lemmas.
+        """
+        if stopwords is None:
+            # No stopword list was supplied: keep every lemma.
+            stopwords = ()
+        elif not isinstance(stopwords, str):
+            # Hash lookup instead of scanning a list for every lemma.
+            # A plain string keeps its original substring semantics.
+            stopwords = frozenset(stopwords)
+        words = word_pattern.findall(doc.text)
+        lemmas = [
+            lemma for lemma in (
+                ShannonEntropy.lemmatize(word) for word in words
+            )
+            if lemma not in stopwords
+        ]
+        counts = collections.Counter(lemmas)
+        return doc.index + (round(sum(
+            -(count / len(lemmas) * math.log(count / len(lemmas), 2))
+            for count in counts.values()
+        ), int(precision)),)
+
+    @staticmethod
+    def lemmatize(word):
+        """Return the lemma of word, caching it in ShannonEntropy.lemmas."""
+        if word in ShannonEntropy.lemmas:
+            lemma = ShannonEntropy.lemmas[word]
+        else:
+            lemma = textblob.Word(word).lemmatize()
+            ShannonEntropy.lemmas[word] = lemma
+        return lemma
+
+
+commands['shannon_entropy'] = ShannonEntropy
+
+
+class ConditionalCounter():
+    """Count conditional words and phrases in a document."""
+    cli = quantgov.utils.CLISpec(
+        help=('Count conditional words and phrases. Included terms are: '
+              ' "if", "but", "except", "provided", "when", "where", '
+              '"whenever", "unless", "notwithstanding", "in the event", '
+              'and "in no event"'),
+        arguments=[]
+    )
+    pattern = re.compile(
+        r'\b(if|but|except|provided|when|where'
+        r'|whenever|unless|notwithstanding'
+        r'|in\s+the\s+event|in\s+no\s+event)\b'
+    )
+
+    @staticmethod
+    def get_columns(args):
+        """Return the output column names for this command."""
+        return ('conditionals',)
+
+    @staticmethod
+    def process_document(doc):
+        """Return doc.index plus the number of pattern matches."""
+        # Line breaks are replaced by single spaces before matching.
+        return doc.index + (len(ConditionalCounter.pattern.findall(
+            ' '.join((doc.text).splitlines()))),)
+
+
+commands['count_conditionals'] = ConditionalCounter
+
+
+class SentenceLength():
+    """Mean number of words per sentence in a document."""
+
+    cli = quantgov.utils.CLISpec(
+        help='Sentence Length',
+        arguments=[
+            quantgov.utils.CLIArg(
+                flags=('--precision',),
+                kwargs={
+                    'help': 'decimal places to round',
+                    'default': 2
+                }
+            )
+        ]
+    )
+
+    @staticmethod
+    def get_columns(args):
+        """Return the output column names for this command."""
+        return ('sentence_length',)
+
+    @staticmethod
+    @check_nltk
+    @check_textblob
+    def process_document(doc, precision):
+        """Return doc.index plus the mean words per sentence.
+
+        A document without sentences yields 0.0 instead of dividing
+        by zero. A falsy precision leaves the mean unrounded.
+        """
+        sentences = textblob.TextBlob(doc.text).sentences
+        if sentences:
+            total = sum(len(sentence.words) for sentence in sentences)
+            mean_length = total / len(sentences)
+        else:
+            mean_length = 0.0
+        # Allows for rounding to a specified number of decimals
+        if precision:
+            mean_length = round(mean_length, int(precision))
+        return doc.index + (mean_length,)
+
+
+commands['sentence_length'] = SentenceLength
+
+
+class SentimentAnalysis():
+    """Sentiment polarity and subjectivity of a document."""
+
+    cli = quantgov.utils.CLISpec(
+        help='Performs sentiment analysis on the text',
+        arguments=[
+            quantgov.utils.CLIArg(
+                flags=('--backend',),
+                kwargs={
+                    'help': 'which program to use for the analysis',
+                    'default': 'textblob'
+                }
+            ),
+            quantgov.utils.CLIArg(
+                flags=('--precision',),
+                kwargs={
+                    'help': 'decimal places to round',
+                    'default': 2
+                }
+            )
+        ]
+    )
+
+    @staticmethod
+    def get_columns(args):
+        """Return the output column names for the chosen backend."""
+        if args['backend'] == 'textblob':
+            return ('sentiment_polarity', 'sentiment_subjectivity',)
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    @check_nltk
+    @check_textblob
+    def process_document(doc, backend, precision):
+        """Return doc.index plus the polarity and subjectivity scores.
+
+        Raises NotImplementedError for any backend other than
+        'textblob', as get_columns does.
+        """
+        if backend != 'textblob':
+            raise NotImplementedError
+        sentiment = textblob.TextBlob(doc.text)
+        scores = (sentiment.polarity, sentiment.subjectivity)
+        # Allows for rounding to a specified number of decimals
+        if precision:
+            scores = tuple(round(score, int(precision)) for score in scores)
+        return doc.index + scores
+
+
+commands['sentiment_analysis'] = SentimentAnalysis
diff --git a/setup.py b/setup.py
index b733fe1..3d424b1 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ def find_version(*file_paths):
     packages=find_packages(
         exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     install_requires=[
+        'decorator',
         'joblib',
         'pandas',
         'requests',
@@ -60,7 +61,11 @@ def find_version(*file_paths):
         'snakemake',
     ],
     extras_require={
-        'testing': ['pytest-flake8']
+        'testing': ['pytest-flake8'],
+        'nlp': [
+            'textblob',
+            'nltk',
+        ]
     },
     entry_points={
         'console_scripts': [
diff --git a/tests/test_corpora.py b/tests/test_corpora.py
index 5e89460..6863e36 100644
--- a/tests/test_corpora.py
+++ b/tests/test_corpora.py
@@ -126,3 +126,65 @@ def test_termcount_multiple_with_label():
          'lorem', 'dolor sit', '--total_label', 'bothofem'],
     )
     assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n'
+
+
+def test_shannon_entropy():
+    output = check_output(
+        ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)],
+    )
+    assert output == 'file,shannon_entropy\n1,7.14\n2,8.13\n'
+
+
+def test_shannon_entropy_no_stopwords():
+    output = check_output(
+        ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
+         '--stopwords', 'None'],
+    )
+    assert output == 'file,shannon_entropy\n1,7.18\n2,8.09\n'
+
+
+def test_shannon_entropy_4decimals():
+    output = check_output(
+        ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
+         '--precision', '4'],
+    )
+    assert output == 'file,shannon_entropy\n1,7.1413\n2,8.1252\n'
+
+
+def test_conditionalcount():
+    output = check_output(
+        ['quantgov', 'corpus', 'count_conditionals', str(PSEUDO_CORPUS_PATH)],
+    )
+    assert output == 'file,conditionals\n1,0\n2,0\n'
+
+
+def test_sentencelength():
+    output = check_output(
+        ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)],
+    )
+    assert output == 'file,sentence_length\n1,9.54\n2,8.16\n'
+
+
+def test_sentencelength_4decimals():
+    output = check_output(
+        ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH),
+         '--precision', '4'],
+    )
+    assert output == 'file,sentence_length\n1,9.5385\n2,8.1633\n'
+
+
+def test_sentiment_analysis():
+    output = check_output(
+        ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH)],
+    )
+    assert output == ('file,sentiment_polarity,sentiment_subjectivity'
+                      '\n1,0.0,0.0\n2,0.0,0.0\n')
+
+
+def test_sentiment_analysis_4decimals():
+    output = check_output(
+        ['quantgov', 'corpus', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH),
+         '--precision', '4'],
+    )
+    assert output == ('file,sentiment_polarity,sentiment_subjectivity'
+                      '\n1,0.0,0.0\n2,0.0,0.0\n')