Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ python:
- '3.6'
install:
- pip install ".[testing]"
- pip install ".[nlp]"
- python -m nltk.downloader punkt stopwords wordnet
script: pytest
deploy:
provider: pypi
Expand Down
212 changes: 212 additions & 0 deletions quantgov/corpora/builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,46 @@
"""
import re
import collections
import math

from decorator import decorator
import quantgov

try:
import nltk.corpus
NLTK = True
except ImportError:
NLTK = None

try:
import textblob
except ImportError:
textblob = None

if NLTK:
try:
nltk.corpus.wordnet.ensure_loaded()
except LookupError:
nltk.download('wordnet')
nltk.corpus.wordnet.ensure_loaded()

commands = {}


@decorator
def check_nltk(func, *args, **kwargs):
    """Decorator: raise a clear error if NLTK is not installed.

    Wraps commands that require the optional ``nltk`` dependency so the
    user gets an actionable message instead of an AttributeError.
    """
    if NLTK is None:
        # Fix: report the wrapped function's name rather than its repr
        # (the old message rendered as "<function ... at 0x...>").
        raise RuntimeError(
            'Must install NLTK to use {}'.format(func.__name__))
    return func(*args, **kwargs)


@decorator
def check_textblob(func, *args, **kwargs):
    """Decorator: raise a clear error if textblob is not installed.

    Mirrors ``check_nltk`` for commands that require the optional
    ``textblob`` dependency.
    """
    if textblob is None:
        # Fix: report the wrapped function's name rather than its repr.
        raise RuntimeError(
            'Must install textblob to use {}'.format(func.__name__))
    return func(*args, **kwargs)


class WordCounter():

cli = quantgov.utils.CLISpec(
Expand Down Expand Up @@ -93,3 +127,181 @@ def process_document(doc, terms, pattern, total_label):


commands['count_occurrences'] = OccurrenceCounter


class ShannonEntropy():
    """Corpus driver that emits the Shannon entropy of each document.

    Entropy is computed in bits over the frequency distribution of
    lemmatized words, optionally excluding stopwords, and rounded to
    the requested precision.
    """
    # word -> lemma memoization cache, shared across all documents
    lemmas = {}
    cli = quantgov.utils.CLISpec(
        help='Shannon Entropy',
        arguments=[
            quantgov.utils.CLIArg(
                flags=('--word_pattern', '-wp'),
                kwargs={
                    'help': 'regular expression defining a "word"',
                    'type': re.compile,
                    'default': re.compile(r'\b\w+\b')
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--stopwords', '-sw'),
                kwargs={
                    'help': 'stopwords to ignore',
                    # Falls back to None when NLTK is absent; the
                    # check_nltk guard on process_document fires first.
                    'default': (nltk.corpus.stopwords.words('english')
                                if NLTK else None)
                }
            ),
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('shannon_entropy',)

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, word_pattern, precision, stopwords,
                         textblob=textblob, nltk=NLTK):
        """Return ``doc.index`` extended with the document's entropy.

        Words are extracted with ``word_pattern``, lemmatized, filtered
        against ``stopwords``, and the entropy (base 2) of the lemma
        distribution is rounded to ``precision`` decimal places.
        """
        words = word_pattern.findall(doc.text)
        lemmas = [
            lemma for lemma in (
                ShannonEntropy.lemmatize(word) for word in words
            )
            if lemma not in stopwords
        ]
        counts = collections.Counter(lemmas)
        # sum over an empty counts is 0, so an empty document yields 0.0
        return doc.index + (round(sum(
            -(count / len(lemmas) * math.log(count / len(lemmas), 2))
            for count in counts.values()
        ), int(precision)),)

    @staticmethod
    def lemmatize(word):
        """Lemmatize ``word`` via textblob, memoizing in ``lemmas``.

        Fix: was a bare function in the class body; calling it through
        an instance would have passed the instance as ``word``. Marking
        it ``@staticmethod`` keeps class-level calls working and makes
        instance access safe.
        """
        if word in ShannonEntropy.lemmas:
            lemma = ShannonEntropy.lemmas[word]
        else:
            lemma = textblob.Word(word).lemmatize()
            ShannonEntropy.lemmas[word] = lemma
        return lemma


commands['shannon_entropy'] = ShannonEntropy


class ConditionalCounter():
    """Corpus driver counting conditional words and phrases."""

    cli = quantgov.utils.CLISpec(
        help=('Count conditional words and phrases. Included terms are: '
              ' "if", "but", "except", "provided", "when", "where", '
              '"whenever", "unless", "notwithstanding", "in the event", '
              'and "in no event"'),
        arguments=[]
    )
    pattern = re.compile(
        r'\b(if|but|except|provided|when|where'
        r'|whenever|unless|notwithstanding'
        r'|in\s+the\s+event|in\s+no\s+event)\b'
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('conditionals',)

    @staticmethod
    def process_document(doc):
        """Return ``doc.index`` extended with the conditional count."""
        # Collapse line breaks into spaces so multi-word phrases such
        # as "in the event" still match when split across lines.
        flattened = ' '.join(doc.text.splitlines())
        matches = ConditionalCounter.pattern.findall(flattened)
        return doc.index + (len(matches),)


commands['count_conditionals'] = ConditionalCounter


class SentenceLength():
    """Corpus driver that emits the mean sentence length in words."""

    cli = quantgov.utils.CLISpec(
        help='Sentence Length',
        arguments=[
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for this driver."""
        return ('sentence_length',)

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, precision):
        """Return ``doc.index`` extended with mean words per sentence.

        A falsy ``precision`` (e.g. 0) leaves the mean unrounded,
        matching the original behavior.
        """
        sentences = textblob.TextBlob(doc.text).sentences
        # Fix: guard against documents with no sentences, which
        # previously raised ZeroDivisionError; report 0 instead.
        if not sentences:
            return doc.index + (0,)
        mean = sum(len(sentence.words)
                   for sentence in sentences) / len(sentences)
        if precision:
            mean = round(mean, int(precision))
        return doc.index + (mean,)


commands['sentence_length'] = SentenceLength


class SentimentAnalysis():
    """Corpus driver that emits sentiment polarity and subjectivity."""

    cli = quantgov.utils.CLISpec(
        help='Performs sentiment analysis on the text',
        arguments=[
            quantgov.utils.CLIArg(
                # NOTE(review): flags is a plain string here, not a
                # one-element tuple -- confirm CLIArg accepts both forms
                flags=('--backend'),
                kwargs={
                    'help': 'which program to use for the analysis',
                    'default': 'textblob'
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--precision'),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the output column names for the chosen backend."""
        if args['backend'] == 'textblob':
            return ('sentiment_polarity', 'sentiment_subjectivity',)
        else:
            raise NotImplementedError

    @staticmethod
    @check_nltk
    @check_textblob
    def process_document(doc, backend, precision):
        """Return ``doc.index`` extended with polarity/subjectivity."""
        if backend != 'textblob':
            # Fix: previously an unknown backend fell through and
            # silently returned None; raise to match get_columns.
            raise NotImplementedError
        sentiment = textblob.TextBlob(doc.text)
        # Allows for rounding to a specified number of decimals
        if precision:
            return (doc.index +
                    (round(sentiment.polarity, int(precision)),
                     round(sentiment.subjectivity, int(precision)),))
        return (doc.index +
                (sentiment.polarity, sentiment.subjectivity,))


commands['sentiment_analysis'] = SentimentAnalysis
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def find_version(*file_paths):
packages=find_packages(
exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=[
'decorator',
'joblib',
'pandas',
'requests',
Expand All @@ -60,7 +61,11 @@ def find_version(*file_paths):
'snakemake',
],
extras_require={
'testing': ['pytest-flake8']
'testing': ['pytest-flake8'],
'nlp': [
'textblob',
'nltk',
]
},
entry_points={
'console_scripts': [
Expand Down
62 changes: 62 additions & 0 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,65 @@ def test_termcount_multiple_with_label():
'lorem', 'dolor sit', '--total_label', 'bothofem'],
)
assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n'


def test_shannon_entropy():
    """Default shannon_entropy rounds to 2 decimal places."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.14\n2,8.13\n'


def test_shannon_entropy_no_stopwords():
    """shannon_entropy with stopword filtering disabled."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--stopwords', 'None']
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.18\n2,8.09\n'


def test_shannon_entropy_4decimals():
    """shannon_entropy honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    assert check_output(cmd) == 'file,shannon_entropy\n1,7.1413\n2,8.1252\n'


def test_conditionalcount():
    """count_conditionals reports zero matches for the pseudo corpus."""
    cmd = ['quantgov', 'corpus', 'count_conditionals',
           str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,conditionals\n1,0\n2,0\n'


def test_sentencelength():
    """Default sentence_length rounds to 2 decimal places."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)]
    assert check_output(cmd) == 'file,sentence_length\n1,9.54\n2,8.16\n'


def test_sentencelength_4decimals():
    """sentence_length honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    assert check_output(cmd) == 'file,sentence_length\n1,9.5385\n2,8.1633\n'


def test_sentiment_analysis():
    """sentiment_analysis emits neutral polarity for the pseudo corpus."""
    cmd = ['quantgov', 'corpus', 'sentiment_analysis',
           str(PSEUDO_CORPUS_PATH)]
    expected = ('file,sentiment_polarity,sentiment_subjectivity'
                '\n1,0.0,0.0\n2,0.0,0.0\n')
    assert check_output(cmd) == expected


def test_sentiment_analysis_4decimals():
    """sentiment_analysis honors a custom --precision of 4."""
    cmd = ['quantgov', 'corpus', 'sentiment_analysis',
           str(PSEUDO_CORPUS_PATH), '--precision', '4']
    expected = ('file,sentiment_polarity,sentiment_subjectivity'
                '\n1,0.0,0.0\n2,0.0,0.0\n')
    assert check_output(cmd) == expected