Skip to content
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ python:
- '3.6'
install:
- pip install ".[testing]"
- pip install ".[builtins]"
- python -m nltk.downloader punkt stopwords wordnet
script: pytest
deploy:
provider: pypi
Expand Down
128 changes: 128 additions & 0 deletions quantgov/corpora/builtins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,19 @@
"""
import re
import collections
import math

import quantgov

import nltk.corpus
import textblob

# Lazily fetch the WordNet corpus at import time: ensure_loaded() raises
# LookupError when the NLTK data is not installed, in which case we download
# it once and retry. Subsequent imports find the data and skip the download.
try:
    nltk.corpus.wordnet.ensure_loaded()
except LookupError:
    nltk.download('wordnet')
    nltk.corpus.wordnet.ensure_loaded()

commands = {}


Expand Down Expand Up @@ -93,3 +103,121 @@ def process_document(doc, terms, pattern, total_label):


commands['count_occurrences'] = OccurrenceCounter


class ShannonEntropy():
    """Corpus command measuring the Shannon entropy (bits) of each document.

    Words are matched with a configurable regex, lemmatized (with a shared
    memoization cache), filtered against a stopword list, and the entropy of
    the resulting lemma frequency distribution is reported.
    """

    # word -> lemma cache shared across documents to avoid re-lemmatizing
    lemmas = {}
    cli = quantgov.utils.CLISpec(
        help='Shannon Entropy',
        arguments=[
            quantgov.utils.CLIArg(
                flags=('--word_pattern', '-wp'),
                kwargs={
                    'help': 'regular expression defining a "word"',
                    'type': re.compile,
                    'default': re.compile(r'\b\w+\b')
                }
            ),
            quantgov.utils.CLIArg(
                flags=('--stopwords', '-sw'),
                kwargs={
                    'help': 'stopwords to ignore',
                    'default': nltk.corpus.stopwords.words('english')
                }
            ),
            quantgov.utils.CLIArg(
                # Fix: was ('--precision') — a bare string, not a tuple.
                # The one-element tuple matches the flags convention used
                # by every other CLIArg in this module.
                flags=('--precision',),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('shannon_entropy',)

    @staticmethod
    def process_document(doc, word_pattern, stopwords, precision):
        """Return doc.index extended with the document's Shannon entropy.

        Entropy is -sum(p * log2(p)) over lemma frequencies, rounded to
        *precision* decimal places. A document with no in-vocabulary
        lemmas yields 0 (the sum over an empty Counter is 0, so no
        division occurs).
        """
        words = word_pattern.findall(doc.text)
        lemmas = [
            lemma for lemma in (
                ShannonEntropy.lemmatize(word) for word in words
            )
            if lemma not in stopwords
        ]
        counts = collections.Counter(lemmas)
        return doc.index + (round(sum(
            -(count / len(lemmas) * math.log(count / len(lemmas), 2))
            for count in counts.values()
        ), int(precision)),)

    # Fix: decorated as @staticmethod — it is invoked through the class as
    # ShannonEntropy.lemmatize(word) and takes no self/cls parameter.
    @staticmethod
    def lemmatize(word):
        """Lemmatize *word*, memoizing results in ShannonEntropy.lemmas."""
        if word in ShannonEntropy.lemmas:
            lemma = ShannonEntropy.lemmas[word]
        else:
            lemma = textblob.Word(word).lemmatize()
            ShannonEntropy.lemmas[word] = lemma
        return lemma


commands['shannon_entropy'] = ShannonEntropy


class ConditionalCounter():
    """Corpus command counting conditional words and phrases per document."""

    cli = quantgov.utils.CLISpec(
        help=('Count conditional words and phrases. Included terms are: '
              ' "if", "but", "except", "provided", "when", "where", '
              '"whenever", "unless", "notwithstanding", "in the event", '
              'and "in no event"'),
        arguments=[]
    )
    pattern = re.compile(
        r'\b(if|but|except|provided|when|where'
        r'|whenever|unless|notwithstanding'
        r'|in\s+the\s+event|in\s+no\s+event)\b'
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('conditionals',)

    @staticmethod
    def process_document(doc):
        """Return doc.index extended with the count of conditional terms."""
        # Collapse line breaks into single spaces so that multi-word
        # phrases split across lines are still matched by the pattern.
        flattened = ' '.join(doc.text.splitlines())
        matches = ConditionalCounter.pattern.findall(flattened)
        return doc.index + (len(matches),)


commands['count_conditionals'] = ConditionalCounter


class SentenceLength():
    """Corpus command reporting mean sentence length (in words) per document."""

    cli = quantgov.utils.CLISpec(
        help='Sentence Length',
        arguments=[
            quantgov.utils.CLIArg(
                # Fix: was ('--precision') — a bare string, not a tuple.
                # The one-element tuple matches the flags convention used
                # by every other CLIArg in this module.
                flags=('--precision',),
                kwargs={
                    'help': 'decimal places to round',
                    'default': 2
                }
            )
        ]
    )

    @staticmethod
    def get_columns(args):
        """Return the header tuple for this command's output column."""
        return ('sentence_length',)

    @staticmethod
    def process_document(doc, precision):
        """Return doc.index extended with the mean words per sentence.

        Sentences are segmented with TextBlob and the mean word count is
        rounded to *precision* decimal places. A document in which no
        sentences are detected yields 0 instead of raising
        ZeroDivisionError.
        """
        sentences = textblob.TextBlob(doc.text).sentences
        if not sentences:  # guard: empty / unsegmentable document
            return doc.index + (0,)
        total_words = sum(len(sentence.words) for sentence in sentences)
        return doc.index + (round(total_words / len(sentences),
                                  int(precision)),)


commands['sentence_length'] = SentenceLength
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def find_version(*file_paths):
'snakemake',
],
extras_require={
'testing': ['pytest-flake8']
'testing': ['pytest-flake8'],
'builtins': [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either we should move these requirements to install_requires or we should make it so that qg can still run without them installed (and we throw an error if they aren't installed and someone tries to use that builtin)

'textblob',
'nltk'
]
},
entry_points={
'console_scripts': [
Expand Down
45 changes: 45 additions & 0 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,48 @@ def test_termcount_multiple_with_label():
'lorem', 'dolor sit', '--total_label', 'bothofem'],
)
assert output == 'file,lorem,dolor sit,bothofem\n1,1,1,2\n2,1,0,1\n'


def test_shannon_entropy():
    """Default invocation rounds entropy to two decimal places."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,shannon_entropy\n1,7.14\n2,8.13\n'
    assert check_output(cmd) == expected

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get tests for the options as well?


def test_shannon_entropy_no_stopwords():
    """Entropy changes when the stopword filter is overridden."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--stopwords', 'None']
    expected = 'file,shannon_entropy\n1,7.18\n2,8.09\n'
    assert check_output(cmd) == expected


def test_shannon_entropy_4decimals():
    """--precision controls the number of decimal places in the output."""
    cmd = ['quantgov', 'corpus', 'shannon_entropy', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    expected = 'file,shannon_entropy\n1,7.1413\n2,8.1252\n'
    assert check_output(cmd) == expected


def test_conditionalcount():
    """The pseudo corpus contains no conditional terms."""
    cmd = ['quantgov', 'corpus', 'count_conditionals', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,conditionals\n1,0\n2,0\n'
    assert check_output(cmd) == expected


def test_sentencelength():
    """Default invocation rounds sentence length to two decimal places."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH)]
    expected = 'file,sentence_length\n1,9.54\n2,8.16\n'
    assert check_output(cmd) == expected


def test_sentencelength_4decimals():
    """--precision controls the number of decimal places in the output."""
    cmd = ['quantgov', 'corpus', 'sentence_length', str(PSEUDO_CORPUS_PATH),
           '--precision', '4']
    expected = 'file,sentence_length\n1,9.5385\n2,8.1633\n'
    assert check_output(cmd) == expected