Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.pytest_cache

# C extensions
*.so
Expand Down
6 changes: 5 additions & 1 deletion quantgov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@

__all__ = [
'corpora',
'corpus',
'estimator',
'project',
'utils',
]

from .corpora.utils import load_driver

from . import corpora # Backwards compatibility

from .utils import load_driver

__version__ = '0.4.0.dev'
6 changes: 3 additions & 3 deletions quantgov/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import requests

import quantgov
import quantgov.corpora.builtins
import quantgov.corpus.builtins

from pathlib import Path

Expand All @@ -40,7 +40,7 @@ def parse_args():
# Corpus command
corpus = subparsers.add_parser('corpus')
corpus_subcommands = corpus.add_subparsers(dest='subcommand')
for command, builtin in quantgov.corpora.builtins.commands.items():
for command, builtin in quantgov.corpus.builtins.commands.items():
subcommand = corpus_subcommands.add_parser(
command, help=builtin.cli.help)
subcommand.add_argument(
Expand Down Expand Up @@ -161,7 +161,7 @@ def start_component(args):
def run_corpus_builtin(args):
driver = quantgov.load_driver(args.corpus)
writer = csv.writer(args.outfile)
builtin = quantgov.corpora.builtins.commands[args.subcommand]
builtin = quantgov.corpus.builtins.commands[args.subcommand]
func_args = {i: j for i, j in vars(args).items()
if i not in {'command', 'subcommand', 'outfile', 'corpus'}}
writer.writerow(driver.index_labels + builtin.get_columns(func_args))
Expand Down
9 changes: 8 additions & 1 deletion quantgov/corpora/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .structures import (
import warnings

from ..corpus import (
Document,
CorpusStreamer,
CorpusDriver,
Expand All @@ -7,3 +9,8 @@
NamePatternCorpusDriver,
IndexDriver
)

# Runs at import time: this package only re-exports names from
# quantgov.corpus, so importing it tells callers to switch to the new path.
warnings.warn(
("quantgov.corpora has been moved to quantgov.corpus and will be removed"
" in a future version."),
DeprecationWarning)
16 changes: 0 additions & 16 deletions quantgov/corpora/utils.py

This file was deleted.

9 changes: 9 additions & 0 deletions quantgov/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .structures import (
Document,
CorpusStreamer,
CorpusDriver,
FlatFileCorpusDriver,
RecursiveDirectoryCorpusDriver,
NamePatternCorpusDriver,
IndexDriver
)
File renamed without changes.
File renamed without changes.
19 changes: 17 additions & 2 deletions quantgov/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
# TODO: Docstrings

import collections
import concurrent.futures
import multiprocessing
import os
import sys

from pathlib import Path


def load_driver(corpus):
    """Import and return the ``driver`` object defined by a corpus.

    :param corpus: path to a corpus directory, or to the ``driver.py`` or
        ``timestamp`` file inside one (in which case the containing
        directory is used).
    :returns: the ``driver`` attribute of the corpus's ``driver`` module.
    :raises ImportError: if no importable ``driver`` module exposing a
        ``driver`` name is found.

    The corpus directory is placed at the front of ``sys.path`` only for the
    duration of the import and is removed again even if the import fails.
    """
    import os  # local import keeps this block self-contained

    corpus = Path(corpus)
    if corpus.name in ('driver.py', 'timestamp'):
        corpus = corpus.parent
    entry = str(corpus)

    # A previously imported ``driver`` module from a *different* directory
    # would otherwise be served straight from sys.modules, silently returning
    # the wrong corpus's driver. Evict it so the import below is resolved
    # against this corpus. A module already loaded from this same directory
    # is kept, so repeated calls return the same driver object.
    cached = sys.modules.get('driver')
    if cached is not None:
        origin = getattr(cached, '__file__', None)
        same_dir = (
            origin is not None and
            os.path.dirname(os.path.realpath(origin)) ==
            os.path.realpath(entry)
        )
        if not same_dir:
            del sys.modules['driver']

    sys.path.insert(0, entry)
    try:
        from driver import driver
    finally:
        # Remove the entry added above by value rather than by position: the
        # imported module may itself have prepended to sys.path.
        try:
            sys.path.remove(entry)
        except ValueError:
            pass
    return driver


_POOLS = {
'thread': concurrent.futures.ThreadPoolExecutor,
Expand All @@ -26,7 +41,7 @@ def lazy_parallel(func, *iterables, **kwargs):
worker = kwargs.get('worker', 'thread')
max_workers = kwargs.get('max_workers')
if max_workers is None: # Not in back-port
max_workers = (multiprocessing.cpu_count() or 1)
max_workers = (os.cpu_count() or 1)
if worker == 'thread':
max_workers *= 5
try:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_corpora.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
import quantgov.corpora
import quantgov.corpus
import subprocess

from pathlib import Path
Expand All @@ -8,15 +8,15 @@
def build_recursive_directory_corpus(directory):
for path, text in (('a/1.txt', u'foo'), ('b/2.txt', u'bar')):
directory.join(path).write_text(text, encoding='utf-8', ensure=True)
return quantgov.corpora.RecursiveDirectoryCorpusDriver(
return quantgov.corpus.RecursiveDirectoryCorpusDriver(
directory=str(directory), index_labels=('letter', 'number'))


def build_name_pattern_corpus(directory):
for path, text in (('a_1.txt', u'foo'), ('b_2.txt', u'bar')):
path = directory.join(path).write_text(
text, encoding='utf-8', ensure=True)
return quantgov.corpora.NamePatternCorpusDriver(
return quantgov.corpus.NamePatternCorpusDriver(
pattern=r'(?P<letter>[a-z])_(?P<number>\d)',
directory=str(directory)
)
Expand All @@ -35,7 +35,7 @@ def build_index_corpus(directory):
with index_path.open('w', encoding='utf-8') as outf:
outf.write(u'letter,number,path\n')
outf.write(u'\n'.join(','.join(row) for row in rows))
return quantgov.corpora.IndexDriver(str(index_path))
return quantgov.corpus.IndexDriver(str(index_path))


BUILDERS = {
Expand Down