diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..0bb0f0b --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# http://editorconfig.org +root = true + +[*] +indent_size = 2 +indent_style = space +end_of_line = lf +charset = utf-8 +max_line_length = 120 +insert_final_newline = true + +[*.py] +indent_size = 4 +trim_trailing_whitespace = true + + +[*.rst] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..4a3e78e --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,67 @@ +name: CI + +# Controls when the action will run. +on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + code-quality: + runs-on: ubuntu-latest + + name: "Linting" + steps: + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + + - name: setup python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: Install dependencies + run: make install + + - name: Linting + run: make lint + + test: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13", "3.14"] + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + name: "Python ${{ matrix.python-version }}" + + steps: + - name: Check out code + uses: actions/checkout@v5 + with: + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: make install + + - name: Run tests + run: make coverage + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + flags: unittests-${{ matrix.python-version }} + fail_ci_if_error: true # default = false 
+ verbose: true # default = false diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..1d8545a --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,35 @@ +name: Publish to PyPI + +on: + release: + types: [released] + +jobs: + release: + name: Release + environment: + name: pypi + url: https://pypi.org/project/stop-words + permissions: + id-token: write + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + with: + submodules: true + + - uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: Build + run: | + python -m pip install build + make update-submodules build + + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + verbose: true + print-hash: true diff --git a/.gitignore b/.gitignore index 6c27d15..af0b40c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,10 @@ build/ dist/ *.egg-info/ logs/ -src/ -.c9/ bin/ develop-eggs/ eggs/ +coverage.xml +.coverage +build +src/stop_words/_version.py diff --git a/.gitmodules b/.gitmodules index 7a835bd..87eff44 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "stop_words/stop-words"] - path = stop_words/stop-words +[submodule "src/stop_words/stop-words"] + path = src/stop_words/stop-words url = https://github.com/Alir3z4/stop-words.git diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ba07717..0000000 --- a/.travis.yml +++ /dev/null @@ -1,24 +0,0 @@ -language: python -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7-dev" # 3.7 development branch - - "nightly" # currently points to 3.7-dev -install: - - git submodule init - - git submodule update - - git submodule foreach git pull origin master - - pip install -U setuptools coveralls - - python bootstrap.py - - ./bin/buildout -before_script: - - ./bin/flake8 stop_words -script: - - ./bin/cover -notifications: - irc: - - "irc.freenode.org#python-stop-words" 
-after_success: - coveralls diff --git a/ChangeLog.rst b/ChangeLog.rst index 436f65f..9de0e46 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,11 @@ +2025.11.4 +========= + +* Sync with latest of https://github.com/Alir3z4/stop-words. +* Add much more tests and cleaned up the code. +* Modernized Python packaging and publishing. + + 2018.7.23 ========= diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..983e412 --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +.PHONY: help install test coverage build clean format check-format lint precommit update-submodules + +.DEFAULT_GOAL := help + +help: ## Display this help message + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +install: update-submodules ## Install development dependencies + pip install -e '.[dev]' + +update-submodules: ## Update all git submodules + git submodule sync --recursive + git submodule update --init --remote --recursive + +test: ## Run test suite + python -m unittest discover -s src/ -v + +coverage: ## Generate coverage report + coverage run -m unittest discover -s src/ + coverage report + coverage xml + +build: ## Build source and wheel distributions + python -m build + +clean: ## Remove build artifacts and temporary files + rm -rf build/ dist/ *.egg-info/ **/*.egg-info/ .coverage coverage.xml .mypy_cache/ + +format: ## Auto-format code with isort and black + isort . + black . + +check-format: ## Check code formatting with isort and black + isort --check-only --diff . + black --check --diff . + +lint: ## Run all code quality checks + flake8 --config=flake8.ini . 
+ mypy src/ --install-types --non-interactive + +precommit: format lint ## Full pre-commit checks (format + lint) + +##@ Development Targets diff --git a/README.rst b/README.rst index d1fb367..2b41c2f 100644 --- a/README.rst +++ b/README.rst @@ -2,106 +2,510 @@ Python Stop Words ================= -.. contents:: Table of contents +.. image:: https://img.shields.io/pypi/v/stop-words.svg + :target: https://pypi.org/project/stop-words/ + :alt: PyPI version -Overview --------- - -Get list of common stop words in various languages in Python. +.. image:: https://img.shields.io/pypi/pyversions/stop-words.svg + :target: https://pypi.org/project/stop-words/ + :alt: Python versions -.. image:: https://secure.travis-ci.org/Alir3z4/python-stop-words.png - :alt: Build Status - :target: http://travis-ci.org/Alir3z4/python-stop-words +.. image:: https://img.shields.io/pypi/l/stop-words.svg + :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE + :alt: License -.. image:: https://coveralls.io/repos/Alir3z4/python-stop-words/badge.png - :alt: Coverage Status - :target: https://coveralls.io/r/Alir3z4/python-stop-words +.. contents:: Table of Contents + :depth: 2 + :local: -.. image:: http://badge.kloud51.com/pypi/v/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Version +Overview +-------- -.. image:: http://badge.kloud51.com/pypi/s/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Status +A Python library providing curated lists of stop words across 34+ languages. Stop words are common words (like "the", "is", "at") that are typically filtered out in natural language processing and text analysis tasks. -.. image:: http://badge.kloud51.com/pypi/l/stop-words.svg - :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE - :alt: License +**Key Features:** -.. 
image:: http://badge.kloud51.com/pypi/p/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Py_versions +* **34+ Languages** - Extensive language support. +* **Performance** - Built-in caching for fast repeated access. +* **Flexible** - Custom filtering system for advanced use cases. +* **Zero Dependencies** - Lightweight with no external requirements. -Available languages +Available Languages ------------------- -* Arabic -* Bulgarian -* Catalan -* Czech -* Danish -* Dutch -* English -* Finnish -* French -* German -* Hungarian -* Indonesian -* Italian -* Norwegian -* Polish -* Portuguese -* Romanian -* Russian -* Spanish -* Swedish -* Turkish -* Ukrainian +All the available languages supported by https://github.com/Alir3z4/stop-words + +Each language is identified by both its ISO 639-1 language code (e.g., ``en``) and full name (e.g., ``english``). Installation ------------ -``stop-words`` is available on PyPI -http://pypi.python.org/pypi/stop-words +**Via pip (Recommended):** -So easily install it by ``pip`` -:: +.. code-block:: bash $ pip install stop-words -Another way is by cloning ``stop-words``'s `git repo `_ :: +**Via Git:** + +.. code-block:: bash - $ git clone --recursive git://github.com/Alir3z4/python-stop-words.git + $ git clone --recursive https://github.com/Alir3z4/python-stop-words.git + $ cd python-stop-words + $ pip install -e . -Then install it by running: -:: +**Requirements:** - $ python setup.py install +* Usually any version of Python that supports type hints and probably has not been marked as EOL. -Basic usage +Quick Start ----------- -.. code:: python + +Basic Usage +~~~~~~~~~~~ + +.. 
code-block:: python from stop_words import get_stop_words + # Get English stop words using language code stop_words = get_stop_words('en') + + # Or use the full language name stop_words = get_stop_words('english') + + # Use in text processing + text = "The quick brown fox jumps over the lazy dog" + words = text.lower().split() + filtered_words = [word for word in words if word not in stop_words] + print(filtered_words) # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] + + +Safe Loading +~~~~~~~~~~~~ + +Use ``safe_get_stop_words()`` when you're not sure if a language is supported: + +.. code-block:: python from stop_words import safe_get_stop_words - stop_words = safe_get_stop_words('unsupported language') + # Returns empty list instead of raising an exception + stop_words = safe_get_stop_words('klingon') # Returns [] + + # Works normally with supported languages + stop_words = safe_get_stop_words('fr') # Returns French stop words + + +Advanced Usage +-------------- + +Checking Available Languages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING + + # List all available languages + print(AVAILABLE_LANGUAGES) + # ['arabic', 'bulgarian', 'catalan', ...] + + # View language code mappings + print(LANGUAGE_MAPPING) + # {'en': 'english', 'fr': 'french', ...} + + +Caching Control +~~~~~~~~~~~~~~~ + +By default, stop words are cached for performance. You can control this behavior: + +.. code-block:: python + + from stop_words import get_stop_words, STOP_WORDS_CACHE + + # Disable caching for this call + stop_words = get_stop_words('en', cache=False) + + # Clear the cache manually + STOP_WORDS_CACHE.clear() + + # Check what's cached + print(STOP_WORDS_CACHE.keys()) # ['english', 'french', ...] + + +Custom Filters +~~~~~~~~~~~~~~ + +Apply custom transformations to stop words using the filter system: + +.. 
code-block:: python + + from stop_words import get_stop_words, add_filter, remove_filter + + # Add a global filter (applies to all languages) + def remove_short_words(words, language): + """Remove words shorter than 3 characters.""" + return [w for w in words if len(w) >= 3] + + add_filter(remove_short_words) + stop_words = get_stop_words('en', cache=False) + + # Add a language-specific filter + def uppercase_words(words): + """Convert all words to uppercase.""" + return [w.upper() for w in words] + + add_filter(uppercase_words, language='english') + stop_words = get_stop_words('en', cache=False) + + # Remove a filter when done + remove_filter(uppercase_words, language='english') + +**Note:** Filters only apply to newly loaded stop words, not cached ones. Use ``cache=False`` or clear the cache to apply new filters. + + +Practical Examples +------------------ + +Text Preprocessing +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + import re + + def preprocess_text(text, language='en'): + """Clean and filter text for NLP tasks.""" + stop_words = set(get_stop_words(language)) + + # Convert to lowercase and extract words + words = re.findall(r'\b\w+\b', text.lower()) + + # Remove stop words + filtered_words = [w for w in words if w not in stop_words] + + return filtered_words + + text = "The quick brown fox jumps over the lazy dog" + print(preprocess_text(text)) + # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] + + +Multilingual Processing +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + + def filter_multilingual_text(texts_dict): + """Process texts in multiple languages. 
+ + Args: + texts_dict: Dictionary mapping language codes to text strings + + Returns: + Dictionary with filtered words for each language + """ + results = {} + + for lang_code, text in texts_dict.items(): + stop_words = set(get_stop_words(lang_code)) + words = text.lower().split() + filtered = [w for w in words if w not in stop_words] + results[lang_code] = filtered + + return results + + texts = { + 'en': 'The cat is on the table', + 'fr': 'Le chat est sur la table', + 'es': 'El gato está en la mesa' + } + + print(filter_multilingual_text(texts)) + + +Keyword Extraction +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + from collections import Counter + import re + + def extract_keywords(text, language='en', top_n=10): + """Extract the most common meaningful words from text.""" + stop_words = set(get_stop_words(language)) + + # Extract words and filter + words = re.findall(r'\b\w+\b', text.lower()) + meaningful_words = [w for w in words if w not in stop_words and len(w) > 2] + + # Count and return top keywords + word_counts = Counter(meaningful_words) + return word_counts.most_common(top_n) + + article = """ + Python is a high-level programming language. Python is known for its + simplicity and readability. Many developers choose Python for data science. + """ + + keywords = extract_keywords(article) + print(keywords) + # [('python', 3), ('language', 1), ('high-level', 1), ...] + + +API Reference +------------- + +Functions +~~~~~~~~~ + +``get_stop_words(language, *, cache=True)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Load stop words for a specified language. + +**Parameters:** + +* ``language`` (str): Language code (e.g., 'en') or full name (e.g., 'english') +* ``cache`` (bool, optional): Enable caching. Defaults to True. + +**Returns:** + +* ``list[str]``: List of stop words + +**Raises:** + +* ``StopWordError``: If language is unavailable or files are unreadable + +**Example:** + +.. 
code-block:: python + + stop_words = get_stop_words('en') + stop_words = get_stop_words('french', cache=False) + + +``safe_get_stop_words(language)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Safely load stop words, returning empty list on error. + +**Parameters:** + +* ``language`` (str): Language code or full name + +**Returns:** + +* ``list[str]``: Stop words, or empty list if unavailable + +**Example:** + +.. code-block:: python + + stop_words = safe_get_stop_words('unknown') # Returns [] + + +``add_filter(func, language=None)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Register a filter function for stop word post-processing. + +**Parameters:** + +* ``func`` (Callable): Filter function +* ``language`` (str | None, optional): Language code or None for global filter + +**Filter Signatures:** + +* Language-specific: ``func(stopwords: list[str]) -> list[str]`` +* Global: ``func(stopwords: list[str], language: str) -> list[str]`` + +**Example:** + +.. code-block:: python + + def remove_short(words, lang): + return [w for w in words if len(w) > 3] + + add_filter(remove_short) # Global filter + + +``remove_filter(func, language=None)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Remove a previously registered filter. + +**Parameters:** + +* ``func`` (Callable): The filter function to remove +* ``language`` (str | None, optional): Language code or None + +**Returns:** + +* ``bool``: True if removed, False if not found + +**Example:** + +.. code-block:: python + + success = remove_filter(my_filter, language='english') + + +Constants +~~~~~~~~~ + +``AVAILABLE_LANGUAGES`` +^^^^^^^^^^^^^^^^^^^^^^^^ + +List of all supported language names. + +.. code-block:: python + + ['arabic', 'bulgarian', 'catalan', ...] + + +``LANGUAGE_MAPPING`` +^^^^^^^^^^^^^^^^^^^^ + +Dictionary mapping language codes to full names. + +.. 
code-block:: python + + {'en': 'english', 'fr': 'french', 'de': 'german', ...} + + +``STOP_WORDS_CACHE`` +^^^^^^^^^^^^^^^^^^^^^ + +Dictionary storing cached stop words. Can be manually cleared. + +.. code-block:: python + + STOP_WORDS_CACHE.clear() # Clear all cached data + + +Exceptions +~~~~~~~~~~ + +``StopWordError`` +^^^^^^^^^^^^^^^^^ + +Raised when a language is unavailable or files cannot be read. + +.. code-block:: python + + try: + stop_words = get_stop_words('invalid') + except StopWordError as e: + print(f"Error: {e}") + + +Performance Tips +---------------- + +1. **Use caching** - Keep ``cache=True`` (default) for repeated access to the same language +2. **Reuse stop word sets** - Convert to ``set()`` once for O(1) lookup performance: + + .. code-block:: python + + stop_words_set = set(get_stop_words('en')) + # Fast membership testing + is_stop_word = 'the' in stop_words_set + +3. **Preload languages** - Load stop words during initialization, not in tight loops +4. **Use safe_get_stop_words** - Avoid try/except overhead when language availability is uncertain + + +Troubleshooting +--------------- + +**"Language unavailable" error** + +* Check spelling and use either the language code or full name +* Verify the language is in ``AVAILABLE_LANGUAGES`` +* See the `Available Languages`_ table above + +**"File is unreadable" error** + +* Ensure the package installed correctly: ``pip install --force-reinstall stop-words`` +* Check file permissions in the installation directory +* Verify the ``stop-words`` subdirectory exists in the package + +**Filters not applying** + +* Filters only affect newly loaded stop words +* Clear the cache: ``STOP_WORDS_CACHE.clear()`` +* Use ``cache=False`` when testing filters + +**Performance issues** + +* Ensure caching is enabled (default behavior) +* Convert stop word lists to sets for faster lookups +* Preload stop words outside of loops + + +Contributing +------------ + +Contributions are welcome! 
Here's how you can help: + +1. **Add new languages** - Submit stop word lists for unsupported languages via https://github.com/Alir3z4/stop-words +2. **Improve existing lists** - Suggest additions or removals for existing languages via https://github.com/Alir3z4/stop-words +3. **Report bugs** - Open issues on GitHub +4. **Submit PRs** - Fix bugs or add features + +**Repository:** https://github.com/Alir3z4/python-stop-words + + +License +------- + +This project is licensed under the BSD 3-Clause License. See ``LICENSE`` file for details. + + +Changelog +--------- + +See `ChangeLog.rst `_ for version history. + + +Support +------- + +* **Issues:** https://github.com/Alir3z4/python-stop-words/issues +* **PyPI:** https://pypi.org/project/stop-words/ + + +Credits +------- + +* Maintained by `Alireza Savand `_ +* Stop word lists compiled from various open sources +* Contributors: See `GitHub contributors `_ + + +Related Projects +---------------- +* `Stop Words `_ - List of common stop words in various languages. +* `NLTK `_ - Natural Language Toolkit with extensive NLP features +* `spaCy `_ - Industrial-strength NLP library +* `TextBlob `_ - Simplified text processing -Python compatibility --------------------- -Python Stop Words is compatible with: +Indices and Tables +------------------ -* Python 2.7 -* Python 3.4 -* Python 3.5 -* Python 3.6 -* Python 3.7 +* `Available Languages`_ +* `Quick Start`_ +* `Advanced Usage`_ +* `API Reference`_ diff --git a/bootstrap.py b/bootstrap.py deleted file mode 100644 index 1f59b21..0000000 --- a/bootstrap.py +++ /dev/null @@ -1,210 +0,0 @@ -############################################################################## -# -# Copyright (c) 2006 Zope Foundation and Contributors. -# All Rights Reserved. -# -# This software is subject to the provisions of the Zope Public License, -# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. 
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED -# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS -# FOR A PARTICULAR PURPOSE. -# -############################################################################## -"""Bootstrap a buildout-based project - -Simply run this script in a directory containing a buildout.cfg. -The script accepts buildout command-line options, so you can -use the -c option to specify an alternate configuration file. -""" - -import os -import shutil -import sys -import tempfile - -from optparse import OptionParser - -__version__ = '2015-07-01' -# See zc.buildout's changelog if this version is up to date. - -tmpeggs = tempfile.mkdtemp(prefix='bootstrap-') - -usage = '''\ -[DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options] - -Bootstraps a buildout-based project. - -Simply run this script in a directory containing a buildout.cfg, using the -Python that you want bin/buildout to use. - -Note that by using --find-links to point to local resources, you can keep -this script from going over the network. -''' - -parser = OptionParser(usage=usage) -parser.add_option("--version", - action="store_true", default=False, - help=("Return bootstrap.py version.")) -parser.add_option("-t", "--accept-buildout-test-releases", - dest='accept_buildout_test_releases', - action="store_true", default=False, - help=("Normally, if you do not specify a --buildout-version, " - "the bootstrap script and buildout gets the newest " - "*final* versions of zc.buildout and its recipes and " - "extensions for you. 
If you use this flag, " - "bootstrap and buildout will get the newest releases " - "even if they are alphas or betas.")) -parser.add_option("-c", "--config-file", - help=("Specify the path to the buildout configuration " - "file to be used.")) -parser.add_option("-f", "--find-links", - help=("Specify a URL to search for buildout releases")) -parser.add_option("--allow-site-packages", - action="store_true", default=False, - help=("Let bootstrap.py use existing site packages")) -parser.add_option("--buildout-version", - help="Use a specific zc.buildout version") -parser.add_option("--setuptools-version", - help="Use a specific setuptools version") -parser.add_option("--setuptools-to-dir", - help=("Allow for re-use of existing directory of " - "setuptools versions")) - -options, args = parser.parse_args() -if options.version: - print("bootstrap.py version %s" % __version__) - sys.exit(0) - - -###################################################################### -# load/install setuptools - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - -ez = {} -if os.path.exists('ez_setup.py'): - exec(open('ez_setup.py').read(), ez) -else: - exec(urlopen('https://bootstrap.pypa.io/ez_setup.py').read(), ez) - -if not options.allow_site_packages: - # ez_setup imports site, which adds site packages - # this will remove them from the path to ensure that incompatible versions - # of setuptools are not in the path - import site - # inside a virtualenv, there is no 'getsitepackages'. - # We can't remove these reliably - if hasattr(site, 'getsitepackages'): - for sitepackage_path in site.getsitepackages(): - # Strip all site-packages directories from sys.path that - # are not sys.prefix; this is because on Windows - # sys.prefix is a site-package directory. 
- if sitepackage_path != sys.prefix: - sys.path[:] = [x for x in sys.path - if sitepackage_path not in x] - -setup_args = dict(to_dir=tmpeggs, download_delay=0) - -if options.setuptools_version is not None: - setup_args['version'] = options.setuptools_version -if options.setuptools_to_dir is not None: - setup_args['to_dir'] = options.setuptools_to_dir - -ez['use_setuptools'](**setup_args) -import setuptools -import pkg_resources - -# This does not (always?) update the default working set. We will -# do it. -for path in sys.path: - if path not in pkg_resources.working_set.entries: - pkg_resources.working_set.add_entry(path) - -###################################################################### -# Install buildout - -ws = pkg_resources.working_set - -setuptools_path = ws.find( - pkg_resources.Requirement.parse('setuptools')).location - -# Fix sys.path here as easy_install.pth added before PYTHONPATH -cmd = [sys.executable, '-c', - 'import sys; sys.path[0:0] = [%r]; ' % setuptools_path + - 'from setuptools.command.easy_install import main; main()', - '-mZqNxd', tmpeggs] - -find_links = os.environ.get( - 'bootstrap-testing-find-links', - options.find_links or - ('http://downloads.buildout.org/' - if options.accept_buildout_test_releases else None) - ) -if find_links: - cmd.extend(['-f', find_links]) - -requirement = 'zc.buildout' -version = options.buildout_version -if version is None and not options.accept_buildout_test_releases: - # Figure out the most recent final version of zc.buildout. 
- import setuptools.package_index - _final_parts = '*final-', '*final' - - def _final_version(parsed_version): - try: - return not parsed_version.is_prerelease - except AttributeError: - # Older setuptools - for part in parsed_version: - if (part[:1] == '*') and (part not in _final_parts): - return False - return True - - index = setuptools.package_index.PackageIndex( - search_path=[setuptools_path]) - if find_links: - index.add_find_links((find_links,)) - req = pkg_resources.Requirement.parse(requirement) - if index.obtain(req) is not None: - best = [] - bestv = None - for dist in index[req.project_name]: - distv = dist.parsed_version - if _final_version(distv): - if bestv is None or distv > bestv: - best = [dist] - bestv = distv - elif distv == bestv: - best.append(dist) - if best: - best.sort() - version = best[-1].version -if version: - requirement = '=='.join((requirement, version)) -cmd.append(requirement) - -import subprocess -if subprocess.call(cmd) != 0: - raise Exception( - "Failed to execute command:\n%s" % repr(cmd)[1:-1]) - -###################################################################### -# Import and run buildout - -ws.add_entry(tmpeggs) -ws.require(requirement) -import zc.buildout.buildout - -if not [a for a in args if '=' not in a]: - args.append('bootstrap') - -# if -c was provided, we push it back into args for buildout' main function -if options.config_file is not None: - args[0:0] = ['-c', options.config_file] - -zc.buildout.buildout.main(args) -shutil.rmtree(tmpeggs) diff --git a/buildout.cfg b/buildout.cfg deleted file mode 100644 index ae67c4b..0000000 --- a/buildout.cfg +++ /dev/null @@ -1,47 +0,0 @@ -[buildout] -develop = . 
-parts = test - cover - flake8 - evolve -show-picked-versions = true - -[evolve] -arguments = '-s buildout.cfg -w --indent 32 --sorting alpha' -eggs = buildout-versions-checker -recipe = zc.recipe.egg -scripts = check-buildout-updates=${:_buildout_section_name_} - -[test] -defaults = --with-progressive -eggs = nose - nose-progressive -recipe = pbp.recipe.noserunner - -[cover] -<= test -defaults = --with-coverage - --cover-erase - --cover-package=stop_words -eggs = nose - coverage - -[flake8] -eggs = flake8 -recipe = zc.recipe.egg - -[versions] -blessings = 1.6 -buildout-versions-checker = 1.5.1 -coverage = 3.7.1 -flake8 = 2.3.0 -futures = 2.2.0 -mccabe = 0.3 -nose = 1.3.4 -nose-progressive = 1.5.1 -pbp.recipe.noserunner = 0.2.6 -pep8 = 1.5.7 -pyflakes = 0.8.1 -six = 1.10.0 -zc.buildout = 2.12.1 -zc.recipe.egg = 2.0.7 diff --git a/flake8.ini b/flake8.ini new file mode 100644 index 0000000..b263f9e --- /dev/null +++ b/flake8.ini @@ -0,0 +1,8 @@ +[flake8] +max-line-length = 120 +exclude = + venv, + .venv, + cache, + build, + src/stop_words/stop-words/**, diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..80db556 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,112 @@ +[build-system] +requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"] +build-backend = "setuptools.build_meta" + +[project] +name = "stop-words" +description = "Get list of common stop words in various languages in Python" +readme = "README.rst" +authors = [{name = "Alireza Savand", email = "alireza.savand@gmail.com"}] +license = "BSD-3-Clause" +classifiers = [ + "Programming Language :: Python", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Topic :: Software Development", + "Development Status :: 6 - Mature", + "Programming Language :: Python :: 3", + "Topic :: Text Processing", + "Topic :: Text Processing :: Filters", +] +requires-python = ">=3.11" +dynamic = ["version"] + +[project.urls] +Homepage = 
"https://github.com/Alir3z4/python-stop-words" +Repository = "https://github.com/Alir3z4/python-stop-words.git" +Issues = "https://github.com/Alir3z4/python-stop-words/issues" +Changelog = "https://github.com/Alir3z4/python-stop-words/blob/master/ChangeLog.rst" + +[project.optional-dependencies] +dev = [ + "black==25.9.0", + "mypy==1.18.2", + "flake8==7.3.0", + "coverage==7.11.0", "isort", +] + +[tool.setuptools_scm] +write_to = "src/stop_words/_version.py" + +[tool.setuptools] +packages = ["stop_words"] +package-dir = {"" = "src"} +package-data = {stop_words = [ + "stop-words/*.txt", + "stop-words/languages.json", +]} + + +[tool.mypy] +python_version = "3.13" +exclude_gitignore = true + +[tool.coverage.run] +cover_pylib = false +omit = [ + "*site-packages*", + "*distutils*", + "venv/*", + ".venv/*", + "_version.py", +] + +[tool.coverage.report] +precision = 3 +show_missing = true +ignore_errors = true +# Regexes for lines to exclude from consideration +exclude_lines = [ + # Have to re-enable the standard pragma + "pragma: no cover", + + # Don't complain about missing debug-only code: + "def __repr__", + "def __str__", + "if self\\.debug", + + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + + # Don't complain if non-runnable code isn't run: + "if 0:", + "if __name__ == .__main__.:", +] +skip_covered = true + + +[tool.black] +line-length = 120 +target-version = ['py313'] +extend-exclude = ''' +/( + build + | \.venv + | src/stop_words/stop-words +)/ +''' + + +[tool.isort] +line_length = 120 +extend_skip = ["src/stop_words/stop-words", "src/stop_words/_version.py"] +sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" +indent = 4 +multi_line_output = 3 +include_trailing_comma = true +order_by_type = true +combine_as_imports = true +lines_after_imports = 2 +float_to_top = true +atomic = true diff --git a/setup.py b/setup.py deleted file mode 100644 index 53a5f11..0000000 --- a/setup.py +++ /dev/null 
@@ -1,37 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name='stop-words', - version=__import__("stop_words").get_version(), - description='Get list of common stop words in various languages in Python', - long_description=open('README.rst').read(), - license=open('LICENSE').read(), - author='Alireza Savand', - author_email='alireza.savand@gmail.com', - url='https://github.com/Alir3z4/python-stop-words', - packages=find_packages(), - zip_safe=False, - package_data={ - 'stop_words': [ - 'stop-words/*.txt', - 'stop-words/languages.json', - ] - }, - classifiers=[ - 'Programming Language :: Python', - 'Intended Audience :: Developers', - 'Operating System :: OS Independent', - 'Topic :: Software Development', - 'Development Status :: 6 - Mature', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Topic :: Text Processing', - 'Topic :: Text Processing :: Filters', - 'License :: OSI Approved :: BSD License', - ], -) diff --git a/src/stop_words/__init__.py b/src/stop_words/__init__.py new file mode 100644 index 0000000..5c23378 --- /dev/null +++ b/src/stop_words/__init__.py @@ -0,0 +1,187 @@ +""" +Stop Words Library + +A module for loading and managing stop words across multiple languages. +Stop words are common words that are typically filtered out in text processing. 
+
+This module provides:
+- Loading stop words from language-specific files
+- Caching for performance optimization
+- Custom filtering system for post-processing stop words
+- Language code mapping (e.g., 'en' -> 'english')
+"""
+
+import json
+from pathlib import Path
+from typing import Callable
+
+
+# Directory configuration
+CURRENT_DIR = Path(__file__).resolve().parent
+STOP_WORDS_DIR = CURRENT_DIR / "stop-words"
+
+# Module-level state: per-language word cache and registered filter functions (None key = global filters)
+STOP_WORDS_CACHE: dict[str, list[str]] = {}
+_filters: dict[str | None, list[Callable[[list[str], str | None], list[str]]]] = {None: []}
+
+# Load language mapping at import time (fails fast if the stop-words submodule is absent)
+_languages_file = STOP_WORDS_DIR / "languages.json"
+with _languages_file.open("r", encoding="utf-8") as f:
+    LANGUAGE_MAPPING: dict[str, str] = json.load(f)
+
+AVAILABLE_LANGUAGES: list[str] = list(LANGUAGE_MAPPING.values())
+
+
+class StopWordError(Exception):
+    """Raised when a requested language is unavailable or files are unreadable."""
+
+    pass
+
+
+def get_version() -> str:
+    """
+    Get the version of the stop words library.
+
+    :returns: The version string from the setuptools-scm generated _version module.
+    """
+    from ._version import __version__  # type: ignore
+
+    return __version__
+
+
+def get_stop_words(language: str, *, cache: bool = True) -> list[str]:
+    """
+    Load stop words for a specified language.
+
+    :param language: Language code (e.g., 'en', 'es') or full name (e.g., 'english', 'spanish').
+        Supports both ISO codes and full language names via LANGUAGE_MAPPING.
+    :param cache: If True, cache the results for faster subsequent access. Defaults to True.
+
+    :returns: A list of stop words for the specified language. Returns a copy to prevent external modification.
+    :raises StopWordError: If the language is not available or the file cannot be read.
+ + Example: + >>> words = get_stop_words('en') + >>> 'the' in words + True + """ + # Normalize language code to full name + try: + language = LANGUAGE_MAPPING[language] + except KeyError: + if language not in AVAILABLE_LANGUAGES: + raise StopWordError( + f'Language "{language}" is unavailable. ' + f'Available languages: {", ".join(sorted(AVAILABLE_LANGUAGES))}' + ) + + # Return cached version if available + if cache and language in STOP_WORDS_CACHE: + return STOP_WORDS_CACHE[language].copy() + + # Load stop words from file + language_file = STOP_WORDS_DIR / f"{language}.txt" + + try: + with language_file.open("r", encoding="utf-8") as f: + stop_words = [line.strip() for line in f if line.strip()] + stop_words = apply_filters(stop_words, language) + except (IOError, OSError) as e: + raise StopWordError(f'File "{language_file}" is unreadable. Check your installation. Error: {e}') from e + + # Cache if requested + if cache: + STOP_WORDS_CACHE[language] = stop_words + + return stop_words.copy() + + +def apply_filters(stopwords: list[str], language: str | None) -> list[str]: + """ + Apply registered filters to stop words. + + Filters can modify, remove, or add stop words. Language-specific filters + are applied first, followed by global filters (registered with language=None). + + :param stopwords: List of stop words to filter. + :param language: Language code for language-specific filters. + + :returns: Filtered list of stop words. + """ + # Apply language-specific filters + if language in _filters: + for func in _filters[language]: + stopwords = func(stopwords, language) + + # Apply global filters + for func in _filters[None]: + stopwords = func(stopwords, language) + + return stopwords + + +def add_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> None: + """ + Register a filter function for stop word post-processing. 
+
+    All filters receive: func(stopwords: list[str], language: str | None) -> list[str]
+    (language-specific and global filters alike are called with both arguments).
+
+    Note: Filters only apply to newly loaded stop words, not cached ones.
+    Clear the cache with STOP_WORDS_CACHE.clear() to reapply filters.
+
+    :param func: Callable taking the stop-word list and the language; returns a modified list.
+    :param language: Language code for language-specific filter, or None for global filter.
+
+    Example:
+        >>> # Add a filter to uppercase all stop words for English
+        >>> add_filter(lambda words, lang: [w.upper() for w in words], language='english')
+        >>> # Add a global filter to remove single-character words
+        >>> add_filter(lambda words, lang: [w for w in words if len(w) > 1])
+    """
+    if language is None:
+        _filters[None].append(func)
+        return
+
+    if language not in _filters:
+        _filters[language] = []
+
+    _filters[language].append(func)
+
+
+def remove_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> bool:
+    """
+    Unregister a previously registered filter function.
+
+    :param func: The filter function to remove.
+    :param language: Language code or None for global filters.
+
+    :returns: True if the filter was found and removed, False otherwise.
+    """
+    if language not in _filters or func not in _filters[language]:
+        return False
+
+    _filters[language].remove(func)
+    return True
+
+
+def safe_get_stop_words(language: str) -> list[str]:
+    """
+    Safely load stop words, returning an empty list on error.
+
+    This is a convenience wrapper around get_stop_words() that catches
+    StopWordError exceptions and returns an empty list instead.
+
+    :param language: Language code or full name.
+
+    :returns: Stop words for the language, or empty list if unavailable.
+ + Example: + >>> words = safe_get_stop_words('unknown_language') + >>> words + [] + """ + try: + return get_stop_words(language) + except StopWordError: + return [] diff --git a/src/stop_words/stop-words b/src/stop_words/stop-words new file mode 160000 index 0000000..6e4b92b --- /dev/null +++ b/src/stop_words/stop-words @@ -0,0 +1 @@ +Subproject commit 6e4b92b5522f91c12264b6989d1d75269652745d diff --git a/src/tests.py b/src/tests.py new file mode 100644 index 0000000..7fc6b86 --- /dev/null +++ b/src/tests.py @@ -0,0 +1,435 @@ +import random +from pathlib import Path +from unittest import TestCase + +import stop_words +from stop_words import ( + AVAILABLE_LANGUAGES, + LANGUAGE_MAPPING, + STOP_WORDS_CACHE, + StopWordError, + add_filter, + get_stop_words, + get_version, + remove_filter, + safe_get_stop_words, +) + + +class TestStopWordsBasic(TestCase): + """Test basic stop word loading functionality.""" + + NUMBER_OF_ENGLISH_STOP_WORDS = 1333 + + def test_get_stop_words_returns_list(self) -> None: + """Stop words should be returned as a list.""" + sw = get_stop_words("english") + self.assertIsInstance(sw, list) + self.assertEqual(len(sw), self.NUMBER_OF_ENGLISH_STOP_WORDS) + + def test_get_stop_words_contains_strings(self) -> None: + """All stop words should be strings.""" + sw = get_stop_words("english") + self.assertTrue(all(isinstance(word, str) for word in sw)) + + def test_get_stop_words_no_empty_strings(self) -> None: + """Stop words should not contain empty strings.""" + sw = get_stop_words("english") + self.assertTrue(all(word.strip() for word in sw)) + + def test_get_stop_words_language_mapping(self) -> None: + """Language codes should map to full language names.""" + sw_code = get_stop_words("en") + sw_full = get_stop_words("english") + self.assertEqual(len(sw_code), self.NUMBER_OF_ENGLISH_STOP_WORDS) + self.assertEqual(sw_code, sw_full) + + def test_common_english_stop_words(self) -> None: + """Common English stop words should be present.""" + sw = 
get_stop_words("en") + common_words = ["the", "a", "an", "and", "or", "but", "is", "are"] + for word in common_words: + self.assertIn(word, sw, f"Expected '{word}' in English stop words") + + def test_get_version(self) -> None: + self.assertIsNotNone(get_version()) + + +class TestStopWordsCache(TestCase): + """Test caching behavior.""" + + def setUp(self) -> None: + """Clear cache before each test.""" + STOP_WORDS_CACHE.clear() + + def test_cache_enabled_by_default(self) -> None: + """Cache should be enabled by default.""" + self.assertNotIn("french", STOP_WORDS_CACHE) + get_stop_words("fr") + self.assertIn("french", STOP_WORDS_CACHE) + + def test_cache_disabled(self) -> None: + """Cache should not be used when cache=False.""" + self.assertNotIn("german", STOP_WORDS_CACHE) + get_stop_words("de", cache=False) + self.assertNotIn("german", STOP_WORDS_CACHE) + + def test_cache_persists_across_calls(self) -> None: + """Cached stop words should persist across calls.""" + original_dir = stop_words.STOP_WORDS_DIR + + # Load and cache + sw1 = get_stop_words("fr") + self.assertIn("french", STOP_WORDS_CACHE) + + # Break the file system path + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + # Should still work from cache + sw2 = get_stop_words("french") + self.assertEqual(sw1, sw2) + + # Restore + stop_words.STOP_WORDS_DIR = original_dir + + def test_cache_miss_raises_error(self) -> None: + """Cache miss with invalid path should raise error.""" + original_dir = stop_words.STOP_WORDS_DIR + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + with self.assertRaises(StopWordError): + get_stop_words("spanish") + + self.assertNotIn("spanish", STOP_WORDS_CACHE) + stop_words.STOP_WORDS_DIR = original_dir + + def test_returns_copy_not_reference(self) -> None: + """get_stop_words should return a copy, not the cached reference.""" + sw1 = get_stop_words("en") + sw2 = get_stop_words("en") + + # Modify one list + sw1.append("custom_word") + + # The other should be 
unchanged + self.assertNotIn("custom_word", sw2) + + # Cache should also be unchanged + sw3 = get_stop_words("en") + self.assertNotIn("custom_word", sw3) + + +class TestStopWordsErrors(TestCase): + """Test error handling.""" + + def test_unavailable_language_raises_error(self) -> None: + """Unknown languages should raise StopWordError.""" + with self.assertRaises(StopWordError) as ctx: + get_stop_words("sindarin") + self.assertIn("sindarin", str(ctx.exception).lower()) + + def test_missing_file_raises_error(self) -> None: + """Missing language files should raise StopWordError.""" + original_dir = stop_words.STOP_WORDS_DIR + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + with self.assertRaises(StopWordError) as ctx: + get_stop_words("german", cache=False) + + self.assertIn("unreadable", str(ctx.exception).lower()) + stop_words.STOP_WORDS_DIR = original_dir + + def test_safe_get_stop_words_no_exception(self) -> None: + """safe_get_stop_words should never raise exceptions.""" + result = safe_get_stop_words("klingon") + self.assertEqual(result, []) + self.assertIsInstance(result, list) + + def test_safe_get_stop_words_with_valid_language(self) -> None: + """safe_get_stop_words should work with valid languages.""" + result = safe_get_stop_words("en") + self.assertGreater(len(result), 0) + + def test_error_message_includes_available_languages(self) -> None: + """Error message should hint at available languages.""" + with self.assertRaises(StopWordError) as ctx: + get_stop_words("notreal") + error_msg = str(ctx.exception).lower() + self.assertIn("available", error_msg) + + +class TestStopWordsFilters(TestCase): + """Test the filter system.""" + + def setUp(self) -> None: + """Clear cache and filters before each test.""" + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + def tearDown(self) -> None: + """Clean up filters after each test.""" + stop_words._filters.clear() + stop_words._filters[None] = [] + + def 
test_global_filter_removes_words(self) -> None: + """Global filters should modify all languages.""" + + def remove_short_words(words: list[str], _lang: str | None = None) -> list[str]: + return [w for w in words if len(w) > 3] + + add_filter(remove_short_words) + sw = get_stop_words("en", cache=False) + + self.assertTrue(all(len(word) > 3 for word in sw)) + + def test_language_specific_filter(self) -> None: + """Language-specific filters should only affect that language.""" + + def uppercase_filter(words: list[str], _language: str | None = None) -> list[str]: + return [w.upper() for w in words] + + add_filter(uppercase_filter, language="english") + + # English should be uppercase + en_words = get_stop_words("en", cache=False) + self.assertTrue(all(w.isupper() for w in en_words if not w.isnumeric())) + + # Other languages should be unaffected + fr_words = get_stop_words("fr", cache=False) + self.assertFalse(all(w.isupper() for w in fr_words)) + + def test_multiple_filters_chain(self) -> None: + """Multiple filters should be applied in sequence.""" + + def add_prefix(words: list[str], _lang: str | None = None) -> list[str]: + return [f"prefix_{w}" for w in words] + + def add_suffix(words: list[str], _lang: str | None = None) -> list[str]: + return [f"{w}_suffix" for w in words] + + add_filter(add_prefix) + add_filter(add_suffix) + + sw = get_stop_words("en", cache=False) + sample_word = sw[0] + + self.assertTrue(sample_word.startswith("prefix_")) + self.assertTrue(sample_word.endswith("_suffix")) + + def test_remove_filter_returns_true(self) -> None: + """Removing an existing filter should return True.""" + + def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: + return words + + add_filter(dummy_filter) + + # Calling it to get the `dummy_filter` actually execute. 
+ get_stop_words("en") + + result = remove_filter(dummy_filter) + self.assertTrue(result) + + def test_remove_nonexistent_filter_returns_false(self) -> None: + """Removing a non-existent filter should return False.""" + + def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: + return words # pragma: no cover + + result = remove_filter(dummy_filter) + self.assertFalse(result) + + def test_remove_filter_with_language(self) -> None: + """Language-specific filter removal should work.""" + + def lang_filter(words: list[str], _language: str | None = None) -> list[str]: + return words + + add_filter(lang_filter, language="english") + + # Calling it to get the `lang_filter` actually execute. + get_stop_words("en") + + result = remove_filter(lang_filter, language="english") + self.assertTrue(result) + + # Should return False when trying to remove again + result = remove_filter(lang_filter, language="english") + self.assertFalse(result) + + def test_filter_with_random_letter_removal(self) -> None: + """Original test: remove words containing a random letter.""" + language = "en" + before = get_stop_words(language, cache=False) + letter = random.choice(random.choice(before)) + + def remove_letter(words: list[str], _lang: str | None = None) -> list[str]: + return [w for w in words if letter not in w] + + add_filter(remove_letter) + after = get_stop_words(language, cache=False) + + for word in after: + self.assertNotIn(letter, word) + + self.assertTrue(remove_filter(remove_letter)) + + +class TestStopWordsAllLanguages(TestCase): + """Test all available languages.""" + + def test_all_mapped_languages_loadable(self) -> None: + """All languages in LANGUAGE_MAPPING should be loadable.""" + for code, full_name in LANGUAGE_MAPPING.items(): + with self.subTest(code=code, language=full_name): + sw = safe_get_stop_words(code) + self.assertGreater(len(sw), 0, f"No stop words loaded for {full_name} ({code})") + + def test_random_language_loading(self) -> None: + 
"""Random sample of languages should all load successfully.""" + all_languages = list(LANGUAGE_MAPPING.keys()) + AVAILABLE_LANGUAGES + sample = random.sample(all_languages, min(10, len(all_languages))) + + for language in sample: + with self.subTest(language=language): + sw = safe_get_stop_words(language) + self.assertGreater(len(sw), 0, f"Cannot load stopwords for {language}") + + def test_all_languages_have_unique_words(self) -> None: + """Each language should have at least some unique characteristics.""" + # Compare English and French as they should be different + en = set(get_stop_words("en")) + fr = set(get_stop_words("fr")) + + # Should have different words + self.assertNotEqual(en, fr) + # Should have some overlap (common borrowed words) + self.assertGreater(len(en & fr), 0) + + +class TestStopWordsEdgeCases(TestCase): + """Test edge cases and boundary conditions.""" + + def test_empty_language_string(self) -> None: + """Empty language string should raise error.""" + with self.assertRaises(StopWordError): + get_stop_words("") + + def test_none_language(self) -> None: + """None as language should raise appropriate error.""" + with self.assertRaises((StopWordError, KeyError, TypeError)): + get_stop_words(None) # type: ignore + + def test_case_sensitive_language_codes(self) -> None: + """Language codes should be case-sensitive.""" + # Lowercase should work + sw_lower = get_stop_words("en") + self.assertGreater(len(sw_lower), 0) + + # Uppercase might not be in mapping + with self.assertRaises(StopWordError): + get_stop_words("EN") + + def test_whitespace_in_stop_words(self) -> None: + """Stop words should be properly stripped of whitespace.""" + sw = get_stop_words("en") + for word in sw: + self.assertEqual(word, word.strip(), f"Word '{word}' has extra whitespace") + + def test_duplicate_stop_words(self) -> None: + """Stop words list should not contain duplicates.""" + sw = get_stop_words("en") + unique_words = set(sw) + self.assertEqual(len(sw), 
len(unique_words), "Stop words list contains duplicates") + + def test_filter_returns_empty_list(self) -> None: + """Filter that returns empty list should work.""" + + def remove_all(words: list[str], _lang: str | None = None) -> list[str]: + return [] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + add_filter(remove_all) + sw = get_stop_words("en", cache=False) + self.assertEqual(sw, []) + + # Cleanup + remove_filter(remove_all) + + def test_filter_adds_words(self) -> None: + """Filter that adds words should work.""" + + def add_custom(words: list[str], _lang: str | None = None) -> list[str]: + return words + ["custom1", "custom2"] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + add_filter(add_custom) + sw = get_stop_words("en", cache=False) + + self.assertIn("custom1", sw) + self.assertIn("custom2", sw) + + # Cleanup + remove_filter(add_custom) + + def test_concurrent_filter_modifications(self) -> None: + """Adding and removing filters should be safe.""" + filters = [ + lambda w, language: w, + lambda w, language: [word.upper() for word in w], + lambda w, language: [word.lower() for word in w], + ] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + # Add all filters + for f in filters: + add_filter(f) + + # Remove them in different order + for f in reversed(filters): + remove_filter(f) + + # Should be back to empty + self.assertEqual(len(stop_words._filters[None]), 0) + + +class TestStopWordsConfiguration(TestCase): + """Test module configuration and constants.""" + + def test_available_languages_is_list(self) -> None: + """AVAILABLE_LANGUAGES should be a list.""" + self.assertIsInstance(AVAILABLE_LANGUAGES, list) + self.assertGreater(len(AVAILABLE_LANGUAGES), 0) + + def test_language_mapping_is_dict(self) -> None: + """LANGUAGE_MAPPING should be a dictionary.""" + self.assertIsInstance(LANGUAGE_MAPPING, dict) + 
self.assertGreater(len(LANGUAGE_MAPPING), 0) + + def test_cache_is_dict(self) -> None: + """STOP_WORDS_CACHE should be a dictionary.""" + self.assertIsInstance(STOP_WORDS_CACHE, dict) + + def test_stop_words_dir_exists(self) -> None: + """STOP_WORDS_DIR should point to an existing directory.""" + self.assertTrue( + stop_words.STOP_WORDS_DIR.exists(), + f"Stop words directory not found: {stop_words.STOP_WORDS_DIR}", + ) + self.assertTrue(stop_words.STOP_WORDS_DIR.is_dir()) + + def test_language_files_exist(self) -> None: + """Language files referenced in mapping should exist.""" + for lang_name in AVAILABLE_LANGUAGES: + lang_file = stop_words.STOP_WORDS_DIR / f"{lang_name}.txt" + self.assertTrue(lang_file.exists(), f"Language file missing: {lang_file}") diff --git a/stop_words/__init__.py b/stop_words/__init__.py deleted file mode 100644 index 1cb6cb0..0000000 --- a/stop_words/__init__.py +++ /dev/null @@ -1,118 +0,0 @@ -import json -import os - -__VERSION__ = (2018, 7, 23) -CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) -STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words') -STOP_WORDS_CACHE = {} - -with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file: - buffer = map_file.read() - buffer = buffer.decode('ascii') - LANGUAGE_MAPPING = json.loads(buffer) - -AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values()) - - -def get_version(): - """ - :rtype: basestring - """ - return ".".join(str(v) for v in __VERSION__) - - -class StopWordError(Exception): - pass - - -def get_stop_words(language, cache=True): - """ - :type language: basestring - - :rtype: list - """ - try: - language = LANGUAGE_MAPPING[language] - except KeyError: - if language not in AVAILABLE_LANGUAGES: - raise StopWordError('{0}" language is unavailable.'.format( - language - )) - - if cache and language in STOP_WORDS_CACHE: - return STOP_WORDS_CACHE[language] - - language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt') - try: - with open(language_filename, 
'rb') as language_file: - stop_words = [line.decode('utf-8').strip() - for line in language_file.readlines()] - stop_words = apply_filters(stop_words, language) - except IOError: - raise StopWordError( - '{0}" file is unreadable, check your installation.'.format( - language_filename - ) - ) - - if cache: - STOP_WORDS_CACHE[language] = stop_words - - return stop_words[:] # copy list, prevent being modified - -_filters = {None: []} - - -def apply_filters(stopwords, language): - """ - Apply registered filters to stopwords - :param stopwords: list - :param language: string - :return: filtered stopwords - """ - if language in _filters: - for func in _filters[language]: - stopwords = func(stopwords) - - for func in _filters[None]: - stopwords = func(stopwords, language) - - return stopwords - - -def add_filter(func, language=None): - """ - Register filters for specific language. - If language == None the filter applies for all languages. - Filter will not apply for stop words in cache. - :param func: callable - :param language: string|None - :return: - """ - if language not in _filters: - _filters[language] = [] - _filters[language].append(func) - - -def remove_filter(func, language=None): - """ - :param func: - :param language: - :return: - """ - if not (language in _filters and func in _filters[language]): - return False - _filters[language].remove(func) - return True - - -def safe_get_stop_words(language): - """ - :type language: basestring - - :rtype: list - """ - try: - return get_stop_words(language) - except StopWordError: - return [] diff --git a/stop_words/stop-words b/stop_words/stop-words deleted file mode 160000 index 522e4e3..0000000 --- a/stop_words/stop-words +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 522e4e37a11ca1d00dd513e01f0741a9689bb062 diff --git a/stop_words/tests.py b/stop_words/tests.py deleted file mode 100644 index 3249e5f..0000000 --- a/stop_words/tests.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Tests for stop-words -""" -import random -from 
unittest import TestCase -from unittest import TestSuite -from unittest import TestLoader - -import stop_words -from stop_words import get_stop_words -from stop_words import safe_get_stop_words -from stop_words import StopWordError -from stop_words import LANGUAGE_MAPPING -from stop_words import AVAILABLE_LANGUAGES - - -class StopWordsTestCase(TestCase): - number_of_english_stop_words = 1298 - - def test_get_stop_words(self): - sw = get_stop_words('english') - self.assertEqual(len(sw), self.number_of_english_stop_words) - - def test_get_stop_words_language_mapping(self): - sw = get_stop_words('en') - self.assertEqual(len(sw), self.number_of_english_stop_words) - self.assertEqual(sw, get_stop_words('english')) - - def test_get_stop_words_cache(self): - self.assertFalse('french' in stop_words.STOP_WORDS_CACHE) - sw = get_stop_words('fr') - self.assertTrue('french' in stop_words.STOP_WORDS_CACHE) - original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = 'not-existing-directory' - self.assertEqual(sw, get_stop_words('french')) - stop_words.STOP_WORDS_DIR = original_stop_words_dir - try: - get_stop_words('klingon') - except: - pass - self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE) - - def test_get_stop_words_unavailable_language(self): - self.assertRaises(StopWordError, get_stop_words, 'sindarin') - - def test_get_stop_words_install_issue(self): - original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = 'not-existing-directory' - self.assertRaises(StopWordError, get_stop_words, 'german') - stop_words.STOP_WORDS_DIR = original_stop_words_dir - - def test_safe_get_stop_words(self): - self.assertRaises(StopWordError, get_stop_words, 'huttese') - self.assertEqual(safe_get_stop_words('huttese'), []) - - def test_random_language_stop_words_load(self): - languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES) - sample = random.sample(languages, len(languages)) - for language in sample: - stop_words = 
safe_get_stop_words(language) - self.assertTrue( - len(stop_words) > 0, - 'Cannot load stopwords for {0} language'.format(language) - ) - - def test_filters(self): - language = 'en' - before = get_stop_words(language, False) - letter = random.choice(random.choice(before)) - - def remove_letter(stopwords, language): - return [word for word in stopwords if letter not in word] - stop_words.add_filter(remove_letter) - after = get_stop_words(language, False) - for stopword in after: - self.assertFalse(letter in stopword) - self.assertTrue(stop_words.remove_filter(remove_letter)) - - -loader = TestLoader() - -test_suite = TestSuite( - [ - loader.loadTestsFromTestCase(StopWordsTestCase), - ] -)