From 1c694b32574aacdac4f6900bcc24348ab88e8a32 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 30 Oct 2025 16:14:49 +0300 Subject: [PATCH 1/8] Upgrading stack and environment Getting ready for release. --- .editorconfig | 21 ++++ .github/workflows/main.yml | 65 ++++++++++++ .github/workflows/pypi.yml | 34 ++++++ .gitignore | 5 +- .travis.yml | 24 ----- Makefile | 27 +++++ README.rst | 46 ++------ bootstrap.py | 210 ------------------------------------- buildout.cfg | 47 --------- flake8.ini | 8 ++ mypy.ini | 2 + pyproject.toml | 94 +++++++++++++++++ setup.py | 37 ------- stop_words/__init__.py | 56 +++++----- stop_words/stop-words | 2 +- stop_words/tests.py | 93 +++++++--------- 16 files changed, 334 insertions(+), 437 deletions(-) create mode 100644 .editorconfig create mode 100644 .github/workflows/main.yml create mode 100644 .github/workflows/pypi.yml delete mode 100644 .travis.yml create mode 100644 Makefile delete mode 100644 bootstrap.py delete mode 100644 buildout.cfg create mode 100644 flake8.ini create mode 100644 mypy.ini create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..0bb0f0b --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# http://editorconfig.org +root = true + +[*] +indent_size = 2 +indent_style = space +end_of_line = lf +charset = utf-8 +max_line_length = 120 +insert_final_newline = true + +[*.py] +indent_size = 4 +trim_trailing_whitespace = true + + +[*.rst] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..6464ea8 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,65 @@ +name: CI + +# Controls when the action will run. 
+on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + branches: [ master ] + pull_request: + branches: [ master ] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + code-quality: + runs-on: ubuntu-latest + + name: "Linting" + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: setup python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install dependencies + run: make install + + - name: Linting + run: make lint + + test: + # The type of runner that the job will run on + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13", "3.14"] + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + name: "Python ${{ matrix.python-version }}" + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: make install + + - name: Run tests + run: make coverage + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5 + with: + flags: unittests-${{ matrix.python-version }} + fail_ci_if_error: true # default = false + verbose: true # default = false diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..95e91d2 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,34 @@ +name: Publish to PyPI + +on: + release: + types: [released] + +jobs: + release: + name: Release + environment: + name: pypi + url: https://pypi.org/project/stop-words + permissions: + id-token: write + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Build + run: | + git submodule update --remote --rebase + python -m pip install build + python -m build + + - name: Publish 
package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + verbose: true + print-hash: true diff --git a/.gitignore b/.gitignore index 6c27d15..2c82bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,10 @@ dist/ *.egg-info/ logs/ src/ -.c9/ bin/ develop-eggs/ eggs/ +coverage.xml +.coverage +build +stop_words/_version.py diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ba07717..0000000 --- a/.travis.yml +++ /dev/null @@ -1,24 +0,0 @@ -language: python -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7-dev" # 3.7 development branch - - "nightly" # currently points to 3.7-dev -install: - - git submodule init - - git submodule update - - git submodule foreach git pull origin master - - pip install -U setuptools coveralls - - python bootstrap.py - - ./bin/buildout -before_script: - - ./bin/flake8 stop_words -script: - - ./bin/cover -notifications: - irc: - - "irc.freenode.org#python-stop-words" -after_success: - coveralls diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..96b5ec4 --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +.PHONY: install test build release clean + +install: + pip install '.[dev]' + pip install build + +update_stop_words: + git submodule update --remote --rebase + +test: + python -m unittest discover + +coverage: + coverage run -m unittest discover + coverage report + coverage xml + +build: + python -m build + +clean: + rm -rf dist *.egg-info coverage.xml build .coverage + +lint: + black stop_words/ + flake8 stop_words/ --config flake8.ini + mypy stop_words/ --install-types --non-interactive diff --git a/README.rst b/README.rst index d1fb367..7a23d0e 100644 --- a/README.rst +++ b/README.rst @@ -9,30 +9,6 @@ Overview Get list of common stop words in various languages in Python. -.. image:: https://secure.travis-ci.org/Alir3z4/python-stop-words.png - :alt: Build Status - :target: http://travis-ci.org/Alir3z4/python-stop-words - -.. 
image:: https://coveralls.io/repos/Alir3z4/python-stop-words/badge.png - :alt: Coverage Status - :target: https://coveralls.io/r/Alir3z4/python-stop-words - -.. image:: http://badge.kloud51.com/pypi/v/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Version - -.. image:: http://badge.kloud51.com/pypi/s/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Status - -.. image:: http://badge.kloud51.com/pypi/l/stop-words.svg - :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE - :alt: License - -.. image:: http://badge.kloud51.com/pypi/p/stop-words.svg - :target: https://pypi.python.org/pypi/stop-words - :alt: PyPI Py_versions - Available languages ------------------- @@ -40,6 +16,7 @@ Available languages * Arabic * Bulgarian * Catalan +* Chinese * Czech * Danish * Dutch @@ -47,18 +24,28 @@ Available languages * Finnish * French * German +* Greek +* Gujarati +* Hindi +* Hebrew * Hungarian * Indonesian +* Malaysian * Italian +* Japanese +* Korean * Norwegian * Polish * Portuguese * Romanian * Russian +* Slovak * Spanish * Swedish * Turkish * Ukrainian +* Vietnamese +* Persian/Farsi Installation @@ -94,14 +81,3 @@ Basic usage from stop_words import safe_get_stop_words stop_words = safe_get_stop_words('unsupported language') - -Python compatibility --------------------- - -Python Stop Words is compatible with: - -* Python 2.7 -* Python 3.4 -* Python 3.5 -* Python 3.6 -* Python 3.7 diff --git a/bootstrap.py b/bootstrap.py deleted file mode 100644 index 1f59b21..0000000 --- a/bootstrap.py +++ /dev/null @@ -1,210 +0,0 @@ -############################################################################## -# -# Copyright (c) 2006 Zope Foundation and Contributors. -# All Rights Reserved. -# -# This software is subject to the provisions of the Zope Public License, -# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution. 
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED -# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS -# FOR A PARTICULAR PURPOSE. -# -############################################################################## -"""Bootstrap a buildout-based project - -Simply run this script in a directory containing a buildout.cfg. -The script accepts buildout command-line options, so you can -use the -c option to specify an alternate configuration file. -""" - -import os -import shutil -import sys -import tempfile - -from optparse import OptionParser - -__version__ = '2015-07-01' -# See zc.buildout's changelog if this version is up to date. - -tmpeggs = tempfile.mkdtemp(prefix='bootstrap-') - -usage = '''\ -[DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options] - -Bootstraps a buildout-based project. - -Simply run this script in a directory containing a buildout.cfg, using the -Python that you want bin/buildout to use. - -Note that by using --find-links to point to local resources, you can keep -this script from going over the network. -''' - -parser = OptionParser(usage=usage) -parser.add_option("--version", - action="store_true", default=False, - help=("Return bootstrap.py version.")) -parser.add_option("-t", "--accept-buildout-test-releases", - dest='accept_buildout_test_releases', - action="store_true", default=False, - help=("Normally, if you do not specify a --buildout-version, " - "the bootstrap script and buildout gets the newest " - "*final* versions of zc.buildout and its recipes and " - "extensions for you. 
If you use this flag, " - "bootstrap and buildout will get the newest releases " - "even if they are alphas or betas.")) -parser.add_option("-c", "--config-file", - help=("Specify the path to the buildout configuration " - "file to be used.")) -parser.add_option("-f", "--find-links", - help=("Specify a URL to search for buildout releases")) -parser.add_option("--allow-site-packages", - action="store_true", default=False, - help=("Let bootstrap.py use existing site packages")) -parser.add_option("--buildout-version", - help="Use a specific zc.buildout version") -parser.add_option("--setuptools-version", - help="Use a specific setuptools version") -parser.add_option("--setuptools-to-dir", - help=("Allow for re-use of existing directory of " - "setuptools versions")) - -options, args = parser.parse_args() -if options.version: - print("bootstrap.py version %s" % __version__) - sys.exit(0) - - -###################################################################### -# load/install setuptools - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - -ez = {} -if os.path.exists('ez_setup.py'): - exec(open('ez_setup.py').read(), ez) -else: - exec(urlopen('https://bootstrap.pypa.io/ez_setup.py').read(), ez) - -if not options.allow_site_packages: - # ez_setup imports site, which adds site packages - # this will remove them from the path to ensure that incompatible versions - # of setuptools are not in the path - import site - # inside a virtualenv, there is no 'getsitepackages'. - # We can't remove these reliably - if hasattr(site, 'getsitepackages'): - for sitepackage_path in site.getsitepackages(): - # Strip all site-packages directories from sys.path that - # are not sys.prefix; this is because on Windows - # sys.prefix is a site-package directory. 
- if sitepackage_path != sys.prefix: - sys.path[:] = [x for x in sys.path - if sitepackage_path not in x] - -setup_args = dict(to_dir=tmpeggs, download_delay=0) - -if options.setuptools_version is not None: - setup_args['version'] = options.setuptools_version -if options.setuptools_to_dir is not None: - setup_args['to_dir'] = options.setuptools_to_dir - -ez['use_setuptools'](**setup_args) -import setuptools -import pkg_resources - -# This does not (always?) update the default working set. We will -# do it. -for path in sys.path: - if path not in pkg_resources.working_set.entries: - pkg_resources.working_set.add_entry(path) - -###################################################################### -# Install buildout - -ws = pkg_resources.working_set - -setuptools_path = ws.find( - pkg_resources.Requirement.parse('setuptools')).location - -# Fix sys.path here as easy_install.pth added before PYTHONPATH -cmd = [sys.executable, '-c', - 'import sys; sys.path[0:0] = [%r]; ' % setuptools_path + - 'from setuptools.command.easy_install import main; main()', - '-mZqNxd', tmpeggs] - -find_links = os.environ.get( - 'bootstrap-testing-find-links', - options.find_links or - ('http://downloads.buildout.org/' - if options.accept_buildout_test_releases else None) - ) -if find_links: - cmd.extend(['-f', find_links]) - -requirement = 'zc.buildout' -version = options.buildout_version -if version is None and not options.accept_buildout_test_releases: - # Figure out the most recent final version of zc.buildout. 
- import setuptools.package_index - _final_parts = '*final-', '*final' - - def _final_version(parsed_version): - try: - return not parsed_version.is_prerelease - except AttributeError: - # Older setuptools - for part in parsed_version: - if (part[:1] == '*') and (part not in _final_parts): - return False - return True - - index = setuptools.package_index.PackageIndex( - search_path=[setuptools_path]) - if find_links: - index.add_find_links((find_links,)) - req = pkg_resources.Requirement.parse(requirement) - if index.obtain(req) is not None: - best = [] - bestv = None - for dist in index[req.project_name]: - distv = dist.parsed_version - if _final_version(distv): - if bestv is None or distv > bestv: - best = [dist] - bestv = distv - elif distv == bestv: - best.append(dist) - if best: - best.sort() - version = best[-1].version -if version: - requirement = '=='.join((requirement, version)) -cmd.append(requirement) - -import subprocess -if subprocess.call(cmd) != 0: - raise Exception( - "Failed to execute command:\n%s" % repr(cmd)[1:-1]) - -###################################################################### -# Import and run buildout - -ws.add_entry(tmpeggs) -ws.require(requirement) -import zc.buildout.buildout - -if not [a for a in args if '=' not in a]: - args.append('bootstrap') - -# if -c was provided, we push it back into args for buildout' main function -if options.config_file is not None: - args[0:0] = ['-c', options.config_file] - -zc.buildout.buildout.main(args) -shutil.rmtree(tmpeggs) diff --git a/buildout.cfg b/buildout.cfg deleted file mode 100644 index ae67c4b..0000000 --- a/buildout.cfg +++ /dev/null @@ -1,47 +0,0 @@ -[buildout] -develop = . 
-parts = test - cover - flake8 - evolve -show-picked-versions = true - -[evolve] -arguments = '-s buildout.cfg -w --indent 32 --sorting alpha' -eggs = buildout-versions-checker -recipe = zc.recipe.egg -scripts = check-buildout-updates=${:_buildout_section_name_} - -[test] -defaults = --with-progressive -eggs = nose - nose-progressive -recipe = pbp.recipe.noserunner - -[cover] -<= test -defaults = --with-coverage - --cover-erase - --cover-package=stop_words -eggs = nose - coverage - -[flake8] -eggs = flake8 -recipe = zc.recipe.egg - -[versions] -blessings = 1.6 -buildout-versions-checker = 1.5.1 -coverage = 3.7.1 -flake8 = 2.3.0 -futures = 2.2.0 -mccabe = 0.3 -nose = 1.3.4 -nose-progressive = 1.5.1 -pbp.recipe.noserunner = 0.2.6 -pep8 = 1.5.7 -pyflakes = 0.8.1 -six = 1.10.0 -zc.buildout = 2.12.1 -zc.recipe.egg = 2.0.7 diff --git a/flake8.ini b/flake8.ini new file mode 100644 index 0000000..c0115e1 --- /dev/null +++ b/flake8.ini @@ -0,0 +1,8 @@ +[flake8] +max-line-length = 120 +exclude = + venv, + .venv, + cache, + build, + stop_words/stop-words/**, diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..bdffac7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +exclude = stop_words/stop-words diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..51f5b34 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,94 @@ +[build-system] +requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"] +build-backend = "setuptools.build_meta" + +[project] +name = "stop-words" +description = "Get list of common stop words in various languages in Python" +readme = "README.rst" +authors = [{name = "Alireza Savand", email = "alireza.savand@gmail.com"}] +license = "BSD-3-Clause" +classifiers = [ + "Programming Language :: Python", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Topic :: Software Development", + "Development Status :: 6 - Mature", + "Programming Language :: Python :: 3", + "Topic :: Text Processing", 
+ "Topic :: Text Processing :: Filters", +] +urls = {Homepage = "https://github.com/Alir3z4/python-stop-words"} +requires-python = ">=3.11" +dynamic = ["version"] + +[project.optional-dependencies] +dev = [ + "black==25.9.0", + "mypy==1.18.2", + "flake8==7.3.0", + "coverage==7.11.0", +] + +[tool.setuptools_scm] +write_to = "stop_words/_version.py" + +[tool.setuptools] +packages = ["stop_words"] + +[tool.setuptools.package-data] +stop_words = [ + "stop-words/*.txt", + "stop-words/languages.json", +] + +[tool.mypy] +python_version = "3.13" + +[tool.coverage.run] +cover_pylib = false +omit = [ + "*site-packages*", + "*distutils*", + "venv/*", + ".venv/*", +] + +[tool.coverage.report] +precision = 3 +show_missing = true +ignore_errors = true +# Regexes for lines to exclude from consideration +exclude_lines = [ + # Have to re-enable the standard pragma + "pragma: no cover", + + # Don't complain about missing debug-only code: + "def __repr__", + "def __str__", + "if self\\.debug", + + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + + # Don't complain if non-runnable code isn't run: + "if 0:", + "if __name__ == .__main__.:", +] +skip_covered = true + + + +[tool.black] +line-length = 120 +#skip-magic-trailing-comma = true +target-version = ['py313'] +extend-exclude = ''' +/( + migrations + | build + | \.venv + | stop_words/stop-words +)/ +''' diff --git a/setup.py b/setup.py deleted file mode 100644 index 53a5f11..0000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name='stop-words', - version=__import__("stop_words").get_version(), - description='Get list of common stop words in various languages in Python', - long_description=open('README.rst').read(), - license=open('LICENSE').read(), - author='Alireza Savand', - author_email='alireza.savand@gmail.com', - url='https://github.com/Alir3z4/python-stop-words', - packages=find_packages(), - 
zip_safe=False, - package_data={ - 'stop_words': [ - 'stop-words/*.txt', - 'stop-words/languages.json', - ] - }, - classifiers=[ - 'Programming Language :: Python', - 'Intended Audience :: Developers', - 'Operating System :: OS Independent', - 'Topic :: Software Development', - 'Development Status :: 6 - Mature', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Topic :: Text Processing', - 'Topic :: Text Processing :: Filters', - 'License :: OSI Approved :: BSD License', - ], -) diff --git a/stop_words/__init__.py b/stop_words/__init__.py index 1cb6cb0..198b419 100644 --- a/stop_words/__init__.py +++ b/stop_words/__init__.py @@ -1,69 +1,66 @@ import json import os -__VERSION__ = (2018, 7, 23) +from typing import Callable + + CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) -STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words') -STOP_WORDS_CACHE = {} +STOP_WORDS_DIR = os.path.join(CURRENT_DIR, "stop-words") +STOP_WORDS_CACHE: dict[str, list[str]] = {} -with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file: +with open(os.path.join(STOP_WORDS_DIR, "languages.json"), "rb") as map_file: buffer = map_file.read() - buffer = buffer.decode('ascii') - LANGUAGE_MAPPING = json.loads(buffer) + LANGUAGE_MAPPING = json.loads(buffer.decode("ascii")) AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values()) -def get_version(): +def get_version() -> str: """ :rtype: basestring """ - return ".".join(str(v) for v in __VERSION__) + from ._version import __version__ + + return __version__ class StopWordError(Exception): pass -def get_stop_words(language, cache=True): +def get_stop_words(language, cache: bool = True) -> list[str]: """ - :type language: basestring - + :param language + :param cache: :rtype: list 
""" try: language = LANGUAGE_MAPPING[language] except KeyError: if language not in AVAILABLE_LANGUAGES: - raise StopWordError('{0}" language is unavailable.'.format( - language - )) + raise StopWordError('{0}" language is unavailable.'.format(language)) if cache and language in STOP_WORDS_CACHE: return STOP_WORDS_CACHE[language] - language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt') + language_filename = os.path.join(STOP_WORDS_DIR, language + ".txt") try: - with open(language_filename, 'rb') as language_file: - stop_words = [line.decode('utf-8').strip() - for line in language_file.readlines()] + with open(language_filename, "rb") as language_file: + stop_words = [line.decode("utf-8").strip() for line in language_file.readlines()] stop_words = apply_filters(stop_words, language) except IOError: - raise StopWordError( - '{0}" file is unreadable, check your installation.'.format( - language_filename - ) - ) + raise StopWordError('{0}" file is unreadable, check your installation.'.format(language_filename)) if cache: STOP_WORDS_CACHE[language] = stop_words - return stop_words[:] # copy list, prevent being modified + return stop_words[:] # copy list, prevent being modified + -_filters = {None: []} +_filters: dict[str | None, list[Callable]] = {None: []} -def apply_filters(stopwords, language): +def apply_filters(stopwords: list[str], language: str) -> list[str]: """ Apply registered filters to stopwords :param stopwords: list @@ -80,11 +77,12 @@ def apply_filters(stopwords, language): return stopwords -def add_filter(func, language=None): +def add_filter(func, language: str | None = None) -> None: """ Register filters for specific language. If language == None the filter applies for all languages. Filter will not apply for stop words in cache. 
+ :param func: callable :param language: string|None :return: @@ -94,7 +92,7 @@ def add_filter(func, language=None): _filters[language].append(func) -def remove_filter(func, language=None): +def remove_filter(func, language: str | None = None) -> bool: """ :param func: :param language: @@ -106,7 +104,7 @@ def remove_filter(func, language=None): return True -def safe_get_stop_words(language): +def safe_get_stop_words(language: str) -> list[str]: """ :type language: basestring diff --git a/stop_words/stop-words b/stop_words/stop-words index 522e4e3..6e4b92b 160000 --- a/stop_words/stop-words +++ b/stop_words/stop-words @@ -1 +1 @@ -Subproject commit 522e4e37a11ca1d00dd513e01f0741a9689bb062 +Subproject commit 6e4b92b5522f91c12264b6989d1d75269652745d diff --git a/stop_words/tests.py b/stop_words/tests.py index 3249e5f..32e6b0b 100644 --- a/stop_words/tests.py +++ b/stop_words/tests.py @@ -1,86 +1,73 @@ -""" -Tests for stop-words -""" import random from unittest import TestCase -from unittest import TestSuite -from unittest import TestLoader import stop_words +from stop_words import AVAILABLE_LANGUAGES +from stop_words import LANGUAGE_MAPPING +from stop_words import StopWordError from stop_words import get_stop_words from stop_words import safe_get_stop_words -from stop_words import StopWordError -from stop_words import LANGUAGE_MAPPING -from stop_words import AVAILABLE_LANGUAGES -class StopWordsTestCase(TestCase): - number_of_english_stop_words = 1298 +class TestStopWords(TestCase): + number_of_english_stop_words = 1333 - def test_get_stop_words(self): - sw = get_stop_words('english') + def test_get_stop_words(self) -> None: + sw = get_stop_words("english") self.assertEqual(len(sw), self.number_of_english_stop_words) - def test_get_stop_words_language_mapping(self): - sw = get_stop_words('en') + def test_get_stop_words_language_mapping(self) -> None: + sw = get_stop_words("en") self.assertEqual(len(sw), self.number_of_english_stop_words) - self.assertEqual(sw, 
get_stop_words('english')) + self.assertEqual(sw, get_stop_words("english")) - def test_get_stop_words_cache(self): - self.assertFalse('french' in stop_words.STOP_WORDS_CACHE) - sw = get_stop_words('fr') - self.assertTrue('french' in stop_words.STOP_WORDS_CACHE) + def test_get_stop_words_cache(self) -> None: + self.assertFalse("french" in stop_words.STOP_WORDS_CACHE) + sw = get_stop_words("fr") + self.assertTrue("french" in stop_words.STOP_WORDS_CACHE) original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = 'not-existing-directory' - self.assertEqual(sw, get_stop_words('french')) + stop_words.STOP_WORDS_DIR = "not-existing-directory" + self.assertEqual(sw, get_stop_words("french")) stop_words.STOP_WORDS_DIR = original_stop_words_dir try: - get_stop_words('klingon') - except: + get_stop_words("klingon") + except StopWordError: pass - self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE) + self.assertFalse("klingon" in stop_words.STOP_WORDS_CACHE) - def test_get_stop_words_unavailable_language(self): - self.assertRaises(StopWordError, get_stop_words, 'sindarin') + def test_get_stop_words_unavailable_language(self) -> None: + self.assertRaises(StopWordError, get_stop_words, "sindarin") - def test_get_stop_words_install_issue(self): + def test_get_stop_words_install_issue(self) -> None: original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = 'not-existing-directory' - self.assertRaises(StopWordError, get_stop_words, 'german') + stop_words.STOP_WORDS_DIR = "not-existing-directory" + self.assertRaises(StopWordError, get_stop_words, "german") stop_words.STOP_WORDS_DIR = original_stop_words_dir - def test_safe_get_stop_words(self): - self.assertRaises(StopWordError, get_stop_words, 'huttese') - self.assertEqual(safe_get_stop_words('huttese'), []) + def test_safe_get_stop_words(self) -> None: + self.assertRaises(StopWordError, get_stop_words, "huttese") + self.assertEqual(safe_get_stop_words("huttese"), []) - def 
test_random_language_stop_words_load(self): + def test_random_language_stop_words_load(self) -> None: languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES) sample = random.sample(languages, len(languages)) for language in sample: stop_words = safe_get_stop_words(language) self.assertTrue( len(stop_words) > 0, - 'Cannot load stopwords for {0} language'.format(language) + "Cannot load stopwords for {0} language".format(language), ) - def test_filters(self): - language = 'en' - before = get_stop_words(language, False) - letter = random.choice(random.choice(before)) - - def remove_letter(stopwords, language): - return [word for word in stopwords if letter not in word] - stop_words.add_filter(remove_letter) - after = get_stop_words(language, False) - for stopword in after: - self.assertFalse(letter in stopword) - self.assertTrue(stop_words.remove_filter(remove_letter)) - + def test_filters(self) -> None: + language = "en" + before = get_stop_words(language, False) + letter = random.choice(random.choice(before)) -loader = TestLoader() + def remove_letter(stopwords, _language: str): + return [word for word in stopwords if letter not in word] -test_suite = TestSuite( - [ - loader.loadTestsFromTestCase(StopWordsTestCase), - ] -) + stop_words.add_filter(remove_letter) + after = get_stop_words(language, False) + for stopword in after: + self.assertFalse(letter in stopword) + self.assertTrue(stop_words.remove_filter(remove_letter)) From c121f8be89cbdb318f56b589f78f4ec5b630fb6b Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 30 Oct 2025 16:16:53 +0300 Subject: [PATCH 2/8] Update stop words before attempting to install --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 96b5ec4..02ecb6b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: install test build release clean -install: +install: update_stop_words pip install '.[dev]' pip install build From 16106a8877542256e35188b8cc0f177b97923de7 
Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Thu, 30 Oct 2025 16:20:17 +0300 Subject: [PATCH 3/8] Using foreach instead of update? --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 02ecb6b..d98c52e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ install: update_stop_words pip install build update_stop_words: - git submodule update --remote --rebase + git submodule foreach git pull origin master --rebase test: python -m unittest discover From 9aa85d5cd1d0c628e2b268f8b81dc4b1b05c8759 Mon Sep 17 00:00:00 2001 From: Alireza Savand <591113+Alir3z4@users.noreply.github.com> Date: Sun, 2 Nov 2025 15:27:19 +0300 Subject: [PATCH 4/8] Checkout also the submodules using actions/checkout --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6464ea8..7fab9b8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -45,6 +45,8 @@ jobs: steps: - name: Check out code uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 From 1ca051d29caec86720bc8c0b2f8ad99a93b3a520 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 3 Nov 2025 22:22:18 +0300 Subject: [PATCH 5/8] Cleaner packaging --- .gitignore | 3 +- .gitmodules | 4 +- Makefile | 48 ++++++++++++++-------- flake8.ini | 2 +- mypy.ini | 2 - pyproject.toml | 39 +++++++++++++----- {stop_words => src/stop_words}/__init__.py | 3 +- {stop_words => src/stop_words}/stop-words | 0 {stop_words => src}/tests.py | 6 +-- 9 files changed, 66 insertions(+), 41 deletions(-) delete mode 100644 mypy.ini rename {stop_words => src/stop_words}/__init__.py (98%) rename {stop_words => src/stop_words}/stop-words (100%) rename {stop_words => src}/tests.py (93%) diff --git a/.gitignore b/.gitignore index 2c82bd1..af0b40c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,11 +11,10 @@ build/ dist/ 
*.egg-info/ logs/ -src/ bin/ develop-eggs/ eggs/ coverage.xml .coverage build -stop_words/_version.py +src/stop_words/_version.py diff --git a/.gitmodules b/.gitmodules index 7a835bd..87eff44 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "stop_words/stop-words"] - path = stop_words/stop-words +[submodule "src/stop_words/stop-words"] + path = src/stop_words/stop-words url = https://github.com/Alir3z4/stop-words.git diff --git a/Makefile b/Makefile index d98c52e..4d66219 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,43 @@ -.PHONY: install test build release clean +.PHONY: help install test coverage build clean format check-format lint precommit update-submodules -install: update_stop_words - pip install '.[dev]' - pip install build +.DEFAULT_GOAL := help -update_stop_words: - git submodule foreach git pull origin master --rebase +help: ## Display this help message + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) -test: - python -m unittest discover +install: update-submodules ## Install development dependencies + pip install -e '.[dev]' -coverage: +update-submodules: ## Update all git submodules + git submodule sync --recursive + git submodule update --init --remote --recursive + +test: ## Run test suite + python -m unittest discover -s src/ -v + +coverage: ## Generate coverage report coverage run -m unittest discover coverage report coverage xml -build: +build: ## Build source and wheel distributions python -m build -clean: - rm -rf dist *.egg-info coverage.xml build .coverage +clean: ## Remove build artifacts and temporary files + rm -rf build/ dist/ *.egg-info/ **/*.egg-info/ .coverage coverage.xml .mypy_cache/ 88 + +format: ## Auto-format code with isort and black + isort . + black . 
+ +check-format: ## Check code formatting with isort and black + isort --check-only --diff . + black --check --diff . + +lint: ## Run all code quality checks + flake8 --config=flake8.ini . + mypy src/ --install-types --non-interactive + +precommit: format lint ## Full pre-commit checks (format + lint) -lint: - black stop_words/ - flake8 stop_words/ --config flake8.ini - mypy stop_words/ --install-types --non-interactive +##@ Development Targets diff --git a/flake8.ini b/flake8.ini index c0115e1..b263f9e 100644 --- a/flake8.ini +++ b/flake8.ini @@ -5,4 +5,4 @@ exclude = .venv, cache, build, - stop_words/stop-words/**, + src/stop_words/stop-words/**, diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index bdffac7..0000000 --- a/mypy.ini +++ /dev/null @@ -1,2 +0,0 @@ -[mypy] -exclude = stop_words/stop-words diff --git a/pyproject.toml b/pyproject.toml index 51f5b34..3caf2fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,10 +18,15 @@ classifiers = [ "Topic :: Text Processing", "Topic :: Text Processing :: Filters", ] -urls = {Homepage = "https://github.com/Alir3z4/python-stop-words"} requires-python = ">=3.11" dynamic = ["version"] +[project.urls] +Homepage = "https://github.com/Alir3z4/python-stop-words" +Repository = "https://github.com/Alir3z4/python-stop-words.git" +Issues = "https://github.com/Alir3z4/python-stop-words/issues" +Changelog = "https://github.com/Alir3z4/python-stop-words/blob/master/ChangeLog.rst" + [project.optional-dependencies] dev = [ "black==25.9.0", @@ -31,19 +36,20 @@ dev = [ ] [tool.setuptools_scm] -write_to = "stop_words/_version.py" +write_to = "src/stop_words/_version.py" [tool.setuptools] packages = ["stop_words"] - -[tool.setuptools.package-data] -stop_words = [ +package-dir = {"" = "src"} +package-data = {stop_words = [ "stop-words/*.txt", "stop-words/languages.json", -] +]} + [tool.mypy] python_version = "3.13" +exclude_gitignore = true [tool.coverage.run] cover_pylib = false @@ -79,16 +85,27 @@ exclude_lines = [ 
skip_covered = true - [tool.black] line-length = 120 -#skip-magic-trailing-comma = true target-version = ['py313'] extend-exclude = ''' /( - migrations - | build + build | \.venv - | stop_words/stop-words + | src/stop_words/stop-words )/ ''' + + +[tool.isort] +line_length = 120 +extend_skip = ["src/stop_words/stop-words", "src/stop_words/_version.py"] +sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" +indent = 4 +multi_line_output = 3 +include_trailing_comma = true +order_by_type = true +combine_as_imports = true +lines_after_imports = 2 +float_to_top = true +atomic = true diff --git a/stop_words/__init__.py b/src/stop_words/__init__.py similarity index 98% rename from stop_words/__init__.py rename to src/stop_words/__init__.py index 198b419..0bcd180 100644 --- a/stop_words/__init__.py +++ b/src/stop_words/__init__.py @@ -1,6 +1,5 @@ import json import os - from typing import Callable @@ -19,7 +18,7 @@ def get_version() -> str: """ :rtype: basestring """ - from ._version import __version__ + from ._version import __version__ # type: ignore return __version__ diff --git a/stop_words/stop-words b/src/stop_words/stop-words similarity index 100% rename from stop_words/stop-words rename to src/stop_words/stop-words diff --git a/stop_words/tests.py b/src/tests.py similarity index 93% rename from stop_words/tests.py rename to src/tests.py index 32e6b0b..b16a9e6 100644 --- a/stop_words/tests.py +++ b/src/tests.py @@ -2,11 +2,7 @@ from unittest import TestCase import stop_words -from stop_words import AVAILABLE_LANGUAGES -from stop_words import LANGUAGE_MAPPING -from stop_words import StopWordError -from stop_words import get_stop_words -from stop_words import safe_get_stop_words +from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING, StopWordError, get_stop_words, safe_get_stop_words class TestStopWords(TestCase): From 787d2a56c05458e4a753dbb303e0c33f2ad7e12c Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 3 Nov 2025 23:46:09 +0300 Subject: 
[PATCH 6/8] Lots of more tests and code cleanup and better documentation --- ChangeLog.rst | 8 + Makefile | 2 +- README.rst | 525 +++++++++++++++++++++++++++++++++---- pyproject.toml | 1 + src/stop_words/__init__.py | 162 ++++++++---- src/tests.py | 472 +++++++++++++++++++++++++++++---- 6 files changed, 1023 insertions(+), 147 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 436f65f..3ae8624 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,3 +1,11 @@ +2025.11.3 +========= + +* Sync with latest of https://github.com/Alir3z4/stop-words. +* Add much more tests and cleaned up the code. +* Modernized Python packaging and publishing. + + 2018.7.23 ========= diff --git a/Makefile b/Makefile index 4d66219..983e412 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ test: ## Run test suite python -m unittest discover -s src/ -v coverage: ## Generate coverage report - coverage run -m unittest discover + coverage run -m unittest discover -s src/ coverage report coverage xml diff --git a/README.rst b/README.rst index 7a23d0e..ea0bafa 100644 --- a/README.rst +++ b/README.rst @@ -2,82 +2,511 @@ Python Stop Words ================= -.. contents:: Table of contents +.. image:: https://img.shields.io/pypi/v/stop-words.svg + :target: https://pypi.org/project/stop-words/ + :alt: PyPI version + +.. image:: https://img.shields.io/pypi/pyversions/stop-words.svg + :target: https://pypi.org/project/stop-words/ + :alt: Python versions + +.. image:: https://img.shields.io/pypi/l/stop-words.svg + :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE + :alt: License + +.. contents:: Table of Contents + :depth: 2 + :local: Overview -------- -Get list of common stop words in various languages in Python. +A Python library providing curated lists of stop words across 34+ languages. Stop words are common words (like "the", "is", "at") that are typically filtered out in natural language processing and text analysis tasks. 
+**Key Features:** -Available languages +* **34+ Languages** - Extensive language support. +* **Performance** - Built-in caching for fast repeated access. +* **Flexible** - Custom filtering system for advanced use cases. +* **Modern Python** - Full support for Python 3.11+, type hints included. +* **Zero Dependencies** - Lightweight with no external requirements. + + +Available Languages ------------------- -* Arabic -* Bulgarian -* Catalan -* Chinese -* Czech -* Danish -* Dutch -* English -* Finnish -* French -* German -* Greek -* Gujarati -* Hindi -* Hebrew -* Hungarian -* Indonesian -* Malaysian -* Italian -* Japanese -* Korean -* Norwegian -* Polish -* Portuguese -* Romanian -* Russian -* Slovak -* Spanish -* Swedish -* Turkish -* Ukrainian -* Vietnamese -* Persian/Farsi +All the available languages supported by https://github.com/Alir3z4/stop-words + +Each language is identified by both its ISO 639-1 language code (e.g., ``en``) and full name (e.g., ``english``). Installation ------------ -``stop-words`` is available on PyPI -http://pypi.python.org/pypi/stop-words +**Via pip (Recommended):** -So easily install it by ``pip`` -:: +.. code-block:: bash $ pip install stop-words -Another way is by cloning ``stop-words``'s `git repo `_ :: +**Via Git:** - $ git clone --recursive git://github.com/Alir3z4/python-stop-words.git +.. code-block:: bash -Then install it by running: -:: + $ git clone --recursive https://github.com/Alir3z4/python-stop-words.git + $ cd python-stop-words + $ pip install -e . - $ python setup.py install +**Requirements:** +* Python 3.11 or higher - Basic usage + +Quick Start ----------- -.. code:: python + +Basic Usage +~~~~~~~~~~~ + +.. 
code-block:: python from stop_words import get_stop_words + # Get English stop words using language code stop_words = get_stop_words('en') + + # Or use the full language name stop_words = get_stop_words('english') + + # Use in text processing + text = "The quick brown fox jumps over the lazy dog" + words = text.lower().split() + filtered_words = [word for word in words if word not in stop_words] + print(filtered_words) # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] + + +Safe Loading +~~~~~~~~~~~~ + +Use ``safe_get_stop_words()`` when you're not sure if a language is supported: + +.. code-block:: python from stop_words import safe_get_stop_words - stop_words = safe_get_stop_words('unsupported language') + # Returns empty list instead of raising an exception + stop_words = safe_get_stop_words('klingon') # Returns [] + + # Works normally with supported languages + stop_words = safe_get_stop_words('fr') # Returns French stop words + + +Advanced Usage +-------------- + +Checking Available Languages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING + + # List all available languages + print(AVAILABLE_LANGUAGES) + # ['arabic', 'bulgarian', 'catalan', ...] + + # View language code mappings + print(LANGUAGE_MAPPING) + # {'en': 'english', 'fr': 'french', ...} + + +Caching Control +~~~~~~~~~~~~~~~ + +By default, stop words are cached for performance. You can control this behavior: + +.. code-block:: python + + from stop_words import get_stop_words, STOP_WORDS_CACHE + + # Disable caching for this call + stop_words = get_stop_words('en', cache=False) + + # Clear the cache manually + STOP_WORDS_CACHE.clear() + + # Check what's cached + print(STOP_WORDS_CACHE.keys()) # ['english', 'french', ...] + + +Custom Filters +~~~~~~~~~~~~~~ + +Apply custom transformations to stop words using the filter system: + +.. 
code-block:: python + + from stop_words import get_stop_words, add_filter, remove_filter + + # Add a global filter (applies to all languages) + def remove_short_words(words, language): + """Remove words shorter than 3 characters.""" + return [w for w in words if len(w) >= 3] + + add_filter(remove_short_words) + stop_words = get_stop_words('en', cache=False) + + # Add a language-specific filter + def uppercase_words(words, language): + """Convert all words to uppercase.""" + return [w.upper() for w in words] + + add_filter(uppercase_words, language='english') + stop_words = get_stop_words('en', cache=False) + + # Remove a filter when done + remove_filter(uppercase_words, language='english') + +**Note:** Filters only apply to newly loaded stop words, not cached ones. Use ``cache=False`` or clear the cache to apply new filters. + + +Practical Examples +------------------ + +Text Preprocessing +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + import re + + def preprocess_text(text, language='en'): + """Clean and filter text for NLP tasks.""" + stop_words = set(get_stop_words(language)) + + # Convert to lowercase and extract words + words = re.findall(r'\b\w+\b', text.lower()) + + # Remove stop words + filtered_words = [w for w in words if w not in stop_words] + + return filtered_words + + text = "The quick brown fox jumps over the lazy dog" + print(preprocess_text(text)) + # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] + + +Multilingual Processing +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + + def filter_multilingual_text(texts_dict): + """Process texts in multiple languages. 
+ + Args: + texts_dict: Dictionary mapping language codes to text strings + + Returns: + Dictionary with filtered words for each language + """ + results = {} + + for lang_code, text in texts_dict.items(): + stop_words = set(get_stop_words(lang_code)) + words = text.lower().split() + filtered = [w for w in words if w not in stop_words] + results[lang_code] = filtered + + return results + + texts = { + 'en': 'The cat is on the table', + 'fr': 'Le chat est sur la table', + 'es': 'El gato está en la mesa' + } + + print(filter_multilingual_text(texts)) + + +Keyword Extraction +~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from stop_words import get_stop_words + from collections import Counter + import re + + def extract_keywords(text, language='en', top_n=10): + """Extract the most common meaningful words from text.""" + stop_words = set(get_stop_words(language)) + + # Extract words and filter + words = re.findall(r'\b\w+\b', text.lower()) + meaningful_words = [w for w in words if w not in stop_words and len(w) > 2] + + # Count and return top keywords + word_counts = Counter(meaningful_words) + return word_counts.most_common(top_n) + + article = """ + Python is a high-level programming language. Python is known for its + simplicity and readability. Many developers choose Python for data science. + """ + + keywords = extract_keywords(article) + print(keywords) + # [('python', 3), ('language', 1), ('high-level', 1), ...] + + +API Reference +------------- + +Functions +~~~~~~~~~ + +``get_stop_words(language, *, cache=True)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Load stop words for a specified language. + +**Parameters:** + +* ``language`` (str): Language code (e.g., 'en') or full name (e.g., 'english') +* ``cache`` (bool, optional): Enable caching. Defaults to True. + +**Returns:** + +* ``list[str]``: List of stop words + +**Raises:** + +* ``StopWordError``: If language is unavailable or files are unreadable + +**Example:** + +.. 
code-block:: python + + stop_words = get_stop_words('en') + stop_words = get_stop_words('french', cache=False) + + +``safe_get_stop_words(language)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Safely load stop words, returning empty list on error. + +**Parameters:** + +* ``language`` (str): Language code or full name + +**Returns:** + +* ``list[str]``: Stop words, or empty list if unavailable + +**Example:** + +.. code-block:: python + + stop_words = safe_get_stop_words('unknown') # Returns [] + + +``add_filter(func, language=None)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Register a filter function for stop word post-processing. + +**Parameters:** + +* ``func`` (Callable): Filter function +* ``language`` (str | None, optional): Full language name (e.g., ``'english'``) or None for global filter + +**Filter Signatures:** + +* Language-specific: ``func(stopwords: list[str], language: str) -> list[str]`` +* Global: ``func(stopwords: list[str], language: str) -> list[str]`` + +**Example:** + +.. code-block:: python + + def remove_short(words, lang): + return [w for w in words if len(w) > 3] + + add_filter(remove_short) # Global filter + + +``remove_filter(func, language=None)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Remove a previously registered filter. + +**Parameters:** + +* ``func`` (Callable): The filter function to remove +* ``language`` (str | None, optional): Language code or None + +**Returns:** + +* ``bool``: True if removed, False if not found + +**Example:** + +.. code-block:: python + + success = remove_filter(my_filter, language='english') + + +Constants +~~~~~~~~~ + +``AVAILABLE_LANGUAGES`` +^^^^^^^^^^^^^^^^^^^^^^^^ + +List of all supported language names. + +.. code-block:: python + + ['arabic', 'bulgarian', 'catalan', ...] + + +``LANGUAGE_MAPPING`` +^^^^^^^^^^^^^^^^^^^^ + +Dictionary mapping language codes to full names. + +.. 
code-block:: python + + {'en': 'english', 'fr': 'french', 'de': 'german', ...} + + +``STOP_WORDS_CACHE`` +^^^^^^^^^^^^^^^^^^^^^ + +Dictionary storing cached stop words. Can be manually cleared. + +.. code-block:: python + + STOP_WORDS_CACHE.clear() # Clear all cached data + + +Exceptions +~~~~~~~~~~ + +``StopWordError`` +^^^^^^^^^^^^^^^^^ + +Raised when a language is unavailable or files cannot be read. + +.. code-block:: python + + try: + stop_words = get_stop_words('invalid') + except StopWordError as e: + print(f"Error: {e}") + + +Performance Tips +---------------- + +1. **Use caching** - Keep ``cache=True`` (default) for repeated access to the same language +2. **Reuse stop word sets** - Convert to ``set()`` once for O(1) lookup performance: + + .. code-block:: python + + stop_words_set = set(get_stop_words('en')) + # Fast membership testing + is_stop_word = 'the' in stop_words_set + +3. **Preload languages** - Load stop words during initialization, not in tight loops +4. **Use safe_get_stop_words** - Avoid try/except overhead when language availability is uncertain + + +Troubleshooting +--------------- + +**"Language unavailable" error** + +* Check spelling and use either the language code or full name +* Verify the language is in ``AVAILABLE_LANGUAGES`` +* See the `Available Languages`_ table above + +**"File is unreadable" error** + +* Ensure the package installed correctly: ``pip install --force-reinstall stop-words`` +* Check file permissions in the installation directory +* Verify the ``stop-words`` subdirectory exists in the package + +**Filters not applying** + +* Filters only affect newly loaded stop words +* Clear the cache: ``STOP_WORDS_CACHE.clear()`` +* Use ``cache=False`` when testing filters + +**Performance issues** + +* Ensure caching is enabled (default behavior) +* Convert stop word lists to sets for faster lookups +* Preload stop words outside of loops + + +Contributing +------------ + +Contributions are welcome! 
Here's how you can help: + +1. **Add new languages** - Submit stop word lists for unsupported languages via https://github.com/Alir3z4/stop-words +2. **Improve existing lists** - Suggest additions or removals for existing languages via https://github.com/Alir3z4/stop-words +3. **Report bugs** - Open issues on GitHub +4. **Submit PRs** - Fix bugs or add features + +**Repository:** https://github.com/Alir3z4/python-stop-words + + +License +------- + +This project is licensed under the BSD 3-Clause License. See ``LICENSE`` file for details. + + +Changelog +--------- + +See `ChangeLog.rst <https://github.com/Alir3z4/python-stop-words/blob/master/ChangeLog.rst>`_ for version history. + + +Support +------- + +* **Issues:** https://github.com/Alir3z4/python-stop-words/issues +* **PyPI:** https://pypi.org/project/stop-words/ + + +Credits +------- + +* Maintained by `Alireza Savand <https://github.com/Alir3z4>`_ +* Stop word lists compiled from various open sources +* Contributors: See `GitHub contributors <https://github.com/Alir3z4/python-stop-words/graphs/contributors>`_ + + +Related Projects +---------------- +* `Stop Words <https://github.com/Alir3z4/stop-words>`_ - List of common stop words in various languages. +* `NLTK <https://www.nltk.org/>`_ - Natural Language Toolkit with extensive NLP features +* `spaCy <https://spacy.io/>`_ - Industrial-strength NLP library +* `TextBlob <https://textblob.readthedocs.io/>`_ - Simplified text processing + + +Indices and Tables +------------------ + +* `Available Languages`_ +* `Quick Start`_ +* `Advanced Usage`_ +* `API Reference`_ diff --git a/pyproject.toml b/pyproject.toml index 3caf2fa..80db556 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ omit = [ "*distutils*", "venv/*", ".venv/*", + "_version.py", ] [tool.coverage.report] diff --git a/src/stop_words/__init__.py b/src/stop_words/__init__.py index 0bcd180..5c23378 100644 --- a/src/stop_words/__init__.py +++ b/src/stop_words/__init__.py @@ -1,113 +1,185 @@ +""" +Stop Words Library + +A module for loading and managing stop words across multiple languages. +Stop words are common words that are typically filtered out in text processing. 
+ +This module provides: +- Loading stop words from language-specific files +- Caching for performance optimization +- Custom filtering system for post-processing stop words +- Language code mapping (e.g., 'en' -> 'english') +""" + import json -import os +from pathlib import Path from typing import Callable -CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) -STOP_WORDS_DIR = os.path.join(CURRENT_DIR, "stop-words") +# Directory configuration +CURRENT_DIR = Path(__file__).resolve().parent +STOP_WORDS_DIR = CURRENT_DIR / "stop-words" + +# Global caches STOP_WORDS_CACHE: dict[str, list[str]] = {} +_filters: dict[str | None, list[Callable[[list[str], str | None], list[str]]]] = {None: []} -with open(os.path.join(STOP_WORDS_DIR, "languages.json"), "rb") as map_file: - buffer = map_file.read() - LANGUAGE_MAPPING = json.loads(buffer.decode("ascii")) +# Load language mapping configuration +_languages_file = STOP_WORDS_DIR / "languages.json" +with _languages_file.open("r", encoding="utf-8") as f: + LANGUAGE_MAPPING: dict[str, str] = json.load(f) -AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values()) +AVAILABLE_LANGUAGES: list[str] = list(LANGUAGE_MAPPING.values()) + + +class StopWordError(Exception): + """Raised when a requested language is unavailable or files are unreadable.""" + + pass def get_version() -> str: """ - :rtype: basestring + Get the version of the stop words library. + + :returns: The version string from _version module. """ from ._version import __version__ # type: ignore return __version__ -class StopWordError(Exception): - pass +def get_stop_words(language: str, *, cache: bool = True) -> list[str]: + """ + Load stop words for a specified language. + :param language: Language code (e.g., 'en', 'es') or full name (e.g., 'english', 'spanish'). + Supports both ISO codes and full language names via LANGUAGE_MAPPING. + :param cache: If True, cache the results for faster subsequent access. Defaults to True. 
-def get_stop_words(language, cache: bool = True) -> list[str]: - """ - :param language - :param cache: - :rtype: list + :returns: A list of stop words for the specified language. Returns a copy to prevent external modification. + :raises StopWordError: If the language is not available or the file cannot be read. + + Example: + >>> words = get_stop_words('en') + >>> 'the' in words + True """ + # Normalize language code to full name try: language = LANGUAGE_MAPPING[language] except KeyError: if language not in AVAILABLE_LANGUAGES: - raise StopWordError('{0}" language is unavailable.'.format(language)) + raise StopWordError( + f'Language "{language}" is unavailable. ' + f'Available languages: {", ".join(sorted(AVAILABLE_LANGUAGES))}' + ) + # Return cached version if available if cache and language in STOP_WORDS_CACHE: - return STOP_WORDS_CACHE[language] + return STOP_WORDS_CACHE[language].copy() + + # Load stop words from file + language_file = STOP_WORDS_DIR / f"{language}.txt" - language_filename = os.path.join(STOP_WORDS_DIR, language + ".txt") try: - with open(language_filename, "rb") as language_file: - stop_words = [line.decode("utf-8").strip() for line in language_file.readlines()] + with language_file.open("r", encoding="utf-8") as f: + stop_words = [line.strip() for line in f if line.strip()] stop_words = apply_filters(stop_words, language) - except IOError: - raise StopWordError('{0}" file is unreadable, check your installation.'.format(language_filename)) + except (IOError, OSError) as e: + raise StopWordError(f'File "{language_file}" is unreadable. Check your installation. Error: {e}') from e + # Cache if requested if cache: STOP_WORDS_CACHE[language] = stop_words - return stop_words[:] # copy list, prevent being modified + return stop_words.copy() + +def apply_filters(stopwords: list[str], language: str | None) -> list[str]: + """ + Apply registered filters to stop words. 
-_filters: dict[str | None, list[Callable]] = {None: []} + Filters can modify, remove, or add stop words. Language-specific filters + are applied first, followed by global filters (registered with language=None). + :param stopwords: List of stop words to filter. + :param language: Language code for language-specific filters. -def apply_filters(stopwords: list[str], language: str) -> list[str]: - """ - Apply registered filters to stopwords - :param stopwords: list - :param language: string - :return: filtered stopwords + :returns: Filtered list of stop words. """ + # Apply language-specific filters if language in _filters: for func in _filters[language]: - stopwords = func(stopwords) + stopwords = func(stopwords, language) + # Apply global filters for func in _filters[None]: stopwords = func(stopwords, language) return stopwords -def add_filter(func, language: str | None = None) -> None: +def add_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> None: """ - Register filters for specific language. - If language == None the filter applies for all languages. - Filter will not apply for stop words in cache. + Register a filter function for stop word post-processing. - :param func: callable - :param language: string|None - :return: + Language-specific filters receive: func(stopwords: list[str], language: str) -> list[str] + Global filters receive: func(stopwords: list[str], language: str) -> list[str] + + Note: Filters only apply to newly loaded stop words, not cached ones. + Clear the cache with STOP_WORDS_CACHE.clear() to reapply filters. + + :param func: Callable that takes a list of stop words and returns a modified list. + :param language: Full language name (e.g., 'english') for a language-specific filter, or None for a global filter. 
+ + Example: + >>> # Add a filter to uppercase all stop words for English + >>> add_filter(lambda words, lang: [w.upper() for w in words], language='english') + >>> # Add a global filter to remove single-character words + >>> add_filter(lambda words, lang: [w for w in words if len(w) > 1]) """ + if language is None: + _filters[None].append(func) + return + if language not in _filters: _filters[language] = [] + _filters[language].append(func) -def remove_filter(func, language: str | None = None) -> bool: +def remove_filter(func: Callable[[list[str], str | None], list[str]], *, language: str | None = None) -> bool: """ - :param func: - :param language: - :return: + Unregister a previously registered filter function. + + :param func: The filter function to remove. + :param language: Language code or None for global filters. + + :returns: True if the filter was found and removed, False otherwise. """ - if not (language in _filters and func in _filters[language]): + if language not in _filters or func not in _filters[language]: return False + _filters[language].remove(func) return True def safe_get_stop_words(language: str) -> list[str]: """ - :type language: basestring + Safely load stop words, returning an empty list on error. + + This is a convenience wrapper around get_stop_words() that catches + StopWordError exceptions and returns an empty list instead. + + :param language: Language code or full name. + + :returns: Stop words for the language, or empty list if unavailable. 
- :rtype: list + Example: + >>> words = safe_get_stop_words('unknown_language') + >>> words + [] """ try: return get_stop_words(language) diff --git a/src/tests.py b/src/tests.py index b16a9e6..7fc6b86 100644 --- a/src/tests.py +++ b/src/tests.py @@ -1,69 +1,435 @@ import random +from pathlib import Path from unittest import TestCase import stop_words -from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING, StopWordError, get_stop_words, safe_get_stop_words +from stop_words import ( + AVAILABLE_LANGUAGES, + LANGUAGE_MAPPING, + STOP_WORDS_CACHE, + StopWordError, + add_filter, + get_stop_words, + get_version, + remove_filter, + safe_get_stop_words, +) -class TestStopWords(TestCase): - number_of_english_stop_words = 1333 +class TestStopWordsBasic(TestCase): + """Test basic stop word loading functionality.""" - def test_get_stop_words(self) -> None: + NUMBER_OF_ENGLISH_STOP_WORDS = 1333 + + def test_get_stop_words_returns_list(self) -> None: + """Stop words should be returned as a list.""" + sw = get_stop_words("english") + self.assertIsInstance(sw, list) + self.assertEqual(len(sw), self.NUMBER_OF_ENGLISH_STOP_WORDS) + + def test_get_stop_words_contains_strings(self) -> None: + """All stop words should be strings.""" sw = get_stop_words("english") - self.assertEqual(len(sw), self.number_of_english_stop_words) + self.assertTrue(all(isinstance(word, str) for word in sw)) + + def test_get_stop_words_no_empty_strings(self) -> None: + """Stop words should not contain empty strings.""" + sw = get_stop_words("english") + self.assertTrue(all(word.strip() for word in sw)) def test_get_stop_words_language_mapping(self) -> None: + """Language codes should map to full language names.""" + sw_code = get_stop_words("en") + sw_full = get_stop_words("english") + self.assertEqual(len(sw_code), self.NUMBER_OF_ENGLISH_STOP_WORDS) + self.assertEqual(sw_code, sw_full) + + def test_common_english_stop_words(self) -> None: + """Common English stop words should be present.""" sw = 
get_stop_words("en") - self.assertEqual(len(sw), self.number_of_english_stop_words) - self.assertEqual(sw, get_stop_words("english")) - - def test_get_stop_words_cache(self) -> None: - self.assertFalse("french" in stop_words.STOP_WORDS_CACHE) - sw = get_stop_words("fr") - self.assertTrue("french" in stop_words.STOP_WORDS_CACHE) - original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = "not-existing-directory" - self.assertEqual(sw, get_stop_words("french")) - stop_words.STOP_WORDS_DIR = original_stop_words_dir - try: - get_stop_words("klingon") - except StopWordError: - pass - self.assertFalse("klingon" in stop_words.STOP_WORDS_CACHE) - - def test_get_stop_words_unavailable_language(self) -> None: - self.assertRaises(StopWordError, get_stop_words, "sindarin") - - def test_get_stop_words_install_issue(self) -> None: - original_stop_words_dir = stop_words.STOP_WORDS_DIR - stop_words.STOP_WORDS_DIR = "not-existing-directory" - self.assertRaises(StopWordError, get_stop_words, "german") - stop_words.STOP_WORDS_DIR = original_stop_words_dir - - def test_safe_get_stop_words(self) -> None: - self.assertRaises(StopWordError, get_stop_words, "huttese") - self.assertEqual(safe_get_stop_words("huttese"), []) - - def test_random_language_stop_words_load(self) -> None: - languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES) - sample = random.sample(languages, len(languages)) - for language in sample: - stop_words = safe_get_stop_words(language) - self.assertTrue( - len(stop_words) > 0, - "Cannot load stopwords for {0} language".format(language), - ) + common_words = ["the", "a", "an", "and", "or", "but", "is", "are"] + for word in common_words: + self.assertIn(word, sw, f"Expected '{word}' in English stop words") + + def test_get_version(self) -> None: + self.assertIsNotNone(get_version()) + + +class TestStopWordsCache(TestCase): + """Test caching behavior.""" + + def setUp(self) -> None: + """Clear cache before each test.""" + 
STOP_WORDS_CACHE.clear() + + def test_cache_enabled_by_default(self) -> None: + """Cache should be enabled by default.""" + self.assertNotIn("french", STOP_WORDS_CACHE) + get_stop_words("fr") + self.assertIn("french", STOP_WORDS_CACHE) + + def test_cache_disabled(self) -> None: + """Cache should not be used when cache=False.""" + self.assertNotIn("german", STOP_WORDS_CACHE) + get_stop_words("de", cache=False) + self.assertNotIn("german", STOP_WORDS_CACHE) + + def test_cache_persists_across_calls(self) -> None: + """Cached stop words should persist across calls.""" + original_dir = stop_words.STOP_WORDS_DIR + + # Load and cache + sw1 = get_stop_words("fr") + self.assertIn("french", STOP_WORDS_CACHE) + + # Break the file system path + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + # Should still work from cache + sw2 = get_stop_words("french") + self.assertEqual(sw1, sw2) + + # Restore + stop_words.STOP_WORDS_DIR = original_dir + + def test_cache_miss_raises_error(self) -> None: + """Cache miss with invalid path should raise error.""" + original_dir = stop_words.STOP_WORDS_DIR + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + with self.assertRaises(StopWordError): + get_stop_words("spanish") + + self.assertNotIn("spanish", STOP_WORDS_CACHE) + stop_words.STOP_WORDS_DIR = original_dir + + def test_returns_copy_not_reference(self) -> None: + """get_stop_words should return a copy, not the cached reference.""" + sw1 = get_stop_words("en") + sw2 = get_stop_words("en") + + # Modify one list + sw1.append("custom_word") + + # The other should be unchanged + self.assertNotIn("custom_word", sw2) + + # Cache should also be unchanged + sw3 = get_stop_words("en") + self.assertNotIn("custom_word", sw3) + + +class TestStopWordsErrors(TestCase): + """Test error handling.""" + + def test_unavailable_language_raises_error(self) -> None: + """Unknown languages should raise StopWordError.""" + with self.assertRaises(StopWordError) as ctx: + 
get_stop_words("sindarin") + self.assertIn("sindarin", str(ctx.exception).lower()) + + def test_missing_file_raises_error(self) -> None: + """Missing language files should raise StopWordError.""" + original_dir = stop_words.STOP_WORDS_DIR + stop_words.STOP_WORDS_DIR = Path("non-existent-directory") + + with self.assertRaises(StopWordError) as ctx: + get_stop_words("german", cache=False) + + self.assertIn("unreadable", str(ctx.exception).lower()) + stop_words.STOP_WORDS_DIR = original_dir + + def test_safe_get_stop_words_no_exception(self) -> None: + """safe_get_stop_words should never raise exceptions.""" + result = safe_get_stop_words("klingon") + self.assertEqual(result, []) + self.assertIsInstance(result, list) + + def test_safe_get_stop_words_with_valid_language(self) -> None: + """safe_get_stop_words should work with valid languages.""" + result = safe_get_stop_words("en") + self.assertGreater(len(result), 0) + + def test_error_message_includes_available_languages(self) -> None: + """Error message should hint at available languages.""" + with self.assertRaises(StopWordError) as ctx: + get_stop_words("notreal") + error_msg = str(ctx.exception).lower() + self.assertIn("available", error_msg) + + +class TestStopWordsFilters(TestCase): + """Test the filter system.""" + + def setUp(self) -> None: + """Clear cache and filters before each test.""" + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + def tearDown(self) -> None: + """Clean up filters after each test.""" + stop_words._filters.clear() + stop_words._filters[None] = [] + + def test_global_filter_removes_words(self) -> None: + """Global filters should modify all languages.""" + + def remove_short_words(words: list[str], _lang: str | None = None) -> list[str]: + return [w for w in words if len(w) > 3] + + add_filter(remove_short_words) + sw = get_stop_words("en", cache=False) + + self.assertTrue(all(len(word) > 3 for word in sw)) + + def 
test_language_specific_filter(self) -> None: + """Language-specific filters should only affect that language.""" + + def uppercase_filter(words: list[str], _language: str | None = None) -> list[str]: + return [w.upper() for w in words] + + add_filter(uppercase_filter, language="english") + + # English should be uppercase + en_words = get_stop_words("en", cache=False) + self.assertTrue(all(w.isupper() for w in en_words if not w.isnumeric())) + + # Other languages should be unaffected + fr_words = get_stop_words("fr", cache=False) + self.assertFalse(all(w.isupper() for w in fr_words)) + + def test_multiple_filters_chain(self) -> None: + """Multiple filters should be applied in sequence.""" + + def add_prefix(words: list[str], _lang: str | None = None) -> list[str]: + return [f"prefix_{w}" for w in words] + + def add_suffix(words: list[str], _lang: str | None = None) -> list[str]: + return [f"{w}_suffix" for w in words] + + add_filter(add_prefix) + add_filter(add_suffix) + + sw = get_stop_words("en", cache=False) + sample_word = sw[0] + + self.assertTrue(sample_word.startswith("prefix_")) + self.assertTrue(sample_word.endswith("_suffix")) + + def test_remove_filter_returns_true(self) -> None: + """Removing an existing filter should return True.""" + + def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: + return words - def test_filters(self) -> None: + add_filter(dummy_filter) + + # Calling it to get the `dummy_filter` actually execute. 
+ get_stop_words("en") + + result = remove_filter(dummy_filter) + self.assertTrue(result) + + def test_remove_nonexistent_filter_returns_false(self) -> None: + """Removing a non-existent filter should return False.""" + + def dummy_filter(words: list[str], _lang: str | None = None) -> list[str]: + return words # pragma: no cover + + result = remove_filter(dummy_filter) + self.assertFalse(result) + + def test_remove_filter_with_language(self) -> None: + """Language-specific filter removal should work.""" + + def lang_filter(words: list[str], _language: str | None = None) -> list[str]: + return words + + add_filter(lang_filter, language="english") + + # Calling it to get the `lang_filter` actually execute. + get_stop_words("en") + + result = remove_filter(lang_filter, language="english") + self.assertTrue(result) + + # Should return False when trying to remove again + result = remove_filter(lang_filter, language="english") + self.assertFalse(result) + + def test_filter_with_random_letter_removal(self) -> None: + """Original test: remove words containing a random letter.""" language = "en" - before = get_stop_words(language, False) + before = get_stop_words(language, cache=False) letter = random.choice(random.choice(before)) - def remove_letter(stopwords, _language: str): - return [word for word in stopwords if letter not in word] + def remove_letter(words: list[str], _lang: str | None = None) -> list[str]: + return [w for w in words if letter not in w] + + add_filter(remove_letter) + after = get_stop_words(language, cache=False) + + for word in after: + self.assertNotIn(letter, word) + + self.assertTrue(remove_filter(remove_letter)) + + +class TestStopWordsAllLanguages(TestCase): + """Test all available languages.""" + + def test_all_mapped_languages_loadable(self) -> None: + """All languages in LANGUAGE_MAPPING should be loadable.""" + for code, full_name in LANGUAGE_MAPPING.items(): + with self.subTest(code=code, language=full_name): + sw = 
safe_get_stop_words(code) + self.assertGreater(len(sw), 0, f"No stop words loaded for {full_name} ({code})") + + def test_random_language_loading(self) -> None: + """Random sample of languages should all load successfully.""" + all_languages = list(LANGUAGE_MAPPING.keys()) + AVAILABLE_LANGUAGES + sample = random.sample(all_languages, min(10, len(all_languages))) + + for language in sample: + with self.subTest(language=language): + sw = safe_get_stop_words(language) + self.assertGreater(len(sw), 0, f"Cannot load stopwords for {language}") + + def test_all_languages_have_unique_words(self) -> None: + """Each language should have at least some unique characteristics.""" + # Compare English and French as they should be different + en = set(get_stop_words("en")) + fr = set(get_stop_words("fr")) + + # Should have different words + self.assertNotEqual(en, fr) + # Should have some overlap (common borrowed words) + self.assertGreater(len(en & fr), 0) + + +class TestStopWordsEdgeCases(TestCase): + """Test edge cases and boundary conditions.""" + + def test_empty_language_string(self) -> None: + """Empty language string should raise error.""" + with self.assertRaises(StopWordError): + get_stop_words("") + + def test_none_language(self) -> None: + """None as language should raise appropriate error.""" + with self.assertRaises((StopWordError, KeyError, TypeError)): + get_stop_words(None) # type: ignore + + def test_case_sensitive_language_codes(self) -> None: + """Language codes should be case-sensitive.""" + # Lowercase should work + sw_lower = get_stop_words("en") + self.assertGreater(len(sw_lower), 0) + + # Uppercase might not be in mapping + with self.assertRaises(StopWordError): + get_stop_words("EN") + + def test_whitespace_in_stop_words(self) -> None: + """Stop words should be properly stripped of whitespace.""" + sw = get_stop_words("en") + for word in sw: + self.assertEqual(word, word.strip(), f"Word '{word}' has extra whitespace") + + def 
test_duplicate_stop_words(self) -> None: + """Stop words list should not contain duplicates.""" + sw = get_stop_words("en") + unique_words = set(sw) + self.assertEqual(len(sw), len(unique_words), "Stop words list contains duplicates") + + def test_filter_returns_empty_list(self) -> None: + """Filter that returns empty list should work.""" + + def remove_all(words: list[str], _lang: str | None = None) -> list[str]: + return [] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + add_filter(remove_all) + sw = get_stop_words("en", cache=False) + self.assertEqual(sw, []) + + # Cleanup + remove_filter(remove_all) + + def test_filter_adds_words(self) -> None: + """Filter that adds words should work.""" + + def add_custom(words: list[str], _lang: str | None = None) -> list[str]: + return words + ["custom1", "custom2"] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + add_filter(add_custom) + sw = get_stop_words("en", cache=False) + + self.assertIn("custom1", sw) + self.assertIn("custom2", sw) + + # Cleanup + remove_filter(add_custom) + + def test_concurrent_filter_modifications(self) -> None: + """Adding and removing filters should be safe.""" + filters = [ + lambda w, language: w, + lambda w, language: [word.upper() for word in w], + lambda w, language: [word.lower() for word in w], + ] + + STOP_WORDS_CACHE.clear() + stop_words._filters.clear() + stop_words._filters[None] = [] + + # Add all filters + for f in filters: + add_filter(f) + + # Remove them in different order + for f in reversed(filters): + remove_filter(f) + + # Should be back to empty + self.assertEqual(len(stop_words._filters[None]), 0) + + +class TestStopWordsConfiguration(TestCase): + """Test module configuration and constants.""" + + def test_available_languages_is_list(self) -> None: + """AVAILABLE_LANGUAGES should be a list.""" + self.assertIsInstance(AVAILABLE_LANGUAGES, list) + 
self.assertGreater(len(AVAILABLE_LANGUAGES), 0) + + def test_language_mapping_is_dict(self) -> None: + """LANGUAGE_MAPPING should be a dictionary.""" + self.assertIsInstance(LANGUAGE_MAPPING, dict) + self.assertGreater(len(LANGUAGE_MAPPING), 0) + + def test_cache_is_dict(self) -> None: + """STOP_WORDS_CACHE should be a dictionary.""" + self.assertIsInstance(STOP_WORDS_CACHE, dict) + + def test_stop_words_dir_exists(self) -> None: + """STOP_WORDS_DIR should point to an existing directory.""" + self.assertTrue( + stop_words.STOP_WORDS_DIR.exists(), + f"Stop words directory not found: {stop_words.STOP_WORDS_DIR}", + ) + self.assertTrue(stop_words.STOP_WORDS_DIR.is_dir()) - stop_words.add_filter(remove_letter) - after = get_stop_words(language, False) - for stopword in after: - self.assertFalse(letter in stopword) - self.assertTrue(stop_words.remove_filter(remove_letter)) + def test_language_files_exist(self) -> None: + """Language files referenced in mapping should exist.""" + for lang_name in AVAILABLE_LANGUAGES: + lang_file = stop_words.STOP_WORDS_DIR / f"{lang_name}.txt" + self.assertTrue(lang_file.exists(), f"Language file missing: {lang_file}") From b4444394774ec001daf3f5f00100530b6d089ff3 Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 3 Nov 2025 23:49:22 +0300 Subject: [PATCH 7/8] Remove mention of python versions. --- README.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.rst b/README.rst index ea0bafa..2b41c2f 100644 --- a/README.rst +++ b/README.rst @@ -28,7 +28,6 @@ A Python library providing curated lists of stop words across 34+ languages. Sto * **34+ Languages** - Extensive language support. * **Performance** - Built-in caching for fast repeated access. * **Flexible** - Custom filtering system for advanced use cases. -* **Modern Python** - Full support for Python 3.8+, type hints included. * **Zero Dependencies** - Lightweight with no external requirements. 
@@ -59,7 +58,7 @@ Installation **Requirements:** -* Python 3.8 or higher +* Usually any version of Python that supports type hints and probably has not been marked as EOL. Quick Start From ad4b1ab3fb02348b8e3dbba2cca3e1429d9c49fd Mon Sep 17 00:00:00 2001 From: Alireza Savand Date: Mon, 3 Nov 2025 23:57:20 +0300 Subject: [PATCH 8/8] Update github actions --- .github/workflows/main.yml | 8 ++++---- .github/workflows/pypi.yml | 9 +++++---- ChangeLog.rst | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7fab9b8..4a3e78e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,12 +17,12 @@ jobs: name: "Linting" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: setup python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.13' @@ -44,12 +44,12 @@ jobs: steps: - name: Check out code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: true - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 95e91d2..1d8545a 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -15,17 +15,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 + with: + submodules: true - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.13' - name: Build run: | - git submodule update --remote --rebase python -m pip install build - python -m build + make update-submodules build - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/ChangeLog.rst b/ChangeLog.rst index 3ae8624..9de0e46 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -1,4 
+1,4 @@ -2025.11.3 +2025.11.4 ========= * Sync with latest of https://github.com/Alir3z4/stop-words.