diff --git a/LICENSE b/LICENSE index b7982b1..48665b7 100644 --- a/LICENSE +++ b/LICENSE @@ -17,4 +17,62 @@ This project contains content developed by The MITRE Corporation. If this code is used in a deployment or embedded within another project, it is requested that you send an email to opensource@mitre.org in order to let us know where - this software is being used. \ No newline at end of file + this software is being used. + +***************************************************************************** + +The nlp_text_splitter utility uses the following sentence detection libraries: + +***************************************************************************** + +The WtP, "Where's the Point", sentence segmentation library falls under the MIT License: + +https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE + +MIT License + +Copyright (c) 2024 Benjamin Minixhofer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +***************************************************************************** + +The spaCy Natural Language Processing library falls under the MIT License: + +The MIT License (MIT) + +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md new file mode 100644 index 0000000..06bbbfc --- /dev/null +++ b/detection/nlp_text_splitter/README.md @@ -0,0 +1,50 @@ +# Overview + +This directory contains the source code, test examples, and installation script +for the OpenMPF NlpTextSplitter tool, which uses WtP and spaCy libraries +to detect sentences in a given chunk of text. + +# Background + +Our primary motivation for creating this tool was to find a lightweight, accurate +sentence detection capability to support a large variety of text processing tasks +including translation and tagging. 
+ +Through preliminary investigation, we identified the [WtP library ("Where's the +Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence +detection model](https://spacy.io/models) for identifying sentence breaks +in a large section of text. + +WtP models are trained to split up multilingual text by sentence without the need for an +input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 +GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection +model that appears to work better for splitting up English text in certain cases. Unfortunately, +this model lacks support for handling Chinese punctuation. + +# Installation + +To install this tool, users will need to run `./install.sh`. By default this will set up a +CPU-only PyTorch installation. + +Please note that several customizations are supported: + +- `--text-splitter-dir|-t <path>`: This parameter specifies where the + source code is located relative to the installation script. In general, + since the installation script and source code are both located here, it's not + necessary to update this parameter unless the user is running the `install.sh` + script from a different directory. + +- `--gpu|-g`: Add this parameter to the installation command line above to + set up a PyTorch installation with CUDA (GPU) libraries. + +- `--wtp-models-dir|-m <path>`: Add this parameter to + change the default WtP model installation directory + (default: `/opt/wtp/models`). + +- `--install-wtp-model|-w <model-name>`: Add this parameter to specify + additional WtP models for installation. This parameter can be provided + multiple times to install more than one model. + +- `--install-spacy-model|-s <model-name>`: Add this parameter to specify + additional spaCy models for installation. This parameter can be provided + multiple times to install more than one model. 
diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh new file mode 100755 index 0000000..45a4dbf --- /dev/null +++ b/detection/nlp_text_splitter/install.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +set -o errexit -o pipefail + +main() { + if ! 
options=$(getopt --name "$0" \ + --options t:gm:w:s: \ + --longoptions text-splitter-dir:,gpu,wtp-models-dir:,install-wtp-model:,install-spacy-model: \ + -- "$@"); then + print_usage + fi + eval set -- "$options" + local wtp_models_dir=/opt/wtp/models + local wtp_models=("wtp-bert-mini") + local spacy_models=("xx_sent_ud_sm") + while true; do + case "$1" in + --text-splitter-dir | -t ) + shift + local text_splitter_dir=$1 + ;; + --gpu | -g ) + local gpu_enabled=true + ;; + --wtp-models-dir | -m ) + shift + wtp_models_dir=$1; + ;; + --install-wtp-model | -w ) + shift + wtp_models+=("$1") + ;; + --install-spacy-model | -s ) + shift + spacy_models+=("$1") + ;; + -- ) + shift + break + ;; + esac + shift + done + + install_text_splitter "$text_splitter_dir" + install_py_torch "$gpu_enabled" + download_wtp_models "$wtp_models_dir" "${wtp_models[@]}" + download_spacy_models "${spacy_models[@]}" +} + + +install_text_splitter() { + local text_splitter_dir=$1 + if [[ ! $text_splitter_dir ]]; then + text_splitter_dir=$(dirname "$(realpath "${BASH_SOURCE[0]}")") + fi + + echo "Installing text splitter from source directory: $text_splitter_dir" + pip3 install "$text_splitter_dir" +} + + +install_py_torch() { + local gpu_enabled=$1 + local torch_package='torch~=2.3' + if [[ $gpu_enabled ]]; then + echo "Installing GPU enabled PyTorch." + pip3 install "$torch_package" + else + echo "Installing CPU only version of PyTorch." + # networkx is a dependency of PyTorch, but the version of networkx in the PyTorch package + # index requires Python 3.9. networkx needs to be installed in a separate command so that + # pip can get networkx from PyPi. 
+ pip3 install 'networkx~=3.1' + pip3 install "$torch_package" --index-url https://download.pytorch.org/whl/cpu + fi +} + + +download_wtp_models() { + local wtp_models_dir=$1 + shift + local model_names=("$@") + setup_wtp_models_dir "$wtp_models_dir" + + for model_name in "${model_names[@]}"; do + echo "Downloading the $model_name model to $wtp_models_dir." + local wtp_model_dir="$wtp_models_dir/$model_name" + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')" + done +} + +setup_wtp_models_dir() { + local wtp_models_dir=$1 + + if [[ ! $REQUESTS_CA_BUNDLE ]]; then + export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + fi + + if ! mkdir --parents "$wtp_models_dir"; then + echo "ERROR: Failed to create the $wtp_models_dir directory." + exit 3 + fi + + if [[ ! -w "$wtp_models_dir" ]]; then + echo -n "ERROR: The model directory, \"$wtp_models_dir\" is not writable by the current user. " + echo "The permissions on \"$wtp_models_dir\" must be modified." + exit 4 + fi +} + +download_spacy_models() { + for model_name in "$@"; do + echo "Downloading the $model_name spaCy model." + python3 -m spacy download "$model_name" + done +} + + +print_usage() { + echo + echo "Usage: +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--wtp-models-dir |-m ] [--install-wtp-model|-w ]* [--install-spacy-model|-s ]* +Options + --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the + same directory as this script) + --gpu, -g: Install the GPU version of PyTorch + --wtp-models-dir , -m : Path where WTP models will be stored. + (defaults to /opt/wtp/models) + --install-wtp-model, -w : Name of a WTP model to install in addtion to wtp-bert-mini. + This option can be provided more than once to specify + multiple models. + --install-spacy-model | -s : Names of a spaCy model to install in addtion to + xx_sent_ud_sm. The option can be provided more than once + to specify multiple models. 
+" + exit 1 +} + +main "$@" diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py new file mode 100644 index 0000000..f083a5f --- /dev/null +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -0,0 +1,265 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import logging +import os +import pkg_resources + +import spacy +from wtpsplit import WtP +from typing import Callable, List, Optional, Tuple + +from .wtp_lang_settings import WtpLanguageSettings + +import torch + + +DEFAULT_WTP_MODELS = "/opt/wtp/models" + +# If we want to package model installation with this utility in the future: +WTP_MODELS_PATH = pkg_resources.resource_filename( + __name__, "models" +) + +log = logging.getLogger(__name__) + +# These models must have a specified language during sentence splitting. +WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l', + 'wtp-canine-s-3l', + 'wtp-canine-s-6l', + 'wtp-canine-s-9l', + 'wtp-canine-s-12l'] + +GPU_AVAILABLE = False +if torch.cuda.is_available(): + GPU_AVAILABLE = True + + +class TextSplitterModel: + # To hold spaCy, WtP, and other potential sentence detection models in cache + + def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None: + self._model_name = "" + self._model_setting = "" + self._default_lang = default_lang + self._mandatory_wtp_language = False + self.split = lambda t, **param: [t] + self.update_model(model_name, model_setting, default_lang) + + def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str="en"): + if model_name: + if "wtp" in model_name: + self._update_wtp_model(model_name, model_setting, default_lang) + self.split = self._split_wtp + log.info(f"Setup WtP model: {model_name}") + else: + self._update_spacy_model(model_name) + self.split = self._split_spacy + log.info(f"Setup spaCy model: {model_name}") + + def _update_wtp_model(self, wtp_model_name: str, + model_setting: str, + default_lang: str) -> None: + + if model_setting == "gpu" or model_setting == "cuda": + if GPU_AVAILABLE: + model_setting = "cuda" + else: + log.warning("PyTorch determined that CUDA is not available. 
" + "You may need to update the NVIDIA driver for the host system, " + "or reinstall PyTorch with GPU support by setting " + "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") + model_setting = "cpu" + elif model_setting != "cpu": + log.warning("Invalid WtP model setting. Only `cpu` and `cuda` " + "(or `gpu`) WtP model options available at this time. " + "Defaulting to `cpu` mode.") + model_setting = "cpu" + + if wtp_model_name in WTP_MANDATORY_ADAPTOR: + self._mandatory_wtp_language = True + self._default_lang = default_lang + + if self._model_name == wtp_model_name and self._model_setting == model_setting: + log.info(f"Using cached model, running on {self._model_setting}: " + f"{self._model_name}") + else: + self._model_setting = model_setting + self._model_name = wtp_model_name + # Check if model has been downloaded + if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)): + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name) + + elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name)): + + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name) + + else: + log.warning(f"Model {wtp_model_name} not found, " + "downloading from hugging face.") + + self.wtp_model = WtP(wtp_model_name) + + if model_setting != "cpu" and model_setting != "cuda": + log.warning(f"Invalid setting for WtP runtime {model_setting}. " + "Defaulting to CPU mode.") + model_setting = "cpu" + self.wtp_model.to(model_setting) + + def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: + if lang: + iso_lang = WtpLanguageSettings.convert_to_iso(lang) + if iso_lang: + return self.wtp_model.split(text, lang_code=iso_lang) + else: + log.warning(f"Language {lang} was not used to train WtP model. " + "If text splitting is not working well with WtP, " + "consider trying spaCy's sentence detection model." 
+ ) + if self._mandatory_wtp_language: + log.warning("WtP model requires a language. " + f"Using default language : {self._default_lang}.") + iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang) + return self.wtp_model.split(text, lang_code=iso_lang) + return self.wtp_model.split(text) + + def _update_spacy_model(self, spacy_model_name: str): + self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) + self.spacy_model.enable_pipe("senter") + + def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: We may add an auto model selection for spaCy in the future. + # However, the drawback is we will also need to + # download a large number of spaCy models beforehand. + processed_text = self.spacy_model(text) + return [sent.text_with_ws for sent in processed_text.sents] + +class TextSplitter: + + def __init__( + self, text: str, limit: int, num_boundary_chars: int, + get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None) -> None: + self._sentence_model = sentence_model + self._limit = limit + self._num_boundary_chars = num_boundary_chars + self._get_text_size = get_text_size + self._text = "" + self._text_full_size = 0 + self._overhead_size = 0 + self._soft_limit = self._limit + self._in_lang = in_lang + + if text: + self.set_text(text) + + def set_text(self, text: str): + self._text = text + self._text_full_size = self._get_text_size(text) + chars_per_size = len(text) / self._text_full_size + self._overhead_size = self._get_text_size('') + + self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size + + if self._soft_limit <= 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit + # before applying chars_per_size weighting. 
+ self._soft_limit = max(1, + int((self._limit - self._overhead_size) * chars_per_size)) + + def _isolate_largest_section(self, text:str) -> str: + # Using cached word splitting model, isolate largest section of text + string_length = len(text) + + if self._num_boundary_chars <= 0: + num_chars_to_process = string_length + else: + num_chars_to_process = self._num_boundary_chars + + start_indx = max(0, string_length - num_chars_to_process) + substring = text[start_indx: string_length] + substring_list = self._sentence_model.split(substring, lang = self._in_lang) + div_index = string_length - len(substring_list[-1]) + + if div_index==start_indx: + return text + + return text[0:div_index] + + @classmethod + def split(cls, + text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None + ): + return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split() + + + def _split(self): + if self._text_full_size <= self._limit: + yield self._text + else: + yield from self._split_internal(self._text) + + def _split_internal(self, text): + right = text + while True: + left, right = self._divide(right) + yield left + if not right: + return + + def _divide(self, text) -> Tuple[str, str]: + limit = self._soft_limit + while True: + left = text[:limit] + left_size = self._get_text_size(left) + + if left_size <= self._limit: + if left != text: + # If dividing into two parts + # Determine soft boundary for left segment + left = self._isolate_largest_section(left) + return left, text[len(left):] + + char_per_size = len(left) / left_size + + + limit = int(self._limit * char_per_size) - self._overhead_size + + if limit < 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit before + # applying chars_per_size weighting. 
+ limit = max(1, int((self._limit - self._overhead_size) * char_per_size)) diff --git a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py new file mode 100644 index 0000000..c682fd3 --- /dev/null +++ b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py @@ -0,0 +1,259 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from typing import Optional + +class WtpLanguageSettings: + # Supported languages and ISO 639-1, 639-2 codes for WtP models. 
+ # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages + # https://www.loc.gov/standards/iso639-2/php/code_list.php + _wtp_lang_map = { + 'afrikaans': 'af', + 'afr': 'af', + 'amharic': 'am', + 'amh': 'am', + 'arabic': 'ar', + 'ara': 'ar', + 'azerbaijani': 'az', + 'aze': 'az', + 'belarusian': 'be', + 'bel': 'be', + 'bulgarian': 'bg', + 'bul': 'bg', + 'bengali': 'bn', + 'ben': 'bn', + 'catalan': 'ca', + 'valencian': 'ca', + 'cat': 'ca', + 'cebuano': 'ceb', # In some cases, ISO-639-1 is not available, use ISO-639-2 + 'ceb': 'ceb', + 'czech': 'cs', + 'cze': 'cs', + 'ces': 'cs', + 'welsh': 'cy', + 'wel': 'cy', + 'cym': 'cy', + 'danish': 'da', + 'dan': 'da', + 'german': 'de', + 'ger': 'de', + 'deu': 'de', + 'greek': 'el', + 'gre': 'el', + 'ell': 'el', + 'english': 'en', + 'eng': 'en', + 'esperanto': 'eo', + 'epo': 'eo', + 'spanish': 'es', + 'castilian': 'es', + 'spa': 'es', + 'estonian': 'et', + 'est': 'et', + 'basque': 'eu', + 'baq': 'eu', + 'eus': 'eu', + 'persian': 'fa', + 'per': 'fa', + 'fas': 'fa', + 'finnish': 'fi', + 'fin': 'fi', + 'french': 'fr', + 'fre': 'fr', + 'fra': 'fr', + 'western frisian': 'fy', + 'fry': 'fy', + 'irish': 'ga', + 'gle': 'ga', + 'gaelic': 'gd', + 'scottish gaelic': 'gd', + 'gla': 'gd', + 'galician': 'gl', + 'glg': 'gl', + 'gujarati': 'gu', + 'guj': 'gu', + 'hausa': 'ha', + 'hau': 'ha', + 'hebrew': 'he', + 'heb': 'he', + 'hindi': 'hi', + 'hin': 'hi', + 'hungarian': 'hu', + 'hun': 'hu', + 'armenian': 'hy', + 'arm': 'hy', + 'hye': 'hy', + 'indonesian': 'id', + 'ind': 'id', + 'igbo': 'ig', + 'ibo': 'ig', + 'icelandic': 'is', + 'ice': 'is', + 'isl': 'is', + 'italian': 'it', + 'ita': 'it', + 'japanese': 'ja', + 'jpn': 'ja', + 'javanese': 'jv', + 'jav': 'jv', + 'georgian': 'ka', + 'geo': 'ka', + 'kat': 'ka', + 'kazakh': 'kk', + 'kaz': 'kk', + 'central khmer': 'km', + 'khm': 'km', + 'kannada': 'kn', + 'kan': 'kn', + 'korean': 'ko', + 'kor': 'ko', + 'kurdish': 'ku', + 'kur': 'ku', + 'kirghiz': 'ky', + 'kyrgyz': 'ky', + 
'kir': 'ky', + 'latin': 'la', + 'lat': 'la', + 'lithuanian': 'lt', + 'lit': 'lt', + 'latvian': 'lv', + 'lav': 'lv', + 'malagasy': 'mg', + 'mlg': 'mg', + 'macedonian': 'mk', + 'mac': 'mk', + 'mkd': 'mk', + 'malayalam': 'ml', + 'mal': 'ml', + 'mongolian': 'mn', + 'mon': 'mn', + 'marathi': 'mr', + 'mar': 'mr', + 'malay': 'ms', + 'may': 'ms', + 'msa': 'ms', + 'maltese': 'mt', + 'mlt': 'mt', + 'burmese': 'my', + 'bur': 'my', + 'mya': 'my', + 'nepali': 'ne', + 'nep': 'ne', + 'dutch': 'nl', + 'flemish': 'nl', + 'dut': 'nl', + 'nld': 'nl', + 'norwegian': 'no', + 'nor': 'no', + 'panjabi': 'pa', + 'punjabi': 'pa', + 'pan': 'pa', + 'polish': 'pl', + 'pol': 'pl', + 'pushto': 'ps', + 'pashto': 'ps', + 'pus': 'ps', + 'portuguese': 'pt', + 'por': 'pt', + 'romanian': 'ro', + 'moldavian': 'ro', + 'moldovan': 'ro', + 'rum': 'ro', + 'ron': 'ro', + 'russian': 'ru', + 'rus': 'ru', + 'sinhala': 'si', + 'sinhalese': 'si', + 'sin': 'si', + 'slovak': 'sk', + 'slo': 'sk', + 'slk': 'sk', + 'slovenian': 'sl', + 'slv': 'sl', + 'albanian': 'sq', + 'alb': 'sq', + 'sqi': 'sq', + 'serbian': 'sr', + 'srp': 'sr', + 'swedish': 'sv', + 'swe': 'sv', + 'tamil': 'ta', + 'tam': 'ta', + 'telugu': 'te', + 'tel': 'te', + 'tajik': 'tg', + 'tgk': 'tg', + 'thai': 'th', + 'tha': 'th', + 'turkish': 'tr', + 'tur': 'tr', + 'ukrainian': 'uk', + 'ukr': 'uk', + 'urdu': 'ur', + 'urd': 'ur', + 'uzbek': 'uz', + 'uzb': 'uz', + 'vietnamese': 'vi', + 'vie': 'vi', + 'xhosa': 'xh', + 'xho': 'xh', + 'yiddish': 'yi', + 'yid': 'yi', + 'yoruba': 'yo', + 'yor': 'yo', + 'chinese': 'zh', + 'chi': 'zh', + 'zho': 'zh', + 'zulu': 'zu', + 'zul': 'zu', + 'hans':'zh', # Also check for chinese scripts + 'hant': 'zh', + 'cmn':'zh' # In some cases we use 'cmn' = 'Mandarin' + } + + _wtp_iso_set = set(_wtp_lang_map.values()) + + @classmethod + def convert_to_iso(cls, lang: str) -> Optional[str]: + # ISO 639-2 (language) is sometimes paired with ISO 15924 (script). + # Extract the language portion and check if supported in WtP. 
+ if not lang: + return None + + if '-' in lang: + lang = lang.split('-')[0] + if '_' in lang: + lang = lang.split('_')[0] + + lang = lang.strip().lower() + + if lang in cls._wtp_iso_set: + return lang + + if lang in cls._wtp_lang_map: + return cls._wtp_lang_map[lang] + + return None diff --git a/detection/nlp_text_splitter/pyproject.toml b/detection/nlp_text_splitter/pyproject.toml new file mode 100644 index 0000000..31583a6 --- /dev/null +++ b/detection/nlp_text_splitter/pyproject.toml @@ -0,0 +1,37 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "nlp_text_splitter" +version = "8.0" +dependencies = [ + "spacy>=3.7.4", + "wtpsplit>=1.3.0" +] diff --git a/detection/nlp_text_splitter/tests/test_data/NOTICE b/detection/nlp_text_splitter/tests/test_data/NOTICE new file mode 100644 index 0000000..0e3ac4d --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_data/NOTICE @@ -0,0 +1,4 @@ +# art-of-war.txt +Contains the beginning of "The Art of War" by Sunzi in Traditional Chinese. +Public Domain +https://www.gutenberg.org/ebooks/12407 \ No newline at end of file diff --git a/detection/nlp_text_splitter/tests/test_data/art-of-war.txt b/detection/nlp_text_splitter/tests/test_data/art-of-war.txt new file mode 100644 index 0000000..25e19f5 --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_data/art-of-war.txt @@ -0,0 +1,8 @@ +兵者,國之大事,死生之地,存亡之道,不可不察也。 +故經之以五事,校之以計,而索其情:一曰道,二曰天,三曰地,四曰將,五曰法。道者,令民於上同意,可與之死,可與之生, +而不危也;天者,陰陽、寒暑、時制也;地者,遠近、險易、廣狹、死生也;將者,智、信、仁、勇、嚴也;法者,曲制、官道、 +主用也。凡此五者,將莫不聞,知之者勝,不知之者不勝。故校之以計,而索其情,曰:主孰有道?將孰有能?天地孰得?法令孰行? +兵眾孰強?士卒孰練?賞罰孰明?吾以此知勝負矣。將聽吾計,用之必勝,留之;將不聽吾計,用之必敗,去之。計利以聽,乃為之勢, +以佐其外。勢者,因利而制權也。兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之, +實而備之,強而避之,怒而撓之,卑而驕之,佚而勞之,親而離之,攻其無備,出其不意。此兵家之勝,不可先傳也。 +夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也。多算勝少算,而況於無算乎!吾以此觀之,勝負見矣。 diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py new file mode 100644 index 0000000..9782870 --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -0,0 +1,213 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). 
#
#                                                                           #
# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
#############################################################################

#############################################################################
# Copyright 2024 The MITRE Corporation                                      #
#                                                                           #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                   #
#                                                                           #
#    http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                           #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#############################################################################

import pathlib
import unittest

from nlp_text_splitter import TextSplitterModel, TextSplitter


TEST_DATA = pathlib.Path(__file__).parent / 'test_data'


class TestTextSplitter(unittest.TestCase):
    """Tests for TextSplitter chunking using WtP and spaCy sentence models.

    Three models are loaded once per class run: a basic WtP model, a WtP
    model configured for Chinese, and a multilingual spaCy sentence model.
    """

    @classmethod
    def setUpClass(cls):
        """Create the shared models; they are reused by every test method."""
        cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
        cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh")
        cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")

    @staticmethod
    def _read_art_of_war():
        """Return the contents of the Chinese 'art-of-war.txt' fixture.

        The file is decoded explicitly as UTF-8 so the tests do not depend
        on the platform's default locale encoding.
        NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
        """
        return (TEST_DATA / 'art-of-war.txt').read_text(encoding='utf-8')

    def _engines(self):
        """Return (label, model) pairs for the WtP and spaCy models."""
        return (('wtp', self.wtp_model), ('spacy', self.spacy_model))

    def _assert_chunks(self, actual, expected):
        """Check each chunk against a (prefix, suffix, length) triple.

        :param actual: list of chunk strings produced by TextSplitter.split
        :param expected: sequence of (prefix, suffix, length), one per chunk
        """
        for chunk, (prefix, suffix, length) in zip(actual, expected):
            self.assertTrue(chunk.startswith(prefix))
            self.assertTrue(chunk.endswith(suffix))
            self.assertEqual(length, len(chunk))

    def test_split_engine_difference(self):
        """WtP and spaCy segment the same Chinese text differently."""
        # Note: Only WtP's multilingual models can detect some of
        # the '。' characters used for this language.
        text = self._read_art_of_war()
        text_without_newlines = text.replace('\n', '')

        actual = self.wtp_model._split_wtp(text_without_newlines)
        self.assertEqual(3, len(actual))
        for line in actual:
            self.assertTrue(line.endswith('。'))

        actual = self.spacy_model._split_spacy(text_without_newlines)
        self.assertEqual(1, len(actual))

        # However, WtP prefers newlines over the '。' character.
        actual = self.wtp_model._split_wtp(text)
        self.assertEqual(10, len(actual))

    def test_guess_split_simple_sentence(self):
        """Both engines split a two-sentence string at the sentence break."""
        # The input is 42 characters, longer than the 28 passed to split().
        input_text = 'Hello, what is your name? My name is John.'
        for engine, model in self._engines():
            with self.subTest(engine=engine):
                actual = list(TextSplitter.split(input_text,
                                                 28,
                                                 28,
                                                 len,
                                                 model))
                # No characters may be dropped or duplicated.
                self.assertEqual(input_text, ''.join(actual))
                self.assertEqual(2, len(actual))

                # The separating space stays with the first chunk.
                self.assertEqual('Hello, what is your name? ', actual[0])
                self.assertEqual('My name is John.', actual[1])

    def test_split_sentence_end_punctuation(self):
        """Both engines keep two short sentences together in one chunk."""
        input_text = 'Hello. How are you? asdfasdf'
        for engine, model in self._engines():
            with self.subTest(engine=engine):
                actual = list(TextSplitter.split(input_text,
                                                 20,
                                                 10,
                                                 len,
                                                 model))

                self.assertEqual(input_text, ''.join(actual))
                self.assertEqual(2, len(actual))

                self.assertEqual('Hello. How are you? ', actual[0])
                self.assertEqual('asdfasdf', actual[1])

    def test_guess_split_edge_cases(self):
        """Abbreviations, ellipses and stray whitespace split as expected."""
        input_text = ("This is a sentence (Dr.Test). Is this,"
                      " a sentence as well? Maybe...maybe not?"
                      " \n All done, I think!")

        # The WtP run passes 30 to split() and the spaCy run passes 35;
        # both are expected to detect and split out the same four sentences.
        cases = (('wtp', self.wtp_model, 30),
                 ('spacy', self.spacy_model, 35))
        for engine, model, limit in cases:
            with self.subTest(engine=engine):
                actual = list(TextSplitter.split(input_text,
                                                 limit,
                                                 limit,
                                                 len,
                                                 model))

                self.assertEqual(input_text, ''.join(actual))
                self.assertEqual(4, len(actual))

                self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
                self.assertEqual("Is this, a sentence as well? ", actual[1])
                self.assertEqual("Maybe...maybe not? \n ", actual[2])
                self.assertEqual("All done, I think!", actual[3])

    def test_split_wtp_basic(self):
        """The basic WtP model splits the Chinese fixture into 4 chunks."""
        text = self._read_art_of_war().replace('\n', '')
        actual = list(TextSplitter.split(text,
                                         150,
                                         150,
                                         len,
                                         self.wtp_model))

        self.assertEqual(4, len(actual))

        expected_chunk_lengths = [86, 116, 104, 114]
        # Sanity check on the constants: together they cover the whole text.
        self.assertEqual(sum(expected_chunk_lengths), len(text))

        self._assert_chunks(actual, (
            ('兵者,', '而不危也;', expected_chunk_lengths[0]),
            ('天者,陰陽', '兵眾孰強?', expected_chunk_lengths[1]),
            ('士卒孰練?', '遠而示之近。', expected_chunk_lengths[2]),
            ('利而誘之,', '勝負見矣。', expected_chunk_lengths[3]),
        ))

    def test_split_wtp_advanced(self):
        """The Chinese-configured WtP model yields different boundaries."""
        text = self._read_art_of_war().replace('\n', '')
        actual = list(TextSplitter.split(text,
                                         150,
                                         150,
                                         len,
                                         self.wtp_adv_model))

        self.assertEqual(4, len(actual))

        expected_chunk_lengths = [61, 150, 61, 148]
        # Sanity check on the constants: together they cover the whole text.
        self.assertEqual(sum(expected_chunk_lengths), len(text))

        self._assert_chunks(actual, (
            ('兵者,', '四曰將,五曰法。', expected_chunk_lengths[0]),
            ('道者,令民於上同意', '賞罰孰明', expected_chunk_lengths[1]),
            ('?吾以此知勝', '因利而制權也。', expected_chunk_lengths[2]),
            ('兵者,詭道也。', '之,勝負見矣。', expected_chunk_lengths[3]),
        ))


if __name__ == '__main__':
    unittest.main(verbosity=2)