From c0afbb1c6e7336369654760677c693070eefd585 Mon Sep 17 00:00:00 2001 From: Brian Rosenberg Date: Fri, 26 Apr 2024 11:19:02 -0400 Subject: [PATCH 01/11] Add nlp_text_splitter package --- detection/nlp_text_splitter/install.sh | 105 +++++++ .../nlp_text_splitter/__init__.py | 262 ++++++++++++++++++ .../nlp_text_splitter/wtp_lang_settings.py | 259 +++++++++++++++++ detection/nlp_text_splitter/pyproject.toml | 37 +++ .../tests/test_data/art-of-war.txt | 8 + .../tests/test_text_splitter.py | 91 ++++++ 6 files changed, 762 insertions(+) create mode 100755 detection/nlp_text_splitter/install.sh create mode 100644 detection/nlp_text_splitter/nlp_text_splitter/__init__.py create mode 100644 detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py create mode 100644 detection/nlp_text_splitter/pyproject.toml create mode 100644 detection/nlp_text_splitter/tests/test_data/art-of-war.txt create mode 100644 detection/nlp_text_splitter/tests/test_text_splitter.py diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh new file mode 100755 index 0000000..bed0824 --- /dev/null +++ b/detection/nlp_text_splitter/install.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash + +set -o errexit -o pipefail + + +main() { + if ! options=$(getopt --name "$0" \ + --options t:gm: \ + --longoptions text-splitter-dir:,gpu,models-dir: \ + -- "$@"); then + print_usage + fi + eval set -- "$options" + while true; do + case "$1" in + --text-splitter-dir | -t ) + shift + local text_splitter_dir=$1 + ;; + --gpu | -g ) + local gpu_enabled=true + ;; + --models-dir | -m ) + shift + local models_dir=$1; + ;; + -- ) + shift + break + ;; + esac + shift + done + + install_text_splitter "$text_splitter_dir" + install_py_torch "$gpu_enabled" + download_models "$models_dir" +} + + +install_text_splitter() { + local text_splitter_dir=$1 + if [[ ! 
$text_splitter_dir ]]; then + text_splitter_dir=$(dirname "$(realpath "${BASH_SOURCE[0]}")") + fi + + echo "Installing text splitter from source directory: $text_splitter_dir" + pip3 install "$text_splitter_dir" +} + + +install_py_torch() { + local gpu_enabled=$1 + local torch_package='torch~=2.3' + if [[ $gpu_enabled ]]; then + echo "Installing GPU enabled PyTorch." + pip3 install "$torch_package" + else + echo "Installing CPU only version of PyTorch." + # networkx is a dependency of PyTorch, but the version of networkx in the PyTorch package + # index requires Python 3.9. networkx needs to be installed in a separate command so that + # pip can get networkx from PyPi. + pip3 install 'networkx~=3.1' + pip3 install "$torch_package" --index-url https://download.pytorch.org/whl/cpu + fi +} + +download_models() { + local models_dir=${1:-/opt/wtp/models} + + if [[ ! $REQUESTS_CA_BUNDLE ]]; then + export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + fi + + echo 'Downloading the xx_sent_ud_sm Spacy model.' + python3 -m spacy download xx_sent_ud_sm + + echo "Downloading the wtp-bert-mini model to $models_dir." + + if ! mkdir --parents "$models_dir"; then + echo "ERROR: Failed to create the $models_dir directory." + exit 3 + fi + + if [[ ! -w "$models_dir" ]]; then + echo -n "ERROR: The model directory, \"$models_dir\" is not writable by the current user. " + echo "The permissions on \"$models_dir\" must be modified." 
+ exit 4 + fi + + local bert_model_dir="$models_dir"/wtp-bert-mini + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download('benjamin/wtp-bert-mini', local_dir='$bert_model_dir')" +} + + +print_usage() { + echo + echo "Usage: +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ]" + exit 1 +} + +main "$@" diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py new file mode 100644 index 0000000..3ae3b93 --- /dev/null +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -0,0 +1,262 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import logging +import os +import pkg_resources + +import spacy +from wtpsplit import WtP +from typing import Callable, List, Optional, Tuple + +from .wtp_lang_settings import WtpLanguageSettings + +import torch + + +DEFAULT_WTP_MODELS = "/opt/wtp/models" + +# If we want to package model installation with this utility in the future: +WTP_MODELS_PATH = pkg_resources.resource_filename( + __name__, "models" +) + +log = logging.getLogger(__name__) + +# These models must have an specified language during sentence splitting. +WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l', + 'wtp-canine-s-3l', + 'wtp-canine-s-6l', + 'wtp-canine-s-9l', + 'wtp-canine-s-12l'] + +GPU_AVAILABLE = False +if torch.cuda.is_available(): + GPU_AVAILABLE = True + + +class TextSplitterModel: + # To hold spaCy, WtP, and other potential sentence detection models in cache + + def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None: + self._model_name = "" + self._default_lang = default_lang + self._mandatory_wtp_language = False + self.split = lambda t, **param: [t] + self.update_model(model_name, model_setting, default_lang) + + def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str="en"): + if model_name: + if "wtp" in model_name: + self._update_wtp_model(model_name, model_setting, default_lang) + self.split = self._split_wtp + log.info(f"Setup WtP model: {model_name}") + else: + self._update_spacy_model(model_name) + self.split = self._split_spacy + log.info(f"Setup spaCy model: {model_name}") + + def _update_wtp_model(self, wtp_model_name: str, + model_setting: str, + default_lang: str) -> None: + + if model_setting == "gpu" or model_setting == "cuda": + if GPU_AVAILABLE: + model_setting = "cuda" + else: + log.warning("PyTorch determined that CUDA is not available. 
" + "You may need to update the NVIDIA driver for the host system, " + "or reinstall PyTorch with GPU support by setting " + "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") + model_setting = "cpu" + elif model_setting != "cpu": + log.warning("Invalid WtP model setting. Only `cpu` and `cuda` " + "(or `gpu`) WtP model options available at this time. " + "Defaulting to `cpu` mode.") + model_setting = "cpu" + + if wtp_model_name in WTP_MANDATORY_ADAPTOR: + self._mandatory_wtp_language = True + self._default_lang = default_lang + + if self._model_name == wtp_model_name: + log.info(f"Using cached model: {self._model_name}") + else: + self._model_name = wtp_model_name + # Check if model has been downloaded + if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)): + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name) + + elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name)): + + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name) + + else: + log.warning(f"Model {wtp_model_name} not found, " + "downloading from hugging face.") + + self.wtp_model = WtP(wtp_model_name) + + if model_setting != "cpu" and model_setting != "cuda": + log.warning(f"Invalid setting for WtP runtime {model_setting}. " + "Defaulting to CPU mode.") + model_setting = "cpu" + self.wtp_model.to(model_setting) + + def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: + if lang: + iso_lang = WtpLanguageSettings.convert_to_iso(lang) + if iso_lang: + return self.wtp_model.split(text, lang_code=iso_lang) + else: + log.warning(f"Language {lang} was not used to train WtP model. " + "If text splitting is not working well with WtP, " + "consider trying spaCy's sentence detection model." + ) + if self._mandatory_wtp_language: + log.warning("WtP model requires a language. 
" + f"Using default language : {self._default_lang}.") + iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang) + return self.wtp_model.split(text, lang_code=iso_lang) + return self.wtp_model.split(text) + + def _update_spacy_model(self, spacy_model_name: str): + self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) + self.spacy_model.enable_pipe("senter") + + def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: We may add an auto model selection for spaCy in the future. + # However, the drawback is we will also need to + # download a large number of spaCy models beforehand. + processed_text = self.spacy_model(text) + return [sent.text_with_ws for sent in processed_text.sents] + +class TextSplitter: + + def __init__( + self, text: str, limit: int, num_boundary_chars: int, + get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None) -> None: + self._sentence_model = sentence_model + self._limit = limit + self._num_boundary_chars = num_boundary_chars + self._get_text_size = get_text_size + self._text = "" + self._text_full_size = 0 + self._overhead_size = 0 + self._soft_limit = self._limit + self._in_lang = in_lang + + if text: + self.set_text(text) + + def set_text(self, text: str): + self._text = text + self._text_full_size = self._get_text_size(text) + chars_per_size = len(text) / self._text_full_size + self._overhead_size = self._get_text_size('') + + self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size + + if self._soft_limit <= 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit + # before applying chars_per_size weighting. 
+ self._soft_limit = max(1, + int((self._limit - self._overhead_size) * chars_per_size)) + + def _isolate_largest_section(self, text:str) -> str: + # Using cached word splitting model, isolate largest section of text + string_length = len(text) + + if self._num_boundary_chars <= 0: + num_chars_to_process = string_length + else: + num_chars_to_process = self._num_boundary_chars + + start_indx = max(0, string_length - num_chars_to_process) + substring = text[start_indx: string_length] + substring_list = self._sentence_model.split(substring, lang = self._in_lang) + div_index = string_length - len(substring_list[-1]) + + if div_index==start_indx: + return text + + return text[0:div_index] + + @classmethod + def split(cls, + text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None + ): + return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split() + + + def _split(self): + if self._text_full_size <= self._limit: + yield self._text + else: + yield from self._split_internal(self._text) + + def _split_internal(self, text): + right = text + while True: + left, right = self._divide(right) + yield left + if not right: + return + + def _divide(self, text) -> Tuple[str, str]: + limit = self._soft_limit + while True: + left = text[:limit] + left_size = self._get_text_size(left) + + if left_size <= self._limit: + if left != text: + # If dividing into two parts + # Determine soft boundary for left segment + left = self._isolate_largest_section(left) + return left, text[len(left):] + + char_per_size = len(left) / left_size + + + limit = int(self._limit * char_per_size) - self._overhead_size + + if limit < 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit before + # applying chars_per_size weighting. 
+ limit = max(1, int((self._limit - self._overhead_size) * char_per_size)) diff --git a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py new file mode 100644 index 0000000..c682fd3 --- /dev/null +++ b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py @@ -0,0 +1,259 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from typing import Optional + +class WtpLanguageSettings: + # Supported languages and ISO 639-1, 639-2 codes for WtP models. 
+ # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages + # https://www.loc.gov/standards/iso639-2/php/code_list.php + _wtp_lang_map = { + 'afrikaans': 'af', + 'afr': 'af', + 'amharic': 'am', + 'amh': 'am', + 'arabic': 'ar', + 'ara': 'ar', + 'azerbaijani': 'az', + 'aze': 'az', + 'belarusian': 'be', + 'bel': 'be', + 'bulgarian': 'bg', + 'bul': 'bg', + 'bengali': 'bn', + 'ben': 'bn', + 'catalan': 'ca', + 'valencian': 'ca', + 'cat': 'ca', + 'cebuano': 'ceb', # In some cases, ISO-639-1 is not available, use ISO-639-2 + 'ceb': 'ceb', + 'czech': 'cs', + 'cze': 'cs', + 'ces': 'cs', + 'welsh': 'cy', + 'wel': 'cy', + 'cym': 'cy', + 'danish': 'da', + 'dan': 'da', + 'german': 'de', + 'ger': 'de', + 'deu': 'de', + 'greek': 'el', + 'gre': 'el', + 'ell': 'el', + 'english': 'en', + 'eng': 'en', + 'esperanto': 'eo', + 'epo': 'eo', + 'spanish': 'es', + 'castilian': 'es', + 'spa': 'es', + 'estonian': 'et', + 'est': 'et', + 'basque': 'eu', + 'baq': 'eu', + 'eus': 'eu', + 'persian': 'fa', + 'per': 'fa', + 'fas': 'fa', + 'finnish': 'fi', + 'fin': 'fi', + 'french': 'fr', + 'fre': 'fr', + 'fra': 'fr', + 'western frisian': 'fy', + 'fry': 'fy', + 'irish': 'ga', + 'gle': 'ga', + 'gaelic': 'gd', + 'scottish gaelic': 'gd', + 'gla': 'gd', + 'galician': 'gl', + 'glg': 'gl', + 'gujarati': 'gu', + 'guj': 'gu', + 'hausa': 'ha', + 'hau': 'ha', + 'hebrew': 'he', + 'heb': 'he', + 'hindi': 'hi', + 'hin': 'hi', + 'hungarian': 'hu', + 'hun': 'hu', + 'armenian': 'hy', + 'arm': 'hy', + 'hye': 'hy', + 'indonesian': 'id', + 'ind': 'id', + 'igbo': 'ig', + 'ibo': 'ig', + 'icelandic': 'is', + 'ice': 'is', + 'isl': 'is', + 'italian': 'it', + 'ita': 'it', + 'japanese': 'ja', + 'jpn': 'ja', + 'javanese': 'jv', + 'jav': 'jv', + 'georgian': 'ka', + 'geo': 'ka', + 'kat': 'ka', + 'kazakh': 'kk', + 'kaz': 'kk', + 'central khmer': 'km', + 'khm': 'km', + 'kannada': 'kn', + 'kan': 'kn', + 'korean': 'ko', + 'kor': 'ko', + 'kurdish': 'ku', + 'kur': 'ku', + 'kirghiz': 'ky', + 'kyrgyz': 'ky', + 
'kir': 'ky', + 'latin': 'la', + 'lat': 'la', + 'lithuanian': 'lt', + 'lit': 'lt', + 'latvian': 'lv', + 'lav': 'lv', + 'malagasy': 'mg', + 'mlg': 'mg', + 'macedonian': 'mk', + 'mac': 'mk', + 'mkd': 'mk', + 'malayalam': 'ml', + 'mal': 'ml', + 'mongolian': 'mn', + 'mon': 'mn', + 'marathi': 'mr', + 'mar': 'mr', + 'malay': 'ms', + 'may': 'ms', + 'msa': 'ms', + 'maltese': 'mt', + 'mlt': 'mt', + 'burmese': 'my', + 'bur': 'my', + 'mya': 'my', + 'nepali': 'ne', + 'nep': 'ne', + 'dutch': 'nl', + 'flemish': 'nl', + 'dut': 'nl', + 'nld': 'nl', + 'norwegian': 'no', + 'nor': 'no', + 'panjabi': 'pa', + 'punjabi': 'pa', + 'pan': 'pa', + 'polish': 'pl', + 'pol': 'pl', + 'pushto': 'ps', + 'pashto': 'ps', + 'pus': 'ps', + 'portuguese': 'pt', + 'por': 'pt', + 'romanian': 'ro', + 'moldavian': 'ro', + 'moldovan': 'ro', + 'rum': 'ro', + 'ron': 'ro', + 'russian': 'ru', + 'rus': 'ru', + 'sinhala': 'si', + 'sinhalese': 'si', + 'sin': 'si', + 'slovak': 'sk', + 'slo': 'sk', + 'slk': 'sk', + 'slovenian': 'sl', + 'slv': 'sl', + 'albanian': 'sq', + 'alb': 'sq', + 'sqi': 'sq', + 'serbian': 'sr', + 'srp': 'sr', + 'swedish': 'sv', + 'swe': 'sv', + 'tamil': 'ta', + 'tam': 'ta', + 'telugu': 'te', + 'tel': 'te', + 'tajik': 'tg', + 'tgk': 'tg', + 'thai': 'th', + 'tha': 'th', + 'turkish': 'tr', + 'tur': 'tr', + 'ukrainian': 'uk', + 'ukr': 'uk', + 'urdu': 'ur', + 'urd': 'ur', + 'uzbek': 'uz', + 'uzb': 'uz', + 'vietnamese': 'vi', + 'vie': 'vi', + 'xhosa': 'xh', + 'xho': 'xh', + 'yiddish': 'yi', + 'yid': 'yi', + 'yoruba': 'yo', + 'yor': 'yo', + 'chinese': 'zh', + 'chi': 'zh', + 'zho': 'zh', + 'zulu': 'zu', + 'zul': 'zu', + 'hans':'zh', # Also check for chinese scripts + 'hant': 'zh', + 'cmn':'zh' # In some cases we use 'cmn' = 'Mandarin' + } + + _wtp_iso_set = set(_wtp_lang_map.values()) + + @classmethod + def convert_to_iso(cls, lang: str) -> Optional[str]: + # ISO 639-2 (language) is sometimes paired with ISO 15924 (script). + # Extract the language portion and check if supported in WtP. 
+ if not lang: + return None + + if '-' in lang: + lang = lang.split('-')[0] + if '_' in lang: + lang = lang.split('_')[0] + + lang = lang.strip().lower() + + if lang in cls._wtp_iso_set: + return lang + + if lang in cls._wtp_lang_map: + return cls._wtp_lang_map[lang] + + return None diff --git a/detection/nlp_text_splitter/pyproject.toml b/detection/nlp_text_splitter/pyproject.toml new file mode 100644 index 0000000..74a21ee --- /dev/null +++ b/detection/nlp_text_splitter/pyproject.toml @@ -0,0 +1,37 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "nlp_text_splitter" +version = "8.0" +dependencies = [ + "spacy>=3.7.4", + "wtpsplit>=1.3.0" +] diff --git a/detection/nlp_text_splitter/tests/test_data/art-of-war.txt b/detection/nlp_text_splitter/tests/test_data/art-of-war.txt new file mode 100644 index 0000000..25e19f5 --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_data/art-of-war.txt @@ -0,0 +1,8 @@ +兵者,國之大事,死生之地,存亡之道,不可不察也。 +故經之以五事,校之以計,而索其情:一曰道,二曰天,三曰地,四曰將,五曰法。道者,令民於上同意,可與之死,可與之生, +而不危也;天者,陰陽、寒暑、時制也;地者,遠近、險易、廣狹、死生也;將者,智、信、仁、勇、嚴也;法者,曲制、官道、 +主用也。凡此五者,將莫不聞,知之者勝,不知之者不勝。故校之以計,而索其情,曰:主孰有道?將孰有能?天地孰得?法令孰行? +兵眾孰強?士卒孰練?賞罰孰明?吾以此知勝負矣。將聽吾計,用之必勝,留之;將不聽吾計,用之必敗,去之。計利以聽,乃為之勢, +以佐其外。勢者,因利而制權也。兵者,詭道也。故能而示之不能,用而示之不用,近而示之遠,遠而示之近。利而誘之,亂而取之, +實而備之,強而避之,怒而撓之,卑而驕之,佚而勞之,親而離之,攻其無備,出其不意。此兵家之勝,不可先傳也。 +夫未戰而廟算勝者,得算多也;未戰而廟算不勝者,得算少也。多算勝少算,而況於無算乎!吾以此觀之,勝負見矣。 diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py new file mode 100644 index 0000000..12d98d0 --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -0,0 +1,91 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import pathlib +import unittest + +from nlp_text_splitter import TextSplitterModel, TextSplitter + + +TEST_DATA = pathlib.Path(__file__).parent / 'test_data' + +class TestTextSplitter(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") + # cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh") + cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") + + def test_split_engine_difference(self): + # Note: Only WtP's multilingual models + # can detect some of '。' characters used for this language. + text = (TEST_DATA / 'art-of-war.txt').read_text() + + text_without_newlines = text.replace('\n', '') + + actual = self.wtp_model._split_wtp(text_without_newlines) + self.assertEqual(3, len(actual)) + for line in actual: + self.assertTrue(line.endswith('。')) + + actual = self.spacy_model._split_spacy(text_without_newlines) + self.assertEqual(1, len(actual)) + + # However, WtP prefers newlines over the '。' character. + actual = self.wtp_model._split_wtp(text) + self.assertEqual(10, len(actual)) + + def test_guess_split_simple_sentence(self): + input_text = 'Hello, what is your name? My name is John.' + actual = list(TextSplitter.split(input_text, + 28, + 28, + len, + self.wtp_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + + # "Hello, what is your name?" + self.assertEqual('Hello, what is your name? 
', actual[0]) + # " My name is John." + self.assertEqual('My name is John.', actual[1]) + + input_text = 'Hello, what is your name? My name is John.' + actual = list(TextSplitter.split(input_text, + 28, + 28, + len, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + + # "Hello, what is your name?" + self.assertEqual('Hello, what is your name? ', actual[0]) + # " My name is John." + self.assertEqual('My name is John.', actual[1]) + +if __name__ == '__main__': + unittest.main(verbosity=2) From eaa59f49604edca2d3a21a2d036601715d250ff1 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 29 Apr 2024 23:18:24 -0400 Subject: [PATCH 02/11] Update install script to include models option. Update License and text splitter test files. --- LICENSE | 60 ++++++++++++- detection/nlp_text_splitter/install.sh | 35 ++++++-- .../tests/test_text_splitter.py | 89 ++++++++++++++++++- 3 files changed, 176 insertions(+), 8 deletions(-) diff --git a/LICENSE b/LICENSE index b7982b1..48665b7 100644 --- a/LICENSE +++ b/LICENSE @@ -17,4 +17,62 @@ This project contains content developed by The MITRE Corporation. If this code is used in a deployment or embedded within another project, it is requested that you send an email to opensource@mitre.org in order to let us know where - this software is being used. \ No newline at end of file + this software is being used. 
+ +***************************************************************************** + +The nlp_text_splitter utlity uses the following sentence detection libraries: + +***************************************************************************** + +The WtP, "Where the Point", sentence segmentation library falls under the MIT License: + +https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE + +MIT License + +Copyright (c) 2024 Benjamin Minixhofer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +***************************************************************************** + +The spaCy Natural Language Processing library falls under the MIT License: + +The MIT License (MIT) + +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index bed0824..ffade85 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -2,11 +2,10 @@ set -o errexit -o pipefail - main() { if ! 
options=$(getopt --name "$0" \ - --options t:gm: \ - --longoptions text-splitter-dir:,gpu,models-dir: \ + --options t:gm:i: \ + --longoptions text-splitter-dir:,gpu,models-dir:,install-models: \ -- "$@"); then print_usage fi @@ -24,6 +23,10 @@ main() { shift local models_dir=$1; ;; + --install-models | -i ) + shift + local inst_models=$1; + ;; -- ) shift break @@ -34,7 +37,7 @@ main() { install_text_splitter "$text_splitter_dir" install_py_torch "$gpu_enabled" - download_models "$models_dir" + download_models "$models_dir" "$inst_models" } @@ -67,12 +70,13 @@ install_py_torch() { download_models() { local models_dir=${1:-/opt/wtp/models} + local model_names=$2 if [[ ! $REQUESTS_CA_BUNDLE ]]; then export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt fi - echo 'Downloading the xx_sent_ud_sm Spacy model.' + echo 'Downloading the xx_sent_ud_sm spaCy model.' python3 -m spacy download xx_sent_ud_sm echo "Downloading the wtp-bert-mini model to $models_dir." @@ -92,13 +96,32 @@ download_models() { python3 -c \ "from huggingface_hub import snapshot_download; \ snapshot_download('benjamin/wtp-bert-mini', local_dir='$bert_model_dir')" + + # Download additional models of interest specified by user. + if [[ -n "$model_names" ]]; then + for i in $(echo $model_names | sed "s/,/ /g") + do + local model_name=$i + if [[ $model_name =~ "wtp" ]]; then + echo "Downloading the $model_name WtP model." + bert_model_dir="$models_dir"/"$model_name" + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download('benjamin/wtp-bert-mini', local_dir='$bert_model_dir')" + else + echo "Downloading the $model_name spaCy model." 
+ python3 -m spacy download $model_name + fi + + done + fi } print_usage() { echo echo "Usage: -$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ]" +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-models|-i ]" exit 1 } diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index 12d98d0..8991b05 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -36,7 +36,7 @@ class TestTextSplitter(unittest.TestCase): @classmethod def setUpClass(cls): cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") - # cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh") + cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh") cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") def test_split_engine_difference(self): @@ -87,5 +87,92 @@ def test_guess_split_simple_sentence(self): # " My name is John." self.assertEqual('My name is John.', actual[1]) + def test_split_sentence_end_punctuation(self): + input_text = 'Hello. How are you? asdfasdf' + actual = list(TextSplitter.split(input_text, + 20, + 10, + len, + self.wtp_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + + self.assertEqual('Hello. How are you? ', actual[0]) + self.assertEqual('asdfasdf', actual[1]) + + actual = list(TextSplitter.split(input_text, + 20, + 10, + len, + self.spacy_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + + self.assertEqual('Hello. How are you? 
', actual[0]) + self.assertEqual('asdfasdf', actual[1]) + + + def test_split_wtp_basic(self): + text = (TEST_DATA / 'art-of-war.txt').read_text().replace('\n','') + actual = list(TextSplitter.split(text, + 150, + 150, + len, + self.wtp_model)) + + self.assertEqual(4, len(actual)) + + expected_chunk_lengths = [86, 116, 104, 114] + self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n',''))) + + self.assertTrue(actual[0].startswith('兵者,')) + self.assertTrue(actual[0].endswith('而不危也;')) + self.assertEqual(expected_chunk_lengths[0], len(actual[0])) + + self.assertTrue(actual[1].startswith('天者,陰陽')) + self.assertTrue(actual[1].endswith('兵眾孰強?')) + self.assertEqual(expected_chunk_lengths[1], len(actual[1])) + + self.assertTrue(actual[2].startswith('士卒孰練?')) + self.assertTrue(actual[2].endswith('遠而示之近。')) + self.assertEqual(expected_chunk_lengths[2], len(actual[2])) + + self.assertTrue(actual[3].startswith('利而誘之,')) + self.assertTrue(actual[3].endswith('勝負見矣。')) + self.assertEqual(expected_chunk_lengths[3], len(actual[3])) + + def test_split_wtp_advanced(self): + text = (TEST_DATA / 'art-of-war.txt').read_text().replace('\n','') + actual = list(TextSplitter.split(text, + 150, + 150, + len, + self.wtp_adv_model)) + + print(actual) + self.assertEqual(4, len(actual)) + + expected_chunk_lengths = [61, 150, 61, 148] + self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n',''))) + + self.assertTrue(actual[0].startswith('兵者,')) + self.assertTrue(actual[0].endswith('四曰將,五曰法。')) + self.assertEqual(expected_chunk_lengths[0], len(actual[0])) + + self.assertTrue(actual[1].startswith('道者,令民於上同意')) + self.assertTrue(actual[1].endswith('賞罰孰明')) + self.assertEqual(expected_chunk_lengths[1], len(actual[1])) + + self.assertTrue(actual[2].startswith('?吾以此知勝')) + self.assertTrue(actual[2].endswith('因利而制權也。')) + self.assertEqual(expected_chunk_lengths[2], len(actual[2])) + + self.assertTrue(actual[3].startswith('兵者,詭道也。')) + 
self.assertTrue(actual[3].endswith('之,勝負見矣。')) + self.assertEqual(expected_chunk_lengths[3], len(actual[3])) + + if __name__ == '__main__': unittest.main(verbosity=2) From 5c9a07e904176aaeb1de22b7282a9e620ad28f6c Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 30 Apr 2024 02:06:00 -0400 Subject: [PATCH 03/11] Cleanup install script and unit test. --- detection/nlp_text_splitter/install.sh | 60 +++++++++++++------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index ffade85..67def9f 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -68,19 +68,42 @@ install_py_torch() { fi } +download_model_names() { + local models_dir=$1 + local model_names=$2 + + if [[ -n "$models_dir" ]] && [[ -n "$model_names" ]] ; then + for i in $(echo $model_names | sed "s/,/ /g") + do + local model_name=$i + if [[ $model_name =~ "wtp" ]]; then + echo "Downloading the $model_name model to $models_dir." + bert_model_dir="$models_dir"/"$model_name" + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download('benjamin/$model_name', local_dir='$bert_model_dir')" + else + echo "Downloading the $model_name spaCy model." + python3 -m spacy download $model_name + fi + done + fi +} + + download_models() { local models_dir=${1:-/opt/wtp/models} - local model_names=$2 + + if [[ -n "$2" ]]; then + local model_names='xx_sent_ud_sm,wtp-bert-mini,'$2 + else + local model_names='xx_sent_ud_sm,wtp-bert-mini' + fi if [[ ! $REQUESTS_CA_BUNDLE ]]; then export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt fi - echo 'Downloading the xx_sent_ud_sm spaCy model.' - python3 -m spacy download xx_sent_ud_sm - - echo "Downloading the wtp-bert-mini model to $models_dir." - if ! mkdir --parents "$models_dir"; then echo "ERROR: Failed to create the $models_dir directory." 
exit 3 @@ -92,29 +115,8 @@ download_models() { exit 4 fi - local bert_model_dir="$models_dir"/wtp-bert-mini - python3 -c \ - "from huggingface_hub import snapshot_download; \ - snapshot_download('benjamin/wtp-bert-mini', local_dir='$bert_model_dir')" - - # Download additional models of interest specified by user. - if [[ -n "$model_names" ]]; then - for i in $(echo $model_names | sed "s/,/ /g") - do - local model_name=$i - if [[ $model_name =~ "wtp" ]]; then - echo "Downloading the $model_name WtP model." - bert_model_dir="$models_dir"/"$model_name" - python3 -c \ - "from huggingface_hub import snapshot_download; \ - snapshot_download('benjamin/wtp-bert-mini', local_dir='$bert_model_dir')" - else - echo "Downloading the $model_name spaCy model." - python3 -m spacy download $model_name - fi - - done - fi + # Download models of interest specified by user. + download_model_names "$models_dir" "$model_names" } From f04d049ed5c5da1e4826da2c920f892106981fd5 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 30 Apr 2024 04:21:17 -0400 Subject: [PATCH 04/11] Updating documentation for text splitter utility. --- detection/nlp_text_splitter/README.md | 49 +++++++++++++++++++ .../nlp_text_splitter/tests/test_data/NOTICE | 4 ++ .../tests/test_text_splitter.py | 1 - 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 detection/nlp_text_splitter/README.md create mode 100644 detection/nlp_text_splitter/tests/test_data/NOTICE diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md new file mode 100644 index 0000000..888175e --- /dev/null +++ b/detection/nlp_text_splitter/README.md @@ -0,0 +1,49 @@ +# Overview + +This directory contains the source code, test examples, and installation script +for the MPF NlpTextSplitter tool, which uses WtP and spaCy libraries +to detect sentences in a given chunk of text. 
+ +## Background + +Our primary motivation for creating this tool was to find a lightweight, accurate +sentence detection capability to support a large variety of text processing tasks +including translation and tagging. + +Through preliminary investigation, we identified the [WtP library ("Where's the +Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence +detection model](https://spacy.io/models) for identifying sentence breaks +in a large section of text. + +WtP models are trained to split up multilingual text by sentence without the need of an +input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 +GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection +that appears to work better for splitting up English text in certain cases, unfortunately +this model lacks support handling for Chinese punctuation. + + +## Installation of NlpTextSplitter: GPU vs CPU modes + +To install this tool users will need to run: + +`./install.sh` - Which will setup a CPU-only PyTorch installation. + +Please note that several customizations are supported: + +- `--text-splitter-dir|-t ` This parameter specifies where the + source code is located relative to the installation script. In general, + since the installation script and source code are both located here, it's not + necessary to update this parameter unless the user is running the `install.sh` + script from a different directory. + +- `--gpu ` - Add this parameter to the installation command line above to + setup a PyTorch installation with CUDA (GPU) libraries. + +- `--models-dir|-m ` - Add this parameter to + change the default WtP model installation directory + (default: `/opt/wtp/models`). + +- `--install-models|-i ` - Add this parameter to + specify a comma-separated list of WtP and spaCy models for + installation. Please note, no spaces should be added + between model names (i.e. `model_1,model_2,etc.`). 
diff --git a/detection/nlp_text_splitter/tests/test_data/NOTICE b/detection/nlp_text_splitter/tests/test_data/NOTICE new file mode 100644 index 0000000..0e3ac4d --- /dev/null +++ b/detection/nlp_text_splitter/tests/test_data/NOTICE @@ -0,0 +1,4 @@ +# art-of-war.txt +Contains the beginning of "The Art of War" by Sunzi in Traditional Chinese. +Public Domain +https://www.gutenberg.org/ebooks/12407 \ No newline at end of file diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index 8991b05..0bd2d84 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -151,7 +151,6 @@ def test_split_wtp_advanced(self): len, self.wtp_adv_model)) - print(actual) self.assertEqual(4, len(actual)) expected_chunk_lengths = [61, 150, 61, 148] From 8638c86958167717d619a7a9f297c8a6d20b685a Mon Sep 17 00:00:00 2001 From: Brian Rosenberg Date: Wed, 1 May 2024 11:20:07 -0400 Subject: [PATCH 05/11] Improve parameter parsing in install script --- detection/nlp_text_splitter/install.sh | 86 +++++++++++++++----------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index 67def9f..e67c061 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -4,12 +4,15 @@ set -o errexit -o pipefail main() { if ! 
options=$(getopt --name "$0" \ - --options t:gm:i: \ - --longoptions text-splitter-dir:,gpu,models-dir:,install-models: \ + --options t:gm:w:b: \ + --longoptions text-splitter-dir:,gpu,models-dir:,install-wtp-models:,install-spacy-models: \ -- "$@"); then print_usage fi eval set -- "$options" + local models_dir=/opt/wtp/models + local wtp_models=("wtp-bert-mini") + local spacy_models=("xx_sent_ud_sm") while true; do case "$1" in --text-splitter-dir | -t ) @@ -21,11 +24,15 @@ main() { ;; --models-dir | -m ) shift - local models_dir=$1; + models_dir=$1; ;; - --install-models | -i ) + --install-wtp-models | -w ) shift - local inst_models=$1; + wtp_models+=("$1") + ;; + --install-spacy-models | -s ) + shift + spacy_models+=("$1") ;; -- ) shift @@ -37,7 +44,8 @@ main() { install_text_splitter "$text_splitter_dir" install_py_torch "$gpu_enabled" - download_models "$models_dir" "$inst_models" + download_wtp_models "$models_dir" "${wtp_models[@]}" + download_spacy_models "${spacy_models[@]}" } @@ -68,37 +76,24 @@ install_py_torch() { fi } -download_model_names() { + +download_wtp_models() { local models_dir=$1 - local model_names=$2 - - if [[ -n "$models_dir" ]] && [[ -n "$model_names" ]] ; then - for i in $(echo $model_names | sed "s/,/ /g") - do - local model_name=$i - if [[ $model_name =~ "wtp" ]]; then - echo "Downloading the $model_name model to $models_dir." - bert_model_dir="$models_dir"/"$model_name" - python3 -c \ - "from huggingface_hub import snapshot_download; \ - snapshot_download('benjamin/$model_name', local_dir='$bert_model_dir')" - else - echo "Downloading the $model_name spaCy model." - python3 -m spacy download $model_name - fi - done - fi + shift + local model_names=("$@") + setup_models_dir "$models_dir" + + for model_name in "${model_names[@]}"; do + echo "Downloading the $model_name model to $models_dir." 
+ local wtp_model_dir="$models_dir/$model_name" + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')" + done } - -download_models() { - local models_dir=${1:-/opt/wtp/models} - - if [[ -n "$2" ]]; then - local model_names='xx_sent_ud_sm,wtp-bert-mini,'$2 - else - local model_names='xx_sent_ud_sm,wtp-bert-mini' - fi +setup_models_dir() { + local models_dir=$1 if [[ ! $REQUESTS_CA_BUNDLE ]]; then export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt @@ -114,16 +109,33 @@ download_models() { echo "The permissions on \"$models_dir\" must be modified." exit 4 fi +} - # Download models of interest specified by user. - download_model_names "$models_dir" "$model_names" +download_spacy_models() { + for model_name in "$@"; do + echo "Downloading the $model_name spaCy model." + python3 -m spacy download "$model_name" + done } print_usage() { echo echo "Usage: -$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-models|-i ]" +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-wtp-models|-w ]* [--install-spacy-models,|-s ]* +Options + --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the + same directory as this script) + --gpu, -g: Install the GPU version of PyTorch + --models-dir, -m : Path where WTP models will be stored. + (defaults to /opt/wtp/models) + --install-wtp-models, -w : Names of WTP models to install in addtion to wtp-bert-mini. + This option can be provided more than once to specify + multiple models. + --install-spacy-models | -s : Names of spaCy models to install in addtion to + xx_sent_ud_sm. The option can be provided more than once + to specify multiple models. 
+" exit 1 } From 41640ef18136c086c9bfeb687e60763e908ea7d2 Mon Sep 17 00:00:00 2001 From: Brian Rosenberg Date: Wed, 1 May 2024 11:58:09 -0400 Subject: [PATCH 06/11] Fix pluralization --- detection/nlp_text_splitter/README.md | 11 ++++++---- detection/nlp_text_splitter/install.sh | 30 +++++++++++++------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md index 888175e..71a145f 100644 --- a/detection/nlp_text_splitter/README.md +++ b/detection/nlp_text_splitter/README.md @@ -43,7 +43,10 @@ Please note that several customizations are supported: change the default WtP model installation directory (default: `/opt/wtp/models`). -- `--install-models|-i ` - Add this parameter to - specify a comma-separated list of WtP and spaCy models for - installation. Please note, no spaces should be added - between model names (i.e. `model_1,model_2,etc.`). +- `--install-wtp-model|-w :` - Add this parameter to specify + additional WTP models for installation. This parameter can be provided + multiple times to install more than one model. + +- `--install-spacy-model|-s :` - Add this parameter to specify + additional spaCy models for installation. This parameter can be provided + multiple times to install more than one model. diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index e67c061..e9b16d5 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -5,7 +5,7 @@ set -o errexit -o pipefail main() { if ! 
options=$(getopt --name "$0" \ --options t:gm:w:b: \ - --longoptions text-splitter-dir:,gpu,models-dir:,install-wtp-models:,install-spacy-models: \ + --longoptions text-splitter-dir:,gpu,models-dir:,install-wtp-model:,install-spacy-model: \ -- "$@"); then print_usage fi @@ -26,11 +26,11 @@ main() { shift models_dir=$1; ;; - --install-wtp-models | -w ) + --install-wtp-model | -w ) shift wtp_models+=("$1") ;; - --install-spacy-models | -s ) + --install-spacy-model | -s ) shift spacy_models+=("$1") ;; @@ -122,19 +122,19 @@ download_spacy_models() { print_usage() { echo echo "Usage: -$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-wtp-models|-w ]* [--install-spacy-models,|-s ]* +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-wtp-model|-w ]* [--install-spacy-model|-s ]* Options - --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the - same directory as this script) - --gpu, -g: Install the GPU version of PyTorch - --models-dir, -m : Path where WTP models will be stored. - (defaults to /opt/wtp/models) - --install-wtp-models, -w : Names of WTP models to install in addtion to wtp-bert-mini. - This option can be provided more than once to specify - multiple models. - --install-spacy-models | -s : Names of spaCy models to install in addtion to - xx_sent_ud_sm. The option can be provided more than once - to specify multiple models. + --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the + same directory as this script) + --gpu, -g: Install the GPU version of PyTorch + --models-dir, -m : Path where WTP models will be stored. + (defaults to /opt/wtp/models) + --install-wtp-model, -w : Name of a WTP model to install in addtion to wtp-bert-mini. + This option can be provided more than once to specify + multiple models. + --install-spacy-model | -s : Names of a spaCy model to install in addtion to + xx_sent_ud_sm. 
The option can be provided more than once + to specify multiple models. " exit 1 } From 1d8463036a5b0af413aa1313eba9b4ab5ba360c0 Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 1 May 2024 14:04:15 -0400 Subject: [PATCH 07/11] Improve README formatting. --- detection/nlp_text_splitter/README.md | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md index 71a145f..84d01d8 100644 --- a/detection/nlp_text_splitter/README.md +++ b/detection/nlp_text_splitter/README.md @@ -1,10 +1,10 @@ # Overview This directory contains the source code, test examples, and installation script -for the MPF NlpTextSplitter tool, which uses WtP and spaCy libraries +for the OpenMPF NlpTextSplitter tool, which uses WtP and spaCy libraries to detect sentences in a given chunk of text. -## Background +# Background Our primary motivation for creating this tool was to find a lightweight, accurate sentence detection capability to support a large variety of text processing tasks @@ -18,35 +18,33 @@ in a large section of text. WtP models are trained to split up multilingual text by sentence without the need of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection -that appears to work better for splitting up English text in certain cases, unfortunately +that appears to work better for splitting up English text in certain cases. Unfortunately this model lacks support handling for Chinese punctuation. +# Installation -## Installation of NlpTextSplitter: GPU vs CPU modes - -To install this tool users will need to run: - -`./install.sh` - Which will setup a CPU-only PyTorch installation. +To install this tool users will need to run `./install.sh`. By default this will set up a +CPU-only PyTorch installation. 
Please note that several customizations are supported: -- `--text-splitter-dir|-t ` This parameter specifies where the +- `--text-splitter-dir|-t `: This parameter specifies where the source code is located relative to the installation script. In general, since the installation script and source code are both located here, it's not necessary to update this parameter unless the user is running the `install.sh` script from a different directory. -- `--gpu ` - Add this parameter to the installation command line above to +- `--gpu`: Add this parameter to the installation command line above to setup a PyTorch installation with CUDA (GPU) libraries. -- `--models-dir|-m ` - Add this parameter to +- `--models-dir|-m `: Add this parameter to change the default WtP model installation directory (default: `/opt/wtp/models`). -- `--install-wtp-model|-w :` - Add this parameter to specify +- `--install-wtp-model|-w `: Add this parameter to specify additional WTP models for installation. This parameter can be provided multiple times to install more than one model. -- `--install-spacy-model|-s :` - Add this parameter to specify +- `--install-spacy-model|-s `: Add this parameter to specify additional spaCy models for installation. This parameter can be provided multiple times to install more than one model. From b0350fcade94d11e25c8c8503b11cce843e66c9f Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 2 May 2024 11:17:16 -0400 Subject: [PATCH 08/11] Updates to installation script, README. fix spacy model option. 
--- detection/nlp_text_splitter/README.md | 2 +- detection/nlp_text_splitter/install.sh | 64 +++++++++++++------ detection/nlp_text_splitter/pyproject.toml | 4 +- .../tests/test_text_splitter.py | 40 +++++++++++- 4 files changed, 86 insertions(+), 24 deletions(-) diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md index 84d01d8..06bbbfc 100644 --- a/detection/nlp_text_splitter/README.md +++ b/detection/nlp_text_splitter/README.md @@ -37,7 +37,7 @@ Please note that several customizations are supported: - `--gpu`: Add this parameter to the installation command line above to setup a PyTorch installation with CUDA (GPU) libraries. -- `--models-dir|-m `: Add this parameter to +- `--wtp-models-dir|-m `: Add this parameter to change the default WtP model installation directory (default: `/opt/wtp/models`). diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index e9b16d5..02024b0 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -1,16 +1,42 @@ #!/usr/bin/env bash +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + set -o errexit -o pipefail main() { if ! options=$(getopt --name "$0" \ - --options t:gm:w:b: \ - --longoptions text-splitter-dir:,gpu,models-dir:,install-wtp-model:,install-spacy-model: \ + --options t:gm:w:s: \ + --longoptions text-splitter-dir:,gpu,wtp-models-dir :,install-wtp-model:,install-spacy-model: \ -- "$@"); then print_usage fi eval set -- "$options" - local models_dir=/opt/wtp/models + local wtp_models_dir =/opt/wtp/models local wtp_models=("wtp-bert-mini") local spacy_models=("xx_sent_ud_sm") while true; do @@ -22,9 +48,9 @@ main() { --gpu | -g ) local gpu_enabled=true ;; - --models-dir | -m ) + --wtp-models-dir | -m ) shift - models_dir=$1; + wtp_models_dir =$1; ;; --install-wtp-model | -w ) shift @@ -44,7 +70,7 @@ main() { install_text_splitter "$text_splitter_dir" install_py_torch "$gpu_enabled" - download_wtp_models "$models_dir" "${wtp_models[@]}" + download_wtp_models "$wtp_models_dir " "${wtp_models[@]}" download_spacy_models "${spacy_models[@]}" } @@ -78,35 +104,35 @@ install_py_torch() { download_wtp_models() { - local models_dir=$1 + local wtp_models_dir =$1 shift local model_names=("$@") - setup_models_dir "$models_dir" + setup_wtp_models_dir "$wtp_models_dir " for model_name in "${model_names[@]}"; do - echo "Downloading the $model_name model to $models_dir." - local wtp_model_dir="$models_dir/$model_name" + echo "Downloading the $model_name model to $wtp_models_dir ." 
+ local wtp_model_dir="$wtp_models_dir /$model_name" python3 -c \ "from huggingface_hub import snapshot_download; \ snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')" done } -setup_models_dir() { - local models_dir=$1 +setup_wtp_models_dir () { + local wtp_models_dir =$1 if [[ ! $REQUESTS_CA_BUNDLE ]]; then export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt fi - if ! mkdir --parents "$models_dir"; then - echo "ERROR: Failed to create the $models_dir directory." + if ! mkdir --parents "$wtp_models_dir "; then + echo "ERROR: Failed to create the $wtp_models_dir directory." exit 3 fi - if [[ ! -w "$models_dir" ]]; then - echo -n "ERROR: The model directory, \"$models_dir\" is not writable by the current user. " - echo "The permissions on \"$models_dir\" must be modified." + if [[ ! -w "$wtp_models_dir " ]]; then + echo -n "ERROR: The model directory, \"$wtp_models_dir \" is not writable by the current user. " + echo "The permissions on \"$wtp_models_dir \" must be modified." exit 4 fi } @@ -122,12 +148,12 @@ download_spacy_models() { print_usage() { echo echo "Usage: -$0 [--text-splitter-dir|-t ] [--gpu|-g] [--models-dir|-m ] [--install-wtp-model|-w ]* [--install-spacy-model|-s ]* +$0 [--text-splitter-dir|-t ] [--gpu|-g] [--wtp-models-dir|-m ] [--install-wtp-model|-w ]* [--install-spacy-model|-s ]* Options --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the same directory as this script) --gpu, -g: Install the GPU version of PyTorch - --models-dir, -m : Path where WTP models will be stored. + --wtp-models-dir, -m : Path where WTP models will be stored. (defaults to /opt/wtp/models) --install-wtp-model, -w : Name of a WTP model to install in addtion to wtp-bert-mini. 
This option can be provided more than once to specify diff --git a/detection/nlp_text_splitter/pyproject.toml b/detection/nlp_text_splitter/pyproject.toml index 74a21ee..31583a6 100644 --- a/detection/nlp_text_splitter/pyproject.toml +++ b/detection/nlp_text_splitter/pyproject.toml @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index 0bd2d84..9782870 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. 
# @@ -114,6 +114,42 @@ def test_split_sentence_end_punctuation(self): self.assertEqual('asdfasdf', actual[1]) + def test_guess_split_edge_cases(self): + input_text = ("This is a sentence (Dr.Test). Is this," + " a sentence as well? Maybe...maybe not?" + " \n All done, I think!") + + # Split using WtP model. + actual = list(TextSplitter.split(input_text, + 30, + 30, + len, + self.wtp_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(4, len(actual)) + + # WtP should detect and split out each sentence + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + actual = list(TextSplitter.split(input_text, + 35, + 35, + len, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(4, len(actual)) + + # Split using spaCy model. + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? 
\n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + def test_split_wtp_basic(self): text = (TEST_DATA / 'art-of-war.txt').read_text().replace('\n','') actual = list(TextSplitter.split(text, From ea5e488c59d8f6204cf0b110392fa56e3dc31615 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 2 May 2024 13:06:56 -0400 Subject: [PATCH 09/11] minor bugfix --- detection/nlp_text_splitter/install.sh | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index 02024b0..2abbfc8 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -36,7 +36,7 @@ main() { print_usage fi eval set -- "$options" - local wtp_models_dir =/opt/wtp/models + local wtp_models_dir=/opt/wtp/models local wtp_models=("wtp-bert-mini") local spacy_models=("xx_sent_ud_sm") while true; do @@ -50,7 +50,7 @@ main() { ;; --wtp-models-dir | -m ) shift - wtp_models_dir =$1; + wtp_models_dir=$1; ;; --install-wtp-model | -w ) shift @@ -70,7 +70,7 @@ main() { install_text_splitter "$text_splitter_dir" install_py_torch "$gpu_enabled" - download_wtp_models "$wtp_models_dir " "${wtp_models[@]}" + download_wtp_models "$wtp_models_dir" "${wtp_models[@]}" download_spacy_models "${spacy_models[@]}" } @@ -104,35 +104,35 @@ install_py_torch() { download_wtp_models() { - local wtp_models_dir =$1 + local wtp_models_dir=$1 shift local model_names=("$@") - setup_wtp_models_dir "$wtp_models_dir " + setup_wtp_models_dir "$wtp_models_dir" for model_name in "${model_names[@]}"; do - echo "Downloading the $model_name model to $wtp_models_dir ." - local wtp_model_dir="$wtp_models_dir /$model_name" + echo "Downloading the $model_name model to $wtp_models_dir." 
+ local wtp_model_dir="$wtp_models_dir/$model_name" python3 -c \ "from huggingface_hub import snapshot_download; \ snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')" done } -setup_wtp_models_dir () { - local wtp_models_dir =$1 +setup_wtp_models_dir() { + local wtp_models_dir=$1 if [[ ! $REQUESTS_CA_BUNDLE ]]; then export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt fi - if ! mkdir --parents "$wtp_models_dir "; then - echo "ERROR: Failed to create the $wtp_models_dir directory." + if ! mkdir --parents "$wtp_models_dir"; then + echo "ERROR: Failed to create the $wtp_models_dir directory." exit 3 fi - if [[ ! -w "$wtp_models_dir " ]]; then - echo -n "ERROR: The model directory, \"$wtp_models_dir \" is not writable by the current user. " - echo "The permissions on \"$wtp_models_dir \" must be modified." + if [[ ! -w "$wtp_models_dir" ]]; then + echo -n "ERROR: The model directory, \"$wtp_models_dir\" is not writable by the current user. " + echo "The permissions on \"$wtp_models_dir\" must be modified." exit 4 fi } From 220f3d0175bb40cefaf8a64458810b8fac3ded35 Mon Sep 17 00:00:00 2001 From: jrobble Date: Fri, 3 May 2024 17:44:11 -0400 Subject: [PATCH 10/11] Remove space. --- detection/nlp_text_splitter/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index 2abbfc8..45a4dbf 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -31,7 +31,7 @@ set -o errexit -o pipefail main() { if ! 
options=$(getopt --name "$0" \ --options t:gm:w:s: \ - --longoptions text-splitter-dir:,gpu,wtp-models-dir :,install-wtp-model:,install-spacy-model: \ + --longoptions text-splitter-dir:,gpu,wtp-models-dir:,install-wtp-model:,install-spacy-model: \ -- "$@"); then print_usage fi From 427e4b0ab2e3a00e48a8ee14e185586dc21fae4a Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 6 May 2024 03:11:43 -0400 Subject: [PATCH 11/11] Fix caching behavior for initialized models. --- detection/nlp_text_splitter/nlp_text_splitter/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index 3ae3b93..f083a5f 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -63,6 +63,7 @@ class TextSplitterModel: def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None: self._model_name = "" + self._model_setting = "" self._default_lang = default_lang self._mandatory_wtp_language = False self.split = lambda t, **param: [t] @@ -102,9 +103,11 @@ def _update_wtp_model(self, wtp_model_name: str, self._mandatory_wtp_language = True self._default_lang = default_lang - if self._model_name == wtp_model_name: - log.info(f"Using cached model: {self._model_name}") + if self._model_name == wtp_model_name and self._model_setting == model_setting: + log.info(f"Using cached model, running on {self._model_setting}: " + f"{self._model_name}") else: + self._model_setting = model_setting self._model_name = wtp_model_name # Check if model has been downloaded if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)):