From a097cd8b8cf7c4dae65aec7bf30ff30167bda266 Mon Sep 17 00:00:00 2001 From: shademe Date: Mon, 17 Apr 2023 12:58:33 +0200 Subject: [PATCH 1/5] Pin vectors to the CPU after deserialization --- sense2vec/sense2vec.py | 19 +++++++++++++++++-- sense2vec/tests/test_issue155.py | 11 +++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 sense2vec/tests/test_issue155.py diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index bb157f5..0b35849 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -3,6 +3,7 @@ from spacy.vectors import Vectors from spacy.strings import StringStore from spacy.util import SimpleFrozenDict +from thinc.api import NumpyOps import numpy import srsly @@ -247,7 +248,11 @@ def get_other_senses( result = [] key = key if isinstance(key, str) else self.strings[key] word, orig_sense = self.split_key(key) - versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] + versions = ( + set([word, word.lower(), word.upper(), word.title()]) + if ignore_case + else [word] + ) for text in versions: for sense in self.senses: new_key = self.make_key(text, sense) @@ -270,7 +275,11 @@ def get_best_sense( sense_options = senses or self.senses if not sense_options: return None - versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] + versions = ( + set([word, word.lower(), word.upper(), word.title()]) + if ignore_case + else [word] + ) freqs = [] for text in versions: for sense in sense_options: @@ -304,6 +313,9 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """ data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) + # Pin vectors to the CPU so that we don't get up comparing + # numpy and cupy arrays. + self.vectors.to_ops(NumpyOps()) self.freqs = dict(data.get("freqs", [])) self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: @@ -340,6 +352,9 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): freqs_path = path / "freqs.json" cache_path = path / "cache" self.vectors = Vectors().from_disk(path) + # Pin vectors to the CPU so that we don't get up comparing + # numpy and cupy arrays. + self.vectors.to_ops(NumpyOps()) self.cfg.update(srsly.read_json(path / "cfg")) if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) diff --git a/sense2vec/tests/test_issue155.py b/sense2vec/tests/test_issue155.py new file mode 100644 index 0000000..b6df403 --- /dev/null +++ b/sense2vec/tests/test_issue155.py @@ -0,0 +1,11 @@ +from pathlib import Path +import spacy +from sense2vec.sense2vec import Sense2Vec + + +def test_issue155(): + data_path = Path(__file__).parent / "data" + spacy.require_gpu() + + s2v = Sense2Vec().from_disk(data_path) + s2v.most_similar("beekeepers|NOUN") From 6f8989846f7e5c32ff07d0959391b61c6f38c8ad Mon Sep 17 00:00:00 2001 From: shademe Date: Mon, 17 Apr 2023 13:01:26 +0200 Subject: [PATCH 2/5] Restore CPU ops after regression test --- sense2vec/tests/test_issue155.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sense2vec/tests/test_issue155.py b/sense2vec/tests/test_issue155.py index b6df403..020f08f 100644 --- a/sense2vec/tests/test_issue155.py +++ b/sense2vec/tests/test_issue155.py @@ -9,3 +9,6 @@ def test_issue155(): s2v = Sense2Vec().from_disk(data_path) s2v.most_similar("beekeepers|NOUN") + + # Restore CPU ops for the rest of the session + spacy.require_cpu() From 25c0c544540a269e11d4895a2603096b5d5a9746 Mon Sep 17 00:00:00 2001 From: shademe Date: Mon, 17 Apr 2023 13:04:49 +0200 Subject: [PATCH 3/5] Skip test if GPU support is not present --- sense2vec/tests/test_issue155.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sense2vec/tests/test_issue155.py b/sense2vec/tests/test_issue155.py index 020f08f..af635e7 100644 --- a/sense2vec/tests/test_issue155.py +++ b/sense2vec/tests/test_issue155.py @@ -1,8 +1,11 @@ from pathlib import Path +import pytest import spacy from sense2vec.sense2vec import Sense2Vec +from thinc.util import has_cupy_gpu +@pytest.mark.skipif(not has_cupy_gpu, reason="requires Cupy/GPU") def test_issue155(): data_path = Path(__file__).parent / "data" spacy.require_gpu() From 995820c2f911f4aebaf197d1122fe1f16c5784c5 Mon Sep 17 00:00:00 2001 From: shademe Date: Mon, 17 Apr 2023 13:42:59 +0200 Subject: [PATCH 4/5] Use `use_ops` context manager in test --- sense2vec/tests/test_issue155.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sense2vec/tests/test_issue155.py b/sense2vec/tests/test_issue155.py index af635e7..546734d 100644 --- a/sense2vec/tests/test_issue155.py +++ b/sense2vec/tests/test_issue155.py @@ -1,17 +1,13 @@ from pathlib import Path import pytest -import spacy from sense2vec.sense2vec import Sense2Vec +from thinc.api import use_ops from thinc.util import has_cupy_gpu @pytest.mark.skipif(not has_cupy_gpu, reason="requires Cupy/GPU") def test_issue155(): data_path = Path(__file__).parent / "data" - spacy.require_gpu() - - s2v = Sense2Vec().from_disk(data_path) - s2v.most_similar("beekeepers|NOUN") - - # Restore CPU ops for the rest of the session - spacy.require_cpu() + with use_ops("cupy"): + s2v = Sense2Vec().from_disk(data_path) + s2v.most_similar("beekeepers|NOUN") From db14ddc41a4860ba84910bb4f74536e3fc4c1a93 Mon Sep 17 00:00:00 2001 From: shademe Date: Mon, 17 Apr 2023 13:44:21 +0200 Subject: [PATCH 5/5] Typo --- sense2vec/sense2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 0b35849..1e1cf8f 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -313,7 +313,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """ data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) - # Pin vectors to the CPU so that we don't get up comparing + # Pin vectors to the CPU so that we don't end up comparing # numpy and cupy arrays. self.vectors.to_ops(NumpyOps()) self.freqs = dict(data.get("freqs", [])) @@ -352,7 +352,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): freqs_path = path / "freqs.json" cache_path = path / "cache" self.vectors = Vectors().from_disk(path) - # Pin vectors to the CPU so that we don't get up comparing + # Pin vectors to the CPU so that we don't end up comparing # numpy and cupy arrays. self.vectors.to_ops(NumpyOps()) self.cfg.update(srsly.read_json(path / "cfg"))