diff --git a/minette/tagger/base.py b/minette/tagger/base.py index 0b5fef0..befac61 100644 --- a/minette/tagger/base.py +++ b/minette/tagger/base.py @@ -42,7 +42,23 @@ def parse(self, text): Returns ------- - words : list of minette.WordNode + words : list of minette.WordNode (empty) Word nodes """ return [] + + def parse_as_generator(self, text): + """ + Analyze and parse text, returns Generator + + Parameters + ---------- + text : str + Text to analyze + + Returns + ------- + words : Generator of minette.WordNode (empty) + Word nodes + """ + yield from () diff --git a/minette/tagger/janometagger.py b/minette/tagger/janometagger.py index 28d6244..0cfc778 100644 --- a/minette/tagger/janometagger.py +++ b/minette/tagger/janometagger.py @@ -96,9 +96,9 @@ def __init__(self, config=None, timezone=None, logger=None, *, else: self.tokenizer = Tokenizer() - def parse(self, text): + def parse_as_generator(self, text): """ - Parse and annotate using Janome + Parse and annotate using Janome, returns Generator Parameters ---------- @@ -107,17 +107,31 @@ def parse(self, text): Returns ------- - words : list of minette.minette.tagger.janometagger.JanomeNode + words : Generator of minette.tagger.janometagger.JanomeNode Janome nodes """ - ret = [] if not text: - return ret + return try: for token in self.tokenizer.tokenize(text): - ret.append(JanomeNode.create(token.surface, token)) + yield JanomeNode.create(token.surface, token) except Exception as ex: self.logger.error( "Janome parsing error: " + str(ex) + "\n" + traceback.format_exc()) - return ret + + def parse(self, text): + """ + Parse and annotate using Janome + + Parameters + ---------- + text : str + Text to analyze + + Returns + ------- + words : list of minette.tagger.janometagger.JanomeNode + Janome nodes + """ + return [jn for jn in self.parse_as_generator(text)] diff --git a/minette/tagger/mecabtagger.py b/minette/tagger/mecabtagger.py index 791c4c5..4d8ca15 100755 --- 
a/minette/tagger/mecabtagger.py +++ b/minette/tagger/mecabtagger.py @@ -74,9 +74,9 @@ class MeCabTagger(Tagger): Logger """ - def parse(self, text): + def parse_as_generator(self, text): """ - Analyze and parse text + Analyze and parse text using MeCab, returns Generator Parameters ---------- @@ -99,10 +99,26 @@ def parse(self, text): while node: features = node.feature.split(",") if features[0] != "BOS/EOS": - ret.append(MeCabNode.create(node.surface, features)) + # ret.append(MeCabNode.create(node.surface, features)) + yield MeCabNode.create(node.surface, features) node = node.next except Exception as ex: self.logger.error( "MeCab parsing error: " + str(ex) + "\n" + traceback.format_exc()) - return ret + + def parse(self, text): + """ + Analyze and parse text + + Parameters + ---------- + text : str + Text to analyze + + Returns + ------- + words : list of minette.tagger.mecabtagger.MeCabNode + MeCab word nodes + """ + return [mn for mn in self.parse_as_generator(text)] diff --git a/tests/tagger/test_janometagger.py b/tests/tagger/test_janometagger.py index 4d9fcb0..a20db7d 100644 --- a/tests/tagger/test_janometagger.py +++ b/tests/tagger/test_janometagger.py @@ -1,5 +1,6 @@ import pytest from pytz import timezone +from types import GeneratorType try: from minette.tagger.janometagger import JanomeTagger, JanomeNode @@ -38,6 +39,37 @@ def test_parse(): assert words[2].pronunciation == "ヨイ" +def test_parse_as_generator(): + tagger = JanomeTagger() + # 空文字列 + empty_words_gen = tagger.parse_as_generator("") + assert isinstance(empty_words_gen, GeneratorType) + empty_words = [ew for ew in empty_words_gen] + assert empty_words == [] + # センテンスあり + words = tagger.parse_as_generator("今日は良い天気です") + assert isinstance(words, GeneratorType) + i = 0 + for w in words: + if i == 0: + assert w.surface == "今日" + assert w.part == "名詞" + assert w.part_detail1 == "副詞可能" + assert w.word == "今日" + assert w.kana == "キョウ" + assert w.pronunciation == "キョー" + elif i == 2: + assert 
w.surface == "良い" + assert w.part == "形容詞" + assert w.part_detail1 == "自立" + assert w.stem_type == "形容詞・アウオ段" + assert w.stem_form == "基本形" + assert w.word == "良い" + assert w.kana == "ヨイ" + assert w.pronunciation == "ヨイ" + i += 1 + + def test_error(): tagger = JanomeTagger() assert tagger.parse(object()) == [] diff --git a/tests/tagger/test_mecabtagger.py b/tests/tagger/test_mecabtagger.py index 28f87d8..7c9a974 100644 --- a/tests/tagger/test_mecabtagger.py +++ b/tests/tagger/test_mecabtagger.py @@ -1,5 +1,6 @@ import pytest from pytz import timezone +from types import GeneratorType try: from minette.tagger.mecabtagger import MeCabTagger, MeCabNode @@ -38,6 +39,37 @@ def test_parse(): assert words[2].pronunciation == "ヨイ" +def test_parse_as_generator(): + tagger = MeCabTagger() + # 空文字列 + empty_words_gen = tagger.parse_as_generator("") + assert isinstance(empty_words_gen, GeneratorType) + empty_words = [ew for ew in empty_words_gen] + assert empty_words == [] + # センテンスあり + words = tagger.parse_as_generator("今日は良い天気です") + assert isinstance(words, GeneratorType) + i = 0 + for w in words: + if i == 0: + assert w.surface == "今日" + assert w.part == "名詞" + assert w.part_detail1 == "副詞可能" + assert w.word == "今日" + assert w.kana == "キョウ" + assert w.pronunciation == "キョー" + elif i == 2: + assert w.surface == "良い" + assert w.part == "形容詞" + assert w.part_detail1 == "自立" + assert w.stem_type == "形容詞・アウオ段" + assert w.stem_form == "基本形" + assert w.word == "良い" + assert w.kana == "ヨイ" + assert w.pronunciation == "ヨイ" + i += 1 + + def test_error(): tagger = MeCabTagger() assert tagger.parse(object()) == [] diff --git a/tests/tagger/test_tagger_base.py b/tests/tagger/test_tagger_base.py index e8bba96..dd6c073 100644 --- a/tests/tagger/test_tagger_base.py +++ b/tests/tagger/test_tagger_base.py @@ -1,5 +1,6 @@ import pytest from pytz import timezone +from types import GeneratorType from minette import Tagger @@ -12,3 +13,8 @@ def test_init(): def test_parse(): tagger = Tagger() 
assert tagger.parse("今日は良い天気です") == [] + + +def test_parse_as_generator(): + tagger = Tagger() + assert isinstance(tagger.parse_as_generator("今日は良い天気です"), GeneratorType)