Skip to content
This repository was archived by the owner on Dec 18, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion minette/tagger/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,23 @@ def parse(self, text):

Returns
-------
words : list of minette.WordNode
words : list of minette.WordNode (empty)
Word nodes
"""
return []

def parse_as_generator(self, text):
    """
    Base implementation of generator-style parsing; produces no word nodes

    Parameters
    ----------
    text : str
        Text to analyze (not inspected by this base implementation)

    Returns
    -------
    words : Generator of minette.WordNode (empty)
        Generator that finishes without yielding anything
    """
    # Delegating to an empty iterable keeps this a generator function
    # while yielding nothing.
    no_words = ()
    yield from no_words
28 changes: 21 additions & 7 deletions minette/tagger/janometagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ def __init__(self, config=None, timezone=None, logger=None, *,
else:
self.tokenizer = Tokenizer()

def parse(self, text):
def parse_as_generator(self, text):
"""
Parse and annotate using Janome
Parse and annotate using Janome, returns Generator

Parameters
----------
Expand All @@ -107,17 +107,31 @@ def parse(self, text):

Returns
-------
words : list of minette.minette.tagger.janometagger.JanomeNode
words : Generator of minette.minette.tagger.janometagger.JanomeNode
Janome nodes
"""
ret = []
if not text:
return ret
return
try:
for token in self.tokenizer.tokenize(text):
ret.append(JanomeNode.create(token.surface, token))
yield JanomeNode.create(token.surface, token)
except Exception as ex:
self.logger.error(
"Janome parsing error: "
+ str(ex) + "\n" + traceback.format_exc())
return ret

def parse(self, text):
    """
    Parse and annotate using Janome

    Parameters
    ----------
    text : str
        Text to analyze

    Returns
    -------
    words : list of minette.tagger.janometagger.JanomeNode
        Janome nodes yielded by parse_as_generator, collected into a list
    """
    # list() drains the generator directly; no per-item comprehension needed
    return list(self.parse_as_generator(text))
24 changes: 20 additions & 4 deletions minette/tagger/mecabtagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ class MeCabTagger(Tagger):
Logger
"""

def parse(self, text):
def parse_as_generator(self, text):
"""
Analyze and parse text
Analyze and parse text using MeCab, returns Generator

Parameters
----------
Expand All @@ -99,10 +99,26 @@ def parse(self, text):
while node:
features = node.feature.split(",")
if features[0] != "BOS/EOS":
ret.append(MeCabNode.create(node.surface, features))
# ret.append(MeCabNode.create(node.surface, features))
yield MeCabNode.create(node.surface, features)
node = node.next
except Exception as ex:
self.logger.error(
"MeCab parsing error: "
+ str(ex) + "\n" + traceback.format_exc())
return ret

def parse(self, text):
    """
    Analyze and parse text

    Parameters
    ----------
    text : str
        Text to analyze

    Returns
    -------
    words : list of minette.tagger.mecabtagger.MeCabNode
        MeCab word nodes yielded by parse_as_generator, collected into a list
    """
    # list() drains the generator directly; no per-item comprehension needed
    return list(self.parse_as_generator(text))
32 changes: 32 additions & 0 deletions tests/tagger/test_janometagger.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from pytz import timezone
from types import GeneratorType

try:
from minette.tagger.janometagger import JanomeTagger, JanomeNode
Expand Down Expand Up @@ -38,6 +39,37 @@ def test_parse():
assert words[2].pronunciation == "ヨイ"


def test_parse_as_generator():
    """Check the generator returned by JanomeTagger.parse_as_generator."""
    tagger = JanomeTagger()

    # Empty string: still a generator, but it yields nothing
    empty_words_gen = tagger.parse_as_generator("")
    assert isinstance(empty_words_gen, GeneratorType)
    assert list(empty_words_gen) == []

    # Non-empty sentence
    words_gen = tagger.parse_as_generator("今日は良い天気です")
    assert isinstance(words_gen, GeneratorType)

    # Materialize the output so the checks below cannot be skipped silently:
    # with a manual loop counter, a generator yielding fewer than three nodes
    # would let the test pass without running the index-gated assertions.
    words = list(words_gen)
    assert len(words) >= 3

    first = words[0]
    assert first.surface == "今日"
    assert first.part == "名詞"
    assert first.part_detail1 == "副詞可能"
    assert first.word == "今日"
    assert first.kana == "キョウ"
    assert first.pronunciation == "キョー"

    third = words[2]
    assert third.surface == "良い"
    assert third.part == "形容詞"
    assert third.part_detail1 == "自立"
    assert third.stem_type == "形容詞・アウオ段"
    assert third.stem_form == "基本形"
    assert third.word == "良い"
    assert third.kana == "ヨイ"
    assert third.pronunciation == "ヨイ"


def test_error():
# Passing a non-string object to parse is expected to give back an empty list.
tagger = JanomeTagger()
assert tagger.parse(object()) == []
32 changes: 32 additions & 0 deletions tests/tagger/test_mecabtagger.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from pytz import timezone
from types import GeneratorType

try:
from minette.tagger.mecabtagger import MeCabTagger, MeCabNode
Expand Down Expand Up @@ -38,6 +39,37 @@ def test_parse():
assert words[2].pronunciation == "ヨイ"


def test_parse_as_generator():
    """Check the generator returned by MeCabTagger.parse_as_generator."""
    tagger = MeCabTagger()

    # Empty string: still a generator, but it yields nothing
    empty_words_gen = tagger.parse_as_generator("")
    assert isinstance(empty_words_gen, GeneratorType)
    assert list(empty_words_gen) == []

    # Non-empty sentence
    words_gen = tagger.parse_as_generator("今日は良い天気です")
    assert isinstance(words_gen, GeneratorType)

    # Materialize the output so the checks below cannot be skipped silently:
    # with a manual loop counter, a generator yielding fewer than three nodes
    # would let the test pass without running the index-gated assertions.
    words = list(words_gen)
    assert len(words) >= 3

    first = words[0]
    assert first.surface == "今日"
    assert first.part == "名詞"
    assert first.part_detail1 == "副詞可能"
    assert first.word == "今日"
    assert first.kana == "キョウ"
    assert first.pronunciation == "キョー"

    third = words[2]
    assert third.surface == "良い"
    assert third.part == "形容詞"
    assert third.part_detail1 == "自立"
    assert third.stem_type == "形容詞・アウオ段"
    assert third.stem_form == "基本形"
    assert third.word == "良い"
    assert third.kana == "ヨイ"
    assert third.pronunciation == "ヨイ"


def test_error():
# Passing a non-string object to parse is expected to give back an empty list.
tagger = MeCabTagger()
assert tagger.parse(object()) == []
6 changes: 6 additions & 0 deletions tests/tagger/test_tagger_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from pytz import timezone
from types import GeneratorType

from minette import Tagger

Expand All @@ -12,3 +13,8 @@ def test_init():
def test_parse():
    """The base Tagger's parse gives an empty list for a sample sentence."""
    words = Tagger().parse("今日は良い天気です")
    assert words == []


def test_parse_as_generator():
    """The base Tagger's parse_as_generator hands back a generator object."""
    words = Tagger().parse_as_generator("今日は良い天気です")
    assert isinstance(words, GeneratorType)