From 71aa5ebd7bbcb43b56169e7df0b1a4af74dac65b Mon Sep 17 00:00:00 2001 From: MpolaarbearM Date: Sat, 4 Nov 2023 00:15:12 +0100 Subject: [PATCH 1/4] add function for pos tag with transformers --- ptagger_temp4XcDf.json | 1 + pythainlp/tag/__init__.py | 3 ++- pythainlp/tag/pos_tag.py | 50 +++++++++++++++++++++++++++++++++++++++ tests/test_tag.py | 13 +++++++++- 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 ptagger_temp4XcDf.json diff --git a/ptagger_temp4XcDf.json b/ptagger_temp4XcDf.json new file mode 100644 index 000000000..526ea1935 --- /dev/null +++ b/ptagger_temp4XcDf.json @@ -0,0 +1 @@ +{"weights": {"bias": {"N": 0.934, "V": -0.934}, "i suffix คน": {"N": 0.091, "V": -0.091}, "i pref1 ค": {"N": 0.091, "V": -0.091}, "i-1 tag -START-": {"N": 0.997, "V": -0.997}, "i-2 tag -START2-": {"N": 0.997, "V": -0.997}, "i tag+i-2 tag -START- -START2-": {"N": 0.997, "V": -0.997}, "i word คน": {"N": 0.091, "V": -0.091}, "i-1 tag+i word -START- คน": {"N": 0.997, "V": -0.997}, "i-1 word -START2-": {"N": 0.997, "V": -0.997}, "i-1 suffix T2-": {"N": 0.997, "V": -0.997}, "i-2 word -START-": {"N": 0.997, "V": -0.997}, "i+1 word เดิน": {"N": 0.997, "V": -0.997}, "i+1 suffix ดิน": {"N": 0.997, "V": -0.997}, "i+2 word -END-": {"N": 0.091, "V": -0.091}, "i suffix ดิน": {"V": 0.994, "N": -0.994}, "i pref1 เ": {"V": 0.994, "N": -0.994}, "i-1 tag V": {"V": 0.066, "N": -0.066}, "i-2 tag -START-": {"V": 1.894, "N": -1.894}, "i tag+i-2 tag V -START-": {"V": 0.994, "N": -0.994}, "i word เดิน": {"V": 0.994, "N": -0.994}, "i-1 tag+i word V เดิน": {"V": 0.994, "N": -0.994}, "i-1 word คน": {"V": 0.997, "N": -0.997}, "i-1 suffix คน": {"V": 0.997, "N": -0.997}, "i-2 word -START2-": {"V": 1.894, "N": -1.894}, "i+1 word -END-": {"V": -0.843, "N": 0.843}, "i+1 suffix ND-": {"V": -0.843, "N": 0.843}, "i+2 word -END2-": {"V": -0.843, "N": 0.843}, "i suffix นก": {"N": 0.929, "V": -0.929}, "i pref1 น": {"N": 0.929, "V": -0.929}, "i-2 tag N": {"N": 1.831, "V": -1.831}, "i tag+i-2 tag V N": {"N": 0.929, "V": -0.929}, "i word นก": {"N": 0.929, "V": -0.929}, "i-1 tag+i word V นก": {"N": 0.929, "V": -0.929}, "i-1 word กิน": {"N": 0.929, "V": -0.929}, "i-1 suffix กิน": {"N": 0.929, "V": -0.929}, "i-2 word คน": {"N": 1.831, "V": -1.831}, "i-1 tag N": {"V": -0.003, "N": 0.003}, "i tag+i-2 tag N -START-": {"V": 0.9, "N": -0.9}, "i-1 tag+i word N คน": {"V": 0.906, "N": -0.906}, "i+1 word กาแฟ": {"V": 0.906, "N": -0.906}, "i+1 suffix าแฟ": {"V": 0.906, "N": -0.906}, "i suffix าแฟ": {"N": 0.903, "V": -0.903}, "i pref1 ก": {"N": 0.903, "V": -0.903}, "i tag+i-2 tag N N": {"N": 0.903, "V": -0.903}, "i word กาแฟ": {"N": 0.903, "V": -0.903}, "i-1 tag+i word N กาแฟ": {"N": 0.903, "V": -0.903}, "i suffix 021": {"N": 0.8, "V": -0.8}, "i pref1 2": {"N": 0.8, "V": -0.8}, "i word !YEAR": {"N": 0.8, "V": -0.8}, "i-1 tag+i word N !YEAR": {"N": 0.8, "V": -0.8}, "i-1 word ปี": {"N": 0.8, "V": -0.8}, "i-1 suffix ปี": {"N": 0.8, "V": -0.8}, "i suffix บิน": {"V": 0.794, "N": -0.794}, "i pref1 บ": {"V": 0.794, "N": -0.794}, "i word บิน": {"V": 0.794, "N": -0.794}, "i-1 tag+i word N บิน": {"V": 0.794, "N": -0.794}, "i-1 word นก": {"V": 0.794, "N": -0.794}, "i-1 suffix นก": {"V": 0.794, "N": -0.794}}, "tagdict": {"กิน": "V"}, "classes": ["N", "V"]} \ No newline at end of file diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 93d57e3c0..3325cbcb0 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -27,10 +27,11 @@ "chunk_parse", "NER", "NNER", + "pos_tag_transformers" # added by moss ] from pythainlp.tag.locations import tag_provinces -from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents +from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents,pos_tag_transformers from pythainlp.tag._tag_perceptron import PerceptronTagger from pythainlp.tag.chunk import chunk_parse from pythainlp.tag.named_entity import NER, NNER diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index c2e9bbf7f..c095b0217 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -15,6 +15,7 @@ from typing import List, Tuple + def pos_tag( words: List[str], engine: str = "perceptron", corpus: str = "orchid" ) -> List[Tuple[str, str]]: @@ -176,3 +177,52 @@ def pos_tag_sents( return [] return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences] + + +def pos_tag_transformers( + words: str, engine: str = "bert-base-th-cased-blackboard" +): + """ + "wangchanberta-ud-thai-pud-upos", + "mdeberta-v3-ud-thai-pud-upos", + "bert-base-th-cased-blackboard", + + """ + + try: + from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline + except ImportError: + raise ImportError("Not found transformers! Please install transformers by pip install transformers") + + if not words: + return [] + + if engine == "wangchanberta-ud-thai-pud-upos" : + model = AutoModelForTokenClassification.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos") + tokenizer = AutoTokenizer.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos") + elif engine == "mdeberta-v3-ud-thai-pud-upos": + model = AutoModelForTokenClassification.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos") + tokenizer = AutoTokenizer.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos") + elif engine == "bert-base-th-cased-blackboard": + model = AutoModelForTokenClassification.from_pretrained("lunarlist/pos_thai") + tokenizer = AutoTokenizer.from_pretrained("lunarlist/pos_thai") + else: + raise ValueError( + "pos_tag_transformers not support {0} engine.".format( + engine + ) + ) + + pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True) + + outputs = pipeline(words) + return outputs + + + + + + + + + diff --git a/tests/test_tag.py b/tests/test_tag.py index eae51bf30..f5bb8747e 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -9,10 +9,12 @@ perceptron, pos_tag, pos_tag_sents, + pos_tag_transformers, unigram, - tltk, + #tltk, NER, NNER, + ) from pythainlp.tag.locations import tag_provinces from pythainlp.tag.thainer import ThaiNameTagger @@ -362,3 +364,12 @@ def test_NER_class(self): def test_NNER_class(self): nner = NNER() self.assertIsNotNone(nner.tag("แมวทำอะไรตอนห้าโมงเช้า")) + + def test_pos_tag_transformers(self): + self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "bert-base-th-cased-blackboard")) + self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "mdeberta-v3-ud-thai-pud-upos")) + self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "wangchanberta-ud-thai-pud-upos")) + + with self.assertRaises(ValueError): + pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "non-existing-engine") + From 174dfb286bd6f27e3b5bec8a74f0dcb69de21095 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 5 Nov 2023 02:40:08 +0700 Subject: [PATCH 2/4] Fixed postag test --- tests/test_tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index f5bb8747e..16e398d02 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -11,7 +11,7 @@ pos_tag_sents, pos_tag_transformers, unigram, - #tltk, + tltk, NER, NNER, From 04bf517b94c72f682d6c96c8d6dcf7cb488387bd Mon Sep 17 00:00:00 2001 From: MpolaarbearM Date: Sat, 4 Nov 2023 23:43:53 +0100 Subject: [PATCH 3/4] fixed pep8 --- ptagger_temp4XcDf.json | 1 - pythainlp/tag/__init__.py | 4 ++-- pythainlp/tag/pos_tag.py | 31 +++++++++++++------------------ tests/test_tag.py | 13 +++++++------ 4 files changed, 22 insertions(+), 27 deletions(-) delete mode 100644 ptagger_temp4XcDf.json diff --git a/ptagger_temp4XcDf.json b/ptagger_temp4XcDf.json deleted file mode 100644 index 526ea1935..000000000 --- a/ptagger_temp4XcDf.json +++ /dev/null @@ -1 +0,0 @@ -{"weights": {"bias": {"N": 0.934, "V": -0.934}, "i suffix คน": {"N": 0.091, "V": -0.091}, "i pref1 ค": {"N": 0.091, "V": -0.091}, "i-1 tag -START-": {"N": 0.997, "V": -0.997}, "i-2 tag -START2-": {"N": 0.997, "V": -0.997}, "i tag+i-2 tag -START- -START2-": {"N": 0.997, "V": -0.997}, "i word คน": {"N": 0.091, "V": -0.091}, "i-1 tag+i word -START- คน": {"N": 0.997, "V": -0.997}, "i-1 word -START2-": {"N": 0.997, "V": -0.997}, "i-1 suffix T2-": {"N": 0.997, "V": -0.997}, "i-2 word -START-": {"N": 0.997, "V": -0.997}, "i+1 word เดิน": {"N": 0.997, "V": -0.997}, "i+1 suffix ดิน": {"N": 0.997, "V": -0.997}, "i+2 word -END-": {"N": 0.091, "V": -0.091}, "i suffix ดิน": {"V": 0.994, "N": -0.994}, "i pref1 เ": {"V": 0.994, "N": -0.994}, "i-1 tag V": {"V": 0.066, "N": -0.066}, "i-2 tag -START-": {"V": 1.894, "N": -1.894}, "i tag+i-2 tag V -START-": {"V": 0.994, "N": -0.994}, "i word เดิน": {"V": 0.994, "N": -0.994}, "i-1 tag+i word V เดิน": {"V": 0.994, "N": -0.994}, "i-1 word คน": {"V": 0.997, "N": -0.997}, "i-1 suffix คน": {"V": 0.997, "N": -0.997}, "i-2 word -START2-": {"V": 1.894, "N": -1.894}, "i+1 word -END-": {"V": -0.843, "N": 0.843}, "i+1 suffix ND-": {"V": -0.843, "N": 0.843}, "i+2 word -END2-": {"V": -0.843, "N": 0.843}, "i suffix นก": {"N": 0.929, "V": -0.929}, "i pref1 น": {"N": 0.929, "V": -0.929}, "i-2 tag N": {"N": 1.831, "V": -1.831}, "i tag+i-2 tag V N": {"N": 0.929, "V": -0.929}, "i word นก": {"N": 0.929, "V": -0.929}, "i-1 tag+i word V นก": {"N": 0.929, "V": -0.929}, "i-1 word กิน": {"N": 0.929, "V": -0.929}, "i-1 suffix กิน": {"N": 0.929, "V": -0.929}, "i-2 word คน": {"N": 1.831, "V": -1.831}, "i-1 tag N": {"V": -0.003, "N": 0.003}, "i tag+i-2 tag N -START-": {"V": 0.9, "N": -0.9}, "i-1 tag+i word N คน": {"V": 0.906, "N": -0.906}, "i+1 word กาแฟ": {"V": 0.906, "N": -0.906}, "i+1 suffix าแฟ": {"V": 0.906, "N": -0.906}, "i suffix าแฟ": {"N": 0.903, "V": -0.903}, "i pref1 ก": {"N": 0.903, "V": -0.903}, "i tag+i-2 tag N N": {"N": 0.903, "V": -0.903}, "i word กาแฟ": {"N": 0.903, "V": -0.903}, "i-1 tag+i word N กาแฟ": {"N": 0.903, "V": -0.903}, "i suffix 021": {"N": 0.8, "V": -0.8}, "i pref1 2": {"N": 0.8, "V": -0.8}, "i word !YEAR": {"N": 0.8, "V": -0.8}, "i-1 tag+i word N !YEAR": {"N": 0.8, "V": -0.8}, "i-1 word ปี": {"N": 0.8, "V": -0.8}, "i-1 suffix ปี": {"N": 0.8, "V": -0.8}, "i suffix บิน": {"V": 0.794, "N": -0.794}, "i pref1 บ": {"V": 0.794, "N": -0.794}, "i word บิน": {"V": 0.794, "N": -0.794}, "i-1 tag+i word N บิน": {"V": 0.794, "N": -0.794}, "i-1 word นก": {"V": 0.794, "N": -0.794}, "i-1 suffix นก": {"V": 0.794, "N": -0.794}}, "tagdict": {"กิน": "V"}, "classes": ["N", "V"]} \ No newline at end of file diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py index 3325cbcb0..790810a62 100644 --- a/pythainlp/tag/__init__.py +++ b/pythainlp/tag/__init__.py @@ -27,11 +27,11 @@ "chunk_parse", "NER", "NNER", - "pos_tag_transformers" # added by moss + "pos_tag_transformers" ] from pythainlp.tag.locations import tag_provinces -from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents,pos_tag_transformers +from pythainlp.tag.pos_tag import pos_tag, pos_tag_sents, pos_tag_transformers from pythainlp.tag._tag_perceptron import PerceptronTagger from pythainlp.tag.chunk import chunk_parse from pythainlp.tag.named_entity import NER, NNER diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index c095b0217..abdfe5fc2 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -190,18 +190,22 @@ def pos_tag_transformers( """ try: - from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline + from transformers import AutoModelForTokenClassification, \ + AutoTokenizer, TokenClassificationPipeline except ImportError: - raise ImportError("Not found transformers! Please install transformers by pip install transformers") - + raise ImportError( + "Not found transformers! Please install transformers by pip install transformers") + if not words: return [] - - if engine == "wangchanberta-ud-thai-pud-upos" : - model = AutoModelForTokenClassification.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos") + + if engine == "wangchanberta-ud-thai-pud-upos": + model = AutoModelForTokenClassification.from_pretrained( + "Pavarissy/wangchanberta-ud-thai-pud-upos") tokenizer = AutoTokenizer.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos") elif engine == "mdeberta-v3-ud-thai-pud-upos": - model = AutoModelForTokenClassification.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos") + model = AutoModelForTokenClassification.from_pretrained( + "Pavarissy/mdeberta-v3-ud-thai-pud-upos") tokenizer = AutoTokenizer.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos") elif engine == "bert-base-th-cased-blackboard": model = AutoModelForTokenClassification.from_pretrained("lunarlist/pos_thai") @@ -212,17 +216,8 @@ def pos_tag_transformers( engine ) ) - + pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True) outputs = pipeline(words) - return outputs - - - - - - - - - + return outputs \ No newline at end of file diff --git a/tests/test_tag.py b/tests/test_tag.py index f5bb8747e..c0b8082e2 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -366,10 +366,11 @@ def test_NNER_class(self): self.assertIsNotNone(nner.tag("แมวทำอะไรตอนห้าโมงเช้า")) def test_pos_tag_transformers(self): - self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "bert-base-th-cased-blackboard")) - self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "mdeberta-v3-ud-thai-pud-upos")) - self.assertIsNotNone(pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "wangchanberta-ud-thai-pud-upos")) - + self.assertIsNotNone(pos_tag_transformers( + words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert-base-th-cased-blackboard")) + self.assertIsNotNone(pos_tag_transformers( + words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta-v3-ud-thai-pud-upos")) + self.assertIsNotNone(pos_tag_transformers( + words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta-ud-thai-pud-upos")) with self.assertRaises(ValueError): - pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "non-existing-engine") - + pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "non-existing-engine") \ No newline at end of file From 387d1373a550e1bf9fecda37cbedf6515838962c Mon Sep 17 00:00:00 2001 From: MpolaarbearM Date: Sat, 4 Nov 2023 23:48:11 +0100 Subject: [PATCH 4/4] fixed pep8 --- tests/test_tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index c0b8082e2..b27fc77bb 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -373,4 +373,4 @@ def test_pos_tag_transformers(self): self.assertIsNotNone(pos_tag_transformers( words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta-ud-thai-pud-upos")) with self.assertRaises(ValueError): - pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine = "non-existing-engine") \ No newline at end of file + pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine") \ No newline at end of file