diff --git a/docs/conf.py b/docs/conf.py
index c3f8194b8..bc1b294f1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,7 +16,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 from datetime import datetime
-import sys, os
 
 # -- Project information -----------------------------------------------------
 
diff --git a/docs/pythainlp-dev-thai.md b/docs/pythainlp-dev-thai.md
index 82a5581e5..fce9ce741 100644
--- a/docs/pythainlp-dev-thai.md
+++ b/docs/pythainlp-dev-thai.md
@@ -29,9 +29,9 @@ pip install pythainlp
 **ติดตั้ง PyICU บน macOS**
 
 ```sh
-$ brew install icu4c --force
-$ brew link --force icu4c
-$ CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip install pythainlp
+brew install icu4c --force
+brew link --force icu4c
+CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip install pythainlp
 ```
 
 ข้อมูลเพิ่มเติมที่ https://medium.com/data-science-cafe/install-polyglot-on-mac-3c90445abc1f
@@ -46,22 +46,21 @@ $ CFLAGS=-I/usr/local/opt/icu4c/include LDFLAGS=-L/usr/local/opt/icu4c/lib pip i
 
 ```python
 from pythainlp.tokenize import word_tokenize
+
 word_tokenize(text, engine)
 ```
 text คือ ข้อความในรูปแบบสตริง str เท่านั้น
 
-engine คือ ระบบตัดคำ ปัจจุบัน PyThaiNLP มี 6 engine ดังนี้
+engine คือ ระบบตัดคำ ปัจจุบันมี engine ดังนี้
 
-1. newmm (ค่าเริ่มต้น) - ใช้วิธี Maximum Matching โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
-2. icu - เรียกใช้ตัวตัดคำจาก ICU ตัวดั้งเดิมของ PyThaiNLP (ความแม่นยำต่ำ)
-3. dict - ตัดคำโดยใช้พจานุกรมจาก thaiword.txt ใน corpus (ความแม่นยำปานกลาง) **จะคืนค่า False หากข้อความนั้นไม่สามารถตัดคำได้**
-4. longest-matching - ใช้วิธี Longest Matching
-5. mm - ใช้วิธี Maximum Matching **(โค้ดชุดเก่า อยู่ในสถานะบำรุงรักษาเท่านั้น)**
-6. pylexto - เรียกใช้ตัวตัดคำจาก LexTo ซึ่งเป็น Longest Matching
-7. deepcut - เรียกใช้ [deepcut](https://github.com/rkcosmos/deepcut) ซึ่งตัดคำจากโมเดลการเรียนรู้ของเครื่อง
-8. wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy)
+- newmm (ค่าเริ่มต้น) - ใช้พจนานุกรม ด้วยวิธี Maximum Matching + Thai Character Cluster โค้ดชุดใหม่[โดยคุณ Korakot Chaovavanich](https://www.facebook.com/groups/408004796247683/permalink/431283740586455/)
+- longest - ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- icu - เรียกใช้ตัวตัดคำจาก ICU ใช้พจนานุกรม (ความแม่นยำต่ำ)
+- wordcutpy - เรียกใช้ตัวตัดคำจาก [wordcutpy](https://github.com/veer66/wordcutpy) ใช้พจนานุกรม
+- pylexto - เรียกใช้ตัวตัดคำจาก LexTo ใช้พจนานุกรม ด้วยวิธี Longest Matching
+- deepcut - เรียกใช้ตัวตัดคำจาก [deepcut](https://github.com/rkcosmos/deepcut) ใช้การเรียนรู้ของเครื่อง
 
-คืนค่าเป็น ''list'' เช่น ['แมว','กิน']
+คืนค่าเป็น ''list'' เช่น ['แมว', 'กิน']
 
 **การใช้งาน**
 
@@ -69,8 +68,8 @@ engine คือ ระบบตัดคำ ปัจจุบัน PyThaiNLP
 from pythainlp.tokenize import word_tokenize
 
 text = "โอเคบ่เรารักภาษาถิ่น"
->>> word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'เรา', 'รัก', 'ภาษาถิ่น']
->>> word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'เรา', 'รัก', 'ภาษา', 'ถิ่น']
+word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'เรา', 'รัก', 'ภาษาถิ่น']
+word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'เรา', 'รัก', 'ภาษา', 'ถิ่น']
 ```
 
 #### dict_word_tokenize
@@ -86,11 +85,10 @@ text คือ ข้อความที่ต้องการตัดค
 
 filename คือ ที่ตั้งไฟล์ที่ต้องการมาเป็นฐานข้อมูลตัดคำ
 
-engine คือ เครื่องมือตัดคำ
-- newmm ตัดคำด้วย newmm
-- wordcutpy ใช้ [wordcutpy](https://github.com/veer66/wordcutpy) ในการตัดคำ
-- mm ตัดคำด้วย mm
-- longest-matching ตัดคำโดยใช้ longest matching
+engine คือ ระบบตัดคำ (ดูรายละเอียดที่ word_tokenize)
+- newmm
+- longest
+- wordcutpy
 
 ตัวอย่างการใช้งาน https://gist.github.com/wannaphongcom/1e862583051bf0464b6ef4ed592f739c
 
@@ -117,9 +115,9 @@ engine คือ เครื่องมือสำหรับใช้ตั
 ใช้ตัดคำ/ประโยคจากช่องว่างในสตริง
 
 ```python
->>> from pythainlp.tokenize import WhitespaceTokenizer
->>> WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")
-['ทดสอบ', 'ตัดคำช่องว่าง']
+from pythainlp.tokenize import WhitespaceTokenizer
+
+WhitespaceTokenizer("ทดสอบ ตัดคำช่องว่าง")  # ['ทดสอบ', 'ตัดคำช่องว่าง']
 ```
 
 
@@ -147,27 +145,28 @@ check_all สำหรับส่งคืนค่า True หรือ False
 
 **เครดิต**
 
-TCC: Jakkrit TeCho
-Grammar: Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
-Python code: Korakot Chaovavanich 
+- TCC: Jakkrit TeCho
+- Grammar: Wittawat Jitkrittum (https://github.com/wittawatj/jtcc/blob/master/TCC.g)
+- Python code: Korakot Chaovavanich
 
 **การใช้งาน**
 
 ```python
 from pythainlp.tokenize import tcc
+
 tcc.tcc("ประเทศไทย")  # 'ป/ระ/เท/ศ/ไท/ย'
 ```
 
 #### Enhanced Thai Character Cluster (ETCC)
 
-นอกจาก TCC แล้ว PyThaiNLP 1.4 ยังรองรับ Enhanced Thai Character Cluster (ETCC) โดยแบ่งกลุ่มด้วย /
+นอกจาก TCC แล้ว PyThaiNLP ยังรองรับ Enhanced Thai Character Cluster (ETCC) โดยแบ่งกลุ่มด้วย /
 
 **การใช้งาน**
 
 ```python
->>> from pythainlp.tokenize import etcc
->>> etcc.etcc('คืนความสุข')
-'/คืน/ความสุข'
+from pythainlp.tokenize import etcc
+
+etcc.etcc('คืนความสุข')  # '/คืน/ความสุข'
 ```
 
 ### tag
@@ -176,84 +175,86 @@ Part-of-speech tagging ภาษาไทย
 
 ```python
 from pythainlp.tag import pos_tag
+
 pos_tag(text, engine="unigram", corpus="orchid")
 ```
 
 list คือ list ที่เก็บข้อความหลังผ่านการตัดคำแล้ว
 
-engine คือ ตัวติดป้ายกำกับคำ (pos tagger) มี 3 ตัวดังนี้
-
-1. unigram (ค่าเริ่มต้น) - UnigramTagger
-2. perceptron - PerceptronTagger
-3. artagger - RDR POS Tagger ละเอียดยิ่งกว่าเดิม
+engine คือ ตัวติดป้ายกำกับคำ (pos tagger) มีดังนี้
+- unigram (ค่าเริ่มต้น) - UnigramTagger
+- perceptron - PerceptronTagger
+- artagger - RDR POS Tagger ละเอียดยิ่งกว่าเดิม
 
 corpus ที่รองรับ
-
-1. orchid
-2. pud ใช้ข้อมูล Parallel Universal Dependencies (PUD) treebanks
+- orchid ใช้ข้อมูลจากคลังคำ ORCHID โดยเนคเทค
+- pud ใช้ข้อมูล Parallel Universal Dependencies (PUD) treebanks
 
 ### summarize
 
-เป็นระบบสรุปเอกสารภาษาไทยแบบง่าย ๆ
+สรุปเอกสารภาษาไทยแบบง่าย ๆ
 
 ```python
-summarize_text(text, n , engine="frequency")
+summarize_text(text, n, engine="frequency")
 ```
 
 text เป็นข้อความ
+
 n คือ จำนวนประโยคสรุป
+
 engine ที่รองรับ
-frequency
+- frequency
 
 **การใช้งาน**
 
 ```python
->>> from pythainlp.summarize import summarize_text
->>> summarize_text(text="อาหาร หมายถึง ของแข็งหรือของเหลว ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว จะทำให้เกิดพลังงานและความร้อนยเจริญเติบโต ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ อาหารจะต้องงกาย", n=1, engine="frequency")
-['อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย']
+from pythainlp.summarize import summarize_text
+
+summarize_text(text="อาหาร หมายถึง ของแข็งหรือของเหลว ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว จะทำให้เกิดพลังงานและความร้อนยเจริญเติบโต ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ อาหารจะต้องงกาย", n=1, engine="frequency")
+# ['อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย']
 ```
 
 ### word_vector
 
+สร้างเวกเตอร์คำ
+
 ```python
 from pythainlp.word_vector import thai2vec
 ```
 
-word_vector เป็นระบบ word vector ใน PyThaiNLP
 
-ปัจจุบันนี้รองรับเฉพาะ thai2vec (https://github.com/cstorm125/thai2vec)
+ปัจจุบันรองรับเฉพาะ thai2vec (https://github.com/cstorm125/thai2vec)
 
 พัฒนาโดย Charin Polpanumas
 
 #### thai2vec
 
 ความต้องการโมดูล
-
 - gensim
 - numpy
 
 ##### API
 
 - get_model() - รับข้อมูล model ในรูปแบบของ gensim
-- most_similar_cosmul(positive,negative)
+- most_similar_cosmul(positive, negative)
 - doesnt_match(listdata)
-- similarity(word1,word2) - หาค่าความคล้ายกันระหว่าง 2 คำ โดยทั้งคู่เป็น str
-- sentence_vectorizer(ss,dim=300,use_mean=False)
+- similarity(word1, word2) - หาค่าความคล้ายระหว่าง 2 คำ โดยทั้งคู่เป็น str
+- sentence_vectorizer(ss, dim=300, use_mean=False)
 - about() - รายละเอียด thai2vec
 
 ### keywords
 
-ใช้หาคำสำคัญ (keyword) จากข้อความภาษาไทย
+หาคำสำคัญ (keyword) จากข้อความภาษาไทย
 
 #### find_keyword
 
-การทำงาน หาคำที่ถูกใช้งานมากกว่าค่าขั้นต่ำที่กำหนดได้ โดยจะลบ stopword ออกไป
+การทำงาน หาคำที่ถูกใช้งานมากกว่าค่าขั้นต่ำที่กำหนดได้ โดยจะลบ stopword ออก
 
 ```python
 find_keyword(word_list, lentext=3)
 ```
 
-word_list คือ list ของข้อความที่ผ่านการตัดคำแล้ว
+word_list คือ list ของข้อความที่ตัดคำแล้ว
 
 lentext คือ จำนวนคำขั้นต่ำที่ต้องการหา keyword
 
@@ -263,16 +264,15 @@ lentext คือ จำนวนคำขั้นต่ำที่ต้อ
 
 ```python
 from pythainlp.romanization import romanize
+
 romanize(str, engine="royin")
 ```
-มี 2 engine ดังนี้
 
+มี engine ดังนี้
 - pyicu ส่งค่าสัทอักษร
-- royin ใช้หลักเกณฑ์การถอดอักษรไทยเป็นอักษรโรมัน ฉบับราชบัณฑิตยสถาน (**หากมีข้อผิดพลาด ให้ใช้คำอ่าน เนื่องจากตัว royin ไม่มีตัวแปลงคำเป็นคำอ่าน**) 
+- royin ใช้หลักเกณฑ์การถอดอักษรไทยเป็นอักษรโรมัน ฉบับราชบัณฑิตยสถาน (**หากมีข้อผิดพลาด ให้ใช้คำอ่าน เนื่องจากตัว royin ไม่มีตัวแปลงคำเป็นคำอ่าน**)
 
-data :
-
-รับค่า ''str'' ข้อความ 
+รับค่า ''str'' ข้อความ
 
 คืนค่าเป็น ''str'' ข้อความ
 
@@ -280,26 +280,27 @@ data :
 
 ```python
 from pythainlp.romanization import romanize
-romanize("แมว") # 'maew'
+
+romanize("แมว")  # 'maew'
 ```
 
-### spell 
+### spell
 
-ตรวจสอบคำผิดในภาษาไทย 
+ตรวจสอบคำผิดในภาษาไทย
 
 ```python
 spell(word, engine="pn")
 ```
 
 engine ที่รองรับ
-
-- pn พัฒนามาจาก Peter Norvig (ค่าเริ่มต้น)
-- hunspell ใช้ hunspell
+- pn (ค่าเริ่มต้น) พัฒนาจาก Peter Norvig
+- hunspell เรียก hunspell ที่ติดตั้งอยู่ในระบบปฏิบัติการ (มีในระบบ Linux)
 
 **ตัวอย่างการใช้งาน**
 
 ```python
-from pythainlp.spell import *
+from pythainlp.spell import spell
+
 a = spell("สี่เหลียม")
 print(a)  # ['สี่เหลี่ยม']
 ```
@@ -309,39 +310,36 @@ print(a)  # ['สี่เหลี่ยม']
 correction(word)
 ```
 
-แสดงคำที่เป็นไปได้มากที่สุด
+จะคืนค่าคำที่เป็นไปได้มากที่สุด
 
 **ตัวอย่างการใช้งาน**
 
 ```python
 from pythainlp.spell.pn import correction
-a = correction("สี่เหลียม")
-print(a) # ['สี่เหลี่ยม']
-```
-
-ผลลัพธ์
 
-```
-สี่เหลี่ยม
+a = correction("สี่เหลียม")
+print(a)  # สี่เหลี่ยม
 ```
 
 ### pythainlp.number
 
+จัดการกับตัวเลข
+
 ```python
 from pythainlp.number import *
 ```
-จัดการกับตัวเลข โดยมีดังนี้
 
-- thai_num_to_num(str) - เป็นการแปลงเลขไทยสู่เลข
-- thai_num_to_text(str) - เลขไทยสู่ข้อความ
-- num_to_thai_num(str) - เลขสู่เลขไทย
+มีฟังก์ชันดังนี้
+- thai_num_to_num(str) - แปลงเลขไทยสู่เลขอารบิก
+- thai_num_to_text(str) - เลขไทยสู่คำอ่านไทย
+- num_to_thai_num(str) - เลขอารบิกสู่เลขไทย
 - num_to_text(str) - เลขสู่ข้อความ
 - text_to_num(str) - ข้อความสู่เลข
-- numtowords(float) - อ่านจำนวนตัวเลขภาษาไทย (บาท) รับค่าเป็น ''float'' คืนค่าเป็น 'str'
+- numtowords(float) - อ่านจำนวนภาษาไทย (บาท) รับค่าเป็น ''float'' คืนค่าเป็น 'str'
 
 ### collation
 
-ใช้ในการเรียงลำดับข้อมูลภาษาไทยใน List
+เรียงลำดับข้อมูลภาษาไทยใน List
 
 ```python
 from pythainlp.collation import collation
@@ -358,6 +356,7 @@ print(collation(["ไก่", "ไข่", "กา", "ฮา"]))  # ['กา', '
 
 ```python
 from pythainlp.date import now
+
 now()  # '30 พฤษภาคม 2560 18:45:24'
 ```
 ### rank
@@ -368,6 +367,7 @@ now()  # '30 พฤษภาคม 2560 18:45:24'
 
 ```python
 from pythainlp.rank import rank
+
 rank(list)
 ```
 
@@ -376,13 +376,12 @@ rank(list)
 **ตัวอย่างการใช้งาน**
 
 ```python
->>> rank(["แมง", "แมง", "คน"])
-Counter({'แมง': 2, 'คน': 1})
+rank(["แมง", "แมง", "คน"])  # Counter({'แมง': 2, 'คน': 1})
 ```
 
 ### change
 
-#### แก้ไขปัญหาการพิมพ์ลืมเปลี่ยนภาษา
+#### แก้ไขปัญหาการลืมเปลี่ยนภาษาแป้นพิมพ์
 
 ```python
 from pythainlp.change import *
@@ -399,25 +398,20 @@ from pythainlp.change import *
 
 เครดิต Korakot Chaovavanich https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
 
-กฎที่รองรับในรุ่น 1.4
-
+กฎที่รองรับ
 - LK82 - กฎการเข้ารหัสซาวน์เด็กซ์ของ วิชิตหล่อจีระชุณห์กุล และ เจริญ คุวินทร์พันธุ์
 - Udom83 - กฎการเข้ารหัสซาวน์เด็กซ์ของ วรรณี อุดมพาณิชย์
 
 **การใช้งาน**
 
 ```python
->>> from pythainlp.soundex import LK82, Udom83
->>> print(LK82("รถ"))
-ร3000
->>> print(LK82("รด"))
-ร3000
->>> print(LK82("จัน"))
-จ4000
->>> print(LK82("จันทร์"))
-จ4000
->>> print(Udom83("รถ"))
-ร800000
+from pythainlp.soundex import LK82, Udom83
+
+print(LK82("รถ"))  # ร3000
+print(LK82("รด"))  # ร3000
+print(LK82("จัน"))  # จ4000
+print(LK82("จันทร์"))  # จ4000
+print(Udom83("รถ"))  # ร800000
 ```
 
 ### MetaSound ภาษาไทย
@@ -429,9 +423,9 @@ Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statisti
 **การใช้งาน**
 
 ```python
->>> from pythainlp.MetaSound import *
->>> MetaSound("คน")
-'15'
+from pythainlp.metasound import metasound
+
+metasound("รัก")  # 'ร100'
 ```
 
 ### sentiment
@@ -440,10 +434,13 @@ sentiment analysis ภาษาไทย ใช้ข้อมูลจาก [h
 
 ```python
 from pythainlp.sentiment import sentiment
+
 sentiment(str)
 ```
 
-รับค่า str ส่งออกเป็น pos , neg
+รับค่า str
+
+คืนค่าเป็น str ซึ่งมีค่า "pos" หรือ "neg"
 
 ### Util
 
@@ -455,7 +452,7 @@ from pythainlp.util import *
 
 #### ngrams
 
-สำหรับสร้าง n-grams 
+สำหรับสร้าง n-grams
 
 ```python
 ngrams(token, num)
@@ -496,7 +493,7 @@ normalize(text)
 
 ```python
 # เ เ ป ล ก กับ แปลก
-normalize("เเปลก") == "แปลก"  # True 
+normalize("เเปลก") == "แปลก"  # True
 ```
 
 #### listtext_num2num
@@ -510,8 +507,7 @@ listtext_num2num(list)
 **ตัวอย่าง**
 
 ```python
->>> listtext_num2num(["หก", "ล้าน", "หกแสน", "หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"])
-6666666
+listtext_num2num(["หกหมื่น", "หกพัน", "หกร้อย", "หกสิบ", "หก"])  # 66666
 ```
 
 ### Corpus
@@ -545,17 +541,22 @@ API เหมือนกับ NLTK โดยรองรับ API ดัง
 **ตัวอย่าง**
 
 ```python
->>> from pythainlp.corpus import wordnet
->>> print(wordnet.synsets("หนึ่ง"))
-[Synset('one.s.05'), Synset('one.s.04'), Synset('one.s.01'), Synset('one.n.01')]
->>> print(wordnet.synsets("หนึ่ง")[0].lemma_names("tha"))
-[]
->>> print(wordnet.synset("one.s.05"))
-Synset('one.s.05')
->>> print(wordnet.synset("spy.n.01").lemmas())
-[Lemma('spy.n.01.spy'), Lemma('spy.n.01.undercover_agent')]
->>> print(wordnet.synset("spy.n.01").lemma_names("tha"))
-['สปาย', 'สายลับ']
+from pythainlp.corpus import wordnet
+
+print(wordnet.synsets("หนึ่ง"))
+# [Synset('one.s.05'), Synset('one.s.04'), Synset('one.s.01'), Synset('one.n.01')]
+
+print(wordnet.synsets("หนึ่ง")[0].lemma_names("tha"))
+# []
+
+print(wordnet.synset("one.s.05"))
+# Synset('one.s.05')
+
+print(wordnet.synset("spy.n.01").lemmas())
+# [Lemma('spy.n.01.spy'), Lemma('spy.n.01.undercover_agent')]
+
+print(wordnet.synset("spy.n.01").lemma_names("tha"))
+# ['สปาย', 'สายลับ']
 ```
 
 #### stopword ภาษาไทย
@@ -591,6 +592,7 @@ alphabet.get_data()
 ```python
 from pythainlp.corpus.thaiword import get_data  # ข้อมูลเก่า
 get_data()
+
 from pythainlp.corpus.newthaiword import get_data  # ข้อมูลใหม่
 get_data()
 ```
@@ -622,9 +624,9 @@ text_list คือ ข้อความภาษาไทยที่อยู
 **ตัวอย่าง**
 
 ```python
->>> d = ["หนองคาย", "น่าอยู่", "นอกจากนี้", "ยัง", "มี", "เชียงใหม่"]
->>> parsed_docs(d)
-["[LOC : 'หนองคาย']", 'น่าอยู่', 'นอกจากนี้', 'ยัง', 'มี', "[LOC : 'เชียงใหม่']"]
+d = ["หนองคาย", "น่าอยู่", "นอกจากนี้", "ยัง", "มี", "เชียงใหม่"]
+parsed_docs(d)
+# ["[LOC : 'หนองคาย']", 'น่าอยู่', 'นอกจากนี้', 'ยัง', 'มี', "[LOC : 'เชียงใหม่']"]
 ```
 
 #### ConceptNet
@@ -656,7 +658,6 @@ word คือ คำ
 domain คือ หมวดหมู่ของคำ
 
 มีหมวดหมู่ดังนี้
-
 - all
 - imaginative
 - natural-pure-science
@@ -668,5 +669,3 @@ domain คือ หมวดหมู่ของคำ
 - belief-thought
 - leisure
 - others
-
-เขียนโดย PyThaiNLP
diff --git a/examples/tokenize.py b/examples/tokenize.py
index c6b6028e5..0b8a0d00b 100644
--- a/examples/tokenize.py
+++ b/examples/tokenize.py
@@ -20,5 +20,5 @@
 print(word_tokenize(text2))
 # ['กฎหมายแรงงาน']
 
-print(word_tokenize(text2, engine="longest-matching"))
+print(word_tokenize(text2, engine="longest"))
 # ['กฎหมาย', 'แรงงาน']
diff --git a/pythainlp/MetaSound.py b/pythainlp/MetaSound.py
deleted file mode 100644
index 1027462f0..000000000
--- a/pythainlp/MetaSound.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import,division,unicode_literals,print_function
-from builtins import *
-'''
-MetaSound
-
-References
-
-Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analysing Names Given in Accordance with Thai Astrology. Retrieved from https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
-'''
-import re
-def MetaSound(name):
-    '''
-    Thai MetaSound
-
-    :param str name: thai text
-    :return: MetaSound for thai text
-    **Example**::
-        >>> from pythainlp.MetaSound import MetaSound
-        >>> MetaSound('รัก')
-        '501'
-        >>> MetaSound('ลัก')
-        '501'
-    '''
-    name1=list(name)
-    count=len(name1)
-    word=[]
-    i=0
-    while i<count:
-        if (re.search(r'[ก-ฮ]',name1[i]),re.U):
-            word.append(name1[i])
-        i+=1
-    i=0
-    count=len(name1)
-    while i<count:
-        if (re.search('์',name1[i],re.U)):
-            word[i-1]=''
-            word[i]=''
-        i+=1
-    i=0
-    while i<count:
-        if (re.search('[กขฃคฆฅ]',word[i],re.U)):
-            name1[i]='1'
-        elif (re.search('[จฉชฌซฐทฒดฎตสศษ]',word[i],re.U)):
-            name1[i]='2'
-        elif (re.search('[ฟฝพผภบป]',word[i],re.U)):
-            name1[i]='3'
-        elif (re.search('[ง]',word[i],re.U)):
-            name1[i]='4'
-        elif (re.search('[ลฬรนณฦญ]',word[i],re.U)):
-            name1[i]='5'
-        elif (re.search('[ม]',word[i],re.U)):
-            name1[i]='6'
-        elif (re.search('[ย]',word[i],re.U)):
-            name1[i]='7'
-        elif (re.search('[ว]',word[i],re.U)):
-            name1[i]='8'
-        else:
-            name1[i]='0'
-        i+=1
-    return ''.join(name1)
-if __name__ == '__main__':
-    print(MetaSound('รัก'))
-    print(MetaSound('ลัก'))
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py
index edae7c220..2130a245e 100644
--- a/pythainlp/__init__.py
+++ b/pythainlp/__init__.py
@@ -3,7 +3,7 @@
 from pythainlp.collation import collation
 from pythainlp.date import now
 from pythainlp.keywords import find_keyword
-from pythainlp.MetaSound import MetaSound
+from pythainlp.metasound import metasound
 from pythainlp.rank import rank
 from pythainlp.romanization import romanize
 from pythainlp.sentiment import sentiment
diff --git a/pythainlp/metasound.py b/pythainlp/metasound.py
new file mode 100644
index 000000000..fb07aabe3
--- /dev/null
+++ b/pythainlp/metasound.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+MetaSound - Thai soundex system
+
+References:
+Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with a Statistical
+Ontology for Analysing Names Given in Accordance with Thai Astrology.
+https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
+"""
+
+_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
+_THANTHAKHAT = "์"  # \u0e4c
+_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
+_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
+_C3 = "ฟฝพผภบป"  # B -> 3
+_C4 = "ง"  # NG -> 4
+_C5 = "ลฬรนณฦญ"  # N -> 5
+_C6 = "ม"  # M -> 6
+_C7 = "ย"  # Y -> 7
+_C8 = "ว"  # W -> 8
+
+# Lookup table mapping each consonant to its code digit, built once at
+# import time from the sound groups above.
+_CODES = {
+    ch: str(digit)
+    for digit, group in enumerate((_C1, _C2, _C3, _C4, _C5, _C6, _C7, _C8), 1)
+    for ch in group
+}
+
+
+def metasound(text, length=4):
+    """
+    Thai MetaSound
+
+    Only the Thai consonants of the text are used; every other character is
+    ignored. A karan (a thanthakhat and the consonant right before it) is
+    dropped, so it neither turns into a "0" nor takes up one of the
+    ``length`` positions. The first remaining consonant is kept as is and
+    the following ones are replaced by their sound-group digit ("0" when a
+    consonant belongs to no group). The result is cut or padded with "0"
+    to ``length`` characters.
+
+    :param str text: Thai text
+    :param int length: preferred length of the MetaSound (default is 4)
+    :return: MetaSound for the text
+    :rtype: str
+
+    **Example**::
+
+        from pythainlp.metasound import metasound
+
+        metasound("ลัก")  # 'ล100'
+        metasound("รัก")  # 'ร100'
+        metasound("รักษ์")  # 'ร100'
+        metasound("บูรณการ", 5)  # 'บ5515'
+        metasound("จันทร์เพ็ญ")  # 'จ523'
+        metasound("")  # '0000'
+    """
+    # keep only consonants and thanthakhat
+    kept = [ch for ch in text if ch in _CONS_THANTHAKHAT]
+
+    # remove karan (thanthakhat and a consonant before it): collect the
+    # positions first, then rebuild the list without them
+    silent = set()
+    for i, ch in enumerate(kept):
+        if ch == _THANTHAKHAT:
+            silent.update((i - 1, i))  # i - 1 is -1 at i == 0: matches nothing
+    chars = [ch for i, ch in enumerate(kept) if i not in silent]
+
+    # retain first consonant, encode the rest; a consonant outside every
+    # sound group (e.g. ห, อ, ฮ) is coded "0"
+    chars = chars[:length]
+    coded = chars[:1] + [_CODES.get(ch, "0") for ch in chars[1:]]
+
+    # pad with "0" when fewer than `length` characters are left; the
+    # multiplier is <= 0 otherwise, which yields an empty string
+    coded.extend("0" * (length - len(coded)))
+
+    return "".join(coded)
+
+
+if __name__ == "__main__":
+    print(metasound("บูรณะ"))  # บ550 (an example from the original paper [Figure 4])
+    print(metasound("บูรณการ", 5))  # บ5515
+    print(metasound("ลักษณะ"))  # ล125
+    print(metasound("ลัก"))  # ล100
+    print(metasound("รัก"))  # ร100
+    print(metasound("รักษ์"))  # ร100
+    print(metasound(""))  # 0000
+
+    print(metasound("คน"))  # ค500
+    print(metasound("คนA"))  # ค500 (characters other than Thai consonants are dropped)
+    print(metasound("ดา"))  # ด000 (the first consonant is kept, never encoded)
+    print(metasound("ปา"))  # ป000
+    print(metasound("งา"))  # ง000
+    print(metasound("ลา"))  # ล000
+    print(metasound("มา"))  # ม000
+    print(metasound("วา"))  # ว000
\ No newline at end of file
diff --git a/pythainlp/romanization/pyicu.py b/pythainlp/romanization/pyicu.py
index 3ba61fc18..732db3e24 100644
--- a/pythainlp/romanization/pyicu.py
+++ b/pythainlp/romanization/pyicu.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 
-import sys
-
 try:
     import icu
 except ImportError:
diff --git a/pythainlp/sentiment/ulmfit_sent.py b/pythainlp/sentiment/ulmfit_sent.py
index fb58d227f..2fd6a1d27 100644
--- a/pythainlp/sentiment/ulmfit_sent.py
+++ b/pythainlp/sentiment/ulmfit_sent.py
@@ -3,7 +3,6 @@
 Sentiment analyzer based on thai2vec ("ulmfit" engine)
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
-import sys
 from collections import defaultdict
 
 from pythainlp.corpus import download, get_file
@@ -85,7 +84,8 @@ def about():
     return """
     Sentiment analyzer based on thai2vec
     Data is from various online reviews including but not limited to JagerV3 and Wongnai Challenge.
-    89% accuracy based on 15% validation set compared to 72% of fastText and 52% most-frequent-class baseline.
+    89% accuracy based on 15% validation set compared to
+    72% of fastText and 52% most-frequent-class baseline.
 
     Development: Charin Polpanumas
     GitHub: https://github.com/cstorm125/thai2vec
diff --git a/pythainlp/tag/__init__.py b/pythainlp/tag/__init__.py
index 8efb234e4..ec367816c 100644
--- a/pythainlp/tag/__init__.py
+++ b/pythainlp/tag/__init__.py
@@ -2,9 +2,8 @@
 """
 Part-Of-Speech tagger
 """
-import sys
 
-ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
+_ARTAGGER_URL = "https://github.com/wannaphongcom/artagger/archive/master.zip"
 
 
 def pos_tag(words, engine="unigram", corpus="orchid"):
@@ -31,11 +30,11 @@ def _tag(text, corpus=None):
             except ImportError:
                 from pythainlp.tools import install_package
 
-                install_package(ARTAGGER_URL)
+                install_package(_ARTAGGER_URL)
                 try:
                     from artagger import Tagger
                 except ImportError:
-                    raise ImportError("Error: Try 'pip install " + ARTAGGER_URL + "'")
+                    raise ImportError("Error: Try 'pip install " + _ARTAGGER_URL + "'")
 
             words = Tagger().tag(" ".join(text))
 
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
index 1b0927d9a..519b35a03 100644
--- a/pythainlp/tag/perceptron.py
+++ b/pythainlp/tag/perceptron.py
@@ -24,7 +24,7 @@ def pud_data():
     return model
 
 
-def tag(text, corpus):
+def tag(text, corpus="pud"):
     """
     รับค่าเป็น ''list'' คืนค่าเป็น ''list'' เช่น [('ข้อความ', 'ชนิดคำ')]"""
     if corpus == "orchid":
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 2cc0c689c..c5e7818c4 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -16,42 +16,48 @@
 
 def word_tokenize(text, engine="newmm", whitespaces=True):
     """
-    :param str text:  the text to be tokenized
-    :param str engine: the engine to tokenize text
-    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai.
+    :param str text: text to be tokenized
+    :param str engine: tokenizer to be used
+    :param bool whitespaces: True to output no whitespace, a common mark of sentence or end of phrase in Thai
     :Parameters for engine:
-        * newmm - Maximum Matching algorithm + TCC
-        * icu -  IBM ICU
-        * longest-matching - Longest matching
-        * mm - Maximum Matching algorithm
-        * pylexto - LexTo
-        * deepcut - Deep Neural Network
-        * wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
-    :return: A list of words, tokenized from a text
+        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
+        * longest - dictionary-based, Longest Matching
+        * icu - wrapper for ICU, dictionary-based
+        * wordcutpy - wrapper for wordcutpy, dictionary-based https://github.com/veer66/wordcutpy
+        * pylexto - wrapper for PyLexTo, dictionary-based, Longest Matching
+        * deepcut - wrapper for deepcut, language-model-based https://github.com/rkcosmos/deepcut
+        * ulmfit - use newmm engine with a specific dictionary for use with thai2vec
+    :return: list of words, tokenized from the text
 
     **Example**::
-    from pythainlp.tokenize import word_tokenize
-    text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
-    word_tokenize(text, engine="newmm")  # ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
-    word_tokenize(text, engine="icu")  # ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
+        >>> from pythainlp.tokenize import word_tokenize
+        >>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
+        >>> word_tokenize(text, engine="newmm")
+        ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
+        >>> word_tokenize(text, engine="icu")
+        ['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
     """
-    if engine == "icu":
-        from .pyicu import segment
-    elif engine == "multi_cut" or engine == "mm":
-        from .multi_cut import segment
+    if engine == "newmm" or engine == "onecut":
+        from .newmm import mmcut as segment
+    elif engine == "longest" or engine == "longest-matching":
+        from .longest import segment
     elif engine == "ulmfit":
         from .newmm import mmcut
+
         def segment(text):
             return mmcut(text, trie=FROZEN_DICT_TRIE)
-    elif engine == "longest-matching":
-        from .longest import segment
-    elif engine == "pylexto":
-        from .pylexto import segment
+
+    elif engine == "icu":
+        from .pyicu import segment
     elif engine == "deepcut":
         from .deepcut import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "pylexto":
+        from .pylexto import segment
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     if not whitespaces:
@@ -63,27 +69,28 @@ def segment(text):
 def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
     """
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
-
-    :param str text: the text to be tokenized
-    :param dict custom_dict_trie: คือ trie ที่สร้างจาก create_custom_dict_trie
-    :param str engine: choose between different options of engine to token (newmm, wordcutpy, mm, longest-matching)
-    :return: A list of words, tokenized from a text.
+    :param str text: text to be tokenized
+    :param dict custom_dict_trie: a dictionary trie
+    :param str engine: tokenization engine to use (newmm, longest, wordcutpy)
+    :return: list of words
     **Example**::
         >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
-        >>> listword=['แมว',"ดี"]
-        >>> data_dict=create_custom_dict_trie(listword)
-        >>> dict_word_tokenize("แมวดีดีแมว",data_dict)
+        >>> listword = ["แมว", "ดี"]
+        >>> data_dict = create_custom_dict_trie(listword)
+        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
-    if engine == "mm" or engine == "multi_cut":
-        from .multi_cut import segment
-    elif engine == "longest-matching":
+    if engine == "newmm" or engine == "onecut":
+        from .newmm import mmcut as segment
+    elif engine == "longest" or engine == "longest-matching":
         from .longest import segment
     elif engine == "wordcutpy":
         from .wordcutpy import segment
 
         return segment(text, custom_dict_trie.keys())
-    else:  # default, use "newmm" ("onecut") engine
+    elif engine == "mm" or engine == "multi_cut":
+        from .multi_cut import segment
+    else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
     return segment(text, custom_dict_trie)
@@ -167,11 +174,12 @@ def syllable_tokenize(text):
 
 
 def create_custom_dict_trie(custom_dict_source):
-    """The function is used to create a custom dict trie which will be used for word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html
-
-    :param string/list custom_dict_source:  a list of vocaburaries or a path to source file
+    """
+    Create a custom dictionary trie to be passed to dict_word_tokenize() as custom_dict_trie.
+    For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html
 
-    :return: A trie created from custom dict input
+    :param string/list custom_dict_source: a list of vocabularies or a path to source file
+    :return: a trie created from custom dictionary input
     """
 
     if type(custom_dict_source) is str:
diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py
index bad0eb906..20f744f25 100644
--- a/pythainlp/tokenize/deepcut.py
+++ b/pythainlp/tokenize/deepcut.py
@@ -2,7 +2,6 @@
 """
 Wrapper for deepcut Thai word segmentation
 """
-import sys
 
 try:
     import deepcut
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index a6b1ad8d8..1b50e41cb 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -2,7 +2,8 @@
 """
 Longest-matching Thai word segmentation
 
-Based on code from https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
+Based on code from
+https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py
 """
 import re
 
@@ -35,7 +36,6 @@
 
 
 class Tokenizer(object):
-
     def __init__(self, trie):
         self.__trie = trie
 
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
index d8c107e0d..45b5adb05 100644
--- a/pythainlp/tokenize/pyicu.py
+++ b/pythainlp/tokenize/pyicu.py
@@ -3,7 +3,6 @@
 Wrapper for ICU word segmentation
 """
 import re
-import sys
 
 try:
     import icu
diff --git a/pythainlp/tokenize/pylexto.py b/pythainlp/tokenize/pylexto.py
index ba137db7b..a90bb3109 100644
--- a/pythainlp/tokenize/pylexto.py
+++ b/pythainlp/tokenize/pylexto.py
@@ -2,7 +2,6 @@
 """
 Wrapper for LexTo Thai word segmentation
 """
-import sys
 
 _LEXTO_URL = "https://github.com/PyThaiNLP/pylexto/archive/master.zip"
 
diff --git a/pythainlp/tokenize/wordcutpy.py b/pythainlp/tokenize/wordcutpy.py
index 2dff149e3..04b810816 100644
--- a/pythainlp/tokenize/wordcutpy.py
+++ b/pythainlp/tokenize/wordcutpy.py
@@ -2,7 +2,6 @@
 """
 Wrapper for WordCut Thai word segmentation
 """
-import sys
 
 try:
     from wordcut import Wordcut
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
index 138db40c1..75ebd00c5 100644
--- a/pythainlp/ulmfit/utils.py
+++ b/pythainlp/ulmfit/utils.py
@@ -4,7 +4,6 @@
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
 """
 import re
-import sys
 
 from pythainlp.corpus import download, get_file
 from pythainlp.tokenize import word_tokenize
@@ -43,12 +42,11 @@ class ThaiTokenizer:
     def __init__(self, engine="newmm"):
         """
         :parameters for tokenization engine:
-            * newmm - Maximum Matching algorithm + TCC
-            * icu - IBM ICU
-            * longest-matching - Longest matching
-            * mm - Maximum Matching algorithm
-            * pylexto - LexTo
-            * deepcut - Deep Neural Network
+            * newmm - dictionary-based, Maximum Matching algorithm + TCC
+            * longest - dictionary-based, Longest Matching
+            * icu - use ICU, dictionary-based
+            * pylexto - use LexTo, dictionary-based
+            * deepcut - use deepcut, language model-based
         """
         self.engine = engine
         self.__RE_BR = re.compile(r"<\s*br\s*/?>", re.IGNORECASE)
diff --git a/pythainlp/word_vector/thai2vec.py b/pythainlp/word_vector/thai2vec.py
index c31f8f685..a390eae06 100644
--- a/pythainlp/word_vector/thai2vec.py
+++ b/pythainlp/word_vector/thai2vec.py
@@ -3,8 +3,6 @@
 thai2vec - Thai word vector
 Code by https://github.com/cstorm125/thai2vec/blob/master/notebooks/examples.ipynb
 """
-import sys
-
 from pythainlp.corpus import download as download_data
 from pythainlp.corpus import get_file
 from pythainlp.tokenize import word_tokenize
@@ -40,9 +38,9 @@ def get_model():
 
 def most_similar_cosmul(positive, negative):
     """
-	การใช้งาน
-	input list
-	"""
+    การใช้งาน
+    input list
+    """
     return get_model().most_similar_cosmul(positive=positive, negative=negative)
 
 
@@ -74,10 +72,10 @@ def sentence_vectorizer(text, dim=300, use_mean=False):
 
 def about():
     return """
-	thai2vec
-	State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
+    thai2vec
+    State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
     Created as part of pyThaiNLP with ULMFit implementation from fast.ai
-	
-	Development: Charin Polpanumas
-	GitHub: https://github.com/cstorm125/thai2vec
-	"""
+
+    Development: Charin Polpanumas
+    GitHub: https://github.com/cstorm125/thai2vec
+    """
diff --git a/tests/__init__.py b/tests/__init__.py
index 6b0d0ef7e..87e7f5ddd 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -17,7 +17,7 @@
 )
 from pythainlp.date import now, reign_year_to_ad, now_reign_year
 from pythainlp.keywords import find_keyword
-from pythainlp.MetaSound import MetaSound
+from pythainlp.metasound import metasound
 from pythainlp.ner import ThaiNameRecognizer
 from pythainlp.number import numtowords
 from pythainlp.rank import rank
@@ -30,6 +30,7 @@
 from pythainlp.util import listtext_num2num, normalize
 from pythainlp.Text import Text
 
+
 class TestUM(unittest.TestCase):
     """
     ทดสอบการทำงาน
@@ -78,7 +79,7 @@ def test_segment_newmm(self):
 
     def test_segment_longest_matching(self):
         self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest-matching"),
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
             ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
         )
 
@@ -132,15 +133,11 @@ def test_lk82(self):
         self.assertEqual(Udom83("รถ"), "ร800000")
 
     def test_ms(self):
-        self.assertEqual(MetaSound("คน"), "15")
-        self.assertEqual(MetaSound("คนA"), "150")
-        self.assertEqual(MetaSound("ดา"), "20")
-        self.assertEqual(MetaSound("ปา"), "30")
-        self.assertEqual(MetaSound("งา"), "40")
-        self.assertEqual(MetaSound("ลา"), "50")
-        self.assertEqual(MetaSound("มา"), "60")
-        self.assertEqual(MetaSound("วา"), "80")
-        self.assertEqual(MetaSound("ลัก"), MetaSound("รัก"))
+        self.assertEqual(metasound("บูรณะ"), "บ550")
+        self.assertEqual(metasound("คน"), "ค500")
+        self.assertEqual(metasound("คนA"), "ค500")
+        self.assertEqual(metasound("ดา"), "ด000")
+        self.assertEqual(metasound("รักษ์"), metasound("รัก"))
 
     def test_wordnet(self):
         self.assertEqual(
@@ -251,8 +248,10 @@ def test_ner(self):
                 ("เช้า", "I-TIME"),
             ],
         )
+
     def test_Text(self):
         self.assertIsNotNone(Text("ทดสอบภาษาไทย"))
 
+
 if __name__ == "__main__":
     unittest.main()