diff --git a/exercises/1901100139/d10/mymodule/stats_word.py b/exercises/1901100139/d10/mymodule/stats_word.py index a1b269578..0742f13be 100644 --- a/exercises/1901100139/d10/mymodule/stats_word.py +++ b/exercises/1901100139/d10/mymodule/stats_word.py @@ -1,4 +1,5 @@ from collections import Counter +import jieba def stats_text_en(text,count): elements = text.split() words = [] @@ -10,11 +11,13 @@ def stats_text_en(text,count): words.append(element) return Counter(words).most_common(count) def stats_text_cn(text,count): - cn_characters = [] - for character in text: - if '\u4e00' <= character <= '\u9fff': - cn_characters.append(character) - return Counter(cn_characters).most_common(count) + words=jieba.cut(text) + tmp=[] + for i in words: + if len(i)>1: + tmp.append(i) + return Counter(tmp).most_common(count) + def stats_text(text,count): ''' 合并中英词频的结果 diff --git a/exercises/1901100139/d11/mymodule/stats_word.py b/exercises/1901100139/d11/mymodule/stats_word.py index a1b269578..0742f13be 100644 --- a/exercises/1901100139/d11/mymodule/stats_word.py +++ b/exercises/1901100139/d11/mymodule/stats_word.py @@ -1,4 +1,5 @@ from collections import Counter +import jieba def stats_text_en(text,count): elements = text.split() words = [] @@ -10,11 +11,13 @@ def stats_text_en(text,count): words.append(element) return Counter(words).most_common(count) def stats_text_cn(text,count): - cn_characters = [] - for character in text: - if '\u4e00' <= character <= '\u9fff': - cn_characters.append(character) - return Counter(cn_characters).most_common(count) + words=jieba.cut(text) + tmp=[] + for i in words: + if len(i)>1: + tmp.append(i) + return Counter(tmp).most_common(count) + def stats_text(text,count): ''' 合并中英词频的结果 diff --git a/exercises/1901100139/d12/main.py b/exercises/1901100139/d12/main.py index 9e58a1533..60d86f52d 100644 --- a/exercises/1901100139/d12/main.py +++ b/exercises/1901100139/d12/main.py @@ -25,7 +25,7 @@ def handler(msg): msg.reply(str(result)) except Exception as e: logging.exception(e) - embed + embed() if __name__=='__main__': diff --git a/exercises/1901100139/d12/mymodule/stats_word.py b/exercises/1901100139/d12/mymodule/stats_word.py index a1b269578..0742f13be 100644 --- a/exercises/1901100139/d12/mymodule/stats_word.py +++ b/exercises/1901100139/d12/mymodule/stats_word.py @@ -1,4 +1,5 @@ from collections import Counter +import jieba def stats_text_en(text,count): elements = text.split() words = [] @@ -10,11 +11,13 @@ def stats_text_en(text,count): words.append(element) return Counter(words).most_common(count) def stats_text_cn(text,count): - cn_characters = [] - for character in text: - if '\u4e00' <= character <= '\u9fff': - cn_characters.append(character) - return Counter(cn_characters).most_common(count) + words=jieba.cut(text) + tmp=[] + for i in words: + if len(i)>1: + tmp.append(i) + return Counter(tmp).most_common(count) + def stats_text(text,count): ''' 合并中英词频的结果 diff --git a/exercises/1901100139/d13/main.py b/exercises/1901100139/d13/main.py new file mode 100644 index 000000000..68efc3ca8 --- /dev/null +++ b/exercises/1901100139/d13/main.py @@ -0,0 +1,60 @@ +from os import path +import logging +import requests +import matplotlib.pyplot as plt +import pyquery +from wxpy import * +from mymodule import stats_word + +cwd=path.abspath(path.dirname(__file__)) + +plt.rcParams['font.sans-serif']='simHei' + +logging.basicConfig(format='file:%(filename)s|line:%(lineno)d|message:%(message)s',level=logging.DEBUG) + +def get_article(url): + r=requests.get(url) + document= pyquery.PyQuery(r.text) + return document('#js_content').text() + +def generate_image(data,image_path): + labels=[v[0] for v in data] + widths=[v[1] for v in data] + ypos = range(len(data)) + fig,ax=plt.subplots() + ax.barh(ypos,widths) + ax.set_yticks(ypos) + ax.set_yticklabels(labels) + ax.invert_yaxis() + ax.set_ylabel('关键字') + ax.set_xlabel('词频') + ax.set_title('词频统计') + fig.savefig(image_path,bbox_inches='tight') + +def main(): + bot=Bot() + friends=bot.friends() + + @bot.register(friends,SHARING) + def handler(msg): + try: + logging.info('sharing url=%s',msg.url) + article=get_article(msg.url) + result=stats_word.stats_text_cn(article,20) + image_path=path.join(cwd,'stats.png') + generate_image(result,image_path) + msg.reply_image(image_path) + except Exception as e: + logging.exception(e) + embed() + +def test(): + article=get_article('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA') + result=stats_word.stats_text_cn(article,20) + image_path=path.join(cwd,'stats.png') + generate_image(result,image_path) + +if __name__=='__main__': + test() + + diff --git a/exercises/1901100139/d13/mymodule/stats_word.py b/exercises/1901100139/d13/mymodule/stats_word.py new file mode 100644 index 000000000..0742f13be --- /dev/null +++ b/exercises/1901100139/d13/mymodule/stats_word.py @@ -0,0 +1,25 @@ +from collections import Counter +import jieba +def stats_text_en(text,count): + elements = text.split() + words = [] + symbols = ',.*-!' + for element in elements: + for symbol in symbols: + element = element.replace(symbol,'') + if len(element) and element.isascii(): + words.append(element) + return Counter(words).most_common(count) +def stats_text_cn(text,count): + words=jieba.cut(text) + tmp=[] + for i in words: + if len(i)>1: + tmp.append(i) + return Counter(tmp).most_common(count) + +def stats_text(text,count): + ''' + 合并中英词频的结果 + ''' + return stats_text_en(text,count) + stats_text_cn(text,count) \ No newline at end of file