From 72a675ead8edf327b0ec8bf5e26fcb57d4ffab17 Mon Sep 17 00:00:00 2001 From: Qingyun Liu Date: Mon, 3 Oct 2016 00:17:42 -0400 Subject: [PATCH 1/3] Turning in my mini project 1 --- mini_project_1.ipynb | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 mini_project_1.ipynb diff --git a/mini_project_1.ipynb b/mini_project_1.ipynb new file mode 100644 index 0000000..1ea2ff0 --- /dev/null +++ b/mini_project_1.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'the'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pattern.web import *\n", + "import string\n", + "\n", + "christianity=URL(\"http://www.gutenberg.org/cache/epub/8294/pg8294.txt\").download()\n", + "\n", + "\n", + "\n", + " \n", + "def process_file(filename):\n", + " dic = dict()\n", + " fil = filename\n", + " for word in fil.split():\n", + " words = word.replace(\"-\",\" \")\n", + " wordss = words.strip(string.punctuation + string.whitespace)\n", + " wordss = wordss.lower()\n", + " if wordss in dic:\n", + " dic[wordss] += 1\n", + " else:\n", + " dic [wordss] = 1\n", + " le=[]\n", + " for a in dic:\n", + " le.append((dic[a],a))\n", + " le.sort(reverse=True)\n", + " for a,b in le:\n", + " print b\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "process_file(christianity)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'the'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pattern.web import *\n", + "import string\n", + "\n", + "buddhism=URL(\"http://www.gutenberg.org/files/15255/15255-0.txt\").download()\n", + "\n", + "\n", + "\n", + " \n", + "def process_file(filename):\n", + " dic 
= dict()\n", + " fil = filename\n", + " for word in fil.split():\n", + " words = word.replace(\"-\",\" \")\n", + " wordss = words.strip(string.punctuation + string.whitespace)\n", + " wordss = wordss.lower()\n", + " if wordss in dic:\n", + " dic[wordss] += 1\n", + " else:\n", + " dic [wordss] = 1\n", + " le=[]\n", + " for a in dic:\n", + " le.append((dic[a],a))\n", + " le.sort(reverse=True)\n", + " for a,b in le:\n", + " print b\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "process_file(buddhism)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Project overview:\n", + " For the data, I used two books from_ the project glutenberg to analyze. I used the technique of counting the top\n", + " ten words that appeared in_ the book. Since the two book that I picked are religious books. One is_ about buddhism \n", + " and_ the other one is_ about christianity. I want to see how the top words in_ these two books can show the difference\n", + " of the two religion.\n", + "Implementation:\n", + " The main logic of my analysis is_ create a dictionary that has all_ the words in_ the book as_ keys and_ everytime\n", + " the word shows up again, the value of the key increase by one. So that the frequency of all_ the words in_ the book\n", + " will be found. At the beginning I thought about weather I should download the book to the computer first or_ I should\n", + " just create a variable and_ set_ its value to be the book content. I decided to create a varaible so that all_ \n", + " computers will be able to use my code directly.\n", + "Results:\n", + " The top ten words of the christianity book are god,Israel,Son,Man,King,People,We,Children,Land,and_ Father.\n", + " The top ten words of the buddhism book are Buddha,Life,India,Existence,Human,Ideas,Knowledge,Universe,Intellectual,\n", + " and_ China.\n", + " I think the words are definitely different. 
They all show the main idol of the religion: God and Buddha. They also\n", + " show the main place of the religion: Israel and India. The other words show that Christianity is more about people.\n", + " Words like son, we, father, and children show that Christianity is mainly about people and people's relationships.\n", + " In contrast, the Buddhism words are more about thinking and ideas: words like life, idea, knowledge, universe, and\n", + " intellectual. These words show how Buddhism pays more attention to how people think and the importance of thinking.\n", + "Reflection:\n", + " The process went pretty smoothly since I had a lot of similar practice in the completion of the reading journals.\n", + " One thing that I need to pay attention to later on is that I need to be more careful about what format the input is\n", + " in. The first time I ran the code, I got a count of each of the alphabetical letters in the book. That is\n", + " because I did not realize that the input is not separated by lines. 
So I over divided it and_ get all_ letters instead\n", + " of words.\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 615bf7c85bbd351b112c88292e0b99b93e7e379f Mon Sep 17 00:00:00 2001 From: lqyeric94 Date: Mon, 3 Oct 2016 00:21:25 -0400 Subject: [PATCH 2/3] Add files via upload --- mini_project_1.ipynb | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 mini_project_1.ipynb diff --git a/mini_project_1.ipynb b/mini_project_1.ipynb new file mode 100644 index 0000000..1ea2ff0 --- /dev/null +++ b/mini_project_1.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'the'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pattern.web import *\n", + "import string\n", + "\n", + "christianity=URL(\"http://www.gutenberg.org/cache/epub/8294/pg8294.txt\").download()\n", + "\n", + "\n", + "\n", + " \n", + "def process_file(filename):\n", + " dic = dict()\n", + " fil = filename\n", + " for word in fil.split():\n", + " words = word.replace(\"-\",\" \")\n", + " wordss = words.strip(string.punctuation + string.whitespace)\n", + " wordss = wordss.lower()\n", + " if wordss in dic:\n", + " dic[wordss] += 1\n", + " else:\n", + " dic [wordss] = 1\n", + " le=[]\n", + " for a in dic:\n", + " le.append((dic[a],a))\n", + " le.sort(reverse=True)\n", + " for a,b in le:\n", + " print b\n", + " \n", + " \n", + " \n", + " \n", + "\n", + 
"process_file(christianity)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'the'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pattern.web import *\n", + "import string\n", + "\n", + "buddhism=URL(\"http://www.gutenberg.org/files/15255/15255-0.txt\").download()\n", + "\n", + "\n", + "\n", + " \n", + "def process_file(filename):\n", + " dic = dict()\n", + " fil = filename\n", + " for word in fil.split():\n", + " words = word.replace(\"-\",\" \")\n", + " wordss = words.strip(string.punctuation + string.whitespace)\n", + " wordss = wordss.lower()\n", + " if wordss in dic:\n", + " dic[wordss] += 1\n", + " else:\n", + " dic [wordss] = 1\n", + " le=[]\n", + " for a in dic:\n", + " le.append((dic[a],a))\n", + " le.sort(reverse=True)\n", + " for a,b in le:\n", + " print b\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "process_file(buddhism)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Project overview:\n", + " For the data, I used two books from_ the project glutenberg to analyze. I used the technique of counting the top\n", + " ten words that appeared in_ the book. Since the two book that I picked are religious books. One is_ about buddhism \n", + " and_ the other one is_ about christianity. I want to see how the top words in_ these two books can show the difference\n", + " of the two religion.\n", + "Implementation:\n", + " The main logic of my analysis is_ create a dictionary that has all_ the words in_ the book as_ keys and_ everytime\n", + " the word shows up again, the value of the key increase by one. So that the frequency of all_ the words in_ the book\n", + " will be found. 
At the beginning I thought about whether I should download the book to the computer first or whether I should\n", + " just create a variable and set its value to be the book content. I decided to create a variable so that all\n", + " computers will be able to use my code directly.\n", + "Results:\n", + " The top ten words of the Christianity book are God, Israel, Son, Man, King, People, We, Children, Land, and Father.\n", + " The top ten words of the Buddhism book are Buddha, Life, India, Existence, Human, Ideas, Knowledge, Universe, Intellectual,\n", + " and China.\n", + " I think the words are definitely different. They all show the main idol of the religion: God and Buddha. They also\n", + " show the main place of the religion: Israel and India. The other words show that Christianity is more about people.\n", + " Words like son, we, father, and children show that Christianity is mainly about people and people's relationships.\n", + " In contrast, the Buddhism words are more about thinking and ideas: words like life, idea, knowledge, universe, and\n", + " intellectual. These words show how Buddhism pays more attention to how people think and the importance of thinking.\n", + "Reflection:\n", + " The process went pretty smoothly since I had a lot of similar practice in the completion of the reading journals.\n", + " One thing that I need to pay attention to later on is that I need to be more careful about what format the input is\n", + " in. The first time I ran the code, I got a count of each of the alphabetical letters in the book. That is\n", + " because I did not realize that the input is not separated by lines. 
So I over divided it and_ get all_ letters instead\n", + " of words.\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 15d287dca4a6a704f9bff70f08929506941cc9de Mon Sep 17 00:00:00 2001 From: Qingyun Liu Date: Mon, 3 Oct 2016 14:47:01 -0400 Subject: [PATCH 3/3] Turning in my mini project 1 --- mini_project_1.ipynb | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mini_project_1.ipynb b/mini_project_1.ipynb index 1ea2ff0..945cb91 100644 --- a/mini_project_1.ipynb +++ b/mini_project_1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -13,7 +13,7 @@ "'the'" ] }, - "execution_count": 20, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -43,7 +43,7 @@ " le.append((dic[a],a))\n", " le.sort(reverse=True)\n", " for a,b in le:\n", - " print b\n", + " return b\n", " \n", " \n", " \n", @@ -104,6 +104,15 @@ "process_file(buddhism)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,