def process_file(filename):
    """Return the most frequent word in a block of text.

    Despite its name, ``filename`` is the text itself, not a path: the
    notebook passes in the already-downloaded book contents.

    Hyphens are treated as word separators, so ``"well-known"`` and
    ``"word--word"`` each contribute two words.  Every token is stripped of
    surrounding punctuation and whitespace and lower-cased before it is
    counted; tokens that consist only of punctuation are ignored.

    Args:
        filename: The full text to analyse, as a single string.

    Returns:
        The word with the highest count.  Ties go to the word that sorts
        last, which is what sorting ``(count, word)`` pairs in descending
        order and taking the first pair yields.  Returns ``None`` when the
        text contains no words at all.
    """
    strip_chars = string.punctuation + string.whitespace
    counts = {}
    # Replace hyphens *before* splitting.  Doing it per token (after the
    # split) leaves a space inside the token, so "well-known" would be
    # counted as the single word "well known".
    for token in filename.replace("-", " ").split():
        word = token.strip(strip_chars).lower()
        if not word:
            # Pure-punctuation tokens such as "***" strip down to "" and
            # must not be counted as a word.
            continue
        counts[word] = counts.get(word, 0) + 1

    if not counts:
        return None
    # max() over (count, word) pairs picks the highest count and, among
    # equal counts, the word that sorts last.
    return max((n, w) for w, n in counts.items())[1]
in dic:\n", + " le.append((dic[a],a))\n", + " le.sort(reverse=True)\n", + " for a,b in le:\n", + " print b\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "process_file(buddhism)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Project overview:\n", + " For the data, I analyzed two books from Project Gutenberg. I used the technique of counting the top\n", + " ten words that appeared in each book. The two books that I picked are religious books: one is about Buddhism \n", + " and the other one is about Christianity. I wanted to see how the top words in these two books show the differences\n", + " between the two religions.\n", + "Implementation:\n", + " The main logic of my analysis is to create a dictionary that has all the words in the book as keys; every time\n", + " a word shows up again, the value for its key increases by one, so that the frequency of every word in the book\n", + " is found. At the beginning I thought about whether I should download the book to the computer first or should\n", + " just create a variable and set its value to be the book content. I decided to create a variable so that all \n", + " computers will be able to use my code directly.\n", + "Results:\n", + " The top ten words of the Christianity book are God, Israel, Son, Man, King, People, We, Children, Land, and Father.\n", + " The top ten words of the Buddhism book are Buddha, Life, India, Existence, Human, Ideas, Knowledge, Universe, Intellectual,\n", + " and China.\n", + " I think the words are definitely different. They both show the main figure of each religion: God and Buddha. They also\n", + " show the main place of each religion: Israel and India. 
The other words show that Christianity is more about people.\n", + " Words like son, we, father, and children show that Christianity is mainly about people and people's relationships.\n", + " In contrast, the Buddhism words are more about thinking and ideas: words like life, idea, knowledge, universe, and\n", + " intellectual. These words show how Buddhism pays more attention to how people think and the importance of thinking.\n", + "Reflection:\n", + " The process went pretty smoothly since I had a lot of similar practice in completing the reading journals.\n", + " One thing that I need to pay attention to later on is that I need to be more careful about what format the input is\n", + " in. The first time I ran the code, I got a count of each of the alphabetical letters in the book. That is\n", + " because I did not realize that the input is not separated by lines, so I over-divided it and got all letters instead\n", + " of words.\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}