diff --git a/miniproject_EPL.ipynb b/miniproject_EPL.ipynb new file mode 100644 index 0000000..595054d --- /dev/null +++ b/miniproject_EPL.ipynb @@ -0,0 +1,171 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "collecting 1000 number of Tweets on the topics: Spurs%20Tottenham%20Hotspur\n" + ] + } + ], + "source": [ + "from twitterscraper import TwitterScraper\n", + "\n", + "topics = ['Spurs','Tottenham','Hotspur']\n", + "\n", + "# game against CSKA happend on 2016/09/27\n", + "filename = 'Spurs_CSKA.txt'\n", + "Spurs_CSKA=TwitterScraper.Scraper(topics, 1000, lang='en',begin_date='2016-09-27', \n", + " end_date = '2016-09-28', filename = filename)\n", + "\n", + "Spurs_CSKA.scrape()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "lst = open(filename,'r').readlines()\n", + "\n", + "\n", + "\n", + "#create new list with only words - excluding urls and other tags/punctuations\n", + "\n", + "new_lst=[]\n", + "for a in lst:\n", + " tweet = a.split('\\t')[-1]\n", + " splitted = tweet.split(\"http\")\n", + " new_lst.append(splitted[0])\n", + "\n", + "import string\n", + "\n", + "new_lst2 = []\n", + "for a in new_lst:\n", + " no_punc=a.translate(string.maketrans(\"\",\"\"), string.punctuation)\n", + " new_lst2.append(no_punc)\n", + " \n", + "#define function to count the number of times each word appeared in the tweet\n", + "\n", + "def get_word_count(word_list):\n", + " '''\n", + " This function takes a list of words and returns a dictionary where the keys are the \n", + " words in the list and the values are the amount of times that the word appears in the list\n", + " '''\n", + " d = {}\n", + " for line in word_list:\n", + " line_2 = line.split()\n", + " for word in line_2:\n", + " if word not in d:\n", + " d[word]=1\n", + " else:\n", + " 
d[word]+=1\n", + " return d\n", + "\n", + "\n", + "wordlist=get_word_count(new_lst2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#selecting names of only attackers from the list\n", + "\n", + "player_names = ['Vincent','Janssen', 'Kane', 'Harry', 'Son','Dele','Alli','Eriksen','Christian','Lamela']\n", + "\n", + "attacking_players = dict([k, wordlist[k]] for k in player_names)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#defining function to sort by number of count each player is mentioned\n", + "\n", + "def order_by_count(player_dict):\n", + " '''\n", + " This function takes the dictionary of word counts and sort it by counts of each words\n", + " '''\n", + " r = []\n", + " for k,v in player_dict.items():\n", + " r.append((v,k)) \n", + " r.sort(reverse=True)\n", + " return r\n", + "\n", + "#to see which player is mentioned the most\n", + "hyped_players=order_by_count(attacking_players)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Man of the Match for game against CSKA is Son with 213 tweets mentioning the player!\n" + ] + } + ], + "source": [ + "#MoM = Man of Match\n", + "MoM = hyped_players[0]\n", + "\n", + "print 'Man of the Match for game against CSKA is', MoM[1],'with', MoM[0], 'tweets mentioning the player!' 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/wayne_writeup.txt b/wayne_writeup.txt new file mode 100644 index 0000000..4ed3090 --- /dev/null +++ b/wayne_writeup.txt @@ -0,0 +1,19 @@ +Wayne Kwon + +Project Overview: + I used twitter data to pull tweets of certain topics that I wanted to have a look at. I used 'twitterscraper' to scrape tweets from twitter that mentions my favorite soccer team Tottenham Hotspur from English Premier League Soccer. I wanted to look at how many times each attacking players are mentioned on a game day (in this project, champions league game against CSKA Moscow) and find out which player is mentioned the most. Eventually, the player that is mentioned the most will be awarded Man of the Match for that game. + +Implementation: + First, as I was pulling in the tweets, there were unnecessary parts (such as punctuation, hashtags, etc) that I did not need. I got rid of all the urls and other unnecessary parts. In addition, I split all the sentences into words and, using for loops, I also counted how many times each word is mentioned then created a dictionary in the format of {'word' : # of times mentioned}. Then I filtered the result to only have the name of attacking players. Because I wanted to sort the key by the value, I defined a function so that it returns a list of tuples sorted by values. + One decision I had to make was the method of filtering for attacking players. 
I wanted to create the list of tuples with # of counts first, and then filter for attacking players. However, for some reason, the function would not capture the # of counts correctly. I realized that it would save me both time and memory if I filtered for the names first, creating a new dictionary that contains only the attacking players. + +Results: + I tried to find out if the tweets actually reflect the performance of the players that are playing very well in the season. Right now, Son Heung-min is the best-performing player from Tottenham Hotspur as he is scoring most of the goals this season. + My mini project result shows that he is definitely mentioned the most in the tweets. For the game against CSKA, he scored a winning goal, giving Spurs a 3-point lead in the Champions League qualifiers. Running the same code for the day before the game, it was interesting to find out that everyone was hyped up about Son Heung-min, as he was mentioned the most even before the game. Maybe there is a reason why people are hyped up about a player - for his ability or for his current form. + +Reflection: + I think my project was appropriately scoped. At first, I was worried about retweets being counted multiple times. However, I realized that retweets also represent someone's opinion, as it means that one agrees/disagrees with the tweet they are reposting. + If I had more time, this mini project could definitely become a very large project. After getting the # of mentions for each player, we could compare them to the players' actual performance (goals + assists) to see if there is any correlation. Moreover, I could also perform sentiment analysis on each player - but this would have been a different scope to start with. + + +