diff --git a/EXTRAS/homeworks_to_submit/1036785977/homework_05/homework_5.ipynb b/EXTRAS/homeworks_to_submit/1036785977/homework_05/homework_5.ipynb
new file mode 100644
index 0000000..11fb18d
--- /dev/null
+++ b/EXTRAS/homeworks_to_submit/1036785977/homework_05/homework_5.ipynb
@@ -0,0 +1 @@
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOTyZx2ugR9cJRAceTB317d"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Homework 5 _ NLP\n","# Santiago Ruiz Piedrahita\n","\n","HOMEWORK\n","\n","I. Take the airline opinion dataset (airline_tweets.csv) that was the subject of the Class 12\n","\n","II. Divide the dataset into Train/Test: 80/20 percent fix the randomness!\n","\n","III. Compare the different Methods\n","\n","IV. Compare the results in the table\n","\n","V. Write conclusions"],"metadata":{"id":"IC4ypxmtLEK-"}},{"cell_type":"code","source":["# librerias\n","import pandas as pd\n","import re\n","import seaborn as sns"],"metadata":{"id":"aIh4g0GaLCuE","executionInfo":{"status":"ok","timestamp":1669815535540,"user_tz":300,"elapsed":1463,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":["# Limpiando los datos"],"metadata":{"id":"dlTufwtlQrJ1"}},{"cell_type":"code","source":["data_source_url = \"https://raw.githubusercontent.com/mhemmg/datasets/master/nlp/airline_tweets.csv\"\n","airline_tweets = pd.read_csv(data_source_url)\n","airline_tweets2 = airline_tweets[['airline_sentiment','text']]\n","airline_tweets2.to_csv('full.csv',index=False)\n","airline_tweets.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":496},"id":"Q5NGI-lRPWZ0","executionInfo":{"status":"ok","timestamp":1669815538967,"user_tz":300,"elapsed":402,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"5b257622-364f-4684-aafe-ffcc8039580e"},"execution_count":2,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" tweet_id airline_sentiment airline_sentiment_confidence \\\n","0 570306133677760513 neutral 1.0000 \n","1 570301130888122368 positive 0.3486 \n","2 570301083672813571 
neutral 0.6837 \n","3 570301031407624196 negative 1.0000 \n","4 570300817074462722 negative 1.0000 \n","\n"," negativereason negativereason_confidence airline \\\n","0 NaN NaN Virgin America \n","1 NaN 0.0000 Virgin America \n","2 NaN NaN Virgin America \n","3 Bad Flight 0.7033 Virgin America \n","4 Can't Tell 1.0000 Virgin America \n","\n"," airline_sentiment_gold name negativereason_gold retweet_count \\\n","0 NaN cairdin NaN 0 \n","1 NaN jnardino NaN 0 \n","2 NaN yvonnalynn NaN 0 \n","3 NaN jnardino NaN 0 \n","4 NaN jnardino NaN 0 \n","\n"," text tweet_coord \\\n","0 @VirginAmerica What @dhepburn said. NaN \n","1 @VirginAmerica plus you've added commercials t... NaN \n","2 @VirginAmerica I didn't today... Must mean I n... NaN \n","3 @VirginAmerica it's really aggressive to blast... NaN \n","4 @VirginAmerica and it's a really big bad thing... NaN \n","\n"," tweet_created tweet_location user_timezone \n","0 2015-02-24 11:35:52 -0800 NaN Eastern Time (US & Canada) \n","1 2015-02-24 11:15:59 -0800 NaN Pacific Time (US & Canada) \n","2 2015-02-24 11:15:48 -0800 Lets Play Central Time (US & Canada) \n","3 2015-02-24 11:15:36 -0800 NaN Pacific Time (US & Canada) \n","4 2015-02-24 11:14:45 -0800 NaN Pacific Time (US & Canada) "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
tweet_id
\n","
airline_sentiment
\n","
airline_sentiment_confidence
\n","
negativereason
\n","
negativereason_confidence
\n","
airline
\n","
airline_sentiment_gold
\n","
name
\n","
negativereason_gold
\n","
retweet_count
\n","
text
\n","
tweet_coord
\n","
tweet_created
\n","
tweet_location
\n","
user_timezone
\n","
\n"," \n"," \n","
\n","
0
\n","
570306133677760513
\n","
neutral
\n","
1.0000
\n","
NaN
\n","
NaN
\n","
Virgin America
\n","
NaN
\n","
cairdin
\n","
NaN
\n","
0
\n","
@VirginAmerica What @dhepburn said.
\n","
NaN
\n","
2015-02-24 11:35:52 -0800
\n","
NaN
\n","
Eastern Time (US & Canada)
\n","
\n","
\n","
1
\n","
570301130888122368
\n","
positive
\n","
0.3486
\n","
NaN
\n","
0.0000
\n","
Virgin America
\n","
NaN
\n","
jnardino
\n","
NaN
\n","
0
\n","
@VirginAmerica plus you've added commercials t...
\n","
NaN
\n","
2015-02-24 11:15:59 -0800
\n","
NaN
\n","
Pacific Time (US & Canada)
\n","
\n","
\n","
2
\n","
570301083672813571
\n","
neutral
\n","
0.6837
\n","
NaN
\n","
NaN
\n","
Virgin America
\n","
NaN
\n","
yvonnalynn
\n","
NaN
\n","
0
\n","
@VirginAmerica I didn't today... Must mean I n...
\n","
NaN
\n","
2015-02-24 11:15:48 -0800
\n","
Lets Play
\n","
Central Time (US & Canada)
\n","
\n","
\n","
3
\n","
570301031407624196
\n","
negative
\n","
1.0000
\n","
Bad Flight
\n","
0.7033
\n","
Virgin America
\n","
NaN
\n","
jnardino
\n","
NaN
\n","
0
\n","
@VirginAmerica it's really aggressive to blast...
\n","
NaN
\n","
2015-02-24 11:15:36 -0800
\n","
NaN
\n","
Pacific Time (US & Canada)
\n","
\n","
\n","
4
\n","
570300817074462722
\n","
negative
\n","
1.0000
\n","
Can't Tell
\n","
1.0000
\n","
Virgin America
\n","
NaN
\n","
jnardino
\n","
NaN
\n","
0
\n","
@VirginAmerica and it's a really big bad thing...
\n","
NaN
\n","
2015-02-24 11:14:45 -0800
\n","
NaN
\n","
Pacific Time (US & Canada)
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":2}]},{"cell_type":"code","source":["features = airline_tweets2['text'].values\n","labels = airline_tweets2['airline_sentiment'].values"],"metadata":{"id":"VJ2vCCO8QlZK","executionInfo":{"status":"ok","timestamp":1669815542841,"user_tz":300,"elapsed":406,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["len(features)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yQkjg-BIQ-9i","executionInfo":{"status":"ok","timestamp":1669815545017,"user_tz":300,"elapsed":7,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"64d3c2ce-2d7a-42c9-cb83-79e6ce14cd26"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["14640"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","source":["features"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VbI0CGw_RDH6","executionInfo":{"status":"ok","timestamp":1669815547537,"user_tz":300,"elapsed":347,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"7fddaad2-602d-4575-eb33-ec7e53e24420"},"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['@VirginAmerica What @dhepburn said.',\n"," \"@VirginAmerica plus you've added commercials to the experience... tacky.\",\n"," \"@VirginAmerica I didn't today... Must mean I need to take another trip!\",\n"," ...,\n"," '@AmericanAir Please bring American Airlines to #BlackBerry10',\n"," \"@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??\",\n"," '@AmericanAir we have 8 ppl so we need 2 know how many seats are on the next flight. 
Plz put us on standby for 4 people on the next flight?'],\n"," dtype=object)"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["labels"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"z35x4draQ_3a","executionInfo":{"status":"ok","timestamp":1669815549253,"user_tz":300,"elapsed":7,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"54319a34-a096-475a-cc80-ad840eb9330d"},"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['neutral', 'positive', 'neutral', ..., 'neutral', 'negative',\n"," 'neutral'], dtype=object)"]},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["processed_features = []\n","\n","for i in range(0, len(features)):\n"," \n"," # remowing tweet\n"," processed_feature = re.sub(r'@\\w+', ' ', str(features[i]))\n","\n"," # remowing retweet\n"," processed_feature = re.sub(r'rt @\\w+:', ' ', processed_feature)\n","\n"," # Eliminación de números\n"," processed_feature = re.sub(\"\\d+\", ' ', processed_feature)\n","\n"," # Removing links\n"," processed_feature = re.sub(r'http\\S+', ' ', processed_feature)\n","\n"," # Remove all the special characters\n"," processed_feature = re.sub(r'\\W', ' ', processed_feature)\n","\n"," # remove all single characters\n"," processed_feature= re.sub(r'\\s+[a-zA-Z]\\s+', ' ', processed_feature)\n","\n"," # Remove single characters from the start\n"," processed_feature = re.sub(r'\\^[a-zA-Z]\\s+', ' ', processed_feature) \n","\n"," # Substituting multiple spaces with single space\n"," processed_feature = re.sub(r'\\s+', ' ', processed_feature, flags=re.I)\n","\n"," # Removing prefixed 'b'\n","# processed_feature = re.sub(r'^b\\s+', '', processed_feature)\n"," \n"," # Converting to Lowercase\n"," processed_feature = processed_feature.lower()\n","\n"," 
processed_features.append(processed_feature)"],"metadata":{"id":"Xi_FSWFCR4Ix","executionInfo":{"status":"ok","timestamp":1669815551606,"user_tz":300,"elapsed":1016,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["airline_tweets['processed_features']=processed_features\n","airline_tweets2=airline_tweets[['airline_sentiment','text','processed_features']]\n","airline_tweets2.to_csv('full_clean.csv',index=False)"],"metadata":{"id":"ZDqw4yIOR-Uv","executionInfo":{"status":"ok","timestamp":1669815554203,"user_tz":300,"elapsed":429,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":8,"outputs":[]},{"cell_type":"code","source":["airline_tweets2"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"id":"wxs_xEGiSwsu","executionInfo":{"status":"ok","timestamp":1669815555868,"user_tz":300,"elapsed":10,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"da88424e-8290-413d-ce08-895b958196e0"},"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" airline_sentiment text \\\n","0 neutral @VirginAmerica What @dhepburn said. \n","1 positive @VirginAmerica plus you've added commercials t... \n","2 neutral @VirginAmerica I didn't today... Must mean I n... \n","3 negative @VirginAmerica it's really aggressive to blast... \n","4 negative @VirginAmerica and it's a really big bad thing... \n","... ... ... \n","14635 positive @AmericanAir thank you we got on a different f... \n","14636 negative @AmericanAir leaving over 20 minutes Late Flig... \n","14637 neutral @AmericanAir Please bring American Airlines to... \n","14638 negative @AmericanAir you have my money, you change my ... \n","14639 neutral @AmericanAir we have 8 ppl so we need 2 know h... 
\n","\n"," processed_features \n","0 what said \n","1 plus you ve added commercials to the experien... \n","2 didn today must mean need to take another trip \n","3 it really aggressive to blast obnoxious enter... \n","4 and it a really big bad thing about it \n","... ... \n","14635 thank you we got on different flight to chicago \n","14636 leaving over minutes late flight no warnings ... \n","14637 please bring american airlines to blackberry \n","14638 you have my money you change my flight and do... \n","14639 we have ppl so we need know how many seats ar... \n","\n","[14640 rows x 3 columns]"],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
airline_sentiment
\n","
text
\n","
processed_features
\n","
\n"," \n"," \n","
\n","
0
\n","
neutral
\n","
@VirginAmerica What @dhepburn said.
\n","
what said
\n","
\n","
\n","
1
\n","
positive
\n","
@VirginAmerica plus you've added commercials t...
\n","
plus you ve added commercials to the experien...
\n","
\n","
\n","
2
\n","
neutral
\n","
@VirginAmerica I didn't today... Must mean I n...
\n","
didn today must mean need to take another trip
\n","
\n","
\n","
3
\n","
negative
\n","
@VirginAmerica it's really aggressive to blast...
\n","
it really aggressive to blast obnoxious enter...
\n","
\n","
\n","
4
\n","
negative
\n","
@VirginAmerica and it's a really big bad thing...
\n","
and it a really big bad thing about it
\n","
\n","
\n","
...
\n","
...
\n","
...
\n","
...
\n","
\n","
\n","
14635
\n","
positive
\n","
@AmericanAir thank you we got on a different f...
\n","
thank you we got on different flight to chicago
\n","
\n","
\n","
14636
\n","
negative
\n","
@AmericanAir leaving over 20 minutes Late Flig...
\n","
leaving over minutes late flight no warnings ...
\n","
\n","
\n","
14637
\n","
neutral
\n","
@AmericanAir Please bring American Airlines to...
\n","
please bring american airlines to blackberry
\n","
\n","
\n","
14638
\n","
negative
\n","
@AmericanAir you have my money, you change my ...
\n","
you have my money you change my flight and do...
\n","
\n","
\n","
14639
\n","
neutral
\n","
@AmericanAir we have 8 ppl so we need 2 know h...
\n","
we have ppl so we need know how many seats ar...
\n","
\n"," \n","
\n","
14640 rows × 3 columns
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":9}]},{"cell_type":"markdown","source":["# Analisis de NLP\n","\n","* ### Linear vector classifier\n","* ### Random forest\n","* ### TextBlob - Naive Bayes\n"],"metadata":{"id":"TptcqL4VTR7Z"}},{"cell_type":"code","source":["!pip install tqdm"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8KLTCdzNZaTe","executionInfo":{"status":"ok","timestamp":1669815562858,"user_tz":300,"elapsed":3987,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"abe2e761-2c6e-4986-d690-80c263f2df4d"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (4.64.1)\n"]}]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.svm import LinearSVC\n","from sklearn.ensemble import RandomForestClassifier\n","\n","from nltk.corpus import stopwords\n","\n","import spacy\n","\n","from tqdm import tqdm\n","\n","from textblob import TextBlob\n","from textblob.classifiers import NaiveBayesClassifier\n","from textblob.classifiers import DecisionTreeClassifier\n","\n","import nltk"],"metadata":{"id":"D63fHsomS6d9","executionInfo":{"status":"ok","timestamp":1669815575875,"user_tz":300,"elapsed":11778,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["nltk.download('punkt')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jgEztEDDdxqR","executionInfo":{"status":"ok","timestamp":1669815579455,"user_tz":300,"elapsed":1062,"user":{"displayName":"SANTIAGO RUIZ 
PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"74ede08c-d4df-404e-ed24-be1f8868e6fa"},"execution_count":12,"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["nlp = spacy.load(\"en_core_web_sm\")\n","spacy_stopwords_en = spacy.lang.en.stop_words.STOP_WORDS"],"metadata":{"id":"_gDY5t-tT9N1","executionInfo":{"status":"ok","timestamp":1669815585071,"user_tz":300,"elapsed":2657,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":13,"outputs":[]},{"cell_type":"code","source":["vectorizer = TfidfVectorizer ( stop_words=spacy_stopwords_en)\n","processed_features_vec = vectorizer.fit_transform(processed_features).toarray()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2PUYRdftUxzA","executionInfo":{"status":"ok","timestamp":1669815588682,"user_tz":300,"elapsed":1671,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"01f841a8-388e-4ab6-8ed3-d2ea746fcfdd"},"execution_count":14,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/sklearn/feature_extraction/text.py:401: UserWarning: Your stop_words may be inconsistent with your preprocessing. 
Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.\n"," % sorted(inconsistent)\n"]}]},{"cell_type":"code","source":["X_train, X_test, y_train, y_test = train_test_split(processed_features_vec, labels, test_size=0.2, random_state=0)"],"metadata":{"id":"Bou0JwdcU_lC","executionInfo":{"status":"ok","timestamp":1669815591543,"user_tz":300,"elapsed":1535,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":15,"outputs":[]},{"cell_type":"code","source":["# Linear vector classifier\n","LSVC_classifier = LinearSVC(random_state=0, tol=1e-6)\n","LSVC_classifier.fit(X_train, y_train)\n","\n","LSVC_predictions = LSVC_classifier.predict(X_test)\n","LSVC_acc = accuracy_score(y_test, LSVC_predictions)\n","\n","print('Accuracy \\t:',LSVC_acc)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Gj7HSs0hVeHm","executionInfo":{"status":"ok","timestamp":1669815594066,"user_tz":300,"elapsed":1032,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"21c47785-e994-4805-ed93-00d9da9e15c3"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stdout","text":["Accuracy \t: 0.7708333333333334\n"]}]},{"cell_type":"code","source":["# randon forest\n","RF_classifier = RandomForestClassifier(n_estimators=200, random_state=0)\n","RF_classifier.fit(X_train, y_train)\n","\n","RF_predictions = RF_classifier.predict(X_test)\n","RF_acc =accuracy_score(y_test, RF_predictions)\n","\n","print('Accuracy \\t:',RF_acc)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"KNbY488QWaGk","executionInfo":{"status":"ok","timestamp":1669815794530,"user_tz":300,"elapsed":198804,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"3d7ad1c6-51fc-40c0-d410-a68bf3ece00f"},"execution_count":17,"outputs":[{"output_type":"stream","name":"stdout","text":["Accuracy \t: 0.7537568306010929\n"]}]},{"cell_type":"code","source":["# 
TextBlob - Naive Bayes\n","X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)\n","\n","def new_format(X,Y): \n"," return [(x,y) for x,y in zip(X,Y)]\n","\n","def divide(data,n): \n"," m = int(len(data)/n)\n"," return [data[i*m:(i+1)*m] for i in range(n)]\n","\n","n_sets = 30\n","sets = divide(new_format(X_train,y_train),n_sets) \n","\n"],"metadata":{"id":"U91P2eQYW3vU","executionInfo":{"status":"ok","timestamp":1669815800299,"user_tz":300,"elapsed":389,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":18,"outputs":[]},{"cell_type":"code","source":["cl = NaiveBayesClassifier(sets[0])\n","for i,j in zip(range(5),tqdm(range(5))): \n"," cl.update(sets[i+1])\n","\n","TB_acc = cl.accuracy(new_format(X_test,y_test))\n","print('Accuracy \\t:',TB_acc)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dc7eP95_bStH","executionInfo":{"status":"ok","timestamp":1669815967909,"user_tz":300,"elapsed":164250,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"06b6286c-06a4-45dd-a291-1893782ba5f4"},"execution_count":19,"outputs":[{"output_type":"stream","name":"stderr","text":[" 80%|████████ | 4/5 [00:58<00:14, 14.54s/it]\n"]},{"output_type":"stream","name":"stdout","text":["Accuracy \t: 0.76775956284153\n"]}]},{"cell_type":"markdown","source":["## Testeando un ejemplo"],"metadata":{"id":"GfylfCo-yGxr"}},{"cell_type":"code","source":["dat = {\"Metohd\":[\"Linear vector classifier\",\"Random Forest\",\"TB naive Bayes\"],\"Accuracy\":[LSVC_acc,RF_acc,TB_acc]}\n","pd.DataFrame(dat)"],"metadata":{"id":"Jgh0w85rdjwS","colab":{"base_uri":"https://localhost:8080/","height":143},"executionInfo":{"status":"ok","timestamp":1669816108432,"user_tz":300,"elapsed":380,"user":{"displayName":"SANTIAGO RUIZ 
PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"9fe81ffc-b627-4756-e194-ad6cca6dfab1"},"execution_count":20,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Metohd Accuracy\n","0 Linear vector classifier 0.770833\n","1 Random Forest 0.753757\n","2 TB naive Bayes 0.767760"],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
Metohd
\n","
Accuracy
\n","
\n"," \n"," \n","
\n","
0
\n","
Linear vector classifier
\n","
0.770833
\n","
\n","
\n","
1
\n","
Random Forest
\n","
0.753757
\n","
\n","
\n","
2
\n","
TB naive Bayes
\n","
0.767760
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":["# Analicemos las últimas filas \n","test = airline_tweets2.tail()\n","test"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"ca8-MnjyzH90","executionInfo":{"status":"ok","timestamp":1669816583199,"user_tz":300,"elapsed":11,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"1ecfd064-66a2-4a3c-e5a9-1bb018170542"},"execution_count":23,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" airline_sentiment text \\\n","14635 positive @AmericanAir thank you we got on a different f... \n","14636 negative @AmericanAir leaving over 20 minutes Late Flig... \n","14637 neutral @AmericanAir Please bring American Airlines to... \n","14638 negative @AmericanAir you have my money, you change my ... \n","14639 neutral @AmericanAir we have 8 ppl so we need 2 know h... \n","\n"," processed_features \n","14635 thank you we got on different flight to chicago \n","14636 leaving over minutes late flight no warnings ... \n","14637 please bring american airlines to blackberry \n","14638 you have my money you change my flight and do... \n","14639 we have ppl so we need know how many seats ar... "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
airline_sentiment
\n","
text
\n","
processed_features
\n","
\n"," \n"," \n","
\n","
14635
\n","
positive
\n","
@AmericanAir thank you we got on a different f...
\n","
thank you we got on different flight to chicago
\n","
\n","
\n","
14636
\n","
negative
\n","
@AmericanAir leaving over 20 minutes Late Flig...
\n","
leaving over minutes late flight no warnings ...
\n","
\n","
\n","
14637
\n","
neutral
\n","
@AmericanAir Please bring American Airlines to...
\n","
please bring american airlines to blackberry
\n","
\n","
\n","
14638
\n","
negative
\n","
@AmericanAir you have my money, you change my ...
\n","
you have my money you change my flight and do...
\n","
\n","
\n","
14639
\n","
neutral
\n","
@AmericanAir we have 8 ppl so we need 2 know h...
\n","
we have ppl so we need know how many seats ar...
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":23}]},{"cell_type":"code","source":["features = test[\"processed_features\"].values\n","\n","features_v = vectorizer.transform(features)"],"metadata":{"id":"1DNVF_kRzOuB","executionInfo":{"status":"ok","timestamp":1669817144254,"user_tz":300,"elapsed":204,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":34,"outputs":[]},{"cell_type":"code","source":["testeoRF = []\n","testeoLSVC = []\n","testeoTB = []\n","\n","for i in features_v: \n"," testeoRF.append(RF_classifier.predict(i))\n"," testeoLSVC.append(LSVC_classifier.predict(i))\n"],"metadata":{"id":"4AbL5LJw2Xmg","executionInfo":{"status":"ok","timestamp":1669817150526,"user_tz":300,"elapsed":985,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":35,"outputs":[]},{"cell_type":"code","source":["for i in features: \n"," testeoTB.append(cl.classify(i))"],"metadata":{"id":"EVonB_Jc2sRw","executionInfo":{"status":"ok","timestamp":1669817414619,"user_tz":300,"elapsed":634,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":37,"outputs":[]},{"cell_type":"code","source":["test[\"Random Forest\"]=testeoRF\n","test[\"Linear vector classifier\"]=testeoLSVC\n","test[\"TB naive Bayes\"]=testeoTB"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"huDqp2Ym2-C1","executionInfo":{"status":"ok","timestamp":1669817825744,"user_tz":300,"elapsed":204,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"7c34ae90-536d-4ee6-b5f9-43c7ecf861fd"},"execution_count":39,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"," \"\"\"Entry point for launching an IPython kernel.\n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"," \n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"," This is separate from the ipykernel package so we can avoid doing imports until\n"]}]},{"cell_type":"code","source":["test"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"ju45shiu3_EX","executionInfo":{"status":"ok","timestamp":1669817832746,"user_tz":300,"elapsed":211,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"770543c9-bed2-4e24-c40c-3daae6dac1f4"},"execution_count":40,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" airline_sentiment text \\\n","14635 positive @AmericanAir thank you we got on a different f... \n","14636 negative @AmericanAir leaving over 20 minutes Late Flig... \n","14637 neutral @AmericanAir Please bring American Airlines to... \n","14638 negative @AmericanAir you have my money, you change my ... \n","14639 neutral @AmericanAir we have 8 ppl so we need 2 know h... \n","\n"," processed_features Random Forest \\\n","14635 thank you we got on different flight to chicago [positive] \n","14636 leaving over minutes late flight no warnings ... 
[negative] \n","14637 please bring american airlines to blackberry [neutral] \n","14638 you have my money you change my flight and do... [negative] \n","14639 we have ppl so we need know how many seats ar... [negative] \n","\n"," Linear vector classifier TB naive Bayes \n","14635 [positive] neutral \n","14636 [negative] negative \n","14637 [neutral] neutral \n","14638 [negative] negative \n","14639 [negative] negative "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
airline_sentiment
\n","
text
\n","
processed_features
\n","
Random Forest
\n","
Linear vector classifier
\n","
TB naive Bayes
\n","
\n"," \n"," \n","
\n","
14635
\n","
positive
\n","
@AmericanAir thank you we got on a different f...
\n","
thank you we got on different flight to chicago
\n","
[positive]
\n","
[positive]
\n","
neutral
\n","
\n","
\n","
14636
\n","
negative
\n","
@AmericanAir leaving over 20 minutes Late Flig...
\n","
leaving over minutes late flight no warnings ...
\n","
[negative]
\n","
[negative]
\n","
negative
\n","
\n","
\n","
14637
\n","
neutral
\n","
@AmericanAir Please bring American Airlines to...
\n","
please bring american airlines to blackberry
\n","
[neutral]
\n","
[neutral]
\n","
neutral
\n","
\n","
\n","
14638
\n","
negative
\n","
@AmericanAir you have my money, you change my ...
\n","
you have my money you change my flight and do...
\n","
[negative]
\n","
[negative]
\n","
negative
\n","
\n","
\n","
14639
\n","
neutral
\n","
@AmericanAir we have 8 ppl so we need 2 know h...
\n","
we have ppl so we need know how many seats ar...
\n","
[negative]
\n","
[negative]
\n","
negative
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":40}]},{"cell_type":"markdown","source":["## Conclusion\n","\n","Para responder las preguntas:\n","\n","* how would you detect sentences showing: hatred/racism/sexism, in Twitter (for example)\n","* which method would you choose to classify opinions on Amazon products: positive/negative\n","\n","Con el reto de Bancolombia aprendí a hacer una clasificación por medio de ZeroShot o de cosine similarity. Esto, añadido al análisis que se hace en este notebook, permite por ejemplo sacar primero los posts negativos (sería una clasificación para extraer lo negativo) y posteriormente hacer otra clasificación sobre dichos negativos por medio de los métodos vistos en la Dataton, usando un conjunto de datos que contenga las palabras que sean hatred/racism/sexism, y así detectarlas."],"metadata":{"id":"dQ4vsj966p0e"}},{"cell_type":"code","source":[],"metadata":{"id":"Q91rsjkg8CIC"},"execution_count":null,"outputs":[]}]}
\ No newline at end of file
diff --git a/EXTRAS/homeworks_to_submit/1036785977/homework_05/void b/EXTRAS/homeworks_to_submit/1036785977/homework_05/void
new file mode 100644
index 0000000..6476f9f
--- /dev/null
+++ b/EXTRAS/homeworks_to_submit/1036785977/homework_05/void
@@ -0,0 +1 @@
+homework 5
diff --git a/EXTRAS/homeworks_to_submit/1036785977/homework_06/homework_6.ipynb b/EXTRAS/homeworks_to_submit/1036785977/homework_06/homework_6.ipynb
new file mode 100644
index 0000000..d0aa04c
--- /dev/null
+++ b/EXTRAS/homeworks_to_submit/1036785977/homework_06/homework_6.ipynb
@@ -0,0 +1 @@
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyPfaywRXEuKWu7dSDkj6SOx"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Homework 6 _ JSON\n","# Santiago Ruiz Piedrahita\n","\n","HOMEWORK\n","\n","En un notebook de jupyter desarrolle los siguientes pasos\n","\n","Descargue el JSON con la lista de paises del siguiente link:\n","https://datahub.io/core/country-list/r/data.json\n","y escoja aleatoriamente un país de Europa \n","(Ejemplo que es abajo usa Colombia pero con pais de europa es mas facil)\n","\n","Use los diferentes API endpoints de inspire-hep \n","https://inspirehep.net/ \n","\n","Para extraer la lista de investigadores de una institución de ese país en esa base de datos. \n","\n","\n","Para ello:\n","\n","A) Use el API de institutions para extraer la lista de instituciones del país, por ejemplo: colombia (mejor si es Europa, por completo de datos )\n","https://inspirehep.net/api/institutions?q=colombia\n","\n","\n","Para la primera institución con\n","\n","number_of_papers > 0\n","\n","\n","Obtenga el valor\n","\n","legacy_ICN:\n","\n","\n","Por ejemplo: \n","Colombia, U. 
Natl.\n","\n","\n","Si ninguna institución satisface la condición number_of_papers > 0, escoja de nuevo otro país aleatorio y repita el proceso\n","\n"],"metadata":{"id":"HHS4qx3S4I7B"}},{"cell_type":"code","source":["# librerias\n","import json\n","import requests\n","import numpy as np\n","import pandas as pd"],"metadata":{"id":"8xN7--Ws4GIj","executionInfo":{"status":"ok","timestamp":1669750684552,"user_tz":300,"elapsed":1978,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":1,"outputs":[]},{"cell_type":"code","execution_count":2,"metadata":{"id":"Jgp2qPIG3msx","executionInfo":{"status":"ok","timestamp":1669750825604,"user_tz":300,"elapsed":196,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"outputs":[],"source":["# Load the country list (presumably the data.json file from datahub.io named in the assignment).\n","# The with-block closes the file handle, which the bare open() call left open,\n","# and json.load parses the content so `data` holds the records, not a file object.\n","with open(\"data_json.json\", encoding=\"utf-8\") as country_file:\n","    data = json.load(country_file)"]},{"cell_type":"code","source":["# Country to query. The assignment asks for a random European country;\n","# the value is pinned here so every run queries the same records.\n","pais = \"Estonia\""],"metadata":{"id":"4zm6MAtt57Cn","executionInfo":{"status":"ok","timestamp":1669750931602,"user_tz":300,"elapsed":193,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["# Query the INSPIRE institutions endpoint for the chosen country.\n","URL = \"https://inspirehep.net/api/institutions?q={}\".format(pais)\n","institucion = requests.get(URL, timeout=30)\n","institucion.raise_for_status()  # surface HTTP errors here, not as a JSON decoding failure\n","\n","# Keep the legacy_ICN of every institution with at least one paper.\n","# A missing number_of_papers counts as 0: comparing None > 0 raises TypeError.\n","# Institutions without a legacy_ICN are skipped because the next cell calls .replace on it.\n","legacy_ICN = []\n","for hit in institucion.json().get('hits', {}).get('hits', []):\n","    metadata = hit.get('metadata', {})\n","    papers = metadata.get('number_of_papers') or 0\n","    legacy = metadata.get('legacy_ICN')\n","    if papers > 0 and legacy:\n","        legacy_ICN.append(legacy)\n","\n","legacy_ICN"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"xSNUMTIT57E-","executionInfo":{"status":"ok","timestamp":1669751239290,"user_tz":300,"elapsed":744,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"d7422bff-2f76-4bd6-8e38-7f20203b4aa1"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['Unlisted, EE',\n"," 'Tartu, Inst. Phys.',\n"," 'Tartu, Inst. Astrophys.',\n"," 'Tartu Observ.',\n"," 'Estonian U.',\n"," 'Tartu State U.',\n"," 'Tallinn Polytechnic Inst.',\n"," 'Comp. Sci. Coll., Tallinn',\n"," 'Tallinn Pedagogical U.',\n"," 'Estonian Agricultural U.']"]},"metadata":{},"execution_count":4}]},{"cell_type":"markdown","source":["B) Con el API de literatura obtenga el JSON con los artículos de menos de 10 autores usando el \"legacy_ICN\" de la siguiente manera\n","\n","https://inspirehep.net/api/literature?sort=mostrecent&page=1&q=aff+Colombia,+U.+Natl.+and+ac+1->+10\n","\n","\n","aff: usa el valor de legacy_ICN\n","and: es un operador lógico\n","ac: establece los autores entre 1 y 10\n","\n"],"metadata":{"id":"HFWKqlSU4jSZ"}},{"cell_type":"code","source":["# Query the papers affiliated with the first institution kept above:\n","# `aff` takes the legacy_ICN and `ac 1->10` keeps papers with 1 to 10 authors.\n","link = legacy_ICN[0].replace(' ','+')\n","lit_url = \"https://inspirehep.net/api/literature?sort=mostrecent&page=1&q=aff+{}+and+ac+1->+10\".format(link)\n","lit = requests.get(lit_url, timeout=30)\n","lit.raise_for_status()\n","\n","# One metadata record per returned paper.\n","art = [hit.get('metadata') for hit in lit.json().get('hits', {}).get('hits', [])]\n","\n","len(art)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ec0GkMKL4j5W","executionInfo":{"status":"ok","timestamp":1669752408672,"user_tz":300,"elapsed":1640,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"4187cc06-9d49-43a8-eb90-67ad550bded6"},"execution_count":11,"outputs":[{"output_type":"execute_result","data":{"text/plain":["7"]},"metadata":{},"execution_count":11}]},{"cell_type":"markdown","source":["C) Para al menos un artículo de esa institución, extraiga el URL del perfil de cada autor de esa institución que se encuentra dentro del campo \"authors\" en \"record\" y luego en \"$ref\". 
Por ejemplo\n","\n","https://inspirehep.net/api/authors/1010271\n","\n","\n"],"metadata":{"id":"W4v0-tLf7qr8"}},{"cell_type":"code","source":["# Collect the profile URL of every author of the first paper.\n","# Entries lacking record/$ref are skipped (assumed possible for authors with no\n","# INSPIRE profile -- TODO confirm) instead of failing on None.get(...).\n","autores = [\n","    author['record']['$ref']\n","    for author in art[0].get('authors') or []\n","    if (author.get('record') or {}).get('$ref')\n","]\n","\n","autores"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VB3xU83y_Lge","executionInfo":{"status":"ok","timestamp":1669752425750,"user_tz":300,"elapsed":205,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"0cf2b0dd-543c-44b8-c187-97a3c75e0277"},"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['https://inspirehep.net/api/authors/1259434',\n"," 'https://inspirehep.net/api/authors/1334455',\n"," 'https://inspirehep.net/api/authors/1274396']"]},"metadata":{},"execution_count":12}]},{"cell_type":"markdown","source":["D) Con cada uno de los datos del resultado del API para cada perfil construya una tabla con los siguientes columnas (puede que alguno de los datos no esté disponible): \n","Nombre Completo\n","Correo electrónico\n","posición más reciente (la primera que aparece en la lista \"positions\" del JSON) con su correspondiente:\n","rango \n","institución \n","fecha de inicio \n","fecha de finalización"],"metadata":{"id":"WluzuBT17quA"}},{"cell_type":"code","source":["# Accumulators: one list per column of the final table.\n","nombre = []\n","email = []\n","posicion = []\n","institucion = []\n","F_inicio = []\n","F_finalizacion = []"],"metadata":{"id":"bpHAgbnVA5PR","executionInfo":{"status":"ok","timestamp":1669754133615,"user_tz":300,"elapsed":230,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":18,"outputs":[]},{"cell_type":"code","source":["# Start from empty accumulators so re-running this cell does not append duplicate rows.\n","nombre, email, posicion = [], [], []\n","institucion, F_inicio, F_finalizacion = [], [], []\n","\n","for profile_url in autores:\n","    response = requests.get(profile_url, timeout=30)\n","    response.raise_for_status()\n","    info = response.json().get('metadata') or {}\n","\n","    # Any field may be missing from a profile (the assignment says so), so an\n","    # absent or empty list falls back to one empty record and yields None values.\n","    emails = info.get('email_addresses') or [{}]\n","    positions = info.get('positions') or [{}]\n","    latest = positions[0]  # the assignment takes the first listed position as the most recent\n","\n","    nombre.append((info.get('name') or {}).get('value'))\n","    email.append(emails[0].get('value'))\n","    posicion.append(latest.get('rank'))\n","    institucion.append(latest.get('institution'))  # institution of that most recent position\n","    F_inicio.append(latest.get('start_date'))\n","    F_finalizacion.append(latest.get('end_date'))"],"metadata":{"id":"33dQeCPhA5R5","executionInfo":{"status":"ok","timestamp":1669754135979,"user_tz":300,"elapsed":1697,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}}},"execution_count":19,"outputs":[]},{"cell_type":"code","source":["# Assemble the collected columns into a table, one row per author.\n","informacion = {\"Nombre\":nombre,\"Email\":email,\"Posicion\":posicion,\"Institucion\":institucion,\"F_inicio\":F_inicio,\"F_finalizacion\":F_finalizacion}\n","pd.DataFrame(informacion)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":143},"id":"97z-0HG7A5UW","executionInfo":{"status":"ok","timestamp":1669754499390,"user_tz":300,"elapsed":202,"user":{"displayName":"SANTIAGO RUIZ PIEDRAHITA","userId":"01504872925764674078"}},"outputId":"aa431f2f-0682-4679-f0f1-6ee6f9ac6b64"},"execution_count":20,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Nombre Email Posicion Institucion \\\n","0 Lewicki, Marek marek.lewicki@fuw.edu.pl None Warsaw U. \n","1 Vaskonen, Ville vvaskonen@ifae.es POSTDOC Padua U. \n","2 Veermäe, Hardi hardi.veermae@cern.ch None NICPB, Tallinn \n","\n"," F_inicio F_finalizacion \n","0 2020 None \n","1 2022 None \n","2 None None "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
Nombre
\n","
Email
\n","
Posicion
\n","
Institucion
\n","
F_inicio
\n","
F_finalizacion
\n","
\n"," \n"," \n","
\n","
0
\n","
Lewicki, Marek
\n","
marek.lewicki@fuw.edu.pl
\n","
None
\n","
Warsaw U.
\n","
2020
\n","
None
\n","
\n","
\n","
1
\n","
Vaskonen, Ville
\n","
vvaskonen@ifae.es
\n","
POSTDOC
\n","
Padua U.
\n","
2022
\n","
None
\n","
\n","
\n","
2
\n","
Veermäe, Hardi
\n","
hardi.veermae@cern.ch
\n","
None
\n","
NICPB, Tallinn
\n","
None
\n","
None
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":20}]},{"cell_type":"code","source":[],"metadata":{"id":"SWxLKEJvH65l"},"execution_count":null,"outputs":[]}]}
\ No newline at end of file
diff --git a/EXTRAS/homeworks_to_submit/1036785977/homework_06/void b/EXTRAS/homeworks_to_submit/1036785977/homework_06/void
new file mode 100644
index 0000000..a5f0c25
--- /dev/null
+++ b/EXTRAS/homeworks_to_submit/1036785977/homework_06/void
@@ -0,0 +1 @@
+homework 6