diff --git a/README.md b/README.md index 254493f..0bc7965 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,66 @@ -# MiniProject2: Discover a list of projects on sourceforge.net and gitlab.com +# MiniProject2: Discover a list of projects on SourceForge.net and GitLab.com -These two forges presente two different types of challeges to discovery. +These two forges present two different types of data discovery challenges. -SourceForge actively prevents discovery. SourceForge over ten years ago was he largest forge -but as it started losing its market share to other forges it started blocking project discovery. +SourceForge actively prevents discovery. Over ten years ago it was the largest forge +but as it started losing market share to other forges, it started blocking project discovery. GitLab, on the other hand, has an error-prone API that is highly unreliable. - -Discover at least 50 projects on each of the forges that have -names starting with the letter (case insensitive) in font of your name. -Please provide ipython notebook used to do the discovery. +## Part 1 + - Discover at least 50 projects on SourceForge and GitLab whose +names start with the letter (case insensitive) in front of your name in the list below. + - Provide the IPython notebook you used to discover the data. You are free to use any method, including a list compiled by someone else, search on google search engine, etc. -but you do need to verify that the discovered projects do currently exist on these -forges by retrieving the url of the version control repository used by the project. - -Please do discovery using the googlecloud VM to avoid accidentally -triggereing blocking to others. 
- -``` -a;3PIV;pprovins;Provins IV, Preston -b;BrettBass13;bbass11;Bass, Brett Czech -c;CipherR9;gyj992;Johnson, Rojae Antonio -d;Colsarcol;cmawhinn;Mawhinney, Colin Joseph -e;EvanEzell;eezell3;Ezell, Evan Collin -f;MikeynJerry;jdunca51;Duncan, Jerry -g;Tasmia;trahman4;Rahman, Tasmia -h;awilki13;awilki13;Wilkinson, Alex Webb -i;bryanpacep1;jpace7;Pace, Jonathan Bryan -j;caiwjohn;cjohn3;John, Cai William -k;cflemmon;cflemmon;Flemmons, Cole -l;dbarry9;dbarry;Barry, Daniel Patrick -m;desai07;adesai6;Desai, Avie -n;gjones1911;gjones2;Jones, Gerald Leon -o;herronej;eherron5;Herron, Emily Joyce -p;hossain-rayhan;rhossai2;Hossain, Rayhan -q;jdong6;jdong6;Dong, Jeffrey Jing -r;jyu25utk;jyu25;Yu, Jinxiao -s;mkramer6;mkramer6;Kramer, Matthew S -t;mmahbub;mmahbub;Mahbub, Maria -u;nmansou4;nmansou4;Mansour, Nasib -v;nschwerz;nschwerz;Schwerzler, Nicolas Winfield William -w;rdabbs42;rdabbs1;Dabbs, Rosemary -x;saramsv;mousavi;Mousavicheshmehkaboodi, Sara -y;spaulsteinberg;ssteinb2;Steinberg, Samuel Paul -z;zol0;akarnauc;Karnauch, Andrey -a;zrandall;zrandall;Randall, Zachary Adams -b;lpassarella;lpassare;Passarella, Linsey Sara -c;tgoedecke;pgoedec1;Goedecke, Trish -d;ray830305;hchang13;Chang, Hsun Jui -e;ssravali;ssadhu2;Sadhu, Sri Ravali -f;diadoo;jpovlin;Povlin, John P -g;mander59;mander59;Anderson, Matt Mcguffee -h;iway1;iway1;Way, Isaac Caldwell -``` +but you do need to verify that the discovered projects currently exist on these +forges by retrieving the url of the version control repository used by the project. + +Please use the Google Cloud VM when discovering the project names to avoid accidentally +causing UTK to be blocked. 
+ +| Letter | GitHub Username | NetID | Name | +|:-:|:-:|:-:|---| +| a | 3PIV | pprovins | Provins IV, Preston | +| b | BrettBass13 | bbass11 | Bass, Brett Czech | +| c | CipherR9 | gyj992 | Johnson, Rojae Antonio | +| d | Colsarcol | cmawhinn | Mawhinney, Colin Joseph | +| e | EvanEzell | eezell3 | Ezell, Evan Collin | +| f | MikeynJerry | jdunca51 | Duncan, Jerry | +| g | Tasmia | trahman4 | Rahman, Tasmia | +| h | awilki13 | awilki13 | Wilkinson, Alex Webb | +| i | bryanpacep1 | jpace7 | Pace, Jonathan Bryan | +| j | caiwjohn | cjohn3 | John, Cai William | +| k | cflemmon | cflemmon | Flemmons, Cole | +| l | dbarry9 | dbarry | Barry, Daniel Patrick | +| m | desai07 | adesai6 | Desai, Avie | +| n | gjones1911 | gjones2 | Jones, Gerald Leon | +| o | herronej | eherron5 | Herron, Emily Joyce | +| p | hossain-rayhan | rhossai2 | Hossain, Rayhan | +| q | jdong6 | jdong6 | Dong, Jeffrey Jing | +| r | jyu25utk | jyu25 | Yu, Jinxiao | +| s | mkramer6 | mkramer6 | Kramer, Matthew S | +| t | mmahbub | mmahbub | Mahbub, Maria | +| u | nmansou4 | nmansou4 | Mansour, Nasib | +| v | nschwerz | nschwerz | Schwerzler, Nicolas Winfield William | +| w | rdabbs42 | rdabbs1 | Dabbs, Rosemary | +| x | saramsv | mousavi | Mousavicheshmehkaboodi, Sara | +| y | spaulsteinberg | ssteinb2 | Steinberg, Samuel Paul | +| z | zol0 | akarnauc | Karnauch, Andrey | +| a | zrandall | zrandall | Randall, Zachary Adams | +| b | lpassarella | lpassare | Passarella, Linsey Sara | +| c | tgoedecke | pgoedec1 | Goedecke, Trish | +| d | ray830305 | hchang13 | Chang, Hsun Jui | +| e | ssravali | ssadhu2 | Sadhu, Sri Ravali | +| f | diadoo | jpovlin | Povlin, John P | +| g | mander59 | mander59 | Anderson, Matt Mcguffee | +| h | iway1 | iway1 | Way, Isaac Caldwell | ## GitLab discovery -GitLab provides [APIs](https://docs.gitlab.com/ee/api/) to retrieve project url. 
-Here is a sample code for collecting projects url (and stores data in mogodb): +GitLab provides [APIs](https://docs.gitlab.com/ee/api/) to retrieve project urls. +Here is sample code for collecting project urls (and storing data in mongodb): ``` import sys import re @@ -70,10 +70,10 @@ import time import datetime import requests -dbname = fdac18mp2 #please use this database -collname = glprj_yourutkid #please modify so you store data in your collection +dbname = "fdac18mp2" #please use this database +collname = "glprj_yourutkid" #please modify so you store data in your collection # beginning page index -begin = sys.argv[1] +begin = "0" client = pymongo.MongoClient() db = client[dbname] @@ -118,7 +118,7 @@ def get(url, coll): gleft = int(r.headers.get('RateLimit-Remaining')) lll = r.headers.get('Link') - t = r.text.encode('utf-8') + t = r.text array = json.loads(t) for el in array: @@ -139,7 +139,7 @@ def get(url, coll): return "got blocked", str(bginnum) if (r.ok): lll = r.headers.get('Link') - t = r.text.encode('utf-8') + t = r.text array1 = json.loads(t) for el in array1: coll.insert(el) @@ -162,4 +162,4 @@ def get(url, coll): get(beginurl,coll) ``` -Note that the parameters in the sample code are not optimal. Please feel free to tune them. This sample code is not robust enough to deal with various returned errors from query. You might need to investigate errors encountered individually. +Note that the parameters in the sample code are not optimal. Please feel free to tune them. This sample code is not robust enough to deal with various returned errors from query. You might need to investigate errors encountered individually. 
"""Discover GitLab and SourceForge projects whose names start with ``my_char``.

Each matching project document is stored in the MongoDB collection ``coll``
with an added ``forge`` field ('gitlab' or 'sourceforge').  Running this cell
performs network requests and database writes at module level.
"""
import sys
import re
import pymongo
import json
import time
import datetime
import requests
from bs4 import BeautifulSoup

dbname = "fdac18mp2"  # please use this database
collname = "glprj_jdunca51"  # please modify so you store data in your collection
my_char = 'f'

# beginning page index
begin = "1"
# Page index reported to the caller when GitLab answers 403.  The original
# cell referenced this name without ever defining it (NameError on that path).
bginnum = begin
client = pymongo.MongoClient()

db = client[dbname]
coll = db[collname]


gitlab_url = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=" + begin + \
    "&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"

# Last known number of requests left in the current rate-limit window.
gleft = 20

source_url = "https://sourceforge.net/directory/?q=" + my_char + "&sort=name&page="
rest_url = "https://sourceforge.net/rest/p/"

# requests only accepts str/bytes header values; the original int value (99)
# made every request that used this dict fail before it was sent.
# NOTE(review): per_page is a query parameter (already present in gitlab_url),
# so this header is probably redundant -- kept only so the name stays defined.
header = {'per_page': '99'}

MIN_REMAINING = 20      # pause once fewer requests than this remain
RATE_LIMIT_SLEEP = 60   # seconds to sleep between rate-limit probes
REQUEST_TIMEOUT = 30    # seconds before an HTTP request is abandoned
MAX_RETRIES = 3         # consecutive failures tolerated for one page URL
RETRY_DELAY = 5         # seconds to back off before retrying a failed page
PAGE_DELAY = 0.5        # pause after each GitLab page request
PROJECT_LIMIT = 50      # default number of projects to collect per forge

_SF_PROJECT_RE = re.compile(r'/projects/([A-Za-z0-9\-]*)')


def _remaining(resp, default):
    """Return the ``RateLimit-Remaining`` header of *resp* as an int.

    Falls back to *default* when the header is absent or not numeric.
    """
    try:
        return int(resp.headers.get('RateLimit-Remaining'))
    except (TypeError, ValueError):
        return default


def _store(coll, doc, forge):
    """Tag *doc* with its *forge* and insert it into *coll*.

    Returns True when the insert succeeded, False (after logging to stderr)
    when MongoDB reported an error.
    """
    doc['forge'] = forge
    try:
        coll.insert_one(doc)
    except pymongo.errors.PyMongoError as e:
        sys.stderr.write('could not store ' + forge + ' project: ' + str(e) + '\n')
        return False
    return True


# check remaining query chances for rate-limit restriction
def wait(left):
    """Block until at least ``MIN_REMAINING`` GitLab requests are available.

    left: the last known number of remaining requests.
    Returns the refreshed remaining-request count.
    """
    while left < MIN_REMAINING:
        try:
            probe = requests.get('https://gitlab.com/api/v4/projects',
                                 headers=header, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            sys.stderr.write('rate-limit probe failed: ' + str(e) + '\n')
        else:
            if probe.ok:
                # A successful reply without the header gives nothing to
                # measure, so treat it as "not limited" instead of crashing.
                left = _remaining(probe, MIN_REMAINING)
        # Sleep only while still limited (the original always slept 60 s).
        if left < MIN_REMAINING:
            time.sleep(RATE_LIMIT_SLEEP)
    return left


def project_exists(url):
    """Return True when a GET of *url* answers with HTTP 200.

    Network failures are reported as False rather than raised.
    """
    try:
        return requests.get(url, timeout=REQUEST_TIMEOUT).status_code == 200
    except requests.exceptions.RequestException:
        return False


def get_source(url, coll, rest, limit=PROJECT_LIMIT):
    """Collect SourceForge projects whose names start with ``my_char``.

    url:   directory listing URL; the page number is appended to it.
    coll:  MongoDB collection receiving the project documents.
    rest:  REST endpoint prefix; the project name is appended to it.
    limit: stop after this many projects have been stored.
    """
    page = 1
    project_count = 0
    seen = set()  # names already handled, so repeated links count once
    while True:
        try:
            listing = requests.get(url + str(page), timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            sys.stderr.write('could not get ' + url + str(page) + '\n')
            return
        if not listing.ok:
            sys.stderr.write("url can not found:\n" + url + str(page) + '\n')
            return

        soup = BeautifulSoup(listing.text, 'html.parser')
        if 'No results found.' in soup.get_text():
            return

        links = soup.find_all(class_="project-icon", href=True)
        if not links:
            # Neither results nor the "no results" marker: stop instead of
            # requesting ever-higher page numbers forever.
            return

        for link in links:
            match = _SF_PROJECT_RE.search(link.get('href'))
            name = match.group(1) if match else None
            if not name or name in seen:
                continue
            seen.add(name)
            if not name.lower().startswith(my_char):
                continue
            try:
                detail = requests.get(rest + name, timeout=REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                sys.stderr.write('could not get ' + rest + name + '\n')
                continue
            if detail.status_code != 200:
                continue
            try:
                info = json.loads(detail.text)
            except ValueError as e:
                sys.stderr.write(rest + name + ';' + str(e) + '\n')
                continue
            if _store(coll, info, 'sourceforge'):
                project_count += 1
                if project_count >= limit:
                    return
        page += 1


# send queries and extract urls
def get_gitlab(url, coll, limit=PROJECT_LIMIT):
    """Collect GitLab projects whose names start with ``my_char``.

    Follows the ``Link: rel="next"`` header from *url* page by page and stores
    every matching project whose ``http_url_to_repo`` answers HTTP 200.

    url:   first API page to request.
    coll:  MongoDB collection receiving the project documents.
    limit: stop after this many projects have been stored.

    Returns ``("got blocked", str(bginnum))`` when GitLab answers 403,
    otherwise None.
    """
    global gleft
    gleft = wait(gleft)
    project_count = 0
    failures = 0

    while url:
        try:
            r = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            sys.stderr.write('could not get ' + url + '\n')
            failures += 1
            # Bounded retry with back-off; the original retried later pages
            # immediately and without limit.
            if failures > MAX_RETRIES:
                return
            time.sleep(RETRY_DELAY)
            continue
        failures = 0
        time.sleep(PAGE_DELAY)

        # got blocked
        if r.status_code == 403:
            return "got blocked", str(bginnum)
        if not r.ok:
            sys.stderr.write("url can not found:\n" + url + '\n')
            return

        try:
            projects = json.loads(r.text)
        except ValueError as e:
            sys.stderr.write(url + ';' + str(e) + '\n')
            return

        for el in projects:
            name = el.get('name') or ''
            if not name.lower().startswith(my_char):
                continue
            repo_url = el.get('http_url_to_repo')
            if not repo_url or not project_exists(repo_url):
                continue
            if _store(coll, el, 'gitlab'):
                project_count += 1
                if project_count >= limit:
                    return

        # next page: respect the rate limit, then follow the parsed Link header
        gleft = wait(_remaining(r, gleft))
        url = r.links.get('next', {}).get('url')


#start retrieving
get_gitlab(gitlab_url, coll)
get_source(source_url, coll, rest_url)
#print collected data
for doc in coll.find({}):
    print(doc)