From dc67dd8a4bff0ecab9b93492c32039188050f834 Mon Sep 17 00:00:00 2001 From: tgoedecke <32520805+tgoedecke@users.noreply.github.com> Date: Tue, 9 Oct 2018 22:27:03 -0500 Subject: [PATCH 01/29] Edit file with letter c, my collection directory --- jdunca51.ipynb => pgoedec1.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename jdunca51.ipynb => pgoedec1.ipynb (98%) diff --git a/jdunca51.ipynb b/pgoedec1.ipynb similarity index 98% rename from jdunca51.ipynb rename to pgoedec1.ipynb index d3a3149..5b4947e 100644 --- a/jdunca51.ipynb +++ b/pgoedec1.ipynb @@ -18,8 +18,8 @@ "from bs4 import BeautifulSoup\n", "\n", "dbname = \"fdac18mp2\" #please use this database\n", - "collname = \"glprj_jdunca51\" #please modify so you store data in your collection\n", - "my_char = 'f'\n", + "collname = \"tgoedecke_pgoedec1"\" #please modify so you store data in your collection\n", + "my_char = 'c'\n", "\n", "# beginning page index\n", "begin = \"1\"\n", From eebf6a7aa304519cca11c9ad04e8ebdcb5720052 Mon Sep 17 00:00:00 2001 From: tgoedecke <32520805+tgoedecke@users.noreply.github.com> Date: Tue, 9 Oct 2018 22:30:41 -0500 Subject: [PATCH 02/29] Update pgoedec1.ipynb (Python 3) --- pgoedec1.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgoedec1.ipynb b/pgoedec1.ipynb index 5b4947e..84cbd01 100644 --- a/pgoedec1.ipynb +++ b/pgoedec1.ipynb @@ -171,7 +171,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python", "language": "python", "name": "python3" }, From a2b32f9e94593442db918e7f86113c81a7afaa4f Mon Sep 17 00:00:00 2001 From: tgoedecke <32520805+tgoedecke@users.noreply.github.com> Date: Tue, 9 Oct 2018 22:52:25 -0500 Subject: [PATCH 03/29] Update pgoedec1.ipynb --- pgoedec1.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgoedec1.ipynb b/pgoedec1.ipynb index 84cbd01..5b4947e 100644 --- a/pgoedec1.ipynb +++ b/pgoedec1.ipynb @@ -171,7 +171,7 @@ ], "metadata": { "kernelspec": { - 
"display_name": "Python", + "display_name": "Python 3", "language": "python", "name": "python3" }, From 3bd25a6f3e53ad3b3aa53ec91f73295b3412f5fc Mon Sep 17 00:00:00 2001 From: tgoedecke <32520805+tgoedecke@users.noreply.github.com> Date: Tue, 9 Oct 2018 23:26:49 -0500 Subject: [PATCH 04/29] Update collname repeat "dbname = \"fdac18mp2\" #please use this database\n", "collname = \"tgoedecke\" #please modify so you store data in your collection\n", "my_char = 'c'\n", --- pgoedec1.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgoedec1.ipynb b/pgoedec1.ipynb index 5b4947e..780533e 100644 --- a/pgoedec1.ipynb +++ b/pgoedec1.ipynb @@ -18,7 +18,7 @@ "from bs4 import BeautifulSoup\n", "\n", "dbname = \"fdac18mp2\" #please use this database\n", - "collname = \"tgoedecke_pgoedec1"\" #please modify so you store data in your collection\n", + "collname = \"tgoedecke\" #please modify so you store data in your collection\n", "my_char = 'c'\n", "\n", "# beginning page index\n", From 7d4c5b79e7990b20c958a2bfdf9fb445e28e5337 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 12:08:58 -0400 Subject: [PATCH 05/29] Update README.md --- README.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0bc7965..4041fe8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,76 @@ -# MiniProject2: Discover a list of projects on SourceForge.net and GitLab.com +# MiniProject2: Phase2: Store info on NPM packages in MongoDB + +## Task: Getting Release info from GitHub on NPM packages + +### Resources: +NPM package list + +The list of packages is unique to each one of you: +/data/shared/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz +where XX is between 0 and 33: to find your number look at the list below. + +### Goal: +1. Download and store data from npm on all your packages on mongodb database: + fdac18mp2, collection: ghrel_yourutkid +1. 
Identify the packages that have GH repos (based on the stored info) +``` +# it has to contain value in +record["collected"]["metadata"]["repository"]["url"] +"git+https://github.com//0-.git" +``` +2. For each such package, get a list of all releases. Use Github API: +``` +https://developer.github.com/v3/repos/releases/ +``` +3. Find no. of commits between the latest and other releases. + +For example: + E.g. https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2 + More resource: https://stackoverflow.com/questions/26925312/github-api-how-to-compare-2-commits (look for comparing the tags in the answer) + Get the data from the json, look for something like to get no. of commits between releases + "status": "ahead", + "ahead_by": 24, + "behind_by": 0, + "total_commits": 24, + +| number | GitHub Username | NetID | Name | +|:-:|:-:|:-:|---| +| 0 | 3PIV | pprovins | Provins IV, Preston | +| 1 | BrettBass13 | bbass11 | Bass, Brett Czech | +| 2 | CipherR9 | gyj992 | Johnson, Rojae Antonio | +| 3 | Colsarcol | cmawhinn | Mawhinney, Colin Joseph | +| 4 | EvanEzell | eezell3 | Ezell, Evan Collin | +| 5 | MikeynJerry | jdunca51 | Duncan, Jerry | +| 6 | Tasmia | trahman4 | Rahman, Tasmia | +| 7 | awilki13 | awilki13 | Wilkinson, Alex Webb | +| 8 | bryanpacep1 | jpace7 | Pace, Jonathan Bryan | +| 9 | caiwjohn | cjohn3 | John, Cai William | +| 10 | cflemmon | cflemmon | Flemmons, Cole | +| 11 | dbarry9 | dbarry | Barry, Daniel Patrick | +| 12 | desai07 | adesai6 | Desai, Avie | +| 13 | gjones1911 | gjones2 | Jones, Gerald Leon | +| 14 | herronej | eherron5 | Herron, Emily Joyce | +| 15 | hossain-rayhan | rhossai2 | Hossain, Rayhan | +| 16 | jdong6 | jdong6 | Dong, Jeffrey Jing | +| 17 | jyu25utk | jyu25 | Yu, Jinxiao | +| 18 | mkramer6 | mkramer6 | Kramer, Matthew S | +| 19 | mmahbub | mmahbub | Mahbub, Maria | +| 20 | nmansou4 | nmansou4 | Mansour, Nasib | +| 21 | nschwerz | 
nschwerz | Schwerzler, Nicolas Winfield William | +| 22 | rdabbs42 | rdabbs1 | Dabbs, Rosemary | +| 23 | saramsv | mousavi | Mousavicheshmehkaboodi, Sara | +| 24 | spaulsteinberg | ssteinb2 | Steinberg, Samuel Paul | +| 25 | zol0 | akarnauc | Karnauch, Andrey | +| 26 | zrandall | zrandall | Randall, Zachary Adams | +| 27 | lpassarella | lpassare | Passarella, Linsey Sara | +| 28 | tgoedecke | pgoedec1 | Goedecke, Trish | +| 29 | ray830305 | hchang13 | Chang, Hsun Jui | +| 30 | ssravali | ssadhu2 | Sadhu, Sri Ravali | +| 31 | diadoo | jpovlin | Povlin, John P | +| 32 | mander59 | mander59 | Anderson, Matt Mcguffee | +| 33 | iway1 | iway1 | Way, Isaac Caldwell | + +# MiniProject2: Phase1: Discover a list of projects on SourceForge.net and GitLab.com These two forges present two different types of data discovery challenges. From 0af3808d50ec92544506e6317ea7b4f77d3d37d0 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 15:50:07 -0400 Subject: [PATCH 06/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4041fe8..8bc7db1 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ NPM package list The list of packages is unique to each one of you: -/data/shared/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz +/data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz where XX is between 0 and 33: to find your number look at the list below. 
### Goal: From 0f85604c4b1d10172fde41c3f70fe04872589bbc Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 16:26:58 -0400 Subject: [PATCH 07/29] Create readNpm.py --- readNpm.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 readNpm.py diff --git a/readNpm.py b/readNpm.py new file mode 100644 index 0000000..9263e6b --- /dev/null +++ b/readNpm.py @@ -0,0 +1,40 @@ +import sys, json, pymongo, time, datetime, re, requests +from urllib.parse import quote + +#for da2 +#client = pymongo .MongoClient (host="da1.eecs.utk.edu") +#for gcloud machine +client = pymongo .MongoClient () + +db = client ['fdac18mp2'] + +#replace audris with your utkid +coll = db['npm_audris'] + +pre = 'https://api.npms.io/v2/package/' + +def output(s, p): + print(str(s) + ";" + p) + +for pname in sys.stdin.readlines(): + pname = pname.strip('\n') + #Thks @Macbrine: url parameters need to be quoted + pname = quote(pname, safe='') + r = requests.get(pre + pname) + if(r.ok): + result = r.content + try: + result_json = json.loads(result.decode('ascii', errors='ignore')) + #modify keys to remove unwanted '$' '.' characters that mongodb does not allow + r1 = {} + for k in result_json: + k1 = k.replace('$', 'DOLLARSIGN') + k1 = k1.replace('.', 'PERIODSIGN') + r1 [k1] = result_json [k] + coll .insert (r1, check_keys=False) + output (0, pname) + except: + e = sys.exc_info()[0] + output (e, pname) + else: + output (r .ok, pname) From fea34c998417b65e7af65bd378bd417734c2c121 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 16:28:02 -0400 Subject: [PATCH 08/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8bc7db1..400c004 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ where XX is between 0 and 33: to find your number look at the list below. ### Goal: 1. 
Download and store data from npm on all your packages on mongodb database: - fdac18mp2, collection: ghrel_yourutkid + fdac18mp2, collection: npm_yourutkid 1. Identify the packages that have GH repos (based on the stored info) ``` # it has to contain value in From ce27f26d1b44ea94ac501cb00a470f2fc4764c4c Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 16:29:03 -0400 Subject: [PATCH 09/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 400c004..6cceb29 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ where XX is between 0 and 33: to find your number look at the list below. ### Goal: 1. Download and store data from npm on all your packages on mongodb database: - fdac18mp2, collection: npm_yourutkid + fdac18mp2, collection: npm_yourutkid, the example code is in readNpm.py 1. Identify the packages that have GH repos (based on the stored info) ``` # it has to contain value in From 71d6f1cc08aaf17c450f328c8063068750cd6d98 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 16:39:28 -0400 Subject: [PATCH 10/29] Update README.md --- README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6cceb29..e24e2b1 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,21 @@ where XX is between 0 and 33: to find your number look at the list below. fdac18mp2, collection: npm_yourutkid, the example code is in readNpm.py 1. 
Identify the packages that have GH repos (based on the stored info) ``` -# it has to contain value in -record["collected"]["metadata"]["repository"]["url"] -"git+https://github.com//0-.git" +import pymongo, json, sys +client = pymongo.MongoClient () +db = client ['fdac18mp2'] +id = sys.argv[1] #your utkid +coll = db [ 'npm_' + id] +for r in coll.find(): + if 'collected' in r: + r = r['collected'] + if 'metadata' in r: + r = r['metadata'] + if 'repository' in r: + r = r['url'] + getReleases('url') ``` -2. For each such package, get a list of all releases. Use Github API: +2. For each such package, get a list of all releases. Example file is Use Github API: ``` https://developer.github.com/v3/repos/releases/ ``` From b82c2af5b2c03cb76bd2f21f9908e49a32452cd4 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 16:58:34 -0400 Subject: [PATCH 11/29] Create readGit.py --- readGit.py | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 readGit.py diff --git a/readGit.py b/readGit.py new file mode 100644 index 0000000..9f152e3 --- /dev/null +++ b/readGit.py @@ -0,0 +1,122 @@ +import sys, re, pymongo, json, time +import datetime +from requests.auth import HTTPBasicAuth +import requests + +client = pymongo.MongoClient () +#client = pymongo.MongoClient (host="da1.eecs.utk.edu") +login = "your ghid" +passwd = "your ghpassword" + +baseurl = 'https://api.github.com/repos' +headers = {'Accept': 'application/vnd.github.v3.star+json'} +headers = {'Accept': 'application/vnd.github.hellcat-preview+json'} + +collName = 'releases_yourutkid' +coll = db [collName] +def wait (left): + while (left < 20): + l = requests .get('https://api.github.com/rate_limit', auth=(login,passwd)) + if (l.ok): + left = int (l.headers.get ('X-RateLimit-Remaining')) + reset = int (l.headers.get ('x-ratelimit-reset')) + now = int (time.time ()) + dif = reset - now + if (dif > 0 and left < 20): + sys.stderr.write ("waiting for " + str 
(dif) + "s until"+str(left)+"s\n") + time .sleep (dif) + time .sleep (0.5) + return left + +def get (url): + global gleft + gleft = wait (gleft) + values = [] + size = 0 + # sys.stderr.write ("left:"+ str(left)+"s\n") + try: + r = requests .get (url, headers=headers, auth=(login, passwd)) + time .sleep (0.5) + if (r.ok): + gleft = int(r.headers.get ('X-RateLimit-Remaining')) + lll = r.headers.get ('Link') + links = [''] + if lll is not None: + links = lll.split(',') + t = r.text + size += len (t) + try: + array = json .loads (t) + for el in array: + values .append (el) + except Exception as e: + sys.stderr.write(str(e)+" in json .loads\n") + #t = r.text.encode ('utf-8') + while '; rel="next"' in links[0]: + gleft = int(r.headers.get ('X-RateLimit-Remaining')) + gleft = wait (gleft) + url = links[0] .split(';')[0].replace('<','').replace('>',''); + try: + r = requests .get(url, headers=headers, auth=(login, passwd)) + if (r.ok): + lll = r.headers.get ('Link') + links = [''] + if lll is not None: + links = lll .split(',') + t = r.text + size += len (t) + try: + array = json.loads (t) + for el in array: + values .append (el) + print ('in load next: ' + str(len (values))) + except Exception as e: + sys.stderr.write(str(e)+" in json .loads next\n") + else: + links = [''] + except requests.exceptions.ConnectionError: + sys.stderr.write('could not get ' + links + ' for '+ url + '\n') + #print u';'.join((u, repo, t)).encode('utf-8') + try: + print (url + ';' + str(values)) + except Exception as e: + sys.stderr.write(str(e)+" in print " + url + "\n") + else: + print (url + ';ERROR r not ok') + except requests.exceptions.ConnectionError: + print (url + ';ERROR ConnectionError') + print ('returning nkeys=' + str(len (values))) + return values, size + +def chunks(l, n): + if n < 1: n = 1 + return [l[i:i + n] for i in range(0, len(l), n)] + +def getReleases(n): + #first clean the url + n = re.sub("^.*github.com/","",n) + n = re.sub("\.git$","",n) + url = baseurl + '/' + n + '/' 
+ releases + url1 = url + v = [] + size = 0 + try: + v, size = get (url1) + print (str (len (v)) + ';' + str (size) + ';' + url1) + sys .stdout .flush () + except Exception as e: + sys.stderr.write ("Could not get:" + url1 + ". Exception:" + str(e) + "\n") + continue + print (url1 + ' after exception lenv(v)=' + str(len (v))) + ts = datetime.datetime.utcnow() + if len (v) > 0: + # size may be bigger in bson, factor of 2 doesnot always suffice + if (size < 16777216/3): + print (v)#coll.insert ( { 'name': n, 'url': url, 'utc':ts, 'values': v } ) + else: + s = size; + n = 3*s/16777216 + i = 0 + for ch in chunks (v, n): + #coll.insert ( { 'chunk': i, 'name':n, 'url': url, 'utc':ts, 'values': ch } ) + i = i + 1 From ad9edae0f875eadb463d635fb6842a121f7cd1f8 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 15 Oct 2018 17:00:53 -0400 Subject: [PATCH 12/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e24e2b1..66ba4d9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ for r in coll.find(): r = r['url'] getReleases('url') ``` -2. For each such package, get a list of all releases. Example file is Use Github API: +2. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). 
Reference to Github API: ``` https://developer.github.com/v3/repos/releases/ ``` From 2327f0a638f875e1458ac6db24257949956ee893 Mon Sep 17 00:00:00 2001 From: tgoedecke <32520805+tgoedecke@users.noreply.github.com> Date: Wed, 17 Oct 2018 14:15:19 -0500 Subject: [PATCH 13/29] update readGit.py coll.insert becomes coll.insert_one added in class db=client['fdac18mps'] --- readGit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/readGit.py b/readGit.py index 9f152e3..0181eed 100644 --- a/readGit.py +++ b/readGit.py @@ -12,6 +12,7 @@ headers = {'Accept': 'application/vnd.github.v3.star+json'} headers = {'Accept': 'application/vnd.github.hellcat-preview+json'} +db=client['fdac18mps'] # added in class collName = 'releases_yourutkid' coll = db [collName] def wait (left): @@ -112,11 +113,12 @@ def getReleases(n): if len (v) > 0: # size may be bigger in bson, factor of 2 doesnot always suffice if (size < 16777216/3): - print (v)#coll.insert ( { 'name': n, 'url': url, 'utc':ts, 'values': v } ) + print (v)# + _one ( { 'name': n, 'url': url, 'utc':ts, 'values': v } ) else: s = size; n = 3*s/16777216 i = 0 for ch in chunks (v, n): - #coll.insert ( { 'chunk': i, 'name':n, 'url': url, 'utc':ts, 'values': ch } ) + #coll.insert_one ( { 'chunk': i, 'name':n, 'url': url, 'utc':ts, 'values': ch } ) i = i + 1 From 6083d25ddb69e07745b8de7191798d6fee992724 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Wed, 17 Oct 2018 16:49:33 -0400 Subject: [PATCH 14/29] Update README.md --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 66ba4d9..5351ffb 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,18 @@ for r in coll.find(): if 'metadata' in r: r = r['metadata'] if 'repository' in r: - r = r['url'] - getReleases('url') + r = r['repository'] + if 'url' in r: + r = r['url'] + print (r) ``` -2. For each such package, get a list of all releases. 
Example file is readGit.py (you can use it with the snippet above to get releases). Reference to Github API: +Suppose the above code is in extrNpm.py. To output the urls: +``` +zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz > myurls +``` + +2. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates +releases_yourutkid collection. Reference to Github API: ``` https://developer.github.com/v3/repos/releases/ ``` From 011f2d2832d0610aa9e9e9c2a02ad5d8a9f275b1 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Wed, 17 Oct 2018 16:50:26 -0400 Subject: [PATCH 15/29] Update readNpm.py --- readNpm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readNpm.py b/readNpm.py index 9263e6b..cd54065 100644 --- a/readNpm.py +++ b/readNpm.py @@ -31,7 +31,7 @@ def output(s, p): k1 = k.replace('$', 'DOLLARSIGN') k1 = k1.replace('.', 'PERIODSIGN') r1 [k1] = result_json [k] - coll .insert (r1, check_keys=False) + coll .insert_one (r1, check_keys=False) output (0, pname) except: e = sys.exc_info()[0] From 4b21422378293c04f5fc0c635fdccd1703637516 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Wed, 17 Oct 2018 16:52:36 -0400 Subject: [PATCH 16/29] Update readGit.py --- readGit.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/readGit.py b/readGit.py index 0181eed..aa89114 100644 --- a/readGit.py +++ b/readGit.py @@ -2,6 +2,7 @@ import datetime from requests.auth import HTTPBasicAuth import requests +gleft = 15 client = pymongo.MongoClient () #client = pymongo.MongoClient (host="da1.eecs.utk.edu") @@ -12,7 +13,7 @@ headers = {'Accept': 'application/vnd.github.v3.star+json'} headers = {'Accept': 'application/vnd.github.hellcat-preview+json'} -db=client['fdac18mps'] # added in class +db = client['fdac18mps'] # added in class collName = 'releases_yourutkid' coll = db [collName] def wait (left): @@ -93,7 +94,7 @@ 
def chunks(l, n): if n < 1: n = 1 return [l[i:i + n] for i in range(0, len(l), n)] -def getReleases(n): +for n in sys.stdin.readlines(): #first clean the url n = re.sub("^.*github.com/","",n) n = re.sub("\.git$","",n) @@ -113,12 +114,11 @@ def getReleases(n): if len (v) > 0: # size may be bigger in bson, factor of 2 doesnot always suffice if (size < 16777216/3): - print (v)# - _one ( { 'name': n, 'url': url, 'utc':ts, 'values': v } ) + coll.insert_one ( { 'name': n, 'url': url, 'utc':ts, 'values': v } ) else: s = size; n = 3*s/16777216 i = 0 for ch in chunks (v, n): - #coll.insert_one ( { 'chunk': i, 'name':n, 'url': url, 'utc':ts, 'values': ch } ) + coll.insert_one ( { 'chunk': i, 'name':n, 'url': url, 'utc':ts, 'values': ch } ) i = i + 1 From cfa97f446cb78f72a30030527558d8a73c4b1d06 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Wed, 17 Oct 2018 21:09:40 +0000 Subject: [PATCH 17/29] fixed readGit.py --- readGit.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/readGit.py b/readGit.py index aa89114..f5bfd69 100644 --- a/readGit.py +++ b/readGit.py @@ -2,19 +2,19 @@ import datetime from requests.auth import HTTPBasicAuth import requests -gleft = 15 +gleft = 1500 -client = pymongo.MongoClient () -#client = pymongo.MongoClient (host="da1.eecs.utk.edu") -login = "your ghid" -passwd = "your ghpassword" +#client = pymongo.MongoClient () +client = pymongo.MongoClient (host="da1.eecs.utk.edu") +login = sys.argv[1] +passwd = sys.argv[2] baseurl = 'https://api.github.com/repos' headers = {'Accept': 'application/vnd.github.v3.star+json'} headers = {'Accept': 'application/vnd.github.hellcat-preview+json'} -db = client['fdac18mps'] # added in class -collName = 'releases_yourutkid' +db = client['fdac18mp2'] # added in class +collName = 'releases_audris' coll = db [collName] def wait (left): while (left < 20): @@ -96,10 +96,12 @@ def chunks(l, n): for n in sys.stdin.readlines(): #first clean the url + n = n.rstrip() n = 
re.sub("^.*github.com/","",n) n = re.sub("\.git$","",n) - url = baseurl + '/' + n + '/' + releases + url = baseurl + '/' + n + '/releases' url1 = url + print("trying to get: " + url1) v = [] size = 0 try: From 783c96f39a80bb85f6449af32dba03aebdc043f2 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Wed, 17 Oct 2018 17:34:16 -0400 Subject: [PATCH 18/29] Update README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 5351ffb..d1cb074 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,24 @@ releases_yourutkid collection. Reference to Github API: ``` https://developer.github.com/v3/repos/releases/ ``` +3. Extract releases from mongodb +``` +import pymongo, json, sys +client = pymongo.MongoClient () +db = client ['fdac18mp2'] +id = sys.argv[1] #your utkid +coll = db [ 'npm_' + id] +for r in coll.find(): + if 'collected' in r: + r = r['collected'] + if 'metadata' in r: + r = r['metadata'] + if 'repository' in r: + r = r['repository'] + if 'url' in r: + r = r['url'] + print (r) +``` 3. Find no. of commits between the latest and other releases. For example: From 475395933d5bd2d5f6139731a4724e3bac080af5 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Thu, 18 Oct 2018 13:29:35 -0400 Subject: [PATCH 19/29] Update README.md --- README.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d1cb074..f10ac3d 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ for r in coll.find(): ``` Suppose the above code is in extrNpm.py. To output the urls: ``` -zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz > myurls +zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 extrNpm.py > myurls ``` 2. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). 
It reads from standard input and populates @@ -43,22 +43,24 @@ https://developer.github.com/v3/repos/releases/ 3. Extract releases from mongodb ``` import pymongo, json, sys -client = pymongo.MongoClient () +client = pymongo.MongoClient (host="da1") db = client ['fdac18mp2'] -id = sys.argv[1] #your utkid -coll = db [ 'npm_' + id] +id = "audris" +coll = db [ 'releases_' + id] for r in coll.find(): - if 'collected' in r: - r = r['collected'] - if 'metadata' in r: - r = r['metadata'] - if 'repository' in r: - r = r['repository'] - if 'url' in r: - r = r['url'] - print (r) + n = r['name'] + if 'values' in r: + for v in r['values']: + if 'tag_name' in v: + print (n+';'+v['tag_name']) ``` -3. Find no. of commits between the latest and other releases. +Suppose the above code is in extrRels.py. To output the urls: +``` +cat myurls | python3 extrRels.py > myrels +``` + + +4. Find no. of commits between the latest and other releases. For example: E.g. https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2 From 8a51fddd1a3200e263a9d5215d455616fad535c0 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Thu, 18 Oct 2018 13:31:41 -0400 Subject: [PATCH 20/29] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f10ac3d..3e45ff4 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,9 @@ where XX is between 0 and 33: to find your number look at the list below. ### Goal: 1. Download and store data from npm on all your packages on mongodb database: fdac18mp2, collection: npm_yourutkid, the example code is in readNpm.py +``` +zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 readNpm.py +``` 1. Identify the packages that have GH repos (based on the stored info) ``` import pymongo, json, sys @@ -32,7 +35,7 @@ for r in coll.find(): ``` Suppose the above code is in extrNpm.py. 
To output the urls: ``` -zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 extrNpm.py > myurls +python3 extrNpm.py > myurls ``` 2. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates From a156c9d952abc9484f3eb9a66fe58e37fb74852d Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Thu, 18 Oct 2018 17:32:39 +0000 Subject: [PATCH 21/29] added extraction scripts --- extrNpm.py | 15 +++++++++++++++ extrRels.py | 11 +++++++++++ 2 files changed, 26 insertions(+) create mode 100644 extrNpm.py create mode 100644 extrRels.py diff --git a/extrNpm.py b/extrNpm.py new file mode 100644 index 0000000..bc63d14 --- /dev/null +++ b/extrNpm.py @@ -0,0 +1,15 @@ +import pymongo, json, sys +client = pymongo.MongoClient (host="da1") +db = client ['fdac18mp2'] +id = "audris" +coll = db [ 'npm_' + id] +for r in coll.find(): + if 'collected' in r: + r = r['collected'] + if 'metadata' in r: + r = r['metadata'] + if 'repository' in r: + r = r['repository'] + if 'url' in r: + r = r['url'] + print (r) diff --git a/extrRels.py b/extrRels.py new file mode 100644 index 0000000..a6f612c --- /dev/null +++ b/extrRels.py @@ -0,0 +1,11 @@ +import pymongo, json, sys +client = pymongo.MongoClient (host="da1") +db = client ['fdac18mp2'] +id = "audris" +coll = db [ 'releases_' + id] +for r in coll.find(): + n = r['name'] + if 'values' in r: + for v in r['values']: + if 'tag_name' in v: + print (n+';'+v['tag_name']) From b3e1d3ad5125814ac7b73a5cd171fb1fa7f16532 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Thu, 18 Oct 2018 18:12:48 +0000 Subject: [PATCH 22/29] added script to compare releases --- compareRels.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 compareRels.py diff --git a/compareRels.py b/compareRels.py new file mode 100644 index 0000000..bef6838 --- /dev/null +++ b/compareRels.py @@ -0,0 
+1,80 @@ +import sys, re, pymongo, json, time +import datetime +from requests.auth import HTTPBasicAuth +import requests +gleft = 1500 + +#client = pymongo.MongoClient () +client = pymongo.MongoClient (host="da1.eecs.utk.edu") +login = sys.argv[1] +passwd = sys.argv[2] + +baseurl = 'https://api.github.com/repos' +headers = {'Accept': 'application/vnd.github.v3.star+json'} +headers = {'Accept': 'application/vnd.github.hellcat-preview+json'} + +db = client['fdac18mp2'] # added in class +collName = 'releases_audris' +coll = db [collName] +def wait (left): + while (left < 20): + l = requests .get('https://api.github.com/rate_limit', auth=(login,passwd)) + if (l.ok): + left = int (l.headers.get ('X-RateLimit-Remaining')) + reset = int (l.headers.get ('x-ratelimit-reset')) + now = int (time.time ()) + dif = reset - now + if (dif > 0 and left < 20): + sys.stderr.write ("waiting for " + str (dif) + "s until"+str(left)+"s\n") + time .sleep (dif) + time .sleep (0.5) + return left + +def get (url): + global gleft + gleft = wait (gleft) + values = [] + # sys.stderr.write ("left:"+ str(left)+"s\n") + try: + r = requests .get (url, headers=headers, auth=(login, passwd)) + time .sleep (0.5) + if (r.ok): + gleft = int(r.headers.get ('X-RateLimit-Remaining')) + lll = r.headers.get ('Link') + links = [''] + if lll is not None: + links = lll.split(',') + except Exception as e: + sys.stderr.write ("Could not get:" + url + ". Exception:" + str(e) + "\n") + return (json.loads(r.text)) + +def chunks(l, n): + if n < 1: n = 1 + return [l[i:i + n] for i in range(0, len(l), n)] + +def cmp_rel (url): + v = [] + size = 0 + try: + v = get (url) + except Exception as e: + sys.stderr.write ("Could not get:" + url + ". 
Exception:" + str(e) + "\n") + print (url+';'+str(v['ahead_by'])+';'+str(v['behind_by'])) + + +p2r = {} +for l in sys.stdin.readlines(): + l = l.rstrip() + p, r = l.split(';') + if p in p2r: + p2r[p] .append (r) + else: + p2r[p] = [r] + +for p in p2r: + rs = p2r[p] + if len (rs) > 1: + for i in range(1,len (rs)): + url = 'https://api.github.com/repos/'+p+'/compare/' + rs[i-1] + '...' + rs[i] + cmp_rel (url) + From d10980a40e586c4e1264bcaad568916a4052802b Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Thu, 18 Oct 2018 14:15:09 -0400 Subject: [PATCH 23/29] Added compareRels.py --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 3e45ff4..0d96cfd 100644 --- a/README.md +++ b/README.md @@ -69,10 +69,16 @@ For example: E.g. https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2 More resource: https://stackoverflow.com/questions/26925312/github-api-how-to-compare-2-commits (look for comparing the tags in the answer) Get the data from the json, look for something like to get no. 
of commits between releases +``` "status": "ahead", "ahead_by": 24, "behind_by": 0, "total_commits": 24, +``` +For example +``` +cat myrels | python3 compareRels.py +``` | number | GitHub Username | NetID | Name | |:-:|:-:|:-:|---| From 1da1844d37f8a723fc2f8e240e959a11fb944cf6 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Fri, 19 Oct 2018 14:50:51 -0400 Subject: [PATCH 24/29] Update readNpm.py --- readNpm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/readNpm.py b/readNpm.py index cd54065..3f31ce3 100644 --- a/readNpm.py +++ b/readNpm.py @@ -2,9 +2,9 @@ from urllib.parse import quote #for da2 -#client = pymongo .MongoClient (host="da1.eecs.utk.edu") +client = pymongo .MongoClient (host="da1.eecs.utk.edu") #for gcloud machine -client = pymongo .MongoClient () +#client = pymongo .MongoClient () db = client ['fdac18mp2'] @@ -31,7 +31,7 @@ def output(s, p): k1 = k.replace('$', 'DOLLARSIGN') k1 = k1.replace('.', 'PERIODSIGN') r1 [k1] = result_json [k] - coll .insert_one (r1, check_keys=False) + coll .insert_one (r1) output (0, pname) except: e = sys.exc_info()[0] From 8a44a8967411999ea43973ef3efe8954a172b107 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Fri, 19 Oct 2018 15:03:21 -0400 Subject: [PATCH 25/29] Update README.md --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0d96cfd..cd9a1e8 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,10 @@ where XX is between 0 and 33: to find your number look at the list below. ``` zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 readNpm.py ``` -1. Identify the packages that have GH repos (based on the stored info) +Please keep in mind that /data/NPMvulnerabilities/ is not on gcloud, only +on da2, so please run it on da2 or copy NPMpkglist_XX.gz to gcloud. + +2. 
Identify the packages that have GH repos (based on the stored info) ``` import pymongo, json, sys client = pymongo.MongoClient () @@ -33,17 +36,18 @@ for r in coll.find(): r = r['url'] print (r) ``` -Suppose the above code is in extrNpm.py. To output the urls: +The above code is in extrNpm.py. To output the urls: ``` python3 extrNpm.py > myurls ``` -2. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates +3. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates releases_yourutkid collection. Reference to Github API: ``` https://developer.github.com/v3/repos/releases/ ``` -3. Extract releases from mongodb + +4. Extract releases from mongodb ``` import pymongo, json, sys client = pymongo.MongoClient (host="da1") @@ -57,13 +61,13 @@ for r in coll.find(): if 'tag_name' in v: print (n+';'+v['tag_name']) ``` -Suppose the above code is in extrRels.py. To output the urls: +The above code is in extrRels.py. To output the urls: ``` cat myurls | python3 extrRels.py > myrels ``` -4. Find no. of commits between the latest and other releases. +5. Find no. of commits between the latest and other releases. For example: E.g. https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2 From 2c8a408f1c929a82ca8bb575876ac6ea0e026cd3 Mon Sep 17 00:00:00 2001 From: EvanEzell Date: Tue, 23 Oct 2018 16:27:47 -0400 Subject: [PATCH 26/29] Fix case when there is no common ancestor between commits There are few cases where the api will return that there is no common ancestor between commits. 
In my list of releases it was the following: https://api.github.com/repos/Onegini/cordova-plugin-onegini/compare/2.1.0...1.8.7 The proposed change will print an error message to standard error when that is the case --- compareRels.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compareRels.py b/compareRels.py index bef6838..279f47a 100644 --- a/compareRels.py +++ b/compareRels.py @@ -59,7 +59,10 @@ def cmp_rel (url): v = get (url) except Exception as e: sys.stderr.write ("Could not get:" + url + ". Exception:" + str(e) + "\n") - print (url+';'+str(v['ahead_by'])+';'+str(v['behind_by'])) + if 'ahead_by' in v and 'behind_by' in v: + print (url+';'+str(v['ahead_by'])+';'+str(v['behind_by'])) + else: + sys.stderr.write ("Could not compare releases for: " + url + "; There exists no common ancestor between the two versions." + "\n") p2r = {} From d725d16e96bdf3e18ba38aa444b1fb9c533a01c7 Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 29 Oct 2018 15:08:15 -0400 Subject: [PATCH 27/29] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cd9a1e8..6a63391 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,8 @@ python3 extrNpm.py > myurls 3. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates releases_yourutkid collection. Reference to Github API: ``` -https://developer.github.com/v3/repos/releases/ +cat myurls | python3 readGit.py ``` - 4. Extract releases from mongodb ``` import pymongo, json, sys @@ -63,7 +62,7 @@ for r in coll.find(): ``` The above code is in extrRels.py.
To output the urls: ``` -cat myurls | python3 extrRels.py > myrels +python3 extrRels.py > myrels ``` From b540b8f1e7b9c1c26e3259db7a5c7b4d1b0305eb Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 29 Oct 2018 15:09:22 -0400 Subject: [PATCH 28/29] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6a63391..54f90dd 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,8 @@ python3 extrNpm.py > myurls releases_yourutkid collection. Reference to Github API: ``` cat myurls | python3 readGit.py +#or +python3 readGit.py < myurls ``` 4. Extract releases from mongodb ``` From 88fa5f1bcdbe4d45c501b983e37fdb7037418afe Mon Sep 17 00:00:00 2001 From: Audris Mockus Date: Mon, 29 Oct 2018 15:13:51 -0400 Subject: [PATCH 29/29] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 54f90dd..b2a4c07 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ For example: ``` For example ``` -cat myrels | python3 compareRels.py +cat myrels | python3 compareRels.py > myrels.cmp ``` | number | GitHub Username | NetID | Name |