diff --git a/README.md b/README.md index 0bc7965..b2a4c07 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,128 @@ -# MiniProject2: Discover a list of projects on SourceForge.net and GitLab.com +# MiniProject2: Phase2: Store info on NPM packages in MongoDB + +## Task: Getting Release info from GitHub on NPM packages + +### Resources: +NPM package list + +The list of packages is unique to each one of you: +/data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz +where XX is between 0 and 33: to find your number look at the list below. + +### Goal: +1. Download and store data from npm on all your packages on mongodb database: + fdac18mp2, collection: npm_yourutkid, the example code is in readNpm.py +``` +zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 readNpm.py +``` +Please keep in mind that /data/NPMvulnerabilities/ is not on gcloud, only +on da2, so please run it on da2 or copy NPMpkglist_XX.gz to gcloud. + +2. Identify the packages that have GH repos (based on the stored info) +``` +import pymongo, json, sys +client = pymongo.MongoClient () +db = client ['fdac18mp2'] +id = sys.argv[1] #your utkid +coll = db [ 'npm_' + id] +for r in coll.find(): + if 'collected' in r: + r = r['collected'] + if 'metadata' in r: + r = r['metadata'] + if 'repository' in r: + r = r['repository'] + if 'url' in r: + r = r['url'] + print (r) +``` +The above code is in extrNpm.py. To output the urls: +``` +python3 extrNpm.py > myurls +``` + +3. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates +releases_yourutkid collection. Reference to Github API: +``` +cat myurls | python3 readGit.py +#or +python3 readGit.py < myurls +``` +4. 
Extract releases from MongoDB +``` +import pymongo, json, sys +client = pymongo.MongoClient (host="da1") +db = client ['fdac18mp2'] +id = "audris" +coll = db [ 'releases_' + id] +for r in coll.find(): + n = r['name'] + if 'values' in r: + for v in r['values']: + if 'tag_name' in v: + print (n+';'+v['tag_name']) +``` +The above code is in extrRels.py. To output the releases: +``` +python3 extrRels.py > myrels +``` + + +5. Find no. of commits between the latest and other releases. + +For example: + https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2 + More resources: https://stackoverflow.com/questions/26925312/github-api-how-to-compare-2-commits (look for comparing the tags in the answer) + In the returned JSON, look for fields like the following to get the no. of commits between releases: +``` + "status": "ahead", + "ahead_by": 24, + "behind_by": 0, + "total_commits": 24, +``` +For example (compareRels.py takes your GitHub login and password as its two arguments): +``` +cat myrels | python3 compareRels.py yourGHlogin yourGHpassword > myrels.cmp +``` + +| number | GitHub Username | NetID | Name | +|:-:|:-:|:-:|---| +| 0 | 3PIV | pprovins | Provins IV, Preston | +| 1 | BrettBass13 | bbass11 | Bass, Brett Czech | +| 2 | CipherR9 | gyj992 | Johnson, Rojae Antonio | +| 3 | Colsarcol | cmawhinn | Mawhinney, Colin Joseph | +| 4 | EvanEzell | eezell3 | Ezell, Evan Collin | +| 5 | MikeynJerry | jdunca51 | Duncan, Jerry | +| 6 | Tasmia | trahman4 | Rahman, Tasmia | +| 7 | awilki13 | awilki13 | Wilkinson, Alex Webb | +| 8 | bryanpacep1 | jpace7 | Pace, Jonathan Bryan | +| 9 | caiwjohn | cjohn3 | John, Cai William | +| 10 | cflemmon | cflemmon | Flemmons, Cole | +| 11 | dbarry9 | dbarry | Barry, Daniel Patrick | +| 12 | desai07 | adesai6 | Desai, Avie | +| 13 | gjones1911 | gjones2 | Jones, Gerald Leon | +| 14 | herronej | eherron5 | Herron, Emily Joyce | +| 15 | hossain-rayhan | rhossai2 | Hossain, Rayhan | +| 16 | jdong6 | jdong6 | Dong, Jeffrey Jing | +| 17 | jyu25utk | jyu25 | Yu, 
Jinxiao | +| 18 | mkramer6 | mkramer6 | Kramer, Matthew S | +| 19 | mmahbub | mmahbub | Mahbub, Maria | +| 20 | nmansou4 | nmansou4 | Mansour, Nasib | +| 21 | nschwerz | nschwerz | Schwerzler, Nicolas Winfield William | +| 22 | rdabbs42 | rdabbs1 | Dabbs, Rosemary | +| 23 | saramsv | mousavi | Mousavicheshmehkaboodi, Sara | +| 24 | spaulsteinberg | ssteinb2 | Steinberg, Samuel Paul | +| 25 | zol0 | akarnauc | Karnauch, Andrey | +| 26 | zrandall | zrandall | Randall, Zachary Adams | +| 27 | lpassarella | lpassare | Passarella, Linsey Sara | +| 28 | tgoedecke | pgoedec1 | Goedecke, Trish | +| 29 | ray830305 | hchang13 | Chang, Hsun Jui | +| 30 | ssravali | ssadhu2 | Sadhu, Sri Ravali | +| 31 | diadoo | jpovlin | Povlin, John P | +| 32 | mander59 | mander59 | Anderson, Matt Mcguffee | +| 33 | iway1 | iway1 | Way, Isaac Caldwell | + +# MiniProject2: Phase1: Discover a list of projects on SourceForge.net and GitLab.com These two forges present two different types of data discovery challenges. 
diff --git a/compareRels.py b/compareRels.py
new file mode 100644
index 0000000..279f47a
--- /dev/null
+++ b/compareRels.py
@@ -0,0 +1,108 @@
+"""Count commits between consecutive releases of GitHub repositories.
+
+Reads "owner/repo;tag" lines from standard input (the output of extrRels.py)
+and, for every pair of adjacent tags of the same repository, prints
+"compare-url;ahead_by;behind_by" on standard output.
+
+Usage: cat myrels | python3 compareRels.py LOGIN PASSWORD > myrels.cmp
+"""
+import sys, re, pymongo, json, time
+import datetime
+from requests.auth import HTTPBasicAuth
+import requests
+
+# Last known number of API calls left in the current rate-limit window.
+gleft = 1500
+# Seconds to wait for a response; requests never times out by default.
+TIMEOUT = 60
+
+#client = pymongo.MongoClient ()
+client = pymongo.MongoClient(host="da1.eecs.utk.edu")
+login = sys.argv[1]
+passwd = sys.argv[2]
+
+baseurl = 'https://api.github.com/repos'
+headers = {'Accept': 'application/vnd.github.hellcat-preview+json'}
+
+db = client['fdac18mp2']  # added in class
+collName = 'releases_audris'
+coll = db[collName]
+
+
+def wait(left):
+    """Block until at least 20 API calls remain and return the count left."""
+    while left < 20:
+        l = requests.get('https://api.github.com/rate_limit',
+                         auth=(login, passwd), timeout=TIMEOUT)
+        if l.ok:
+            left = int(l.headers.get('X-RateLimit-Remaining'))
+            reset = int(l.headers.get('x-ratelimit-reset'))
+            dif = reset - int(time.time())
+            if dif > 0 and left < 20:
+                sys.stderr.write("waiting for " + str(dif) + "s, calls left: " + str(left) + "\n")
+                time.sleep(dif)
+        # pause between polls so a failing rate_limit call is not retried in a tight loop
+        time.sleep(0.5)
+    return left
+
+
+def get(url):
+    """Fetch url from the GitHub API and return the decoded JSON body.
+
+    Returns None when the request fails, the server answers with an error
+    status or the body is not JSON; the reason is written to stderr.
+    """
+    global gleft
+    gleft = wait(gleft)
+    try:
+        r = requests.get(url, headers=headers, auth=(login, passwd), timeout=TIMEOUT)
+    except requests.exceptions.RequestException as e:
+        sys.stderr.write("Could not get:" + url + ". Exception:" + str(e) + "\n")
+        return None
+    time.sleep(0.5)
+    remaining = r.headers.get('X-RateLimit-Remaining')
+    if remaining is not None:
+        gleft = int(remaining)
+    if not r.ok:
+        # report the real status instead of blaming every failure on a missing common ancestor
+        sys.stderr.write("Could not get:" + url + ". HTTP status:" + str(r.status_code) + "\n")
+        return None
+    try:
+        return json.loads(r.text)
+    except ValueError as e:
+        sys.stderr.write("Could not get:" + url + ". Exception:" + str(e) + "\n")
+        return None
+
+
+def chunks(l, n):
+    """Split list l into consecutive slices of at most n items."""
+    if n < 1: n = 1
+    return [l[i:i + n] for i in range(0, len(l), n)]
+
+
+def cmp_rel(url):
+    """Print "url;ahead_by;behind_by" for a GitHub compare url."""
+    v = get(url)
+    if v is None:
+        return  # get() has already reported the reason
+    if isinstance(v, dict) and 'ahead_by' in v and 'behind_by' in v:
+        print(url + ';' + str(v['ahead_by']) + ';' + str(v['behind_by']))
+    else:
+        sys.stderr.write("Could not compare releases for: " + url + "; no ahead_by/behind_by in the response.\n")
+
+
+# Group the tags by repository, keeping the input order of the tags.
+p2r = {}
+for l in sys.stdin:
+    l = l.rstrip()
+    # A repository name cannot contain ';' but a tag can: split only once.
+    p, sep, r = l.partition(';')
+    if not (sep and p and r):
+        continue  # blank or malformed line
+    p2r.setdefault(p, []).append(r)
+
+for p, rs in p2r.items():
+    # compare every tag with the one listed just before it
+    for prev, cur in zip(rs, rs[1:]):
+        url = baseurl + '/' + p + '/compare/' + prev + '...' + cur
+        cmp_rel(url)
diff --git a/extrNpm.py b/extrNpm.py
new file mode 100644
index 0000000..bc63d14
--- /dev/null
+++ b/extrNpm.py
@@ -0,0 +1,23 @@
+"""Print the repository url of every npm package stored in npm_<utkid>.
+
+Usage: python3 extrNpm.py [utkid] > myurls
+"""
+import pymongo, json, sys
+
+client = pymongo.MongoClient(host="da1")
+db = client['fdac18mp2']
+# utkid may be given on the command line; defaults to the original value
+utkid = sys.argv[1] if len(sys.argv) > 1 else "audris"
+coll = db['npm_' + utkid]
+for r in coll.find():
+    # Walk down collected -> metadata -> repository -> url and stop at the
+    # first level that is missing or is not a sub-document.
+    node = r
+    for key in ('collected', 'metadata', 'repository', 'url'):
+        if isinstance(node, dict) and key in node:
+            node = node[key]
+        else:
+            node = None
+            break
+    if node is not None:
+        print(node)
diff --git a/extrRels.py b/extrRels.py
new file mode 100644
index 0000000..a6f612c
--- /dev/null
+++ b/extrRels.py
@@ -0,0 +1,17 @@
+"""Print "name;tag_name" for every release stored in releases_<utkid>.
+
+Usage: python3 extrRels.py [utkid] > myrels
+"""
+import pymongo, json, sys
+
+client = pymongo.MongoClient(host="da1")
+db = client['fdac18mp2']
+# utkid may be given on the command line; defaults to the original value
+utkid = sys.argv[1] if len(sys.argv) > 1 else "audris"
+coll = db['releases_' + utkid]
+for r in coll.find():
+    n = r['name']
+    for v in r.get('values', []):
+        if 'tag_name' in v:
+            # str(): tolerate stored documents whose name is not a string
+            print(str(n) + ';' + v['tag_name'])
diff --git a/jdunca51.ipynb b/pgoedec1.ipynb
similarity index 98%
rename from jdunca51.ipynb
rename to pgoedec1.ipynb
index d3a3149..780533e 100644
--- a/jdunca51.ipynb
+++ b/pgoedec1.ipynb
@@ -18,8 +18,8 @@
 "from bs4 import BeautifulSoup\n",
 "\n",
 "dbname = \"fdac18mp2\" #please use this database\n",
- "collname = \"glprj_jdunca51\" #please modify so you store data in your collection\n",
- "my_char = 'f'\n",
+ "collname = \"tgoedecke\" #please modify so you store data in your collection\n",
+ "my_char = 'c'\n",
 "\n",
 "# 
beginning page index\n",
 "begin = \"1\"\n",
diff --git a/readGit.py b/readGit.py
new file mode 100644
index 0000000..f5bfd69
--- /dev/null
+++ b/readGit.py
@@ -0,0 +1,126 @@
+"""Download the releases of GitHub repositories into MongoDB.
+
+Reads repository urls from standard input (the output of extrNpm.py), asks
+the GitHub API for all releases of each repository and stores them in the
+releases_audris collection of the fdac18mp2 database.
+
+Usage: python3 readGit.py LOGIN PASSWORD < myurls
+"""
+import sys, re, pymongo, json, time
+import datetime
+from requests.auth import HTTPBasicAuth
+import requests
+
+# Last known number of API calls left in the current rate-limit window.
+gleft = 1500
+# Seconds to wait for a response; requests never times out by default.
+TIMEOUT = 60
+# Size limit of a MongoDB document in bytes.
+MAX_BSON = 16777216
+
+#client = pymongo.MongoClient ()
+client = pymongo.MongoClient(host="da1.eecs.utk.edu")
+login = sys.argv[1]
+passwd = sys.argv[2]
+
+baseurl = 'https://api.github.com/repos'
+headers = {'Accept': 'application/vnd.github.hellcat-preview+json'}
+
+db = client['fdac18mp2']  # added in class
+collName = 'releases_audris'
+coll = db[collName]
+
+
+def wait(left):
+    """Block until at least 20 API calls remain and return the count left."""
+    while left < 20:
+        l = requests.get('https://api.github.com/rate_limit',
+                         auth=(login, passwd), timeout=TIMEOUT)
+        if l.ok:
+            left = int(l.headers.get('X-RateLimit-Remaining'))
+            reset = int(l.headers.get('x-ratelimit-reset'))
+            dif = reset - int(time.time())
+            if dif > 0 and left < 20:
+                sys.stderr.write("waiting for " + str(dif) + "s, calls left: " + str(left) + "\n")
+                time.sleep(dif)
+        # pause between polls so a failing rate_limit call is not retried in a tight loop
+        time.sleep(0.5)
+    return left
+
+
+def get(url):
+    """Fetch every page of a GitHub API listing.
+
+    Returns (values, size): the items of all pages and the number of
+    characters downloaded.  A page that cannot be downloaded or parsed is
+    reported and ends the listing; the items collected so far are returned.
+    """
+    global gleft
+    values = []
+    size = 0
+    while url:
+        gleft = wait(gleft)
+        try:
+            r = requests.get(url, headers=headers, auth=(login, passwd), timeout=TIMEOUT)
+        except requests.exceptions.RequestException as e:
+            print(url + ';ERROR ' + type(e).__name__)
+            break
+        time.sleep(0.5)
+        remaining = r.headers.get('X-RateLimit-Remaining')
+        if remaining is not None:
+            gleft = int(remaining)
+        if not r.ok:
+            print(url + ';ERROR r not ok')
+            break
+        t = r.text
+        size += len(t)
+        try:
+            page = json.loads(t)
+        except ValueError as e:
+            sys.stderr.write(str(e) + " in json .loads for " + url + "\n")
+            break
+        if not isinstance(page, list):
+            sys.stderr.write("unexpected (non-list) response for " + url + "\n")
+            break
+        values.extend(page)
+        # requests parses the Link header into r.links, so "next" is found
+        # wherever it appears in the header; a page without it is the last.
+        url = r.links.get('next', {}).get('url')
+    print('returning nkeys=' + str(len(values)))
+    return values, size
+
+
+def chunks(l, n):
+    """Split list l into consecutive slices of at most n items."""
+    if n < 1: n = 1
+    return [l[i:i + n] for i in range(0, len(l), n)]
+
+
+for line in sys.stdin:
+    # first clean the url
+    n = line.rstrip()
+    if 'github.com/' not in n:
+        continue  # blank line or a repository that is not hosted on GitHub
+    n = re.sub(r"^.*github\.com/", "", n)
+    n = re.sub(r"\.git$", "", n)
+    url = baseurl + '/' + n + '/releases'
+    print("trying to get: " + url)
+    try:
+        v, size = get(url)
+    except Exception as e:
+        sys.stderr.write("Could not get:" + url + ". Exception:" + str(e) + "\n")
+        continue
+    print(str(len(v)) + ';' + str(size) + ';' + url)
+    sys.stdout.flush()
+    if not v:
+        continue
+    ts = datetime.datetime.utcnow()
+    # size may be bigger in bson, factor of 2 doesnot always suffice
+    if size < MAX_BSON / 3:
+        coll.insert_one({'name': n, 'url': url, 'utc': ts, 'values': v})
+    else:
+        # number of documents needed, then how many releases go into each
+        nchunks = 3 * size // MAX_BSON + 1
+        per_chunk = max(1, len(v) // nchunks)
+        for i, ch in enumerate(chunks(v, per_chunk)):
+            coll.insert_one({'chunk': i, 'name': n, 'url': url, 'utc': ts, 'values': ch})
diff --git a/readNpm.py b/readNpm.py
new file mode 100644
index 0000000..3f31ce3
--- /dev/null
+++ b/readNpm.py
@@ -0,0 +1,62 @@
+"""Download npms.io data for npm packages into MongoDB.
+
+Reads package names from standard input, one per line, and stores the
+npms.io record of each package in the npm_audris collection of the
+fdac18mp2 database.  Prints "status;package" for every package: 0 on
+success, False or the exception class otherwise.
+
+Usage: zcat NPMpkglist_XX.gz | python3 readNpm.py
+"""
+import sys, json, pymongo, time, datetime, re, requests
+from urllib.parse import quote
+
+#for da2
+client = pymongo.MongoClient(host="da1.eecs.utk.edu")
+#for gcloud machine
+#client = pymongo .MongoClient ()
+
+db = client['fdac18mp2']
+
+#replace audris with your utkid
+coll = db['npm_audris']
+
+pre = 'https://api.npms.io/v2/package/'
+
+
+def output(s, p):
+    """Print the status s of package p as "s;p"."""
+    print(str(s) + ";" + p)
+
+
+def clean_keys(obj):
+    """Return a copy of obj with '$' and '.' replaced in every dict key.
+
+    MongoDB does not allow these characters in field names; nested
+    documents are cleaned too, not only the top level.
+    """
+    if isinstance(obj, dict):
+        return {k.replace('$', 'DOLLARSIGN').replace('.', 'PERIODSIGN'): clean_keys(v)
+                for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [clean_keys(v) for v in obj]
+    return obj
+
+
+for pname in sys.stdin:
+    pname = pname.strip('\n')
+    if not pname:
+        continue
+    #Thks @Macbrine: url parameters need to be quoted
+    pname = quote(pname, safe='')
+    try:
+        r = requests.get(pre + pname, timeout=60)
+        if not r.ok:
+            output(r.ok, pname)
+            continue
+        # decode as UTF-8: the ASCII decoding used before dropped other characters
+        result_json = json.loads(r.content.decode('utf-8'))
+        coll.insert_one(clean_keys(result_json))
+        output(0, pname)
+    except Exception as e:
+        # report the exception class and go on with the next package
+        output(type(e), pname)