def issue_request(method, url, headers, data=None, binary=False,
                  params=None):
    """Thin wrapper around ``requests.request``.

    Serializes ``data`` to JSON (unless it is binary), issues the HTTP
    request, and decodes the JSON response.  On an HTTP error status the
    error and response body are printed and the exception is re-raised.

    Parameters
    ----------
    method : str
        HTTP method. One of GET, PUT, POST or DELETE
    url : str
        URL for the request
    headers : dict
        HTTP header information
    data : dict
        Figshare article data
    binary : bool
        Whether data is binary or not
    params : dict
        Additional information for URL GET request

    Returns
    -------
    response_data : dict
        JSON response for the request returned as python dict
        (raw ``bytes`` content when the body is not valid JSON)
    """
    payload = data
    if payload is not None and not binary:
        # Non-binary payloads are sent as a JSON string
        payload = json.dumps(payload)

    response = requests.request(method, url, headers=headers,
                                data=payload, params=params)

    try:
        response.raise_for_status()
    except HTTPError as error:
        print('Caught an HTTPError: {}'.format(error))
        print('Body:\n', response.text)
        raise

    # Fall back to the raw content when the body is not JSON
    try:
        response_data = json.loads(response.text)
    except ValueError:
        response_data = response.content

    return response_data
def __init__(self, api_token='', basic_token='', institution=False,
             institute=''):
    """Hold endpoint roots and HTTP headers for the Figshare stats
    and main (v2) APIs.

    :param api_token: personal token for the main Figshare API
    :param basic_token: base64 token for the stats API ("Basic" auth)
    :param institution: whether to use institution-scoped endpoints
    :param institute: institution name (e.g. "arizona"); when given,
           an institution-scoped stats root is also built
    """
    # Stats API roots
    self.stats_baseurl = 'https://stats.figshare.com'
    self.institution = institution
    if institute:
        self.institute = institute
        self.stats_baseurl_institute = join(self.stats_baseurl,
                                            self.institute)

    # Headers for the stats API (token optional)
    self.basic_headers0 = {'Content-Type': 'application/json'}
    self.basic_headers = dict(self.basic_headers0)
    self.basic_token = basic_token
    if self.basic_token:
        self.basic_headers['Authorization'] = f'Basic {self.basic_token}'

    # Main (account) API roots
    self.main_baseurl = 'https://api.figshare.com/v2/account/'
    if self.institution:
        self.main_baseurl_institute = join(self.main_baseurl,
                                           "institution")

    # Headers for the main API (token optional)
    self.api_headers = {'Content-Type': 'application/json'}
    self.api_token = api_token
    if self.api_token:
        self.api_headers['Authorization'] = f'token {self.api_token}'

def stats_endpoint(self, link, institution=False):
    """Build a full stats-API URL for ``link``, institution-scoped
    when ``institution`` is True."""
    base = self.stats_baseurl_institute if institution \
        else self.stats_baseurl
    return join(base, link)

def get_totals(self, item_id, item='article', institution=False):
    """Retrieve totals of views, downloads, and shares for an "item".

    Item can be 'article', 'author', 'collection', 'group' or 'project'.
    Note: This does not require authenticating credentials for
    institution accounts.

    See: https://docs.figshare.com/#stats_totals
    """
    valid_items = ('article', 'author', 'collection', 'group', 'project')
    if item not in valid_items:
        raise ValueError("Incorrect item type")

    total_dict = {}
    for counter in counter_list:
        # Using non-institution one since that seems to give correct stats
        endpoint = join('total', counter, item, str(item_id))
        url = self.stats_endpoint(endpoint, institution=institution)
        result = issue_request('GET', url, headers=self.basic_headers)
        total_dict[counter] = result['totals']
    return total_dict
def get_user_totals(self, author_id):
    """Retrieve an author's total views/downloads/shares via get_totals().

    :param author_id: This is not the same as the institution_user_id
           for institutional accounts
    :return: dict containing total views, downloads, and shares
    Note: This does not require authenticating credentials for
    institution accounts
    """
    return self.get_totals(author_id, item='author', institution=False)

def get_timeline(self, item_id, item='article', granularity='day',
                 institution=False):
    """Retrieve per-date and cumulative views/downloads/shares.

    :param item_id: Figshare ID of the item
    :param item: item type (e.g. 'article', 'author')
    :param granularity: 'day', 'month' or 'year'
    :param institution: whether to use institution-scoped endpoints
    :return: dict keyed by counter name ('views', 'downloads', 'shares')
             plus '<counter>-cum' for the running totals, each mapping
             ISO date string -> count
    """
    timeline_dict = {}
    # Dates come from the 'views' record (most populated generally);
    # counter_list puts 'views' first, but initialize defensively.
    save_date = []
    for counter in counter_list:
        # Using non-institution one since that seems to give correct stats
        urls = ['timeline', granularity, counter, item, str(item_id)]
        url = self.stats_endpoint(join(*urls), institution=institution)
        result = issue_request('GET', url, headers=self.basic_headers)

        # Fix: the API returns None (not {}) when a counter has no data;
        # previously sorted(None) raised TypeError for the 'views' record.
        timeline = result['timeline'] or {}

        if counter == 'views':
            save_date = sorted(timeline)

        result_sort = {}
        cum_dict = {}
        count = 0
        for key in save_date:
            # Dates absent from this counter's record count as zero
            result_sort[key] = timeline.get(key, 0)
            count += result_sort[key]
            cum_dict[key] = count
        timeline_dict[counter] = result_sort
        timeline_dict[f"{counter}-cum"] = cum_dict
    return timeline_dict

def get_figshare_id(self, accounts_df):
    """Retrieve Figshare account ID(s).

    Note: This is not the institutional ID, but one associated with
    the unique profile.

    :param accounts_df: pandas DataFrame containing institution ID
    :return: accounts_df: The input DataFrame with an additional
             'author_id' column (mutated in place and returned)
    """
    endpoint = join(self.main_baseurl_institute, "users")

    author_id = []
    for institute_id in accounts_df['id']:
        url = f"{endpoint}/{institute_id}"
        response = issue_request('GET', url, self.api_headers)
        author_id.append(response['id'])
    accounts_df['author_id'] = author_id
    return accounts_df
def retrieve_institution_users(self, ignore_admin=False):
    """Retrieve accounts within institutional instance.

    This is based on LD-Cool-P get_account_list method of
    FigshareInstituteAdmin.  It includes retrieving the default
    author_id.

    It uses:
    https://docs.figshare.com/#private_institution_accounts_list
    https://docs.figshare.com/#private_account_institution_user
    """
    url = join(self.main_baseurl_institute, "accounts")

    # Figshare API is limited to a maximum of 1000 per page
    params = {'page': 1, 'page_size': 1000}
    accounts = issue_request('GET', url, self.api_headers, params=params)

    accounts_df = pd.DataFrame(accounts).drop(columns='institution_id')

    if ignore_admin:
        print("Excluding administrative and test accounts")

        admin_mask = accounts_df['email'] == \
            'data-management@email.arizona.edu'
        test_mask = accounts_df['email'].str.contains(
            '-test@email.arizona.edu')
        drop_index = list(accounts_df[admin_mask].index) + \
            list(accounts_df[test_mask].index)

        accounts_df = accounts_df.drop(drop_index).reset_index(drop=True)

    return self.get_figshare_id(accounts_df)

def retrieve_institution_articles(self):
    """Retrieve all published articles of the institutional instance."""
    url = join(self.main_baseurl_institute, "articles")

    # Figshare API is limited to a maximum of 1000 per page
    params = {'page': 1, 'page_size': 1000}
    articles = issue_request('GET', url, self.api_headers, params=params)

    articles_df = pd.DataFrame(articles)

    # Only consider published datasets (unpublished ones have a
    # null published_date)
    published = articles_df['published_date'].notnull()
    return articles_df.loc[published].reset_index()

def retrieve_article_details(self, article_id):
    """Retrieve public details for a single article."""
    url = join('https://api.figshare.com/v2/', f"articles/{article_id}")
    return issue_request('GET', url, self.basic_headers0)
def get_institution_totals(self, df=None, by_method='author'):
    """Retrieve total views, downloads, and shares by either
    authors or articles.

    :param df: optional pre-fetched DataFrame of users or articles;
           retrieved automatically when None
    :param by_method: 'author' or 'article'
    :return: pandas DataFrame of totals, indexed by author/article label
    :raises ValueError: if df is None and by_method is not recognized
    """
    if df is None:  # idiom fix: was isinstance(df, type(None))
        if by_method == 'author':
            df = self.retrieve_institution_users(ignore_admin=False)
        elif by_method == 'article':
            df = self.retrieve_institution_articles()
        else:
            # Fix: previously fell through with df=None and crashed
            # below with AttributeError on None.index
            raise ValueError(f"Unsupported by_method: {by_method}")

    total_dict = dict()
    for i in df.index:
        print(f"{i+1} of {len(df.index)}")
        record = df.loc[i]
        if by_method == 'author':
            first_name = record['first_name']
            last_name = record['last_name']
            author_id = record['author_id']
            total_dict[f"{first_name} {last_name} ({author_id})"] = \
                self.get_user_totals(author_id)
        if by_method == 'article':
            total_dict[f"{record['id']}"] = \
                self.get_totals(record['id'], item='article',
                                institution=False)
    # Construct pandas DataFrame
    return pd.DataFrame.from_dict(total_dict, orient='index')


# Max characters per wrapped title line in plot headings
title_width = 75


def matplotlib_date_format(date_list: list) -> list:
    """Convert ISO 'YYYY-MM-DD' strings to datetime objects."""
    return [dt.strptime(date, '%Y-%m-%d') for date in date_list]


def plot_shares(ax: "axes.Axes", timeline_dict: dict):
    """Overlay share events on ``ax`` as red vertical lines with
    the share count annotated next to each line.

    Annotation is quoted so importing this module does not require
    matplotlib to be resolvable at function-definition time.
    """
    shares_dict = timeline_dict['shares']
    non_zero = [key for key in shares_dict.keys() if shares_dict[key] > 0]

    if len(non_zero) > 0:
        dates = matplotlib_date_format(non_zero)
        for date, key in zip(dates, non_zero):
            ax.axvline(x=date, color='red')
            ax.text(date, 1, f"{shares_dict[key]}", color='red',
                    ha='right', va='bottom')
def plot_timeline(timeline_dict: dict, article_dict: dict,
                  out_pdf: str = '', save: bool = False) \
        -> "Union[None, figure.Figure]":
    """Plot timeline showing views and downloads.

    :param timeline_dict: Contains daily and cumulative numbers.
           From ``stats.Figshare.get_timeline``
    :param article_dict: Contains article details.
           From ``stats.Figshare.retrieve_article_details``
    :param out_pdf: Output filename. Default: ``timeline_<id>.pdf``
    :param save: Flag to save PDF file. Otherwise returns
           ``matplotlib`` fig object

    :return fig: If save == False, fig is returned
    """
    datetime_list = matplotlib_date_format(
        list(timeline_dict['views'].keys()))
    # 2x2 grid: ax0 = top row (cumulative), ax1 = bottom row (daily)
    fig, [ax0, ax1] = plt.subplots(ncols=2, nrows=2,
                                   gridspec_kw={'height_ratios': [3, 1]})

    counters = ['views', 'downloads']

    for ii, counter in enumerate(counters):

        # Bottom panels: daily counts as bars
        y_bottom = timeline_dict[counter].values()
        ax1[ii].bar(datetime_list, y_bottom)
        locator = m_dates.AutoDateLocator(minticks=3, maxticks=7)
        formatter = m_dates.ConciseDateFormatter(locator)
        ax1[ii].xaxis.set_major_locator(locator)
        ax1[ii].xaxis.set_major_formatter(formatter)
        ax1[ii].set_ylabel(f"Daily {counter}")
        ax1[ii].tick_params(axis='y', direction='in')
        ax1[ii].tick_params(axis='x', direction='out')
        ax1[ii].annotate(f'Maximum daily {counter}: {max(y_bottom)}',
                         (0.025, 0.95), xycoords='axes fraction',
                         va='top', ha='left')
        ax1[ii].set_ylim(bottom=0)

        # Top panels: cumulative counts
        y_top = timeline_dict[counter+'-cum'].values()
        ax0[ii].plot(datetime_list, y_top, linestyle='-', linewidth=2.0,
                     marker='')
        ax0[ii].xaxis.set_major_locator(locator)
        ax0[ii].xaxis.set_major_formatter(formatter)
        ax0[ii].set_xticklabels('')
        ax0[ii].set_ylabel(f"Cumulative {counter}")
        ax0[ii].tick_params(axis='both', direction='in')
        ax0[ii].annotate(f'Total {counter}: {max(y_top)}', (0.025, 0.975),
                         xycoords='axes fraction', va='top', ha='left')
        ax0[ii].set_ylim(bottom=0)

        plot_shares(ax0[ii], timeline_dict)

    # Heading containing title, author, license, DOI
    # Fix: initialize so an empty title cannot leave this unbound
    left_heading = ''
    title_chunks = wrap(article_dict['title'], title_width)
    for cc, chunk in enumerate(title_chunks):
        if cc == 0:
            left_heading += f"Title: {chunk}\n"
        else:
            left_heading += f" {chunk}\n"
    author_list = [auth_dict['full_name'] for auth_dict in
                   article_dict['authors']]
    if len(author_list) > 3:
        left_heading += f"Authors: {author_list[0]} et al.\n"
    else:
        # Fix: comma-separate names; ' '.join ran names together
        left_heading += f"Authors: {', '.join(author_list)}\n"
    left_heading += f"License: {article_dict['license']['name']} "
    left_heading += f"DOI: https://doi.org/{article_dict['doi']}"
    ax0[0].text(0.01, 1.25, left_heading, ha='left', va='top',
                transform=ax0[0].transAxes)

    right_heading = f"Shares: {max(timeline_dict['shares-cum'].values())}"
    ax0[1].text(1.0, 1.25, right_heading, ha='right', va='top',
                transform=ax0[1].transAxes)

    fig.set_size_inches(8, 6)
    plt.subplots_adjust(left=0.09, bottom=0.08, top=0.85, right=0.985,
                        hspace=0.025)

    if save:
        if not out_pdf:
            out_pdf = f"timeline_{article_dict['id']}.pdf"
        fig.savefig(out_pdf)
    else:
        return fig
# Command-line driver: builds one PDF of timeline plots for every
# published article in an institutional Figshare instance.
if __name__ == '__main__':
    # Fix: was `if __name__ in '__main__':` — a substring-membership
    # test that is also True for '', '__', 'main', etc.
    description = 'Command-line driver for figstats timeline plots.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-a', '--api-token', required=True,
                        help='Figshare API token')
    parser.add_argument('-b', '--basic-token', required=True,
                        help='Figshare base64 API stats token')
    parser.add_argument('-i', '--institute', required=True,
                        help='Name of institution (e.g., "arizona")')
    args = parser.parse_args()

    fs = stats.Figshare(api_token=args.api_token,
                        basic_token=args.basic_token,
                        institution=True,
                        institute=args.institute)

    articles_df = fs.retrieve_institution_articles()
    out_pdf = f"{args.institute}_timeline_plots.pdf"
    pp = PdfPages(out_pdf)

    for article_id in articles_df['id']:
        print(f"Working on : {article_id}")
        try:
            article_dict = fs.retrieve_article_details(article_id)
            try:
                timeline_dict = fs.get_timeline(article_id, item='article',
                                                institution=True)

                fig = visualization.plot_timeline(timeline_dict,
                                                  article_dict,
                                                  save=False)
                print(type(fig))
                # Append this figure as one page of the multi-page PDF
                fig.savefig(pp, format='pdf', bbox_inches='tight')
                fig.clear()
            except TypeError:
                # Best-effort: some articles have data that the plotting
                # code cannot handle; skip the page rather than abort
                print("TypeError")
        except (HTTPError, UnicodeEncodeError):
            print(f"Skipping: {article_id}")
            pass

    print(f"Writing : {out_pdf}")
    pp.close()