Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c91a618
Add API code #1
astrochun Nov 18, 2020
a06513d
stats.Figshare: Add institution in get_totals #1
astrochun Nov 18, 2020
293b0c4
stats.Figshare: Add get_user_totals and get_timeline methods
astrochun Dec 2, 2020
ef31020
A number of changes
astrochun Dec 3, 2020
8e7b485
stats.Figshare: Add get_figshare_id and retrieve_institution_users me…
astrochun Dec 3, 2020
7eacb85
stats.Figshare: Add get_institution_totals
astrochun Dec 3, 2020
91747b5
Loop over all authors, include author name
astrochun Dec 3, 2020
bb680da
stats.Figshare: Add retrieve_institution_articles method
astrochun Dec 3, 2020
6560567
stats.Figshare: Sort timeline by date in get_timeline method
astrochun Dec 4, 2020
a8dd2c1
stats.Figshare: Add cumulative numbers for views, downloads, shares i…
astrochun Dec 4, 2020
e4fe93e
stats.Figshare: Ensure timeline is the same for all records
astrochun Dec 4, 2020
f01e556
Minor fix to zero daily numbers if not in timeline
astrochun Dec 8, 2020
a77df39
stats: Add retrieve_article_details method
astrochun Dec 8, 2020
5cf4adb
Add visualization module
astrochun Dec 8, 2020
355c88f
Add option to save PDF file or return matplotlib fig instance
astrochun Dec 8, 2020
4f1783e
visualization: Add plot_shares method
astrochun Dec 8, 2020
229618e
Add numpy to requirements
astrochun Dec 8, 2020
1cd267e
Exclude testing scripts
astrochun Dec 8, 2020
9347032
Refactor to use counter_list
astrochun Dec 9, 2020
897c21d
Change cumulative panels from bar to line
astrochun Dec 9, 2020
9ef5203
Baseline script for multi-timeline retrieval
astrochun Dec 9, 2020
1943cfa
Fix case when timeline is not available (e.g., shares)
astrochun Dec 10, 2020
968b6b4
Use textwrap to handle long title
astrochun Dec 10, 2020
64a7a1d
Reduce title_width to give room for shares in upper right
astrochun Dec 10, 2020
2e8590a
Add version info
astrochun Mar 3, 2021
0442e8b
Add HTTPException handling
astrochun Mar 3, 2021
326aff2
make_timeline_plots: Adjust for PEP8 width
astrochun Mar 9, 2021
cfcf095
visualization: type hinting
astrochun Mar 9, 2021
218a596
make_timeline_plots: Temporarily handle UnicodeEncodeError
astrochun Mar 9, 2021
973048b
Add PyCharm files to .gitignore
astrochun Mar 9, 2021
b06ce5d
Adjust argparse inputs with hyphen format
astrochun Jul 27, 2021
d281996
Fix to ensure using a raw JSON response with no basic token
astrochun Jul 27, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,8 @@ dmypy.json

# Pyre type checker
.pyre/

testing.py

# PyCharm
.idea/
1 change: 1 addition & 0 deletions figstats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = '0.0.1'
52 changes: 52 additions & 0 deletions figstats/commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import json
import requests
from requests.exceptions import HTTPError


def issue_request(method, url, headers, data=None, binary=False,
                  params=None):
    """Issue an HTTP request and return the decoded response body.

    Parameters
    ----------
    method : str
        HTTP method. One of GET, PUT, POST or DELETE

    url : str
        URL for the request

    headers: dict
        HTTP header information

    data: dict
        Figshare article data

    binary: bool
        Whether data is binary or not

    params: dict
        Additional information for URL GET request

    Returns
    -------
    response_data: dict
        JSON response for the request returned as python dict
        (raw ``bytes`` content when the body is not valid JSON)
    """
    payload = data
    if payload is not None and not binary:
        # Non-binary payloads are serialized to a JSON string before sending
        payload = json.dumps(payload)

    response = requests.request(method, url, headers=headers,
                                data=payload, params=params)

    try:
        response.raise_for_status()
    except HTTPError as error:
        print('Caught an HTTPError: {}'.format(error))
        print('Body:\n', response.text)
        raise

    try:
        return json.loads(response.text)
    except ValueError:
        # Body is not JSON; hand back the raw content untouched
        return response.content
214 changes: 214 additions & 0 deletions figstats/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
from os.path import join
import pandas as pd

from .commons import issue_request

counter_list = ['views', 'downloads', 'shares']


class Figshare:
    """
    Purpose:
      A Python interface to work with the Figshare statistics endpoint

    Fixes over the original: URLs are joined with forward slashes instead of
    ``os.path.join`` (which emits backslashes on Windows), ``None`` checks use
    ``is None``, and ``get_timeline`` no longer crashes when a timeline is
    missing (``None``) in the API response.
    """

    def __init__(self, api_token='', basic_token='', institution=False, institute=''):
        """
        :param api_token: Figshare OAuth token for the main API
        :param basic_token: Base64 token for the stats API
        :param institution: Flag to enable institutional endpoints
        :param institute: Institution name used in stats URLs
        """

        # For stats API
        self.stats_baseurl = 'https://stats.figshare.com'
        self.institution = institution
        # NOTE(review): ``institute``/``stats_baseurl_institute`` are only set
        # when an institute name is given; institution-scoped calls will raise
        # AttributeError otherwise (original behavior preserved)
        if institute:
            self.institute = institute
            self.stats_baseurl_institute = self._url_join(self.stats_baseurl,
                                                          self.institute)

        # Base64 token
        self.basic_headers0 = {'Content-Type': 'application/json'}

        self.basic_headers = self.basic_headers0.copy()
        self.basic_token = basic_token
        if self.basic_token:
            self.basic_headers['Authorization'] = f'Basic {self.basic_token}'

        # For Figshare API
        self.main_baseurl = 'https://api.figshare.com/v2/account/'
        if self.institution:
            self.main_baseurl_institute = self._url_join(self.main_baseurl,
                                                         "institution")

        # API token
        self.api_headers = {'Content-Type': 'application/json'}
        self.api_token = api_token
        if self.api_token:
            self.api_headers['Authorization'] = f'token {self.api_token}'

    @staticmethod
    def _url_join(*parts):
        """Join URL pieces with '/' regardless of platform.

        Replaces ``os.path.join``, which would use backslashes on Windows.
        """
        return '/'.join(str(part).strip('/') for part in parts)

    def stats_endpoint(self, link, institution=False):
        """Construct a stats API URL, optionally scoped to the institution"""
        if institution:
            return self._url_join(self.stats_baseurl_institute, link)
        else:
            return self._url_join(self.stats_baseurl, link)

    def get_totals(self, item_id, item='article', institution=False):
        """
        Retrieve totals of views, downloads, and share for an "item"
        Item can be 'article', 'author', 'collection', 'group' or 'project'
        Note: This does not require authenticating credentials for institution accounts

        See: https://docs.figshare.com/#stats_totals

        :raises ValueError: If ``item`` is not a recognized item type
        :return: dict with 'views', 'downloads', 'shares' totals
        """

        if item not in ['article', 'author', 'collection', 'group', 'project']:
            raise ValueError("Incorrect item type")

        total_dict = {}
        for counter in counter_list:
            # Using non-institution one since that seems to give correct stats
            url = self.stats_endpoint(
                self._url_join('total', counter, item, item_id),
                institution=institution)
            result = issue_request('GET', url, headers=self.basic_headers)
            total_dict[counter] = result['totals']
        return total_dict

    def get_user_totals(self, author_id):
        """
        Retrieve an author's total using get_totals()

        :param author_id: This is not the same as the institution_user_id for institutional accounts
        :return: total_dict: dict containing total views, downloads, and shares
        Note: This does not require authenticating credentials for institution accounts
        """
        total_dict = self.get_totals(author_id, item='author',
                                     institution=False)
        return total_dict

    def get_timeline(self, item_id, item='article', granularity='day',
                     institution=False):
        """
        Retrieve daily and cumulative views/downloads/shares timelines

        The 'views' timeline (generally the most populated) defines the set of
        dates used for every counter so all records share the same timeline.

        :param item_id: Figshare identifier of the item
        :param item: Item type (e.g. 'article', 'author')
        :param granularity: Timeline granularity (e.g. 'day')
        :param institution: Use the institution-scoped stats endpoint
        :return: dict keyed by counter and ``<counter>-cum`` for cumulative
        """
        timeline_dict = {}
        save_date = []  # dates anchoring all counters; set from 'views'
        for counter in counter_list:
            # Using non-institution one since that seems to give correct stats
            url = self.stats_endpoint(
                self._url_join('timeline', granularity, counter, item, item_id),
                institution=institution)
            result = issue_request('GET', url, headers=self.basic_headers)
            # The API returns None instead of a dict when no data is available
            timeline = result['timeline']
            if counter == 'views':
                # Guard against a missing views timeline (was a TypeError)
                save_date = sorted(timeline) if timeline is not None else []
            result_sort = {}   # daily numbers, sorted by date
            cum_dict = {}      # running cumulative numbers
            count = 0
            for key in save_date:
                if timeline is None:
                    # Counter not available for this item; zero-fill
                    result_sort[key] = 0
                else:
                    try:
                        result_sort[key] = timeline[key]
                        count += timeline[key]
                    except KeyError:
                        # Date absent for this counter; zero the daily number
                        result_sort[key] = 0
                cum_dict[key] = count
            timeline_dict[counter] = result_sort
            timeline_dict[f"{counter}-cum"] = cum_dict
        return timeline_dict

    def get_figshare_id(self, accounts_df):
        """
        Retrieve Figshare account ID(s)
        Note: This is not the institutional ID, but one associated with
        the unique profile

        :param accounts_df: pandas DataFrame containing institution ID
        :return: accounts_df: The input DataFrame with an additional column
        """

        endpoint = self._url_join(self.main_baseurl_institute, "users")

        author_id = []
        for institute_id in accounts_df['id']:
            url = f"{endpoint}/{institute_id}"
            response = issue_request('GET', url, self.api_headers)
            author_id.append(response['id'])
        accounts_df['author_id'] = author_id
        return accounts_df

    def retrieve_institution_users(self, ignore_admin=False):
        """
        Retrieve accounts within institutional instance

        This is based on LD-Cool-P get_account_list method of FigshareInstituteAdmin
        It includes retrieving the default author_id

        It uses:
          https://docs.figshare.com/#private_institution_accounts_list
          https://docs.figshare.com/#private_account_institution_user

        :param ignore_admin: Exclude administrative and test accounts
        :return: pandas DataFrame of institution accounts
        """
        url = self._url_join(self.main_baseurl_institute, "accounts")

        # Figshare API is limited to a maximum of 1000 per page
        params = {'page': 1, 'page_size': 1000}
        accounts = issue_request('GET', url, self.api_headers, params=params)

        accounts_df = pd.DataFrame(accounts)
        accounts_df = accounts_df.drop(columns='institution_id')

        if ignore_admin:
            print("Excluding administrative and test accounts")

            drop_index = list(accounts_df[accounts_df['email'] ==
                              'data-management@email.arizona.edu'].index)
            drop_index += list(accounts_df[accounts_df['email'].str.contains(
                '-test@email.arizona.edu')].index)

            accounts_df = accounts_df.drop(drop_index).reset_index(drop=True)

        accounts_df = self.get_figshare_id(accounts_df)

        return accounts_df

    def retrieve_institution_articles(self):
        """Retrieve published articles within the institutional instance

        :return: pandas DataFrame of published articles
        """
        url = self._url_join(self.main_baseurl_institute, "articles")

        # Figshare API is limited to a maximum of 1000 per page
        params = {'page': 1,
                  'page_size': 1000}
        articles = issue_request('GET', url, self.api_headers, params=params)

        articles_df = pd.DataFrame(articles)

        # Only consider published dataset
        articles_df = articles_df.loc[articles_df['published_date'].notnull()]
        articles_df = articles_df.reset_index()
        return articles_df

    def retrieve_article_details(self, article_id):
        """Retrieve article details"""
        url = self._url_join('https://api.figshare.com/v2/',
                             f"articles/{article_id}")
        article_dict = issue_request('GET', url, self.basic_headers0)
        return article_dict

    def get_institution_totals(self, df=None, by_method='author'):
        """
        Retrieve total views, downloads, and shares by either authors or articles

        :param df: Optional pre-fetched DataFrame of users or articles
        :param by_method: 'author' or 'article'
        :return: pandas DataFrame of totals indexed by author/article label
        """

        if df is None:
            if by_method == 'author':
                df = self.retrieve_institution_users(ignore_admin=False)
            if by_method == 'article':
                df = self.retrieve_institution_articles()

        total_dict = dict()
        for i in df.index:
            print(f"{i+1} of {len(df.index)}")
            record = df.loc[i]
            if by_method == 'author':
                first_name = record['first_name']
                last_name = record['last_name']
                author_id = record['author_id']
                total_dict[f"{first_name} {last_name} ({author_id})"] = \
                    self.get_user_totals(author_id)
            if by_method == 'article':
                total_dict[f"{record['id']}"] = self.get_totals(record['id'],
                                                                item='article',
                                                                institution=False)
        # Construct pandas DataFrame
        total_df = pd.DataFrame.from_dict(total_dict, orient='index')
        return total_df
118 changes: 118 additions & 0 deletions figstats/visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from typing import Union
from datetime import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.dates as m_dates
from matplotlib import figure, axes

from textwrap import wrap

title_width = 75


def matplotlib_date_format(date_list: list) -> list:
    """Convert a list of 'YYYY-MM-DD' date strings to datetime objects."""
    return [dt.strptime(entry, '%Y-%m-%d') for entry in date_list]


def plot_shares(ax: axes.Axes, timeline_dict: dict):
    """Mark share events on *ax* as red vertical lines labeled with counts.

    Only dates with a nonzero share count are drawn; does nothing otherwise.
    """
    shares_dict = timeline_dict['shares']
    active_days = [day for day in shares_dict.keys() if shares_dict[day] > 0]

    if not active_days:
        return

    for x_pos, day in zip(matplotlib_date_format(active_days), active_days):
        ax.axvline(x=x_pos, color='red')
        ax.text(x_pos, 1, f"{shares_dict[day]}", color='red',
                ha='right', va='bottom')


def plot_timeline(timeline_dict: dict, article_dict: dict,
                  out_pdf: str = '', save: bool = False) \
        -> Union[None, figure.Figure]:
    """
    Plot timeline showing views and downloads

    :param timeline_dict: Contains daily and cumulative numbers.
        From ``stats.Figshare.get_timeline``
    :param article_dict: Contains articles details.
        From ``stats.Figshare.retrieve_article_details``
    :param out_pdf: Output filename. Default: `timeline_<article_id>.pdf`
    :param save: Flag to save PDF file. Otherwise returns ``matplotlib`` fig object

    :return fig: If save == False, fig is returned
    """
    # Dates come from the views timeline; get_timeline makes all counters
    # share the same set of dates
    datetime_list = matplotlib_date_format(list(timeline_dict['views'].keys()))
    fig, [ax0, ax1] = plt.subplots(ncols=2, nrows=2,
                                   gridspec_kw={'height_ratios': [3, 1]})

    counters = ['views', 'downloads']

    for ii, counter in enumerate(counters):

        # Bottom panels: daily numbers as a bar chart
        y_bottom = timeline_dict[counter].values()
        ax1[ii].bar(datetime_list, y_bottom)
        locator = m_dates.AutoDateLocator(minticks=3, maxticks=7)
        formatter = m_dates.ConciseDateFormatter(locator)
        ax1[ii].xaxis.set_major_locator(locator)
        ax1[ii].xaxis.set_major_formatter(formatter)
        ax1[ii].set_ylabel(f"Daily {counter}")
        ax1[ii].tick_params(axis='y', direction='in')
        ax1[ii].tick_params(axis='x', direction='out')
        # default=0 guards against an empty timeline (max() of empty raises)
        ax1[ii].annotate(f'Maximum daily {counter}: {max(y_bottom, default=0)}',
                         (0.025, 0.95), xycoords='axes fraction',
                         va='top', ha='left')
        ax1[ii].set_ylim(bottom=0)

        # Top panels: cumulative numbers as a line plot
        y_top = timeline_dict[counter + '-cum'].values()
        ax0[ii].plot(datetime_list, y_top, linestyle='-', linewidth=2.0,
                     marker='')
        ax0[ii].xaxis.set_major_locator(locator)
        ax0[ii].xaxis.set_major_formatter(formatter)
        ax0[ii].set_xticklabels('')  # hide x labels; bottom panel shows dates
        ax0[ii].set_ylabel(f"Cumulative {counter}")
        ax0[ii].tick_params(axis='both', direction='in')
        ax0[ii].annotate(f'Total {counter}: {max(y_top, default=0)}',
                         (0.025, 0.975),
                         xycoords='axes fraction', va='top', ha='left')
        ax0[ii].set_ylim(bottom=0)

        plot_shares(ax0[ii], timeline_dict)

    # Heading containing title, author, license, DOI
    # Initialize first so an empty title (wrap() -> []) cannot cause a
    # NameError on the later concatenations
    left_heading = ''
    title_chunks = wrap(article_dict['title'], title_width)
    for cc, chunk in enumerate(title_chunks):
        if cc == 0:
            left_heading += f"Title: {chunk}\n"
        else:
            left_heading += f" {chunk}\n"
    author_list = [auth_dict['full_name'] for auth_dict in article_dict['authors']]
    if len(author_list) > 3:
        # Long author lists are abbreviated to the first author
        left_heading += f"Authors: {author_list[0]} et al.\n"
    else:
        left_heading += f"Authors: {' '.join(author_list)}\n"
    left_heading += f"License: {article_dict['license']['name']} "
    left_heading += f"DOI: https://doi.org/{article_dict['doi']}"
    ax0[0].text(0.01, 1.25, left_heading, ha='left', va='top',
                transform=ax0[0].transAxes)

    # Shares total goes in the upper right
    right_heading = f"Shares: {max(timeline_dict['shares-cum'].values(), default=0)}"
    ax0[1].text(1.0, 1.25, right_heading, ha='right', va='top',
                transform=ax0[1].transAxes)

    fig.set_size_inches(8, 6)
    plt.subplots_adjust(left=0.09, bottom=0.08, top=0.85, right=0.985,
                        hspace=0.025)

    if save:
        if not out_pdf:
            out_pdf = f"timeline_{article_dict['id']}.pdf"
        fig.savefig(out_pdf)
    else:
        return fig
Loading