Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c91a618
Add API code #1
astrochun Nov 18, 2020
a06513d
stats.Figshare: Add institution in get_totals #1
astrochun Nov 18, 2020
293b0c4
stats.Figshare: Add get_user_totals and get_timeline methods
astrochun Dec 2, 2020
ef31020
A number of changes
astrochun Dec 3, 2020
8e7b485
stats.Figshare: Add get_figshare_id and retrieve_institution_users me…
astrochun Dec 3, 2020
7eacb85
stats.Figshare: Add get_institution_totals
astrochun Dec 3, 2020
91747b5
Loop over all authors, include author name
astrochun Dec 3, 2020
bb680da
stats.Figshare: Add retrieve_institution_articles method
astrochun Dec 3, 2020
6560567
stats.Figshare: Sort timeline by date in get_timeline method
astrochun Dec 4, 2020
a8dd2c1
stats.Figshare: Add cumulative numbers for views, downloads, shares i…
astrochun Dec 4, 2020
e4fe93e
stats.Figshare: Ensure timeline is the same for all records
astrochun Dec 4, 2020
f01e556
Minor fix to zero daily numbers if not in timeline
astrochun Dec 8, 2020
a77df39
stats: Add retrieve_article_details method
astrochun Dec 8, 2020
5cf4adb
Add visualization module
astrochun Dec 8, 2020
355c88f
Add option to save PDF file or return matplotlib fig instance
astrochun Dec 8, 2020
4f1783e
visualization: Add plot_shares method
astrochun Dec 8, 2020
229618e
Add numpy to requirements
astrochun Dec 8, 2020
1cd267e
Exclude testing scripts
astrochun Dec 8, 2020
9347032
Refactor to use counter_list
astrochun Dec 9, 2020
897c21d
Change cumulative panels from bar to line
astrochun Dec 9, 2020
9ef5203
Baseline script for multi-timeline retrieval
astrochun Dec 9, 2020
1943cfa
Fix case when timeline is not available (e.g., shares)
astrochun Dec 10, 2020
968b6b4
Use textwrap to handle long title
astrochun Dec 10, 2020
64a7a1d
Reduce title_width to give room for shares in upper right
astrochun Dec 10, 2020
2e8590a
Add version info
astrochun Mar 3, 2021
0442e8b
Add HTTPException handling
astrochun Mar 3, 2021
326aff2
make_timeline_plots: Adjust for PEP8 width
astrochun Mar 9, 2021
cfcf095
visualization: type hinting
astrochun Mar 9, 2021
218a596
make_timeline_plots: Temporarily handle UnicodeEncodeError
astrochun Mar 9, 2021
973048b
Add PyCharm files to .gitignore
astrochun Mar 9, 2021
b06ce5d
Adjust argparse inputs with hyphen format
astrochun Jul 27, 2021
d281996
Fix to ensure using a raw JSON response with no basic token
astrochun Jul 27, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,8 @@ dmypy.json

# Pyre type checker
.pyre/

testing.py

# PyCharm
.idea/
1 change: 1 addition & 0 deletions figstats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = '0.0.1'
52 changes: 52 additions & 0 deletions figstats/commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import json
import requests
from requests.exceptions import HTTPError


def issue_request(method, url, headers, data=None, binary=False,
                  params=None):
    """Issue an HTTP request and return the decoded response body.

    Parameters
    ----------
    method : str
        HTTP method. One of GET, PUT, POST or DELETE

    url : str
        URL for the request

    headers: dict
        HTTP header information

    data: dict
        Figshare article data

    binary: bool
        Whether data is binary or not

    params: dict
        Additional information for URL GET request

    Returns
    -------
    response_data: dict
        JSON response for the request returned as python dict
        (raw ``bytes`` content when the body is not valid JSON)
    """
    payload = data
    if payload is not None and not binary:
        # Non-binary payloads are serialized to a JSON string before sending
        payload = json.dumps(payload)

    response = requests.request(method, url, headers=headers,
                                data=payload, params=params)

    try:
        response.raise_for_status()
    except HTTPError as error:
        print('Caught an HTTPError: {}'.format(error))
        print('Body:\n', response.text)
        raise

    try:
        return json.loads(response.text)
    except ValueError:
        # Body is not JSON; hand back the raw content untouched
        return response.content
214 changes: 214 additions & 0 deletions figstats/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
from os.path import join
import pandas as pd

from .commons import issue_request

counter_list = ['views', 'downloads', 'shares']


class Figshare:
    """
    Purpose:
      A Python interface to work with the Figshare statistics endpoint

    Fixes over the original: URLs are joined with forward slashes instead of
    ``os.path.join`` (which emits backslashes on Windows), ``None`` checks use
    ``is None``, and ``get_timeline`` no longer crashes when a timeline is
    missing (``None``) in the API response.
    """

    def __init__(self, api_token='', basic_token='', institution=False, institute=''):
        """
        :param api_token: Figshare OAuth token for the main API
        :param basic_token: Base64 token for the stats API
        :param institution: Flag to enable institutional endpoints
        :param institute: Institution name used in stats URLs
        """

        # For stats API
        self.stats_baseurl = 'https://stats.figshare.com'
        self.institution = institution
        # NOTE(review): ``institute``/``stats_baseurl_institute`` are only set
        # when an institute name is given; institution-scoped calls will raise
        # AttributeError otherwise (original behavior preserved)
        if institute:
            self.institute = institute
            self.stats_baseurl_institute = self._url_join(self.stats_baseurl,
                                                          self.institute)

        # Base64 token
        self.basic_headers0 = {'Content-Type': 'application/json'}

        self.basic_headers = self.basic_headers0.copy()
        self.basic_token = basic_token
        if self.basic_token:
            self.basic_headers['Authorization'] = f'Basic {self.basic_token}'

        # For Figshare API
        self.main_baseurl = 'https://api.figshare.com/v2/account/'
        if self.institution:
            self.main_baseurl_institute = self._url_join(self.main_baseurl,
                                                         "institution")

        # API token
        self.api_headers = {'Content-Type': 'application/json'}
        self.api_token = api_token
        if self.api_token:
            self.api_headers['Authorization'] = f'token {self.api_token}'

    @staticmethod
    def _url_join(*parts):
        """Join URL pieces with '/' regardless of platform.

        Replaces ``os.path.join``, which would use backslashes on Windows.
        """
        return '/'.join(str(part).strip('/') for part in parts)

    def stats_endpoint(self, link, institution=False):
        """Construct a stats API URL, optionally scoped to the institution"""
        if institution:
            return self._url_join(self.stats_baseurl_institute, link)
        else:
            return self._url_join(self.stats_baseurl, link)

    def get_totals(self, item_id, item='article', institution=False):
        """
        Retrieve totals of views, downloads, and share for an "item"
        Item can be 'article', 'author', 'collection', 'group' or 'project'
        Note: This does not require authenticating credentials for institution accounts

        See: https://docs.figshare.com/#stats_totals

        :raises ValueError: If ``item`` is not a recognized item type
        :return: dict with 'views', 'downloads', 'shares' totals
        """

        if item not in ['article', 'author', 'collection', 'group', 'project']:
            raise ValueError("Incorrect item type")

        total_dict = {}
        for counter in counter_list:
            # Using non-institution one since that seems to give correct stats
            url = self.stats_endpoint(
                self._url_join('total', counter, item, item_id),
                institution=institution)
            result = issue_request('GET', url, headers=self.basic_headers)
            total_dict[counter] = result['totals']
        return total_dict

    def get_user_totals(self, author_id):
        """
        Retrieve an author's total using get_totals()

        :param author_id: This is not the same as the institution_user_id for institutional accounts
        :return: total_dict: dict containing total views, downloads, and shares
        Note: This does not require authenticating credentials for institution accounts
        """
        total_dict = self.get_totals(author_id, item='author',
                                     institution=False)
        return total_dict

    def get_timeline(self, item_id, item='article', granularity='day',
                     institution=False):
        """
        Retrieve daily and cumulative views/downloads/shares timelines

        The 'views' timeline (generally the most populated) defines the set of
        dates used for every counter so all records share the same timeline.

        :param item_id: Figshare identifier of the item
        :param item: Item type (e.g. 'article', 'author')
        :param granularity: Timeline granularity (e.g. 'day')
        :param institution: Use the institution-scoped stats endpoint
        :return: dict keyed by counter and ``<counter>-cum`` for cumulative
        """
        timeline_dict = {}
        save_date = []  # dates anchoring all counters; set from 'views'
        for counter in counter_list:
            # Using non-institution one since that seems to give correct stats
            url = self.stats_endpoint(
                self._url_join('timeline', granularity, counter, item, item_id),
                institution=institution)
            result = issue_request('GET', url, headers=self.basic_headers)
            # The API returns None instead of a dict when no data is available
            timeline = result['timeline']
            if counter == 'views':
                # Guard against a missing views timeline (was a TypeError)
                save_date = sorted(timeline) if timeline is not None else []
            result_sort = {}   # daily numbers, sorted by date
            cum_dict = {}      # running cumulative numbers
            count = 0
            for key in save_date:
                if timeline is None:
                    # Counter not available for this item; zero-fill
                    result_sort[key] = 0
                else:
                    try:
                        result_sort[key] = timeline[key]
                        count += timeline[key]
                    except KeyError:
                        # Date absent for this counter; zero the daily number
                        result_sort[key] = 0
                cum_dict[key] = count
            timeline_dict[counter] = result_sort
            timeline_dict[f"{counter}-cum"] = cum_dict
        return timeline_dict

    def get_figshare_id(self, accounts_df):
        """
        Retrieve Figshare account ID(s)
        Note: This is not the institutional ID, but one associated with
        the unique profile

        :param accounts_df: pandas DataFrame containing institution ID
        :return: accounts_df: The input DataFrame with an additional column
        """

        endpoint = self._url_join(self.main_baseurl_institute, "users")

        author_id = []
        for institute_id in accounts_df['id']:
            url = f"{endpoint}/{institute_id}"
            response = issue_request('GET', url, self.api_headers)
            author_id.append(response['id'])
        accounts_df['author_id'] = author_id
        return accounts_df

    def retrieve_institution_users(self, ignore_admin=False):
        """
        Retrieve accounts within institutional instance

        This is based on LD-Cool-P get_account_list method of FigshareInstituteAdmin
        It includes retrieving the default author_id

        It uses:
          https://docs.figshare.com/#private_institution_accounts_list
          https://docs.figshare.com/#private_account_institution_user

        :param ignore_admin: Exclude administrative and test accounts
        :return: pandas DataFrame of institution accounts
        """
        url = self._url_join(self.main_baseurl_institute, "accounts")

        # Figshare API is limited to a maximum of 1000 per page
        params = {'page': 1, 'page_size': 1000}
        accounts = issue_request('GET', url, self.api_headers, params=params)

        accounts_df = pd.DataFrame(accounts)
        accounts_df = accounts_df.drop(columns='institution_id')

        if ignore_admin:
            print("Excluding administrative and test accounts")

            drop_index = list(accounts_df[accounts_df['email'] ==
                              'data-management@email.arizona.edu'].index)
            drop_index += list(accounts_df[accounts_df['email'].str.contains(
                '-test@email.arizona.edu')].index)

            accounts_df = accounts_df.drop(drop_index).reset_index(drop=True)

        accounts_df = self.get_figshare_id(accounts_df)

        return accounts_df

    def retrieve_institution_articles(self):
        """Retrieve published articles within the institutional instance

        :return: pandas DataFrame of published articles
        """
        url = self._url_join(self.main_baseurl_institute, "articles")

        # Figshare API is limited to a maximum of 1000 per page
        params = {'page': 1,
                  'page_size': 1000}
        articles = issue_request('GET', url, self.api_headers, params=params)

        articles_df = pd.DataFrame(articles)

        # Only consider published dataset
        articles_df = articles_df.loc[articles_df['published_date'].notnull()]
        articles_df = articles_df.reset_index()
        return articles_df

    def retrieve_article_details(self, article_id):
        """Retrieve article details"""
        url = self._url_join('https://api.figshare.com/v2/',
                             f"articles/{article_id}")
        article_dict = issue_request('GET', url, self.basic_headers0)
        return article_dict

    def get_institution_totals(self, df=None, by_method='author'):
        """
        Retrieve total views, downloads, and shares by either authors or articles

        :param df: Optional pre-fetched DataFrame of users or articles
        :param by_method: 'author' or 'article'
        :return: pandas DataFrame of totals indexed by author/article label
        """

        if df is None:
            if by_method == 'author':
                df = self.retrieve_institution_users(ignore_admin=False)
            if by_method == 'article':
                df = self.retrieve_institution_articles()

        total_dict = dict()
        for i in df.index:
            print(f"{i+1} of {len(df.index)}")
            record = df.loc[i]
            if by_method == 'author':
                first_name = record['first_name']
                last_name = record['last_name']
                author_id = record['author_id']
                total_dict[f"{first_name} {last_name} ({author_id})"] = \
                    self.get_user_totals(author_id)
            if by_method == 'article':
                total_dict[f"{record['id']}"] = self.get_totals(record['id'],
                                                                item='article',
                                                                institution=False)
        # Construct pandas DataFrame
        total_df = pd.DataFrame.from_dict(total_dict, orient='index')
        return total_df
118 changes: 118 additions & 0 deletions figstats/visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from typing import Union
from datetime import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.dates as m_dates
from matplotlib import figure, axes

from textwrap import wrap

title_width = 75


def matplotlib_date_format(date_list: list) -> list:
    """Convert a list of 'YYYY-MM-DD' date strings to datetime objects."""
    return [dt.strptime(entry, '%Y-%m-%d') for entry in date_list]


def plot_shares(ax: axes.Axes, timeline_dict: dict):
    """Mark share events on *ax* as red vertical lines labeled with counts.

    Only dates with a nonzero share count are drawn; does nothing otherwise.
    """
    shares_dict = timeline_dict['shares']
    active_days = [day for day in shares_dict.keys() if shares_dict[day] > 0]

    if not active_days:
        return

    for x_pos, day in zip(matplotlib_date_format(active_days), active_days):
        ax.axvline(x=x_pos, color='red')
        ax.text(x_pos, 1, f"{shares_dict[day]}", color='red',
                ha='right', va='bottom')


def plot_timeline(timeline_dict: dict, article_dict: dict,
                  out_pdf: str = '', save: bool = False) \
        -> Union[None, figure.Figure]:
    """
    Plot timeline showing views and downloads

    :param timeline_dict: Contains daily and cumulative numbers.
        From ``stats.Figshare.get_timeline``
    :param article_dict: Contains articles details.
        From ``stats.Figshare.retrieve_article_details``
    :param out_pdf: Output filename. Default: `timeline_<article_id>.pdf`
    :param save: Flag to save PDF file. Otherwise returns ``matplotlib`` fig object

    :return fig: If save == False, fig is returned
    """
    # Dates come from the views timeline; get_timeline makes all counters
    # share the same set of dates
    datetime_list = matplotlib_date_format(list(timeline_dict['views'].keys()))
    fig, [ax0, ax1] = plt.subplots(ncols=2, nrows=2,
                                   gridspec_kw={'height_ratios': [3, 1]})

    counters = ['views', 'downloads']

    for ii, counter in enumerate(counters):

        # Bottom panels: daily numbers as a bar chart
        y_bottom = timeline_dict[counter].values()
        ax1[ii].bar(datetime_list, y_bottom)
        locator = m_dates.AutoDateLocator(minticks=3, maxticks=7)
        formatter = m_dates.ConciseDateFormatter(locator)
        ax1[ii].xaxis.set_major_locator(locator)
        ax1[ii].xaxis.set_major_formatter(formatter)
        ax1[ii].set_ylabel(f"Daily {counter}")
        ax1[ii].tick_params(axis='y', direction='in')
        ax1[ii].tick_params(axis='x', direction='out')
        # default=0 guards against an empty timeline (max() of empty raises)
        ax1[ii].annotate(f'Maximum daily {counter}: {max(y_bottom, default=0)}',
                         (0.025, 0.95), xycoords='axes fraction',
                         va='top', ha='left')
        ax1[ii].set_ylim(bottom=0)

        # Top panels: cumulative numbers as a line plot
        y_top = timeline_dict[counter + '-cum'].values()
        ax0[ii].plot(datetime_list, y_top, linestyle='-', linewidth=2.0,
                     marker='')
        ax0[ii].xaxis.set_major_locator(locator)
        ax0[ii].xaxis.set_major_formatter(formatter)
        ax0[ii].set_xticklabels('')  # hide x labels; bottom panel shows dates
        ax0[ii].set_ylabel(f"Cumulative {counter}")
        ax0[ii].tick_params(axis='both', direction='in')
        ax0[ii].annotate(f'Total {counter}: {max(y_top, default=0)}',
                         (0.025, 0.975),
                         xycoords='axes fraction', va='top', ha='left')
        ax0[ii].set_ylim(bottom=0)

        plot_shares(ax0[ii], timeline_dict)

    # Heading containing title, author, license, DOI
    # Initialize first so an empty title (wrap() -> []) cannot cause a
    # NameError on the later concatenations
    left_heading = ''
    title_chunks = wrap(article_dict['title'], title_width)
    for cc, chunk in enumerate(title_chunks):
        if cc == 0:
            left_heading += f"Title: {chunk}\n"
        else:
            left_heading += f" {chunk}\n"
    author_list = [auth_dict['full_name'] for auth_dict in article_dict['authors']]
    if len(author_list) > 3:
        # Long author lists are abbreviated to the first author
        left_heading += f"Authors: {author_list[0]} et al.\n"
    else:
        left_heading += f"Authors: {' '.join(author_list)}\n"
    left_heading += f"License: {article_dict['license']['name']} "
    left_heading += f"DOI: https://doi.org/{article_dict['doi']}"
    ax0[0].text(0.01, 1.25, left_heading, ha='left', va='top',
                transform=ax0[0].transAxes)

    # Shares total goes in the upper right
    right_heading = f"Shares: {max(timeline_dict['shares-cum'].values(), default=0)}"
    ax0[1].text(1.0, 1.25, right_heading, ha='right', va='top',
                transform=ax0[1].transAxes)

    fig.set_size_inches(8, 6)
    plt.subplots_adjust(left=0.09, bottom=0.08, top=0.85, right=0.985,
                        hspace=0.025)

    if save:
        if not out_pdf:
            out_pdf = f"timeline_{article_dict['id']}.pdf"
        fig.savefig(out_pdf)
    else:
        return fig
Loading