From af9030e6ec52460420389b0c7ad93268b76437a9 Mon Sep 17 00:00:00 2001 From: Milind Kamble Date: Fri, 24 Aug 2018 10:42:47 -0500 Subject: [PATCH 1/3] Generic ofx2dataframe converter capable of handling multiple inputs Created unittest for the same. Modified utils/ofx2xlsx.py script to use the converter and support output in csv or xlsx format --- ofxparse/ofxtodataframe.py | 44 +++++++++++++++++++++ tests/test_parse.py | 17 +++++++- utils/ofx2xlsx.py | 79 ++++++++++++++++++-------------------- 3 files changed, 97 insertions(+), 43 deletions(-) create mode 100644 ofxparse/ofxtodataframe.py diff --git a/ofxparse/ofxtodataframe.py b/ofxparse/ofxtodataframe.py new file mode 100644 index 0000000..04b44dd --- /dev/null +++ b/ofxparse/ofxtodataframe.py @@ -0,0 +1,44 @@ +from ofxparse import OfxParser +import pandas as pd +import codecs +import os.path as path + +# fields of transactions are auto extracted using dir(transactiontype)-{attributes starting with '_'} + +def ofx_to_dataframe(files, id_len=24): + collected_df={} + if type(files) is str: + files = [files] + assert(isinstance(files, list)) + for fname in files: + data = {} + with codecs.open(fname) as fileobj: + ofx = OfxParser.parse(fileobj) + # it seems one ofx file contains only one securities list. Create a mapping from ID to ticker + security_map = {x.uniqueid : x.ticker for x in ofx.security_list} + # different transaction types have different fields. 
So we create df for each txn_type + # and append the contents of each txn into appropriate df + for account in ofx.accounts: + for transaction in account.statement.transactions: + txn_type = type(transaction).__name__ + if not txn_type in data: + fields = [x for x in dir(transaction) if not x.startswith('_')] + data[txn_type] = pd.DataFrame(columns=fields) + df = data[txn_type] + fields = set(df.columns) + sr = pd.Series([getattr(transaction,f) for f in fields], index=fields) + data[txn_type] = df.append(sr, ignore_index=True) + # add fname, acctnum common info into each df. Truncate ID if needed + for key,df in data.items(): + df['fname'] = path.basename(fname) + df['id'] = df['id'].str[:id_len] # clip the last part of the ID which changes from download to download + df['acctnum']=account.number + if 'security' in df.columns: + df['security'] = df['security'].apply(lambda x: security_map[x]) + if 'AGGREGATE_TYPES' in df.columns : + del df['AGGREGATE_TYPES'] + if key in collected_df: + collected_df[key] = collected_df[key].append(df, ignore_index=True) + else: + collected_df[key] = df + return collected_df diff --git a/tests/test_parse.py b/tests/test_parse.py index 78bd779..dc9dd78 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -12,7 +12,8 @@ from .support import open_file from ofxparse import OfxParser, AccountType, Account, Statement, Transaction from ofxparse.ofxparse import OfxFile, OfxPreprocessedFile, OfxParserException, soup_maker - +from ofxparse.ofxtodataframe import ofx_to_dataframe +import glob class TestOfxFile(TestCase): OfxFileCls = OfxFile @@ -1049,6 +1050,20 @@ def testFailure(self): self.assertEqual(ofx.signon.severity, 'ERROR') self.assertEqual(ofx.signon.message, 'Your request could not be processed because you supplied an invalid identification code or your password was incorrect') +class TestOfxToDataFrame(TestCase): + def testSingleFile(self): + dfs = ofx_to_dataframe('tests/fixtures/fidelity.ofx') + 
self.assertEqual(sorted(dfs), ['InvestmentTransaction', 'Transaction']) + self.assertEqual(len(dfs['InvestmentTransaction']), 14) + self.assertEqual(len(dfs['Transaction']), 3) + + def testMultipleFiles(self): + dfs = ofx_to_dataframe(['tests/fixtures/fidelity.ofx', 'tests/fixtures/investment_401k.ofx']) + self.assertEqual(sorted(dfs), ['InvestmentTransaction', 'Transaction']) + self.assertEqual(len(dfs['InvestmentTransaction']), 17) + self.assertEqual(len(dfs['Transaction']), 3) + + if __name__ == "__main__": import unittest unittest.main() diff --git a/utils/ofx2xlsx.py b/utils/ofx2xlsx.py index 32b19c6..05d7515 100644 --- a/utils/ofx2xlsx.py +++ b/utils/ofx2xlsx.py @@ -1,57 +1,51 @@ -from ofxparse import OfxParser +from ofxparse.ofxtodataframe import ofx_to_dataframe import pandas as pd +from pandas import ExcelWriter import argparse -# TODO automatically extract from transactions -fields = ['id','type', 'date', 'memo', 'payee', 'amount', 'checknum', 'mcc'] - +# ToDo: Remove duplicate transactions from different files parser = argparse.ArgumentParser(description='Convert multiple .qfx or .ofx to' - ' .xlsx.\n' - 'Remove duplicate transactions ' - 'from different files.\n' - 'use fixed columns:' - ' %s'%', '.join(fields)) + ' .xlsx or csv.\n') parser.add_argument('files', metavar='*.ofx *.qfx', type=str, nargs='+', - help='.qfx or .ofx file names') -parser.add_argument('--start', type=str, metavar='2014-01-01', - default='2014-01-01', - help="Don't take transaction before this date") -parser.add_argument('--end', type=str, metavar='2014-12-31', - default='2014-12-31', +help='.qfx or .ofx file names') +parser.add_argument('--start', type=str, metavar='1700-01-01', + default='1700-01-01', + help="Don't take transaction before this date") +parser.add_argument('--end', type=str, metavar='3000-12-31', + default='3000-12-31', help="Don't take transaction after this date") -parser.add_argument('--output', metavar='output.xlsx', type=str, - default='output.xlsx', 
help='Were to store the xlsx') +parser.add_argument('-o', '--output', metavar='output.csv', type=str, + default='output.csv', help='Were to store the output. Extension determines output format') parser.add_argument('--id-length', metavar='24', type=int, default=24, - help='Truncate the number of digits in a transaction ID.' - ' This is important because this program remove' - ' transactions with duplicate IDs (after verifing' - ' that they are identical.' - ' If you feel unsafe then use a large number but' - 'usually the last digits of the transaction ID are' - 'running numbers which change from download to download' - ' as a result you will have duplicate transactions' - ' unless you truncate the ID.') + help='Truncate the number of digits in a transaction ID.' + ' This is important because this program remove' + ' transactions with duplicate IDs (after verifing' + ' that they are identical.' + ' If you feel unsafe then use a large number but' + 'usually the last digits of the transaction ID are' + 'running numbers which change from download to download' + ' as a result you will have duplicate transactions' + ' unless you truncate the ID.') args = parser.parse_args() - -data = {} -for fname in args.files: - ofx = OfxParser.parse(file(fname)) - for account in ofx.accounts: - df = data.get(account.number, pd.DataFrame(columns=fields+['fname'])) - for transaction in account.statement.transactions: - s = pd.Series([getattr(transaction,f) for f in fields], index=fields) - s['fname'] = fname.split('/')[-1] - df = df.append(s, ignore_index=True) - df['id'] = df['id'].str[:args.id_length] # clip the last part of the ID which changes from download to download - data[account.number] = df - -print "Writing result to", args.output -writer = pd.ExcelWriter(args.output) - +data = ofx_to_dataframe(args.files) + +if 'csv' in args.output: + outstring = "" + for key,df in data.items(): + outstring += "##### %s".format(key) + df.to_csv(None, index=False, header=True) + with 
open(args.output, 'w') as fileobj: + print(outstring, file=fileobj) +elif 'xlsx' in args.output: + writer = pd.ExcelWriter(args.output) + for key,df in data.items(): + df.to_excel(writer, sheet_name=key) + writer.save() + +__dev_notes__ = ''' for account_number, df in data.iteritems(): # A transaction is identified using all `fields` # collapse all repeated transactions from the same file into one row @@ -88,3 +82,4 @@ df2.to_excel(writer, account_number, index=False) writer.save() +''' From e94fb136751990fa0d2751343146abed88c531c6 Mon Sep 17 00:00:00 2001 From: Milind Kamble Date: Sun, 16 Sep 2018 23:42:11 -0500 Subject: [PATCH 2/3] Created Cash 'Position' for regular (i.e. banking) statement Thus investment securities and cash balance positions are consolidated into 'Positions' dataframe Fixed bug associated with attaching acctname to transactions. --- ofxparse/ofxtodataframe.py | 47 +++++++++++++++++++++++++++----------- utils/ofx2xlsx.py | 17 ++++++++++---- 2 files changed, 47 insertions(+), 17 deletions(-) mode change 100644 => 100755 utils/ofx2xlsx.py diff --git a/ofxparse/ofxtodataframe.py b/ofxparse/ofxtodataframe.py index 04b44dd..39682ae 100644 --- a/ofxparse/ofxtodataframe.py +++ b/ofxparse/ofxtodataframe.py @@ -2,24 +2,28 @@ import pandas as pd import codecs import os.path as path +import sys, warnings +import decimal # fields of transactions are auto extracted using dir(transactiontype)-{attributes starting with '_'} -def ofx_to_dataframe(files, id_len=24): +def ofx_to_dataframe(fileobjs, id_len=24): collected_df={} - if type(files) is str: - files = [files] - assert(isinstance(files, list)) - for fname in files: + assert(isinstance(fileobjs, list)) + for fileobj in fileobjs: data = {} - with codecs.open(fname) as fileobj: - ofx = OfxParser.parse(fileobj) + + #with codecs.open(fname) as fileobj: + # ofx = OfxParser.parse(fileobj) + ofx = OfxParser.parse(fileobj) # it seems one ofx file contains only one securities list. 
Create a mapping from ID to ticker - security_map = {x.uniqueid : x.ticker for x in ofx.security_list} + if hasattr(ofx, 'security_list'): + security_map = {x.uniqueid : x.ticker for x in ofx.security_list} # different transaction types have different fields. So we create df for each txn_type # and append the contents of each txn into appropriate df for account in ofx.accounts: - for transaction in account.statement.transactions: + for transaction in account.statement.transactions + \ + (hasattr(account.statement, 'positions') and account.statement.positions or []): txn_type = type(transaction).__name__ if not txn_type in data: fields = [x for x in dir(transaction) if not x.startswith('_')] @@ -27,12 +31,29 @@ def ofx_to_dataframe(files, id_len=24): df = data[txn_type] fields = set(df.columns) sr = pd.Series([getattr(transaction,f) for f in fields], index=fields) + sr = pd.Series({f:transaction.f} for f in fields) + sr['acctnum'] = account.number data[txn_type] = df.append(sr, ignore_index=True) - # add fname, acctnum common info into each df. Truncate ID if needed + if hasattr(account, 'balance'): + txn_type = 'Positions' + if not txn_type in data: + fields = ['date', 'market_value', 'security', 'unit_price', 'units'] + data[txn_type] = pd.DataFrame(columns=fields) + df = data[txn_type] + fields = set(df.columns) + sr = pd.Series({ + 'date' :statement.end_date, + 'security' : 'Cash', + 'units' :statement.balance, + 'unit_price' : decimal.Decimal('1.00')}, index=fields) + sr['acctnum'] = account.number + data[txn_type] = df.append(sr, ignore_index=True) + + # add fname info into each df. 
Truncate ID if needed for key,df in data.items(): - df['fname'] = path.basename(fname) - df['id'] = df['id'].str[:id_len] # clip the last part of the ID which changes from download to download - df['acctnum']=account.number + df['fname'] = hasattr(fileobj, 'name') and fileobj.name or 'stdin' + if 'id' in df.columns: + df['id'] = df['id'].str[:id_len] # clip the last part of the ID which changes from download to download if 'security' in df.columns: df['security'] = df['security'].apply(lambda x: security_map[x]) if 'AGGREGATE_TYPES' in df.columns : diff --git a/utils/ofx2xlsx.py b/utils/ofx2xlsx.py old mode 100644 new mode 100755 index 05d7515..f8fc09c --- a/utils/ofx2xlsx.py +++ b/utils/ofx2xlsx.py @@ -1,13 +1,18 @@ +#!/usr/bin/env python3 +import warnings +warnings.filterwarnings("ignore", message="numpy.dtype size changed") + from ofxparse.ofxtodataframe import ofx_to_dataframe import pandas as pd from pandas import ExcelWriter - +import sys import argparse +from io import StringIO # ToDo: Remove duplicate transactions from different files parser = argparse.ArgumentParser(description='Convert multiple .qfx or .ofx to' ' .xlsx or csv.\n') -parser.add_argument('files', metavar='*.ofx *.qfx', type=str, nargs='+', +parser.add_argument('files', type=argparse.FileType('r'), nargs='+', #;metavar='*.ofx *.qfx', default=[], type=str, nargs='+', help='.qfx or .ofx file names') parser.add_argument('--start', type=str, metavar='1700-01-01', default='1700-01-01', @@ -30,13 +35,17 @@ args = parser.parse_args() - +if 'stdin' in args.files[0].name: + fp=args.files[0] + args.files=[StringIO(fp.read())] data = ofx_to_dataframe(args.files) if 'csv' in args.output: outstring = "" for key,df in data.items(): - outstring += "##### %s".format(key) + df.to_csv(None, index=False, header=True) + outstring += "##### {}\n".format(key) + df.to_csv(None, index=False, header=True) + if args.output=='output.csv': + print(outstring) with open(args.output, 'w') as fileobj: print(outstring, 
file=fileobj) elif 'xlsx' in args.output: From 6d377d1857b9a33590695ad69db919107b25113d Mon Sep 17 00:00:00 2001 From: Milind Kamble Date: Sun, 19 Apr 2020 22:39:46 -0500 Subject: [PATCH 3/3] more improvements --- ofxparse/ofxtodataframe.py | 48 ++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/ofxparse/ofxtodataframe.py b/ofxparse/ofxtodataframe.py index 39682ae..621cd51 100644 --- a/ofxparse/ofxtodataframe.py +++ b/ofxparse/ofxtodataframe.py @@ -17,37 +17,43 @@ def ofx_to_dataframe(fileobjs, id_len=24): # ofx = OfxParser.parse(fileobj) ofx = OfxParser.parse(fileobj) # it seems one ofx file contains only one securities list. Create a mapping from ID to ticker + security_map = {} if hasattr(ofx, 'security_list'): - security_map = {x.uniqueid : x.ticker for x in ofx.security_list} + security_map.update({x.uniqueid : x.ticker for x in ofx.security_list}) # different transaction types have different fields. So we create df for each txn_type # and append the contents of each txn into appropriate df for account in ofx.accounts: for transaction in account.statement.transactions + \ (hasattr(account.statement, 'positions') and account.statement.positions or []): txn_type = type(transaction).__name__ + transaction.acctnum = account.number if not txn_type in data: fields = [x for x in dir(transaction) if not x.startswith('_')] data[txn_type] = pd.DataFrame(columns=fields) df = data[txn_type] fields = set(df.columns) - sr = pd.Series([getattr(transaction,f) for f in fields], index=fields) - sr = pd.Series({f:transaction.f} for f in fields) - sr['acctnum'] = account.number + sr = pd.Series({f: getattr(transaction,f) for f in fields}) data[txn_type] = df.append(sr, ignore_index=True) - if hasattr(account, 'balance'): - txn_type = 'Positions' - if not txn_type in data: - fields = ['date', 'market_value', 'security', 'unit_price', 'units'] - data[txn_type] = pd.DataFrame(columns=fields) - df = data[txn_type] - fields = 
set(df.columns) + + # add cash balance as a "Cash" position + cash_amount = None + if hasattr(account.statement, 'balance'): + cash_amount = account.statement.balance + dt = account.statement.balance_date + elif hasattr(account.statement, 'available_cash'): + cash_amount = account.statement.available_cash + dt = account.statement.end_date + if cash_amount is not None: + df = data.get('Position', + pd.DataFrame(columns=['date', 'market_value', 'security', 'unit_price', 'units', 'acctnum'])) sr = pd.Series({ - 'date' :statement.end_date, - 'security' : 'Cash', - 'units' :statement.balance, - 'unit_price' : decimal.Decimal('1.00')}, index=fields) - sr['acctnum'] = account.number - data[txn_type] = df.append(sr, ignore_index=True) + 'date' : dt, + 'security' : account.curdef, + 'market_value': cash_amount, + 'units' : cash_amount, + 'unit_price' : decimal.Decimal('1.00'), + 'acctnum' : account.number}) + data['Position'] = df.append(sr, ignore_index=True) # add fname info into each df. Truncate ID if needed for key,df in data.items(): @@ -55,7 +61,7 @@ def ofx_to_dataframe(fileobjs, id_len=24): if 'id' in df.columns: df['id'] = df['id'].str[:id_len] # clip the last part of the ID which changes from download to download if 'security' in df.columns: - df['security'] = df['security'].apply(lambda x: security_map[x]) + df['security'] = df['security'].apply(lambda x: security_map.get(x, x)) if 'AGGREGATE_TYPES' in df.columns : del df['AGGREGATE_TYPES'] if key in collected_df: @@ -63,3 +69,9 @@ def ofx_to_dataframe(fileobjs, id_len=24): else: collected_df[key] = df return collected_df + +__dev_notes__=''' +For brokerage, balances are available in account.statement.balance_list... but overall cash is also summarized in account.statement.available_cash corresponding to statement.end_date +For bank, balance is available in account.statement.balance (and balance_date) + +'''