From 002fcaa1519cff48c76fdd241b11eb4125504364 Mon Sep 17 00:00:00 2001 From: Gary Date: Mon, 3 Nov 2025 10:08:56 +0000 Subject: [PATCH 1/6] Initial commit for parting by first character --- code/processes/wdb.q | 58 ++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/code/processes/wdb.q b/code/processes/wdb.q index 65e18c7a8..552a5e0f5 100644 --- a/code/processes/wdb.q +++ b/code/processes/wdb.q @@ -30,6 +30,9 @@ writedownmode:@[value;`writedownmode;`default]; /-the /- at EOD the data will be merged from each partition before being moved to hdb /- 3. partbyenum - the data is partitioned by [ partitiontype ] and a symbol or integer column with parted attribution assigned in sort.csv /- at EOD the data will be merged from each partition before being moved to hdb + /- 3. partbyfirstchar - the data is partitioned by [ partitiontype ] and a symbol column based on the first character with parted attribution assigned in sort.csv + /- at EOD the data will be sorted and merged from each partition before being moved to hdb + mergemode:@[value;`mergemode;`part]; /-the partbyattr writedown mode can merge data from temporary storage to the hdb in three ways: /- 1. part - the entire partition is merged to the hdb @@ -52,7 +55,7 @@ tpcheckcycles:@[value;`tpcheckcycles;0W]; /-num sorttypes:@[value;`sorttypes;`sort]; /-list of sort types to look for upon a sort sortworkertypes:@[value;`sortworkertypes;`sortworker]; /-list of sort types to look for upon a sort being called with worker process - +`partbyfirstchar wdbtypes:@[value;`wdbtypes;`wdb]; /-list of wdb types for sort processes to look for on initmissingtables subtabs:@[value;`subtabs;`]; /-list of tables to subscribe for @@ -92,7 +95,7 @@ saveenabled: any `save`saveandsort in mode; sortenabled: any `sort`saveandsort in mode; /- parted writedown modes have special behaviour during merging or WDB initialisation -partwritemodes:`partbyattr`partbyenum; +partwritemodes:`partbyattr`partbyenum`partbyfirstchar; / - log which modes are enabled switch: string `off`on; @@ -129,15 +132,16 @@ maptoint:{[val] upserttopartition:{[dir;tablename;tabdata;pt;expttype;expt;writedownmode] /- enumerate first extra partition value if[writedownmode~`partbyenum;i:maptoint first expt]; + if[writedownmode~`partbyfirstchar;i:$[(c1:first upper string first expt)in .Q.nA;first where c1=.Q.nA;count .Q.nA]]; /- create directory location for selected partition /- replace non-alphanumeric characters in symbols with _ /- convert to symbols and replace any null values with `TORQNULLSYMBOL - directory:$[writedownmode~`partbyenum; + directory:$[writedownmode in `partbyenum`partbyfirstchar; ` sv .Q.par[dir;pt;`$string i],tablename,`; - ` sv .Q.par[dir;pt;tablename],(`$"_"^.Q.an .Q.an?"_" sv string `TORQNULLSYMBOL^ ensuresymlist[expt]),`]; + ` sv .Q.par[dir;pt;tablename],(`$"_"^.Q.an .Q.an?"_" sv string `TORQNULLSYMBOL^ ensuresymlist[expt]),`]; .lg.o[`save;"saving ",(string tablename)," data to partition ",string directory]; /- selecting rows of table with matching partition - r:?[tabdata;$[writedownmode~`partbyenum;enlist(in;first expttype;expt);{(x;y;(),z)}[in;;]'[expttype;expt]];0b;()]; + r:?[tabdata;$[writedownmode in `partbyenum`partbyfirstchar;enlist(in;first expttype;expt);{(x;y;(),z)}[in;;]'[expttype;expt]];0b;()]; /- upsert selected data matched on partition to specific directory .[upsert;(directory;r);{[e] .lg.e[`savetablesbypart;"Failed to save table to disk : ",e];'e}]; .lg.o[`track;"appending details to partsizes"]; @@ -152,8 +156,8 @@ savetablesbypart:{[dir;pt;forcesave;tablename;writedownmode] .lg.o[`rowcheck;"the ",(string tablename)," table consists of ", (string arows), " rows"]; /- get additional partition(s) defined by parted attribute in sort.csv extrapartitiontype:.merge.getextrapartitiontype[tablename]; - if[(writedownmode~`partbyenum) and 1count each key each tnds[;1]; + $[tempfix1&0>system"s"; + [.lg.o[`sortandmerge;"sorting on worker sort", string .z.p]; + {(neg x)(`.wdb.reloadsymfile;y);(neg x)(::)}[;.Q.dd[hdbsettings `hdbdir;`sym]] each .z.pd[]; + {[x;compression] setcompression compression;.sort.sorttab x;if[gc;.gc.run[]]}[;hdbsettings`compression] peach tnds]; + [.lg.o[`sort;"sorting on main sort"]; + reloadsymfile[.Q.dd[hdbsettings `hdbdir;`sym]]; + {[x] .sort.sorttab[x];if[gc;.gc.run[]]} each tnds]]; + .lg.o[`sort;"finished sorting data"]; + endofdaymerge[dir;pt;tablist;mergelimits;hdbsettings;mergemethod;writedownmode]; + }; + /- end of day sort [depends on writedown mode] endofdaysort:{[dir;pt;tablist;writedownmode;mergelimits;hdbsettings;mergemethod] /- set compression level (.z.zd) setcompression[hdbsettings[`compression]]; $[writedownmode in partwritemodes; - endofdaymerge[dir;pt;tablist;mergelimits;hdbsettings;mergemethod;writedownmode]; + $[writedownmode~`partbyfirstchar; /-partbyfirstchar will not be sorted by sym within each parition, this needs done first + endofdaysortandmerge[dir;pt;tablist;mergelimits;hdbsettings;mergemethod;writedownmode]; + endofdaymerge[dir;pt;tablist;mergelimits;hdbsettings;mergemethod;writedownmode]]; endofdaysortdate[dir;pt;key tablist;hdbsettings] ]; /- run steps to rollover idb @@ -534,7 +560,7 @@ fixpartition:{[subto] ]; } -/- for writedown modes partbyenum/default we make sure that partition 0/currentpartition has all the tables. +/- for writedown modes partbyenum/partbyfirstchar/default we make sure that partition 0/currentpartition has all the tables. /- In that case we can use .Q.chk later to fill the db making it useable for intraday processes /- pt - date; partition for which the function should initialise initmissingtables:{[pt] @@ -550,7 +576,7 @@ filldb:{[pt] /- initialises table t in db with its schema in part inittable:{[t;pt] - tabledir:` sv $[writedownmode~`partbyenum; .Q.par[.Q.dd[hsym savedir;pt];0;t]; .Q.par[hsym savedir;pt;t]],`; + tabledir:` sv $[writedownmode in `partbyenum`partbyfirstchar; .Q.par[.Q.dd[hsym savedir;pt];0;t]; .Q.par[hsym savedir;pt;t]],`; if[() ~ key tabledir;tabledir set .Q.en[hsym hdbdir;0#value t]]; } @@ -590,7 +616,7 @@ getsortparams:{[] /- get the attributes csv file /-even if running with a sort process should read this file to cope with backups .sort.getsortcsv[sortcsv]; - /- check the sort.csv for parted attributes `p if the writedownmode `partbyattr or `partbyenum is selected + /- check the sort.csv for parted attributes `p if the writedownmode `partbyattr, `partbyenum or `partbyfirstchar is selected /- if each table does not have at least one `p attribute the process will exit if[writedownmode in partwritemodes; @@ -612,7 +638,7 @@ getsortparams:{[] /- If the function is ran on sort process send initmissingtables command to wdbs idbreload:{[pt] .lg.o[`idb;"starting idb reload"]; - if[writedownmode in `partbyenum`default; + if[writedownmode in `partbyenum`default`partbyfirstchar; .lg.o[`eod;"initialising wdbhdb for partition: ",string[pt]]; $[.proc.proctype~`sort;{[pt]ws:exec w from .servers.getservers[`proctype;wdbtypes;()!();1b;0b];{[ws;pt]ws(`.wdb.initmissingtables;[pt])}[;pt] each ws}[pt];initmissingtables[pt]]; .lg.o[`eod;"notifying idbs for newly created partition"]; From 803ecc5595f7b510a2d59c413084340c272d0142 Mon Sep 17 00:00:00 2001 From: Gary Date: Mon, 3 Nov 2025 11:57:48 +0000 Subject: [PATCH 2/6] Updated to account for data merge --- code/processes/wdb.q | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/processes/wdb.q b/code/processes/wdb.q index 552a5e0f5..29aaf96ec 100644 --- a/code/processes/wdb.q +++ b/code/processes/wdb.q @@ -367,7 +367,7 @@ merge:{[dir;pt;tableinfo;mergelimits;hdbsettings;mergemethod;writedownmode] .merge.mergehybrid[tableinfo;dest;partdirs;mergelimits[tabname]] ]; .lg.o[`merge;"removing segments ", (", " sv string[partdirs])]; - $[writedownmode in `partbyenum; + $[writedownmode in `partbyenum`partbyfirstchar; removetablefromenumdir each partdirs; .os.deldir .os.pth[[string[tabledir]]] ]; @@ -435,7 +435,7 @@ endofdaysortandmerge:{[dir;pt;tablist;mergelimits;hdbsettings;mergemethod;writed /- .z.pd function in finspace will cause an error. Add in this check to skip over the use of .z.pd. This should be temporary and will be removed when issue resolved by AWS. tempfix1:$[.finspace.enabled;0b;count[.z.pd[]]]; tnds:raze{y,/:.Q.dd[x;]each key[x],\:y}[.Q.dd[dir;pt]]each key tablist; - tnds where 0<>count each key each tnds[;1]; + tnds:tnds where tnds[;1] in exec ptdir from .merge.partsizes; $[tempfix1&0>system"s"; [.lg.o[`sortandmerge;"sorting on worker sort", string .z.p]; {(neg x)(`.wdb.reloadsymfile;y);(neg x)(::)}[;.Q.dd[hdbsettings `hdbdir;`sym]] each .z.pd[]; From fa7476f0494fd053b5b2502eb2a86791a3331b0f Mon Sep 17 00:00:00 2001 From: Gary Date: Mon, 3 Nov 2025 11:59:27 +0000 Subject: [PATCH 3/6] Removed trailing sym --- code/processes/wdb.q | 1 - 1 file changed, 1 deletion(-) diff --git a/code/processes/wdb.q b/code/processes/wdb.q index 29aaf96ec..895ef1bae 100644 --- a/code/processes/wdb.q +++ b/code/processes/wdb.q @@ -55,7 +55,6 @@ tpcheckcycles:@[value;`tpcheckcycles;0W]; /-num sorttypes:@[value;`sorttypes;`sort]; /-list of sort types to look for upon a sort sortworkertypes:@[value;`sortworkertypes;`sortworker]; /-list of sort types to look for upon a sort being called with worker process -`partbyfirstchar wdbtypes:@[value;`wdbtypes;`wdb]; /-list of wdb types for sort processes to look for on initmissingtables subtabs:@[value;`subtabs;`]; /-list of tables to subscribe for From 5fa7f29cc6b75011961c8bce7f5279124730dbd7 Mon Sep 17 00:00:00 2001 From: Gary Date: Tue, 4 Nov 2025 15:47:14 +0000 Subject: [PATCH 4/6] Added mapping function and documentation --- code/processes/idb.q | 7 ++++++ code/processes/wdb.q | 7 +++++- docs/Processes.md | 54 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/code/processes/idb.q b/code/processes/idb.q index b566da0bc..ee7ae3a7a 100644 --- a/code/processes/idb.q +++ b/code/processes/idb.q @@ -113,3 +113,10 @@ maptoint:{[val] /- if using a symbol column, enumerate against the hdb sym file sym?`TORQNULLSYMBOL^val] }; + +/- helper function to support queries against the sym column in partbyfirstchar +mapfctoint:{[val] + count[.Q.nA]^$[type[val]>0;raze first each where each .Q.nA=/:string[val][;0];first where .Q.nA=string[val][0]] + }; + + diff --git a/code/processes/wdb.q b/code/processes/wdb.q index 895ef1bae..a20a4073e 100644 --- a/code/processes/wdb.q +++ b/code/processes/wdb.q @@ -127,11 +127,16 @@ maptoint:{[val] `long$ (` sv hdbsettings[`hdbdir],`sym)?`TORQNULLSYMBOL^val] }; +mapfctoint:{[val] + count[.Q.nA]^$[type[val]>0;raze first each where each .Q.nA=/:string[val][;0];first where .Q.nA=string[val][0]] + }; + + /- function to upsert to specified directory upserttopartition:{[dir;tablename;tabdata;pt;expttype;expt;writedownmode] /- enumerate first extra partition value if[writedownmode~`partbyenum;i:maptoint first expt]; - if[writedownmode~`partbyfirstchar;i:$[(c1:first upper string first expt)in .Q.nA;first where c1=.Q.nA;count .Q.nA]]; + if[writedownmode~`partbyfirstchar;i:mapfctoint first expt]; /- create directory location for selected partition /- replace non-alphanumeric characters in symbols with _ /- convert to symbols and replace any null values with `TORQNULLSYMBOL diff --git a/docs/Processes.md b/docs/Processes.md index b25ea2750..b967084f8 100755 --- a/docs/Processes.md +++ b/docs/Processes.md @@ -1035,6 +1035,41 @@ sorting at the end of the day. In the above example, the data is parted by sym, and number 456 is the order of MSFT_N symbol entry in the HDB sym file. +- partbyfirstchar - Data is persisted to a partition scheme where the partition + is derived from the first character in sym colum present in the sort.csv + file. Like partbyenum, this can be only be done by one column which has the + parted attribute applied to it. It must be a symbol column due the nature + of the character extraction. The numerical value for characters will map + to the index of the character in the .Q.nA, levveraging upper chase letters. + For those that arent contained within .Q.nA, they will map to the count of + .Q.nA, effectively the next numeric. Partitioning in this way means that the + data within each partition is no sorted for the parted attribute to be applied, + which means in the end of day process the data must be sorted before being + merged. This sort happens partition by partition rather than as a whole. + The wdb partition scheme is of the form + \[wdbdir\]/\[partitiontype\]/\[first char index .Q.nA\]/\[table(s)\]/ + A typical partition directory would be similar to (for ex sym: MSFT_N) + wdb/database/2025.11.04/22/trade + In the above example, the data is parted by sym, and number 22 is the + index position of M in .Q.nA. + +Data is persisted to a partition scheme where the partition + is derived from parameters in the sort.csv file. In this mode partition + only can be done by one column which has parted attribute applied on it + and it also has to be of a symbol or integer (short, int, long) type. + If the column is a symbol type, the partitioning on disk will + be the symbol entries enumerated against the HDB sym file. + If the column is an integer type, the partitioning on disk will + be the raw integer values clamped between 0 and 2,147,483,647 + (the maximum int value), with negative and null values + mapped to 0. The general partition scheme is of the form + \[wdbdir\]/\[partitiontype\]/\[parted enumerated column\]/\[table(s)\]/. + A typical partition directory would be similar to (for ex sym: MSFT_N) + wdb/database/2015.11.26/456/trade/ + In the above example, the data is parted by sym, and number 456 is + the order of MSFT_N symbol entry in the HDB sym file. + + The advantage of partbyenum over partbyattr could be that the directory structure it uses represents a HDB that is ready to be loaded intraday. At the end of the day the data gets upserted to the HDB the @@ -1046,7 +1081,10 @@ data sets with a low cardinality (ie. small number of distinct elements) the optional method may provide a significant time saving, upwards of 50%. The optional method should also reduce the memory usage at the end of day event, as joining data is generally less memory intensive than -sorting. +sorting. The optional partbyfirstchar method allows a method for subdividing +data with a high cardinality to reduce the number of partitions being +written to, while providing a means for reduced memory footprint on final sort +versus default. @@ -1056,13 +1094,19 @@ Intraday Database (IDB) The Intraday Database or IDB is a simple process that allows access to data written down intraday. This assumes that there is an existing WDB (and HDB) process creating a DB on disk that can be loaded with a simple -load command. As of now default and partbyenum WDB writedown modes are supported. -The responsibility of an IDB is therefore: +load command. As of now default, partbyenum and partbyfirstchar WDB writedown +modes are supported. The responsibility of an IDB is therefore: 1. Serving queries. Since partbyenum writedown mode is done by enumerated symbol columns a helper function maptoint is implemented to support symbol lookup in sym file: select from trade where int=maptoint[`MSFT_N] + Also with partbyfirstchar being an alternate approach to create a + numerical partition, there is a helper function to locate the correct + value: + select from trade where int=mapfctoint[`MSFT],sym=`MSFT + select from trade where int in mapfctoint[`MSFT`AAPL],sym in `MSFT`AAPL + 2. Can be triggered for a reload. This is usually done by the WDB process periodically. @@ -1097,6 +1141,10 @@ The IDB can be queried just like any other HDB. If writedown mode partbyenum is ``` neg[gwHandle](`.gw.asyncexec;"select from trade where int=maptoint[`GOOG]";`idb);gwHandle[] ``` +Likewise if partbyfirstchar writedown mode is used there is a "mapfctoint" which can be used +``` +neg[gwHandle](`.gw.asyncexec;"select from trade where int in maptoint[`GOOG`MSFT],sym in `GOOG`MSFT";`idb);gwHandle[] +``` ### Scalability From 5040b0ecaae63cd027c0fbac92e43ac6e9a154a1 Mon Sep 17 00:00:00 2001 From: Gary Date: Wed, 5 Nov 2025 11:09:47 +0000 Subject: [PATCH 5/6] Updated to use .Q.an and modified documentation --- code/processes/idb.q | 4 +--- code/processes/wdb.q | 3 +-- docs/Processes.md | 37 ++++++++++--------------------------- 3 files changed, 12 insertions(+), 32 deletions(-) diff --git a/code/processes/idb.q b/code/processes/idb.q index ee7ae3a7a..29a67b1da 100644 --- a/code/processes/idb.q +++ b/code/processes/idb.q @@ -116,7 +116,5 @@ maptoint:{[val] /- helper function to support queries against the sym column in partbyfirstchar mapfctoint:{[val] - count[.Q.nA]^$[type[val]>0;raze first each where each .Q.nA=/:string[val][;0];first where .Q.nA=string[val][0]] + .Q.an?$[00;raze first each where each .Q.nA=/:string[val][;0];first where .Q.nA=string[val][0]] + .Q.an?$[0 Date: Thu, 6 Nov 2025 16:28:37 +0000 Subject: [PATCH 6/6] Updated x to val # --- code/processes/idb.q | 2 +- code/processes/wdb.q | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/processes/idb.q b/code/processes/idb.q index 29a67b1da..844a6c4e8 100644 --- a/code/processes/idb.q +++ b/code/processes/idb.q @@ -116,5 +116,5 @@ maptoint:{[val] /- helper function to support queries against the sym column in partbyfirstchar mapfctoint:{[val] - .Q.an?$[0