From db9c3de68b753d5b021a88d51da677762a2432b0 Mon Sep 17 00:00:00 2001
From: jtoffoloDI
Date: Fri, 2 Jan 2026 12:33:59 +0000
Subject: [PATCH] adding new tplogutils module

---
 di/tplogutils/init.q        |  61 ++++++
 di/tplogutils/test.csv      |  20 ++
 di/tplogutils/test.q        | 368 ++++++++++++++++++++++++++++++++++++
 di/tplogutils/tplogutils.md | 257 +++++++++++++++++++++++++
 4 files changed, 706 insertions(+)
 create mode 100644 di/tplogutils/init.q
 create mode 100644 di/tplogutils/test.csv
 create mode 100644 di/tplogutils/test.q
 create mode 100644 di/tplogutils/tplogutils.md

diff --git a/di/tplogutils/init.q b/di/tplogutils/init.q
new file mode 100644
index 0000000..4250d5f
--- /dev/null
+++ b/di/tplogutils/init.q
@@ -0,0 +1,61 @@
+HEADER: 8 # -8!(`upd;`trade;());                            / - header to build deserialisable msg
+UPDMSG: `char$10 # 8 _ -8!(`upd;`trade;());                 / - first part of tp update msg
+CHUNK: 10 * 1024 * 1024;                                    / - size of default chunk to read (10MB)
+MAXCHUNK: 8 * CHUNK;                                        / - don't let single read exceed this
+
+check: {[logfile;lastmsgtoreplay]
+    / - logfile (symbol) is the handle to the logsfile
+    / - lastmsgtoreplay (long) is index position of the last message to be replayed from the log
+    / - returns logfile when it can be replayed as-is, otherwise the handle returned by repair
+    / - -11!(-2;x) returns a chunk count for a valid log, (valid chunk count;valid byte length) for a corrupt one
+    loginfo: -11!(-2;logfile);
+    / - the log file is good so return the good log file handle
+    if[1 = count loginfo; :logfile];
+    / - corrupt, but the valid part holds more than lastmsgtoreplay+1 messages, so replay as-is
+    if[loginfo[0] > lastmsgtoreplay + 1; :logfile];
+    / - the messages to replay run into the corrupt part, so build a ".good" log
+    repair[logfile]
+    };
+
+repair: {[logfile]
+    / - logfile (symbol) is the handle to the logsfile
+    / - returns the handle of the "good" log file: logfile with ".good" appended
+    goodlog: `$ string[logfile],".good";
+    / - create file and open handle to it
+    goodlogh: hopen goodlog set ();
+    / - loop through the file in chunks; trap any error so the handle is always closed
+    res: .[{[lf;h] repairover[lf;h] over `start`size!(0j;CHUNK); (1b;"")}; (logfile;goodlogh); {(0b;x)}];
+    hclose goodlogh;
+    / - re-signal a trapped error now that the handle has been released
+    if[not first res; 'last res];
+    / - return goodlog
+    goodlog
+    };
+
+repairover: {[logfile;goodlogh;d]
+    / - logfile (symbol) is the handle to the logsfile
+    / - goodlogh (int) is the handle to the "good" log file
+    / - d (dictionary) has two keys start and size, the point to start reading from and size of chunk to read
+    / - returns the dictionary for the next read; returning d unchanged is what stops the over in repair
+    if[hcount[logfile] <= d`start; :d];                     / - nothing left to read - we're done
+    x: read1 logfile,d`start`size;                          / - read size bytes from start
+    u: ss[`char$x;UPDMSG];                                  / - find the start points of upd messages
+    if[not count u;                                         / - nothing in this block
+        if[hcount[logfile] <= sum d`start`size;:d];         / - EOF - we're done
+        :@[d;`start;+;d`size]];                             / - move on size bytes
+    m: u _ x;                                               / - split bytes into msgs
+    mz: 0x0 vs' `int$ 8 + ms: count each m;                 / - message sizes as bytes
+    hd: @[HEADER;7 6 5 4;:;] each mz;                       / - set msg size at correct part of hdr
+    g: @[(1b;)@-9!;;(0b;)@] each hd,'m;                     / - try and deserialize each msg
+    goodlogh g[;1] where k:g[;0];                           / - write good msgs to the "good" log
+    if[not any k;                                           / - saw msg(s) but couldn't read
+        if[MAXCHUNK <= d`size;                              / - read as much as we dare, give up
+            :@[d;`start`size;:;(sum d`start`size;CHUNK)]];
+        :@[d;`size;*;2]];                                   / - read a bigger chunk
+    / - m starts at the first signature, u[0] bytes into the chunk, so add that offset back in
+    ns: d[`start] + first[u] + sums[ms] last where k;       / - move to the end of the last good msg
+    :@[d;`start`size;:;(ns;CHUNK)];
+    };
+
+export:([check;repair;repairover])
+
diff --git a/di/tplogutils/test.csv b/di/tplogutils/test.csv
new file mode 100644
index 0000000..3c14f56
--- /dev/null
+++ b/di/tplogutils/test.csv
@@ -0,0 +1,20 @@
+action,ms,bytes,lang,code,repeat,minver,comment
+before,0,0,q,tplogsutil:use`di.tplogutils,1,,Initialize module
+before,0,0,q,os:use`di.os,1,,Initialize module
+before,0,0,q,"system ""l "", os.abspath[""test.q""]",1,1,load additional testing functions / dependencies
+run,0,0,q,test_repair_and_replay[],1,1,
+run,0,0,q,test_repair_recovers_messages[],1,1,
+run,0,0,q,test_repair_creates_good_file[],1,1,
+run,0,0,q,test_check_valid_log[],1,1,
+run,0,0,q,test_check_corrupt_sufficient_messages[],1,1,
+run,0,0,q,test_repair_creates_good_file[],1,1,
+run,0,0,q,test_repair_recovers_messages[],1,1,
+run,0,0,q,test_check_triggers_repair[],1,1,
+run,0,0,q,test_repair_garbage_at_end[],1,1,
+run,0,0,q,test_multiple_corrupt_sections[],1,1,
+run,0,0,q,test_completely_corrupt_log[],1,1,
+run,0,0,q,test_empty_log[],1,1,
+run,0,0,q,test_repair_and_replay[],1,1,
+run,0,0,q,test_large_file_handling[],1,1,
+run,0,0,q,test_repair_creates_good_file[],1,1,
+run,0,0,q,test_sequential_operations[],1,1,
diff --git a/di/tplogutils/test.q b/di/tplogutils/test.q
new file mode 100644
index 0000000..8c983a6
--- /dev/null
+++ b/di/tplogutils/test.q
@@ -0,0 +1,368 @@
+/ =============================================================================
+/ TEST HELPERS
+/ =============================================================================
+
+upd:{[t;x] t upsert x};
+trade:([] time:`timestamp$(); sym:`symbol$(); price:`float$(); size:`long$());
+
+/ @function createValidLog
+/ @description Create a valid tickerplant log file for testing
+/ @param filepath {symbol} Path where to create the log file
+/ @param msgcount {long} Number of messages to write
+createValidLog: {[filepath;msgcount]
+    / Create test table
+    trade:([] time:.z.p + til msgcount; sym:msgcount?`AAPL`GOOGL`MSFT`AMZN`TSLA; price:100+msgcount?100.0; size:100+msgcount?1000);
+    / Create log file and write messages
+    h:hopen filepath set ();
+    {[h;i;t] h enlist (`upd;`trade;value t[i])} [h;;trade] each til msgcount;
+    hclose h;
+    };
+
+/ @function createCorruptLog
+/ @description Create a log file of valid messages in which one record is replaced by corrupted bytes
+/ @param filepath {symbol} Path where to create the log file
+/ @param msgcount {long} Number of messages in log file
+/ @param corruptpos {long} Message index (0 to msgcount-1) to corrupt; any other value leaves the log valid
+createCorruptLog: {[filepath;msgcount;corruptpos]
+    / Create test table
+    trade:([] time:.z.p + til msgcount; sym:msgcount?`AAPL`GOOGL`MSFT`AMZN`TSLA; price:100+msgcount?100.0; size:100+msgcount?1000);
+    / Create log file and write messages
+    h:hopen filepath set ();
+    {[h;i;t;corruptpos]
+        if[=[i;corruptpos];
+            data:enlist (`upd;`trade;value t[i]);
+            data_bytes:-18!data;
+            data_bytes[10+til 20]:`byte$(20?50);
+            :h data_bytes;
+        ];
+        h enlist (`upd;`trade;value t[i])
+    } [h;;trade;corruptpos] each til msgcount;
+    hclose h;
+    };
+
+/ @function countLogMessages
+/ @description Count the valid messages in a log file without replaying it (first item of -11!(-2;file))
+/ @param filepath {symbol} Path to log file
+/ @returns {long} Number of messages in the log
+countLogMessages: {[filepath]
+    first -11!(-2;filepath)
+    };
+
+/ @function cleanup
+/ @description Delete test files
+/ @param filepaths {symbol[]} List of file paths to delete
+cleanup: {[filepaths]
+    {[fp] @[hdel;fp;{}]} each filepaths;
+    };
+
+/ =============================================================================
+/ BASIC FUNCTIONALITY TESTS
+/ =============================================================================
+
+/ @test Valid log file tplogsutil.check returns original filepath
+test_check_valid_log: {
+    testfile:`:test_valid.log;
+    msgcount:10;
+
+    / Setup
+    createValidLog[testfile;msgcount];
+
+    / Test
+    result:tplogsutil.check[testfile;msgcount-1];
+
+    / Assert
+    passes:result~testfile;
+
+    / Cleanup
+    cleanup enlist testfile;
+
+    / Return
+    passes
+    };
+
+/ @test tplogsutil.check returns original when enough good messages exist
+test_check_corrupt_sufficient_messages: {
+    testfile:`:test_corrupt_sufficient.log;
+    validmsgcount:20;
+    lastmsgtoreplay:10j;
+
+    / Setup: corrupt message 15, well after the last message we need to replay
+    createCorruptLog[testfile;validmsgcount;15];
+
+    / Test
+    result:tplogsutil.check[testfile;lastmsgtoreplay];
+
+    / Assert - should return original since we have enough good messages
+    goodmsgcount:first -11!(-2;testfile);
+    passes:(result~testfile) and (goodmsgcount > lastmsgtoreplay);
+
+    / Cleanup - also remove a .good file in case check (wrongly) repaired
+    cleanup (testfile;`$string[testfile],".good");
+
+    passes
+    };
+
+/ @test tplogsutil.repair creates .good file with correct name
+test_repair_creates_good_file: {
+    testfile:`:test_tplogsutil.repair.log;
+    expectedgoodfile:`$string[testfile],".good";
+
+    / Setup
+    createCorruptLog[testfile;15;10];
+
+    / Test
+    result:tplogsutil.repair[testfile];
+
+    / Assert
+    nameCorrect:result~expectedgoodfile;
+    fileExists:not ()~key expectedgoodfile;
+    passes:nameCorrect and fileExists;
+
+    / Cleanup
+    cleanup (testfile;expectedgoodfile);
+
+    passes
+    };
+
+/ @test tplogsutil.repair recovers valid messages from corrupt log
+test_repair_recovers_messages: {
+    testfile:`:test_recover.log;
+    goodfile:`$string[testfile],".good";
+    validmsgcount:20;
+
+    / Setup
+    createCorruptLog[testfile;validmsgcount;12];
+
+    / Test
+    tplogsutil.repair[testfile];
+
+    / Count messages in good file
+    recoveredcount:countLogMessages[goodfile];
+
+    / Assert - should recover at least some messages
+    passes:(recoveredcount>0) and (recoveredcount<=validmsgcount);
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ @test tplogsutil.check triggers tplogsutil.repair when insufficient good messages
+test_check_triggers_repair: {
+    testfile:`:test_tplogsutil.check_tplogsutil.repair.log;
+    goodfile:`$string[testfile],".good";
+    validmsgcount:10;
+    lastmsgtoreplay:15j; / Need more messages than available good ones
+
+    / Setup - corrupt early so not enough good messages
+    createCorruptLog[testfile;validmsgcount;5];
+
+    / Test
+    result:tplogsutil.check[testfile;lastmsgtoreplay];
+
+    / Assert
+    triggersRepair:result~goodfile;
+    fileCreated:not ()~key goodfile;
+    passes:triggersRepair and fileCreated;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ =============================================================================
+/ EDGE CASE TESTS
+/ =============================================================================
+
+/ @test tplogsutil.repair handles garbage at end of file
+test_repair_garbage_at_end: {
+    testfile:`:test_garbage_end.log;
+    goodfile:`$string[testfile],".good";
+
+    / Setup - create log and append garbage at end (1: writes raw bytes; set would write a serialized q object)
+    createValidLog[testfile;10];
+    bytes:read1 testfile;
+    testfile 1: bytes,100#0x00;
+
+    / Test
+    result:tplogsutil.repair[testfile];
+
+    / Assert
+    nameCorrect:result~goodfile;
+    hasMessages:countLogMessages[goodfile]>0;
+    passes:nameCorrect and hasMessages;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ @test Handles multiple corruption points
+test_multiple_corrupt_sections: {
+    testfile:`:test_multi_corrupt.log;
+    goodfile:`$string[testfile],".good";
+
+    / Setup - create log with corruption in middle
+    createValidLog[testfile;30];
+    bytes:read1 testfile;
+
+    / Overwrite bytes 200-209 with 0xFF (should have valid messages before and after); 1: writes raw bytes
+    if[200 < count bytes;
+        corrupted:bytes[til 200],10#0xFF,bytes[210+til count[bytes]-210];
+        testfile 1: corrupted;
+    ];
+
+    / Test
+    result:tplogsutil.repair[testfile];
+
+    / Assert - should create file and recover something
+    fileCorrect:result~goodfile;
+    fileExists:not ()~key goodfile;
+    passes:fileCorrect and fileExists;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ @test Completely corrupt log creates empty .good file
+test_completely_corrupt_log: {
+    testfile:`:test_all_corrupt.log;
+    goodfile:`$string[testfile],".good";
+
+    / Setup - create completely corrupt file (raw zero bytes)
+    testfile 1: 1000#0x00;
+
+    / Test
+    result:tplogsutil.repair[testfile];
+
+    / Assert - should create .good file even if empty/minimal
+    nameCorrect:result~goodfile;
+    fileExists:not ()~key goodfile;
+    passes:nameCorrect and fileExists;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ @test Empty log file handling
+test_empty_log: {
+    testfile:`:test_empty.log;
+
+    / Setup - create an empty log the same way the helpers above start a log
+    testfile set ();
+
+    / Test - should not crash
+    result:tplogsutil.check[testfile;0j];
+    passes:1b; / If we got here without error, test passes
+
+    / Cleanup - check may have produced a .good file
+    cleanup (testfile;`$string[testfile],".good");
+
+    passes
+    };
+
+/ =============================================================================
+/ CONFIGURATION TESTS
+/ =============================================================================
+
+/ @test Verify module constants are set correctly (NOTE(review): constants are not exported and this test is not in test.csv - needs init.q loaded via \l)
+test_constants_set: {
+    chunkOk:CHUNK=10*1024*1024;
+    maxchunkOk:MAXCHUNK=8*CHUNK;
+    updmsgOk:10=count UPDMSG;
+    headerOk:8=count HEADER;
+
+    chunkOk and maxchunkOk and updmsgOk and headerOk
+    };
+
+/ @test Module metadata is present (NOTE(review): info is not defined in init.q or in this file and this test is not in test.csv - confirm its source)
+test_module_info: {
+    hasName:`name in key info;
+    hasVersion:`version in key info;
+    hasDesc:`description in key info;
+
+    hasName and hasVersion and hasDesc
+    };
+
+/ =============================================================================
+/ INTEGRATION TESTS
+/ =============================================================================
+
+/ @test tplogsutil.repair then replay workflow
+test_repair_and_replay: {
+    testfile:`:test_replay.log;
+    goodfile:`$string[testfile],".good";
+
+    / Setup
+    createCorruptLog[testfile;20;10];
+
+    / Test - tplogsutil.repair and try to replay
+    tplogsutil.repair[testfile];
+
+    / This should not throw an error if the .good file is valid
+    replayOk:@[{-11!(1;x);1b};goodfile;{0b}];
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    replayOk
+    };
+
+/ @test Large file handling (performance test)
+test_large_file_handling: {
+    testfile:`:test_large.log;
+    goodfile:`$string[testfile],".good";
+    msgcount:500; / Reasonable size for testing
+
+    / Setup
+    createCorruptLog[testfile;msgcount;250];
+
+    / Test - measure time
+    start:.z.p;
+    result:tplogsutil.repair[testfile];
+    elapsed:`second$.z.p-start;
+
+    / Assert - should complete and create file
+    completed:result~goodfile;
+    reasonable:elapsed<30; / Should complete in under 30 seconds
+    passes:completed and reasonable;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
+/ @test Sequential tplogsutil.check and tplogsutil.repair calls
+test_sequential_operations: {
+    testfile:`:test_sequential.log;
+    goodfile:`$string[testfile],".good";
+
+    / Setup
+    createCorruptLog[testfile;15;7];
+
+    / Test - tplogsutil.check then tplogsutil.repair
+    checkResult:tplogsutil.check[testfile;20j];
+
+    / If tplogsutil.check triggered tplogsutil.repair, goodfile should exist
+    / If not, manually tplogsutil.repair
+    if[not checkResult~goodfile;
+        tplogsutil.repair[testfile];
+    ];
+
+    / Assert - .good file should exist in either case
+    passes:not ()~key goodfile;
+
+    / Cleanup
+    cleanup (testfile;goodfile);
+
+    passes
+    };
+
diff --git a/di/tplogutils/tplogutils.md b/di/tplogutils/tplogutils.md
new file mode 100644
index 0000000..e26b2c4
--- /dev/null
+++ b/di/tplogutils/tplogutils.md
@@ -0,0 +1,257 @@
+# `tplogutils` – Tickerplant Log Check & Repair Utilities for kdb+/q
+
+A small utility module for **checking** and **best‑effort repairing** tickerplant-style log files by scanning raw bytes for update-message boundaries, attempting to deserialize candidate messages, and writing any recoverable messages into a new `*.good` logfile.
+
+> **Note:** As currently implemented, recovery is keyed off the first 10 serialized bytes of ``(`upd;`trade;...)`` – the header of a three-item list plus the leading `upd` symbol (see **Configuration**). The table name is not part of that signature; if your logs use a different update function or message shape, you may need to adapt the signature constants.
+
+---
+
+## :sparkles: Features
+
+- Check whether a logfile should be used as-is or repaired (based on the logic in `check`).
+- Repair a corrupt logfile by extracting messages that can be successfully deserialized.
+- Chunked scanning to avoid loading large files into memory.
+- Adaptive read sizing when no valid messages are found in a chunk.
+- Produces a new `<logfile>.good` output file (append-only write during recovery).
+- Includes a test suite (`test.q`, `test.csv`) that generates valid/corrupt logs and validates recovery outcomes.
+ +--- + +## :file_folder: Directory contents + +- `init.q` – module implementation (constants + `check`, `repair`, `repairover`) +- `tplogutils.md` – documentation (you can replace/rename to `README.md` if desired) +- `test.q` – tests + helpers for creating valid/corrupted logs +- `test.csv` – test manifest for your project’s test harness + +--- + +## :label: Naming note (important) + +The folder name is `tplogutils`, and `init.q` exports: + +```q +export:([check;repair;repairover]) +``` + +However, the included tests reference the name `tplogsutil` in a few places (e.g. `tplogsutil.check`, `tplogsutil.repair`). + +Depending on your package loader conventions, you may want to: +- load the module into a variable named `tplogsutil`, **or** +- update the tests to use `tplogutils` consistently. + +This README uses **`tplogutils`** when referring to the module variable. + +--- + +## :inbox_tray: Loading + +### KDB-X (supports `use`) +If you are using KDB-X (where `use` exists), load the module using the symbol that matches your `QPATH` layout. + +If your `QPATH` includes the `di` directory (e.g. `~/kdbx-modules/di`), a common pattern is: + +```q +tplogutils:use`tplogutils +``` + +### Plain q (no `use`) +If your session does not support `use`, load directly from the file: + +```q +\l /path/to/kdbx-modules/di/tplogutils/init.q +``` + +--- + +## :gear: Configuration + +These constants are defined at the top of `init.q`: + +| Name | Type | Description | +|------------|-------------|-------------| +| `HEADER` | byte list | Template bytes used to build a deserialisable message header. | +| `UPDMSG` | char list | Prefix used to detect candidate update messages within raw bytes. | +| `CHUNK` | long | Default chunk size (bytes) to read (10MB). | +| `MAXCHUNK` | long | Maximum chunk size for a single read attempt (`8 * CHUNK`). 
|
+
+### Current default signature
+
+The module sets `UPDMSG` to the first 10 bytes of the serialized form of:
+
+```q
+(`upd;`trade;())
+```
+
+This means:
+- the signature covers only the list header and the leading `upd` symbol, so it matches any three-item list whose first item is a symbol starting with `upd`
+- the table name (`trade`) is not part of the signature, so updates for other tables are also detected; messages with a different function name or call shape are not recovered unless you adjust the signature logic
+
+---
+
+## :wrench: Functions
+
+### Summary
+
+| Function | Description |
+|----------|-------------|
+| `check[logfile;lastmsgtoreplay]` | Returns `logfile` if it should be used as-is per `check` logic, otherwise triggers `repair` and returns `<logfile>.good`. |
+| `repair[logfile]` | Creates `<logfile>.good` and writes any recoverable messages into it. Returns the new filename. |
+| `repairover[logfile;goodlogh;d]` | Internal chunk worker called repeatedly by `repair` (exported for testing/advanced use). |
+
+---
+
+### `check`
+
+```q
+tplogutils.check[logfile; lastmsgtoreplay]
+```
+
+**Parameters**
+
+| Parameter | Type | Description |
+|----------:|------|-------------|
+| `logfile` | symbol | Path to logfile as a file symbol (e.g. `` `:tp.log ``), as used by `-11!`, `hcount`, `read1`, etc. |
+| `lastmsgtoreplay` | long | Index position of the last message the caller intends to replay. |
+
+**Behavior (as implemented)**
+- inspects logfile info via `-11!(-2; logfile)`
+- returns either:
+  - the original `logfile`, or
+  - a repaired logfile produced by `repair[logfile]`
+
+**Returns**
+- `logfile` **or** `<logfile>.good`
+
+---
+
+### `repair`
+
+```q
+tplogutils.repair[logfile]
+```
+
+**Purpose**
+Create a “good” logfile containing only recoverable messages.
+
+**Behavior (as implemented)**
+- writes output to `<logfile>.good`
+- processes the input logfile in chunks
+- for each chunk:
+  - searches for occurrences of the configured `UPDMSG` signature
+  - splits the chunk into candidate messages
+  - constructs a header for each candidate
+  - attempts to deserialize each candidate
+  - writes successfully decoded messages into the output logfile
+
+**Returns**
+- symbol path of the repaired logfile (e.g. `` `:tp.log.good ``)
+
+---
+
+### `repairover`
+
+```q
+tplogutils.repairover[logfile; goodlogh; d]
+```
+
+**Parameters**
+
+| Parameter | Type | Description |
+|----------:|------|-------------|
+| `logfile` | symbol | Source logfile |
+| `goodlogh` | int | Handle to the output `<logfile>.good` logfile (opened via `hopen`) |
+| `d` | dict | State dictionary with keys `start` and `size` (byte offset and chunk length) |
+
+**Notes**
+- `repair` calls `repairover` repeatedly using `over` and a `(start;size)` state dictionary.
+- This is exported for transparency/testing; most users should call `check` or `repair`.
+
+---
+
+## :rocket: Typical usage
+
+### Repair-if-needed flow
+
+```q
+/ Load module
+tplogutils:use`tplogutils
+
+/ Decide whether to repair (log is a reserved q keyword, so use another name)
+logfile:`:tp.log
+safe:tplogutils.check[logfile; 0j]
+
+/ safe is either `:tp.log or `:tp.log.good
+safe
+```
+
+### Always repair
+
+```q
+tplogutils:use`tplogutils
+
+logfile:`:tp.log
+good:tplogutils.repair logfile
+good
+```
+
+---
+
+## :test_tube: Tests
+
+The module includes `test.q` and `test.csv`.
+
+### What the tests do (high level)
+
+`test.q` provides helpers to:
+- create a valid log by writing records shaped like ``enlist (`upd;`trade; rowData)``
+- create a corrupt log by introducing byte-level corruption into one record
+- verify that `check` and `repair` behave as expected across scenarios:
+  - valid logs
+  - corruption with enough valid messages
+  - corruption requiring repair
+  - garbage at end-of-file
+  - multiple corrupt sections
+  - completely corrupt logs
+  - empty logs
+  - sequential operations
+
+### Running tests manually
+
+```q
+/ Load module (test.q calls it as tplogsutil, so bind both names)
+tplogsutil:tplogutils:use`tplogutils
+
+/ Load tests
+\l /path/to/kdbx-modules/di/tplogutils/test.q
+
+/ Run a few key tests
+test_check_valid_log[]
+test_repair_creates_good_file[]
+test_repair_recovers_messages[]
+test_repair_garbage_at_end[]
+```
+
+> **Note:** `test.q` refers to the module as `tplogsutil`. If you loaded the module only as `tplogutils`, either:
+> - load the module into a `tplogsutil` variable as well, or
+> - update the test references to `tplogutils`.
+
+---
+
+## :bulb: Notes & limitations
+
+- **Best-effort recovery only:** The repair process only keeps messages that can be successfully deserialized by the module’s decode attempt.
+- **Signature-specific:** The scan is currently tuned to the 10-byte prefix of ``(`upd;`trade;...)``, i.e. a three-item list whose first item is the symbol `upd`; messages of any other shape are skipped.
+- **Chunk-boundary sensitivity:** Recovery depends on being able to locate the message signature within the bytes read for a given chunk.
+- **Validate output:** Always validate that `<logfile>.good` replays correctly in your environment before using it as a production recovery artifact.
+
+---
+
+## :package: Exported symbols
+
+The module exports:
+
+```q
+export:([check;repair;repairover])
+```
+