From b4d25273f356e60c02b654acf2639c0c8973c482 Mon Sep 17 00:00:00 2001 From: edward Date: Thu, 22 Dec 2022 19:12:34 -0500 Subject: [PATCH 1/5] refactor entropy.so.1 as a C extension that can be built with setuptools, removing the dependency on running the Makefile --- ofrak_core/ofrak/core/entropy/entropy.c | 69 ++++++++++++++++++- ofrak_core/ofrak/core/entropy/entropy.py | 44 ++---------- .../ofrak/core/entropy/reference_entropy.py | 53 ++++++++++++++ ofrak_core/setup.py | 10 +++ .../components/test_entropy_component.py | 47 ++----------- 5 files changed, 138 insertions(+), 85 deletions(-) create mode 100644 ofrak_core/ofrak/core/entropy/reference_entropy.py diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c index 43467f098..47e172d30 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.c +++ b/ofrak_core/ofrak/core/entropy/entropy.c @@ -1,17 +1,28 @@ #include // size_t, NULL #include // uint8_t, uint32_t #include // floor, log2 +#include #define HISTOGRAM_SIZE 256 #define MAX_BRIGHTNESS_FLOAT 255.0 #define LOGGING_CHUNKS 10 +/*** + * Use a Python callback to log the current percent completion of the calculation + */ +void log_percent(int percent, void* py_callback){ + PyObject *args = Py_BuildValue("(i)", percent); + PyObject *result = PyEval_CallObject(py_callback, args); + Py_XDECREF(result); + Py_DECREF(args); +} + /*** * Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding * window over `data`. The results of each calculation are stored in `result`. */ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size, - void (*log_percent)(uint8_t)) + void* py_log_callback) { if (data == NULL || result == NULL || window_size > data_len || data_len == 0 || window_size == 0) { @@ -92,8 +103,62 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size, } } - log_percent((i * 100) / data_len); + log_percent((i * 100) / data_len, py_log_callback); } return 0; } + + +PyObject* entropy_wrapper(PyObject* _, PyObject* args){ + Py_buffer data_buffer; + size_t data_len; + size_t window_size; + PyObject* py_log_percent; + + if (!PyArg_ParseTuple(args, "y*nnO", &data_buffer, &data_len, &window_size, &py_log_percent)){ + return NULL; + } + + uint8_t *data = data_buffer.buf; + size_t result_size = data_len - window_size; + uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t)); + + // Actual entropy calculation + entropy(data, data_len, result, window_size, py_log_percent); + + PyObject* result_object = Py_BuildValue("y#", result, result_size); + + // Clean up memory + PyBuffer_Release(&data_buffer); + free(result); + + return result_object; +} + + +// Functions defined in this module +static PyMethodDef methods[] = { + { + "entropy_c", + entropy_wrapper, + METH_VARARGS, + "Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding window over `data`. The results of each calculation are stored in `result`." + }, + {NULL, NULL, 0, NULL} +}; + + +// Module definition +static struct PyModuleDef entropy_definition = { + PyModuleDef_HEAD_INIT, + "entropy_c", + "A Python module that calculates Shannon entropy", + -1, + methods, +}; + +PyObject* PyInit_entropy_c(void) { + Py_Initialize(); + return PyModule_Create(&entropy_definition); +} diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py index e4390c090..da08f5590 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.py +++ b/ofrak_core/ofrak/core/entropy/entropy.py @@ -2,14 +2,11 @@ import ctypes import logging import math -import os from concurrent.futures import ProcessPoolExecutor from concurrent.futures.process import BrokenProcessPool from dataclasses import dataclass -from ofrak.component.abstract import ComponentMissingDependencyError from ofrak.component.analyzer import Analyzer -from ofrak.model.component_model import ComponentExternalTool from ofrak.model.resource_model import ResourceAttributes from ofrak.resource import Resource, ResourceFactory from ofrak.service.data_service_i import DataServiceInterface @@ -21,33 +18,9 @@ C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8) try: - _lib_entropy = ctypes.cdll.LoadLibrary( - os.path.abspath(os.path.join(os.path.dirname(__file__), "entropy.so.1")) - ) - ENTROPY_FUNCTION = _lib_entropy.entropy - - ENTROPY_FUNCTION.argtypes = ( - ctypes.c_char_p, - ctypes.c_size_t, - ctypes.c_char_p, - ctypes.c_size_t, - C_LOG_TYPE, - ) - ENTROPY_FUNCTION.restype = ctypes.c_int -except OSError: - ENTROPY_FUNCTION = None # type: ignore - - -class _EntropyCTypesTool(ComponentExternalTool): - def __init__(self): - # TODO: Add docs page on building entropy.so.1 - super().__init__("entropy.so.1", None, None, None) - - def is_tool_installed(self) -> bool: - return ENTROPY_FUNCTION is not None - - -_ENTROPY_SO_DEPENDENCY = _EntropyCTypesTool() + from .entropy_c import entropy_c as entropy_func +except: + from ofrak.core.entropy.reference_entropy import entropy_func @dataclass(**ResourceAttributes.DATACLASS_PARAMS) @@ -64,7 +37,6 @@ class DataSummaryAnalyzer(Analyzer[None, DataSummary]): targets = () # Target any resource with data outputs = (DataSummary,) - external_dependencies = (_ENTROPY_SO_DEPENDENCY,) def __init__( self, @@ -82,9 +54,6 @@ async def analyze(self, resource: Resource, config=None, depth=0) -> DataSummary f"Analysis process killed more than {self.max_analysis_retries} times. Aborting." ) - if not _ENTROPY_SO_DEPENDENCY.is_tool_installed(): - raise ComponentMissingDependencyError(self, _ENTROPY_SO_DEPENDENCY) - data = await resource.get_data() # Run blocking computations in separate processes try: @@ -121,12 +90,7 @@ def sample_entropy( def log_percent(percent): # pragma: no cover LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}") - # Make the entropy buffer mutable to the external C function - entropy = ctypes.create_string_buffer(len(data) - window_size) - errval = ENTROPY_FUNCTION(data, len(data), entropy, window_size, C_LOG_TYPE(log_percent)) - if errval != 0: - raise ValueError("Bad input to entropy function.") - result = bytes(entropy.raw) + result = entropy_func(data, len(data), window_size, log_percent) if len(result) <= max_samples: return result diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py new file mode 100644 index 000000000..aee2f6249 --- /dev/null +++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py @@ -0,0 +1,53 @@ +import logging +import math +from typing import Callable, List + + +def entropy_func( + data: bytes, data_len: int, window_size: int, log_percent: Callable[[int], None] +) -> bytes: + """ + Return a list of entropy values where each value represents the Shannon entropy of the byte + value distribution over a fixed-size, sliding window. + """ + logging.warning( + f"Using the Python implementation of the Shannon entropy calculation! This is potentially " + f"very slow, and is only used when the C extension cannot be built/found." + ) + + # Create a histogram, and populate it with initial values + histogram = [0] * 256 + for b in data[:window_size]: + histogram[b] += 1 + + # Calculate the entropy using a sliding window + entropy = [0] * (data_len - window_size) + last_percent_logged = 0 + for i in range(len(entropy)): + entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size)) + histogram[data[i]] -= 1 + histogram[data[i + window_size]] += 1 + percent = int((i * 100) / data_len) + if percent > last_percent_logged and percent % 10 == 0: + log_percent(percent) + last_percent_logged = percent + return bytes(entropy) + + +def _shannon_entropy(distribution: List[int], window_size: int) -> float: + """ + Return the Shannon entropy of the input probability distribution (represented as a histogram + counting byte occurrences over a window of known size). + + Shannon entropy represents how uniform a probability distribution is. Since more uniform + implies less predictable (because the probability of any outcome is equally likely in a + uniform distribution), a sample with higher entropy is "more random" than one with lower + entropy. More here: . + """ + + result = 0 + for num_occurrences in distribution: + probability = num_occurrences / window_size + # Note that the zero check is required because the domain of log2 is the positive reals + result += probability * math.log2(probability) if probability != 0.0 else 0.0 + return -result / math.log2(window_size) diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py index 374cf7a28..7383d0ade 100644 --- a/ofrak_core/setup.py +++ b/ofrak_core/setup.py @@ -19,6 +19,15 @@ def run(self): long_description = f.read() +entropy_so = setuptools.Extension( + "ofrak.core.entropy.entropy_c", + sources=["ofrak/core/entropy/entropy.c"], + libraries=["m"], # math library + export_symbols=["shannon_entropy"], + optional=True, +) + + setuptools.setup( name="ofrak", version="1.0.0", @@ -99,4 +108,5 @@ def run(self): license_files=["LICENSE"], cmdclass={"egg_info": egg_info_ex}, entry_points={"ofrak.packages": ["ofrak_pkg = ofrak"]}, + ext_modules=[entropy_so], ) diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py index 8e0081013..4c39e9e3c 100644 --- a/ofrak_core/test_ofrak/components/test_entropy_component.py +++ b/ofrak_core/test_ofrak/components/test_entropy_component.py @@ -1,13 +1,11 @@ -import math import os.path -from typing import List import pytest from ofrak.core.entropy import DataSummaryAnalyzer, DataSummary from ofrak import OFRAKContext import test_ofrak.components - +from ofrak.core.entropy.reference_entropy import entropy_func TEST_FILES = [ "hello.out", @@ -33,8 +31,10 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path): await root.run(DataSummaryAnalyzer) data_summary = root.get_attributes(DataSummary) entropy = data_summary.entropy_samples + data = await root.get_data() + assert len(entropy) == len(entropy_func(data, len(data), 256, lambda s: None)) assert _almost_equal( - entropy, _reference_entropy(await root.get_data()) + entropy, entropy_func(data, len(data), 256, lambda s: None) ), f"Entropy analysis for {test_file_path} differs from reference entropy." @@ -52,42 +52,3 @@ def _almost_equal(bytes1: bytes, bytes2: bytes) -> bool: print(f"Inputs differ at byte {i} ({bytes1[i]} != {bytes2[i]})") return False return True - - -def _reference_entropy(data: bytes, window_size: int = 256) -> bytes: - """ - Return a list of entropy values where each value represents the Shannon entropy of the byte - value distribution over a fixed-size, sliding window. - """ - - # Create a histogram, and populate it with initial values - histogram = [0] * 256 - for b in data[:window_size]: - histogram[b] += 1 - - # Calculate the entropy using a sliding window - entropy = [0] * (len(data) - window_size) - for i in range(len(entropy)): - entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size)) - histogram[data[i]] -= 1 - histogram[data[i + window_size]] += 1 - return bytes(entropy) - - -def _shannon_entropy(distribution: List[int], window_size: int) -> float: - """ - Return the Shannon entropy of the input probability distribution (represented as a histogram - counting byte occurrences over a window of known size). - - Shannon entropy represents how uniform a probability distribution is. Since more uniform - implies less predictable (because the probability of any outcome is equally likely in a - uniform distribution), a sample with higher entropy is "more random" than one with lower - entropy. More here: . - """ - - result = 0 - for num_occurrences in distribution: - probability = num_occurrences / window_size - # Note that the zero check is required because the domain of log2 is the positive reals - result += probability * math.log2(probability) if probability != 0.0 else 0.0 - return -result / math.log2(window_size) From 211de6beed0c5f0f4fec3949afbcc7089a3132a8 Mon Sep 17 00:00:00 2001 From: edward Date: Thu, 22 Dec 2022 19:13:50 -0500 Subject: [PATCH 2/5] remove unneeded entropy makefile --- ofrak_core/ofrak/core/entropy/Makefile | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 ofrak_core/ofrak/core/entropy/Makefile diff --git a/ofrak_core/ofrak/core/entropy/Makefile b/ofrak_core/ofrak/core/entropy/Makefile deleted file mode 100644 index 8e2f973d1..000000000 --- a/ofrak_core/ofrak/core/entropy/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -SHELL := bash - -CC = gcc -CFLAGS = -std=c99 \ - -pedantic \ - -Wall \ - -Wextra \ - -Werror \ - -fPIC \ - -fstack-protector-all \ - -D_FORTIFY_SOURCE=2 \ - -shared \ - -nostdlib \ - -O3 -LDLIBS = -lm # Link the math library - -# Use this .so.1 extension because otherwise the dependency injector will -# erroneously try to import entropy.so, which will fail. -entropy.so.1: entropy.c - $(CC) \ - $(CFLAGS) \ - $(filter %.c, $^) \ - $(LDLIBS) \ - -o $@ From 29836c32f823eb54c3ccac191b028b8bc5050f05 Mon Sep 17 00:00:00 2001 From: edward Date: Thu, 22 Dec 2022 19:23:54 -0500 Subject: [PATCH 3/5] remove unneeded data_len arg, smarter warning log --- ofrak_core/ofrak/core/entropy/entropy.c | 7 +++---- ofrak_core/ofrak/core/entropy/entropy.py | 6 +++++- .../ofrak/core/entropy/reference_entropy.py | 20 +++++++++++-------- ofrak_core/setup.py | 2 +- .../components/test_entropy_component.py | 4 ++-- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c index 47e172d30..ff770e8f9 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.c +++ b/ofrak_core/ofrak/core/entropy/entropy.c @@ -112,20 +112,19 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size, PyObject* entropy_wrapper(PyObject* _, PyObject* args){ Py_buffer data_buffer; - size_t data_len; size_t window_size; PyObject* py_log_percent; - if (!PyArg_ParseTuple(args, "y*nnO", &data_buffer, &data_len, &window_size, &py_log_percent)){ + if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){ return NULL; } uint8_t *data = data_buffer.buf; - size_t result_size = data_len - window_size; + size_t result_size = data_buffer.len - window_size; uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t)); // Actual entropy calculation - entropy(data, data_len, result, window_size, py_log_percent); + entropy(data, data_buffer.len, result, window_size, py_log_percent); PyObject* result_object = Py_BuildValue("y#", result, result_size); diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py index da08f5590..fe182feec 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.py +++ b/ofrak_core/ofrak/core/entropy/entropy.py @@ -5,6 +5,7 @@ from concurrent.futures import ProcessPoolExecutor from concurrent.futures.process import BrokenProcessPool from dataclasses import dataclass +from typing import Callable from ofrak.component.analyzer import Analyzer from ofrak.model.resource_model import ResourceAttributes @@ -17,6 +18,9 @@ C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8) + +entropy_func: Callable[[bytes, int, Callable[[int], None]], bytes] + try: from .entropy_c import entropy_c as entropy_func except: @@ -90,7 +94,7 @@ def sample_entropy( def log_percent(percent): # pragma: no cover LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}") - result = entropy_func(data, len(data), window_size, log_percent) + result = entropy_func(data, window_size, log_percent) if len(result) <= max_samples: return result diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py index aee2f6249..04d2451ee 100644 --- a/ofrak_core/ofrak/core/entropy/reference_entropy.py +++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py @@ -1,19 +1,23 @@ import logging import math -from typing import Callable, List +from typing import Callable, List, Optional def entropy_func( - data: bytes, data_len: int, window_size: int, log_percent: Callable[[int], None] + data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None ) -> bytes: """ Return a list of entropy values where each value represents the Shannon entropy of the byte value distribution over a fixed-size, sliding window. """ - logging.warning( - f"Using the Python implementation of the Shannon entropy calculation! This is potentially " - f"very slow, and is only used when the C extension cannot be built/found." - ) + if log_percent is None: + log_percent = lambda x: None + else: + # Sort of hacky way to know we are being called from the tests and don't need to log this + logging.warning( + f"Using the Python implementation of the Shannon entropy calculation! This is potentially " + f"very slow, and is only used when the C extension cannot be built/found." + ) # Create a histogram, and populate it with initial values histogram = [0] * 256 @@ -21,13 +25,13 @@ def entropy_func( histogram[b] += 1 # Calculate the entropy using a sliding window - entropy = [0] * (data_len - window_size) + entropy = [0] * (len(data) - window_size) last_percent_logged = 0 for i in range(len(entropy)): entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size)) histogram[data[i]] -= 1 histogram[data[i + window_size]] += 1 - percent = int((i * 100) / data_len) + percent = int((i * 100) / len(data)) if percent > last_percent_logged and percent % 10 == 0: log_percent(percent) last_percent_logged = percent diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py index 7383d0ade..a468fbbc2 100644 --- a/ofrak_core/setup.py +++ b/ofrak_core/setup.py @@ -24,7 +24,7 @@ def run(self): sources=["ofrak/core/entropy/entropy.c"], libraries=["m"], # math library export_symbols=["shannon_entropy"], - optional=True, + optional=True, # If this fails the build, OFRAK will fall back to Python implementation ) diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py index 4c39e9e3c..2e733f805 100644 --- a/ofrak_core/test_ofrak/components/test_entropy_component.py +++ b/ofrak_core/test_ofrak/components/test_entropy_component.py @@ -32,9 +32,9 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path): data_summary = root.get_attributes(DataSummary) entropy = data_summary.entropy_samples data = await root.get_data() - assert len(entropy) == len(entropy_func(data, len(data), 256, lambda s: None)) + assert len(entropy) == len(entropy_func(data, 256)) assert _almost_equal( - entropy, entropy_func(data, len(data), 256, lambda s: None) + entropy, entropy_func(data, 256) ), f"Entropy analysis for {test_file_path} differs from reference entropy." From 6e19fe03464f779070fb0027f9475f28821915db Mon Sep 17 00:00:00 2001 From: edward Date: Thu, 22 Dec 2022 19:31:47 -0500 Subject: [PATCH 4/5] fix some typing stuff --- ofrak_core/mypy.ini | 3 +++ ofrak_core/ofrak/core/entropy/entropy.py | 3 --- ofrak_core/ofrak/core/entropy/reference_entropy.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ofrak_core/mypy.ini b/ofrak_core/mypy.ini index 32a3125bc..53667cb6a 100644 --- a/ofrak_core/mypy.ini +++ b/ofrak_core/mypy.ini @@ -39,3 +39,6 @@ ignore_missing_imports = True [mypy-reedsolo.*] ignore_missing_imports = True + +[mypy-ofrak.core.entropy.entropy_c.*] +ignore_missing_imports = True diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py index fe182feec..23170e678 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.py +++ b/ofrak_core/ofrak/core/entropy/entropy.py @@ -5,7 +5,6 @@ from concurrent.futures import ProcessPoolExecutor from concurrent.futures.process import BrokenProcessPool from dataclasses import dataclass -from typing import Callable from ofrak.component.analyzer import Analyzer from ofrak.model.resource_model import ResourceAttributes @@ -19,8 +18,6 @@ C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8) -entropy_func: Callable[[bytes, int, Callable[[int], None]], bytes] - try: from .entropy_c import entropy_c as entropy_func except: diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py index 04d2451ee..2589ac09a 100644 --- a/ofrak_core/ofrak/core/entropy/reference_entropy.py +++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py @@ -49,7 +49,7 @@ def _shannon_entropy(distribution: List[int], window_size: int) -> float: entropy. More here: . """ - result = 0 + result = 0.0 for num_occurrences in distribution: probability = num_occurrences / window_size # Note that the zero check is required because the domain of log2 is the positive reals From 355bcd9d309749a784cf62827b31be13f81176ec Mon Sep 17 00:00:00 2001 From: edward Date: Fri, 23 Dec 2022 17:14:07 -0500 Subject: [PATCH 5/5] apply changes suggested in review --- ofrak_core/Makefile | 2 -- ofrak_core/ofrak/core/entropy/entropy.c | 9 ++++++++ ofrak_core/ofrak/core/entropy/entropy.py | 13 +++++++++-- .../{reference_entropy.py => entropy_py.py} | 2 +- ofrak_core/setup.py | 2 +- .../components/test_entropy_component.py | 23 +++++++++++++++---- 6 files changed, 41 insertions(+), 10 deletions(-) rename ofrak_core/ofrak/core/entropy/{reference_entropy.py => entropy_py.py} (99%) diff --git a/ofrak_core/Makefile b/ofrak_core/Makefile index 08ab2f41d..2533bdb73 100644 --- a/ofrak_core/Makefile +++ b/ofrak_core/Makefile @@ -3,12 +3,10 @@ PIP=pip3 .PHONY: install install: - $(MAKE) -C ofrak/core/entropy $(PIP) install . .PHONY: develop develop: - $(MAKE) -C ofrak/core/entropy $(PIP) install -e .[docs,test] .PHONY: inspect diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c index ff770e8f9..52b06aae3 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.c +++ b/ofrak_core/ofrak/core/entropy/entropy.c @@ -116,9 +116,18 @@ PyObject* entropy_wrapper(PyObject* _, PyObject* args){ PyObject* py_log_percent; if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){ + PyErr_SetString(PyExc_RuntimeError, "Failed to parse arguments to entropy_wrapper!"); return NULL; } + if (data_buffer.len <= window_size){ + PyBuffer_Release(&data_buffer); + // return b"" + // we just need a definitely non-NULL pointer to pass to Py_BuildValue + // &window_size works fine (no data is read from it) + return Py_BuildValue("y#", &window_size, 0); + } + uint8_t *data = data_buffer.buf; size_t result_size = data_buffer.len - window_size; uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t)); diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py index 23170e678..b3b19bd40 100644 --- a/ofrak_core/ofrak/core/entropy/entropy.py +++ b/ofrak_core/ofrak/core/entropy/entropy.py @@ -19,13 +19,22 @@ try: - from .entropy_c import entropy_c as entropy_func + from ofrak.core.entropy.entropy_c import entropy_c as entropy_func except: - from ofrak.core.entropy.reference_entropy import entropy_func + from ofrak.core.entropy.entropy_py import entropy_py as entropy_func @dataclass(**ResourceAttributes.DATACLASS_PARAMS) class DataSummary(ResourceAttributes): + """ + High-level summary of binary data. + + :ivar entropy_samples: Shannon entropy of the data. A description of Shannon entropy and how it + can be used is [here](../../../../user-guide/gui/minimap.md#entropy-view). + :ivar magnitude_samples: Sample of the binary data to put an upper limit on the displayed byte + magnitudes; if the input data is smaller than this upper limit, all bytes are sampled. + """ + entropy_samples: bytes magnitude_samples: bytes diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/entropy_py.py similarity index 99% rename from ofrak_core/ofrak/core/entropy/reference_entropy.py rename to ofrak_core/ofrak/core/entropy/entropy_py.py index 2589ac09a..a1c68b60f 100644 --- a/ofrak_core/ofrak/core/entropy/reference_entropy.py +++ b/ofrak_core/ofrak/core/entropy/entropy_py.py @@ -3,7 +3,7 @@ from typing import Callable, List, Optional -def entropy_func( +def entropy_py( data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None ) -> bytes: """ diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py index a468fbbc2..99e1aa660 100644 --- a/ofrak_core/setup.py +++ b/ofrak_core/setup.py @@ -23,8 +23,8 @@ def run(self): "ofrak.core.entropy.entropy_c", sources=["ofrak/core/entropy/entropy.c"], libraries=["m"], # math library - export_symbols=["shannon_entropy"], optional=True, # If this fails the build, OFRAK will fall back to Python implementation + extra_compile_args=["-O3"], ) diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py index 2e733f805..07c9b46bb 100644 --- a/ofrak_core/test_ofrak/components/test_entropy_component.py +++ b/ofrak_core/test_ofrak/components/test_entropy_component.py @@ -5,11 +5,13 @@ from ofrak import OFRAKContext import test_ofrak.components -from ofrak.core.entropy.reference_entropy import entropy_func +from ofrak.core.entropy.entropy_py import entropy_py +from ofrak.core.entropy.entropy_c import entropy_c TEST_FILES = [ "hello.out", "arm_reloc_relocated.elf", + "flash_test_magic.bin", "hello.rar", "imx7d-sdb.dtb", "simple_arm_gcc.o.elf", @@ -27,14 +29,27 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path): 1. The sampling of large files may lead to spurious test failures. 2. The reference method is *extremely* slow for even moderately sized files. """ + with open(test_file_path, "rb") as f: + data = f.read() + c_implementation_entropy = entropy_c(data, 256, lambda s: None) + py_implementation_entropy = entropy_py(data, 256) + + if len(data) < 256: + assert c_implementation_entropy == b"" + assert py_implementation_entropy == b"" + + assert _almost_equal( + c_implementation_entropy, py_implementation_entropy + ), f"Python and C entropy implementations for {test_file_path} differ." + + expected_entropy = c_implementation_entropy + root = await ofrak_context.create_root_resource_from_file(test_file_path) await root.run(DataSummaryAnalyzer) data_summary = root.get_attributes(DataSummary) entropy = data_summary.entropy_samples - data = await root.get_data() - assert len(entropy) == len(entropy_func(data, 256)) assert _almost_equal( - entropy, entropy_func(data, 256) + entropy, expected_entropy ), f"Entropy analysis for {test_file_path} differs from reference entropy."