diff --git a/ofrak_core/Makefile b/ofrak_core/Makefile
index 08ab2f41d..2533bdb73 100644
--- a/ofrak_core/Makefile
+++ b/ofrak_core/Makefile
@@ -3,12 +3,10 @@ PIP=pip3
 
 .PHONY: install
 install:
-	$(MAKE) -C ofrak/core/entropy
 	$(PIP) install .
 
 .PHONY: develop
 develop:
-	$(MAKE) -C ofrak/core/entropy
 	$(PIP) install -e .[docs,test]
 
 .PHONY: inspect
diff --git a/ofrak_core/mypy.ini b/ofrak_core/mypy.ini
index 32a3125bc..53667cb6a 100644
--- a/ofrak_core/mypy.ini
+++ b/ofrak_core/mypy.ini
@@ -39,3 +39,6 @@ ignore_missing_imports = True
 
 [mypy-reedsolo.*]
 ignore_missing_imports = True
+
+[mypy-ofrak.core.entropy.entropy_c.*]
+ignore_missing_imports = True
diff --git a/ofrak_core/ofrak/core/entropy/Makefile b/ofrak_core/ofrak/core/entropy/Makefile
deleted file mode 100644
index 8e2f973d1..000000000
--- a/ofrak_core/ofrak/core/entropy/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-SHELL := bash
-
-CC = gcc
-CFLAGS = -std=c99 \
-	-pedantic \
-	-Wall \
-	-Wextra \
-	-Werror \
-	-fPIC \
-	-fstack-protector-all \
-	-D_FORTIFY_SOURCE=2 \
-	-shared \
-	-nostdlib \
-	-O3
-LDLIBS = -lm # Link the math library
-
-# Use this .so.1 extension because otherwise the dependency injector will
-# erroneously try to import entropy.so, which will fail.
-entropy.so.1: entropy.c
-	$(CC) \
-	$(CFLAGS) \
-	$(filter %.c, $^) \
-	$(LDLIBS) \
-	-o $@
diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c
index 43467f098..52b06aae3 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.c
+++ b/ofrak_core/ofrak/core/entropy/entropy.c
@@ -1,17 +1,29 @@
 #include <stddef.h> // size_t, NULL
 #include <stdint.h> // uint8_t, uint32_t
 #include <math.h>   // floor, log2
+#define PY_SSIZE_T_CLEAN // required so "#" length arguments below are Py_ssize_t (Python 3.10+)
+#include <Python.h>
 
 #define HISTOGRAM_SIZE 256
 #define MAX_BRIGHTNESS_FLOAT 255.0
 #define LOGGING_CHUNKS 10
 
+/***
+ * Use a Python callback to log the current percent completion of the calculation
+ */
+void log_percent(int percent, void* py_callback){
+    PyObject *args = Py_BuildValue("(i)", percent);
+    PyObject *result = PyObject_CallObject(py_callback, args); // PyEval_CallObject is deprecated
+    Py_XDECREF(result);
+    Py_DECREF(args);
+}
+
 /***
  * Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding
  * window over `data`. The results of each calculation are stored in `result`.
  */
 int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
-            void (*log_percent)(uint8_t))
+            void* py_log_callback)
 {
     if (data == NULL || result == NULL || window_size > data_len || data_len == 0 ||
         window_size == 0)
     {
@@ -92,8 +104,73 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
             }
         }
 
-        log_percent((i * 100) / data_len);
+        log_percent((i * 100) / data_len, py_log_callback);
     }
 
     return 0;
 }
+
+
+PyObject* entropy_wrapper(PyObject* _, PyObject* args){
+    Py_buffer data_buffer;
+    Py_ssize_t window_size; // the "n" format unit writes a Py_ssize_t
+    PyObject* py_log_percent;
+
+    if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){
+        PyErr_SetString(PyExc_RuntimeError, "Failed to parse arguments to entropy_wrapper!");
+        return NULL;
+    }
+
+    if (data_buffer.len <= window_size){
+        PyBuffer_Release(&data_buffer);
+        // return b""
+        // we just need a definitely non-NULL pointer to pass to Py_BuildValue
+        // &window_size works fine (no data is read from it)
+        return Py_BuildValue("y#", &window_size, (Py_ssize_t) 0);
+    }
+
+    uint8_t *data = data_buffer.buf;
+    size_t result_size = data_buffer.len - window_size;
+    uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
+    if (result == NULL){
+        PyBuffer_Release(&data_buffer);
+        return PyErr_NoMemory();
+    }
+
+    // Actual entropy calculation
+    entropy(data, data_buffer.len, result, window_size, py_log_percent);
+
+    PyObject* result_object = Py_BuildValue("y#", result, (Py_ssize_t) result_size);
+
+    // Clean up memory
+    PyBuffer_Release(&data_buffer);
+    free(result);
+
+    return result_object;
+}
+
+
+// Functions defined in this module
+static PyMethodDef methods[] = {
+    {
+        "entropy_c",
+        entropy_wrapper,
+        METH_VARARGS,
+        "Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding window over `data`. The results of each calculation are stored in `result`."
+    },
+    {NULL, NULL, 0, NULL}
+};
+
+
+// Module definition
+static struct PyModuleDef entropy_definition = {
+    PyModuleDef_HEAD_INIT,
+    "entropy_c",
+    "A Python module that calculates Shannon entropy",
+    -1,
+    methods,
+};
+
+PyMODINIT_FUNC PyInit_entropy_c(void) {
+    return PyModule_Create(&entropy_definition);
+}
diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py
index e4390c090..b3b19bd40 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy.py
@@ -2,14 +2,11 @@
 import ctypes
 import logging
 import math
-import os
 from concurrent.futures import ProcessPoolExecutor
 from concurrent.futures.process import BrokenProcessPool
 from dataclasses import dataclass
 
-from ofrak.component.abstract import ComponentMissingDependencyError
 from ofrak.component.analyzer import Analyzer
-from ofrak.model.component_model import ComponentExternalTool
 from ofrak.model.resource_model import ResourceAttributes
 from ofrak.resource import Resource, ResourceFactory
 from ofrak.service.data_service_i import DataServiceInterface
@@ -20,38 +17,24 @@
 
 C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)
 
-try:
-    _lib_entropy = ctypes.cdll.LoadLibrary(
-        os.path.abspath(os.path.join(os.path.dirname(__file__), "entropy.so.1"))
-    )
-    ENTROPY_FUNCTION = _lib_entropy.entropy
-
-    ENTROPY_FUNCTION.argtypes = (
-        ctypes.c_char_p,
-        ctypes.c_size_t,
-        ctypes.c_char_p,
-        ctypes.c_size_t,
-        C_LOG_TYPE,
-    )
-    ENTROPY_FUNCTION.restype = ctypes.c_int
-except OSError:
-    ENTROPY_FUNCTION = None  # type: ignore
-
-
-class _EntropyCTypesTool(ComponentExternalTool):
-    def __init__(self):
-        # TODO: Add docs page on building entropy.so.1
-        super().__init__("entropy.so.1", None, None, None)
-
-    def is_tool_installed(self) -> bool:
-        return ENTROPY_FUNCTION is not None
-
-_ENTROPY_SO_DEPENDENCY = _EntropyCTypesTool()
+try:
+    from ofrak.core.entropy.entropy_c import entropy_c as entropy_func
+except ImportError:
+    from ofrak.core.entropy.entropy_py import entropy_py as entropy_func
 
 
 @dataclass(**ResourceAttributes.DATACLASS_PARAMS)
 class DataSummary(ResourceAttributes):
+    """
+    High-level summary of binary data.
+
+    :ivar entropy_samples: Shannon entropy of the data. A description of Shannon entropy and how it
+      can be used is [here](../../../../user-guide/gui/minimap.md#entropy-view).
+    :ivar magnitude_samples: Sample of the binary data to put an upper limit on the displayed byte
+      magnitudes; if the input data is smaller than this upper limit, all bytes are sampled.
+    """
+
     entropy_samples: bytes
     magnitude_samples: bytes
 
@@ -64,7 +47,6 @@ class DataSummaryAnalyzer(Analyzer[None, DataSummary]):
 
     targets = ()  # Target any resource with data
    outputs = (DataSummary,)
-    external_dependencies = (_ENTROPY_SO_DEPENDENCY,)
 
     def __init__(
         self,
@@ -82,9 +64,6 @@ async def analyze(self, resource: Resource, config=None, depth=0) -> DataSummary
                 f"Analysis process killed more than {self.max_analysis_retries} times. Aborting."
             )
 
-        if not _ENTROPY_SO_DEPENDENCY.is_tool_installed():
-            raise ComponentMissingDependencyError(self, _ENTROPY_SO_DEPENDENCY)
-
         data = await resource.get_data()
         # Run blocking computations in separate processes
         try:
@@ -121,12 +100,7 @@ def sample_entropy(
     def log_percent(percent):  # pragma: no cover
         LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}")
 
-    # Make the entropy buffer mutable to the external C function
-    entropy = ctypes.create_string_buffer(len(data) - window_size)
-    errval = ENTROPY_FUNCTION(data, len(data), entropy, window_size, C_LOG_TYPE(log_percent))
-    if errval != 0:
-        raise ValueError("Bad input to entropy function.")
-    result = bytes(entropy.raw)
+    result = entropy_func(data, window_size, log_percent)
 
     if len(result) <= max_samples:
         return result
diff --git a/ofrak_core/ofrak/core/entropy/entropy_py.py b/ofrak_core/ofrak/core/entropy/entropy_py.py
new file mode 100644
index 000000000..a1c68b60f
--- /dev/null
+++ b/ofrak_core/ofrak/core/entropy/entropy_py.py
@@ -0,0 +1,57 @@
+import logging
+import math
+from typing import Callable, List, Optional
+
+
+def entropy_py(
+    data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None
+) -> bytes:
+    """
+    Return a list of entropy values where each value represents the Shannon entropy of the byte
+    value distribution over a fixed-size, sliding window.
+    """
+    if log_percent is None:
+        log_percent = lambda x: None
+    else:
+        # Sort of hacky way to know we are being called from the tests and don't need to log this
+        logging.warning(
+            f"Using the Python implementation of the Shannon entropy calculation! This is potentially "
+            f"very slow, and is only used when the C extension cannot be built/found."
+        )
+
+    # Create a histogram, and populate it with initial values
+    histogram = [0] * 256
+    for b in data[:window_size]:
+        histogram[b] += 1
+
+    # Calculate the entropy using a sliding window
+    entropy = [0] * (len(data) - window_size)
+    last_percent_logged = 0
+    for i in range(len(entropy)):
+        entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size))
+        histogram[data[i]] -= 1
+        histogram[data[i + window_size]] += 1
+        percent = int((i * 100) / len(data))
+        if percent > last_percent_logged and percent % 10 == 0:
+            log_percent(percent)
+            last_percent_logged = percent
+    return bytes(entropy)
+
+
+def _shannon_entropy(distribution: List[int], window_size: int) -> float:
+    """
+    Return the Shannon entropy of the input probability distribution (represented as a histogram
+    counting byte occurrences over a window of known size).
+
+    Shannon entropy represents how uniform a probability distribution is. Since more uniform
+    implies less predictable (because the probability of any outcome is equally likely in a
+    uniform distribution), a sample with higher entropy is "more random" than one with lower
+    entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
+ """ + + result = 0.0 + for num_occurrences in distribution: + probability = num_occurrences / window_size + # Note that the zero check is required because the domain of log2 is the positive reals + result += probability * math.log2(probability) if probability != 0.0 else 0.0 + return -result / math.log2(window_size) diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py index 374cf7a28..99e1aa660 100644 --- a/ofrak_core/setup.py +++ b/ofrak_core/setup.py @@ -19,6 +19,15 @@ def run(self): long_description = f.read() +entropy_so = setuptools.Extension( + "ofrak.core.entropy.entropy_c", + sources=["ofrak/core/entropy/entropy.c"], + libraries=["m"], # math library + optional=True, # If this fails the build, OFRAK will fall back to Python implementation + extra_compile_args=["-O3"], +) + + setuptools.setup( name="ofrak", version="1.0.0", @@ -99,4 +108,5 @@ def run(self): license_files=["LICENSE"], cmdclass={"egg_info": egg_info_ex}, entry_points={"ofrak.packages": ["ofrak_pkg = ofrak"]}, + ext_modules=[entropy_so], ) diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py index 8e0081013..07c9b46bb 100644 --- a/ofrak_core/test_ofrak/components/test_entropy_component.py +++ b/ofrak_core/test_ofrak/components/test_entropy_component.py @@ -1,17 +1,17 @@ -import math import os.path -from typing import List import pytest from ofrak.core.entropy import DataSummaryAnalyzer, DataSummary from ofrak import OFRAKContext import test_ofrak.components - +from ofrak.core.entropy.entropy_py import entropy_py +from ofrak.core.entropy.entropy_c import entropy_c TEST_FILES = [ "hello.out", "arm_reloc_relocated.elf", + "flash_test_magic.bin", "hello.rar", "imx7d-sdb.dtb", "simple_arm_gcc.o.elf", @@ -29,12 +29,27 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path): 1. The sampling of large files may lead to spurious test failures. 2. 
The reference method is *extremely* slow for even moderately sized files. """ + with open(test_file_path, "rb") as f: + data = f.read() + c_implementation_entropy = entropy_c(data, 256, lambda s: None) + py_implementation_entropy = entropy_py(data, 256) + + if len(data) < 256: + assert c_implementation_entropy == b"" + assert py_implementation_entropy == b"" + + assert _almost_equal( + c_implementation_entropy, py_implementation_entropy + ), f"Python and C entropy implementations for {test_file_path} differ." + + expected_entropy = c_implementation_entropy + root = await ofrak_context.create_root_resource_from_file(test_file_path) await root.run(DataSummaryAnalyzer) data_summary = root.get_attributes(DataSummary) entropy = data_summary.entropy_samples assert _almost_equal( - entropy, _reference_entropy(await root.get_data()) + entropy, expected_entropy ), f"Entropy analysis for {test_file_path} differs from reference entropy." @@ -52,42 +67,3 @@ def _almost_equal(bytes1: bytes, bytes2: bytes) -> bool: print(f"Inputs differ at byte {i} ({bytes1[i]} != {bytes2[i]})") return False return True - - -def _reference_entropy(data: bytes, window_size: int = 256) -> bytes: - """ - Return a list of entropy values where each value represents the Shannon entropy of the byte - value distribution over a fixed-size, sliding window. 
- """ - - # Create a histogram, and populate it with initial values - histogram = [0] * 256 - for b in data[:window_size]: - histogram[b] += 1 - - # Calculate the entropy using a sliding window - entropy = [0] * (len(data) - window_size) - for i in range(len(entropy)): - entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size)) - histogram[data[i]] -= 1 - histogram[data[i + window_size]] += 1 - return bytes(entropy) - - -def _shannon_entropy(distribution: List[int], window_size: int) -> float: - """ - Return the Shannon entropy of the input probability distribution (represented as a histogram - counting byte occurrences over a window of known size). - - Shannon entropy represents how uniform a probability distribution is. Since more uniform - implies less predictable (because the probability of any outcome is equally likely in a - uniform distribution), a sample with higher entropy is "more random" than one with lower - entropy. More here: . - """ - - result = 0 - for num_occurrences in distribution: - probability = num_occurrences / window_size - # Note that the zero check is required because the domain of log2 is the positive reals - result += probability * math.log2(probability) if probability != 0.0 else 0.0 - return -result / math.log2(window_size)