Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions ofrak_core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@ PIP=pip3

.PHONY: install
install:
$(MAKE) -C ofrak/core/entropy
$(PIP) install .

.PHONY: develop
develop:
$(MAKE) -C ofrak/core/entropy
$(PIP) install -e .[docs,test]

.PHONY: inspect
Expand Down
3 changes: 3 additions & 0 deletions ofrak_core/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ ignore_missing_imports = True

[mypy-reedsolo.*]
ignore_missing_imports = True

[mypy-ofrak.core.entropy.entropy_c.*]
ignore_missing_imports = True
24 changes: 0 additions & 24 deletions ofrak_core/ofrak/core/entropy/Makefile

This file was deleted.

77 changes: 75 additions & 2 deletions ofrak_core/ofrak/core/entropy/entropy.c
Original file line number Diff line number Diff line change
@@ -1,17 +1,28 @@
// PY_SSIZE_T_CLEAN must be defined before including Python.h: since Python 3.10
// the "#" length codes in Py_BuildValue/PyArg_ParseTuple require Py_ssize_t lengths
#define PY_SSIZE_T_CLEAN
#include <stddef.h> // size_t, NULL
#include <inttypes.h> // uint8_t, uint32_t
#include <math.h> // floor, log2
#include <Python.h>
Comment thread
rbs-jacob marked this conversation as resolved.

#define HISTOGRAM_SIZE 256
#define MAX_BRIGHTNESS_FLOAT 255.0
#define LOGGING_CHUNKS 10

/***
* Use a Python callback to log the current percent completion of the calculation
*/
void log_percent(int percent, void* py_callback){
PyObject *args = Py_BuildValue("(i)", percent);
PyObject *result = PyEval_CallObject(py_callback, args);
Py_XDECREF(result);
Py_DECREF(args);
}

/***
* Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding
* window over `data`. The results of each calculation are stored in `result`.
*/
int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
void (*log_percent)(uint8_t))
void* py_log_callback)
{
if (data == NULL || result == NULL || window_size > data_len || data_len == 0 ||
window_size == 0) {
Expand Down Expand Up @@ -92,8 +103,70 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
}
}

log_percent((i * 100) / data_len);
log_percent((i * 100) / data_len, py_log_callback);
}

return 0;
}


/***
 * Python-facing wrapper: entropy_c(data: bytes, window_size: int, log_percent: callable) -> bytes
 *
 * Parses the Python arguments, allocates the result buffer, runs the sliding-window
 * entropy calculation, and returns one entropy byte per window position.
 */
PyObject* entropy_wrapper(PyObject* _, PyObject* args){
    Py_buffer data_buffer;
    Py_ssize_t window_size; // "n" format writes a Py_ssize_t (signed), not size_t
    PyObject* py_log_percent;

    if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){
        PyErr_SetString(PyExc_RuntimeError, "Failed to parse arguments to entropy_wrapper!");
        return NULL;
    }

    // Degenerate input (window covers all the data, or a nonsensical window size):
    // return b"", matching the pure-Python fallback implementation
    if (window_size <= 0 || data_buffer.len <= window_size){
        PyBuffer_Release(&data_buffer);
        // we just need a definitely non-NULL pointer to pass to Py_BuildValue;
        // &window_size works fine (no data is read from it, since the length is 0)
        return Py_BuildValue("y#", &window_size, (Py_ssize_t) 0);
    }

    uint8_t *data = data_buffer.buf;
    size_t result_size = (size_t) (data_buffer.len - window_size);
    uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
    if (result == NULL){
        // Out of memory: release the buffer view and raise MemoryError
        PyBuffer_Release(&data_buffer);
        return PyErr_NoMemory();
    }

    // Actual entropy calculation; a nonzero return means bad input
    // (unreachable given the guards above, but checked defensively)
    if (entropy(data, (size_t) data_buffer.len, result, (size_t) window_size,
                py_log_percent) != 0){
        PyBuffer_Release(&data_buffer);
        free(result);
        PyErr_SetString(PyExc_ValueError, "Bad input to entropy function.");
        return NULL;
    }

    // Length must be Py_ssize_t because PY_SSIZE_T_CLEAN is defined
    PyObject* result_object = Py_BuildValue("y#", result, (Py_ssize_t) result_size);

    // Clean up memory
    PyBuffer_Release(&data_buffer);
    free(result);

    return result_object;
}


// Functions defined in this module
static PyMethodDef methods[] = {
{
"entropy_c",
entropy_wrapper,
METH_VARARGS,
"Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding window over `data`. The results of each calculation are stored in `result`."
},
{NULL, NULL, 0, NULL}
};


// Module definition
static struct PyModuleDef entropy_definition = {
PyModuleDef_HEAD_INIT,
"entropy_c",
"A Python module that calculates Shannon entropy",
-1,
methods,
};

PyObject* PyInit_entropy_c(void) {
Py_Initialize();
return PyModule_Create(&entropy_definition);
}
54 changes: 14 additions & 40 deletions ofrak_core/ofrak/core/entropy/entropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
import ctypes
import logging
import math
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool
from dataclasses import dataclass

from ofrak.component.abstract import ComponentMissingDependencyError
from ofrak.component.analyzer import Analyzer
from ofrak.model.component_model import ComponentExternalTool
from ofrak.model.resource_model import ResourceAttributes
from ofrak.resource import Resource, ResourceFactory
from ofrak.service.data_service_i import DataServiceInterface
Expand All @@ -20,38 +17,24 @@

C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)

try:
_lib_entropy = ctypes.cdll.LoadLibrary(
os.path.abspath(os.path.join(os.path.dirname(__file__), "entropy.so.1"))
)
ENTROPY_FUNCTION = _lib_entropy.entropy

ENTROPY_FUNCTION.argtypes = (
ctypes.c_char_p,
ctypes.c_size_t,
ctypes.c_char_p,
ctypes.c_size_t,
C_LOG_TYPE,
)
ENTROPY_FUNCTION.restype = ctypes.c_int
except OSError:
ENTROPY_FUNCTION = None # type: ignore


class _EntropyCTypesTool(ComponentExternalTool):
def __init__(self):
# TODO: Add docs page on building entropy.so.1
super().__init__("entropy.so.1", None, None, None)

def is_tool_installed(self) -> bool:
return ENTROPY_FUNCTION is not None


_ENTROPY_SO_DEPENDENCY = _EntropyCTypesTool()
try:
from ofrak.core.entropy.entropy_c import entropy_c as entropy_func
except:
from ofrak.core.entropy.entropy_py import entropy_py as entropy_func


@dataclass(**ResourceAttributes.DATACLASS_PARAMS)
class DataSummary(ResourceAttributes):
"""
High-level summary of binary data.

:ivar entropy_samples: Shannon entropy of the data. A description of Shannon entropy and how it
can be used is [here](../../../../user-guide/gui/minimap.md#entropy-view).
:ivar magnitude_samples: Sample of the binary data to put an upper limit on the displayed byte
magnitudes; if the input data is smaller than this upper limit, all bytes are sampled.
"""

entropy_samples: bytes
magnitude_samples: bytes

Expand All @@ -64,7 +47,6 @@ class DataSummaryAnalyzer(Analyzer[None, DataSummary]):

targets = () # Target any resource with data
outputs = (DataSummary,)
external_dependencies = (_ENTROPY_SO_DEPENDENCY,)

def __init__(
self,
Expand All @@ -82,9 +64,6 @@ async def analyze(self, resource: Resource, config=None, depth=0) -> DataSummary
f"Analysis process killed more than {self.max_analysis_retries} times. Aborting."
)

if not _ENTROPY_SO_DEPENDENCY.is_tool_installed():
raise ComponentMissingDependencyError(self, _ENTROPY_SO_DEPENDENCY)

data = await resource.get_data()
# Run blocking computations in separate processes
try:
Expand Down Expand Up @@ -121,12 +100,7 @@ def sample_entropy(
def log_percent(percent): # pragma: no cover
LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}")

# Make the entropy buffer mutable to the external C function
entropy = ctypes.create_string_buffer(len(data) - window_size)
errval = ENTROPY_FUNCTION(data, len(data), entropy, window_size, C_LOG_TYPE(log_percent))
if errval != 0:
raise ValueError("Bad input to entropy function.")
result = bytes(entropy.raw)
result = entropy_func(data, window_size, log_percent)

if len(result) <= max_samples:
return result
Expand Down
57 changes: 57 additions & 0 deletions ofrak_core/ofrak/core/entropy/entropy_py.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import logging
import math
from typing import Callable, List, Optional


def entropy_py(
    data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None
) -> bytes:
    """
    Compute the Shannon entropy of a sliding window over ``data``.

    Each output byte is the entropy of the ``window_size``-byte window starting at
    the corresponding offset, scaled to the integer range 0-255.

    :param data: raw bytes to analyze
    :param window_size: number of bytes in each sliding window
    :param log_percent: optional progress callback, invoked with a whole percentage
    :return: ``len(data) - window_size`` entropy bytes (empty if the window does
        not fit inside the data)
    """
    if log_percent is None:
        # No progress reporting requested (e.g. when called from tests): no-op callback
        log_percent = lambda _percent: None
    else:
        # A real callback implies a real caller, so warn that the slow path is in use
        logging.warning(
            "Using the Python implementation of the Shannon entropy calculation! This is potentially "
            "very slow, and is only used when the C extension cannot be built/found."
        )

    # Histogram of byte values over the current window, seeded from the first window
    histogram = [0] * 256
    for value in data[:window_size]:
        histogram[value] += 1

    data_len = len(data)
    entropy_values = [0] * (data_len - window_size)
    previously_logged = 0
    for offset in range(len(entropy_values)):
        entropy_values[offset] = math.floor(255 * _shannon_entropy(histogram, window_size))
        # Slide the window one byte to the right
        histogram[data[offset]] -= 1
        histogram[data[offset + window_size]] += 1
        # Emit progress once per multiple of ten percent
        percent = int((offset * 100) / data_len)
        if percent > previously_logged and percent % 10 == 0:
            log_percent(percent)
            previously_logged = percent
    return bytes(entropy_values)


def _shannon_entropy(distribution: List[int], window_size: int) -> float:
"""
Return the Shannon entropy of the input probability distribution (represented as a histogram
counting byte occurrences over a window of known size).

Shannon entropy represents how uniform a probability distribution is. Since more uniform
implies less predictable (because the probability of any outcome is equally likely in a
uniform distribution), a sample with higher entropy is "more random" than one with lower
entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
"""

result = 0.0
for num_occurrences in distribution:
probability = num_occurrences / window_size
# Note that the zero check is required because the domain of log2 is the positive reals
result += probability * math.log2(probability) if probability != 0.0 else 0.0
return -result / math.log2(window_size)
10 changes: 10 additions & 0 deletions ofrak_core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ def run(self):
long_description = f.read()


entropy_so = setuptools.Extension(
Comment thread
EdwardLarson marked this conversation as resolved.
"ofrak.core.entropy.entropy_c",
sources=["ofrak/core/entropy/entropy.c"],
libraries=["m"], # math library
optional=True, # If this fails the build, OFRAK will fall back to Python implementation
extra_compile_args=["-O3"],
)


setuptools.setup(
name="ofrak",
version="1.0.0",
Expand Down Expand Up @@ -99,4 +108,5 @@ def run(self):
license_files=["LICENSE"],
cmdclass={"egg_info": egg_info_ex},
entry_points={"ofrak.packages": ["ofrak_pkg = ofrak"]},
ext_modules=[entropy_so],
)
Loading