Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions ofrak_core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@ PIP=pip3

.PHONY: install
install:
$(MAKE) -C ofrak/core/entropy
$(PIP) install .

.PHONY: develop
develop:
$(MAKE) -C ofrak/core/entropy
$(PIP) install -e .[docs,test]

.PHONY: inspect
Expand Down
3 changes: 3 additions & 0 deletions ofrak_core/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ ignore_missing_imports = True

[mypy-reedsolo.*]
ignore_missing_imports = True

[mypy-ofrak.core.entropy.entropy_c.*]
ignore_missing_imports = True
24 changes: 0 additions & 24 deletions ofrak_core/ofrak/core/entropy/Makefile

This file was deleted.

77 changes: 75 additions & 2 deletions ofrak_core/ofrak/core/entropy/entropy.c
Original file line number Diff line number Diff line change
@@ -1,17 +1,28 @@
// PY_SSIZE_T_CLEAN must be defined before including Python.h: since Python 3.10
// the "#" length codes in Py_BuildValue/PyArg_ParseTuple require Py_ssize_t lengths
#define PY_SSIZE_T_CLEAN
#include <stddef.h> // size_t, NULL
#include <inttypes.h> // uint8_t, uint32_t
#include <math.h> // floor, log2
#include <Python.h>
Comment thread
rbs-jacob marked this conversation as resolved.

#define HISTOGRAM_SIZE 256
#define MAX_BRIGHTNESS_FLOAT 255.0
#define LOGGING_CHUNKS 10

/***
* Use a Python callback to log the current percent completion of the calculation
*/
void log_percent(int percent, void* py_callback){
PyObject *args = Py_BuildValue("(i)", percent);
PyObject *result = PyEval_CallObject(py_callback, args);
Py_XDECREF(result);
Py_DECREF(args);
}

/***
* Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding
* window over `data`. The results of each calculation are stored in `result`.
*/
int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
void (*log_percent)(uint8_t))
void* py_log_callback)
{
if (data == NULL || result == NULL || window_size > data_len || data_len == 0 ||
window_size == 0) {
Expand Down Expand Up @@ -92,8 +103,70 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
}
}

log_percent((i * 100) / data_len);
log_percent((i * 100) / data_len, py_log_callback);
}

return 0;
}


/***
 * Python-facing wrapper: entropy_c(data: bytes, window_size: int, log_percent: callable) -> bytes
 *
 * Parses the Python arguments, allocates the result buffer, runs the sliding-window
 * entropy calculation, and returns one entropy byte per window position.
 */
PyObject* entropy_wrapper(PyObject* _, PyObject* args){
    Py_buffer data_buffer;
    Py_ssize_t window_size; // "n" format writes a Py_ssize_t (signed), not size_t
    PyObject* py_log_percent;

    if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){
        PyErr_SetString(PyExc_RuntimeError, "Failed to parse arguments to entropy_wrapper!");
        return NULL;
    }

    // Degenerate input (window covers all the data, or a nonsensical window size):
    // return b"", matching the pure-Python fallback implementation
    if (window_size <= 0 || data_buffer.len <= window_size){
        PyBuffer_Release(&data_buffer);
        // we just need a definitely non-NULL pointer to pass to Py_BuildValue;
        // &window_size works fine (no data is read from it, since the length is 0)
        return Py_BuildValue("y#", &window_size, (Py_ssize_t) 0);
    }

    uint8_t *data = data_buffer.buf;
    size_t result_size = (size_t) (data_buffer.len - window_size);
    uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
    if (result == NULL){
        // Out of memory: release the buffer view and raise MemoryError
        PyBuffer_Release(&data_buffer);
        return PyErr_NoMemory();
    }

    // Actual entropy calculation; a nonzero return means bad input
    // (unreachable given the guards above, but checked defensively)
    if (entropy(data, (size_t) data_buffer.len, result, (size_t) window_size,
                py_log_percent) != 0){
        PyBuffer_Release(&data_buffer);
        free(result);
        PyErr_SetString(PyExc_ValueError, "Bad input to entropy function.");
        return NULL;
    }

    // Length must be Py_ssize_t because PY_SSIZE_T_CLEAN is defined
    PyObject* result_object = Py_BuildValue("y#", result, (Py_ssize_t) result_size);

    // Clean up memory
    PyBuffer_Release(&data_buffer);
    free(result);

    return result_object;
}


// Functions defined in this module
static PyMethodDef methods[] = {
{
"entropy_c",
entropy_wrapper,
METH_VARARGS,
"Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding window over `data`. The results of each calculation are stored in `result`."
},
{NULL, NULL, 0, NULL}
};


// Module definition
static struct PyModuleDef entropy_definition = {
PyModuleDef_HEAD_INIT,
"entropy_c",
"A Python module that calculates Shannon entropy",
-1,
methods,
};

PyObject* PyInit_entropy_c(void) {
Py_Initialize();
return PyModule_Create(&entropy_definition);
}
54 changes: 14 additions & 40 deletions ofrak_core/ofrak/core/entropy/entropy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
import ctypes
import logging
import math
import os
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool
from dataclasses import dataclass

from ofrak.component.abstract import ComponentMissingDependencyError
from ofrak.component.analyzer import Analyzer
from ofrak.model.component_model import ComponentExternalTool
from ofrak.model.resource_model import ResourceAttributes
from ofrak.resource import Resource, ResourceFactory
from ofrak.service.data_service_i import DataServiceInterface
Expand All @@ -20,38 +17,24 @@

C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)

try:
_lib_entropy = ctypes.cdll.LoadLibrary(
os.path.abspath(os.path.join(os.path.dirname(__file__), "entropy.so.1"))
)
ENTROPY_FUNCTION = _lib_entropy.entropy

ENTROPY_FUNCTION.argtypes = (
ctypes.c_char_p,
ctypes.c_size_t,
ctypes.c_char_p,
ctypes.c_size_t,
C_LOG_TYPE,
)
ENTROPY_FUNCTION.restype = ctypes.c_int
except OSError:
ENTROPY_FUNCTION = None # type: ignore


class _EntropyCTypesTool(ComponentExternalTool):
def __init__(self):
# TODO: Add docs page on building entropy.so.1
super().__init__("entropy.so.1", None, None, None)

def is_tool_installed(self) -> bool:
return ENTROPY_FUNCTION is not None


_ENTROPY_SO_DEPENDENCY = _EntropyCTypesTool()
try:
from ofrak.core.entropy.entropy_c import entropy_c as entropy_func
except:
from ofrak.core.entropy.entropy_py import entropy_py as entropy_func


@dataclass(**ResourceAttributes.DATACLASS_PARAMS)
class DataSummary(ResourceAttributes):
"""
High-level summary of binary data.

:ivar entropy_samples: Shannon entropy of the data. A description of Shannon entropy and how it
can be used is [here](../../../../user-guide/gui/minimap.md#entropy-view).
:ivar magnitude_samples: Sample of the binary data to put an upper limit on the displayed byte
magnitudes; if the input data is smaller than this upper limit, all bytes are sampled.
"""

entropy_samples: bytes
magnitude_samples: bytes

Expand All @@ -64,7 +47,6 @@ class DataSummaryAnalyzer(Analyzer[None, DataSummary]):

targets = () # Target any resource with data
outputs = (DataSummary,)
external_dependencies = (_ENTROPY_SO_DEPENDENCY,)

def __init__(
self,
Expand All @@ -82,9 +64,6 @@ async def analyze(self, resource: Resource, config=None, depth=0) -> DataSummary
f"Analysis process killed more than {self.max_analysis_retries} times. Aborting."
)

if not _ENTROPY_SO_DEPENDENCY.is_tool_installed():
raise ComponentMissingDependencyError(self, _ENTROPY_SO_DEPENDENCY)

data = await resource.get_data()
# Run blocking computations in separate processes
try:
Expand Down Expand Up @@ -121,12 +100,7 @@ def sample_entropy(
def log_percent(percent): # pragma: no cover
LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}")

# Make the entropy buffer mutable to the external C function
entropy = ctypes.create_string_buffer(len(data) - window_size)
errval = ENTROPY_FUNCTION(data, len(data), entropy, window_size, C_LOG_TYPE(log_percent))
if errval != 0:
raise ValueError("Bad input to entropy function.")
result = bytes(entropy.raw)
result = entropy_func(data, window_size, log_percent)

if len(result) <= max_samples:
return result
Expand Down
57 changes: 57 additions & 0 deletions ofrak_core/ofrak/core/entropy/entropy_py.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import logging
import math
from typing import Callable, List, Optional


def entropy_py(
    data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None
) -> bytes:
    """
    Compute the Shannon entropy of a sliding window over ``data``.

    Each output byte is the entropy of the ``window_size``-byte window starting at
    the corresponding offset, scaled to the integer range 0-255.

    :param data: raw bytes to analyze
    :param window_size: number of bytes in each sliding window
    :param log_percent: optional progress callback, invoked with a whole percentage
    :return: ``len(data) - window_size`` entropy bytes (empty if the window does
        not fit inside the data)
    """
    if log_percent is None:
        # No progress reporting requested (e.g. when called from tests): no-op callback
        log_percent = lambda _percent: None
    else:
        # A real callback implies a real caller, so warn that the slow path is in use
        logging.warning(
            "Using the Python implementation of the Shannon entropy calculation! This is potentially "
            "very slow, and is only used when the C extension cannot be built/found."
        )

    # Histogram of byte values over the current window, seeded from the first window
    histogram = [0] * 256
    for value in data[:window_size]:
        histogram[value] += 1

    data_len = len(data)
    entropy_values = [0] * (data_len - window_size)
    previously_logged = 0
    for offset in range(len(entropy_values)):
        entropy_values[offset] = math.floor(255 * _shannon_entropy(histogram, window_size))
        # Slide the window one byte to the right
        histogram[data[offset]] -= 1
        histogram[data[offset + window_size]] += 1
        # Emit progress once per multiple of ten percent
        percent = int((offset * 100) / data_len)
        if percent > previously_logged and percent % 10 == 0:
            log_percent(percent)
            previously_logged = percent
    return bytes(entropy_values)


def _shannon_entropy(distribution: List[int], window_size: int) -> float:
"""
Return the Shannon entropy of the input probability distribution (represented as a histogram
counting byte occurrences over a window of known size).

Shannon entropy represents how uniform a probability distribution is. Since more uniform
implies less predictable (because the probability of any outcome is equally likely in a
uniform distribution), a sample with higher entropy is "more random" than one with lower
entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
"""

result = 0.0
for num_occurrences in distribution:
probability = num_occurrences / window_size
# Note that the zero check is required because the domain of log2 is the positive reals
result += probability * math.log2(probability) if probability != 0.0 else 0.0
return -result / math.log2(window_size)
10 changes: 10 additions & 0 deletions ofrak_core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ def run(self):
long_description = f.read()


entropy_so = setuptools.Extension(
Comment thread
EdwardLarson marked this conversation as resolved.
"ofrak.core.entropy.entropy_c",
sources=["ofrak/core/entropy/entropy.c"],
libraries=["m"], # math library
optional=True, # If this fails the build, OFRAK will fall back to Python implementation
extra_compile_args=["-O3"],
)


setuptools.setup(
name="ofrak",
version="1.0.0",
Expand Down Expand Up @@ -99,4 +108,5 @@ def run(self):
license_files=["LICENSE"],
cmdclass={"egg_info": egg_info_ex},
entry_points={"ofrak.packages": ["ofrak_pkg = ofrak"]},
ext_modules=[entropy_so],
)
Loading