From b4d25273f356e60c02b654acf2639c0c8973c482 Mon Sep 17 00:00:00 2001
From: edward <edward@redballoonsecurity.com>
Date: Thu, 22 Dec 2022 19:12:34 -0500
Subject: [PATCH 1/5] refactor entropy.so.1 as a C extension that can be built
 with setuptools, removing the dependency on running the Makefile

---
 ofrak_core/ofrak/core/entropy/entropy.c       | 69 ++++++++++++++++++-
 ofrak_core/ofrak/core/entropy/entropy.py      | 44 ++----------
 .../ofrak/core/entropy/reference_entropy.py   | 53 ++++++++++++++
 ofrak_core/setup.py                           | 10 +++
 .../components/test_entropy_component.py      | 47 ++-----------
 5 files changed, 138 insertions(+), 85 deletions(-)
 create mode 100644 ofrak_core/ofrak/core/entropy/reference_entropy.py

diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c
index 43467f098..47e172d30 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.c
+++ b/ofrak_core/ofrak/core/entropy/entropy.c
@@ -1,17 +1,28 @@
 #include <stddef.h>   // size_t, NULL
 #include <inttypes.h> // uint8_t, uint32_t
 #include <math.h>     // floor, log2
+#include <Python.h>
 
 #define HISTOGRAM_SIZE 256
 #define MAX_BRIGHTNESS_FLOAT 255.0
 #define LOGGING_CHUNKS 10
 
+/***
+ * Pass the current percent completion to a Python callback; its errors are reported, not raised
+ */
+void log_percent(int percent, void* py_callback){
+    PyObject *result = PyObject_CallFunction((PyObject *)py_callback, "(i)", percent);
+    if (result == NULL)  // the callback raised, or its argument tuple could not be built
+        PyErr_WriteUnraisable((PyObject *)py_callback);  // report + clear; logging is best-effort
+    Py_XDECREF(result);
+}
+
 /***
  * Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding
  * window over `data`. The results of each calculation are stored in `result`.
  */
 int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
-            void (*log_percent)(uint8_t))
+            void* py_log_callback)
 {
     if (data == NULL || result == NULL || window_size > data_len || data_len == 0 ||
         window_size == 0) {
@@ -92,8 +103,62 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
             }
         }
 
-        log_percent((i * 100) / data_len);
+        log_percent((i * 100) / data_len, py_log_callback);
     }
 
     return 0;
 }
+
+
+PyObject* entropy_wrapper(PyObject* _, PyObject* args){
+    Py_buffer data_buffer;
+    size_t data_len;
+    size_t window_size;
+    PyObject* py_log_percent;
+
+    if (!PyArg_ParseTuple(args, "y*nnO", &data_buffer, &data_len, &window_size, &py_log_percent)){
+        return NULL;
+    }
+
+    uint8_t *data = data_buffer.buf;
+    size_t result_size = data_len - window_size;
+    uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
+
+    // Actual entropy calculation
+    entropy(data, data_len, result, window_size, py_log_percent);
+
+    PyObject* result_object = Py_BuildValue("y#", result, result_size);
+
+    // Clean up memory
+    PyBuffer_Release(&data_buffer);
+    free(result);
+
+    return result_object;
+}
+
+
+// Functions defined in this module
+static PyMethodDef methods[] = {
+    {
+        "entropy_c",
+        entropy_wrapper,
+        METH_VARARGS,
+        "Calculate the Shannon entropy of a distribution of size `window_size` sampled from a sliding window over `data`. Returns the per-window results as a bytes object."
+    },
+    {NULL, NULL, 0, NULL}  // sentinel entry terminating the table
+};
+
+
+// Module definition: name, docstring, per-module state size and method table
+static struct PyModuleDef entropy_definition = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "entropy_c",
+    .m_doc = "A Python module that calculates Shannon entropy",
+    .m_size = -1,
+    .m_methods = methods,
+};
+
+PyMODINIT_FUNC  // gives the init function the return type and linkage the import system expects
+PyInit_entropy_c(void) {
+    return PyModule_Create(&entropy_definition);  // interpreter is already initialized on import
+}
diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py
index e4390c090..da08f5590 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy.py
@@ -2,14 +2,11 @@
 import ctypes
 import logging
 import math
-import os
 from concurrent.futures import ProcessPoolExecutor
 from concurrent.futures.process import BrokenProcessPool
 from dataclasses import dataclass
 
-from ofrak.component.abstract import ComponentMissingDependencyError
 from ofrak.component.analyzer import Analyzer
-from ofrak.model.component_model import ComponentExternalTool
 from ofrak.model.resource_model import ResourceAttributes
 from ofrak.resource import Resource, ResourceFactory
 from ofrak.service.data_service_i import DataServiceInterface
@@ -21,33 +18,9 @@
 C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)
 
 try:
-    _lib_entropy = ctypes.cdll.LoadLibrary(
-        os.path.abspath(os.path.join(os.path.dirname(__file__), "entropy.so.1"))
-    )
-    ENTROPY_FUNCTION = _lib_entropy.entropy
-
-    ENTROPY_FUNCTION.argtypes = (
-        ctypes.c_char_p,
-        ctypes.c_size_t,
-        ctypes.c_char_p,
-        ctypes.c_size_t,
-        C_LOG_TYPE,
-    )
-    ENTROPY_FUNCTION.restype = ctypes.c_int
-except OSError:
-    ENTROPY_FUNCTION = None  # type: ignore
-
-
-class _EntropyCTypesTool(ComponentExternalTool):
-    def __init__(self):
-        # TODO: Add docs page on building entropy.so.1
-        super().__init__("entropy.so.1", None, None, None)
-
-    def is_tool_installed(self) -> bool:
-        return ENTROPY_FUNCTION is not None
-
-
-_ENTROPY_SO_DEPENDENCY = _EntropyCTypesTool()
+    from .entropy_c import entropy_c as entropy_func
+except:
+    from ofrak.core.entropy.reference_entropy import entropy_func
 
 
 @dataclass(**ResourceAttributes.DATACLASS_PARAMS)
@@ -64,7 +37,6 @@ class DataSummaryAnalyzer(Analyzer[None, DataSummary]):
 
     targets = ()  # Target any resource with data
     outputs = (DataSummary,)
-    external_dependencies = (_ENTROPY_SO_DEPENDENCY,)
 
     def __init__(
         self,
@@ -82,9 +54,6 @@ async def analyze(self, resource: Resource, config=None, depth=0) -> DataSummary
                 f"Analysis process killed more than {self.max_analysis_retries} times. Aborting."
             )
 
-        if not _ENTROPY_SO_DEPENDENCY.is_tool_installed():
-            raise ComponentMissingDependencyError(self, _ENTROPY_SO_DEPENDENCY)
-
         data = await resource.get_data()
         # Run blocking computations in separate processes
         try:
@@ -121,12 +90,7 @@ def sample_entropy(
     def log_percent(percent):  # pragma: no cover
         LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}")
 
-    # Make the entropy buffer mutable to the external C function
-    entropy = ctypes.create_string_buffer(len(data) - window_size)
-    errval = ENTROPY_FUNCTION(data, len(data), entropy, window_size, C_LOG_TYPE(log_percent))
-    if errval != 0:
-        raise ValueError("Bad input to entropy function.")
-    result = bytes(entropy.raw)
+    result = entropy_func(data, len(data), window_size, log_percent)
 
     if len(result) <= max_samples:
         return result
diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py
new file mode 100644
index 000000000..aee2f6249
--- /dev/null
+++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py
@@ -0,0 +1,53 @@
+import logging
+import math
+from typing import Callable, List
+
+
+def entropy_func(
+    data: bytes, data_len: int, window_size: int, log_percent: Callable[[int], None]
+) -> bytes:
+    """
+    Return a list of entropy values where each value represents the Shannon entropy of the byte
+    value distribution over a fixed-size, sliding window.
+    """
+    logging.warning(
+        f"Using the Python implementation of the Shannon entropy calculation! This is potentially "
+        f"very slow, and is only used when the C extension cannot be built/found."
+    )
+
+    # Create a histogram, and populate it with initial values
+    histogram = [0] * 256
+    for b in data[:window_size]:
+        histogram[b] += 1
+
+    # Calculate the entropy using a sliding window
+    entropy = [0] * (data_len - window_size)
+    last_percent_logged = 0
+    for i in range(len(entropy)):
+        entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size))
+        histogram[data[i]] -= 1
+        histogram[data[i + window_size]] += 1
+        percent = int((i * 100) / data_len)
+        if percent > last_percent_logged and percent % 10 == 0:
+            log_percent(percent)
+            last_percent_logged = percent
+    return bytes(entropy)
+
+
+def _shannon_entropy(distribution: List[int], window_size: int) -> float:
+    """
+    Return the Shannon entropy of the input probability distribution (represented as a histogram
+    counting byte occurrences over a window of known size).
+
+    Shannon entropy represents how uniform a probability distribution is. Since more uniform
+    implies less predictable (because the probability of any outcome is equally likely in a
+    uniform distribution), a sample with higher entropy is "more random" than one with lower
+    entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
+    """
+
+    result = 0
+    for num_occurrences in distribution:
+        probability = num_occurrences / window_size
+        # Note that the zero check is required because the domain of log2 is the positive reals
+        result += probability * math.log2(probability) if probability != 0.0 else 0.0
+    return -result / math.log2(window_size)
diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py
index 374cf7a28..7383d0ade 100644
--- a/ofrak_core/setup.py
+++ b/ofrak_core/setup.py
@@ -19,6 +19,15 @@ def run(self):
     long_description = f.read()
 
 
+entropy_so = setuptools.Extension(
+    "ofrak.core.entropy.entropy_c",
+    sources=["ofrak/core/entropy/entropy.c"],
+    libraries=["m"],  # math library
+    export_symbols=["shannon_entropy"],
+    optional=True,
+)
+
+
 setuptools.setup(
     name="ofrak",
     version="1.0.0",
@@ -99,4 +108,5 @@ def run(self):
     license_files=["LICENSE"],
     cmdclass={"egg_info": egg_info_ex},
     entry_points={"ofrak.packages": ["ofrak_pkg = ofrak"]},
+    ext_modules=[entropy_so],
 )
diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py
index 8e0081013..4c39e9e3c 100644
--- a/ofrak_core/test_ofrak/components/test_entropy_component.py
+++ b/ofrak_core/test_ofrak/components/test_entropy_component.py
@@ -1,13 +1,11 @@
-import math
 import os.path
-from typing import List
 
 import pytest
 from ofrak.core.entropy import DataSummaryAnalyzer, DataSummary
 
 from ofrak import OFRAKContext
 import test_ofrak.components
-
+from ofrak.core.entropy.reference_entropy import entropy_func
 
 TEST_FILES = [
     "hello.out",
@@ -33,8 +31,10 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path):
     await root.run(DataSummaryAnalyzer)
     data_summary = root.get_attributes(DataSummary)
     entropy = data_summary.entropy_samples
+    data = await root.get_data()
+    assert len(entropy) == len(entropy_func(data, len(data), 256, lambda s: None))
     assert _almost_equal(
-        entropy, _reference_entropy(await root.get_data())
+        entropy, entropy_func(data, len(data), 256, lambda s: None)
     ), f"Entropy analysis for {test_file_path} differs from reference entropy."
 
 
@@ -52,42 +52,3 @@ def _almost_equal(bytes1: bytes, bytes2: bytes) -> bool:
             print(f"Inputs differ at byte {i} ({bytes1[i]} != {bytes2[i]})")
             return False
     return True
-
-
-def _reference_entropy(data: bytes, window_size: int = 256) -> bytes:
-    """
-    Return a list of entropy values where each value represents the Shannon entropy of the byte
-    value distribution over a fixed-size, sliding window.
-    """
-
-    # Create a histogram, and populate it with initial values
-    histogram = [0] * 256
-    for b in data[:window_size]:
-        histogram[b] += 1
-
-    # Calculate the entropy using a sliding window
-    entropy = [0] * (len(data) - window_size)
-    for i in range(len(entropy)):
-        entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size))
-        histogram[data[i]] -= 1
-        histogram[data[i + window_size]] += 1
-    return bytes(entropy)
-
-
-def _shannon_entropy(distribution: List[int], window_size: int) -> float:
-    """
-    Return the Shannon entropy of the input probability distribution (represented as a histogram
-    counting byte occurrences over a window of known size).
-
-    Shannon entropy represents how uniform a probability distribution is. Since more uniform
-    implies less predictable (because the probability of any outcome is equally likely in a
-    uniform distribution), a sample with higher entropy is "more random" than one with lower
-    entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
-    """
-
-    result = 0
-    for num_occurrences in distribution:
-        probability = num_occurrences / window_size
-        # Note that the zero check is required because the domain of log2 is the positive reals
-        result += probability * math.log2(probability) if probability != 0.0 else 0.0
-    return -result / math.log2(window_size)

From 211de6beed0c5f0f4fec3949afbcc7089a3132a8 Mon Sep 17 00:00:00 2001
From: edward <edward@redballoonsecurity.com>
Date: Thu, 22 Dec 2022 19:13:50 -0500
Subject: [PATCH 2/5] remove unneeded entropy makefile

---
 ofrak_core/ofrak/core/entropy/Makefile | 24 ------------------------
 1 file changed, 24 deletions(-)
 delete mode 100644 ofrak_core/ofrak/core/entropy/Makefile

diff --git a/ofrak_core/ofrak/core/entropy/Makefile b/ofrak_core/ofrak/core/entropy/Makefile
deleted file mode 100644
index 8e2f973d1..000000000
--- a/ofrak_core/ofrak/core/entropy/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-SHELL := bash
-
-CC = gcc
-CFLAGS = -std=c99 \
-	-pedantic \
-	-Wall \
-	-Wextra \
-	-Werror \
-	-fPIC \
-	-fstack-protector-all \
-	-D_FORTIFY_SOURCE=2 \
-	-shared \
-	-nostdlib \
-	-O3
-LDLIBS = -lm  # Link the math library
-
-# Use this .so.1 extension because otherwise the dependency injector will
-# erroneously try to import entropy.so, which will fail. 
-entropy.so.1: entropy.c
-	$(CC) \
-		$(CFLAGS) \
-		$(filter %.c, $^) \
-		$(LDLIBS) \
-		-o $@

From 29836c32f823eb54c3ccac191b028b8bc5050f05 Mon Sep 17 00:00:00 2001
From: edward <edward@redballoonsecurity.com>
Date: Thu, 22 Dec 2022 19:23:54 -0500
Subject: [PATCH 3/5] remove unneeded data_len arg, smarter warning log

---
 ofrak_core/ofrak/core/entropy/entropy.c       |  7 +++----
 ofrak_core/ofrak/core/entropy/entropy.py      |  6 +++++-
 .../ofrak/core/entropy/reference_entropy.py   | 20 +++++++++++--------
 ofrak_core/setup.py                           |  2 +-
 .../components/test_entropy_component.py      |  4 ++--
 5 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c
index 47e172d30..ff770e8f9 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.c
+++ b/ofrak_core/ofrak/core/entropy/entropy.c
@@ -112,20 +112,19 @@ int entropy(uint8_t *data, size_t data_len, uint8_t *result, size_t window_size,
 
 PyObject* entropy_wrapper(PyObject* _, PyObject* args){
     Py_buffer data_buffer;
-    size_t data_len;
     size_t window_size;
     PyObject* py_log_percent;
 
-    if (!PyArg_ParseTuple(args, "y*nnO", &data_buffer, &data_len, &window_size, &py_log_percent)){
+    if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){
         return NULL;
     }
 
     uint8_t *data = data_buffer.buf;
-    size_t result_size = data_len - window_size;
+    size_t result_size = data_buffer.len - window_size;
     uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
 
     // Actual entropy calculation
-    entropy(data, data_len, result, window_size, py_log_percent);
+    entropy(data, data_buffer.len, result, window_size, py_log_percent);
 
     PyObject* result_object = Py_BuildValue("y#", result, result_size);
 
diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py
index da08f5590..fe182feec 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy.py
@@ -5,6 +5,7 @@
 from concurrent.futures import ProcessPoolExecutor
 from concurrent.futures.process import BrokenProcessPool
 from dataclasses import dataclass
+from typing import Callable
 
 from ofrak.component.analyzer import Analyzer
 from ofrak.model.resource_model import ResourceAttributes
@@ -17,6 +18,9 @@
 
 C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)
 
+
+entropy_func: Callable[[bytes, int, Callable[[int], None]], bytes]
+
 try:
     from .entropy_c import entropy_c as entropy_func
 except:
@@ -90,7 +94,7 @@ def sample_entropy(
     def log_percent(percent):  # pragma: no cover
         LOGGER.info(f"Entropy calculation {percent}% complete for {resource_id.hex()}")
 
-    result = entropy_func(data, len(data), window_size, log_percent)
+    result = entropy_func(data, window_size, log_percent)
 
     if len(result) <= max_samples:
         return result
diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py
index aee2f6249..04d2451ee 100644
--- a/ofrak_core/ofrak/core/entropy/reference_entropy.py
+++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py
@@ -1,19 +1,23 @@
 import logging
 import math
-from typing import Callable, List
+from typing import Callable, List, Optional
 
 
 def entropy_func(
-    data: bytes, data_len: int, window_size: int, log_percent: Callable[[int], None]
+    data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None
 ) -> bytes:
     """
     Return a list of entropy values where each value represents the Shannon entropy of the byte
     value distribution over a fixed-size, sliding window.
     """
-    logging.warning(
-        f"Using the Python implementation of the Shannon entropy calculation! This is potentially "
-        f"very slow, and is only used when the C extension cannot be built/found."
-    )
+    if log_percent is None:
+        log_percent = lambda x: None
+    else:
+        # Hacky heuristic: only the tests omit log_percent, so warn only when a callback is passed
+        logging.warning(
+            f"Using the Python implementation of the Shannon entropy calculation! This is potentially "
+            f"very slow, and is only used when the C extension cannot be built/found."
+        )
 
     # Create a histogram, and populate it with initial values
     histogram = [0] * 256
@@ -21,13 +25,13 @@ def entropy_func(
         histogram[b] += 1
 
     # Calculate the entropy using a sliding window
-    entropy = [0] * (data_len - window_size)
+    entropy = [0] * (len(data) - window_size)
     last_percent_logged = 0
     for i in range(len(entropy)):
         entropy[i] = math.floor(255 * _shannon_entropy(histogram, window_size))
         histogram[data[i]] -= 1
         histogram[data[i + window_size]] += 1
-        percent = int((i * 100) / data_len)
+        percent = int((i * 100) / len(data))
         if percent > last_percent_logged and percent % 10 == 0:
             log_percent(percent)
             last_percent_logged = percent
diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py
index 7383d0ade..a468fbbc2 100644
--- a/ofrak_core/setup.py
+++ b/ofrak_core/setup.py
@@ -24,7 +24,7 @@ def run(self):
     sources=["ofrak/core/entropy/entropy.c"],
     libraries=["m"],  # math library
     export_symbols=["shannon_entropy"],
-    optional=True,
+    optional=True,  # If this fails the build, OFRAK will fall back to Python implementation
 )
 
 
diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py
index 4c39e9e3c..2e733f805 100644
--- a/ofrak_core/test_ofrak/components/test_entropy_component.py
+++ b/ofrak_core/test_ofrak/components/test_entropy_component.py
@@ -32,9 +32,9 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path):
     data_summary = root.get_attributes(DataSummary)
     entropy = data_summary.entropy_samples
     data = await root.get_data()
-    assert len(entropy) == len(entropy_func(data, len(data), 256, lambda s: None))
+    assert len(entropy) == len(entropy_func(data, 256))
     assert _almost_equal(
-        entropy, entropy_func(data, len(data), 256, lambda s: None)
+        entropy, entropy_func(data, 256)
     ), f"Entropy analysis for {test_file_path} differs from reference entropy."
 
 

From 6e19fe03464f779070fb0027f9475f28821915db Mon Sep 17 00:00:00 2001
From: edward <edward@redballoonsecurity.com>
Date: Thu, 22 Dec 2022 19:31:47 -0500
Subject: [PATCH 4/5] fix mypy typing issues in the entropy code

---
 ofrak_core/mypy.ini                                | 3 +++
 ofrak_core/ofrak/core/entropy/entropy.py           | 3 ---
 ofrak_core/ofrak/core/entropy/reference_entropy.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ofrak_core/mypy.ini b/ofrak_core/mypy.ini
index 32a3125bc..53667cb6a 100644
--- a/ofrak_core/mypy.ini
+++ b/ofrak_core/mypy.ini
@@ -39,3 +39,6 @@ ignore_missing_imports = True
 
 [mypy-reedsolo.*]
 ignore_missing_imports = True
+
+[mypy-ofrak.core.entropy.entropy_c.*]
+ignore_missing_imports = True
diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py
index fe182feec..23170e678 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy.py
@@ -5,7 +5,6 @@
 from concurrent.futures import ProcessPoolExecutor
 from concurrent.futures.process import BrokenProcessPool
 from dataclasses import dataclass
-from typing import Callable
 
 from ofrak.component.analyzer import Analyzer
 from ofrak.model.resource_model import ResourceAttributes
@@ -19,8 +18,6 @@
 C_LOG_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_uint8)
 
 
-entropy_func: Callable[[bytes, int, Callable[[int], None]], bytes]
-
 try:
     from .entropy_c import entropy_c as entropy_func
 except:
diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/reference_entropy.py
index 04d2451ee..2589ac09a 100644
--- a/ofrak_core/ofrak/core/entropy/reference_entropy.py
+++ b/ofrak_core/ofrak/core/entropy/reference_entropy.py
@@ -49,7 +49,7 @@ def _shannon_entropy(distribution: List[int], window_size: int) -> float:
     entropy. More here: <https://en.wikipedia.org/wiki/Entropy_(information_theory)>.
     """
 
-    result = 0
+    result = 0.0
     for num_occurrences in distribution:
         probability = num_occurrences / window_size
         # Note that the zero check is required because the domain of log2 is the positive reals

From 355bcd9d309749a784cf62827b31be13f81176ec Mon Sep 17 00:00:00 2001
From: edward <edward@redballoonsecurity.com>
Date: Fri, 23 Dec 2022 17:14:07 -0500
Subject: [PATCH 5/5] apply changes suggested in review

---
 ofrak_core/Makefile                           |  2 --
 ofrak_core/ofrak/core/entropy/entropy.c       |  9 ++++++++
 ofrak_core/ofrak/core/entropy/entropy.py      | 13 +++++++++--
 .../{reference_entropy.py => entropy_py.py}   |  2 +-
 ofrak_core/setup.py                           |  2 +-
 .../components/test_entropy_component.py      | 23 +++++++++++++++----
 6 files changed, 41 insertions(+), 10 deletions(-)
 rename ofrak_core/ofrak/core/entropy/{reference_entropy.py => entropy_py.py} (99%)

diff --git a/ofrak_core/Makefile b/ofrak_core/Makefile
index 08ab2f41d..2533bdb73 100644
--- a/ofrak_core/Makefile
+++ b/ofrak_core/Makefile
@@ -3,12 +3,10 @@ PIP=pip3
 
 .PHONY: install
 install:
-	$(MAKE) -C ofrak/core/entropy
 	$(PIP) install .
 
 .PHONY: develop
 develop:
-	$(MAKE) -C ofrak/core/entropy
 	$(PIP) install -e .[docs,test]
 
 .PHONY: inspect
diff --git a/ofrak_core/ofrak/core/entropy/entropy.c b/ofrak_core/ofrak/core/entropy/entropy.c
index ff770e8f9..52b06aae3 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.c
+++ b/ofrak_core/ofrak/core/entropy/entropy.c
@@ -116,9 +116,18 @@ PyObject* entropy_wrapper(PyObject* _, PyObject* args){
     PyObject* py_log_percent;
 
     if (!PyArg_ParseTuple(args, "y*nO", &data_buffer, &window_size, &py_log_percent)){
+        PyErr_SetString(PyExc_RuntimeError, "Failed to parse arguments to entropy_wrapper!");
         return NULL;
     }
 
+    if (data_buffer.len <= window_size){
+        PyBuffer_Release(&data_buffer);
+        // Data no longer than the window yields no samples, so return b"".
+        // Py_BuildValue("y#", ...) returns None for a NULL pointer, so pass any non-NULL one;
+        // &window_size works fine (no data is read from it because the length is 0).
+        return Py_BuildValue("y#", &window_size, 0);
+    }
+
     uint8_t *data = data_buffer.buf;
     size_t result_size = data_buffer.len - window_size;
     uint8_t *result = (uint8_t*) calloc(result_size, sizeof(uint8_t));
diff --git a/ofrak_core/ofrak/core/entropy/entropy.py b/ofrak_core/ofrak/core/entropy/entropy.py
index 23170e678..b3b19bd40 100644
--- a/ofrak_core/ofrak/core/entropy/entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy.py
@@ -19,13 +19,22 @@
 
 
 try:
-    from .entropy_c import entropy_c as entropy_func
+    from ofrak.core.entropy.entropy_c import entropy_c as entropy_func
 except:
-    from ofrak.core.entropy.reference_entropy import entropy_func
+    from ofrak.core.entropy.entropy_py import entropy_py as entropy_func
 
 
 @dataclass(**ResourceAttributes.DATACLASS_PARAMS)
 class DataSummary(ResourceAttributes):
+    """
+    High-level summary of binary data.
+
+    :ivar entropy_samples: Shannon entropy of the data. A description of Shannon entropy and how it
+    can be used is [here](../../../../user-guide/gui/minimap.md#entropy-view).
+    :ivar magnitude_samples: Sample of the binary data to put an upper limit on the displayed byte
+    magnitudes; if the input data is smaller than this upper limit, all bytes are sampled.
+    """
+
     entropy_samples: bytes
     magnitude_samples: bytes
 
diff --git a/ofrak_core/ofrak/core/entropy/reference_entropy.py b/ofrak_core/ofrak/core/entropy/entropy_py.py
similarity index 99%
rename from ofrak_core/ofrak/core/entropy/reference_entropy.py
rename to ofrak_core/ofrak/core/entropy/entropy_py.py
index 2589ac09a..a1c68b60f 100644
--- a/ofrak_core/ofrak/core/entropy/reference_entropy.py
+++ b/ofrak_core/ofrak/core/entropy/entropy_py.py
@@ -3,7 +3,7 @@
 from typing import Callable, List, Optional
 
 
-def entropy_func(
+def entropy_py(
     data: bytes, window_size: int, log_percent: Optional[Callable[[int], None]] = None
 ) -> bytes:
     """
diff --git a/ofrak_core/setup.py b/ofrak_core/setup.py
index a468fbbc2..99e1aa660 100644
--- a/ofrak_core/setup.py
+++ b/ofrak_core/setup.py
@@ -23,8 +23,8 @@ def run(self):
     "ofrak.core.entropy.entropy_c",
     sources=["ofrak/core/entropy/entropy.c"],
     libraries=["m"],  # math library
-    export_symbols=["shannon_entropy"],
     optional=True,  # If this fails the build, OFRAK will fall back to Python implementation
+    extra_compile_args=["-O3"],
 )
 
 
diff --git a/ofrak_core/test_ofrak/components/test_entropy_component.py b/ofrak_core/test_ofrak/components/test_entropy_component.py
index 2e733f805..07c9b46bb 100644
--- a/ofrak_core/test_ofrak/components/test_entropy_component.py
+++ b/ofrak_core/test_ofrak/components/test_entropy_component.py
@@ -5,11 +5,13 @@
 
 from ofrak import OFRAKContext
 import test_ofrak.components
-from ofrak.core.entropy.reference_entropy import entropy_func
+from ofrak.core.entropy.entropy_py import entropy_py
+from ofrak.core.entropy.entropy_c import entropy_c
 
 TEST_FILES = [
     "hello.out",
     "arm_reloc_relocated.elf",
+    "flash_test_magic.bin",
     "hello.rar",
     "imx7d-sdb.dtb",
     "simple_arm_gcc.o.elf",
@@ -27,14 +29,27 @@ async def test_analyzer(ofrak_context: OFRAKContext, test_file_path):
     1. The sampling of large files may lead to spurious test failures.
     2. The reference method is *extremely* slow for even moderately sized files.
     """
+    with open(test_file_path, "rb") as f:
+        data = f.read()
+    c_implementation_entropy = entropy_c(data, 256, lambda s: None)
+    py_implementation_entropy = entropy_py(data, 256)
+
+    if len(data) < 256:
+        assert c_implementation_entropy == b""
+        assert py_implementation_entropy == b""
+
+    assert _almost_equal(
+        c_implementation_entropy, py_implementation_entropy
+    ), f"Python and C entropy implementations for {test_file_path} differ."
+
+    expected_entropy = c_implementation_entropy
+
     root = await ofrak_context.create_root_resource_from_file(test_file_path)
     await root.run(DataSummaryAnalyzer)
     data_summary = root.get_attributes(DataSummary)
     entropy = data_summary.entropy_samples
-    data = await root.get_data()
-    assert len(entropy) == len(entropy_func(data, 256))
     assert _almost_equal(
-        entropy, entropy_func(data, 256)
+        entropy, expected_entropy
     ), f"Entropy analysis for {test_file_path} differs from reference entropy."