From ca21401c428231745c9645b9f0630c3040623a00 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Fri, 19 May 2023 14:16:38 -0700
Subject: [PATCH 01/14] wip: extract build_mp4 from transform_mp4; make Box32ConstructBuilder build-only

---
 .../geotag/construct_mp4_parser.py            | 18 ++++---
 mapillary_tools/geotag/simple_mp4_builder.py  | 54 ++++++++++---------
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/mapillary_tools/geotag/construct_mp4_parser.py b/mapillary_tools/geotag/construct_mp4_parser.py
index 8736f5e22..201a692f5 100644
--- a/mapillary_tools/geotag/construct_mp4_parser.py
+++ b/mapillary_tools/geotag/construct_mp4_parser.py
@@ -447,12 +447,6 @@ def parse_box(self, data: bytes) -> BoxDict:
     def parse_boxlist(self, data: bytes) -> T.List[BoxDict]:
         return T.cast(T.List[BoxDict], self.BoxList.parse(data))
 
-    def build_box(self, box: BoxDict) -> bytes:
-        return self.Box.build(box)
-
-    def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes:
-        return self.BoxList.build(boxes)
-
 
 class Box32ConstructBuilder(Box64ConstructBuilder):
     """
@@ -473,6 +467,18 @@ def Box(self) -> C.Construct:
 
         return self._box
 
+    def parse_box(self, data: bytes) -> BoxDict:
+        raise NotImplementedError("Box32ConstructBuilder does not support parsing")
+
+    def parse_boxlist(self, data: bytes) -> T.List[BoxDict]:
+        raise NotImplementedError("Box32ConstructBuilder does not support parsing")
+
+    def build_box(self, box: BoxDict) -> bytes:
+        return self.Box.build(box)
+
+    def build_boxlist(self, boxes: T.Sequence[BoxDict]) -> bytes:
+        return self.BoxList.build(boxes)
+
 
 # pyre-ignore[9]: pyre does not support recursive type SwitchMapType
 CMAP: SwitchMapType = {
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index a718c6a92..424149bc4 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -324,62 +324,68 @@ def transform_mp4(
 ) -> io_utils.ChainedIO:
     # extract ftyp
     src_fp.seek(0)
-    source_ftyp_box_data = parser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
-    source_ftyp_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
-        {"type": b"ftyp", "data": source_ftyp_box_data}
-    )
+    ftyp_data = parser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
 
     # extract moov
     src_fp.seek(0)
-    src_moov_data = parser.parse_mp4_data_firstx(src_fp, [b"moov"])
-    moov_children = _MOOVChildrenParserConstruct.parse_boxlist(src_moov_data)
+    moov_data = parser.parse_mp4_data_firstx(src_fp, [b"moov"])
+    moov_children = _MOOVChildrenParserConstruct.parse_boxlist(moov_data)
 
     # filter tracks in moov
     moov_children = list(_filter_moov_children_boxes(moov_children))
 
     # extract video samples
     source_samples = list(iterate_samples(moov_children))
-    movie_sample_readers = [
+    sample_readers: T.List[io.IOBase] = [
         io_utils.SlicedIO(src_fp, sample.offset, sample.size)
         for sample in source_samples
     ]
     if sample_generator is not None:
-        sample_readers = list(sample_generator(src_fp, moov_children))
-    else:
-        sample_readers = []
+        sample_readers.extend(sample_generator(src_fp, moov_children))
 
     _update_all_trak_tkhd(moov_children)
 
-    # moov_boxes should be immutable since here
+    return build_mp4(ftyp_data, moov_children, sample_readers)
+
+
+def build_mp4(
+    ftyp_data: bytes,
+    moov_children: T.Sequence[BoxDict],
+    sample_readers: T.Iterable[io.IOBase],
+) -> io_utils.ChainedIO:
+    ftyp_box = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
+        {"type": b"ftyp", "data": ftyp_data}
+    )
     mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
+    # moov_children should be immutable since here
+    new_moov_box = _rewrite_moov(len(ftyp_box), moov_children)
     return io_utils.ChainedIO(
         [
-            io.BytesIO(source_ftyp_data),
-            io.BytesIO(_rewrite_moov(len(source_ftyp_data), moov_children)),
+            io.BytesIO(ftyp_box),
+            io.BytesIO(new_moov_box),
             io.BytesIO(_build_mdat_header_bytes(mdat_body_size)),
-            *movie_sample_readers,
             *sample_readers,
         ]
     )
 
 
-def _rewrite_moov(moov_offset: int, moov_boxes: T.Sequence[BoxDict]) -> bytes:
+def _rewrite_moov(moov_offset: int, moov_children: T.Sequence[BoxDict]) -> bytes:
     # build moov for calculating moov size
     sample_offset = 0
-    for box in _filter_trak_boxes(moov_boxes):
+    for box in _filter_trak_boxes(moov_children):
         sample_offset = _update_sbtl(box, sample_offset)
-    moov_data = _build_moov_bytes(moov_boxes)
-    moov_data_size = len(moov_data)
+    moov_bytes = _build_moov_bytes(moov_children)
+    moov_bytes_size = len(moov_bytes)
 
     # mdat header size
-    mdat_body_size = sum(sample.size for sample in iterate_samples(moov_boxes))
+    mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
     mdat_header = _build_mdat_header_bytes(mdat_body_size)
 
     # build moov for real
-    sample_offset = moov_offset + len(moov_data) + len(mdat_header)
-    for box in _filter_trak_boxes(moov_boxes):
+    sample_offset = moov_offset + len(moov_bytes) + len(mdat_header)
+    for box in _filter_trak_boxes(moov_children):
         sample_offset = _update_sbtl(box, sample_offset)
-    moov_data = _build_moov_bytes(moov_boxes)
-    assert len(moov_data) == moov_data_size, f"{len(moov_data)} != {moov_data_size}"
+    moov_bytes = _build_moov_bytes(moov_children)
+    assert len(moov_bytes) == moov_bytes_size, f"{len(moov_bytes)} != {moov_bytes_size}"
 
-    return moov_data
+    return moov_bytes

From be2331e6e441ec384aa9e6af4d97977228b5c590 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Thu, 3 Aug 2023 11:23:29 -0700
Subject: [PATCH 02/14] more naming refactoring

---
 mapillary_tools/geotag/simple_mp4_builder.py | 59 +++++++++++++-------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 424149bc4..75c532759 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -11,6 +11,17 @@
 from .construct_mp4_parser import BoxDict
 from .mp4_sample_parser import RawSample
 
+"""
+Variable naming conventions:
+
+- *_box: a BoxDict
+- *_boxes: a list of BoxDicts
+- *_children: a list of BoxDicts under the parent box
+- *_data: the data in bytes of a box (without the header (type and size))
+- *_typed_data: the data in bytes of a box (with the header (type and size))
+"""
+
+
 UINT32_MAX = 2**32 - 1
 UINT64_MAX = 2**64 - 1
 
@@ -225,7 +236,7 @@ def _update_all_trak_tkhd(moov_chilren: T.Sequence[BoxDict]) -> None:
 )
 
 
-def _update_sbtl(trak: BoxDict, sample_offset: int) -> int:
+def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
     assert trak["type"] == b"trak"
 
     # new samples with offsets updated
@@ -249,8 +260,7 @@ def _update_sbtl(trak: BoxDict, sample_offset: int) -> int:
     stbl_children_boxes = build_stbl_from_raw_samples(
         descriptions, repositioned_samples
     )
-    new_stbl_bytes = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes)
-    stbl_box["data"] = new_stbl_bytes
+    stbl_box["data"] = _STBLChildrenBuilderConstruct.build_boxlist(stbl_children_boxes)
 
     return sample_offset
 
@@ -269,7 +279,7 @@ def iterate_samples(
             yield from raw_samples_iter
 
 
-def _build_mdat_header_bytes(mdat_size: int) -> bytes:
+def _build_mdat_header_data(mdat_size: int) -> bytes:
     if UINT32_MAX < mdat_size + 8:
         return cparser.BoxHeader64.build(
             {
@@ -302,7 +312,7 @@ def find_movie_timescale(moov_children: T.Sequence[BoxDict]) -> int:
     return T.cast(T.Dict, mvhd["data"])["timescale"]
 
 
-def _build_moov_bytes(moov_children: T.Sequence[BoxDict]) -> bytes:
+def _build_moov_typed_data(moov_children: T.Sequence[BoxDict]) -> bytes:
     return cparser.MP4WithoutSTBLBuilderConstruct.build_box(
         {
             "type": b"moov",
@@ -353,39 +363,48 @@ def build_mp4(
     moov_children: T.Sequence[BoxDict],
     sample_readers: T.Iterable[io.IOBase],
 ) -> io_utils.ChainedIO:
-    ftyp_box = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
+    ftyp_typed_data = cparser.MP4WithoutSTBLBuilderConstruct.build_box(
         {"type": b"ftyp", "data": ftyp_data}
     )
     mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
     # moov_children should be immutable since here
-    new_moov_box = _rewrite_moov(len(ftyp_box), moov_children)
+    new_moov_typed_data = _rewrite_and_build_moov_typed_data(
+        len(ftyp_typed_data), moov_children
+    )
     return io_utils.ChainedIO(
         [
-            io.BytesIO(ftyp_box),
-            io.BytesIO(new_moov_box),
-            io.BytesIO(_build_mdat_header_bytes(mdat_body_size)),
+            # ftyp
+            io.BytesIO(ftyp_typed_data),
+            # moov
+            io.BytesIO(new_moov_typed_data),
+            # mdat
+            io.BytesIO(_build_mdat_header_data(mdat_body_size)),
             *sample_readers,
         ]
     )
 
 
-def _rewrite_moov(moov_offset: int, moov_children: T.Sequence[BoxDict]) -> bytes:
+def _rewrite_and_build_moov_typed_data(
+    moov_offset: int, moov_children: T.Sequence[BoxDict]
+) -> bytes:
     # build moov for calculating moov size
     sample_offset = 0
     for box in _filter_trak_boxes(moov_children):
-        sample_offset = _update_sbtl(box, sample_offset)
-    moov_bytes = _build_moov_bytes(moov_children)
-    moov_bytes_size = len(moov_bytes)
+        sample_offset = _update_sbtl_sample_offsets(box, sample_offset)
+    moov_typed_data = _build_moov_typed_data(moov_children)
+    moov_typed_data_size = len(moov_typed_data)
 
     # mdat header size
     mdat_body_size = sum(sample.size for sample in iterate_samples(moov_children))
-    mdat_header = _build_mdat_header_bytes(mdat_body_size)
+    mdat_header_data = _build_mdat_header_data(mdat_body_size)
 
     # build moov for real
-    sample_offset = moov_offset + len(moov_bytes) + len(mdat_header)
+    sample_offset = moov_offset + len(moov_typed_data) + len(mdat_header_data)
     for box in _filter_trak_boxes(moov_children):
-        sample_offset = _update_sbtl(box, sample_offset)
-    moov_bytes = _build_moov_bytes(moov_children)
-    assert len(moov_bytes) == moov_bytes_size, f"{len(moov_bytes)} != {moov_bytes_size}"
+        sample_offset = _update_sbtl_sample_offsets(box, sample_offset)
+    moov_typed_data = _build_moov_typed_data(moov_children)
+    assert (
+        len(moov_typed_data) == moov_typed_data_size
+    ), f"{len(moov_typed_data)} != {moov_typed_data_size}"
 
-    return moov_bytes
+    return moov_typed_data

From 50b533949e64aaaedc93aa1b9b51275bb3f6db00 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Fri, 4 Aug 2023 16:20:50 -0700
Subject: [PATCH 03/14] simplify Sample and RawSample

---
 mapillary_tools/geotag/camm_builder.py       |   2 +-
 mapillary_tools/geotag/camm_parser.py        |   8 +-
 mapillary_tools/geotag/gpmf_parser.py        |  23 ++--
 mapillary_tools/geotag/mp4_sample_parser.py  | 132 +++++++------------
 mapillary_tools/geotag/simple_mp4_builder.py |  11 +-
 mapillary_tools/sample_video.py              |  10 +-
 tests/unit/test_mp4_sample_parser.py         |   6 +-
 tests/unit/test_simple_mp4_builder.py        |  30 ++---
 8 files changed, 96 insertions(+), 126 deletions(-)

diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py
index 5ff61e35b..53740d2c2 100644
--- a/mapillary_tools/geotag/camm_builder.py
+++ b/mapillary_tools/geotag/camm_builder.py
@@ -104,7 +104,7 @@ def convert_points_to_raw_samples(
             offset=0,
             size=len(camm_sample_data),
             timedelta=timedelta,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         )
 
diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index 994769d4d..d653452f0 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -82,12 +82,12 @@ class CAMMType(Enum):
 def _parse_point_from_sample(
     fp: T.BinaryIO, sample: sample_parser.Sample
 ) -> T.Optional[geo.Point]:
-    fp.seek(sample.offset, io.SEEK_SET)
-    data = fp.read(sample.size)
+    fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+    data = fp.read(sample.raw_sample.size)
     box = CAMMSampleData.parse(data)
     if box.type == CAMMType.MIN_GPS.value:
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data[0],
             lon=box.data[1],
             alt=box.data[2],
@@ -97,7 +97,7 @@ def _parse_point_from_sample(
         # Not using box.data.time_gps_epoch as the point timestamp
         # because it is from another clock
         return geo.Point(
-            time=sample.time_offset,
+            time=sample.exact_time,
             lat=box.data.latitude,
             lon=box.data.longitude,
             alt=box.data.altitude,
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index c01cf0ba3..f8a75056e 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -257,8 +257,8 @@ def _extract_dvnm_from_samples(
     dvnm_by_dvid: T.Dict[int, bytes] = {}
 
     for sample in samples:
-        fp.seek(sample.offset, io.SEEK_SET)
-        data = fp.read(sample.size)
+        fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+        data = fp.read(sample.raw_sample.size)
         gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data))
 
         # iterate devices
@@ -281,8 +281,8 @@ def _extract_points_from_samples(
     points_by_dvid: T.Dict[int, T.List[geo.PointWithFix]] = {}
 
     for sample in samples:
-        fp.seek(sample.offset, io.SEEK_SET)
-        data = fp.read(sample.size)
+        fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+        data = fp.read(sample.raw_sample.size)
         gpmf_sample_data = T.cast(T.Dict, GPMFSampleData.parse(data))
 
         # iterate devices
@@ -291,9 +291,9 @@ def _extract_points_from_samples(
             sample_points = _find_first_gps_stream(device["data"])
             if sample_points:
                 # interpolate timestamps in between
-                avg_timedelta = sample.timedelta / len(sample_points)
+                avg_timedelta = sample.exact_timedelta / len(sample_points)
                 for idx, point in enumerate(sample_points):
-                    point.time = sample.time_offset + avg_timedelta * idx
+                    point.time = sample.exact_time + avg_timedelta * idx
 
                 device_id = _find_first_device_id(device["data"])
                 device_points = points_by_dvid.setdefault(device_id, [])
@@ -340,10 +340,9 @@ def _extract_gpmd_samples_from_trak(
     if gpmd_descriptions:
         s.seek(trak_start_offset, io.SEEK_SET)
         samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
-        gpmd_samples = (
-            sample for sample in samples if sample.description["format"] == b"gpmd"
-        )
-        yield from gpmd_samples
+        for sample in samples:
+            if sample.description["format"] == b"gpmd":
+                yield sample
 
 
 def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]:
@@ -398,6 +397,6 @@ def iterate_gpmd_sample_data(fp: T.BinaryIO) -> T.Generator[T.Dict, None, None]:
     for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
         gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
         for sample in gpmd_samples:
-            fp.seek(sample.offset, io.SEEK_SET)
-            data = fp.read(sample.size)
+            fp.seek(sample.raw_sample.offset, io.SEEK_SET)
+            data = fp.read(sample.raw_sample.size)
             yield T.cast(T.Dict, GPMFSampleData.parse(data))
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/geotag/mp4_sample_parser.py
index 4c90e0a30..fa8d52422 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/geotag/mp4_sample_parser.py
@@ -9,47 +9,39 @@
 class RawSample(T.NamedTuple):
     # 1-based index
     description_idx: int
-    # sample offset
+
+    # sample offset (offset from the beginning of the file)
     offset: int
-    # sample size
+
+    # sample size (in bytes)
     size: int
-    # sample_delta read from stts entries,
+
+    # sample_delta read from stts entries that decides when to decode the sample,
     # i.e. STTS(n) in the forumula DT(n+1) = DT(n) + STTS(n)
+    # NOTE: timescale is not applied yet (hence int)
     timedelta: int
-    # sample composition offset,
+
+    # sample composition offset that decides when to present the sample,
     # i.e. CTTS(n) in the forumula CT(n) = DT(n) + CTTS(n).
-    composition_offset: int
+    # NOTE: timescale is not applied yet (hence int)
+    composition_timedelta: int
+
     # if it is a sync sample
     is_sync: bool
 
 
-# TODO: can not inherit RawSample?
 class Sample(T.NamedTuple):
-    # copied from RawSample
+    raw_sample: RawSample
 
-    # 1-based index
-    description_idx: int
-    # sample offset
-    offset: int
-    # sample size
-    size: int
-    # sample delta in seconds read from stts entries,
-    # i.e. (STTS(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n)
-    timedelta: float
-    # sample composition offset in seconds,
-    # i.e. (CTTS(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n).
-    composition_offset: float
-    # if it is a sync sample
-    is_sync: bool
+    # accumulated timedelta in seconds, i.e. DT(n) / timescale
+    exact_time: float
 
-    # extended fields below
+    # accumulated composition timedelta in seconds, i.e. CT(n) / timescale
+    exact_composition_time: float
+
+    # exact timedelta in seconds, i.e. STTS(n) / timescale
+    exact_timedelta: float
 
-    # accumulated sample_delta in seconds,
-    # i.e. (DT(n) / timescale) in the forumula DT(n+1) = DT(n) + STTS(n)
-    time_offset: T.Union[int, float]
-    # accumulated composition offset in seconds,
-    # i.e. (CT(n) / timescale) in the forumula CT(n) = DT(n) + CTTS(n).
-    composition_time_offset: T.Union[int, float]
     # reference to the sample description
     description: T.Dict
 
@@ -59,7 +51,7 @@ def _extract_raw_samples(
     chunk_entries: T.Sequence[T.Dict],
     chunk_offsets: T.Sequence[int],
     timedeltas: T.Sequence[int],
-    composition_offsets: T.Optional[T.Sequence[int]],
+    composition_timedeltas: T.Optional[T.Sequence[int]],
     syncs: T.Optional[T.Set[int]],
 ) -> T.Generator[RawSample, None, None]:
     if not sizes:
@@ -90,9 +82,9 @@ def _extract_raw_samples(
             # iterate samples in this chunk
             for _ in range(entry["samples_per_chunk"]):
                 is_sync = syncs is None or (sample_idx + 1) in syncs
-                composition_offset = (
-                    composition_offsets[sample_idx]
-                    if composition_offsets is not None
+                composition_timedelta = (
+                    composition_timedeltas[sample_idx]
+                    if composition_timedeltas is not None
                     else 0
                 )
                 yield RawSample(
@@ -100,7 +92,7 @@ def _extract_raw_samples(
                     offset=sample_offset,
                     size=sizes[sample_idx],
                     timedelta=timedeltas[sample_idx],
-                    composition_offset=composition_offset,
+                    composition_timedelta=composition_timedelta,
                     is_sync=is_sync,
                 )
                 sample_offset += sizes[sample_idx]
@@ -117,9 +109,9 @@ def _extract_raw_samples(
         # iterate samples in this chunk
         for _ in range(chunk_entries[-1]["samples_per_chunk"]):
             is_sync = syncs is None or (sample_idx + 1) in syncs
-            composition_offset = (
-                composition_offsets[sample_idx]
-                if composition_offsets is not None
+            composition_timedelta = (
+                composition_timedeltas[sample_idx]
+                if composition_timedeltas is not None
                 else 0
             )
             yield RawSample(
@@ -127,7 +119,7 @@ def _extract_raw_samples(
                 offset=sample_offset,
                 size=sizes[sample_idx],
                 timedelta=timedeltas[sample_idx],
-                composition_offset=composition_offset,
+                composition_timedelta=composition_timedelta,
                 is_sync=is_sync,
             )
             sample_offset += sizes[sample_idx]
@@ -138,38 +130,22 @@ def _extract_raw_samples(
 def _extract_samples(
     raw_samples: T.Iterator[RawSample],
     descriptions: T.List,
+    timescale: int,
 ) -> T.Generator[Sample, None, None]:
     acc_delta = 0
     for raw_sample in raw_samples:
         yield Sample(
-            description_idx=raw_sample.description_idx,
-            offset=raw_sample.offset,
-            size=raw_sample.size,
-            timedelta=raw_sample.timedelta,
-            composition_offset=raw_sample.composition_offset,
-            is_sync=raw_sample.is_sync,
+            raw_sample=raw_sample,
             description=descriptions[raw_sample.description_idx - 1],
-            time_offset=acc_delta,
+            exact_time=acc_delta / timescale,
+            exact_timedelta=raw_sample.timedelta / timescale,
             # CT(n) = DT(n) + CTTS(n)
-            composition_time_offset=(acc_delta + raw_sample.composition_offset),
+            exact_composition_time=(acc_delta + raw_sample.composition_timedelta)
+            / timescale,
         )
         acc_delta += raw_sample.timedelta
 
 
-def _apply_timescale(sample: Sample, media_timescale: int) -> Sample:
-    return Sample(
-        description_idx=sample.description_idx,
-        offset=sample.offset,
-        size=sample.size,
-        timedelta=sample.timedelta / media_timescale,
-        composition_offset=sample.composition_offset / media_timescale,
-        is_sync=sample.is_sync,
-        description=sample.description,
-        time_offset=sample.time_offset / media_timescale,
-        composition_time_offset=sample.composition_time_offset / media_timescale,
-    )
-
-
 def parse_raw_samples_from_stbl(
     stbl: T.BinaryIO,
     maxsize: int = -1,
@@ -183,7 +159,7 @@ def parse_raw_samples_from_stbl(
     chunk_offsets = []
     chunk_entries = []
     timedeltas: T.List[int] = []
-    composition_offsets: T.Optional[T.List[int]] = None
+    composition_timedeltas: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
     for h, s in parser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False):
@@ -212,11 +188,11 @@ def parse_raw_samples_from_stbl(
                 for _ in range(entry.sample_count):
                     timedeltas.append(entry.sample_delta)
         elif h.type == b"ctts":
-            composition_offsets = []
+            composition_timedeltas = []
             box = cparser.CompositionTimeToSampleBox.parse(s.read(h.maxsize))
             for entry in box.entries:
                 for _ in range(entry.sample_count):
-                    composition_offsets.append(entry.sample_offset)
+                    composition_timedeltas.append(entry.sample_offset)
         elif h.type == b"stss":
             box = cparser.SyncSampleBox.parse(s.read(h.maxsize))
             syncs = set(box.entries)
@@ -225,12 +201,12 @@ def parse_raw_samples_from_stbl(
     # in this case append 0's to timedeltas
     while len(timedeltas) < len(sizes):
         timedeltas.append(0)
-    if composition_offsets is not None:
-        while len(composition_offsets) < len(sizes):
-            composition_offsets.append(0)
+    if composition_timedeltas is not None:
+        while len(composition_timedeltas) < len(sizes):
+            composition_timedeltas.append(0)
 
     raw_samples = _extract_raw_samples(
-        sizes, chunk_entries, chunk_offsets, timedeltas, composition_offsets, syncs
+        sizes, chunk_entries, chunk_offsets, timedeltas, composition_timedeltas, syncs
     )
     return descriptions, raw_samples
 
@@ -248,7 +224,7 @@ def parse_raw_samples_from_stbl_bytes(
     chunk_offsets = []
     chunk_entries = []
     timedeltas: T.List[int] = []
-    composition_offsets: T.Optional[T.List[int]] = None
+    composition_timedeltas: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
     stbl_boxes = T.cast(T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl))
@@ -275,10 +251,10 @@ def parse_raw_samples_from_stbl_bytes(
                 for _ in range(entry["sample_count"]):
                     timedeltas.append(entry["sample_delta"])
         elif box["type"] == b"ctts":
-            composition_offsets = []
+            composition_timedeltas = []
             for entry in data["entries"]:
                 for _ in range(entry["sample_count"]):
-                    composition_offsets.append(entry["sample_offset"])
+                    composition_timedeltas.append(entry["sample_offset"])
         elif box["type"] == b"stss":
             syncs = set(data["entries"])
 
@@ -286,12 +262,12 @@ def parse_raw_samples_from_stbl_bytes(
     # in this case append 0's to timedeltas
     while len(timedeltas) < len(sizes):
         timedeltas.append(0)
-    if composition_offsets is not None:
-        while len(composition_offsets) < len(sizes):
-            composition_offsets.append(0)
+    if composition_timedeltas is not None:
+        while len(composition_timedeltas) < len(sizes):
+            composition_timedeltas.append(0)
 
     raw_samples = _extract_raw_samples(
-        sizes, chunk_entries, chunk_offsets, timedeltas, composition_offsets, syncs
+        sizes, chunk_entries, chunk_offsets, timedeltas, composition_timedeltas, syncs
     )
     return descriptions, raw_samples
 
@@ -322,10 +298,7 @@ def parse_samples_from_trak(
     )
     descriptions, raw_samples = parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
 
-    yield from (
-        _apply_timescale(s, mdhd["timescale"])
-        for s in _extract_samples(raw_samples, descriptions)
-    )
+    yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"])
 
 
 STSDBoxListConstruct = cparser.Box64ConstructBuilder(
@@ -369,10 +342,7 @@ def parse_samples(self) -> T.Generator[Sample, None, None]:
             T.Dict,
             cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"mdhd"])["data"],
         )
-        yield from (
-            _apply_timescale(s, mdhd["timescale"])
-            for s in _extract_samples(raw_samples, descriptions)
-        )
+        yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"])
 
 
 class MovieBoxParser:
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 75c532759..5a5ece821 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -139,14 +139,15 @@ def _build_stts(sample_deltas: T.Iterable[int]) -> BoxDict:
 class _CompressedSampleCompositionOffset:
     __slots__ = ("sample_count", "sample_offset")
     # make sure dataclasses.asdict() produce the result as CompositionTimeToSampleBox expects
+    # SO DO NOT RENAME THE PROPERTIES BELOW
     sample_count: int
     sample_offset: int
 
 
-def _build_ctts(sample_composition_offsets: T.Iterable[int]) -> BoxDict:
+def _build_ctts(sample_composition_timedeltas: T.Iterable[int]) -> BoxDict:
     # compress offsets
     compressed: T.List[_CompressedSampleCompositionOffset] = []
-    for offset in sample_composition_offsets:
+    for offset in sample_composition_timedeltas:
         if compressed and offset == compressed[-1].sample_offset:
             compressed[-1].sample_count += 1
         else:
@@ -196,8 +197,8 @@ def build_stbl_from_raw_samples(
         # so we can calculate the moov box size in advance
         _build_co64(raw_samples),
     ]
-    if any(s.composition_offset for s in raw_samples):
-        boxes.append(_build_ctts((s.composition_offset for s in raw_samples)))
+    if any(s.composition_timedelta for s in raw_samples):
+        boxes.append(_build_ctts((s.composition_timedelta for s in raw_samples)))
     if any(not s.is_sync for s in raw_samples):
         boxes.append(_build_stss((s.is_sync for s in raw_samples)))
     return boxes
@@ -248,7 +249,7 @@ def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
                 offset=sample_offset,
                 size=sample.size,
                 timedelta=sample.timedelta,
-                composition_offset=sample.composition_offset,
+                composition_timedelta=sample.composition_timedelta,
                 is_sync=sample.is_sync,
             )
         )
diff --git a/mapillary_tools/sample_video.py b/mapillary_tools/sample_video.py
index 6cfdfe121..6c1d80afd 100644
--- a/mapillary_tools/sample_video.py
+++ b/mapillary_tools/sample_video.py
@@ -237,7 +237,7 @@ def _sample_video_stream_by_distance(
     sorted_samples = list(video_track_parser.parse_samples())
     # we need sort sampels by composition time (CT) not the decoding offset (DT)
     # CT is the oder of videos streaming to audiences, as well as the order ffmpeg sampling
-    sorted_samples.sort(key=lambda sample: sample.composition_time_offset)
+    sorted_samples.sort(key=lambda sample: sample.exact_composition_time)
     LOG.info("Found total %d video samples", len(sorted_samples))
 
     # interpolate sample points between the GPS track range (with 1ms buffer)
@@ -251,11 +251,11 @@ def _sample_video_stream_by_distance(
         (
             frame_idx_0based,
             video_sample,
-            interpolator.interpolate(video_sample.composition_time_offset),
+            interpolator.interpolate(video_sample.exact_composition_time),
         )
         for frame_idx_0based, video_sample in enumerate(sorted_samples)
         if _within_track_time_range_buffered(
-            points, video_sample.composition_time_offset
+            points, video_sample.exact_composition_time
         )
     ]
     LOG.info("Found total %d interpolated video samples", len(interp_sample_points))
@@ -350,8 +350,8 @@ def _sample_single_video_by_distance(
 
             video_sample, interp = sample_point
             assert (
-                interp.time == video_sample.composition_time_offset
-            ), f"interpolated time {interp.time} should match the video sample time {video_sample.composition_time_offset}"
+                interp.time == video_sample.exact_composition_time
+            ), f"interpolated time {interp.time} should match the video sample time {video_sample.exact_composition_time}"
 
             timestamp = start_time + datetime.timedelta(seconds=interp.time)
             exif_edit = ExifEdit(sample_paths[0])
diff --git a/tests/unit/test_mp4_sample_parser.py b/tests/unit/test_mp4_sample_parser.py
index 003fae370..1b08bdc6e 100644
--- a/tests/unit/test_mp4_sample_parser.py
+++ b/tests/unit/test_mp4_sample_parser.py
@@ -49,6 +49,6 @@ def test_movie_box_parser():
     }
     assert isinstance(video_track.tkhd(), dict)
     for sample, raw_sample in zip(samples, raw_samples):
-        assert sample.offset == raw_sample.offset
-        assert sample.is_sync == raw_sample.is_sync
-        assert sample.size == raw_sample.size
+        assert sample.raw_sample.offset == raw_sample.offset
+        assert sample.raw_sample.is_sync == raw_sample.is_sync
+        assert sample.raw_sample.size == raw_sample.size
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 88b00cad2..aaeba8efc 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -62,7 +62,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -70,7 +70,7 @@ def test_build_stbl_happy():
             offset=2,
             size=9,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
     ]
@@ -82,7 +82,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -90,7 +90,7 @@ def test_build_stbl_happy():
             offset=2,
             size=2,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
         # another chunk here due to a 1-byte break
@@ -99,7 +99,7 @@ def test_build_stbl_happy():
             offset=5,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -107,7 +107,7 @@ def test_build_stbl_happy():
             offset=6,
             size=9,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
     ]
@@ -119,7 +119,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
         sample_parser.RawSample(
@@ -127,7 +127,7 @@ def test_build_stbl_happy():
             offset=2,
             size=2,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         # another chunk here
@@ -136,7 +136,7 @@ def test_build_stbl_happy():
             offset=4,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         # another chunk here
@@ -145,7 +145,7 @@ def test_build_stbl_happy():
             offset=5,
             size=9,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
     ]
@@ -157,7 +157,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
     ]
@@ -257,7 +257,7 @@ def test_parse_raw_samples_from_stbl():
             offset=1,
             size=1,
             timedelta=20,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -265,7 +265,7 @@ def test_parse_raw_samples_from_stbl():
             offset=2,
             size=2,
             timedelta=30,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
         sample_parser.RawSample(
@@ -273,7 +273,7 @@ def test_parse_raw_samples_from_stbl():
             offset=5,
             size=3,
             timedelta=30,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -281,7 +281,7 @@ def test_parse_raw_samples_from_stbl():
             offset=8,
             size=3,
             timedelta=50,
-            composition_offset=0,
+            composition_timedelta=0,
             is_sync=False,
         ),
     ] == samples

From 89e4fd281fe89344b3a75b59f129230288685cba Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Fri, 4 Aug 2023 18:06:44 -0700
Subject: [PATCH 04/14] rename: import simple_mp4_parser as sparser

---
 mapillary_tools/geotag/blackvue_parser.py     |  8 ++++----
 mapillary_tools/geotag/camm_parser.py         | 16 +++++++--------
 .../geotag/geotag_videos_from_video.py        |  8 ++++----
 mapillary_tools/geotag/gpmf_parser.py         |  8 ++++----
 mapillary_tools/geotag/mp4_sample_parser.py   | 12 +++++------
 mapillary_tools/geotag/simple_mp4_builder.py  |  6 +++---
 tests/cli/simple_mp4_parser.py                | 20 +++++++++----------
 tests/unit/test_simple_mp4_builder.py         |  4 ++--
 tests/unit/test_simple_mp4_parser.py          |  8 ++++----
 9 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/mapillary_tools/geotag/blackvue_parser.py b/mapillary_tools/geotag/blackvue_parser.py
index a34d53278..812dc70a3 100644
--- a/mapillary_tools/geotag/blackvue_parser.py
+++ b/mapillary_tools/geotag/blackvue_parser.py
@@ -7,7 +7,7 @@
 import pynmea2
 
 from .. import geo
-from . import simple_mp4_parser
+from . import simple_mp4_parser as sparser
 
 
 LOG = logging.getLogger(__name__)
@@ -55,8 +55,8 @@ def _parse_gps_box(gps_data: bytes) -> T.Generator[geo.Point, None, None]:
 
 def extract_camera_model(fp: T.BinaryIO) -> str:
     try:
-        cprt_bytes = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"cprt"])
-    except simple_mp4_parser.ParsingError:
+        cprt_bytes = sparser.parse_mp4_data_first(fp, [b"free", b"cprt"])
+    except sparser.ParsingError:
         return ""
 
     if cprt_bytes is None:
@@ -91,7 +91,7 @@ def extract_camera_model(fp: T.BinaryIO) -> str:
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
-    gps_data = simple_mp4_parser.parse_mp4_data_first(fp, [b"free", b"gps "])
+    gps_data = sparser.parse_mp4_data_first(fp, [b"free", b"gps "])
     if gps_data is None:
         return None
 
diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index d653452f0..c529839ae 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -13,7 +13,7 @@
     construct_mp4_parser as cparser,
     geo,
     mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 
@@ -170,7 +170,7 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
     media_timescale = None
     elst_entries = None
 
-    for h, s in parser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]):
+    for h, s in sparser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]):
         if h.type == b"trak":
             trak_start_offset = s.tell()
 
@@ -191,14 +191,14 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
                 points = [p for p in points_with_nones if p is not None]
                 if points:
                     s.seek(trak_start_offset)
-                    elst_data = parser.parse_box_data_first(
+                    elst_data = sparser.parse_box_data_first(
                         s, [b"edts", b"elst"], maxsize=h.maxsize
                     )
                     if elst_data is not None:
                         elst_entries = cparser.EditBox.parse(elst_data)["entries"]
 
                     s.seek(trak_start_offset)
-                    mdhd_data = parser.parse_box_data_firstx(
+                    mdhd_data = sparser.parse_box_data_firstx(
                         s, [b"mdia", b"mdhd"], maxsize=h.maxsize
                     )
                     mdhd = cparser.MediaHeaderBox.parse(mdhd_data)
@@ -238,7 +238,7 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.Point]:
 )
 
 
-def _decode_quietly(data: bytes, h: parser.Header) -> str:
+def _decode_quietly(data: bytes, h: sparser.Header) -> str:
     try:
         return data.decode("utf-8")
     except UnicodeDecodeError:
@@ -246,7 +246,7 @@ def _decode_quietly(data: bytes, h: parser.Header) -> str:
         return ""
 
 
-def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
+def _parse_quietly(data: bytes, h: sparser.Header) -> bytes:
     try:
         parsed = MakeOrModel.parse(data)
     except C.ConstructError:
@@ -256,7 +256,7 @@ def _parse_quietly(data: bytes, h: parser.Header) -> bytes:
 
 
 def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
-    header_and_stream = parser.parse_path(
+    header_and_stream = sparser.parse_path(
         fp,
         [
             b"moov",
@@ -296,7 +296,7 @@ def extract_camera_make_and_model(fp: T.BinaryIO) -> T.Tuple[str, str]:
             # quit when both found
             if make and model:
                 break
-    except parser.ParsingError:
+    except sparser.ParsingError:
         pass
 
     if make:
diff --git a/mapillary_tools/geotag/geotag_videos_from_video.py b/mapillary_tools/geotag/geotag_videos_from_video.py
index 77be8c6f4..42846ffc0 100644
--- a/mapillary_tools/geotag/geotag_videos_from_video.py
+++ b/mapillary_tools/geotag/geotag_videos_from_video.py
@@ -12,7 +12,7 @@
     camm_parser,
     gpmf_gps_filter,
     gpmf_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
     utils as video_utils,
 )
 from .geotag_from_generic import GeotagVideosFromGeneric
@@ -77,7 +77,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = camm_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None:
@@ -100,7 +100,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points_with_fix = gpmf_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points_with_fix = None
 
                 if points_with_fix is not None:
@@ -123,7 +123,7 @@ def _extract_video_metadata(
             with video_path.open("rb") as fp:
                 try:
                     points = blackvue_parser.extract_points(fp)
-                except parser.ParsingError:
+                except sparser.ParsingError:
                     points = None
 
                 if points is not None:
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index f8a75056e..f4f984bea 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -11,7 +11,7 @@
 import construct as C
 
 from .. import geo
-from . import mp4_sample_parser as sample_parser, simple_mp4_parser as parser
+from . import mp4_sample_parser as sample_parser, simple_mp4_parser as sparser
 
 """
 Parsing GPS from GPMF data format stored in GoPros. See the GPMF spec: https://github.com/gopro/gpmf-parser
@@ -309,7 +309,7 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     otherwise None
     """
     points = None
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         trak_start_offset = s.tell()
         descriptions = _extract_gpmd_descriptions_from_trak(s, h.maxsize)
         if descriptions:
@@ -346,7 +346,7 @@ def _extract_gpmd_samples_from_trak(
 
 
 def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]:
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
         device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
         if device_names:
@@ -394,7 +394,7 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.PointWithFix]:
 
 
 def iterate_gpmd_sample_data(fp: T.BinaryIO) -> T.Generator[T.Dict, None, None]:
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
         for sample in gpmd_samples:
             fp.seek(sample.raw_sample.offset, io.SEEK_SET)
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/geotag/mp4_sample_parser.py
index fa8d52422..c01d06376 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/geotag/mp4_sample_parser.py
@@ -3,7 +3,7 @@
 import typing as T
 from pathlib import Path
 
-from . import construct_mp4_parser as cparser, simple_mp4_parser as parser
+from . import construct_mp4_parser as cparser, simple_mp4_parser as sparser
 
 
 class RawSample(T.NamedTuple):
@@ -162,7 +162,7 @@ def parse_raw_samples_from_stbl(
     composition_timedeltas: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
-    for h, s in parser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False):
+    for h, s in sparser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False):
         if h.type == b"stsd":
             box = cparser.SampleDescriptionBox.parse(s.read(h.maxsize))
             descriptions = list(box.entries)
@@ -273,7 +273,7 @@ def parse_raw_samples_from_stbl_bytes(
 
 
 def parse_descriptions_from_trak(trak: T.BinaryIO, maxsize: int = -1) -> T.List[T.Dict]:
-    data = parser.parse_box_data_first(
+    data = sparser.parse_box_data_first(
         trak, [b"mdia", b"minf", b"stbl", b"stsd"], maxsize=maxsize
     )
     if data is None:
@@ -289,11 +289,11 @@ def parse_samples_from_trak(
     trak_start_offset = trak.tell()
 
     trak.seek(trak_start_offset, io.SEEK_SET)
-    mdhd_box = parser.parse_box_data_firstx(trak, [b"mdia", b"mdhd"], maxsize=maxsize)
+    mdhd_box = sparser.parse_box_data_firstx(trak, [b"mdia", b"mdhd"], maxsize=maxsize)
     mdhd = T.cast(T.Dict, cparser.MediaHeaderBox.parse(mdhd_box))
 
     trak.seek(trak_start_offset, io.SEEK_SET)
-    h, s = parser.parse_box_path_firstx(
+    h, s = sparser.parse_box_path_firstx(
         trak, [b"mdia", b"minf", b"stbl"], maxsize=maxsize
     )
     descriptions, raw_samples = parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
@@ -357,7 +357,7 @@ def __init__(self, moov: bytes):
     @classmethod
     def parse_file(cls, video_path: Path) -> "MovieBoxParser":
         with video_path.open("rb") as fp:
-            moov = parser.parse_box_data_firstx(fp, [b"moov"])
+            moov = sparser.parse_box_data_firstx(fp, [b"moov"])
         return MovieBoxParser(moov)
 
     def mvhd(self):
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 5a5ece821..e4fce07ce 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -6,7 +6,7 @@
     construct_mp4_parser as cparser,
     io_utils,
     mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 from .construct_mp4_parser import BoxDict
 from .mp4_sample_parser import RawSample
@@ -335,11 +335,11 @@ def transform_mp4(
 ) -> io_utils.ChainedIO:
     # extract ftyp
     src_fp.seek(0)
-    ftyp_data = parser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
+    ftyp_data = sparser.parse_mp4_data_firstx(src_fp, [b"ftyp"])
 
     # extract moov
     src_fp.seek(0)
-    moov_data = parser.parse_mp4_data_firstx(src_fp, [b"moov"])
+    moov_data = sparser.parse_mp4_data_firstx(src_fp, [b"moov"])
     moov_children = _MOOVChildrenParserConstruct.parse_boxlist(moov_data)
 
     # filter tracks in moov
diff --git a/tests/cli/simple_mp4_parser.py b/tests/cli/simple_mp4_parser.py
index 3fd6ae524..c15d844ca 100644
--- a/tests/cli/simple_mp4_parser.py
+++ b/tests/cli/simple_mp4_parser.py
@@ -9,7 +9,7 @@
 from mapillary_tools.geotag import (
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 LOG = logging.getLogger(__name__)
@@ -37,7 +37,7 @@ def _validate_samples(
     samples: T.List[sample_parser.RawSample] = []
 
     with open(path, "rb") as fp:
-        for h, s in parser.parse_path(
+        for h, s in sparser.parse_path(
             fp, [b"moov", b"trak", b"mdia", b"minf", b"stbl"]
         ):
             (
@@ -67,7 +67,7 @@ def _validate_samples(
 
 
 def _parse_structs(fp: T.BinaryIO):
-    for h, d, s in parser.parse_boxes_recursive(fp, box_list_types=box_list_types):
+    for h, d, s in sparser.parse_boxes_recursive(fp, box_list_types=box_list_types):
         margin = "\t" * d
         if h.size32 == 0:
             header = f"{str(h.type)} {h.box_size} (open-ended):"
@@ -86,7 +86,7 @@ def _parse_structs(fp: T.BinaryIO):
 
 
 def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]):
-    for h, s in parser.parse_path(fp, box_type_path):
+    for h, s in sparser.parse_path(fp, box_type_path):
         max_chunk_size = 1024
         read = 0
         while read < h.maxsize or h.maxsize == -1:
@@ -103,9 +103,9 @@ def _dump_box_data_at(fp: T.BinaryIO, box_type_path: T.List[bytes]):
 
 
 def _parse_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None):
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         offset = s.tell()
-        for h1, s1 in parser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize):
+        for h1, s1 in sparser.parse_path(s, [b"mdia", b"mdhd"], maxsize=h.maxsize):
             box = cparser.MediaHeaderBox.parse(s1.read(h.maxsize))
             LOG.info(box)
             LOG.info(sample_parser.to_datetime(box.creation_time))
@@ -117,7 +117,7 @@ def _parse_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = Non
 
 
 def _dump_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None):
-    for h, s in parser.parse_path(fp, [b"moov", b"trak"]):
+    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
         for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize):
             if filters is None or sample.description["format"] in filters:
                 fp.seek(sample.offset, io.SEEK_SET)
@@ -203,13 +203,13 @@ def _process_path(parsed_args, path: pathlib.Path):
                     if box_path is None:
                         _parse_structs(fp)
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         _parse_structs(io.BytesIO(data))
                 elif parsed_args.full:
                     if box_path is None:
                         boxes = cparser.MP4ParserConstruct.BoxList.parse_stream(fp)
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         boxes = cparser.MP4ParserConstruct.BoxList.parse_stream(
                             io.BytesIO(data)
                         )
@@ -222,7 +222,7 @@ def _process_path(parsed_args, path: pathlib.Path):
                             )
                         )
                     else:
-                        data = parser.parse_mp4_data_firstx(fp, box_path)
+                        data = sparser.parse_mp4_data_firstx(fp, box_path)
                         boxes = (
                             cparser.MP4WithoutSTBLParserConstruct.BoxList.parse_stream(
                                 io.BytesIO(data)
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index aaeba8efc..e6edabe47 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -5,7 +5,7 @@
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
     simple_mp4_builder as builder,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 
@@ -44,7 +44,7 @@ def _build_and_parse_stbl(
     d = cparser.Box32ConstructBuilder({b"stbl": cparser.CMAP[b"stbl"]}).Box.build(
         {"type": b"stbl", "data": s}
     )
-    ss = parser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
+    ss = sparser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
     assert d[8:] == ss
     _, parsed_samples = sample_parser.parse_raw_samples_from_stbl(io.BytesIO(ss))
     assert expected_samples == list(parsed_samples)
diff --git a/tests/unit/test_simple_mp4_parser.py b/tests/unit/test_simple_mp4_parser.py
index 19701398c..eaeb7142b 100644
--- a/tests/unit/test_simple_mp4_parser.py
+++ b/tests/unit/test_simple_mp4_parser.py
@@ -3,7 +3,7 @@
 
 from mapillary_tools.geotag import (
     construct_mp4_parser as cparser,
-    simple_mp4_parser as parser,
+    simple_mp4_parser as sparser,
 )
 
 
@@ -26,7 +26,7 @@ def _parse(data: bytes):
     }
     consumed_size = 0
     ret = []
-    for h, _d, s in parser.parse_boxes_recursive(
+    for h, _d, s in sparser.parse_boxes_recursive(
         io.BytesIO(data), box_list_types=box_list_types
     ):
         box_data = s.read(h.maxsize)
@@ -42,7 +42,7 @@ def _parse(data: bytes):
 
 def _assert_box_type(
     data: bytes,
-    parsed: typing.List[typing.Tuple[parser.Header, bytes]],
+    parsed: typing.List[typing.Tuple[sparser.Header, bytes]],
     box_type: bytes,
 ):
     assert 1 == len(parsed)
@@ -55,7 +55,7 @@ def _assert_box_type(
 
 def test_parse_box_header():
     s = io.BytesIO(b"hello")
-    header = parser.parse_box_header(s, maxsize=0)
+    header = sparser.parse_box_header(s, maxsize=0)
     assert header.header_size == 0
     assert header.box_size == 0
     assert header.type == b""

From 74fd1d110d80064b86560b2e66ad369ad8c001da Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Thu, 10 Aug 2023 18:25:49 -0400
Subject: [PATCH 05/14] refactor mp4 sample parser

---
 mapillary_tools/geotag/camm_parser.py        |   2 +-
 mapillary_tools/geotag/gpmf_parser.py        |   2 +-
 mapillary_tools/geotag/mp4_sample_parser.py  | 170 ++++++-------------
 mapillary_tools/geotag/simple_mp4_builder.py |   4 +-
 tests/cli/simple_mp4_parser.py               |  12 +-
 tests/unit/test_simple_mp4_builder.py        | 156 +++++++++--------
 6 files changed, 145 insertions(+), 201 deletions(-)

diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index c529839ae..35553364e 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -152,7 +152,7 @@ def _extract_camm_samples(
     s: T.BinaryIO,
     maxsize: int = -1,
 ) -> T.Generator[sample_parser.Sample, None, None]:
-    samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
+    samples = sample_parser.parse_samples_from_trak_DEPRECATED(s, maxsize=maxsize)
     camm_samples = (
         sample for sample in samples if sample.description["format"] == b"camm"
     )
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index f4f984bea..3deb7fe00 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -339,7 +339,7 @@ def _extract_gpmd_samples_from_trak(
     gpmd_descriptions = _extract_gpmd_descriptions_from_trak(s, maxsize=maxsize)
     if gpmd_descriptions:
         s.seek(trak_start_offset, io.SEEK_SET)
-        samples = sample_parser.parse_samples_from_trak(s, maxsize=maxsize)
+        samples = sample_parser.parse_samples_from_trak_DEPRECATED(s, maxsize=maxsize)
         for sample in samples:
             if sample.description["format"] == b"gpmd":
                 yield sample
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/geotag/mp4_sample_parser.py
index c01d06376..79dcc7d86 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/geotag/mp4_sample_parser.py
@@ -1,5 +1,4 @@
 import datetime
-import io
 import typing as T
 from pathlib import Path
 
@@ -146,77 +145,12 @@ def _extract_samples(
         acc_delta += raw_sample.timedelta
 
 
-def parse_raw_samples_from_stbl(
-    stbl: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]:
-    """
-    DEPRECATED: use parse_raw_samples_from_stbl_bytes instead
-    """
-
-    descriptions = []
-    sizes = []
-    chunk_offsets = []
-    chunk_entries = []
-    timedeltas: T.List[int] = []
-    composition_timedeltas: T.Optional[T.List[int]] = None
-    syncs: T.Optional[T.Set[int]] = None
-
-    for h, s in sparser.parse_boxes(stbl, maxsize=maxsize, extend_eof=False):
-        if h.type == b"stsd":
-            box = cparser.SampleDescriptionBox.parse(s.read(h.maxsize))
-            descriptions = list(box.entries)
-        elif h.type == b"stsz":
-            box = cparser.SampleSizeBox.parse(s.read(h.maxsize))
-            if box.sample_size == 0:
-                sizes = list(box.entries)
-            else:
-                sizes = [box.sample_size for _ in range(box.sample_count)]
-        elif h.type == b"stco":
-            box = cparser.ChunkOffsetBox.parse(s.read(h.maxsize))
-            chunk_offsets = list(box.entries)
-        elif h.type == b"co64":
-            box = cparser.ChunkLargeOffsetBox.parse(s.read(h.maxsize))
-            chunk_offsets = list(box.entries)
-        elif h.type == b"stsc":
-            box = cparser.SampleToChunkBox.parse(s.read(h.maxsize))
-            chunk_entries = list(box.entries)
-        elif h.type == b"stts":
-            timedeltas = []
-            box = cparser.TimeToSampleBox.parse(s.read(h.maxsize))
-            for entry in box.entries:
-                for _ in range(entry.sample_count):
-                    timedeltas.append(entry.sample_delta)
-        elif h.type == b"ctts":
-            composition_timedeltas = []
-            box = cparser.CompositionTimeToSampleBox.parse(s.read(h.maxsize))
-            for entry in box.entries:
-                for _ in range(entry.sample_count):
-                    composition_timedeltas.append(entry.sample_offset)
-        elif h.type == b"stss":
-            box = cparser.SyncSampleBox.parse(s.read(h.maxsize))
-            syncs = set(box.entries)
-
-    # some stbl have less timedeltas than the sample count i.e. len(sizes),
-    # in this case append 0's to timedeltas
-    while len(timedeltas) < len(sizes):
-        timedeltas.append(0)
-    if composition_timedeltas is not None:
-        while len(composition_timedeltas) < len(sizes):
-            composition_timedeltas.append(0)
-
-    raw_samples = _extract_raw_samples(
-        sizes, chunk_entries, chunk_offsets, timedeltas, composition_timedeltas, syncs
-    )
-    return descriptions, raw_samples
-
-
 STBLBoxlistConstruct = cparser.Box64ConstructBuilder(
     T.cast(cparser.SwitchMapType, cparser.CMAP[b"stbl"])
 ).BoxList
 
 
-def parse_raw_samples_from_stbl_bytes(
+def parse_raw_samples_from_stbl_data(
     stbl: bytes,
 ) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]:
     descriptions = []
@@ -227,9 +161,11 @@ def parse_raw_samples_from_stbl_bytes(
     composition_timedeltas: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
-    stbl_boxes = T.cast(T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl))
+    stbl_children = T.cast(
+        T.Sequence[cparser.BoxDict], STBLBoxlistConstruct.parse(stbl)
+    )
 
-    for box in stbl_boxes:
+    for box in stbl_children:
         data: T.Dict = T.cast(T.Dict, box["data"])
 
         if box["type"] == b"stsd":
@@ -272,86 +208,70 @@ def parse_raw_samples_from_stbl_bytes(
     return descriptions, raw_samples
 
 
-def parse_descriptions_from_trak(trak: T.BinaryIO, maxsize: int = -1) -> T.List[T.Dict]:
-    data = sparser.parse_box_data_first(
-        trak, [b"mdia", b"minf", b"stbl", b"stsd"], maxsize=maxsize
-    )
-    if data is None:
-        return []
-    box = cparser.SampleDescriptionBox.parse(data)
-    return list(box.entries)
-
-
-def parse_samples_from_trak(
-    trak: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[Sample, None, None]:
-    trak_start_offset = trak.tell()
-
-    trak.seek(trak_start_offset, io.SEEK_SET)
-    mdhd_box = sparser.parse_box_data_firstx(trak, [b"mdia", b"mdhd"], maxsize=maxsize)
-    mdhd = T.cast(T.Dict, cparser.MediaHeaderBox.parse(mdhd_box))
-
-    trak.seek(trak_start_offset, io.SEEK_SET)
-    h, s = sparser.parse_box_path_firstx(
-        trak, [b"mdia", b"minf", b"stbl"], maxsize=maxsize
-    )
-    descriptions, raw_samples = parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
-
-    yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"])
-
-
-STSDBoxListConstruct = cparser.Box64ConstructBuilder(
+_STSDBoxListConstruct = cparser.Box64ConstructBuilder(
     # pyre-ignore[6]: pyre does not support recursive type SwitchMapType
     {b"stsd": cparser.CMAP[b"stsd"]}
 ).BoxList
 
 
 class TrackBoxParser:
-    trak_boxes: T.Sequence[cparser.BoxDict]
+    trak_children: T.Sequence[cparser.BoxDict]
     stbl_data: bytes
 
-    def __init__(self, trak_boxes: T.Sequence[cparser.BoxDict]):
-        self.trak_boxes = trak_boxes
-        stbl = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"minf", b"stbl"])
+    def __init__(self, trak_children: T.Sequence[cparser.BoxDict]):
+        self.trak_children = trak_children
+        stbl = cparser.find_box_at_pathx(
+            self.trak_children, [b"mdia", b"minf", b"stbl"]
+        )
         self.stbl_data = T.cast(bytes, stbl["data"])
 
     def tkhd(self) -> T.Dict:
         return T.cast(
-            T.Dict, cparser.find_box_at_pathx(self.trak_boxes, [b"tkhd"])["data"]
+            T.Dict, cparser.find_box_at_pathx(self.trak_children, [b"tkhd"])["data"]
         )
 
     def is_video_track(self) -> bool:
-        hdlr = cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"hdlr"])
+        hdlr = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"hdlr"])
         return T.cast(T.Dict[str, T.Any], hdlr["data"])["handler_type"] == b"vide"
 
-    def parse_sample_description(self) -> T.Dict:
-        boxes = STSDBoxListConstruct.parse(self.stbl_data)
+    def parse_sample_descriptions(self) -> T.List[T.Dict]:
+        # TODO: return [] if parsing fail
+        boxes = _STSDBoxListConstruct.parse(self.stbl_data)
         stsd = cparser.find_box_at_pathx(
             T.cast(T.Sequence[cparser.BoxDict], boxes), [b"stsd"]
         )
-        return T.cast(T.Dict, stsd["data"])
+        return T.cast(T.List[T.Dict], T.cast(T.Dict, stsd["data"])["entries"])
+
+    def extract_elst_boxdata(self) -> T.Optional[T.Dict]:
+        box = cparser.find_box_at_path(self.trak_children, [b"edts", b"elst"])
+        if box is None:
+            return None
+        return T.cast(T.Dict, box["data"])
+
+    def extract_mdhd_boxdata(self) -> T.Dict:
+        box = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])
+        return T.cast(T.Dict, box["data"])
 
     def parse_raw_samples(self) -> T.Generator[RawSample, None, None]:
-        _, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data)
+        _, raw_samples = parse_raw_samples_from_stbl_data(self.stbl_data)
         yield from raw_samples
 
     def parse_samples(self) -> T.Generator[Sample, None, None]:
-        descriptions, raw_samples = parse_raw_samples_from_stbl_bytes(self.stbl_data)
+        descriptions, raw_samples = parse_raw_samples_from_stbl_data(self.stbl_data)
         mdhd = T.cast(
             T.Dict,
-            cparser.find_box_at_pathx(self.trak_boxes, [b"mdia", b"mdhd"])["data"],
+            cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])["data"],
         )
         yield from _extract_samples(raw_samples, descriptions, mdhd["timescale"])
 
 
 class MovieBoxParser:
-    moov_boxes: T.Sequence[cparser.BoxDict]
+    moov_children: T.Sequence[cparser.BoxDict]
 
-    def __init__(self, moov: bytes):
-        self.moov_boxes = T.cast(
+    def __init__(self, moov_data: bytes):
+        self.moov_children = T.cast(
             T.Sequence[cparser.BoxDict],
-            cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov),
+            cparser.MOOVWithoutSTBLBuilderConstruct.BoxList.parse(moov_data),
         )
 
     @classmethod
@@ -360,12 +280,17 @@ def parse_file(cls, video_path: Path) -> "MovieBoxParser":
             moov = sparser.parse_box_data_firstx(fp, [b"moov"])
         return MovieBoxParser(moov)
 
-    def mvhd(self):
-        mvhd = cparser.find_box_at_pathx(self.moov_boxes, [b"mvhd"])
-        return mvhd["data"]
+    @classmethod
+    def parse_stream(cls, stream: T.BinaryIO) -> "MovieBoxParser":
+        moov = sparser.parse_box_data_firstx(stream, [b"moov"])
+        return MovieBoxParser(moov)
+
+    def mvhd(self) -> T.Dict:
+        mvhd = cparser.find_box_at_pathx(self.moov_children, [b"mvhd"])
+        return T.cast(T.Dict, mvhd["data"])
 
     def parse_tracks(self) -> T.Generator[TrackBoxParser, None, None]:
-        for box in self.moov_boxes:
+        for box in self.moov_children:
             if box["type"] == b"trak":
                 yield TrackBoxParser(T.cast(T.Sequence[cparser.BoxDict], box["data"]))
 
@@ -374,16 +299,17 @@ def parse_track_at(self, stream_idx: int) -> TrackBoxParser:
         stream_idx should be the stream_index specifier. See http://ffmpeg.org/ffmpeg.html#Stream-specifiers-1
         > Stream numbering is based on the order of the streams as detected by libavformat
         """
-        trak_boxes = [box for box in self.moov_boxes if box["type"] == b"trak"]
+        trak_boxes = [box for box in self.moov_children if box["type"] == b"trak"]
         if not (0 <= stream_idx < len(trak_boxes)):
             raise IndexError(
                 "unable to read stream at %d from the track list (length %d)",
                 stream_idx,
                 len(trak_boxes),
             )
-        return TrackBoxParser(
-            T.cast(T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"])
+        trak_children = T.cast(
+            T.Sequence[cparser.BoxDict], trak_boxes[stream_idx]["data"]
         )
+        return TrackBoxParser(trak_children)
 
 
 _DT_1904 = datetime.datetime.utcfromtimestamp(0).replace(year=1904)
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index e4fce07ce..a3f097196 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -255,7 +255,7 @@ def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
         )
         sample_offset += sample.size
     stbl_box = cparser.find_box_at_pathx(trak, [b"trak", b"mdia", b"minf", b"stbl"])
-    descriptions, _ = sample_parser.parse_raw_samples_from_stbl(
+    descriptions, _ = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
         io.BytesIO(T.cast(bytes, stbl_box["data"]))
     )
     stbl_children_boxes = build_stbl_from_raw_samples(
@@ -274,7 +274,7 @@ def iterate_samples(
             stbl_box = cparser.find_box_at_pathx(
                 box, [b"trak", b"mdia", b"minf", b"stbl"]
             )
-            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl(
+            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
                 io.BytesIO(T.cast(bytes, stbl_box["data"]))
             )
             yield from raw_samples_iter
diff --git a/tests/cli/simple_mp4_parser.py b/tests/cli/simple_mp4_parser.py
index c15d844ca..74f24056d 100644
--- a/tests/cli/simple_mp4_parser.py
+++ b/tests/cli/simple_mp4_parser.py
@@ -43,7 +43,9 @@ def _validate_samples(
             (
                 descriptions,
                 raw_samples,
-            ) = sample_parser.parse_raw_samples_from_stbl(s, maxsize=h.maxsize)
+            ) = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
+                s, maxsize=h.maxsize
+            )
             samples.extend(
                 sample
                 for sample in raw_samples
@@ -111,14 +113,18 @@ def _parse_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = Non
             LOG.info(sample_parser.to_datetime(box.creation_time))
             LOG.info(box.duration / box.timescale)
         s.seek(offset, io.SEEK_SET)
-        for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize):
+        for sample in sample_parser.parse_samples_from_trak_DEPRECATED(
+            s, maxsize=h.maxsize
+        ):
             if filters is None or sample.description["format"] in filters:
                 print(sample)
 
 
 def _dump_samples(fp: T.BinaryIO, filters: T.Optional[T.Container[bytes]] = None):
     for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
-        for sample in sample_parser.parse_samples_from_trak(s, maxsize=h.maxsize):
+        for sample in sample_parser.parse_samples_from_trak_DEPRECATED(
+            s, maxsize=h.maxsize
+        ):
             if filters is None or sample.description["format"] in filters:
                 fp.seek(sample.offset, io.SEEK_SET)
                 data = fp.read(sample.size)
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index e6edabe47..27cbbd84c 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -46,7 +46,11 @@ def _build_and_parse_stbl(
     )
     ss = sparser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
     assert d[8:] == ss
-    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl(io.BytesIO(ss))
+    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
+        io.BytesIO(ss)
+    )
+    assert expected_samples == list(parsed_samples)
+    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl_data(ss)
     assert expected_samples == list(parsed_samples)
 
 
@@ -247,80 +251,88 @@ def test_parse_raw_samples_from_stbl():
             },
         ]
     )
-    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl(
+
+    def _verify_samples(descs, samples):
+        assert [
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=1,
+                size=1,
+                timedelta=20,
+                composition_timedelta=0,
+                is_sync=True,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=2,
+                size=2,
+                timedelta=30,
+                composition_timedelta=0,
+                is_sync=False,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=5,
+                size=3,
+                timedelta=30,
+                composition_timedelta=0,
+                is_sync=True,
+            ),
+            sample_parser.RawSample(
+                description_idx=1,
+                offset=8,
+                size=3,
+                timedelta=50,
+                composition_timedelta=0,
+                is_sync=False,
+            ),
+        ] == samples
+        d = builder.build_stbl_from_raw_samples(descs, samples)
+        assert d[1:] == [
+            {
+                "data": {
+                    "entries": [
+                        {"sample_count": 1, "sample_delta": 20},
+                        {"sample_count": 2, "sample_delta": 30},
+                        {"sample_count": 1, "sample_delta": 50},
+                    ]
+                },
+                "type": b"stts",
+            },
+            {
+                "data": {
+                    "entries": [
+                        {
+                            "first_chunk": 1,
+                            "sample_description_index": 1,
+                            "samples_per_chunk": 2,
+                        },
+                        {
+                            "first_chunk": 2,
+                            "sample_description_index": 1,
+                            "samples_per_chunk": 2,
+                        },
+                    ]
+                },
+                "type": b"stsc",
+            },
+            {
+                "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0},
+                "type": b"stsz",
+            },
+            {"data": {"entries": [1, 5]}, "type": b"co64"},
+            {"data": {"entries": [1, 3]}, "type": b"stss"},
+        ]
+
+    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
         io.BytesIO(stbl_bytes)
     )
     samples = list(sample_iter)
-    assert [
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=1,
-            size=1,
-            timedelta=20,
-            composition_timedelta=0,
-            is_sync=True,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=2,
-            size=2,
-            timedelta=30,
-            composition_timedelta=0,
-            is_sync=False,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=5,
-            size=3,
-            timedelta=30,
-            composition_timedelta=0,
-            is_sync=True,
-        ),
-        sample_parser.RawSample(
-            description_idx=1,
-            offset=8,
-            size=3,
-            timedelta=50,
-            composition_timedelta=0,
-            is_sync=False,
-        ),
-    ] == samples
-    d = builder.build_stbl_from_raw_samples(descs, samples)
-    assert d[1:] == [
-        {
-            "data": {
-                "entries": [
-                    {"sample_count": 1, "sample_delta": 20},
-                    {"sample_count": 2, "sample_delta": 30},
-                    {"sample_count": 1, "sample_delta": 50},
-                ]
-            },
-            "type": b"stts",
-        },
-        {
-            "data": {
-                "entries": [
-                    {
-                        "first_chunk": 1,
-                        "sample_description_index": 1,
-                        "samples_per_chunk": 2,
-                    },
-                    {
-                        "first_chunk": 2,
-                        "sample_description_index": 1,
-                        "samples_per_chunk": 2,
-                    },
-                ]
-            },
-            "type": b"stsc",
-        },
-        {
-            "data": {"entries": [1, 2, 3, 3], "sample_count": 4, "sample_size": 0},
-            "type": b"stsz",
-        },
-        {"data": {"entries": [1, 5]}, "type": b"co64"},
-        {"data": {"entries": [1, 3]}, "type": b"stss"},
-    ]
+    _verify_samples(descs, samples)
+
+    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl_data(stbl_bytes)
+    samples = list(sample_iter)
+    _verify_samples(descs, samples)
 
 
 def test_box_header_0_building():

From d42e58eed6ea6fedd7cfa7e38c9a502f20387038 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Thu, 10 Aug 2023 18:26:47 -0400
Subject: [PATCH 06/14] refactor camm parser

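---
Review notes (below the three-dash line, so not part of the commit message):

  * camm_parser.extract_points() now builds a MovieBoxParser from the stream
    once and walks its tracks, replacing the sparser.parse_path() loop that
    re-seeked each trak stream for descriptions, samples, elst and mdhd.
  * _extract_camm_samples() is removed; _is_camm_description() is added and
    used for both the description check and the per-sample filter.
  * find_box_at_pathx() becomes a thin wrapper that raises ValueError when
    the new find_box_at_path() returns None.
  * Behavior change in the box search: when a box matches path[0] but the
    rest of the path is not found beneath it, the search now moves on to the
    next sibling. The old code returned the recursive call directly, which
    raised ValueError at that point.
  * Behavior change in extract_points(): the old loop stopped at the first
    camm track that produced points (once mvhd had been seen). The new loop
    has no early exit, so a later camm track overwrites `points`, possibly
    with an empty list. Please confirm this is intended.
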
 mapillary_tools/geotag/camm_parser.py         | 92 +++++++------------
 .../geotag/construct_mp4_parser.py            | 30 ++++--
 2 files changed, 51 insertions(+), 71 deletions(-)

diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index 35553364e..777d59f7f 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -10,7 +10,6 @@
 import construct as C
 
 from . import (
-    construct_mp4_parser as cparser,
     geo,
     mp4_sample_parser as sample_parser,
     simple_mp4_parser as sparser,
@@ -148,15 +147,8 @@ def elst_entry_to_seconds(
     return (media_time, duration)
 
 
-def _extract_camm_samples(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[sample_parser.Sample, None, None]:
-    samples = sample_parser.parse_samples_from_trak_DEPRECATED(s, maxsize=maxsize)
-    camm_samples = (
-        sample for sample in samples if sample.description["format"] == b"camm"
-    )
-    yield from camm_samples
+def _is_camm_description(description: T.Dict) -> bool:
+    return description["format"] == b"camm"
 
 
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
@@ -166,59 +158,37 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
     """
 
     points = None
-    movie_timescale = None
-    media_timescale = None
-    elst_entries = None
-
-    for h, s in sparser.parse_path(fp, [b"moov", [b"mvhd", b"trak"]]):
-        if h.type == b"trak":
-            trak_start_offset = s.tell()
 
-            descriptions = sample_parser.parse_descriptions_from_trak(
-                s, maxsize=h.maxsize
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.parse_tracks():
+        descriptions = track.parse_sample_descriptions()
+        if any(_is_camm_description(d) for d in descriptions):
+            maybe_points = (
+                _parse_point_from_sample(fp, sample)
+                for sample in track.parse_samples()
+                if _is_camm_description(sample.description)
             )
-            camm_descriptions = [d for d in descriptions if d["format"] == b"camm"]
-            if camm_descriptions:
-                s.seek(trak_start_offset, io.SEEK_SET)
-                camm_samples = _extract_camm_samples(s, h.maxsize)
-
-                points_with_nones = (
-                    _parse_point_from_sample(fp, sample)
-                    for sample in camm_samples
-                    if sample.description["format"] == b"camm"
-                )
-
-                points = [p for p in points_with_nones if p is not None]
-                if points:
-                    s.seek(trak_start_offset)
-                    elst_data = sparser.parse_box_data_first(
-                        s, [b"edts", b"elst"], maxsize=h.maxsize
-                    )
-                    if elst_data is not None:
-                        elst_entries = cparser.EditBox.parse(elst_data)["entries"]
-
-                    s.seek(trak_start_offset)
-                    mdhd_data = sparser.parse_box_data_firstx(
-                        s, [b"mdia", b"mdhd"], maxsize=h.maxsize
-                    )
-                    mdhd = cparser.MediaHeaderBox.parse(mdhd_data)
-                    media_timescale = mdhd["timescale"]
-        else:
-            assert h.type == b"mvhd"
-            if not movie_timescale:
-                mvhd = cparser.MovieHeaderBox.parse(s.read(h.maxsize))
-                movie_timescale = mvhd["timescale"]
-
-        # exit when both found
-        if movie_timescale is not None and points:
-            break
-
-    if points and movie_timescale and media_timescale and elst_entries:
-        segments = [
-            elst_entry_to_seconds(entry, movie_timescale, media_timescale)
-            for entry in elst_entries
-        ]
-        points = list(filter_points_by_elst(points, segments))
+            points = [p for p in maybe_points if p is not None]
+            if points:
+                elst_boxdata = track.extract_elst_boxdata()
+                if elst_boxdata is not None:
+                    elst_entries = elst_boxdata["entries"]
+                    if elst_entries:
+                        # media_timescale
+                        mdhd_boxdata = track.extract_mdhd_boxdata()
+                        media_timescale = mdhd_boxdata["timescale"]
+                        # movie_timescale
+                        mvhd_boxdata = moov.mvhd()
+                        movie_timescale = mvhd_boxdata["timescale"]
+                        segments = [
+                            elst_entry_to_seconds(
+                                entry,
+                                movie_timescale=movie_timescale,
+                                media_timescale=media_timescale,
+                            )
+                            for entry in elst_entries
+                        ]
+                        points = list(filter_points_by_elst(points, segments))
 
     return points
 
diff --git a/mapillary_tools/geotag/construct_mp4_parser.py b/mapillary_tools/geotag/construct_mp4_parser.py
index 201a692f5..f11d6e0f0 100644
--- a/mapillary_tools/geotag/construct_mp4_parser.py
+++ b/mapillary_tools/geotag/construct_mp4_parser.py
@@ -592,8 +592,17 @@ def _new_cmap_without_boxes(
 def find_box_at_pathx(
     box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes]
 ) -> BoxDict:
-    if not path:
+    found = find_box_at_path(box, path)
+    if found is None:
         raise ValueError(f"box at path {path} not found")
+    return found
+
+
+def find_box_at_path(
+    box: T.Union[T.Sequence[BoxDict], BoxDict], path: T.Sequence[bytes]
+) -> T.Optional[BoxDict]:
+    if not path:
+        return None
 
     boxes: T.Sequence[BoxDict]
     if isinstance(box, dict):
@@ -605,12 +614,13 @@ def find_box_at_pathx(
         if box["type"] == path[0]:
             if len(path) == 1:
                 return box
-            else:
-                box_data = T.cast(T.Sequence[BoxDict], box["data"])
-                # ListContainer from construct is not sequence
-                assert isinstance(
-                    box_data, T.Sequence
-                ), f"expect a list of boxes but got {type(box_data)} at path {path}"
-                return find_box_at_pathx(box_data, path[1:])
-
-    raise ValueError(f"box at path {path} not found")
+            box_data = T.cast(T.Sequence[BoxDict], box["data"])
+            # ListContainer from construct is not sequence
+            assert isinstance(
+                box_data, T.Sequence
+            ), f"expect a list of boxes but got {type(box_data)} at path {path}"
+            found = find_box_at_path(box_data, path[1:])
+            if found is not None:
+                return found
+
+    return None

From 90961fd643fd0a7cc832fc557383e5325d96d516 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Thu, 10 Aug 2023 18:27:21 -0400
Subject: [PATCH 07/14] refactor gpmf parser

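---
Review notes (below the three-dash line, so not part of the commit message):

  * gpmf_parser.extract_points() and extract_all_device_names() now iterate
    the tracks of a MovieBoxParser and filter samples with the new
    _is_gpmd_description() helper; the sparser import is dropped.
  * Removed: _extract_gpmd_descriptions_from_trak(),
    _extract_gpmd_samples_from_trak() and iterate_gpmd_sample_data().
    The last one has no leading underscore; callers outside this patch are
    not visible here, so please confirm none remain.
  * simple_mp4_builder: both call sites now pass the stbl bytes directly to
    parse_raw_samples_from_stbl_data() instead of wrapping them in
    io.BytesIO for the _DEPRECATED variant. The naming-convention docstring
    drops `*_boxes` and adds `*_boxdata`.
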
 mapillary_tools/geotag/gpmf_parser.py        | 69 ++++++++------------
 mapillary_tools/geotag/simple_mp4_builder.py | 12 ++--
 2 files changed, 32 insertions(+), 49 deletions(-)

diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index 3deb7fe00..439a8ab3d 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -11,7 +11,7 @@
 import construct as C
 
 from .. import geo
-from . import mp4_sample_parser as sample_parser, simple_mp4_parser as sparser
+from . import mp4_sample_parser as sample_parser
 
 """
 Parsing GPS from GPMF data format stored in GoPros. See the GPMF spec: https://github.com/gopro/gpmf-parser
@@ -303,18 +303,25 @@ def _extract_points_from_samples(
     return values[0] if values else []
 
 
+def _is_gpmd_description(description: T.Dict) -> bool:
+    return description["format"] == b"gpmd"
+
+
 def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     """
     Return a list of points (could be empty) if it is a valid GoPro video,
     otherwise None
     """
     points = None
-    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
-        trak_start_offset = s.tell()
-        descriptions = _extract_gpmd_descriptions_from_trak(s, h.maxsize)
-        if descriptions:
-            s.seek(trak_start_offset, io.SEEK_SET)
-            gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.parse_tracks():
+        descriptions = track.parse_sample_descriptions()
+        if any(_is_gpmd_description(d) for d in descriptions):
+            gpmd_samples = (
+                sample
+                for sample in track.parse_samples()
+                if _is_gpmd_description(sample.description)
+            )
             points = list(_extract_points_from_samples(fp, gpmd_samples))
             # return the firstly found non-empty points
             if points:
@@ -323,34 +330,19 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     return points
 
 
-def _extract_gpmd_descriptions_from_trak(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-):
-    descriptions = sample_parser.parse_descriptions_from_trak(s, maxsize=maxsize)
-    return [d for d in descriptions if d["format"] == b"gpmd"]
-
-
-def _extract_gpmd_samples_from_trak(
-    s: T.BinaryIO,
-    maxsize: int = -1,
-) -> T.Generator[sample_parser.Sample, None, None]:
-    trak_start_offset = s.tell()
-    gpmd_descriptions = _extract_gpmd_descriptions_from_trak(s, maxsize=maxsize)
-    if gpmd_descriptions:
-        s.seek(trak_start_offset, io.SEEK_SET)
-        samples = sample_parser.parse_samples_from_trak_DEPRECATED(s, maxsize=maxsize)
-        for sample in samples:
-            if sample.description["format"] == b"gpmd":
-                yield sample
-
-
 def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]:
-    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
-        gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
-        device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
-        if device_names:
-            return device_names
+    moov = sample_parser.MovieBoxParser.parse_stream(fp)
+    for track in moov.parse_tracks():
+        descriptions = track.parse_sample_descriptions()
+        if any(_is_gpmd_description(d) for d in descriptions):
+            gpmd_samples = (
+                sample
+                for sample in track.parse_samples()
+                if _is_gpmd_description(sample.description)
+            )
+            device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
+            if device_names:
+                return device_names
     return {}
 
 
@@ -391,12 +383,3 @@ def parse_gpx(path: pathlib.Path) -> T.List[geo.PointWithFix]:
     if points is None:
         return []
     return points
-
-
-def iterate_gpmd_sample_data(fp: T.BinaryIO) -> T.Generator[T.Dict, None, None]:
-    for h, s in sparser.parse_path(fp, [b"moov", b"trak"]):
-        gpmd_samples = _extract_gpmd_samples_from_trak(s, h.maxsize)
-        for sample in gpmd_samples:
-            fp.seek(sample.raw_sample.offset, io.SEEK_SET)
-            data = fp.read(sample.raw_sample.size)
-            yield T.cast(T.Dict, GPMFSampleData.parse(data))
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index a3f097196..4b19605b5 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -15,8 +15,8 @@
 Variable naming conventions:
 
 - *_box: a BoxDict
-- *_boxes: a list of BoxDicts
-- *_children: a list of BoxDicts under the parent box
+- *_children: a list of child BoxDicts under the parent box
+- *_boxdata: BoxDict["data"]
 - *_data: the data in bytes of a box (without the header (type and size))
 - *_typed_data: the data in bytes of a box (with the header (type and size))
 """
@@ -255,8 +255,8 @@ def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
         )
         sample_offset += sample.size
     stbl_box = cparser.find_box_at_pathx(trak, [b"trak", b"mdia", b"minf", b"stbl"])
-    descriptions, _ = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
-        io.BytesIO(T.cast(bytes, stbl_box["data"]))
+    descriptions, _ = sample_parser.parse_raw_samples_from_stbl_data(
+        T.cast(bytes, stbl_box["data"])
     )
     stbl_children_boxes = build_stbl_from_raw_samples(
         descriptions, repositioned_samples
@@ -274,8 +274,8 @@ def iterate_samples(
             stbl_box = cparser.find_box_at_pathx(
                 box, [b"trak", b"mdia", b"minf", b"stbl"]
             )
-            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
-                io.BytesIO(T.cast(bytes, stbl_box["data"]))
+            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl_data(
+                T.cast(bytes, stbl_box["data"])
             )
             yield from raw_samples_iter
 

From 9abba003b75dcb58a2697fa0cb854309976db2a8 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Thu, 10 Aug 2023 18:48:36 -0400
Subject: [PATCH 08/14] rename parse_ to extract_

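---
Review notes (below the three-dash line, so not part of the commit message):

  * Mechanical rename; the hunks reviewed contain no logic changes:
      parse_raw_samples_from_stbl_data         -> extract_raw_samples_from_stbl_data
      TrackBoxParser.tkhd                      -> extract_tkhd_boxdata
      TrackBoxParser.parse_sample_descriptions -> extract_sample_descriptions
      TrackBoxParser.parse_raw_samples         -> extract_raw_samples
      TrackBoxParser.parse_samples             -> extract_samples
      MovieBoxParser.mvhd                      -> extract_mvhd_boxdata
      MovieBoxParser.parse_tracks              -> extract_tracks
      MovieBoxParser.parse_track_at            -> extract_track_at
  * MovieBoxParser.parse_file() and parse_stream() keep their parse_ prefix.
  * Follow-up candidate: extract_track_at() raises
    IndexError("... %d ... %d", stream_idx, len(trak_boxes)). Exception
    constructors do not apply %-formatting, so the placeholders are never
    filled in and the arguments end up as a tuple in the exception args.
  * The comment typos in sample_video.py ("sampels", "oder") sit on context
    lines and are left untouched here so that the hunk still applies.
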
 mapillary_tools/geotag/camm_parser.py        |  8 ++++----
 mapillary_tools/geotag/gpmf_parser.py        | 12 ++++++------
 mapillary_tools/geotag/mp4_sample_parser.py  | 20 ++++++++++----------
 mapillary_tools/geotag/simple_mp4_builder.py |  4 ++--
 mapillary_tools/sample_video.py              |  4 ++--
 tests/unit/test_mp4_sample_parser.py         | 14 +++++++-------
 tests/unit/test_simple_mp4_builder.py        |  4 ++--
 7 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index 777d59f7f..406985fa2 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -160,12 +160,12 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
     points = None
 
     moov = sample_parser.MovieBoxParser.parse_stream(fp)
-    for track in moov.parse_tracks():
-        descriptions = track.parse_sample_descriptions()
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
         if any(_is_camm_description(d) for d in descriptions):
             maybe_points = (
                 _parse_point_from_sample(fp, sample)
-                for sample in track.parse_samples()
+                for sample in track.extract_samples()
                 if _is_camm_description(sample.description)
             )
             points = [p for p in maybe_points if p is not None]
@@ -178,7 +178,7 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.Point]]:
                         mdhd_boxdata = track.extract_mdhd_boxdata()
                         media_timescale = mdhd_boxdata["timescale"]
                         # movie_timescale
-                        mvhd_boxdata = moov.mvhd()
+                        mvhd_boxdata = moov.extract_mvhd_boxdata()
                         movie_timescale = mvhd_boxdata["timescale"]
                         segments = [
                             elst_entry_to_seconds(
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index 439a8ab3d..11d169048 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -314,12 +314,12 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
     """
     points = None
     moov = sample_parser.MovieBoxParser.parse_stream(fp)
-    for track in moov.parse_tracks():
-        descriptions = track.parse_sample_descriptions()
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
         if any(_is_gpmd_description(d) for d in descriptions):
             gpmd_samples = (
                 sample
-                for sample in track.parse_samples()
+                for sample in track.extract_samples()
                 if _is_gpmd_description(sample.description)
             )
             points = list(_extract_points_from_samples(fp, gpmd_samples))
@@ -332,12 +332,12 @@ def extract_points(fp: T.BinaryIO) -> T.Optional[T.List[geo.PointWithFix]]:
 
 def extract_all_device_names(fp: T.BinaryIO) -> T.Dict[int, bytes]:
     moov = sample_parser.MovieBoxParser.parse_stream(fp)
-    for track in moov.parse_tracks():
-        descriptions = track.parse_sample_descriptions()
+    for track in moov.extract_tracks():
+        descriptions = track.extract_sample_descriptions()
         if any(_is_gpmd_description(d) for d in descriptions):
             gpmd_samples = (
                 sample
-                for sample in track.parse_samples()
+                for sample in track.extract_samples()
                 if _is_gpmd_description(sample.description)
             )
             device_names = _extract_dvnm_from_samples(fp, gpmd_samples)
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/geotag/mp4_sample_parser.py
index 79dcc7d86..78e85f116 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/geotag/mp4_sample_parser.py
@@ -150,7 +150,7 @@ def _extract_samples(
 ).BoxList
 
 
-def parse_raw_samples_from_stbl_data(
+def extract_raw_samples_from_stbl_data(
     stbl: bytes,
 ) -> T.Tuple[T.List[T.Dict], T.Generator[RawSample, None, None]]:
     descriptions = []
@@ -225,7 +225,7 @@ def __init__(self, trak_children: T.Sequence[cparser.BoxDict]):
         )
         self.stbl_data = T.cast(bytes, stbl["data"])
 
-    def tkhd(self) -> T.Dict:
+    def extract_tkhd_boxdata(self) -> T.Dict:
         return T.cast(
             T.Dict, cparser.find_box_at_pathx(self.trak_children, [b"tkhd"])["data"]
         )
@@ -234,7 +234,7 @@ def is_video_track(self) -> bool:
         hdlr = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"hdlr"])
         return T.cast(T.Dict[str, T.Any], hdlr["data"])["handler_type"] == b"vide"
 
-    def parse_sample_descriptions(self) -> T.List[T.Dict]:
+    def extract_sample_descriptions(self) -> T.List[T.Dict]:
         # TODO: return [] if parsing fail
         boxes = _STSDBoxListConstruct.parse(self.stbl_data)
         stsd = cparser.find_box_at_pathx(
@@ -252,12 +252,12 @@ def extract_mdhd_boxdata(self) -> T.Dict:
         box = cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])
         return T.cast(T.Dict, box["data"])
 
-    def parse_raw_samples(self) -> T.Generator[RawSample, None, None]:
-        _, raw_samples = parse_raw_samples_from_stbl_data(self.stbl_data)
+    def extract_raw_samples(self) -> T.Generator[RawSample, None, None]:
+        _, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data)
         yield from raw_samples
 
-    def parse_samples(self) -> T.Generator[Sample, None, None]:
-        descriptions, raw_samples = parse_raw_samples_from_stbl_data(self.stbl_data)
+    def extract_samples(self) -> T.Generator[Sample, None, None]:
+        descriptions, raw_samples = extract_raw_samples_from_stbl_data(self.stbl_data)
         mdhd = T.cast(
             T.Dict,
             cparser.find_box_at_pathx(self.trak_children, [b"mdia", b"mdhd"])["data"],
@@ -285,16 +285,16 @@ def parse_stream(cls, stream: T.BinaryIO) -> "MovieBoxParser":
         moov = sparser.parse_box_data_firstx(stream, [b"moov"])
         return MovieBoxParser(moov)
 
-    def mvhd(self) -> T.Dict:
+    def extract_mvhd_boxdata(self) -> T.Dict:
         mvhd = cparser.find_box_at_pathx(self.moov_children, [b"mvhd"])
         return T.cast(T.Dict, mvhd["data"])
 
-    def parse_tracks(self) -> T.Generator[TrackBoxParser, None, None]:
+    def extract_tracks(self) -> T.Generator[TrackBoxParser, None, None]:
         for box in self.moov_children:
             if box["type"] == b"trak":
                 yield TrackBoxParser(T.cast(T.Sequence[cparser.BoxDict], box["data"]))
 
-    def parse_track_at(self, stream_idx: int) -> TrackBoxParser:
+    def extract_track_at(self, stream_idx: int) -> TrackBoxParser:
         """
         stream_idx should be the stream_index specifier. See http://ffmpeg.org/ffmpeg.html#Stream-specifiers-1
         > Stream numbering is based on the order of the streams as detected by libavformat
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 4b19605b5..38c7dd308 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -255,7 +255,7 @@ def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
         )
         sample_offset += sample.size
     stbl_box = cparser.find_box_at_pathx(trak, [b"trak", b"mdia", b"minf", b"stbl"])
-    descriptions, _ = sample_parser.parse_raw_samples_from_stbl_data(
+    descriptions, _ = sample_parser.extract_raw_samples_from_stbl_data(
         T.cast(bytes, stbl_box["data"])
     )
     stbl_children_boxes = build_stbl_from_raw_samples(
@@ -274,7 +274,7 @@ def iterate_samples(
             stbl_box = cparser.find_box_at_pathx(
                 box, [b"trak", b"mdia", b"minf", b"stbl"]
             )
-            _, raw_samples_iter = sample_parser.parse_raw_samples_from_stbl_data(
+            _, raw_samples_iter = sample_parser.extract_raw_samples_from_stbl_data(
                 T.cast(bytes, stbl_box["data"])
             )
             yield from raw_samples_iter
diff --git a/mapillary_tools/sample_video.py b/mapillary_tools/sample_video.py
index 6c1d80afd..b4ba18f2e 100644
--- a/mapillary_tools/sample_video.py
+++ b/mapillary_tools/sample_video.py
@@ -234,7 +234,7 @@ def _sample_video_stream_by_distance(
     """
 
     LOG.info("Extracting video samples")
-    sorted_samples = list(video_track_parser.parse_samples())
+    sorted_samples = list(video_track_parser.extract_samples())
     # we need sort sampels by composition time (CT) not the decoding offset (DT)
     # CT is the oder of videos streaming to audiences, as well as the order ffmpeg sampling
     sorted_samples.sort(key=lambda sample: sample.exact_composition_time)
@@ -316,7 +316,7 @@ def _sample_single_video_by_distance(
     LOG.info("Extracting video samples")
     video_stream_idx = video_stream["index"]
     moov_parser = mp4_sample_parser.MovieBoxParser.parse_file(video_path)
-    video_track_parser = moov_parser.parse_track_at(video_stream_idx)
+    video_track_parser = moov_parser.extract_track_at(video_stream_idx)
     sample_points_by_frame_idx = _sample_video_stream_by_distance(
         video_metadata.points, video_track_parser, sample_distance
     )
diff --git a/tests/unit/test_mp4_sample_parser.py b/tests/unit/test_mp4_sample_parser.py
index 1b08bdc6e..360cb9678 100644
--- a/tests/unit/test_mp4_sample_parser.py
+++ b/tests/unit/test_mp4_sample_parser.py
@@ -7,13 +7,13 @@ def test_movie_box_parser():
     moov_parser = mp4_sample_parser.MovieBoxParser.parse_file(
         Path("tests/data/videos/sample-5s.mp4")
     )
-    assert 2 == len(list(moov_parser.parse_tracks()))
-    video_track = moov_parser.parse_track_at(0)
+    assert 2 == len(list(moov_parser.extract_tracks()))
+    video_track = moov_parser.extract_track_at(0)
     assert video_track.is_video_track()
-    aac_track = moov_parser.parse_track_at(1)
+    aac_track = moov_parser.extract_track_at(1)
     assert not aac_track.is_video_track()
-    samples = list(video_track.parse_samples())
-    raw_samples = list(video_track.parse_raw_samples())
+    samples = list(video_track.extract_samples())
+    raw_samples = list(video_track.extract_raw_samples())
     assert 171 == len(samples)
     assert len(samples) == len(raw_samples)
     assert {
@@ -31,7 +31,7 @@ def test_movie_box_parser():
         "height": 70778880,
     } == {
         k: v
-        for k, v in video_track.tkhd().items()
+        for k, v in video_track.extract_tkhd_boxdata().items()
         if k
         in [
             "version",
@@ -47,7 +47,7 @@ def test_movie_box_parser():
             "height",
         ]
     }
-    assert isinstance(video_track.tkhd(), dict)
+    assert isinstance(video_track.extract_tkhd_boxdata(), dict)
     for sample, raw_sample in zip(samples, raw_samples):
         assert sample.raw_sample.offset == raw_sample.offset
         assert sample.raw_sample.is_sync == raw_sample.is_sync
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 27cbbd84c..03e6b664b 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -50,7 +50,7 @@ def _build_and_parse_stbl(
         io.BytesIO(ss)
     )
     assert expected_samples == list(parsed_samples)
-    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl_data(ss)
+    _, parsed_samples = sample_parser.extract_raw_samples_from_stbl_data(ss)
     assert expected_samples == list(parsed_samples)
 
 
@@ -330,7 +330,7 @@ def _verify_samples(descs, samples):
     samples = list(sample_iter)
     _verify_samples(descs, samples)
 
-    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl_data(stbl_bytes)
+    descs, sample_iter = sample_parser.extract_raw_samples_from_stbl_data(stbl_bytes)
     samples = list(sample_iter)
     _verify_samples(descs, samples)
 

From 730fbae3d06d6e9532a789d9d9e183ad8ffc6721 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Mon, 18 Nov 2024 17:43:40 -0800
Subject: [PATCH 09/14] remove deprecated functions

---
 tests/unit/test_simple_mp4_builder.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 03e6b664b..7a1db211a 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -46,10 +46,6 @@ def _build_and_parse_stbl(
     )
     ss = sparser.parse_box_data_firstx(io.BytesIO(d), [b"stbl"])
     assert d[8:] == ss
-    _, parsed_samples = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
-        io.BytesIO(ss)
-    )
-    assert expected_samples == list(parsed_samples)
     _, parsed_samples = sample_parser.extract_raw_samples_from_stbl_data(ss)
     assert expected_samples == list(parsed_samples)
 
@@ -324,12 +320,6 @@ def _verify_samples(descs, samples):
             {"data": {"entries": [1, 3]}, "type": b"stss"},
         ]
 
-    descs, sample_iter = sample_parser.parse_raw_samples_from_stbl_DEPRECATED(
-        io.BytesIO(stbl_bytes)
-    )
-    samples = list(sample_iter)
-    _verify_samples(descs, samples)
-
     descs, sample_iter = sample_parser.extract_raw_samples_from_stbl_data(stbl_bytes)
     samples = list(sample_iter)
     _verify_samples(descs, samples)

From f400b95b0abc5b0c7c4c703acb1e82a70834362f Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Wed, 27 Nov 2024 14:46:25 -0800
Subject: [PATCH 10/14] rename composition_timedelta to composition_offset

---
 mapillary_tools/geotag/camm_builder.py       |  2 +-
 mapillary_tools/geotag/mp4_sample_parser.py  | 36 ++++++++++----------
 mapillary_tools/geotag/simple_mp4_builder.py | 10 +++---
 tests/unit/test_simple_mp4_builder.py        | 30 ++++++++--------
 4 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py
index 53740d2c2..5ff61e35b 100644
--- a/mapillary_tools/geotag/camm_builder.py
+++ b/mapillary_tools/geotag/camm_builder.py
@@ -104,7 +104,7 @@ def convert_points_to_raw_samples(
             offset=0,
             size=len(camm_sample_data),
             timedelta=timedelta,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         )
 
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/geotag/mp4_sample_parser.py
index 78e85f116..1cebd682b 100644
--- a/mapillary_tools/geotag/mp4_sample_parser.py
+++ b/mapillary_tools/geotag/mp4_sample_parser.py
@@ -23,7 +23,7 @@ class RawSample(T.NamedTuple):
     # sample composition offset that decides when to present the sample,
     # i.e. CTTS(n) in the forumula CT(n) = DT(n) + CTTS(n).
     # NOTE: timescale is not applied yet (hence int)
-    composition_timedelta: int
+    composition_offset: int
 
     # if it is a sync sample
     is_sync: bool
@@ -50,7 +50,7 @@ def _extract_raw_samples(
     chunk_entries: T.Sequence[T.Dict],
     chunk_offsets: T.Sequence[int],
     timedeltas: T.Sequence[int],
-    composition_timedeltas: T.Optional[T.Sequence[int]],
+    composition_offsets: T.Optional[T.Sequence[int]],
     syncs: T.Optional[T.Set[int]],
 ) -> T.Generator[RawSample, None, None]:
     if not sizes:
@@ -81,9 +81,9 @@ def _extract_raw_samples(
             # iterate samples in this chunk
             for _ in range(entry["samples_per_chunk"]):
                 is_sync = syncs is None or (sample_idx + 1) in syncs
-                composition_timedelta = (
-                    composition_timedeltas[sample_idx]
-                    if composition_timedeltas is not None
+                composition_offset = (
+                    composition_offsets[sample_idx]
+                    if composition_offsets is not None
                     else 0
                 )
                 yield RawSample(
@@ -91,7 +91,7 @@ def _extract_raw_samples(
                     offset=sample_offset,
                     size=sizes[sample_idx],
                     timedelta=timedeltas[sample_idx],
-                    composition_timedelta=composition_timedelta,
+                    composition_offset=composition_offset,
                     is_sync=is_sync,
                 )
                 sample_offset += sizes[sample_idx]
@@ -108,9 +108,9 @@ def _extract_raw_samples(
         # iterate samples in this chunk
         for _ in range(chunk_entries[-1]["samples_per_chunk"]):
             is_sync = syncs is None or (sample_idx + 1) in syncs
-            composition_timedelta = (
-                composition_timedeltas[sample_idx]
-                if composition_timedeltas is not None
+            composition_offset = (
+                composition_offsets[sample_idx]
+                if composition_offsets is not None
                 else 0
             )
             yield RawSample(
@@ -118,7 +118,7 @@ def _extract_raw_samples(
                 offset=sample_offset,
                 size=sizes[sample_idx],
                 timedelta=timedeltas[sample_idx],
-                composition_timedelta=composition_timedelta,
+                composition_offset=composition_offset,
                 is_sync=is_sync,
             )
             sample_offset += sizes[sample_idx]
@@ -139,7 +139,7 @@ def _extract_samples(
             exact_time=acc_delta / timescale,
             exact_timedelta=raw_sample.timedelta / timescale,
             # CT(n) = DT(n) + CTTS(n)
-            exact_composition_time=(acc_delta + raw_sample.composition_timedelta)
+            exact_composition_time=(acc_delta + raw_sample.composition_offset)
             / timescale,
         )
         acc_delta += raw_sample.timedelta
@@ -158,7 +158,7 @@ def extract_raw_samples_from_stbl_data(
     chunk_offsets = []
     chunk_entries = []
     timedeltas: T.List[int] = []
-    composition_timedeltas: T.Optional[T.List[int]] = None
+    composition_offsets: T.Optional[T.List[int]] = None
     syncs: T.Optional[T.Set[int]] = None
 
     stbl_children = T.cast(
@@ -187,10 +187,10 @@ def extract_raw_samples_from_stbl_data(
                 for _ in range(entry["sample_count"]):
                     timedeltas.append(entry["sample_delta"])
         elif box["type"] == b"ctts":
-            composition_timedeltas = []
+            composition_offsets = []
             for entry in data["entries"]:
                 for _ in range(entry["sample_count"]):
-                    composition_timedeltas.append(entry["sample_offset"])
+                    composition_offsets.append(entry["sample_offset"])
         elif box["type"] == b"stss":
             syncs = set(data["entries"])
 
@@ -198,12 +198,12 @@ def extract_raw_samples_from_stbl_data(
     # in this case append 0's to timedeltas
     while len(timedeltas) < len(sizes):
         timedeltas.append(0)
-    if composition_timedeltas is not None:
-        while len(composition_timedeltas) < len(sizes):
-            composition_timedeltas.append(0)
+    if composition_offsets is not None:
+        while len(composition_offsets) < len(sizes):
+            composition_offsets.append(0)
 
     raw_samples = _extract_raw_samples(
-        sizes, chunk_entries, chunk_offsets, timedeltas, composition_timedeltas, syncs
+        sizes, chunk_entries, chunk_offsets, timedeltas, composition_offsets, syncs
     )
     return descriptions, raw_samples
 
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 38c7dd308..632aecca0 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -144,10 +144,10 @@ class _CompressedSampleCompositionOffset:
     sample_offset: int
 
 
-def _build_ctts(sample_composition_timedeltas: T.Iterable[int]) -> BoxDict:
+def _build_ctts(sample_composition_offsets: T.Iterable[int]) -> BoxDict:
     # compress offsets
     compressed: T.List[_CompressedSampleCompositionOffset] = []
-    for offset in sample_composition_timedeltas:
+    for offset in sample_composition_offsets:
         if compressed and offset == compressed[-1].sample_offset:
             compressed[-1].sample_count += 1
         else:
@@ -197,8 +197,8 @@ def build_stbl_from_raw_samples(
         # so we can calculate the moov box size in advance
         _build_co64(raw_samples),
     ]
-    if any(s.composition_timedelta for s in raw_samples):
-        boxes.append(_build_ctts((s.composition_timedelta for s in raw_samples)))
+    if any(s.composition_offset for s in raw_samples):
+        boxes.append(_build_ctts((s.composition_offset for s in raw_samples)))
     if any(not s.is_sync for s in raw_samples):
         boxes.append(_build_stss((s.is_sync for s in raw_samples)))
     return boxes
@@ -249,7 +249,7 @@ def _update_sbtl_sample_offsets(trak: BoxDict, sample_offset: int) -> int:
                 offset=sample_offset,
                 size=sample.size,
                 timedelta=sample.timedelta,
-                composition_timedelta=sample.composition_timedelta,
+                composition_offset=sample.composition_offset,
                 is_sync=sample.is_sync,
             )
         )
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 7a1db211a..80d971d5d 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -62,7 +62,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -70,7 +70,7 @@ def test_build_stbl_happy():
             offset=2,
             size=9,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=False,
         ),
     ]
@@ -82,7 +82,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -90,7 +90,7 @@ def test_build_stbl_happy():
             offset=2,
             size=2,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=False,
         ),
         # another chunk here due to a 1-byte break
@@ -99,7 +99,7 @@ def test_build_stbl_happy():
             offset=5,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
         sample_parser.RawSample(
@@ -107,7 +107,7 @@ def test_build_stbl_happy():
             offset=6,
             size=9,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=False,
         ),
     ]
@@ -119,7 +119,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=False,
         ),
         sample_parser.RawSample(
@@ -127,7 +127,7 @@ def test_build_stbl_happy():
             offset=2,
             size=2,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
         # another chunk here
@@ -136,7 +136,7 @@ def test_build_stbl_happy():
             offset=4,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
         # another chunk here
@@ -145,7 +145,7 @@ def test_build_stbl_happy():
             offset=5,
             size=9,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
     ]
@@ -157,7 +157,7 @@ def test_build_stbl_happy():
             offset=1,
             size=1,
             timedelta=2,
-            composition_timedelta=0,
+            composition_offset=0,
             is_sync=True,
         ),
     ]
@@ -255,7 +255,7 @@ def _verify_samples(descs, samples):
                 offset=1,
                 size=1,
                 timedelta=20,
-                composition_timedelta=0,
+                composition_offset=0,
                 is_sync=True,
             ),
             sample_parser.RawSample(
@@ -263,7 +263,7 @@ def _verify_samples(descs, samples):
                 offset=2,
                 size=2,
                 timedelta=30,
-                composition_timedelta=0,
+                composition_offset=0,
                 is_sync=False,
             ),
             sample_parser.RawSample(
@@ -271,7 +271,7 @@ def _verify_samples(descs, samples):
                 offset=5,
                 size=3,
                 timedelta=30,
-                composition_timedelta=0,
+                composition_offset=0,
                 is_sync=True,
             ),
             sample_parser.RawSample(
@@ -279,7 +279,7 @@ def _verify_samples(descs, samples):
                 offset=8,
                 size=3,
                 timedelta=50,
-                composition_timedelta=0,
+                composition_offset=0,
                 is_sync=False,
             ),
         ] == samples

From a57b42a118aa2412ce5551a07859877e89d56475 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Wed, 27 Nov 2024 15:12:20 -0800
Subject: [PATCH 11/14] move mp4 to a separate module

---
 mapillary_tools/geotag/blackvue_parser.py                | 2 +-
 mapillary_tools/geotag/camm_builder.py                   | 6 ++++--
 mapillary_tools/geotag/camm_parser.py                    | 7 ++-----
 mapillary_tools/geotag/geotag_videos_from_video.py       | 2 +-
 mapillary_tools/geotag/gpmf_parser.py                    | 2 +-
 mapillary_tools/geotag/simple_mp4_builder.py             | 8 ++++----
 mapillary_tools/{geotag => mp4}/construct_mp4_parser.py  | 0
 mapillary_tools/{geotag => mp4}/mp4_sample_parser.py     | 0
 mapillary_tools/{geotag => mp4}/simple_mp4_parser.py     | 0
 mapillary_tools/sample_video.py                          | 3 ++-
 .../video_data_extraction/extractors/blackvue_parser.py  | 9 +++++----
 .../video_data_extraction/extractors/camm_parser.py      | 9 +++++----
 .../video_data_extraction/extractors/gopro_parser.py     | 9 +++++----
 tests/unit/test_blackvue_parser.py                       | 3 ++-
 tests/unit/test_camm_parser.py                           | 2 +-
 tests/unit/test_mp4_sample_parser.py                     | 2 +-
 tests/unit/test_simple_mp4_builder.py                    | 4 +++-
 tests/unit/test_simple_mp4_parser.py                     | 2 +-
 18 files changed, 38 insertions(+), 32 deletions(-)
 rename mapillary_tools/{geotag => mp4}/construct_mp4_parser.py (100%)
 rename mapillary_tools/{geotag => mp4}/mp4_sample_parser.py (100%)
 rename mapillary_tools/{geotag => mp4}/simple_mp4_parser.py (100%)

diff --git a/mapillary_tools/geotag/blackvue_parser.py b/mapillary_tools/geotag/blackvue_parser.py
index 812dc70a3..99fc92ba3 100644
--- a/mapillary_tools/geotag/blackvue_parser.py
+++ b/mapillary_tools/geotag/blackvue_parser.py
@@ -7,7 +7,7 @@
 import pynmea2
 
 from .. import geo
-from . import simple_mp4_parser as sparser
+from ..mp4 import simple_mp4_parser as sparser
 
 
 LOG = logging.getLogger(__name__)
diff --git a/mapillary_tools/geotag/camm_builder.py b/mapillary_tools/geotag/camm_builder.py
index 5ff61e35b..5a013f578 100644
--- a/mapillary_tools/geotag/camm_builder.py
+++ b/mapillary_tools/geotag/camm_builder.py
@@ -2,11 +2,13 @@
 import typing as T
 
 from .. import geo, types
+from ..mp4 import (
+    construct_mp4_parser as cparser,
+    mp4_sample_parser as sample_parser,
+)
 
 from . import (
     camm_parser,
-    construct_mp4_parser as cparser,
-    mp4_sample_parser as sample_parser,
     simple_mp4_builder as builder,
 )
 from .simple_mp4_builder import BoxDict
diff --git a/mapillary_tools/geotag/camm_parser.py b/mapillary_tools/geotag/camm_parser.py
index 406985fa2..f93b7ffd4 100644
--- a/mapillary_tools/geotag/camm_parser.py
+++ b/mapillary_tools/geotag/camm_parser.py
@@ -9,11 +9,8 @@
 
 import construct as C
 
-from . import (
-    geo,
-    mp4_sample_parser as sample_parser,
-    simple_mp4_parser as sparser,
-)
+from . import geo
+from ..mp4 import simple_mp4_parser as sparser, mp4_sample_parser as sample_parser
 
 
 LOG = logging.getLogger(__name__)
diff --git a/mapillary_tools/geotag/geotag_videos_from_video.py b/mapillary_tools/geotag/geotag_videos_from_video.py
index 42846ffc0..d1d31c0d8 100644
--- a/mapillary_tools/geotag/geotag_videos_from_video.py
+++ b/mapillary_tools/geotag/geotag_videos_from_video.py
@@ -12,9 +12,9 @@
     camm_parser,
     gpmf_gps_filter,
     gpmf_parser,
-    simple_mp4_parser as sparser,
     utils as video_utils,
 )
+from ..mp4 import simple_mp4_parser as sparser
 from .geotag_from_generic import GeotagVideosFromGeneric
 
 LOG = logging.getLogger(__name__)
diff --git a/mapillary_tools/geotag/gpmf_parser.py b/mapillary_tools/geotag/gpmf_parser.py
index 211164b19..7feaf7134 100644
--- a/mapillary_tools/geotag/gpmf_parser.py
+++ b/mapillary_tools/geotag/gpmf_parser.py
@@ -5,7 +5,7 @@
 import construct as C
 
 from .. import geo
-from . import mp4_sample_parser as sample_parser
+from ..mp4 import mp4_sample_parser as sample_parser
 
 """
 Parsing GPS from GPMF data format stored in GoPros. See the GPMF spec: https://github.com/gopro/gpmf-parser
diff --git a/mapillary_tools/geotag/simple_mp4_builder.py b/mapillary_tools/geotag/simple_mp4_builder.py
index 632aecca0..6946f3102 100644
--- a/mapillary_tools/geotag/simple_mp4_builder.py
+++ b/mapillary_tools/geotag/simple_mp4_builder.py
@@ -2,14 +2,14 @@
 import io
 import typing as T
 
-from . import (
+from . import io_utils
+from ..mp4 import (
     construct_mp4_parser as cparser,
-    io_utils,
     mp4_sample_parser as sample_parser,
     simple_mp4_parser as sparser,
 )
-from .construct_mp4_parser import BoxDict
-from .mp4_sample_parser import RawSample
+from ..mp4.construct_mp4_parser import BoxDict
+from ..mp4.mp4_sample_parser import RawSample
 
 """
 Variable naming conventions:
diff --git a/mapillary_tools/geotag/construct_mp4_parser.py b/mapillary_tools/mp4/construct_mp4_parser.py
similarity index 100%
rename from mapillary_tools/geotag/construct_mp4_parser.py
rename to mapillary_tools/mp4/construct_mp4_parser.py
diff --git a/mapillary_tools/geotag/mp4_sample_parser.py b/mapillary_tools/mp4/mp4_sample_parser.py
similarity index 100%
rename from mapillary_tools/geotag/mp4_sample_parser.py
rename to mapillary_tools/mp4/mp4_sample_parser.py
diff --git a/mapillary_tools/geotag/simple_mp4_parser.py b/mapillary_tools/mp4/simple_mp4_parser.py
similarity index 100%
rename from mapillary_tools/geotag/simple_mp4_parser.py
rename to mapillary_tools/mp4/simple_mp4_parser.py
diff --git a/mapillary_tools/sample_video.py b/mapillary_tools/sample_video.py
index bc9cd0788..65d1baa72 100644
--- a/mapillary_tools/sample_video.py
+++ b/mapillary_tools/sample_video.py
@@ -9,7 +9,8 @@
 
 from . import constants, exceptions, ffmpeg as ffmpeglib, geo, types, utils
 from .exif_write import ExifEdit
-from .geotag import geotag_videos_from_video, mp4_sample_parser
+from .geotag import geotag_videos_from_video
+from .mp4 import mp4_sample_parser
 from .process_geotag_properties import GeotagSource
 
 LOG = logging.getLogger(__name__)
diff --git a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
index 7f088677a..9aef060f4 100644
--- a/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/blackvue_parser.py
@@ -1,8 +1,9 @@
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import blackvue_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import blackvue_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class BlackVueParser(BaseParser):
@@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
                 points = blackvue_parser.extract_points(fp) or []
                 self.pointsFound = len(points) > 0
                 return points
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/mapillary_tools/video_data_extraction/extractors/camm_parser.py b/mapillary_tools/video_data_extraction/extractors/camm_parser.py
index 98e0b8d69..122a0ca5f 100644
--- a/mapillary_tools/video_data_extraction/extractors/camm_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/camm_parser.py
@@ -1,9 +1,10 @@
 import functools
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import camm_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import camm_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class CammParser(BaseParser):
@@ -23,7 +24,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
         with source_path.open("rb") as fp:
             try:
                 return camm_parser.extract_points(fp) or []
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
index 3a4c3efde..77e488ad3 100644
--- a/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
+++ b/mapillary_tools/video_data_extraction/extractors/gopro_parser.py
@@ -1,8 +1,9 @@
 import typing as T
 
-from mapillary_tools import geo
-from mapillary_tools.geotag import gpmf_parser, simple_mp4_parser
-from mapillary_tools.video_data_extraction.extractors.base_parser import BaseParser
+from ... import geo
+from ...geotag import gpmf_parser
+from ...mp4 import simple_mp4_parser as sparser
+from .base_parser import BaseParser
 
 
 class GoProParser(BaseParser):
@@ -21,7 +22,7 @@ def extract_points(self) -> T.Sequence[geo.Point]:
                 points = gpmf_parser.extract_points(fp) or []
                 self.pointsFound = len(points) > 0
                 return points
-            except simple_mp4_parser.ParsingError:
+            except sparser.ParsingError:
                 return []
 
     def extract_make(self) -> T.Optional[str]:
diff --git a/tests/unit/test_blackvue_parser.py b/tests/unit/test_blackvue_parser.py
index 9ec65a450..0832a739f 100644
--- a/tests/unit/test_blackvue_parser.py
+++ b/tests/unit/test_blackvue_parser.py
@@ -2,7 +2,8 @@
 
 import mapillary_tools.geo as geo
 
-from mapillary_tools.geotag import blackvue_parser, construct_mp4_parser as cparser
+from mapillary_tools.geotag import blackvue_parser
+from mapillary_tools.mp4 import construct_mp4_parser as cparser
 
 
 def test_parse_points():
diff --git a/tests/unit/test_camm_parser.py b/tests/unit/test_camm_parser.py
index ca22b5716..ed3237837 100644
--- a/tests/unit/test_camm_parser.py
+++ b/tests/unit/test_camm_parser.py
@@ -7,9 +7,9 @@
 from mapillary_tools.geotag import (
     camm_builder,
     camm_parser,
-    construct_mp4_parser as cparser,
     simple_mp4_builder,
 )
+from mapillary_tools.mp4 import construct_mp4_parser as cparser
 
 
 def test_filter_points_by_edit_list():
diff --git a/tests/unit/test_mp4_sample_parser.py b/tests/unit/test_mp4_sample_parser.py
index 360cb9678..6e561fcb0 100644
--- a/tests/unit/test_mp4_sample_parser.py
+++ b/tests/unit/test_mp4_sample_parser.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from mapillary_tools.geotag import mp4_sample_parser
+from mapillary_tools.mp4 import mp4_sample_parser
 
 
 def test_movie_box_parser():
diff --git a/tests/unit/test_simple_mp4_builder.py b/tests/unit/test_simple_mp4_builder.py
index 80d971d5d..8bd67e7d5 100644
--- a/tests/unit/test_simple_mp4_builder.py
+++ b/tests/unit/test_simple_mp4_builder.py
@@ -2,9 +2,11 @@
 import typing as T
 
 from mapillary_tools.geotag import (
+    simple_mp4_builder as builder,
+)
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
-    simple_mp4_builder as builder,
     simple_mp4_parser as sparser,
 )
 
diff --git a/tests/unit/test_simple_mp4_parser.py b/tests/unit/test_simple_mp4_parser.py
index eaeb7142b..5b375842e 100644
--- a/tests/unit/test_simple_mp4_parser.py
+++ b/tests/unit/test_simple_mp4_parser.py
@@ -1,7 +1,7 @@
 import io
 import typing
 
-from mapillary_tools.geotag import (
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
     simple_mp4_parser as sparser,
 )

From 88a0bd98e4fb56f7330ab058f8c43f167c96ec39 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Wed, 27 Nov 2024 16:28:45 -0800
Subject: [PATCH 12/14] add the missing __init__.py

---
 mapillary_tools/mp4/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 mapillary_tools/mp4/__init__.py

diff --git a/mapillary_tools/mp4/__init__.py b/mapillary_tools/mp4/__init__.py
new file mode 100644
index 000000000..e69de29bb

From 55de501807f1a0b43b6bc6777708d4ff1e690924 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Wed, 27 Nov 2024 16:39:56 -0800
Subject: [PATCH 13/14] fix import

---
 tests/cli/simple_mp4_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cli/simple_mp4_parser.py b/tests/cli/simple_mp4_parser.py
index 74f24056d..da8e3f294 100644
--- a/tests/cli/simple_mp4_parser.py
+++ b/tests/cli/simple_mp4_parser.py
@@ -6,7 +6,7 @@
 import typing as T
 
 from mapillary_tools import utils
-from mapillary_tools.geotag import (
+from mapillary_tools.mp4 import (
     construct_mp4_parser as cparser,
     mp4_sample_parser as sample_parser,
     simple_mp4_parser as sparser,

From 8a4594f5637beb954852053cd43dad1c93260b85 Mon Sep 17 00:00:00 2001
From: Tao Peng <ptpttt@gmail.com>
Date: Wed, 27 Nov 2024 17:13:24 -0800
Subject: [PATCH 14/14] update setup.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 74b9a348f..2d09b2950 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@ def readme():
         "mapillary_tools",
         "mapillary_tools.commands",
         "mapillary_tools.geotag",
+        "mapillary_tools.mp4",
         "mapillary_tools.video_data_extraction",
         "mapillary_tools.video_data_extraction.extractors",
     ],