From 011d61c6db0eaba1c06afcb2325d376f3a96c9a1 Mon Sep 17 00:00:00 2001 From: Raja amirapu Date: Fri, 24 Apr 2026 07:45:46 +0530 Subject: [PATCH] parse WAV fmt chunk, downmix multi-channel to mono, widen file sizes to int64_t --- include/rtpstream.hpp | 13 +- src/rtpstream.cpp | 268 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 237 insertions(+), 44 deletions(-) diff --git a/include/rtpstream.hpp b/include/rtpstream.hpp index a503bda8..d18400a8 100644 --- a/include/rtpstream.hpp +++ b/include/rtpstream.hpp @@ -19,6 +19,7 @@ #ifndef __RTPSTREAM__ #define __RTPSTREAM__ +#include #include #define RTPSTREAM_MAX_FILENAMELEN 256 @@ -69,10 +70,10 @@ struct taskentry_t char *video_file_bytes_start; char *audio_current_file_bytes; char *video_current_file_bytes; - int audio_file_num_bytes; - int video_file_num_bytes; - int audio_file_bytes_left; - int video_file_bytes_left; + int64_t audio_file_num_bytes; + int64_t video_file_num_bytes; + int64_t audio_file_bytes_left; + int64_t video_file_bytes_left; /* playback timing information */ int audio_ms_per_packet; @@ -91,8 +92,8 @@ struct taskentry_t char new_video_payload_type; int new_audio_loop_count; // FILE: -- PATTERN: -1 (UNUSED) int new_video_loop_count; // FILE: -- PATTERN: -1 (UNUSED) - int new_audio_file_size; - int new_video_file_size; + int64_t new_audio_file_size; + int64_t new_video_file_size; char *new_audio_file_bytes; char *new_video_file_bytes; int new_audio_ms_per_packet; diff --git a/src/rtpstream.cpp b/src/rtpstream.cpp index 88c93cb3..6d131091 100644 --- a/src/rtpstream.cpp +++ b/src/rtpstream.cpp @@ -106,21 +106,33 @@ struct threaddata_t struct cached_file_t { - char filename[RTPSTREAM_MAX_FILENAMELEN]; - char *bytes; - int filesize; + char filename[RTPSTREAM_MAX_FILENAMELEN]; + char *bytes; + int64_t filesize; + bool wav_processed; /* true once WAV header stripped and downmixed to mono */ }; struct cached_pattern_t { - int id; - char *bytes; - int filesize; + int id; + char *bytes; + int64_t filesize; }; cached_file_t *cached_files = nullptr; cached_pattern_t *cached_patterns = nullptr; int num_cached_files = 0; + +/* Forward declarations for WAV utility functions defined later in this file */ +static bool parse_wav_header(const char *data, int64_t size, + int64_t *data_offset, + uint16_t *num_channels, + uint16_t *bits_per_sample, + uint32_t *sample_rate); +static char *wav_downmix_to_mono(const char *src, int64_t src_size, + uint16_t num_channels, + uint16_t bits_per_sample, + int64_t *out_size); int next_rtp_port = 0; threaddata_t **ready_threads = nullptr; @@ -1756,9 +1768,71 @@ int rtpstream_cache_file(char* filename, } cached_files = newfilecachelist; } - cached_files[num_cached_files].bytes = filecontents; - strncpy(cached_files[num_cached_files].filename, filename, sizeof(cached_files[num_cached_files].filename) - 1); - cached_files[num_cached_files].filesize = statbuffer.st_size; + /* Parse WAV header once at cache time: extract format, strip header, + * and downmix multi-channel audio to mono so rtpstream_play() can + * forward raw mono PCM bytes directly to the RTP sender. + * + * IMPORTANT: cached_files[].bytes must always be the base of a + * malloc'd block because rtpstream_shutdown() passes it directly to + * free(). We therefore always produce a fresh allocation for the + * processed PCM and free the original load buffer when done. */ + int64_t data_offset = 0; + uint16_t num_channels = 1; + uint16_t bits_per_sample = 16; + uint32_t sample_rate = 8000; + bool wav_ok = parse_wav_header(filecontents, (int64_t)statbuffer.st_size, + &data_offset, &num_channels, + &bits_per_sample, &sample_rate); + + cached_files[num_cached_files].wav_processed = false; + + if (wav_ok && data_offset > 0 && data_offset < (int64_t)statbuffer.st_size) { + const char *pcm_start = filecontents + data_offset; + int64_t pcm_size = (int64_t)statbuffer.st_size - data_offset; + char *pcm_buf = nullptr; + + if (num_channels > 1) { + /* Downmix interleaved multi-channel PCM to mono */ + int64_t mono_size = 0; + pcm_buf = wav_downmix_to_mono(pcm_start, pcm_size, + num_channels, bits_per_sample, + &mono_size); + if (pcm_buf) { + free(filecontents); + filecontents = nullptr; + cached_files[num_cached_files].bytes = pcm_buf; + cached_files[num_cached_files].filesize = mono_size; + cached_files[num_cached_files].wav_processed = true; + } + /* else: downmix failed (unsupported bit-depth) — fall through + * to the raw-bytes path below so the file is still usable. */ + } + + if (!cached_files[num_cached_files].wav_processed) { + /* Mono WAV, or downmix unsupported: copy just the PCM data + * into a fresh buffer so free() works in rtpstream_shutdown. */ + pcm_buf = (char *)malloc((size_t)pcm_size); + if (pcm_buf) { + memcpy(pcm_buf, pcm_start, (size_t)pcm_size); + free(filecontents); + filecontents = nullptr; + cached_files[num_cached_files].bytes = pcm_buf; + cached_files[num_cached_files].filesize = pcm_size; + cached_files[num_cached_files].wav_processed = true; + } else { + /* Allocation failed — fall back to raw bytes including header */ + cached_files[num_cached_files].bytes = filecontents; + cached_files[num_cached_files].filesize = (int64_t)statbuffer.st_size; + } + } + } else { + /* Not a WAV or couldn't locate data chunk — serve raw bytes */ + cached_files[num_cached_files].bytes = filecontents; + cached_files[num_cached_files].filesize = (int64_t)statbuffer.st_size; + } + + strncpy(cached_files[num_cached_files].filename, filename, + sizeof(cached_files[num_cached_files].filename) - 1); return num_cached_files++; } } @@ -2229,40 +2303,158 @@ int rtpstream_set_srtp_video_remote(rtpstream_callinfo_t* callinfo, SrtpInfoPara return 0; } +static inline uint16_t uint16_val(const char *ptr) +{ + return static_cast((uint8_t)ptr[0] | ((uint8_t)ptr[1] << 8)); +} + static inline uint32_t uint_val(const char *ptr) { // Read as little-endian. Do not dereference as int, since it can be misaligned. - return static_cast((ptr[0]) | (ptr[1] << 8) | (ptr[2] << 16) | (ptr[3] << 24)); + return static_cast((uint8_t)ptr[0] | ((uint8_t)ptr[1] << 8) | + ((uint8_t)ptr[2] << 16) | ((uint8_t)ptr[3] << 24)); } -// wav format details: -// https://www.fatalerrors.org/a/detailed-explanation-of-wav-file-format.html -static int get_wav_header_size(const char *data, int size) +/* + * parse_wav_header - parse a WAV file's RIFF/WAVE chunks to extract format + * parameters and locate the start of PCM data. + * + * On success returns true and fills: + * *data_offset - byte offset of the first sample in the file + * *num_channels - 1=mono, 2=stereo, etc. + * *bits_per_sample - 8 or 16 + * *sample_rate - e.g. 8000 + * + * Returns false if the buffer is not a valid WAVE file or the fmt/data + * chunks cannot be found within the supplied data range. + * + * wav format details: + * https://www.fatalerrors.org/a/detailed-explanation-of-wav-file-format.html + */ +static bool parse_wav_header(const char *data, int64_t size, + int64_t *data_offset, + uint16_t *num_channels, + uint16_t *bits_per_sample, + uint32_t *sample_rate) { - const char *ptr = data; + const char *ptr = data; const char *limit = data + size; - if (size < 42) - return 0; - if (!(ptr[0] == 'R' && ptr[1] == 'I' && ptr[2] == 'F' && ptr[3] == 'F')) - return 0; - ptr += 8; - if (!(ptr[0] == 'W' && ptr[1] == 'A' && ptr[2] == 'V' && ptr[3] == 'E')) - return ptr - data; + + /* Minimum: RIFF(4) + file_size(4) + WAVE(4) + one chunk header(8) = 20 */ + if (size < 20) + return false; + + /* RIFF magic */ + if (!(ptr[0]=='R' && ptr[1]=='I' && ptr[2]=='F' && ptr[3]=='F')) + return false; + ptr += 8; /* skip "RIFF" + file-size field */ + + /* WAVE magic */ + if (!(ptr[0]=='W' && ptr[1]=='A' && ptr[2]=='V' && ptr[3]=='E')) + return false; ptr += 4; - for (;;) { - if (ptr + 8 > limit) - break; + + bool found_fmt = false; + bool found_data = false; + + *num_channels = 1; + *bits_per_sample = 16; + *sample_rate = 8000; + *data_offset = 0; + + while (ptr + 8 <= limit) { const uint32_t chunk_size = uint_val(ptr + 4); - const bool is_data = (ptr[0] == 'd' && ptr[1] == 'a' && ptr[2] == 't' && ptr[3] == 'a'); - ptr += 8; - if (ptr > limit) - return limit - data; - if (is_data) - return ptr - data; - ptr += chunk_size; + if (ptr[0]=='f' && ptr[1]=='m' && ptr[2]=='t' && ptr[3]==' ') { + /* fmt chunk — must be at least 16 bytes of PCM fields */ + if (chunk_size >= 16 && ptr + 8 + 16 <= limit) { + const char *fmt = ptr + 8; + /* uint16 AudioFormat: PCM == 1 */ + *num_channels = uint16_val(fmt + 2); + *sample_rate = uint_val (fmt + 4); + *bits_per_sample = uint16_val(fmt + 14); + found_fmt = true; + } + } else if (ptr[0]=='d' && ptr[1]=='a' && ptr[2]=='t' && ptr[3]=='a') { + *data_offset = (ptr + 8) - data; + found_data = true; + } + + ptr += 8 + chunk_size; + /* chunks are word-aligned (padded to even size) */ + if (chunk_size & 1) + ptr++; + + if (found_fmt && found_data) + return true; } - return ptr - data; + + return false; +} + +/* + * wav_downmix_to_mono - given a buffer of interleaved PCM samples with + * num_channels channels and bits_per_sample bits, produce a new heap-allocated + * buffer containing the mono downmix. The caller is responsible for free(). + * + * Supports: + * 8-bit unsigned PCM (uint8, centre = 128) + * 16-bit signed PCM (int16, little-endian) + * + * Returns NULL on unsupported format or allocation failure. + * *out_size receives the byte count of the returned buffer. + */ +static char *wav_downmix_to_mono(const char *src, int64_t src_size, + uint16_t num_channels, + uint16_t bits_per_sample, + int64_t *out_size) +{ + if (num_channels == 1) { + /* Nothing to mix — return a plain copy */ + char *buf = (char *)malloc((size_t)src_size); + if (!buf) return nullptr; + memcpy(buf, src, (size_t)src_size); + *out_size = src_size; + return buf; + } + + if (bits_per_sample == 8) { + /* 8-bit unsigned: each sample is a uint8_t, silence = 128 */ + int64_t num_frames = src_size / num_channels; + char *buf = (char *)malloc((size_t)num_frames); + if (!buf) return nullptr; + const uint8_t *s = reinterpret_cast(src); + uint8_t *d = reinterpret_cast(buf); + for (int64_t i = 0; i < num_frames; i++) { + int32_t sum = 0; + for (int ch = 0; ch < num_channels; ch++) + sum += s[i * num_channels + ch]; + d[i] = (uint8_t)(sum / num_channels); + } + *out_size = num_frames; + return buf; + } + + if (bits_per_sample == 16) { + /* 16-bit signed little-endian */ + int64_t num_frames = src_size / (num_channels * 2); + int64_t out_bytes = num_frames * 2; + char *buf = (char *)malloc((size_t)out_bytes); + if (!buf) return nullptr; + const int16_t *s = reinterpret_cast(src); + int16_t *d = reinterpret_cast(buf); + for (int64_t i = 0; i < num_frames; i++) { + int32_t sum = 0; + for (int ch = 0; ch < num_channels; ch++) + sum += s[i * num_channels + ch]; + d[i] = (int16_t)(sum / num_channels); + } + *out_size = out_bytes; + return buf; + } + + /* Unsupported bit depth — caller will use raw data */ + return nullptr; } /* code checked */ @@ -2307,12 +2499,12 @@ void rtpstream_play(rtpstream_callinfo_t* callinfo, rtpstream_actinfo_t* actioni taskinfo->new_audio_payload_type = actioninfo->payload_type; taskinfo->audio_active = actioninfo->audio_active; taskinfo->video_active = actioninfo->video_active; - /* Allow the caller to supply WAV files instead of raw audio, by skipping past headers. */ - /* Doesn't actually parse/convert anything! */ - const int header_size = get_wav_header_size(taskinfo->new_audio_file_bytes, taskinfo->new_audio_file_size); - if (header_size > 0 && taskinfo->new_audio_file_size >= header_size) { - taskinfo->new_audio_file_bytes += header_size; - taskinfo->new_audio_file_size -= header_size; + /* WAV header stripping and channel downmix are performed once at cache + * time (rtpstream_cache_file). cached_file_t::bytes already points to + * the first PCM sample and filesize reflects the processed payload, so + * there is nothing left to do here for processed files. */ + if (!cached_files[file_index].wav_processed) { + /* Legacy path for raw (non-WAV) files: no header to strip. */ } /* set flag that we have a new file to play */