diff --git a/README.md b/README.md
index 7da5354..d2e56bf 100755
--- a/README.md
+++ b/README.md
@@ -47,6 +47,27 @@ $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below)

+## Device Support
+
+This implementation supports both **CUDA GPU** and **CPU** execution:
+
+- **CUDA GPU**: Automatically detected and used if available for faster processing
+- **CPU**: Used as fallback when CUDA is not available, or can be forced for compatibility
+
+### Device Selection
+The code automatically detects and uses the best available device:
+- If CUDA is available → Uses GPU for acceleration
+- If CUDA is not available → Falls back to CPU
+
+### CPU-Only Execution
+To force CPU-only execution (e.g., for compatibility or debugging), you can set:
+```python
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+```
+
+Or modify the device selection in the scripts directly.
+
 ## Publications
 
 ```
diff --git a/SyncNetInstance.py b/SyncNetInstance.py
index 497d44f..991f91b 100644
--- a/SyncNetInstance.py
+++ b/SyncNetInstance.py
@@ -34,10 +34,11 @@ def calc_pdist(feat1, feat2, vshift=10):
 
 class SyncNetInstance(torch.nn.Module):
 
-    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
+    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024, device='cpu'):
         super(SyncNetInstance, self).__init__();
 
-        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
+        self.device = device
+        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).to(self.device);
 
     def evaluate(self, opt, videofile):
 
@@ -59,23 +60,40 @@ def evaluate(self, opt, videofile):
         output = subprocess.call(command, shell=True, stdout=None)
 
         # ========== ==========
-        # Load video
+        # Load video (original method but memory optimized)
         # ========== ==========
 
-        images = []
-
+        # Get list of image files
         flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
         flist.sort()
-
-        for fname in flist:
-            images.append(cv2.imread(fname))
-
-        im = numpy.stack(images,axis=3)
-        im = numpy.expand_dims(im,axis=0)
-        im = numpy.transpose(im,(0,3,4,1,2))
+
+        print(f'[INFO] Found {len(flist)} frames')
+
+        # Load images with memory optimization
+        images = []
+        target_size = 224  # Standard size that should work with all Conv3d layers
+
+        for i, fname in enumerate(flist):
+            if i % 500 == 0:
+                print(f'[INFO] Loading frame {i+1}/{len(flist)}')
+
+            img = cv2.imread(fname)
+            if img is not None:
+                # Resize to standard size for compatibility
+                img = cv2.resize(img, (target_size, target_size))
+                images.append(img)
+
+        print(f'[INFO] Loaded {len(images)} images, resized to {target_size}x{target_size}px')
+
+        # Convert to the original tensor format
+        im = numpy.stack(images, axis=3)           # (H, W, C, T)
+        im = numpy.expand_dims(im, axis=0)         # (1, H, W, C, T)
+        im = numpy.transpose(im, (0, 3, 4, 1, 2))  # (1, C, T, H, W)
 
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
-
+
+        print(f'[INFO] Video tensor shape: {imtv.shape}')
+
         # ========== ==========
         # Load audio
         # ========== ==========
@@ -97,7 +115,7 @@ def evaluate(self, opt, videofile):
         min_length = min(len(images),math.floor(len(audio)/640))
 
         # ========== ==========
-        # Generate video and audio feats
+        # Generate video and audio feats (smaller batches)
         # ========== ==========
 
         lastframe = min_length-5
@@ -105,20 +123,42 @@
         cc_feat = []
 
         tS = time.time()
-        for i in range(0,lastframe,opt.batch_size):
+        print(f'[INFO] Processing {lastframe} frames with smaller batches for memory efficiency')
+
+        # Use smaller batch size for memory efficiency
+        small_batch_size = 1
+
+        for i in range(0, lastframe, small_batch_size):
+            if i % 200 == 0:  # Progress every 200 frames
+                print(f'[INFO] Processing frame {i+1}/{lastframe}')
 
-            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            im_in = torch.cat(im_batch,0)
-            im_out = self.__S__.forward_lip(im_in.cuda());
+            # Process video frames
+            im_batch = [imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            im_in = torch.cat(im_batch, 0)
+            im_out = self.__S__.forward_lip(im_in.to(self.device))
             im_feat.append(im_out.data.cpu())
+            del im_batch, im_in, im_out
 
-            cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            cc_in = torch.cat(cc_batch,0)
-            cc_out = self.__S__.forward_aud(cc_in.cuda())
+            # Process audio frames
+            cc_batch = [cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            cc_in = torch.cat(cc_batch, 0)
+            cc_out = self.__S__.forward_aud(cc_in.to(self.device))
             cc_feat.append(cc_out.data.cpu())
-
-        im_feat = torch.cat(im_feat,0)
-        cc_feat = torch.cat(cc_feat,0)
+            del cc_batch, cc_in, cc_out
+
+            # Garbage collection every 100 frames
+            if i % 100 == 0:
+                import gc
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        print('[INFO] Concatenating features...')
+        im_feat = torch.cat(im_feat, 0)
+        cc_feat = torch.cat(cc_feat, 0)
+
+        # Clear audio tensor to free memory
+        del cct
 
         # ========== ==========
         # Compute offset
@@ -152,46 +192,85 @@ def extract_feature(self, opt, videofile):
         self.__S__.eval();
 
         # ========== ==========
-        # Load video
+        # Load video with memory optimization (same as evaluate method)
        # ========== ==========
 
        cap = cv2.VideoCapture(videofile)
 
-        frame_num = 1;
+        print(f'[INFO] Loading video frames for feature extraction...')
+
+        frame_num = 0
         images = []
-        while frame_num:
-            frame_num += 1
+        target_size = 224  # Standard size that should work with all Conv3d layers
+
+        while True:
             ret, image = cap.read()
             if ret == 0:
                 break
-
+
+            if frame_num % 500 == 0:
+                print(f'[INFO] Loading frame {frame_num+1}')
+
+            # Resize to standard size for compatibility and memory efficiency
+            image = cv2.resize(image, (target_size, target_size))
             images.append(image)
+            frame_num += 1
+
+        cap.release()
+        print(f'[INFO] Loaded {len(images)} images, resized to {target_size}x{target_size}px')
 
-        im = numpy.stack(images,axis=3)
-        im = numpy.expand_dims(im,axis=0)
-        im = numpy.transpose(im,(0,3,4,1,2))
+        # Convert to the original tensor format
+        im = numpy.stack(images, axis=3)           # (H, W, C, T)
+        im = numpy.expand_dims(im, axis=0)         # (1, H, W, C, T)
+        im = numpy.transpose(im, (0, 3, 4, 1, 2))  # (1, C, T, H, W)
 
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
 
+        print(f'[INFO] Video tensor shape: {imtv.shape}')
+
         # ========== ==========
-        # Generate video feats
+        # Generate video feats with memory optimization
         # ========== ==========
 
         lastframe = len(images)-4
         im_feat = []
 
+        print(f'[INFO] Processing {lastframe} frames with memory optimization')
+
+        # Use smaller batch size for memory efficiency
+        small_batch_size = 1
+
         tS = time.time()
-        for i in range(0,lastframe,opt.batch_size):
+        for i in range(0, lastframe, small_batch_size):
+            if i % 200 == 0:  # Progress every 200 frames
+                print(f'[INFO] Processing frame {i+1}/{lastframe}')
 
-            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            im_in = torch.cat(im_batch,0)
-            im_out = self.__S__.forward_lipfeat(im_in.cuda());
+            # Process video frames with memory management
+            im_batch = [imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            im_in = torch.cat(im_batch, 0)
+            im_out = self.__S__.forward_lipfeat(im_in.to(self.device))
             im_feat.append(im_out.data.cpu())
-
-        im_feat = torch.cat(im_feat,0)
-
-        # ========== ==========
-        # Compute offset
-        # ========== ==========
+
+            # Clean up intermediate variables
+            del im_batch, im_in, im_out
+
+            # Garbage collection every 100 frames
+            if i % 100 == 0:
+                import gc
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        print('[INFO] Concatenating features...')
+        im_feat = torch.cat(im_feat, 0)
+
+        # Clean up video tensor
+        del imtv
+
+        # Final garbage collection
+        import gc
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
         print('Compute time %.3f sec.' % (time.time()-tS))
diff --git a/SyncNetModel.py b/SyncNetModel.py
index c21ce25..d611a2c 100755
--- a/SyncNetModel.py
+++ b/SyncNetModel.py
@@ -96,7 +96,7 @@ def __init__(self, num_layers_in_fc_layers = 1024):
     def forward_aud(self, x):
 
         mid = self.netcnnaud(x); # N x ch x 24 x M
-        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        mid = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
         out = self.netfcaud(mid);
 
         return out;
@@ -104,7 +104,7 @@ def forward_aud(self, x):
     def forward_lip(self, x):
 
         mid = self.netcnnlip(x);
-        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        mid = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
         out = self.netfclip(mid);
 
         return out;
@@ -112,6 +112,6 @@ def forward_lip(self, x):
     def forward_lipfeat(self, x):
 
         mid = self.netcnnlip(x);
-        out = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        out = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
 
         return out;
\ No newline at end of file
diff --git a/demo_feature.py b/demo_feature.py
index e3bd290..facd65d 100755
--- a/demo_feature.py
+++ b/demo_feature.py
@@ -2,31 +2,125 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess
+import torch
+import numpy as np
+import os
 
 from SyncNetInstance import *
 
-# ==================== LOAD PARAMS ====================
+def analyze_features(features, save_path):
+    """
+    Analyze extracted features and provide detailed statistics
+    """
+    print("\n" + "="*60)
+    print("FEATURE ANALYSIS REPORT")
+    print("="*60)
+
+    # Basic info
+    print("\n=== BASIC INFO ===")
+    print(f"Features shape: {features.shape}")
+    print(f"Features dtype: {features.dtype}")
+    print(f"Features device: {features.device}")
+    print(f"Number of dimensions: {features.dim()}")
+    print(f"Total elements: {features.numel()}")
+    print(f"Memory usage: {features.numel() * features.element_size() / (1024*1024):.2f} MB")
+
+    # Statistical info
+    print("\n=== STATISTICAL INFO ===")
+    feats_np = features.numpy()
+    print(f"Min value: {feats_np.min():.6f}")
+    print(f"Max value: {feats_np.max():.6f}")
+    print(f"Mean value: {feats_np.mean():.6f}")
+    print(f"Std deviation: {feats_np.std():.6f}")
+    print(f"Number of zeros: {np.sum(feats_np == 0)}")
+    print(f"Number of NaN values: {np.sum(np.isnan(feats_np))}")
+    print(f"Number of infinite values: {np.sum(np.isinf(feats_np))}")
+
+    # Feature analysis
+    print("\n=== FEATURE ANALYSIS ===")
+    num_frames, features_per_frame = features.shape
+    print(f"Features per frame: {features_per_frame}")
+    print(f"Total frames processed: {num_frames}")
+    print(f"Video duration estimate: {num_frames/25:.2f} seconds (assuming 25fps)")
+
+    # Feature vector analysis
+    print("\n=== FEATURE VECTOR ANALYSIS ===")
+    first_frame = feats_np[0]
+    non_zero_count = np.sum(first_frame != 0)
+    l2_norm = np.linalg.norm(first_frame, ord=2)
+    l1_norm = np.linalg.norm(first_frame, ord=1)
+    print(f"First frame feature vector stats:")
+    print(f"  - Non-zero elements: {non_zero_count}/{features_per_frame}")
+    print(f"  - L2 norm: {l2_norm:.6f}")
+    print(f"  - L1 norm: {l1_norm:.6f}")
+
+    # Temporal analysis
+    print("\n=== TEMPORAL ANALYSIS ===")
+    frame_norms = np.linalg.norm(feats_np, axis=1, ord=2)
+    print(f"Frame-wise L2 norms:")
+    print(f"  - Min norm: {frame_norms.min():.6f}")
+    print(f"  - Max norm: {frame_norms.max():.6f}")
+    print(f"  - Mean norm: {frame_norms.mean():.6f}")
+    print(f"  - Std norm: {frame_norms.std():.6f}")
+
+    # Feature diversity
+    print("\n=== FEATURE DIVERSITY ===")
+    feature_stds = np.std(feats_np, axis=0)
+    low_variance_count = np.sum(feature_stds < 0.01)
+    high_variance_count = np.sum(feature_stds > 1.0)
+    most_active_dim = np.argmax(feature_stds)
+    least_active_dim = np.argmin(feature_stds)
+
+    print(f"Feature dimension statistics:")
+    print(f"  - Dimensions with low variance (<0.01): {low_variance_count}")
+    print(f"  - Dimensions with high variance (>1.0): {high_variance_count}")
+    print(f"  - Most active feature dimension: {most_active_dim} (std: {feature_stds[most_active_dim]:.6f})")
+    print(f"  - Least active feature dimension: {least_active_dim} (std: {feature_stds[least_active_dim]:.6f})")
+
+    # File info
+    if os.path.exists(save_path):
+        file_size = os.path.getsize(save_path) / (1024*1024)
+        print(f"\n=== FILE INFO ===")
+        print(f"Saved to: {save_path}")
+        print(f"File size: {file_size:.2f} MB")
+
+    print("="*60)
 
+# ==================== LOAD PARAMS ====================
 
-parser = argparse.ArgumentParser(description = "SyncNet");
+parser = argparse.ArgumentParser(description = "SyncNet Feature Extractor");
 
-parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
-parser.add_argument('--batch_size', type=int, default='20', help='');
-parser.add_argument('--vshift', type=int, default='15', help='');
-parser.add_argument('--videofile', type=str, default="data/example.avi", help='');
-parser.add_argument('--tmp_dir', type=str, default="data", help='');
-parser.add_argument('--save_as', type=str, default="data/features.pt", help='');
+parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='Path to pre-trained SyncNet model');
+parser.add_argument('--batch_size', type=int, default='1', help='Fixed to 1 for memory efficiency');
+parser.add_argument('--vshift', type=int, default='15', help='Time shift for sync analysis');
+parser.add_argument('--videofile', type=str, default="data/example.avi", help='Input video file');
+parser.add_argument('--tmp_dir', type=str, default="data", help='Temporary directory');
+parser.add_argument('--save_as', type=str, default="data/features.pt", help='Output feature file path');
+parser.add_argument('--analyze', action='store_true', help='Enable detailed feature analysis');
 
 opt = parser.parse_args();
 
-
 # ==================== RUN EVALUATION ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
 
+print(f'[INFO] Extracting features from: {opt.videofile}')
 feats = s.extract_feature(opt, videofile=opt.videofile)
 
+print(f'[INFO] Saving features to: {opt.save_as}')
 torch.save(feats, opt.save_as)
+
+# Perform detailed analysis
+if opt.analyze:
+    analyze_features(feats, opt.save_as)
+else:
+    print(f'[INFO] Features extracted and saved successfully!')
+    print(f'[INFO] Feature shape: {feats.shape}')
+    print(f'[INFO] Use --analyze flag for detailed feature analysis')
diff --git a/demo_syncnet.py b/demo_syncnet.py
index 01c25a6..bfc3e2b 100755
--- a/demo_syncnet.py
+++ b/demo_syncnet.py
@@ -2,6 +2,7 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess
+import torch
 
 from SyncNetInstance import *
 
@@ -11,7 +12,7 @@ parser = argparse.ArgumentParser(description = "SyncNet");
 
 parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
-parser.add_argument('--batch_size', type=int, default='20', help='');
+parser.add_argument('--batch_size', type=int, default='1', help='Fixed to 1 for SyncNet compatibility');
 parser.add_argument('--vshift', type=int, default='15', help='');
 parser.add_argument('--videofile', type=str, default="data/example.avi", help='');
 parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
@@ -22,7 +23,11 @@
 
 # ==================== RUN EVALUATION ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
diff --git a/detectors/s3fd/__init__.py b/detectors/s3fd/__init__.py
index d7f35e0..c25e7ea 100644
--- a/detectors/s3fd/__init__.py
+++ b/detectors/s3fd/__init__.py
@@ -12,7 +12,7 @@
 
 class S3FD():
 
-    def __init__(self, device='cuda'):
+    def __init__(self, device='cpu'):
 
         tstamp = time.time()
         self.device = device
diff --git a/detectors/s3fd/box_utils.py b/detectors/s3fd/box_utils.py
index 0779bcd..1bf4be2 100644
--- a/detectors/s3fd/box_utils.py
+++ b/detectors/s3fd/box_utils.py
@@ -35,7 +35,7 @@ def nms_(dets, thresh):
         inds = np.where(ovr <= thresh)[0]
         order = order[inds + 1]
 
-    return np.array(keep).astype(np.int)
+    return np.array(keep).astype(int)
 
 
 def decode(loc, priors, variances):
diff --git a/device_utils.py b/device_utils.py
new file mode 100644
index 0000000..303c6c5
--- /dev/null
+++ b/device_utils.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
+import torch
+
+def get_device(prefer_cuda=True):
+    """
+    Get the best available device (CUDA if available and preferred, otherwise CPU)
+
+    Args:
+        prefer_cuda (bool): Whether to prefer CUDA over CPU if available
+
+    Returns:
+        str: Device string ('cuda' or 'cpu')
+    """
+    if prefer_cuda and torch.cuda.is_available():
+        device = 'cuda'
+        print(f'[INFO] Using CUDA GPU: {torch.cuda.get_device_name(0)}')
+    else:
+        device = 'cpu'
+        if prefer_cuda and not torch.cuda.is_available():
+            print('[INFO] CUDA not available, falling back to CPU')
+        else:
+            print('[INFO] Using CPU')
+
+    return device
+
+def print_device_info():
+    """Print information about available devices"""
+    print("=" * 50)
+    print("Device Information:")
+    print(f"CUDA Available: {torch.cuda.is_available()}")
+
+    if torch.cuda.is_available():
+        print(f"CUDA Version: {torch.version.cuda}")
+        print(f"GPU Count: {torch.cuda.device_count()}")
+        for i in range(torch.cuda.device_count()):
+            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+
+    print(f"PyTorch Version: {torch.__version__}")
+    print("=" * 50)
diff --git a/news_audio_delay_360p_500ms.mp4 b/news_audio_delay_360p_500ms.mp4
new file mode 100644
index 0000000..dced929
Binary files /dev/null and b/news_audio_delay_360p_500ms.mp4 differ
diff --git a/run_pipeline.py b/run_pipeline.py
index f5fc22e..67bef53 100755
--- a/run_pipeline.py
+++ b/run_pipeline.py
@@ -2,6 +2,7 @@
 
 import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2
 import numpy as np
+import torch
 from shutil import rmtree
 
 import scenedetect
@@ -184,7 +185,11 @@ def crop_video(opt,track,cropfile):
 
 def inference_video(opt):
 
-  DET = S3FD(device='cuda')
+  # Check if CUDA is available, otherwise use CPU
+  device = 'cuda' if torch.cuda.is_available() else 'cpu'
+  print(f'[INFO] Using device: {device}')
+
+  DET = S3FD(device=device)
 
   flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
   flist.sort()
diff --git a/run_syncnet.py b/run_syncnet.py
index 45099fd..493d262 100755
--- a/run_syncnet.py
+++ b/run_syncnet.py
@@ -2,6 +2,7 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess, pickle, os, gzip, glob
+import torch
 
 from SyncNetInstance import *
 
@@ -24,7 +25,11 @@
 
 # ==================== LOAD MODEL AND FILE LIST ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
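
For reference, a minimal sketch of how the device handling introduced above could be exercised once the patch is applied; the explicit imports and the model path are assumptions based on the files in this diff, not part of the patch itself:

```python
# Sketch only: assumes this patch is applied and that the pre-trained
# data/syncnet_v2.model from the original repository is present.
from device_utils import get_device, print_device_info
from SyncNetInstance import SyncNetInstance

print_device_info()                    # report CUDA/CPU availability and versions
device = get_device(prefer_cuda=True)  # 'cuda' if available, otherwise 'cpu'

s = SyncNetInstance(device=device)     # weights are placed on the chosen device
s.loadParameters("data/syncnet_v2.model")
```

As noted in the README section above, setting `CUDA_VISIBLE_DEVICES=''` before launching any of the scripts makes `torch.cuda.is_available()` return `False`, so the same code paths fall back to CPU without further changes.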