diff --git a/README.md b/README.md
index 7da5354..d2e56bf 100755
--- a/README.md
+++ b/README.md
@@ -47,6 +47,27 @@ $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below)

+## Device Support
+
+This implementation supports both **CUDA GPU** and **CPU** execution:
+
+- **CUDA GPU**: Automatically detected and used if available for faster processing
+- **CPU**: Used as fallback when CUDA is not available, or can be forced for compatibility
+
+### Device Selection
+The code automatically detects and uses the best available device:
+- If CUDA is available → Uses GPU for acceleration
+- If CUDA is not available → Falls back to CPU
+
+### CPU-Only Execution
+To force CPU-only execution (e.g., for compatibility or debugging), you can set:
+```python
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+```
+
+Or modify the device selection in the scripts directly.
+
 ## Publications
 
 ```
diff --git a/SyncNetInstance.py b/SyncNetInstance.py
index 497d44f..991f91b 100644
--- a/SyncNetInstance.py
+++ b/SyncNetInstance.py
@@ -34,10 +34,11 @@ def calc_pdist(feat1, feat2, vshift=10):
 
 class SyncNetInstance(torch.nn.Module):
 
-    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
+    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024, device='cpu'):
         super(SyncNetInstance, self).__init__();
 
-        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
+        self.device = device
+        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).to(self.device);
 
     def evaluate(self, opt, videofile):
 
@@ -59,23 +60,40 @@ def evaluate(self, opt, videofile):
         output = subprocess.call(command, shell=True, stdout=None)
 
         # ========== ==========
-        # Load video
+        # Load video (original method but memory optimized)
         # ========== ==========
 
-        images = []
-
+        # Get list of image files
         flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
         flist.sort()
-
-        for fname in flist:
-            images.append(cv2.imread(fname))
-
-        im = numpy.stack(images,axis=3)
-        im = numpy.expand_dims(im,axis=0)
-        im = numpy.transpose(im,(0,3,4,1,2))
+
+        print(f'[INFO] Found {len(flist)} frames')
+
+        # Load images with memory optimization
+        images = []
+        target_size = 224  # Standard size that should work with all Conv3d layers
+
+        for i, fname in enumerate(flist):
+            if i % 500 == 0:
+                print(f'[INFO] Loading frame {i+1}/{len(flist)}')
+
+            img = cv2.imread(fname)
+            if img is not None:
+                # Resize to standard size for compatibility
+                img = cv2.resize(img, (target_size, target_size))
+                images.append(img)
+
+        print(f'[INFO] Loaded {len(images)} images, resized to {target_size}x{target_size}px')
+
+        # Convert to the original tensor format
+        im = numpy.stack(images, axis=3)           # (H, W, C, T)
+        im = numpy.expand_dims(im, axis=0)         # (1, H, W, C, T)
+        im = numpy.transpose(im, (0, 3, 4, 1, 2))  # (1, C, T, H, W)
 
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
-
+
+        print(f'[INFO] Video tensor shape: {imtv.shape}')
+
         # ========== ==========
         # Load audio
         # ========== ==========
@@ -97,7 +115,7 @@ def evaluate(self, opt, videofile):
         min_length = min(len(images),math.floor(len(audio)/640))
 
         # ========== ==========
-        # Generate video and audio feats
+        # Generate video and audio feats (smaller batches)
         # ========== ==========
 
         lastframe = min_length-5
@@ -105,20 +123,42 @@
         cc_feat = []
 
         tS = time.time()
-        for i in range(0,lastframe,opt.batch_size):
+        print(f'[INFO] Processing {lastframe} frames with smaller batches for memory efficiency')
+
+        # Use smaller batch size for memory efficiency
+        small_batch_size = 1
+
+        for i in range(0, lastframe, small_batch_size):
+            if i % 200 == 0:  # Progress every 200 frames
+                print(f'[INFO] Processing frame {i+1}/{lastframe}')
 
-            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            im_in = torch.cat(im_batch,0)
-            im_out = self.__S__.forward_lip(im_in.cuda());
+            # Process video frames
+            im_batch = [imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            im_in = torch.cat(im_batch, 0)
+            im_out = self.__S__.forward_lip(im_in.to(self.device))
             im_feat.append(im_out.data.cpu())
+            del im_batch, im_in, im_out
 
-            cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            cc_in = torch.cat(cc_batch,0)
-            cc_out = self.__S__.forward_aud(cc_in.cuda())
+            # Process audio frames
+            cc_batch = [cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            cc_in = torch.cat(cc_batch, 0)
+            cc_out = self.__S__.forward_aud(cc_in.to(self.device))
             cc_feat.append(cc_out.data.cpu())
-
-        im_feat = torch.cat(im_feat,0)
-        cc_feat = torch.cat(cc_feat,0)
+            del cc_batch, cc_in, cc_out
+
+            # Garbage collection every 100 frames
+            if i % 100 == 0:
+                import gc
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        print('[INFO] Concatenating features...')
+        im_feat = torch.cat(im_feat, 0)
+        cc_feat = torch.cat(cc_feat, 0)
+
+        # Clear audio tensor to free memory
+        del cct
 
         # ========== ==========
         # Compute offset
@@ -152,46 +192,85 @@ def extract_feature(self, opt, videofile):
         self.__S__.eval();
 
         # ========== ==========
-        # Load video
+        # Load video with memory optimization (same as evaluate method)
        # ========== ==========
 
        cap = cv2.VideoCapture(videofile)
 
-        frame_num = 1;
+        print(f'[INFO] Loading video frames for feature extraction...')
+
+        frame_num = 0
         images = []
-        while frame_num:
-            frame_num += 1
+        target_size = 224  # Standard size that should work with all Conv3d layers
+
+        while True:
             ret, image = cap.read()
             if ret == 0:
                 break
-
+
+            if frame_num % 500 == 0:
+                print(f'[INFO] Loading frame {frame_num+1}')
+
+            # Resize to standard size for compatibility and memory efficiency
+            image = cv2.resize(image, (target_size, target_size))
             images.append(image)
+            frame_num += 1
+
+        cap.release()
+        print(f'[INFO] Loaded {len(images)} images, resized to {target_size}x{target_size}px')
 
-        im = numpy.stack(images,axis=3)
-        im = numpy.expand_dims(im,axis=0)
-        im = numpy.transpose(im,(0,3,4,1,2))
+        # Convert to the original tensor format
+        im = numpy.stack(images, axis=3)           # (H, W, C, T)
+        im = numpy.expand_dims(im, axis=0)         # (1, H, W, C, T)
+        im = numpy.transpose(im, (0, 3, 4, 1, 2))  # (1, C, T, H, W)
 
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
 
+        print(f'[INFO] Video tensor shape: {imtv.shape}')
+
         # ========== ==========
-        # Generate video feats
+        # Generate video feats with memory optimization
         # ========== ==========
 
         lastframe = len(images)-4
         im_feat = []
 
+        print(f'[INFO] Processing {lastframe} frames with memory optimization')
+
+        # Use smaller batch size for memory efficiency
+        small_batch_size = 1
+
         tS = time.time()
-        for i in range(0,lastframe,opt.batch_size):
+        for i in range(0, lastframe, small_batch_size):
+            if i % 200 == 0:  # Progress every 200 frames
+                print(f'[INFO] Processing frame {i+1}/{lastframe}')
 
-            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
-            im_in = torch.cat(im_batch,0)
-            im_out = self.__S__.forward_lipfeat(im_in.cuda());
+            # Process video frames with memory management
+            im_batch = [imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i, min(lastframe, i+small_batch_size))]
+            im_in = torch.cat(im_batch, 0)
+            im_out = self.__S__.forward_lipfeat(im_in.to(self.device))
             im_feat.append(im_out.data.cpu())
-
-        im_feat = torch.cat(im_feat,0)
-
-        # ========== ==========
-        # Compute offset
-        # ========== ==========
+
+            # Clean up intermediate variables
+            del im_batch, im_in, im_out
+
+            # Garbage collection every 100 frames
+            if i % 100 == 0:
+                import gc
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+        print('[INFO] Concatenating features...')
+        im_feat = torch.cat(im_feat, 0)
+
+        # Clean up video tensor
+        del imtv
+
+        # Final garbage collection
+        import gc
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
         print('Compute time %.3f sec.' % (time.time()-tS))
diff --git a/SyncNetModel.py b/SyncNetModel.py
index c21ce25..d611a2c 100755
--- a/SyncNetModel.py
+++ b/SyncNetModel.py
@@ -96,7 +96,7 @@ def __init__(self, num_layers_in_fc_layers = 1024):
     def forward_aud(self, x):
 
         mid = self.netcnnaud(x); # N x ch x 24 x M
-        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        mid = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
         out = self.netfcaud(mid);
 
         return out;
@@ -104,7 +104,7 @@ def forward_aud(self, x):
     def forward_lip(self, x):
 
         mid = self.netcnnlip(x);
-        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        mid = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
         out = self.netfclip(mid);
 
         return out;
@@ -112,6 +112,6 @@ def forward_lip(self, x):
     def forward_lipfeat(self, x):
 
         mid = self.netcnnlip(x);
-        out = mid.view((mid.size()[0], -1)); # N x (ch x 24)
+        out = mid.reshape((mid.size()[0], -1)); # N x (ch x 24)
 
         return out;
\ No newline at end of file
diff --git a/demo_feature.py b/demo_feature.py
index e3bd290..facd65d 100755
--- a/demo_feature.py
+++ b/demo_feature.py
@@ -2,31 +2,125 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess
+import torch
+import numpy as np
+import os
 
 from SyncNetInstance import *
 
-# ==================== LOAD PARAMS ====================
+def analyze_features(features, save_path):
+    """
+    Analyze extracted features and provide detailed statistics
+    """
+    print("\n" + "="*60)
+    print("FEATURE ANALYSIS REPORT")
+    print("="*60)
+
+    # Basic info
+    print("\n=== BASIC INFO ===")
+    print(f"Features shape: {features.shape}")
+    print(f"Features dtype: {features.dtype}")
+    print(f"Features device: {features.device}")
+    print(f"Number of dimensions: {features.dim()}")
+    print(f"Total elements: {features.numel()}")
+    print(f"Memory usage: {features.numel() * features.element_size() / (1024*1024):.2f} MB")
+
+    # Statistical info
+    print("\n=== STATISTICAL INFO ===")
+    feats_np = features.numpy()
+    print(f"Min value: {feats_np.min():.6f}")
+    print(f"Max value: {feats_np.max():.6f}")
+    print(f"Mean value: {feats_np.mean():.6f}")
+    print(f"Std deviation: {feats_np.std():.6f}")
+    print(f"Number of zeros: {np.sum(feats_np == 0)}")
+    print(f"Number of NaN values: {np.sum(np.isnan(feats_np))}")
+    print(f"Number of infinite values: {np.sum(np.isinf(feats_np))}")
+
+    # Feature analysis
+    print("\n=== FEATURE ANALYSIS ===")
+    num_frames, features_per_frame = features.shape
+    print(f"Features per frame: {features_per_frame}")
+    print(f"Total frames processed: {num_frames}")
+    print(f"Video duration estimate: {num_frames/25:.2f} seconds (assuming 25fps)")
+
+    # Feature vector analysis
+    print("\n=== FEATURE VECTOR ANALYSIS ===")
+    first_frame = feats_np[0]
+    non_zero_count = np.sum(first_frame != 0)
+    l2_norm = np.linalg.norm(first_frame, ord=2)
+    l1_norm = np.linalg.norm(first_frame, ord=1)
+    print(f"First frame feature vector stats:")
+    print(f"  - Non-zero elements: {non_zero_count}/{features_per_frame}")
+    print(f"  - L2 norm: {l2_norm:.6f}")
+    print(f"  - L1 norm: {l1_norm:.6f}")
+
+    # Temporal analysis
+    print("\n=== TEMPORAL ANALYSIS ===")
+    frame_norms = np.linalg.norm(feats_np, axis=1, ord=2)
+    print(f"Frame-wise L2 norms:")
+    print(f"  - Min norm: {frame_norms.min():.6f}")
+    print(f"  - Max norm: {frame_norms.max():.6f}")
+    print(f"  - Mean norm: {frame_norms.mean():.6f}")
+    print(f"  - Std norm: {frame_norms.std():.6f}")
+
+    # Feature diversity
+    print("\n=== FEATURE DIVERSITY ===")
+    feature_stds = np.std(feats_np, axis=0)
+    low_variance_count = np.sum(feature_stds < 0.01)
+    high_variance_count = np.sum(feature_stds > 1.0)
+    most_active_dim = np.argmax(feature_stds)
+    least_active_dim = np.argmin(feature_stds)
+
+    print(f"Feature dimension statistics:")
+    print(f"  - Dimensions with low variance (<0.01): {low_variance_count}")
+    print(f"  - Dimensions with high variance (>1.0): {high_variance_count}")
+    print(f"  - Most active feature dimension: {most_active_dim} (std: {feature_stds[most_active_dim]:.6f})")
+    print(f"  - Least active feature dimension: {least_active_dim} (std: {feature_stds[least_active_dim]:.6f})")
+
+    # File info
+    if os.path.exists(save_path):
+        file_size = os.path.getsize(save_path) / (1024*1024)
+        print(f"\n=== FILE INFO ===")
+        print(f"Saved to: {save_path}")
+        print(f"File size: {file_size:.2f} MB")
+
+    print("="*60)
 
+# ==================== LOAD PARAMS ====================
 
-parser = argparse.ArgumentParser(description = "SyncNet");
+parser = argparse.ArgumentParser(description = "SyncNet Feature Extractor");
 
-parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
-parser.add_argument('--batch_size', type=int, default='20', help='');
-parser.add_argument('--vshift', type=int, default='15', help='');
-parser.add_argument('--videofile', type=str, default="data/example.avi", help='');
-parser.add_argument('--tmp_dir', type=str, default="data", help='');
-parser.add_argument('--save_as', type=str, default="data/features.pt", help='');
+parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='Path to pre-trained SyncNet model');
+parser.add_argument('--batch_size', type=int, default='1', help='Fixed to 1 for memory efficiency');
+parser.add_argument('--vshift', type=int, default='15', help='Time shift for sync analysis');
+parser.add_argument('--videofile', type=str, default="data/example.avi", help='Input video file');
+parser.add_argument('--tmp_dir', type=str, default="data", help='Temporary directory');
+parser.add_argument('--save_as', type=str, default="data/features.pt", help='Output feature file path');
+parser.add_argument('--analyze', action='store_true', help='Enable detailed feature analysis');
 
 opt = parser.parse_args();
 
-
 # ==================== RUN EVALUATION ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
 
+print(f'[INFO] Extracting features from: {opt.videofile}')
 feats = s.extract_feature(opt, videofile=opt.videofile)
 
+print(f'[INFO] Saving features to: {opt.save_as}')
 torch.save(feats, opt.save_as)
+
+# Perform detailed analysis
+if opt.analyze:
+    analyze_features(feats, opt.save_as)
+else:
+    print(f'[INFO] Features extracted and saved successfully!')
+    print(f'[INFO] Feature shape: {feats.shape}')
+    print(f'[INFO] Use --analyze flag for detailed feature analysis')
diff --git a/demo_syncnet.py b/demo_syncnet.py
index 01c25a6..bfc3e2b 100755
--- a/demo_syncnet.py
+++ b/demo_syncnet.py
@@ -2,6 +2,7 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess
+import torch
 
 from SyncNetInstance import *
 
@@ -11,7 +12,7 @@ parser = argparse.ArgumentParser(description = "SyncNet");
 
 parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
-parser.add_argument('--batch_size', type=int, default='20', help='');
+parser.add_argument('--batch_size', type=int, default='1', help='Fixed to 1 for SyncNet compatibility');
 parser.add_argument('--vshift', type=int, default='15', help='');
 parser.add_argument('--videofile', type=str, default="data/example.avi", help='');
 parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
@@ -22,7 +23,11 @@
 
 # ==================== RUN EVALUATION ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
diff --git a/detectors/s3fd/__init__.py b/detectors/s3fd/__init__.py
index d7f35e0..c25e7ea 100644
--- a/detectors/s3fd/__init__.py
+++ b/detectors/s3fd/__init__.py
@@ -12,7 +12,7 @@
 
 class S3FD():
 
-    def __init__(self, device='cuda'):
+    def __init__(self, device='cpu'):
 
         tstamp = time.time()
         self.device = device
diff --git a/detectors/s3fd/box_utils.py b/detectors/s3fd/box_utils.py
index 0779bcd..1bf4be2 100644
--- a/detectors/s3fd/box_utils.py
+++ b/detectors/s3fd/box_utils.py
@@ -35,7 +35,7 @@ def nms_(dets, thresh):
         inds = np.where(ovr <= thresh)[0]
         order = order[inds + 1]
 
-    return np.array(keep).astype(np.int)
+    return np.array(keep).astype(int)
 
 
 def decode(loc, priors, variances):
diff --git a/device_utils.py b/device_utils.py
new file mode 100644
index 0000000..303c6c5
--- /dev/null
+++ b/device_utils.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
+import torch
+
+def get_device(prefer_cuda=True):
+    """
+    Get the best available device (CUDA if available and preferred, otherwise CPU)
+
+    Args:
+        prefer_cuda (bool): Whether to prefer CUDA over CPU if available
+
+    Returns:
+        str: Device string ('cuda' or 'cpu')
+    """
+    if prefer_cuda and torch.cuda.is_available():
+        device = 'cuda'
+        print(f'[INFO] Using CUDA GPU: {torch.cuda.get_device_name(0)}')
+    else:
+        device = 'cpu'
+        if prefer_cuda and not torch.cuda.is_available():
+            print('[INFO] CUDA not available, falling back to CPU')
+        else:
+            print('[INFO] Using CPU')
+
+    return device
+
+def print_device_info():
+    """Print information about available devices"""
+    print("=" * 50)
+    print("Device Information:")
+    print(f"CUDA Available: {torch.cuda.is_available()}")
+
+    if torch.cuda.is_available():
+        print(f"CUDA Version: {torch.version.cuda}")
+        print(f"GPU Count: {torch.cuda.device_count()}")
+        for i in range(torch.cuda.device_count()):
+            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+
+    print(f"PyTorch Version: {torch.__version__}")
+    print("=" * 50)
diff --git a/news_audio_delay_360p_500ms.mp4 b/news_audio_delay_360p_500ms.mp4
new file mode 100644
index 0000000..dced929
Binary files /dev/null and b/news_audio_delay_360p_500ms.mp4 differ
diff --git a/run_pipeline.py b/run_pipeline.py
index f5fc22e..67bef53 100755
--- a/run_pipeline.py
+++ b/run_pipeline.py
@@ -2,6 +2,7 @@
 
 import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2
 import numpy as np
+import torch
 from shutil import rmtree
 
 import scenedetect
@@ -184,7 +185,11 @@ def crop_video(opt,track,cropfile):
 
 def inference_video(opt):
 
-  DET = S3FD(device='cuda')
+  # Check if CUDA is available, otherwise use CPU
+  device = 'cuda' if torch.cuda.is_available() else 'cpu'
+  print(f'[INFO] Using device: {device}')
+
+  DET = S3FD(device=device)
 
   flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
   flist.sort()
diff --git a/run_syncnet.py b/run_syncnet.py
index 45099fd..493d262 100755
--- a/run_syncnet.py
+++ b/run_syncnet.py
@@ -2,6 +2,7 @@
 #-*- coding: utf-8 -*-
 
 import time, pdb, argparse, subprocess, pickle, os, gzip, glob
+import torch
 
 from SyncNetInstance import *
 
@@ -24,7 +25,11 @@
 
 # ==================== LOAD MODEL AND FILE LIST ====================
 
-s = SyncNetInstance();
+# Check if CUDA is available, otherwise use CPU
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f'[INFO] Using device: {device}')
+
+s = SyncNetInstance(device=device);
 
 s.loadParameters(opt.initial_model);
 print("Model %s loaded."%opt.initial_model);
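
For reference, a minimal sketch of how the device handling introduced above could be exercised once the patch is applied; the explicit imports and the model path are assumptions based on the files in this diff, not part of the patch itself:

```python
# Sketch only: assumes this patch is applied and that the pre-trained
# data/syncnet_v2.model from the original repository is present.
from device_utils import get_device, print_device_info
from SyncNetInstance import SyncNetInstance

print_device_info()                    # report CUDA/CPU availability and versions
device = get_device(prefer_cuda=True)  # 'cuda' if available, otherwise 'cpu'

s = SyncNetInstance(device=device)     # weights are placed on the chosen device
s.loadParameters("data/syncnet_v2.model")
```

As noted in the README section above, setting `CUDA_VISIBLE_DEVICES=''` before launching any of the scripts makes `torch.cuda.is_available()` return `False`, so the same code paths fall back to CPU without further changes.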