From 2cbd524391a851bc73444dedb338dcd5da188fb2 Mon Sep 17 00:00:00 2001
From: Owen-Cochell <owencochell@gmail.com>
Date: Sun, 13 Nov 2022 22:00:15 -0500
Subject: [PATCH 1/3] Added web scraping script for automating video retrieval

---
 als_down.py      | 179 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  17 ++---
 2 files changed, 188 insertions(+), 8 deletions(-)
 create mode 100644 als_down.py

diff --git a/als_down.py b/als_down.py
new file mode 100644
index 0000000..53ba89e
--- /dev/null
+++ b/als_down.py
@@ -0,0 +1,179 @@
+"""
+A script that handles downloading videos from:
+
+https://www.signasl.org
+
+We automatically download, sort, and organize these incoming videos.
+We utilize yt-dlp for downloading videos from an arbitrary source.
+"""
+
+import os
+
+from collections import defaultdict
+
+import requests
+
+from yt_dlp import YoutubeDL
+from bs4 import BeautifulSoup
+
+FOLDER = os.path.join("data", "videos")
+BASE_URL = 'https://www.signasl.org/sign/'
+
+def scrape_warn(text, *args):
+    """Print a warning, followed by optional debugging objects.
+
+    text -- message shown after the "WARNING: " prefix.
+    args -- extra objects to dump; anything exposing prettify() (such as a
+            parsed HTML node) is pretty-printed, the rest is printed as-is.
+    """
+
+    print("WARNING: {}".format(text))
+
+    if args:
+        print("Extra Debugging Info:")
+        for arg in args:
+            # Not every debugging object is an HTML node, so fall back to print:
+            print(arg.prettify() if hasattr(arg, "prettify") else arg)
+
+def download_video(name, url, start_time, duration_time):
+    """Download one video into the folder reserved for the sign `name`.
+
+    The target directory is FOLDER/<name>; the output template names the
+    file '<name>-<title>.<ext>'. start_time and duration_time are accepted
+    but not used yet.
+    """
+
+    # makedirs also creates FOLDER itself when it is missing, and
+    # exist_ok avoids the check-then-create race of exists() + mkdir():
+    file_path = os.path.join(FOLDER, name)
+    os.makedirs(file_path, exist_ok=True)
+
+    # Options handed to yt-dlp: requested format, output folder, file name.
+
+    params = {
+        'format': 'mp4',
+        'paths': {'home': file_path},
+        'outtmpl': {
+            'default': '{}-%(title)s.%(ext)s'.format(name),
+        },
+    }
+    down = YoutubeDL(params)
+
+    down.download(url)
+    
+def get_video_urls(name):
+
+    # First, generate a valid URL and fetch the content:
+
+    print("Getting page at URL: {}".format(BASE_URL + name))
+
+    data = requests.get(BASE_URL + name)
+
+    # Next, create a parser and load the content:
+
+    print("Starting parse operation ...")
+
+    soup = BeautifulSoup(data.content, "html.parser")
+
+    # Get all video elements (each one is a <div itemprop="video">):
+
+    results = soup.find_all("div", itemprop='video')
+
+    print("Number of videos to extract: {}".format(len(results)))
+
+    vid_map = defaultdict(list)
+
+    found = 0
+
+    for thing in results:
+
+        # Now, get URL of this video. find() returns None for a missing
+        # tag, so each tag is checked before its 'src' attribute is read:
+
+        URL = None
+
+        vid = thing.find("video")
+
+        if vid is None:
+
+            # No video element, see if we have an iframe element:
+            # TODO: iframes might be missing the protocol header, check if this is a problem...
+
+            frame = thing.find('iframe')
+            URL = frame.get('src') if frame is not None else None
+
+            if URL is None:
+
+                # No usable element at all: warn and skip this entry.
+                scrape_warn("No valid video element found!", thing)
+                continue
+
+        else:
+
+            # We found our video, extract the URL from its <source> child:
+
+            source = vid.find('source')
+            URL = source.get('src') if source is not None else None
+
+            if URL is None:
+
+                scrape_warn("Video element found, but no valid URL!", thing, vid)
+                continue
+
+        # Sweet, got past our error checks, get the sign name:
+
+        sign = thing.find('div', style='float:left').find('i').contents[0].lower()
+
+        # Finally, add the data to the collection:
+
+        vid_map[sign].append(URL)
+
+        print("Got {} [{}]: {}".format(sign, found,  URL))
+
+        # Add to our found value:
+
+        found += 1
+
+    # Finally, ensure we found all the videos:
+
+    if (found != len(results)):
+
+        # Print a warning
+
+        scrape_warn("Not all videos scrapped! Check above for error logs")
+
+    return vid_map
+
+def get_videos(name):
+
+    # First, get a mapping of names to vids:
+
+    vid_map = get_video_urls(name)
+
+    # Next, create directory structure:
+
+    print("Creating directory structure ...")
+
+    # Parent first, then the two sub-folders, so mkdir never lacks a parent:
+
+    layout = (
+        'data',
+        os.path.join('data', 'videos'),
+        os.path.join('data', 'dataset'),
+    )
+
+    for file_path in layout:
+        if not os.path.exists(file_path):
+            os.mkdir(file_path)
+
+    # Next, download each video:
+
+    for sign in vid_map.keys():
+
+        for url in vid_map[sign]:
+
+            # Download this video:
+
+            download_video(sign, url, 0, 0)
+
+get_videos('test')
diff --git a/requirements.txt b/requirements.txt
index bc7c9aa..07531a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
-fastdtw==0.3.4
-mediapipe==0.8.7.3
+fastdtw
+mediapipe
 numpy
-opencv-contrib-python==4.5.4.60
-opencv-python==4.5.3.56
-pandas==1.3.2
-pytube==11.0.1
-tdqm==0.0.1
-flask
\ No newline at end of file
+opencv-contrib-python
+opencv-python
+pandas
+pytube
+tdqm
+flask
+yt-dlp
\ No newline at end of file

From c95dd050c06811006970c2ffdc784c33627731b6 Mon Sep 17 00:00:00 2001
From: Owen-Cochell <owencochell@gmail.com>
Date: Sun, 13 Nov 2022 23:05:22 -0500
Subject: [PATCH 2/3] Added requirements, fixed naming issue

---
 als_down.py => asl_down.py | 0
 requirements.txt           | 4 +++-
 2 files changed, 3 insertions(+), 1 deletion(-)
 rename als_down.py => asl_down.py (100%)

diff --git a/als_down.py b/asl_down.py
similarity index 100%
rename from als_down.py
rename to asl_down.py
diff --git a/requirements.txt b/requirements.txt
index 07531a8..1888e9c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ pandas
 pytube
 tdqm
 flask
-yt-dlp
\ No newline at end of file
+yt-dlp
+beautifulsoup4
+requests
\ No newline at end of file

From f90fd964f8ff7a253415cb03790c62e830c4ceeb Mon Sep 17 00:00:00 2001
From: Owen-Cochell <owencochell@gmail.com>
Date: Tue, 15 Nov 2022 14:16:15 -0500
Subject: [PATCH 3/3] asl_down improvements, proper CLI, extract landmarks from
 video, ignore non-matching signs, get signs from file

---
 asl_down.py | 128 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 121 insertions(+), 7 deletions(-)

diff --git a/asl_down.py b/asl_down.py
index 53ba89e..2117ff0 100644
--- a/asl_down.py
+++ b/asl_down.py
@@ -8,6 +8,7 @@
 """
 
 import os
+import argparse
 
 from collections import defaultdict
 
@@ -15,6 +16,7 @@
 
 from yt_dlp import YoutubeDL
 from bs4 import BeautifulSoup
+from utils.dataset_utils import load_dataset, save_landmarks_from_video
 
 FOLDER = os.path.join("data", "videos")
 BASE_URL = 'https://www.signasl.org/sign/'
@@ -61,7 +63,7 @@ def download_video(name, url, start_time, duration_time):
 
     down.download(url)
     
-def get_video_urls(name):
+def get_video_urls(name, ignore=False):
 
     # First, generate a valid URL and fetch the content:
 
@@ -84,6 +86,7 @@ def get_video_urls(name):
     vid_map = defaultdict(list)
 
     found = 0
+    non_match = 0
 
     for thing in results:
 
@@ -124,6 +127,20 @@ def get_video_urls(name):
 
         sign = thing.find('div', style='float:left').find('i').contents[0].lower()
 
+        # Determine if we should ignore this video:
+
+        if sign != name:
+
+            non_match += 1
+
+            if ignore:
+
+                # Just continue:
+
+                print("Found non-matching sign and ignoring ....")
+
+                continue
+
         # Finally, add the data to the collection:
 
         vid_map[sign].append(URL)
@@ -134,21 +151,36 @@ def get_video_urls(name):
 
         found += 1
 
+    # Show some basic stats:
+
+    print("\nBasic Stats:")
+    print("Total Videos Found: {}".format(found))
+    print("Non-matching Videos Found: {}".format(non_match))
+    print("Sign Map:")
+
+    for key in vid_map.keys():
+
+        print("   {} : {}".format(key, len(vid_map[key])))
+
     # Finally, ensure we found all the videos:
 
-    if (found != len(results)):
+    if ((ignore and found != len(results) - non_match) or (not ignore and found != len(results))):
 
         # Print a warning
 
-        scrape_warn("Not all videos scrapped! Check above for error logs")
+        scrape_warn("Not all valid videos scrapped! Check above for error logs")
 
-    return vid_map
+    return vid_map, found
 
-def get_videos(name):
+def get_videos(name, ignore=False):
 
     # First, get a mapping of names to vids:
 
-    vid_map = get_video_urls(name)
+    print("\n --== [ Web Scraping ] ==-- \n")
+
+    vid_map, found = get_video_urls(name, ignore)
+
+    print("\n --== [ End Web Scraping ] ==-- \n")
 
     # Next, create directory structure:
 
@@ -168,12 +200,94 @@ def get_videos(name):
 
     # Next, download each video:
 
+    print("\n --== [ Video Download ] ==--\n")
+
+    down = 1
+
     for sign in vid_map.keys():
 
         for url in vid_map[sign]:
 
             # Download this video:
 
+            print("> Downloading Video {}/{}".format(down, found))
+
             download_video(sign, url, 0, 0)
 
-get_videos('test')
+            down += 1
+
+    print("\n --== [ End Video Download ] ==-- ")
+
+def parse_file(path):
+    """Read sign names from a text file, one name per line.
+
+    Trailing whitespace is stripped from every line and lines that end up
+    empty are skipped, so a blank line never turns into an empty query.
+    Returns the names as a list, in file order.
+    """
+
+    # The with-block closes the handle even if reading fails part-way:
+
+    with open(path, 'r') as handle:
+
+        signs = [line.rstrip() for line in handle if line.strip()]
+
+    return signs
+
+if __name__ == '__main__':
+
+    # Command-line entry point: collect the requested signs, fetch their
+    # videos, then (unless disabled) run the landmark-extraction step.
+
+    parser = argparse.ArgumentParser(description='ASL Video Downloader')
+
+    # Positional sign names; may be empty when --file supplies them:
+    parser.add_argument('sign', help='Query used to download videos', nargs='*')
+
+    # dest='extract' lets the flag read naturally: True by default, False
+    # once -ne/--no-extract is given. For --file, None (not a magic '0')
+    # marks "no file given", so a file literally named 0 still works:
+    parser.add_argument('-ne', '--no-extract', dest='extract', action='store_false',
+                        help='Does not extract landmarks from videos.')
+    parser.add_argument('-i', '--ignore', action='store_true',
+                        help='Ignores any videos that do not match our search query')
+    parser.add_argument('-f', '--file', type=str, default=None,
+                        help='Gets signs to download from a file, each sign on a new line')
+
+    args = parser.parse_args()
+
+    signs = set(args.sign)
+
+    # Determine if we are getting info from file:
+    if args.file is not None:
+
+        print("Extracting signs from file...")
+
+        signs.update(parse_file(args.file))
+
+    print("Signs to download: {}".format(signs))
+
+    # Now, do the operation:
+    for sign in signs:
+
+        print("Doing operation for: {}".format(sign))
+
+        get_videos(sign, args.ignore)
+
+    # Determine if we should extract landmarks:
+
+    if args.extract:
+
+        print("\n --== [ Landmark Extraction ] ==--\n")
+
+        # Load the dataset, and build reference signs:
+
+        load_dataset()
+
+        print("\n --== [End Landmark Extraction ] ==--\n")
+
+    else:
+
+        print("Skipping landmark extraction...")
+
+    print("\nDone!")