From 2cbd524391a851bc73444dedb338dcd5da188fb2 Mon Sep 17 00:00:00 2001 From: Owen-Cochell Date: Sun, 13 Nov 2022 22:00:15 -0500 Subject: [PATCH 1/3] Added web scraping script for automating video retrieval --- als_down.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 17 ++--- 2 files changed, 188 insertions(+), 8 deletions(-) create mode 100644 als_down.py diff --git a/als_down.py b/als_down.py new file mode 100644 index 0000000..53ba89e --- /dev/null +++ b/als_down.py @@ -0,0 +1,179 @@ +""" +A script that handles downloading videos from : + +https://www.signasl.org + +We automatically download, sort, and organize these incoming videos. +We utilize yt-dlp for downloading videos from an arbitrary source +""" + +import os + +from collections import defaultdict + +import requests + +from yt_dlp import YoutubeDL +from bs4 import BeautifulSoup + +FOLDER = os.path.join("data", "videos") +BASE_URL = 'https://www.signasl.org/sign/' + +def scrape_warn(text, *args): + + # First, print out generic text: + + print("WARNING: {}".format(text)) + + if args: + + print("Extra Debugging Info:") + + # Next, print out each argument: + + for arg in args: + + print(arg.prettify()) + +def download_video(name, url, start_time, duration_time): + + # First, create the directory structure: + + file_path = os.path.join(FOLDER, name) + if not os.path.exists(file_path): + os.mkdir(file_path) + + # Next, create the parameters: + + params = { + 'format': 'mp4', + 'paths': { + 'home': file_path, + }, + 'outtmpl': { + 'default': '{}-%(title)s.%(ext)s'.format(name), + } + } + + # Finally, create and download the video + + down = YoutubeDL(params) + + down.download(url) + +def get_video_urls(name): + + # First, generate a valid URL and fetch the content: + + print("Getting page at URL: {}".format(BASE_URL + name)) + + data = requests.get(BASE_URL + name) + + # Next, create a parser and load the content: + + print("Starting parse operation ...") + + soup = BeautifulSoup(data.content, "html.parser") + + # Get all video elements: + + results = soup.find_all("div", itemprop='video') + + print("Number of videos to extract: {}".format(len(results))) + + vid_map = defaultdict(list) + + found = 0 + + for thing in results: + + # Now, get URL of this video: + + URL = None + + vid = thing.find("video") + + if vid == None: + + # No video element, see if we have an iframe element: + # TODO: iframes might be missing the protocol header, check if this is a problem... + + URL = thing.find('iframe')['src'] + + if URL is None: + + # Alright, no valid elements found, freak out: + + scrape_warn("No valid video element found!", thing) + + else: + + # We found our video, extract the URL: + + URL = vid.find('source')['src'] + + # Check to ensure it is valid: + + if URL is None: + + scrape_warn("Video element found, but no valid URL!", thing, vid) + + continue + + # Sweet, got past our error checks, get the sign name: + + sign = thing.find('div', style='float:left').find('i').contents[0].lower() + + # Finally, add the data to the collection: + + vid_map[sign].append(URL) + + print("Got {} [{}]: {}".format(sign, found, URL)) + + # Add to our found value: + + found += 1 + + # Finally, ensure we found all the videos: + + if (found != len(results)): + + # Print a warning + + scrape_warn("Not all videos scrapped! Check above for error logs") + + return vid_map + +def get_videos(name): + + # First, get a mapping of names to vids: + + vid_map = get_video_urls(name) + + # Next, create directory structure: + + print("Creating directory structure ...") + + file_path = 'data' + if not os.path.exists(file_path): + os.mkdir(file_path) + + file_path = os.path.join('data', 'videos') + if not os.path.exists(file_path): + os.mkdir(file_path) + + file_path = os.path.join('data', 'dataset') + if not os.path.exists(file_path): + os.mkdir(file_path) + + # Next, download each video: + + for sign in vid_map.keys(): + + for url in vid_map[sign]: + + # Download this video: + + download_video(sign, url, 0, 0) + +get_videos('test') diff --git a/requirements.txt b/requirements.txt index bc7c9aa..07531a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ -fastdtw==0.3.4 -mediapipe==0.8.7.3 +fastdtw +mediapipe numpy -opencv-contrib-python==4.5.4.60 -opencv-python==4.5.3.56 -pandas==1.3.2 -pytube==11.0.1 -tdqm==0.0.1 -flask \ No newline at end of file +opencv-contrib-python +opencv-python +pandas +pytube +tdqm +flask +yt-dlp \ No newline at end of file From c95dd050c06811006970c2ffdc784c33627731b6 Mon Sep 17 00:00:00 2001 From: Owen-Cochell Date: Sun, 13 Nov 2022 23:05:22 -0500 Subject: [PATCH 2/3] Added requirements, fixed naming issue --- als_down.py => asl_down.py | 0 requirements.txt | 4 +++- 2 files changed, 3 insertions(+), 1 deletion(-) rename als_down.py => asl_down.py (100%) diff --git a/als_down.py b/asl_down.py similarity index 100% rename from als_down.py rename to asl_down.py diff --git a/requirements.txt b/requirements.txt index 07531a8..1888e9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ pandas pytube tdqm flask -yt-dlp \ No newline at end of file +yt-dlp +beautifulsoup4 +requests \ No newline at end of file From f90fd964f8ff7a253415cb03790c62e830c4ceeb Mon Sep 17 00:00:00 2001 From: Owen-Cochell Date: Tue, 15 Nov 2022 14:16:15 -0500 Subject: [PATCH 3/3] asl_down improvements, proper CLI, extract landmarks from video, ignore non-matching signs, get signs from file --- asl_down.py | 128 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 7 deletions(-) diff --git a/asl_down.py b/asl_down.py index 53ba89e..2117ff0 100644 --- a/asl_down.py +++ b/asl_down.py @@ -8,6 +8,7 @@ """ import os +import argparse from collections import defaultdict @@ -15,6 +16,7 @@ from yt_dlp import YoutubeDL from bs4 import BeautifulSoup +from utils.dataset_utils import load_dataset, save_landmarks_from_video FOLDER = os.path.join("data", "videos") BASE_URL = 'https://www.signasl.org/sign/' @@ -61,7 +63,7 @@ def download_video(name, url, start_time, duration_time): down.download(url) -def get_video_urls(name): +def get_video_urls(name, ignore=False): # First, generate a valid URL and fetch the content: @@ -84,6 +86,7 @@ def get_video_urls(name): vid_map = defaultdict(list) found = 0 + non_match = 0 for thing in results: @@ -124,6 +127,20 @@ def get_video_urls(name): sign = thing.find('div', style='float:left').find('i').contents[0].lower() + # Determine if we should ignore this video: + + if sign != name: + + non_match += 1 + + if ignore: + + # Just continue: + + print("Found non-matching sign and ignoring ....") + + continue + # Finally, add the data to the collection: vid_map[sign].append(URL) @@ -134,21 +151,36 @@ def get_video_urls(name): found += 1 + # Show some basic stats: + + print("\nBasic Stats:") + print("Total Videos Found: {}".format(found)) + print("Non-matching Videos Found: {}".format(non_match)) + print("Sign Map:") + + for key in vid_map.keys(): + + print(" {} : {}".format(key, len(vid_map[key]))) + # Finally, ensure we found all the videos: - if (found != len(results)): + if ((ignore and found != len(results) - non_match) or (not ignore and found != len(results))): # Print a warning - scrape_warn("Not all videos scrapped! Check above for error logs") + scrape_warn("Not all valid videos scrapped! Check above for error logs") - return vid_map + return vid_map, found -def get_videos(name): +def get_videos(name, ignore=False): # First, get a mapping of names to vids: - vid_map = get_video_urls(name) + print("\n --== [ Web Scraping ] ==-- \n") + + vid_map, found = get_video_urls(name, ignore) + + print("\n --== [ End Web Scraping ] ==-- \n") # Next, create directory structure: @@ -168,12 +200,94 @@ def get_videos(name): # Next, download each video: + print("\n --== [ Video Download ] ==--\n") + + down = 1 + for sign in vid_map.keys(): for url in vid_map[sign]: # Download this video: + print("> Downloading Video {}/{}".format(down, found)) + download_video(sign, url, 0, 0) -get_videos('test') + down += 1 + + print("\n --== [ End Video Download ] ==-- ") + +def parse_file(path): + + # Open the file for reading: + + file = open(path, 'r') + + # Read the file: + + signs = [] + + for line in file: + + signs.append(line.rstrip()) + + return signs + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='ASL Video Downloader') + + # Add names: + + parser.add_argument('sign', help='Query used to download videos', nargs='*') + + # Other various options: + + parser.add_argument('-ne', '--no-extract', help='Does not extract landmarks from videos.', action='store_false') + parser.add_argument('-i', '--ignore', help='Ignores any videos that do no match our search query', action='store_true') + parser.add_argument('-f', '--file', help='Gets signs to download from a file, each sign on a new line', type=str, default='0') + + # Get our arguments + + args = parser.parse_args() + + signs = set(args.sign) + + # Determine if we are getting info from file: + + if args.file != '0': + + print("Extracting signs from file...") + + signs.update(set(parse_file(args.file))) + + print("Signs to download: {}".format(signs)) + + # Now, do the operation: + + for sign in signs: + + print("Doing operation for: {}".format(sign)) + + get_videos(sign, args.ignore) + + # Determine if we should extract videos: + + if args.no_extract: + + # Now, extract the landmarks from these videos: + + print("\n --== [ Landmark Extraction ] ==--\n") + + # Load the dataset, and build reference signs: + + load_dataset() + + print("\n --== [End Landmark Extraction ] ==--\n") + + else: + + print("Skipping landmark extraction...") + + print("\nDone!")