diff --git a/asl_down.py b/asl_down.py index 53ba89e..2117ff0 100644 --- a/asl_down.py +++ b/asl_down.py @@ -8,6 +8,7 @@ """ import os +import argparse from collections import defaultdict @@ -15,6 +16,7 @@ from yt_dlp import YoutubeDL from bs4 import BeautifulSoup +from utils.dataset_utils import load_dataset, save_landmarks_from_video FOLDER = os.path.join("data", "videos") BASE_URL = 'https://www.signasl.org/sign/' @@ -61,7 +63,7 @@ def download_video(name, url, start_time, duration_time): down.download(url) -def get_video_urls(name): +def get_video_urls(name, ignore=False): # First, generate a valid URL and fetch the content: @@ -84,6 +86,7 @@ def get_video_urls(name): vid_map = defaultdict(list) found = 0 + non_match = 0 for thing in results: @@ -124,6 +127,20 @@ def get_video_urls(name): sign = thing.find('div', style='float:left').find('i').contents[0].lower() + # Determine if we should ignore this video: + + if sign != name: + + non_match += 1 + + if ignore: + + # Just continue: + + print("Found non-matching sign and ignoring ....") + + continue + # Finally, add the data to the collection: vid_map[sign].append(URL) @@ -134,21 +151,36 @@ def get_video_urls(name): found += 1 + # Show some basic stats: + + print("\nBasic Stats:") + print("Total Videos Found: {}".format(found)) + print("Non-matching Videos Found: {}".format(non_match)) + print("Sign Map:") + + for key in vid_map.keys(): + + print(" {} : {}".format(key, len(vid_map[key]))) + # Finally, ensure we found all the videos: - if (found != len(results)): + if ((ignore and found != len(results) - non_match) or (not ignore and found != len(results))): # Print a warning - scrape_warn("Not all videos scrapped! Check above for error logs") + scrape_warn("Not all valid videos scrapped! Check above for error logs") - return vid_map + return vid_map, found -def get_videos(name): +def get_videos(name, ignore=False): # First, get a mapping of names to vids: - vid_map = get_video_urls(name) + print("\n --== [ Web Scraping ] ==-- \n") + + vid_map, found = get_video_urls(name, ignore) + + print("\n --== [ End Web Scraping ] ==-- \n") # Next, create directory structure: @@ -168,12 +200,94 @@ def get_videos(name): # Next, download each video: + print("\n --== [ Video Download ] ==--\n") + + down = 1 + for sign in vid_map.keys(): for url in vid_map[sign]: # Download this video: + print("> Downloading Video {}/{}".format(down, found)) + download_video(sign, url, 0, 0) -get_videos('test') + down += 1 + + print("\n --== [ End Video Download ] ==-- ") + +def parse_file(path): + + # Open the file for reading: + + file = open(path, 'r') + + # Read the file: + + signs = [] + + for line in file: + + signs.append(line.rstrip()) + + return signs + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='ASL Video Downloader') + + # Add names: + + parser.add_argument('sign', help='Query used to download videos', nargs='*') + + # Other various options: + + parser.add_argument('-ne', '--no-extract', help='Does not extract landmarks from videos.', action='store_false') + parser.add_argument('-i', '--ignore', help='Ignores any videos that do no match our search query', action='store_true') + parser.add_argument('-f', '--file', help='Gets signs to download from a file, each sign on a new line', type=str, default='0') + + # Get our arguments + + args = parser.parse_args() + + signs = set(args.sign) + + # Determine if we are getting info from file: + + if args.file != '0': + + print("Extracting signs from file...") + + signs.update(set(parse_file(args.file))) + + print("Signs to download: {}".format(signs)) + + # Now, do the operation: + + for sign in signs: + + print("Doing operation for: {}".format(sign)) + + get_videos(sign, args.ignore) + + # Determine if we should extract videos: + + if args.no_extract: + + # Now, extract the landmarks from these videos: + + print("\n --== [ Landmark Extraction ] ==--\n") + + # Load the dataset, and build reference signs: + + load_dataset() + + print("\n --== [End Landmark Extraction ] ==--\n") + + else: + + print("Skipping landmark extraction...") + + print("\nDone!")