diff --git a/README.md b/README.md index 7c325f7..d672131 100644 --- a/README.md +++ b/README.md @@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download # Advanced Usage ``` -usage: main.py [-h] [-d] [-q] [--download-assets] +usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions] Udemy Downloader optional arguments: - -h, --help show this help message and exit - -d, --debug Use test_data.json rather than fetch from the udemy api. - -q , --quality Download specific video quality. (144, 360, 480, 720, 1080) - --download-assets Download lecture assets along with lectures + -h, --help show this help message and exit + -d, --debug Use test_data.json rather than fetch from the udemy api. + -q , --quality Download specific video quality. (144, 360, 480, 720, 1080) + -l , --lang The language to download for captions (Default is en) + --download-assets If specified, lecture assets will be downloaded. + --download-captions If specified, captions will be downloaded. ``` - Download a specific quality @@ -88,6 +90,14 @@ optional arguments: - `python main.py --download-assets` - Download assets and specify a quality - `python main.py -q 360 --download-assets` +- Download captions + - `python main.py --download-captions` +- Download captions with a specific language + - `python main.py --download-captions -l en` - English subtitles + - `python main.py --download-captions -l es` - Spanish subtitles + - `python main.py --download-captions -l it` - Italian subtitles + - `python main.py --download-captions -l pl` - Polish subtitles + - etc # Getting an error about "Accepting the latest terms of service"? 
diff --git a/main.py b/main.py index e22f44e..ebc0262 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes, parse_node_value, write_attr_value, write_child_node, write_node_value) from utils import extract_kid +from vtt_to_srt import convert load_dotenv() @@ -24,6 +25,8 @@ retry = 3 home_dir = os.getcwd() keyfile_path = "%s\keyfile.json" % os.getcwd() dl_assets = False +dl_captions = False +caption_locale = "en" quality = None # None will download the best possible valid_qualities = [144, 360, 480, 720, 1080] @@ -315,7 +318,7 @@ def download(url, path, filename): return file_size -def process_lecture(lecture, lecture_path, lecture_dir): +def process_lecture(lecture, lecture_index, lecture_path, lecture_dir): lecture_title = lecture["title"] lecture_asset = lecture["asset"] if lecture_asset["media_license_token"] == None: @@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir): print("> Found %s assets for lecture '%s'" % (len(assets), lecture_title)) + # process captions + if dl_captions: + captions = [] + for caption in lecture_asset.get("captions"): + if not isinstance(caption, dict): + continue + if caption.get("_class") != "caption": + continue + download_url = caption.get("url") + if not download_url or not isinstance(download_url, str): + continue + lang = (caption.get("language") or caption.get("srclang") + or caption.get("label") + or caption.get("locale_id").split("_")[0]) + ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt" + if caption_locale == "all" or caption_locale == lang: + captions.append({ + "language": lang, + "locale_id": caption.get("locale_id"), + "ext": ext, + "url": download_url + }) + + for caption in captions: + filename = f"%s. %s_%s.%s" % (lecture_index, sanitize( + lecture_title), caption.get("locale_id"), caption.get("ext")) + filename_no_ext = f"%s. 
%s_%s" % (lecture_index, + sanitize(lecture_title), + caption.get("locale_id")) + filepath = f"%s\\%s" % (lecture_dir, filename) + + if os.path.isfile(filepath): + print("> Captions '%s' already downloaded." % filename) + else: + print(f"> Downloading captions: '%s'" % filename) + download(caption.get("url"), filepath, filename) + if caption.get("ext") == "vtt": + print("> Converting captions to SRT format...") + convert(lecture_dir, filename_no_ext) + print("> Caption conversion complete.") + os.remove(filepath) + def parse(data): chapters = [] @@ -396,10 +441,10 @@ def parse(data): except IndexError: # This is caused by there not being a starting chapter lectures.append(obj) - lecture_path = f"%s\\%s. %s.mp4" % (download_dir, - lectures.index(obj) + 1, - sanitize(obj["title"])) - process_lecture(obj, lecture_path, download_dir) + lecture_index = lectures.index(obj) + 1 + lecture_path = f"%s\\%s. %s.mp4" % ( + download_dir, lecture_index, sanitize(obj["title"])) + process_lecture(obj, lecture_index, lecture_path, download_dir) for chapter in chapters: chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) + @@ -408,10 +453,13 @@ def parse(data): os.mkdir(chapter_dir) for lecture in chapter["lectures"]: - lecture_path = f"%s\\%s. %s.mp4" % ( - chapter_dir, chapter["lectures"].index(lecture) + 1, - sanitize(lecture["title"])) - process_lecture(lecture, lecture_path, chapter_dir) + lecture_index = chapter["lectures"].index(lecture) + 1 + lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index, + sanitize(lecture["title"])) + process_lecture(lecture, lecture_index, lecture_path, chapter_dir) + print("\n\n\n\n\n\n\n\n=====================") + print("All downloads completed for course!") + print("=====================") if __name__ == "__main__": @@ -431,16 +479,34 @@ if __name__ == "__main__": help="Download specific video quality. 
(144, 360, 480, 720, 1080)", metavar="", ) + parser.add_argument( + "-l", + "--lang", + dest="lang", + type=str, + help="The language to download for captions (Default is en)", + metavar="", + ) parser.add_argument( "--download-assets", dest="download_assets", action="store_true", - help="Download lecture assets along with lectures", + help="If specified, lecture assets will be downloaded.", + ) + parser.add_argument( + "--download-captions", + dest="download_captions", + action="store_true", + help="If specified, captions will be downloaded.", ) args = parser.parse_args() if args.download_assets: dl_assets = True + if args.lang: + caption_locale = args.lang + if args.download_captions: + dl_captions = True if args.quality: if not args.quality in valid_qualities: print("Invalid quality specified! %s" % quality) diff --git a/requirements.txt b/requirements.txt index 9093806..23940f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ sanitize_filename tqdm requests python-dotenv -protobuf \ No newline at end of file +protobuf +webvtt-py +pysrt \ No newline at end of file
diff --git a/vtt_to_srt.py b/vtt_to_srt.py
new file mode 100644
index 0000000..b101e35
--- /dev/null
+++ b/vtt_to_srt.py
@@ -0,0 +1,24 @@
+from webvtt import WebVTT
+import html
+from pysrt.srtitem import SubRipItem
+from pysrt.srttime import SubRipTime
+
+
+def convert(directory, filename):
+    """Convert ``<filename>.vtt`` in *directory* to ``<filename>.srt`` beside it.
+
+    *filename* is the caption file name without its extension. Cue text is
+    HTML-unescaped and cues are numbered from 1 in file order.
+    """
+    vtt_filepath = f"%s\\%s.vtt" % (directory, filename)
+    srt_filepath = f"%s\\%s.srt" % (directory, filename)
+
+    # Write UTF-8 explicitly: the platform default encoding (a legacy code page
+    # on many Windows setups) cannot represent every caption language, and the
+    # `with` block closes the file even if parsing or writing fails.
+    with open(srt_filepath, "w", encoding="utf-8") as srt:
+        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
+            start = SubRipTime(0, 0, caption.start_in_seconds)
+            end = SubRipTime(0, 0, caption.end_in_seconds)
+            srt.write(
+                str(SubRipItem(index, start, end, html.unescape(caption.text)))
+                + "\n")