Subtitle Support

+ Added support for downloading subtitles (see readme for usage)
2025-05-29 10:50:12 +02:00 · 2021-05-19 15:59:49 -04:00 · 2021-05-19 15:59:49 -04:00 · 6c5b7870a9
commit 6c5b7870a9
parent a867f82f2b
4 changed files with 113 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download
 # Advanced Usage
 ```
-usage: main.py [-h] [-d] [-q] [--download-assets]
+usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions]
 Udemy Downloader
 optional arguments:
-  -h, --help         show this help message and exit
+  -h, --help           show this help message and exit
-  -d, --debug        Use test_data.json rather than fetch from the udemy api.
+  -d, --debug          Use test_data.json rather than fetch from the udemy api.
-  -q , --quality     Download specific video quality. (144, 360, 480, 720, 1080)
+  -q , --quality       Download specific video quality. (144, 360, 480, 720, 1080)
-  --download-assets  Download lecture assets along with lectures
+  -l , --lang          The language to download for captions (Default is en)
  --download-assets    If specified, lecture assets will be downloaded.
  --download-captions  If specified, captions will be downloaded.
 ```
 - Download a specific quality
@ -88,6 +90,14 @@ optional arguments:
  - `python main.py --download-assets`
 - Download assets and specify a quality
  - `python main.py -q 360 --download-assets`
 - Download captions
  - `python main.py --download-captions`
 - Download captions with specific language
  - `python main.py --download-captions -l en` - English subtitles
  - `python main.py --download-captions -l es` - Spanish subtitles
  - `python main.py --download-captions -l it` - Italian subtitles
  - `python main.py --download-captions -l pl` - Polish subtitles
  - etc
 # Getting an error about "Accepting the latest terms of service"?
--- a/main.py
+++ b/main.py
@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes,
                            parse_node_value, write_attr_value,
                            write_child_node, write_node_value)
 from utils import extract_kid
 from vtt_to_srt import convert
 load_dotenv()
@ -24,6 +25,8 @@ retry = 3
 home_dir = os.getcwd()
 keyfile_path = "%s\keyfile.json" % os.getcwd()
 dl_assets = False
 dl_captions = False
 caption_locale = "en"
 quality = None  # None will download the best possible
 valid_qualities = [144, 360, 480, 720, 1080]
@ -315,7 +318,7 @@ def download(url, path, filename):
    return file_size
-def process_lecture(lecture, lecture_path, lecture_dir):
+def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]
    if lecture_asset["media_license_token"] == None:
@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir):
        print("> Found %s assets for lecture '%s'" %
              (len(assets), lecture_title))
    # process captions
    if dl_captions:
        captions = []
        for caption in lecture_asset.get("captions"):
            if not isinstance(caption, dict):
                continue
            if caption.get("_class") != "caption":
                continue
            download_url = caption.get("url")
            if not download_url or not isinstance(download_url, str):
                continue
            lang = (caption.get("language") or caption.get("srclang")
                    or caption.get("label")
                    or caption.get("locale_id").split("_")[0])
            ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
            if caption_locale == "all" or caption_locale == lang:
                captions.append({
                    "language": lang,
                    "locale_id": caption.get("locale_id"),
                    "ext": ext,
                    "url": download_url
                })
        for caption in captions:
            filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(
                lecture_title), caption.get("locale_id"), caption.get("ext"))
            filename_no_ext = f"%s. %s_%s" % (lecture_index,
                                              sanitize(lecture_title),
                                              caption.get("locale_id"))
            filepath = f"%s\\%s" % (lecture_dir, filename)
            if os.path.isfile(filepath):
                print("> Captions '%s' already downloaded." % filename)
            else:
                print(f"> Downloading captions: '%s'" % filename)
                download(caption.get("url"), filepath, filename)
                if caption.get("ext") == "vtt":
                    print("> Converting captions to SRT format...")
                    convert(lecture_dir, filename_no_ext)
                    print("> Caption conversion complete.")
                    os.remove(filepath)
 def parse(data):
    chapters = []
@ -396,10 +441,10 @@ def parse(data):
            except IndexError:
                # This is caused by there not being a starting chapter
                lectures.append(obj)
-                lecture_path = f"%s\\%s. %s.mp4" % (download_dir,
+                lecture_index = lectures.index(obj) + 1
-                                                    lectures.index(obj) + 1,
+                lecture_path = f"%s\\%s. %s.mp4" % (
-                                                    sanitize(obj["title"]))
+                    download_dir, lecture_index, sanitize(obj["title"]))
-                process_lecture(obj, lecture_path, download_dir)
+                process_lecture(obj, lecture_index, lecture_path, download_dir)
    for chapter in chapters:
        chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
@ -408,10 +453,13 @@ def parse(data):
            os.mkdir(chapter_dir)
        for lecture in chapter["lectures"]:
-            lecture_path = f"%s\\%s. %s.mp4" % (
+            lecture_index = chapter["lectures"].index(lecture) + 1
-                chapter_dir, chapter["lectures"].index(lecture) + 1,
+            lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
-                sanitize(lecture["title"]))
+                                                sanitize(lecture["title"]))
-            process_lecture(lecture, lecture_path, chapter_dir)
+            process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
    print("\n\n\n\n\n\n\n\n=====================")
    print("All downloads completed for course!")
    print("=====================")
 if __name__ == "__main__":
@ -431,16 +479,34 @@ if __name__ == "__main__":
        help="Download specific video quality. (144, 360, 480, 720, 1080)",
        metavar="",
    )
    parser.add_argument(
        "-l",
        "--lang",
        dest="lang",
        type=str,
        help="The language to download for captions (Default is en)",
        metavar="",
    )
    parser.add_argument(
        "--download-assets",
        dest="download_assets",
        action="store_true",
-        help="Download lecture assets along with lectures",
+        help="If specified, lecture assets will be downloaded.",
    )
    parser.add_argument(
        "--download-captions",
        dest="download_captions",
        action="store_true",
        help="If specified, captions will be downloaded.",
    )
    args = parser.parse_args()
    if args.download_assets:
        dl_assets = True
    if args.lang:
        caption_locale = args.lang
    if args.download_captions:
        dl_captions = True
    if args.quality:
        if not args.quality in valid_qualities:
            print("Invalid quality specified! %s" % quality)
--- a/requirements.txt
+++ b/requirements.txt
@ -3,4 +3,6 @@ sanitize_filename
 tqdm
 requests
 python-dotenv
-protobuf
+protobuf
 webvtt-py
 pysrt
--- a/vtt_to_srt.py
+++ b/vtt_to_srt.py
@ -0,0 +1,19 @@
 from webvtt import WebVTT
 import html
 from pysrt.srtitem import SubRipItem
 from pysrt.srttime import SubRipTime
 def convert(directory, filename):
    """Convert a WebVTT caption file to SubRip (SRT) format.

    Reads ``<filename>.vtt`` from ``directory`` and writes ``<filename>.srt``
    into the same directory. Cues are numbered from 1 in file order, and
    HTML entities in each cue's text are unescaped before writing.

    Args:
        directory: Directory that holds the ``.vtt`` file; the ``.srt`` file
            is written there as well.
        filename: Base name of the caption file, without its extension.
    """
    # Paths are joined with a literal backslash on purpose: main.py builds
    # the downloaded caption path the same way, so both sides must agree.
    vtt_filepath = "%s\\%s.vtt" % (directory, filename)
    srt_filepath = "%s\\%s.srt" % (directory, filename)
    # The context manager closes (and therefore flushes) the output even if
    # parsing or writing fails. UTF-8 is requested explicitly so captions
    # with non-ASCII characters do not fail to encode under a narrower
    # platform default encoding.
    with open(srt_filepath, "w", encoding="utf-8") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
            # Each timestamp is passed as a single seconds value; hours and
            # minutes are left at zero.
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            text = html.unescape(caption.text)
            srt.write(str(SubRipItem(index, start, end, text)) + "\n")