Subtitle Support

+ Added support for downloading subtitles (see readme for usage)
2025-05-28 05:20:14 +02:00 · 2021-05-19 15:59:49 -04:00 · 2021-05-19 15:59:49 -04:00 · 6c5b7870a9
commit 6c5b7870a9
parent a867f82f2b
4 changed files with 113 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download
 # Advanced Usage

 ```
-usage: main.py [-h] [-d] [-q] [--download-assets]
+usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions]

 Udemy Downloader

 optional arguments:
-  -h, --help         show this help message and exit
-  -d, --debug        Use test_data.json rather than fetch from the udemy api.
-  -q , --quality     Download specific video quality. (144, 360, 480, 720, 1080)
-  --download-assets  Download lecture assets along with lectures
+  -h, --help           show this help message and exit
+  -d, --debug          Use test_data.json rather than fetch from the udemy api.
+  -q , --quality       Download specific video quality. (144, 360, 480, 720, 1080)
+  -l , --lang          The language to download for captions (Default is en)
+  --download-assets    If specified, lecture assets will be downloaded.
+  --download-captions  If specified, captions will be downloaded.
 ```

 - Download a specific quality
@ -88,6 +90,14 @@ optional arguments:
  - `python main.py --download-assets`
 - Download assets and specify a quality
  - `python main.py -q 360 --download-assets`
+- Download captions
+  - `python main.py --download-captions`
+- Download captions with specific language
+  - `python main.py --download-captions -l en` - English subtitles
+  - `python main.py --download-captions -l es` - Spanish subtitles
+  - `python main.py --download-captions -l it` - Italian subtitles
+  - `python main.py --download-captions -l pl` - Polish subtitles
+  - etc

 # Getting an error about "Accepting the latest terms of service"?

--- a/main.py
+++ b/main.py
@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes,
                            parse_node_value, write_attr_value,
                            write_child_node, write_node_value)
 from utils import extract_kid
+from vtt_to_srt import convert

 load_dotenv()

@ -24,6 +25,8 @@ retry = 3
 home_dir = os.getcwd()
 keyfile_path = "%s\keyfile.json" % os.getcwd()
 dl_assets = False
+dl_captions = False
+caption_locale = "en"
 quality = None  # None will download the best possible
 valid_qualities = [144, 360, 480, 720, 1080]

@ -315,7 +318,7 @@ def download(url, path, filename):
    return file_size


-def process_lecture(lecture, lecture_path, lecture_dir):
+def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]
    if lecture_asset["media_license_token"] == None:
@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir):
        print("> Found %s assets for lecture '%s'" %
              (len(assets), lecture_title))

+    # process captions
+    if dl_captions:
+        captions = []
+        for caption in lecture_asset.get("captions"):
+            if not isinstance(caption, dict):
+                continue
+            if caption.get("_class") != "caption":
+                continue
+            download_url = caption.get("url")
+            if not download_url or not isinstance(download_url, str):
+                continue
+            lang = (caption.get("language") or caption.get("srclang")
+                    or caption.get("label")
+                    or caption.get("locale_id").split("_")[0])
+            ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
+            if caption_locale == "all" or caption_locale == lang:
+                captions.append({
+                    "language": lang,
+                    "locale_id": caption.get("locale_id"),
+                    "ext": ext,
+                    "url": download_url
+                })
+
+        for caption in captions:
+            filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(
+                lecture_title), caption.get("locale_id"), caption.get("ext"))
+            filename_no_ext = f"%s. %s_%s" % (lecture_index,
+                                              sanitize(lecture_title),
+                                              caption.get("locale_id"))
+            filepath = f"%s\\%s" % (lecture_dir, filename)
+
+            if os.path.isfile(filepath):
+                print("> Captions '%s' already downloaded." % filename)
+            else:
+                print(f"> Downloading captions: '%s'" % filename)
+                download(caption.get("url"), filepath, filename)
+                if caption.get("ext") == "vtt":
+                    print("> Converting captions to SRT format...")
+                    convert(lecture_dir, filename_no_ext)
+                    print("> Caption conversion complete.")
+                    os.remove(filepath)
+

 def parse(data):
    chapters = []
@ -396,10 +441,10 @@ def parse(data):
            except IndexError:
                # This is caused by there not being a starting chapter
                lectures.append(obj)
-                lecture_path = f"%s\\%s. %s.mp4" % (download_dir,
-                                                    lectures.index(obj) + 1,
-                                                    sanitize(obj["title"]))
-                process_lecture(obj, lecture_path, download_dir)
+                lecture_index = lectures.index(obj) + 1
+                lecture_path = f"%s\\%s. %s.mp4" % (
+                    download_dir, lecture_index, sanitize(obj["title"]))
+                process_lecture(obj, lecture_index, lecture_path, download_dir)

    for chapter in chapters:
        chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
@ -408,10 +453,13 @@ def parse(data):
            os.mkdir(chapter_dir)

        for lecture in chapter["lectures"]:
-            lecture_path = f"%s\\%s. %s.mp4" % (
-                chapter_dir, chapter["lectures"].index(lecture) + 1,
-                sanitize(lecture["title"]))
-            process_lecture(lecture, lecture_path, chapter_dir)
+            lecture_index = chapter["lectures"].index(lecture) + 1
+            lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
+                                                sanitize(lecture["title"]))
+            process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
+    print("\n\n\n\n\n\n\n\n=====================")
+    print("All downloads completed for course!")
+    print("=====================")


 if __name__ == "__main__":
@ -431,16 +479,34 @@ if __name__ == "__main__":
        help="Download specific video quality. (144, 360, 480, 720, 1080)",
        metavar="",
    )
+    parser.add_argument(
+        "-l",
+        "--lang",
+        dest="lang",
+        type=str,
+        help="The language to download for captions (Default is en)",
+        metavar="",
+    )
    parser.add_argument(
        "--download-assets",
        dest="download_assets",
        action="store_true",
-        help="Download lecture assets along with lectures",
+        help="If specified, lecture assets will be downloaded.",
+    )
+    parser.add_argument(
+        "--download-captions",
+        dest="download_captions",
+        action="store_true",
+        help="If specified, captions will be downloaded.",
    )

    args = parser.parse_args()
    if args.download_assets:
        dl_assets = True
+    if args.lang:
+        caption_locale = args.lang
+    if args.download_captions:
+        dl_captions = True
    if args.quality:
        if not args.quality in valid_qualities:
            print("Invalid quality specified! %s" % quality)
--- a/requirements.txt
+++ b/requirements.txt
@ -3,4 +3,6 @@ sanitize_filename
 tqdm
 requests
 python-dotenv
-protobuf
+protobuf
+webvtt-py
+pysrt
--- a/vtt_to_srt.py
+++ b/vtt_to_srt.py
@ -0,0 +1,19 @@
+from webvtt import WebVTT
+import html
+from pysrt.srtitem import SubRipItem
+from pysrt.srttime import SubRipTime
+
+
+def convert(directory, filename):
+    """Convert a WebVTT caption file to SubRip (SRT) format.
+
+    Reads ``<filename>.vtt`` from ``directory`` and writes the cues,
+    numbered from 1, to ``<filename>.srt`` in the same directory. HTML
+    entities in the cue text are unescaped before being written. The
+    source ``.vtt`` file is left in place.
+
+    Args:
+        directory: Directory that holds the ``.vtt`` file and receives
+            the ``.srt`` file.
+        filename: Caption file name without its extension.
+
+    Returns:
+        None.
+    """
+    vtt_filepath = "%s\\%s.vtt" % (directory, filename)
+    srt_filepath = "%s\\%s.srt" % (directory, filename)
+
+    # The context manager guarantees the output is flushed and closed even
+    # if parsing fails part-way; an explicit UTF-8 encoding keeps non-Latin
+    # subtitles from failing on platforms whose default codec is narrower.
+    with open(srt_filepath, "w", encoding="utf-8") as srt:
+        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
+            # Cue times are passed in the seconds slot of SubRipTime.
+            start = SubRipTime(0, 0, caption.start_in_seconds)
+            end = SubRipTime(0, 0, caption.end_in_seconds)
+            item = SubRipItem(index, start, end, html.unescape(caption.text))
+            srt.write(str(item) + "\n")