HLS parsing for 1080p+ quality

+ Added a new command argument ``--skip-hls`` to skip parsing HLS playlists
+ Updated README to reflect code changes
2025-04-30 02:14:25 +02:00 · 2021-05-27 17:38:29 -04:00 · 2021-05-27 17:38:29 -04:00 · 5ffef4736e
commit 5ffef4736e
parent f0e06106fc
4 changed files with 361 additions and 29 deletions
--- a/README.md
+++ b/README.md
@ -69,7 +69,7 @@ You can now run `python main.py` to start downloading. The course will download

 ```
 usage: main.py [-h] -c COURSE_URL [-b BEARER_TOKEN] [-q QUALITY] [-l LANG] [--skip-lectures] [--download-assets] [--download-captions]
-               [--keep-vtt]
+               [--keep-vtt] [--skip-hls]

 Udemy Downloader

@ -87,6 +87,8 @@ optional arguments:
  --download-assets     If specified, lecture assets will be downloaded
  --download-captions   If specified, captions will be downloaded
  --keep-vtt            If specified, .vtt files won't be removed
+  --skip-hls            If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm
+                        lectures)
 ```

 - Passing a Bearer Token and Course ID as an argument
@ -112,6 +114,8 @@ optional arguments:
  - `python main.py -c <Course URL> --skip-lectures --download-assets` - Downloads only assets
 - Keep .VTT caption files:
  - `python main.py -c <Course URL> --download-captions --keep-vtt`
+- Skip parsing HLS Streams (HLS streams usually contain 1080p quality for Non-DRM lectures):
+  - `python main.py -c <Course URL> --skip-hls`

 # Credits

--- a/ffmpeg.py
+++ b/ffmpeg.py
@ -0,0 +1,274 @@
+#!/usr/bin/python3
+# pylint: disable=R,C,W,E
+"""
+Author  : Nasir Khan (r0ot h3x49)
+Github  : https://github.com/r0oth3x49
+License : MIT
+Copyright (c) 2018-2025 Nasir Khan (r0ot h3x49)
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the
+Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 
+and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
+ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH 
+THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+import re, time, logging, subprocess, sys
+from colorama import Fore, Style
+
+
+class FFMPeg:
+    """Download a stream with the ``ffmpeg`` executable and draw a progress bar.
+
+    ``ffmpeg`` is started with ``-progress pipe:2`` so that machine readable
+    ``key=value`` progress records are written to its stderr. ``download``
+    reads that stream line by line, extracts the records and renders them
+    through ``_progress`` / ``hls_progress``.
+    """
+
+    # key=value pairs emitted by ffmpeg's -progress output.
+    _PROGRESS_PATTERN = re.compile(
+        r"(frame|fps|total_size|out_time|bitrate|speed|progress)\s*\=\s*(\S+)")
+    # "Duration: HH:MM:SS.cc" line printed by ffmpeg while probing the input.
+    _DURATION_PATTERN = re.compile(
+        r"Duration: (\d{2}):(\d{2}):(\d{2})\.\d{2}")
+
+    def __init__(self,
+                 duration,
+                 url,
+                 token,
+                 filepath,
+                 quiet=False,
+                 callback=lambda *x: None):
+        """Store the download parameters.
+
+        :param duration: fallback total duration in seconds, used until a
+            ``Duration:`` line is seen in ffmpeg's output (may be ``None``).
+        :param url: input URL handed to ffmpeg via ``-i``.
+        :param token: bearer token sent in the ``Authorization`` header.
+        :param filepath: output file path.
+        :param quiet: stored on the instance; not read by this class.
+        :param callback: stored on the instance; not read by this class.
+        """
+        self.url = url
+        self.filepath = filepath
+        self.quiet = quiet
+        self.duration = duration
+        self.callback = callback
+        self.token = token
+
+    def _command(self):
+        """
+        Build the ffmpeg argument list, equivalent to:
+
+        ffmpeg.exe -headers "Authorization: Bearer {token}" -i "" -c copy -bsf:a aac_adtstoasc out.mp4
+
+        ``-y`` overwrites an existing output file and ``-progress pipe:2``
+        sends the progress records to stderr.
+        """
+        command = [
+            "ffmpeg",
+            "-headers",
+            f"Authorization: Bearer {self.token}",
+            "-i",
+            f"{self.url}",
+            "-c",
+            "copy",
+            "-bsf:a",
+            "aac_adtstoasc",
+            f"{self.filepath}",
+            "-y",
+            "-progress",
+            "pipe:2",
+        ]
+        return command
+
+    def _fetch_total_duration(self, line):
+        """Return the duration in seconds announced by ``line``.
+
+        Falls back to ``self.duration`` when ``line`` carries no
+        ``Duration: HH:MM:SS.cc`` field.
+        """
+        mobj = self._DURATION_PATTERN.search(line)
+        if not mobj:
+            return self.duration
+        hours, mins, secs = (int(part) for part in mobj.groups())
+        # Hours are worth 3600 seconds (the previous code used 60).
+        return hours * 3600 + mins * 60 + secs
+
+    def _fetch_current_duration_done(self, time_str):
+        """Convert an ``out_time`` value (``HH:MM:SS.micro``) to whole seconds.
+
+        Returns 0 when the value is missing or not a timestamp (ffmpeg
+        reports ``N/A`` before the first packet has been written).
+        """
+        try:
+            hours, mins, secs = time_str.split(":")
+            return (int(hours) * 3600 + int(mins) * 60 +
+                    int(secs.split(".")[0]))
+        except (AttributeError, ValueError):
+            return 0
+
+    def _prepare_time_str(self, secs):
+        """Format ``secs`` as ``MM:SSs`` or ``HH:MM:SSs``.
+
+        Durations of 100 hours or more are rendered as ``--:--:--``.
+        """
+        (mins, secs) = divmod(secs, 60)
+        (hours, mins) = divmod(mins, 60)
+        # elif chain: previously the "--:--:--" value was always overwritten.
+        if hours > 99:
+            time_str = "--:--:--"
+        elif hours == 0:
+            time_str = "%02d:%02ds" % (mins, secs)
+        else:
+            time_str = "%02d:%02d:%02ds" % (hours, mins, secs)
+        return time_str
+
+    def _progress(self,
+                  iterations,
+                  total,
+                  bytesdone,
+                  speed,
+                  elapsed,
+                  bar_length=30,
+                  fps=None):
+        """Compute the progress figures and hand them to ``hls_progress``.
+
+        :param iterations: seconds of media processed so far.
+        :param total: total media duration in seconds.
+        :param bytesdone: size written so far, as reported by ffmpeg.
+        :param speed: bitrate reported by ffmpeg.
+        :param elapsed: wall-clock seconds since the download started.
+        :param bar_length: width of the bar in characters.
+        :param fps: optional frames-per-second value appended to the rate.
+
+        Nothing is rendered while ``total`` is unknown (``None`` or 0),
+        because neither a percentage nor a bar can be derived from it.
+        """
+        if not total:
+            return
+
+        filled_length = int(round(bar_length * iterations / float(total)))
+        percents = format(100.00 * (iterations * 1.0 / float(total)), ".2f")
+
+        if bytesdone <= 1048576:
+            _receiving = round(float(bytesdone) / 1024.00, 2)
+            _received = format(
+                _receiving if _receiving < 1024.00 else _receiving / 1024.00,
+                ".2f")
+            suffix_recvd = "KB" if _receiving < 1024.00 else "MB"
+        else:
+            _receiving = round(float(bytesdone) / 1048576, 2)
+            _received = format(
+                _receiving if _receiving < 1024.00 else _receiving / 1024.00,
+                ".2f")
+            suffix_recvd = "MB" if _receiving < 1024.00 else "GB"
+
+        suffix_rate = "Kb/s" if speed < 1024.00 else "Mb/s"
+        if fps:
+            suffix_rate += f" {fps}/fps"
+        rate = format(speed if speed < 1024.00 else speed / 1024.00, ".2f")
+
+        # Remaining time extrapolated from the media seconds handled per
+        # wall-clock second. None means "no progress yet, cannot estimate".
+        if not elapsed:
+            remaining = 0
+        elif iterations:
+            remaining = max(
+                (total - iterations) * elapsed / float(iterations), 0)
+        else:
+            remaining = None
+
+        if remaining is None:
+            eta = "--:--:--"
+        else:
+            (mins, secs) = divmod(remaining, 60)
+            (hours, mins) = divmod(mins, 60)
+            if hours > 99:
+                eta = "--:--:--"
+            elif hours == 0:
+                eta = "eta %02d:%02ds" % (mins, secs)
+            else:
+                eta = "eta %02d:%02d:%02ds" % (hours, mins, secs)
+            if secs == 0:
+                # Nothing left to wait for: end the progress line.
+                eta = "\n"
+
+        total_time = self._prepare_time_str(total)
+        done_time = self._prepare_time_str(iterations)
+        # Rendered as "<total>/<done>".
+        downloaded = f"{total_time}/{done_time}"
+
+        received_bytes = str(_received) + str(suffix_recvd)
+        percents = f"{received_bytes} {percents}"
+
+        self.hls_progress(
+            downloaded=downloaded,
+            percents=percents,
+            filled_length=filled_length,
+            rate=str(rate) + str(suffix_rate),
+            suffix=eta,
+            bar_length=bar_length,
+        )
+
+    def hls_progress(self,
+                     downloaded,
+                     percents,
+                     filled_length,
+                     rate,
+                     suffix,
+                     bar_length=30):
+        """Write one coloured progress line to stdout.
+
+        The line is prefixed with ANSI sequences that erase the current
+        terminal line and move the cursor to column 1, so successive calls
+        overwrite each other.
+        """
+        bar = (Fore.CYAN + Style.DIM + "#" * filled_length + Fore.WHITE +
+               Style.DIM + "-" * (bar_length - filled_length))
+        sys.stdout.write(
+            "\033[2K\033[1G\r\r{}{}[{}{}*{}{}] : {}{}{} {}% |{}{}{}| {} {}".
+            format(
+                Fore.CYAN,
+                Style.DIM,
+                Fore.MAGENTA,
+                Style.BRIGHT,
+                Fore.CYAN,
+                Style.DIM,
+                Fore.GREEN,
+                Style.BRIGHT,
+                downloaded,
+                percents,
+                bar,
+                Fore.GREEN,
+                Style.BRIGHT,
+                rate,
+                suffix,
+            ))
+        sys.stdout.flush()
+
+    def _parse_progress(self, line):
+        """Return the ``key=value`` progress fields found in ``line`` as a dict.
+
+        When a key occurs more than once the last value wins.
+        """
+        return dict(self._PROGRESS_PATTERN.findall(line))
+
+    @staticmethod
+    def _to_float(value, unit):
+        """Parse a progress value, stripping ``unit``; 0 for missing/``N/A``."""
+        if value is None:
+            return 0
+        try:
+            return float(value.lower().replace(unit, ""))
+        except ValueError:
+            return 0
+
+    def _render_progress(self, *args, **kwargs):
+        """Call ``_progress`` without letting a display error abort the download."""
+        try:
+            self._progress(*args, **kwargs)
+        except Exception as err:
+            # The bar is cosmetic; keep downloading but leave a trace.
+            logging.debug("Could not render ffmpeg progress: %s", err)
+
+    def download(self):
+        """Run ffmpeg and follow its progress until it finishes.
+
+        :return: ``{"status": "True", "msg": "download"}`` once ffmpeg
+            reports ``progress=end``; ``{"status": "False", "msg": ...}``
+            when ffmpeg's stderr closes before that. ``status`` is a string
+            and the returned dict is never empty, so callers have to compare
+            ``status`` instead of testing the dict for truthiness.
+        :raises KeyboardInterrupt: propagated unchanged when the user
+            interrupts the download.
+        """
+        total_time = None
+        started_at = time.time()
+        pending_lines = []
+        bytes_done = 0
+        download_speed = 0
+        # stdout is never read here, so it must not be a pipe: a pipe that
+        # fills up would block the child process.
+        with subprocess.Popen(self._command(),
+                              stdout=subprocess.DEVNULL,
+                              stderr=subprocess.PIPE) as proc:
+            while True:
+                raw_line = proc.stderr.readline()
+                if not raw_line:
+                    # EOF: ffmpeg went away without announcing progress=end.
+                    # Without this check the loop would spin forever.
+                    last_line = next(
+                        (text for text in reversed(pending_lines) if text),
+                        "")
+                    detail = f": {last_line}" if last_line else ""
+                    return {
+                        "status": "False",
+                        "msg": "Error: ffmpeg exited with code "
+                               f"{proc.wait()}{detail}",
+                    }
+
+                elapsed = time.time() - started_at
+                # ffmpeg may echo metadata in arbitrary encodings.
+                line = raw_line.decode("utf-8", errors="replace").strip()
+                if not total_time:
+                    total_time = self._fetch_total_duration(line)
+
+                if "progress=end" in line:
+                    self._render_progress(
+                        total_time,
+                        total_time,
+                        bytes_done,
+                        download_speed,
+                        elapsed,
+                    )
+                    return {"status": "True", "msg": "download"}
+
+                if "progress" not in line:
+                    # Collect lines until the record is closed by a
+                    # "progress=..." line.
+                    pending_lines.append(line)
+                    continue
+
+                items = self._parse_progress("\n".join(pending_lines))
+                pending_lines = []
+                if not items:
+                    continue
+
+                secs = self._fetch_current_duration_done(
+                    items.get("out_time"))
+                bytes_done = self._to_float(items.get("total_size"), "kb")
+                download_speed = self._to_float(items.get("bitrate"),
+                                                "kbits/s")
+                self._render_progress(
+                    secs,
+                    total_time,
+                    bytes_done,
+                    download_speed,
+                    elapsed,
+                    fps=items.get("fps"),
+                )
--- a/main.py
+++ b/main.py
@ -1,14 +1,13 @@
-import os, requests, json, glob, argparse, sys, re, time, asyncio, json
-# from sanitize_filename import sanitize
+import os, requests, json, glob, argparse, sys, re, time, asyncio, json, cloudscraper, m3u8
 from tqdm import tqdm
 from dotenv import load_dotenv
 from mpegdash.parser import MPEGDASHParser
 from utils import extract_kid
 from vtt_to_srt import convert
-import cloudscraper
 from requests.exceptions import ConnectionError as conn_error
 from html.parser import HTMLParser as compat_HTMLParser
 from sanitize import sanitize, slugify, SLUG_OK
+from ffmpeg import FFMPeg

 home_dir = os.getcwd()
 download_dir = os.path.join(os.getcwd(), "out_dir")
@ -167,7 +166,7 @@ class Udemy:
            })
        return _temp

-    def _extract_sources(self, sources):
+    def _extract_sources(self, sources, skip_hls):
        _temp = []
        if sources and isinstance(sources, list):
            for source in sources:
@ -195,17 +194,21 @@ class Udemy:
                else:
                    width = "256"
                if (source.get("type") == "application/x-mpegURL"
-                        or "m3u8" in download_url or height == "Audio"):
-                    continue
-
-                _type = source.get("type")
-                _temp.append({
-                    "type": "video",
-                    "height": height,
-                    "width": width,
-                    "extension": _type.replace("video/", ""),
-                    "download_url": download_url,
-                })
+                        or "m3u8" in download_url):
+                    if not skip_hls:
+                        out = self._extract_m3u8(download_url)
+                        print(out)
+                        if out:
+                            _temp.extend(out)
+                else:
+                    _type = source.get("type")
+                    _temp.append({
+                        "type": "video",
+                        "height": height,
+                        "width": width,
+                        "extension": _type.replace("video/", ""),
+                        "download_url": download_url,
+                    })
        return _temp

    def _extract_media_sources(self, sources):
@ -247,6 +250,38 @@ class Udemy:
                })
        return _temp

+    def _extract_m3u8(self, url):
+        """Collect one HLS variant per distinct height from a master playlist.
+
+        The playlist at ``url`` is fetched through ``self.session`` and parsed
+        with ``m3u8``. Variants lacking a resolution or a codec list are
+        ignored, and only the first variant seen for a given height is kept.
+        Any error is printed and whatever was collected so far is returned.
+        """
+        streams = []
+        try:
+            response = self.session._get(url)
+            response.raise_for_status()
+            master = m3u8.loads(response.text)
+            known_heights = set()
+            for variant in master.playlists:
+                info = variant.stream_info
+                resolution = info.resolution
+                codecs = info.codecs
+                if not (resolution and codecs):
+                    continue
+                width, height = resolution
+                if height in known_heights:
+                    continue
+                known_heights.add(height)
+                streams.append({
+                    "type": "hls",
+                    "height": height,
+                    "width": width,
+                    "extension": "mp4",
+                    "download_url": variant.uri,
+                })
+        except Exception as error:
+            print(f"Udemy Says : '{error}' while fetching hls streams..")
+        return streams
+
+
    def _extract_mpd(self, url):
        """extract mpd streams"""
        _video = []
@ -922,13 +957,8 @@ def process_caption(caption, lecture_title, lecture_dir, keep_vtt, tries=0):
                print(f"> Error converting caption: {e}")


-def process_lecture(
-    lecture,
-    lecture_index,
-    lecture_path,
-    lecture_dir,
-    quality,
-):
+def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
+                    access_token):
    # TODO: Make this more efficent, some lectures are html articles not videos so we should check if the extension is html
    index = lecture.get("index")
    lecture_index = lecture.get("lecture_index")
@ -984,7 +1014,17 @@ def process_lecture(
                        key=lambda x: abs(int(x.get("height")) - quality))
                try:
                    url = source.get("download_url")
-                    download(url, lecture_path, lecture_title)
+                    source_type = source.get("type")
+                    if source_type == "hls":
+                        temp_filepath = lecture_path.replace(".mp4", "")
+                        temp_filepath = temp_filepath + ".hls-part.mp4"
+                        retVal = FFMPeg(None, url, access_token,
+                                        temp_filepath).download()
+                        if retVal:
+                            os.rename(temp_filepath, lecture_path)
+                            print("> HLS Download success")
+                    else:
+                        download(url, lecture_path, lecture_title)
                except Exception as e:
                    print(f"> Error downloading lecture: ", e)
            else:
@ -995,7 +1035,7 @@ def process_lecture(


 def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
-              caption_locale, keep_vtt):
+              caption_locale, keep_vtt, access_token):
    total_chapters = _udemy.get("total_chapters")
    total_lectures = _udemy.get("total_lectures")
    print(f"Chapter(s) ({total_chapters})")
@ -1040,7 +1080,7 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
                    lecture_path = os.path.join(
                        chapter_dir, "{}.mp4".format(sanitize(lecture_title)))
                    process_lecture(lecture, lecture_index, lecture_path,
-                                    chapter_dir, quality)
+                                    chapter_dir, quality, access_token)

            if dl_assets:
                assets = lecture.get("assets")
@ -1148,6 +1188,13 @@ if __name__ == "__main__":
        action="store_true",
        help="If specified, .vtt files won't be removed",
    )
+    parser.add_argument(
+        "--skip-hls",
+        dest="skip_hls",
+        action="store_true",
+        help=
+        "If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm lectures)",
+    )

    parser.add_argument(
        "--save-to-file",
@ -1171,6 +1218,7 @@ if __name__ == "__main__":
    portal_name = None
    course_name = None
    keep_vtt = False
+    skip_hls = False

    args = parser.parse_args()
    if args.download_assets:
@ -1185,6 +1233,8 @@ if __name__ == "__main__":
        quality = args.quality
    if args.keep_vtt:
        keep_vtt = args.keep_vtt
+    if args.skip_hls:
+        skip_hls = args.skip_hls

    if args.load_from_file:
        print(
@ -1237,7 +1287,7 @@ if __name__ == "__main__":
        _udemy = json.loads(
            open(os.path.join(os.getcwd(), "saved", "_udemy.json")).read())
        parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
-                  caption_locale, keep_vtt)
+                  caption_locale, keep_vtt, access_token)
    else:
        _udemy = {}
        _udemy["access_token"] = access_token
@ -1328,7 +1378,8 @@ if __name__ == "__main__":
                                sources = data.get("Video")
                                tracks = asset.get("captions")
                                #duration = asset.get("time_estimation")
-                                sources = udemy._extract_sources(sources)
+                                sources = udemy._extract_sources(
+                                    sources, skip_hls)
                                subtitles = udemy._extract_subtitles(tracks)
                                sources_count = len(sources)
                                subtitle_count = len(subtitles)
@ -1463,6 +1514,7 @@ if __name__ == "__main__":
                      'w') as f:
                f.write(json.dumps(_udemy))
                f.close()
+            print("Saved parsed data to json")

        parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
-                  caption_locale, keep_vtt)
+                  caption_locale, keep_vtt, access_token)
--- a/requirements.txt
+++ b/requirements.txt
@ -6,3 +6,5 @@ python-dotenv
 protobuf
 webvtt-py
 pysrt
+m3u8
+colorama