class FFMPeg:
    """Download a remote stream to a local file by driving the ``ffmpeg`` CLI.

    ffmpeg is started with ``-progress pipe:2`` so its ``key=value`` progress
    report arrives on stderr; :meth:`download` parses that report and renders
    a progress bar on stdout.

    Args:
        duration: Fallback total duration in seconds, used until a
            ``Duration:`` line is seen in ffmpeg's output. May be ``None``.
        url: Input URL passed to ``ffmpeg -i``.
        token: Bearer token sent in the ``Authorization`` request header.
        filepath: Output path passed to ffmpeg.
        quiet: Stored on the instance; not consulted by this class.
        callback: Stored on the instance; not consulted by this class.
    """

    _PROGRESS_PATTERN = re.compile(
        r"(frame|fps|total_size|out_time|bitrate|speed|progress)\s*\=\s*(\S+)")
    # Compiled once here instead of on every stderr line.
    _DURATION_PATTERN = re.compile(
        r"Duration: (\d{2}):(\d{2}):(\d{2})\.\d{2}")
    _CLOCK_PATTERN = re.compile(r"(\d+):(\d{1,2}):(\d{1,2})")

    def __init__(self,
                 duration,
                 url,
                 token,
                 filepath,
                 quiet=False,
                 callback=lambda *x: None):
        self.url = url
        self.filepath = filepath
        self.quiet = quiet
        self.duration = duration
        self.callback = callback
        self.token = token

    def _command(self):
        """Build the ffmpeg argument list.

        Equivalent command line::

            ffmpeg -headers "Authorization: Bearer {token}" -i {url}
                   -c copy -bsf:a aac_adtstoasc {filepath}
                   -y -progress pipe:2
        """
        return [
            "ffmpeg",
            "-headers",
            f"Authorization: Bearer {self.token}",
            "-i",
            f"{self.url}",
            "-c",
            "copy",
            "-bsf:a",
            "aac_adtstoasc",
            f"{self.filepath}",
            "-y",
            "-progress",
            "pipe:2",
        ]

    def _fetch_total_duration(self, line):
        """Return the total duration in seconds announced in ``line``.

        Looks for ffmpeg's ``Duration: HH:MM:SS.cc`` banner. When the line
        does not contain one, the ``duration`` given to the constructor is
        returned instead (which may be ``None``).
        """
        match = self._DURATION_PATTERN.search(line)
        if not match:
            return self.duration
        hours, minutes, seconds = (int(part) for part in match.groups())
        # Hours are worth 3600 seconds; the previous code used 60.
        return hours * 3600 + minutes * 60 + seconds

    def _fetch_current_duration_done(self, time_str):
        """Convert an ``out_time`` value (``HH:MM:SS.ffffff``) to seconds.

        Returns 0 when the value is missing or not a clock string
        (for example ``N/A``), so a single odd report cannot abort a download.
        """
        match = self._CLOCK_PATTERN.match(time_str or "")
        if not match:
            return 0
        hours, minutes, seconds = (int(part) for part in match.groups())
        return hours * 3600 + minutes * 60 + seconds

    def _prepare_time_str(self, secs):
        """Format a number of seconds as ``MM:SSs`` or ``HH:MM:SSs``.

        Values of 100 hours or more are shown as ``--:--:--``; ``None`` is
        treated as zero.
        """
        (mins, secs) = divmod(secs or 0, 60)
        (hours, mins) = divmod(mins, 60)
        if hours > 99:
            return "--:--:--"
        if hours == 0:
            return "%02d:%02ds" % (mins, secs)
        return "%02d:%02d:%02ds" % (hours, mins, secs)

    @staticmethod
    def _format_size(num_bytes):
        """Render a byte count as ``KB``, ``MB`` or ``GB`` with two decimals."""
        value = float(num_bytes or 0) / 1024.00
        for unit in ("KB", "MB"):
            if round(value, 2) < 1024.00:
                return format(value, ".2f") + unit
            value /= 1024.00
        return format(value, ".2f") + "GB"

    @staticmethod
    def _format_eta(iterations, total, elapsed):
        """Estimate the remaining time from the average speed so far.

        Returns ``--:--:--`` when no estimate is possible (nothing processed
        yet, unknown total, no elapsed time) or when it exceeds 99 hours.
        """
        if not (elapsed and iterations > 0 and total > iterations):
            return "--:--:--"
        remaining = (total - iterations) * elapsed / float(iterations)
        (mins, secs) = divmod(remaining, 60)
        (hours, mins) = divmod(mins, 60)
        if hours > 99:
            return "--:--:--"
        if hours == 0:
            return "eta %02d:%02ds" % (mins, secs)
        return "eta %02d:%02d:%02ds" % (hours, mins, secs)

    @staticmethod
    def _to_number(value, unit):
        """Parse a progress value such as ``1234.5kbits/s``; 0.0 if unusable."""
        text = (value or "").lower().replace(unit, "")
        try:
            return float(text)
        except ValueError:
            return 0.0

    def _progress(self,
                  iterations,
                  total,
                  bytesdone,
                  speed,
                  elapsed,
                  bar_length=30,
                  fps=None,
                  final=False):
        """Compute the progress figures and hand them to :meth:`hls_progress`.

        Args:
            iterations: Seconds of media processed so far.
            total: Total seconds of media; ``None``/0 when unknown.
            bytesdone: Value of ffmpeg's ``total_size`` field.
            speed: Value of ffmpeg's ``bitrate`` field (``kbits/s`` stripped).
            elapsed: Wall-clock seconds since the download started.
            bar_length: Width of the bar in characters.
            fps: Optional frames-per-second value appended to the rate.
            final: When true, draw a full bar and end the line with a newline.
        """
        iterations = iterations or 0
        total = total or 0
        if final:
            fraction = 1.0
        elif total > 0:
            # Clamp so an out_time slightly past the announced duration
            # cannot overflow the bar.
            fraction = min(max(iterations / float(total), 0.0), 1.0)
        else:
            fraction = 0.0
        filled_length = int(round(bar_length * fraction))
        percents = format(100.00 * fraction, ".2f")

        in_mbits = speed >= 1024.00
        rate = format(speed / 1024.00 if in_mbits else speed, ".2f")
        suffix_rate = "Mb/s" if in_mbits else "Kb/s"
        if fps:
            suffix_rate += f" {fps}/fps"

        if final or (total > 0 and iterations >= total):
            # A newline terminates the in-place progress line.
            suffix = "\n"
        else:
            suffix = self._format_eta(iterations, total, elapsed)

        total_time = self._prepare_time_str(total)
        done_time = self._prepare_time_str(iterations)
        # Layout kept from the original display: total first, then done.
        downloaded = f"{total_time}/{done_time}"
        percents = f"{self._format_size(bytesdone)} {percents}"

        self.hls_progress(
            downloaded=downloaded,
            percents=percents,
            filled_length=filled_length,
            rate=rate + suffix_rate,
            suffix=suffix,
            bar_length=bar_length,
        )

    def hls_progress(self,
                     downloaded,
                     percents,
                     filled_length,
                     rate,
                     suffix,
                     bar_length=30):
        """Redraw the progress line on stdout.

        The leading escape sequence clears the current terminal line and
        moves the cursor to column 1, so each call overwrites the last one.
        """
        bar = (Fore.CYAN + Style.DIM + "#" * filled_length + Fore.WHITE +
               Style.DIM + "-" * (bar_length - filled_length))
        sys.stdout.write(
            "\033[2K\033[1G\r\r{}{}[{}{}*{}{}] : {}{}{} {}% |{}{}{}| {} {}".
            format(
                Fore.CYAN,
                Style.DIM,
                Fore.MAGENTA,
                Style.BRIGHT,
                Fore.CYAN,
                Style.DIM,
                Fore.GREEN,
                Style.BRIGHT,
                downloaded,
                percents,
                bar,
                Fore.GREEN,
                Style.BRIGHT,
                rate,
                suffix,
            ))
        sys.stdout.flush()

    def _parse_progress(self, line):
        """Return the ``key=value`` progress fields found in ``line`` as a dict.

        When a key occurs more than once, the last occurrence wins.
        """
        return dict(self._PROGRESS_PATTERN.findall(line))

    def _report(self, *args, **kwargs):
        """Draw progress on a best-effort basis.

        A rendering problem must not abort the download, so ordinary
        exceptions are logged at debug level. ``KeyboardInterrupt`` is not an
        ``Exception`` subclass and still propagates.
        """
        try:
            self._progress(*args, **kwargs)
        except Exception as err:
            logging.getLogger(__name__).debug(
                "progress display failed: %s", err)

    def download(self):
        """Run ffmpeg and show progress until it reports ``progress=end``.

        Returns:
            ``{"status": "True", "msg": "download"}`` once ffmpeg reports
            ``progress=end``.

        Raises:
            RuntimeError: ffmpeg's output ended without ``progress=end``
                (for example the input could not be opened). The message
                carries the exit code and the last lines ffmpeg printed.
            KeyboardInterrupt: Propagated unchanged when the user interrupts.
        """
        total_time = None
        secs_done = 0
        bytes_done = 0
        download_speed = 0
        pending = []  # lines collected since the last "progress=" marker
        recent = []  # last few non-empty lines, for the error message
        started = time.time()
        # stdout is discarded: nothing reads it, and an unread pipe could
        # eventually block the child process.
        with subprocess.Popen(self._command(),
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE) as proc:
            # readline() returns b"" at EOF; stopping there prevents the
            # endless loop that occurred when ffmpeg exited early.
            for raw in iter(proc.stderr.readline, b""):
                line = raw.decode("utf-8", errors="replace").strip()
                elapsed = time.time() - started
                if not total_time:
                    total_time = self._fetch_total_duration(line)
                if "progress=end" in line:
                    finished_at = total_time or secs_done
                    self._report(finished_at,
                                 finished_at,
                                 bytes_done,
                                 download_speed,
                                 elapsed,
                                 final=True)
                    return {"status": "True", "msg": "download"}
                if "progress" not in line:
                    pending.append(line)
                    if line:
                        recent.append(line)
                        del recent[:-5]
                    continue
                # A "progress=..." line closes one report block.
                items = self._parse_progress("\n".join(pending))
                pending = []
                if not items:
                    continue
                secs_done = self._fetch_current_duration_done(
                    items.get("out_time"))
                bytes_done = self._to_number(items.get("total_size"), "kb")
                download_speed = self._to_number(items.get("bitrate"),
                                                 "kbits/s")
                self._report(secs_done,
                             total_time,
                             bytes_done,
                             download_speed,
                             elapsed,
                             fps=items.get("fps"))
            returncode = proc.wait()
        detail = " | ".join(recent) or "no output"
        raise RuntimeError(
            f"ffmpeg exited with code {returncode} without reporting "
            f"completion: {detail}")
else: + _type = source.get("type") + _temp.append({ + "type": "video", + "height": height, + "width": width, + "extension": _type.replace("video/", ""), + "download_url": download_url, + }) return _temp def _extract_media_sources(self, sources): @@ -247,6 +250,38 @@ class Udemy: }) return _temp + def _extract_m3u8(self, url): + """extracts m3u8 streams""" + _temp = [] + try: + resp = self.session._get(url) + resp.raise_for_status() + raw_data = resp.text + m3u8_object = m3u8.loads(raw_data) + playlists = m3u8_object.playlists + seen = set() + for pl in playlists: + resolution = pl.stream_info.resolution + codecs = pl.stream_info.codecs + if not resolution: + continue + if not codecs: + continue + width, height = resolution + download_url = pl.uri + if height not in seen: + seen.add(height) + _temp.append({ + "type": "hls", + "height": height, + "width": width, + "extension": "mp4", + "download_url": download_url, + }) + except Exception as error: + print(f"Udemy Says : '{error}' while fetching hls streams..") + return _temp + def _extract_mpd(self, url): """extract mpd streams""" _video = [] @@ -922,13 +957,8 @@ def process_caption(caption, lecture_title, lecture_dir, keep_vtt, tries=0): print(f"> Error converting caption: {e}") -def process_lecture( - lecture, - lecture_index, - lecture_path, - lecture_dir, - quality, -): +def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality, + access_token): # TODO: Make this more efficent, some lectures are html articles not videos so we should check if the extension is html index = lecture.get("index") lecture_index = lecture.get("lecture_index") @@ -984,7 +1014,17 @@ def process_lecture( key=lambda x: abs(int(x.get("height")) - quality)) try: url = source.get("download_url") - download(url, lecture_path, lecture_title) + source_type = source.get("type") + if source_type == "hls": + temp_filepath = lecture_path.replace(".mp4", "") + temp_filepath = temp_filepath + ".hls-part.mp4" + retVal = 
FFMPeg(None, url, access_token, + temp_filepath).download() + if retVal: + os.rename(temp_filepath, lecture_path) + print("> HLS Download success") + else: + download(url, lecture_path, lecture_title) except Exception as e: print(f"> Error downloading lecture: ", e) else: @@ -995,7 +1035,7 @@ def process_lecture( def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions, - caption_locale, keep_vtt): + caption_locale, keep_vtt, access_token): total_chapters = _udemy.get("total_chapters") total_lectures = _udemy.get("total_lectures") print(f"Chapter(s) ({total_chapters})") @@ -1040,7 +1080,7 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions, lecture_path = os.path.join( chapter_dir, "{}.mp4".format(sanitize(lecture_title))) process_lecture(lecture, lecture_index, lecture_path, - chapter_dir, quality) + chapter_dir, quality, access_token) if dl_assets: assets = lecture.get("assets") @@ -1148,6 +1188,13 @@ if __name__ == "__main__": action="store_true", help="If specified, .vtt files won't be removed", ) + parser.add_argument( + "--skip-hls", + dest="skip_hls", + action="store_true", + help= + "If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm lectures)", + ) parser.add_argument( "--save-to-file", @@ -1171,6 +1218,7 @@ if __name__ == "__main__": portal_name = None course_name = None keep_vtt = False + skip_hls = False args = parser.parse_args() if args.download_assets: @@ -1185,6 +1233,8 @@ if __name__ == "__main__": quality = args.quality if args.keep_vtt: keep_vtt = args.keep_vtt + if args.skip_hls: + skip_hls = args.skip_hls if args.load_from_file: print( @@ -1237,7 +1287,7 @@ if __name__ == "__main__": _udemy = json.loads( open(os.path.join(os.getcwd(), "saved", "_udemy.json")).read()) parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions, - caption_locale, keep_vtt) + caption_locale, keep_vtt, access_token) else: _udemy = {} _udemy["access_token"] = 
access_token @@ -1328,7 +1378,8 @@ if __name__ == "__main__": sources = data.get("Video") tracks = asset.get("captions") #duration = asset.get("time_estimation") - sources = udemy._extract_sources(sources) + sources = udemy._extract_sources( + sources, skip_hls) subtitles = udemy._extract_subtitles(tracks) sources_count = len(sources) subtitle_count = len(subtitles) @@ -1463,6 +1514,7 @@ if __name__ == "__main__": 'w') as f: f.write(json.dumps(_udemy)) f.close() + print("Saved parsed data to json") parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions, - caption_locale, keep_vtt) + caption_locale, keep_vtt, access_token) diff --git a/requirements.txt b/requirements.txt index 980a9df..446e5b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ python-dotenv protobuf webvtt-py pysrt +m3u8 +colorama \ No newline at end of file