mirror of
https://cdm-project.com/Download-Tools/udemy-downloader.git
synced 2025-04-30 02:14:25 +02:00
HLS parsing for 1080p+ quality
+ Added a new command argument ``--skip-hls`` to skip parsing hls playlists + Updated README to reflect code changes
This commit is contained in:
parent
f0e06106fc
commit
5ffef4736e
@ -69,7 +69,7 @@ You can now run `python main.py` to start downloading. The course will download
|
||||
|
||||
```
|
||||
usage: main.py [-h] -c COURSE_URL [-b BEARER_TOKEN] [-q QUALITY] [-l LANG] [--skip-lectures] [--download-assets] [--download-captions]
|
||||
[--keep-vtt]
|
||||
[--keep-vtt] [--skip-hls]
|
||||
|
||||
Udemy Downloader
|
||||
|
||||
@ -87,6 +87,8 @@ optional arguments:
|
||||
--download-assets If specified, lecture assets will be downloaded
|
||||
--download-captions If specified, captions will be downloaded
|
||||
--keep-vtt If specified, .vtt files won't be removed
|
||||
--skip-hls If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm
|
||||
lectures)
|
||||
```
|
||||
|
||||
- Passing a Bearer Token and Course ID as an argument
|
||||
@ -112,6 +114,8 @@ optional arguments:
|
||||
- `python main.py -c <Course URL> --skip-lectures --download-assets` - Downloads only assets
|
||||
- Keep .VTT caption files:
|
||||
- `python main.py -c <Course URL> --download-captions --keep-vtt`
|
||||
- Skip parsing HLS Streams (HLS streams usually contain 1080p quality for Non-DRM lectures):
|
||||
- `python main.py -c <Course URL> --skip-hls`
|
||||
|
||||
# Credits
|
||||
|
||||
|
274
ffmpeg.py
Normal file
274
ffmpeg.py
Normal file
@ -0,0 +1,274 @@
|
||||
#!/usr/bin/python3
|
||||
# pylint: disable=R,C,W,E
|
||||
"""
|
||||
Author : Nasir Khan (r0ot h3x49)
|
||||
Github : https://github.com/r0oth3x49
|
||||
License : MIT
|
||||
Copyright (c) 2018-2025 Nasir Khan (r0ot h3x49)
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
|
||||
ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
|
||||
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
"""
|
||||
import re, time, logging, subprocess, sys
|
||||
from colorama import Fore, Style
|
||||
|
||||
|
||||
class FFMPeg:
    """Run ``ffmpeg`` to copy a remote stream into a local file.

    The stream at ``url`` is requested with an ``Authorization: Bearer``
    header built from ``token`` and stream-copied (``-c copy``) into
    ``filepath``. While ffmpeg runs, its ``-progress`` output is read from
    stderr and rendered as a coloured progress bar on stdout.

    ``duration`` (seconds, may be ``None``) is the fallback total length used
    until a ``Duration:`` line is seen in ffmpeg's log. ``quiet`` and
    ``callback`` are stored on the instance but are not consulted anywhere in
    this class.
    """

    _PROGRESS_PATTERN = re.compile(
        r"(frame|fps|total_size|out_time|bitrate|speed|progress)\s*\=\s*(\S+)")
    # Compiled once instead of on every log line.
    _DURATION_PATTERN = re.compile(
        r"Duration: (\d{2}):(\d{2}):(\d{2})\.\d{2}")

    def __init__(self,
                 duration,
                 url,
                 token,
                 filepath,
                 quiet=False,
                 callback=lambda *x: None):
        self.url = url
        self.filepath = filepath
        self.quiet = quiet
        self.duration = duration
        self.callback = callback
        self.token = token

    def _command(self):
        """
        Build the ffmpeg argument list, equivalent to:

        ffmpeg.exe -headers "Authorization: Bearer {token}" -i "" -c copy -bsf:a aac_adtstoasc out.mp4

        ``-y`` overwrites an existing output file and ``-progress pipe:2``
        makes ffmpeg write key=value progress records to stderr.
        """
        command = [
            "ffmpeg",
            "-headers",
            f"Authorization: Bearer {self.token}",
            "-i",
            f"{self.url}",
            "-c",
            "copy",
            "-bsf:a",
            "aac_adtstoasc",
            f"{self.filepath}",
            "-y",
            "-progress",
            "pipe:2",
        ]
        return command

    def _fetch_total_duration(self, line):
        """Return the stream length in seconds parsed from one ffmpeg log line.

        Falls back to ``self.duration`` (which may be ``None``) when ``line``
        contains no ``Duration: HH:MM:SS.cc`` field.
        """
        mobj = self._DURATION_PATTERN.search(line)
        if not mobj:
            return self.duration
        hours, mins, secs = (int(part) for part in mobj.groups())
        # An hour is 3600 seconds; the previous code multiplied hours by 60.
        return hours * 3600 + mins * 60 + secs

    def _fetch_current_duration_done(self, time_str):
        """Convert an ``out_time`` value (``HH:MM:SS.ffffff``) to whole seconds.

        Returns 0 when the value is missing or not in that shape (for example
        ``None`` or ``N/A``) instead of raising.
        """
        try:
            hours, mins, secs = str(time_str).split(":")
            return (int(hours) * 3600 + int(mins) * 60 +
                    int(secs.split(".")[0]))
        except ValueError:
            return 0

    def _prepare_time_str(self, secs):
        """Format a number of seconds as ``MM:SSs`` or ``HH:MM:SSs``.

        Returns ``--:--:--`` for anything of 100 hours or more; ``None`` is
        treated as 0 seconds.
        """
        (mins, secs) = divmod(secs or 0, 60)
        (hours, mins) = divmod(mins, 60)
        # elif chain: previously the ">99" result was always overwritten.
        if hours > 99:
            time_str = "--:--:--"
        elif hours == 0:
            time_str = "%02d:%02ds" % (mins, secs)
        else:
            time_str = "%02d:%02d:%02ds" % (hours, mins, secs)
        return time_str

    @staticmethod
    def _format_size(bytesdone):
        """Render a byte count as a two-decimal KB/MB/GB string."""
        if bytesdone <= 1048576:
            amount = round(float(bytesdone) / 1024.00, 2)
            small_unit, large_unit = "KB", "MB"
        else:
            amount = round(float(bytesdone) / 1048576, 2)
            small_unit, large_unit = "MB", "GB"
        if amount < 1024.00:
            return format(amount, ".2f") + small_unit
        return format(amount / 1024.00, ".2f") + large_unit

    @staticmethod
    def _to_number(text, unit):
        """Parse an ffmpeg progress value such as ``1234.5kbits/s``.

        ``unit`` is stripped (case-insensitively) before conversion. Returns 0
        when the value is missing, ``N/A`` or otherwise not numeric.
        """
        cleaned = (text or "n/a").lower().replace(unit, "")
        try:
            return float(cleaned)
        except ValueError:
            return 0

    def _progress(self,
                  iterations,
                  total,
                  bytesdone,
                  speed,
                  elapsed,
                  bar_length=30,
                  fps=None):
        """Compute the progress-bar fields and hand them to ``hls_progress``.

        iterations: seconds of media written so far.
        total: total seconds of media; ``None``/0 means "unknown".
        bytesdone: output size in bytes.
        speed: bitrate in kbit/s as reported by ffmpeg.
        elapsed: wall-clock seconds since the download started.
        bar_length: width of the bar in characters.
        fps: optional frames-per-second value appended to the rate text.
        """
        iterations = float(iterations or 0)
        total = float(total or 0)
        bytesdone = float(bytesdone or 0)
        speed = float(speed or 0)

        # An unknown total renders as an empty bar instead of raising; the
        # fraction is clamped because out_time can slightly overshoot Duration.
        if total > 0:
            fraction = min(max(iterations / total, 0.0), 1.0)
        else:
            fraction = 0.0
        filled_length = int(round(bar_length * fraction))
        percents = format(100.00 * fraction, ".2f")

        suffix_rate = "Kb/s" if speed < 1024.00 else "Mb/s"
        if fps:
            suffix_rate += f" {fps}/fps"
        rate = format(speed if speed < 1024.00 else speed / 1024.00, ".2f")

        if total > 0 and iterations >= total:
            # Finished: end the in-place progress line.
            eta = "\n"
        elif not elapsed or iterations <= 0 or total <= 0:
            # Nothing to extrapolate from yet (this used to divide by zero).
            eta = "--:--:--"
        else:
            remaining = (total - iterations) * elapsed / iterations
            (mins, secs) = divmod(remaining, 60)
            (hours, mins) = divmod(mins, 60)
            if hours > 99:
                eta = "--:--:--"
            elif hours == 0:
                eta = "eta %02d:%02ds" % (mins, secs)
            else:
                eta = "eta %02d:%02d:%02ds" % (hours, mins, secs)

        total_time = self._prepare_time_str(total)
        done_time = self._prepare_time_str(iterations)
        downloaded = f"{total_time}/{done_time}"

        received_bytes = self._format_size(bytesdone)
        percents = f"{received_bytes} {percents}"

        self.hls_progress(
            downloaded=downloaded,
            percents=percents,
            filled_length=filled_length,
            rate=str(rate) + str(suffix_rate),
            suffix=eta,
            bar_length=bar_length,
        )

    def hls_progress(self,
                     downloaded,
                     percents,
                     filled_length,
                     rate,
                     suffix,
                     bar_length=30):
        """Write one coloured progress line to stdout, overwriting the last."""
        bar = (Fore.CYAN + Style.DIM + "#" * filled_length + Fore.WHITE +
               Style.DIM + "-" * (bar_length - filled_length))
        sys.stdout.write(
            "\033[2K\033[1G\r\r{}{}[{}{}*{}{}] : {}{}{} {}% |{}{}{}| {} {}".
            format(
                Fore.CYAN,
                Style.DIM,
                Fore.MAGENTA,
                Style.BRIGHT,
                Fore.CYAN,
                Style.DIM,
                Fore.GREEN,
                Style.BRIGHT,
                downloaded,
                percents,
                bar,
                Fore.GREEN,
                Style.BRIGHT,
                rate,
                suffix,
            ))
        sys.stdout.flush()

    def _parse_progress(self, line):
        """Return the ``key=value`` progress fields found in ``line`` as a dict.

        When a key occurs more than once the last occurrence wins.
        """
        items = {
            key: value
            for key, value in self._PROGRESS_PATTERN.findall(line)
        }
        return items

    def _render_progress(self, *args, **kwargs):
        """Draw the progress bar without letting a display error abort the download."""
        try:
            self._progress(*args, **kwargs)
        except Exception as err:  # display is best-effort only
            logging.debug("ffmpeg progress rendering failed: %s", err)

    def download(self):
        """Run ffmpeg and report the outcome.

        Returns ``{"status": "True", "msg": "download"}`` when ffmpeg printed
        ``progress=end`` and exited with code 0, otherwise
        ``{"status": "False", "msg": "Error: ..."}``. Note that both dicts are
        truthy; callers must inspect ``"status"``.

        ``KeyboardInterrupt`` is re-raised after asking ffmpeg to terminate.
        Errors starting the process (e.g. ffmpeg not installed) propagate.
        """
        total_time = None
        secs = 0
        bytes_done = 0
        download_speed = 0
        progress_lines = []
        finished = False
        t0 = time.time()

        # stdout is discarded: nothing reads it, and progress goes to stderr.
        with subprocess.Popen(self._command(),
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE) as proc:
            try:
                # readline() returns b"" at EOF, so a process that dies
                # without printing "progress=end" ends the loop instead of
                # spinning forever.
                for raw in iter(proc.stderr.readline, b""):
                    line = raw.decode("utf-8", errors="replace").strip()
                    elapsed = time.time() - t0
                    if not total_time:
                        total_time = self._fetch_total_duration(line)
                    if "progress=end" in line:
                        # Fall back to the last known position when the
                        # total length was never discovered.
                        final = total_time or secs
                        self._render_progress(final, final, bytes_done,
                                              download_speed, elapsed)
                        finished = True
                        break
                    if "progress" not in line:
                        progress_lines.append(line)
                        continue
                    # A "progress=continue" line closes one progress record.
                    items = self._parse_progress("\n".join(progress_lines))
                    progress_lines = []
                    if not items:
                        continue
                    secs = self._fetch_current_duration_done(
                        items.get("out_time"))
                    bytes_done = self._to_number(items.get("total_size"),
                                                 "kb")
                    download_speed = self._to_number(items.get("bitrate"),
                                                     "kbits/s")
                    self._render_progress(secs,
                                          total_time,
                                          bytes_done,
                                          download_speed,
                                          elapsed,
                                          fps=items.get("fps"))
                # Let ffmpeg finish writing the file before judging the result.
                proc.stderr.read()
                returncode = proc.wait()
            except KeyboardInterrupt:
                proc.terminate()
                raise

        if finished and returncode == 0:
            return {"status": "True", "msg": "download"}
        if not finished:
            msg = ("Error: ffmpeg stopped before reporting completion "
                   f"(exit code {returncode})")
        else:
            msg = f"Error: ffmpeg exited with code {returncode}"
        return {"status": "False", "msg": msg}
|
108
main.py
108
main.py
@ -1,14 +1,13 @@
|
||||
import os, requests, json, glob, argparse, sys, re, time, asyncio, json
|
||||
# from sanitize_filename import sanitize
|
||||
import os, requests, json, glob, argparse, sys, re, time, asyncio, json, cloudscraper, m3u8
|
||||
from tqdm import tqdm
|
||||
from dotenv import load_dotenv
|
||||
from mpegdash.parser import MPEGDASHParser
|
||||
from utils import extract_kid
|
||||
from vtt_to_srt import convert
|
||||
import cloudscraper
|
||||
from requests.exceptions import ConnectionError as conn_error
|
||||
from html.parser import HTMLParser as compat_HTMLParser
|
||||
from sanitize import sanitize, slugify, SLUG_OK
|
||||
from ffmpeg import FFMPeg
|
||||
|
||||
home_dir = os.getcwd()
|
||||
download_dir = os.path.join(os.getcwd(), "out_dir")
|
||||
@ -167,7 +166,7 @@ class Udemy:
|
||||
})
|
||||
return _temp
|
||||
|
||||
def _extract_sources(self, sources):
|
||||
def _extract_sources(self, sources, skip_hls):
|
||||
_temp = []
|
||||
if sources and isinstance(sources, list):
|
||||
for source in sources:
|
||||
@ -195,17 +194,21 @@ class Udemy:
|
||||
else:
|
||||
width = "256"
|
||||
if (source.get("type") == "application/x-mpegURL"
|
||||
or "m3u8" in download_url or height == "Audio"):
|
||||
continue
|
||||
|
||||
_type = source.get("type")
|
||||
_temp.append({
|
||||
"type": "video",
|
||||
"height": height,
|
||||
"width": width,
|
||||
"extension": _type.replace("video/", ""),
|
||||
"download_url": download_url,
|
||||
})
|
||||
or "m3u8" in download_url):
|
||||
if not skip_hls:
|
||||
out = self._extract_m3u8(download_url)
|
||||
print(out)
|
||||
if out:
|
||||
_temp.extend(out)
|
||||
else:
|
||||
_type = source.get("type")
|
||||
_temp.append({
|
||||
"type": "video",
|
||||
"height": height,
|
||||
"width": width,
|
||||
"extension": _type.replace("video/", ""),
|
||||
"download_url": download_url,
|
||||
})
|
||||
return _temp
|
||||
|
||||
def _extract_media_sources(self, sources):
|
||||
@ -247,6 +250,38 @@ class Udemy:
|
||||
})
|
||||
return _temp
|
||||
|
||||
def _extract_m3u8(self, url):
    """Fetch an HLS master playlist and list its video variants.

    Downloads ``url`` through ``self.session._get``, parses the response
    text with ``m3u8.loads`` and returns one dict per distinct height, in
    playlist order. Variants lacking a resolution or codecs value are
    skipped. Any exception is printed and whatever was collected up to
    that point (possibly an empty list) is returned.
    """
    streams = []
    try:
        response = self.session._get(url)
        response.raise_for_status()
        master = m3u8.loads(response.text)
        known_heights = set()
        for variant in master.playlists:
            info = variant.stream_info
            resolution, codecs = info.resolution, info.codecs
            if not (resolution and codecs):
                continue
            # resolution unpacks as (width, height)
            width, height = resolution
            variant_url = variant.uri
            if height in known_heights:
                # keep only the first variant seen for each height
                continue
            known_heights.add(height)
            streams.append({
                "type": "hls",
                "height": height,
                "width": width,
                "extension": "mp4",
                "download_url": variant_url,
            })
    except Exception as error:
        print(f"Udemy Says : '{error}' while fetching hls streams..")
    return streams
|
||||
|
||||
def _extract_mpd(self, url):
|
||||
"""extract mpd streams"""
|
||||
_video = []
|
||||
@ -922,13 +957,8 @@ def process_caption(caption, lecture_title, lecture_dir, keep_vtt, tries=0):
|
||||
print(f"> Error converting caption: {e}")
|
||||
|
||||
|
||||
def process_lecture(
|
||||
lecture,
|
||||
lecture_index,
|
||||
lecture_path,
|
||||
lecture_dir,
|
||||
quality,
|
||||
):
|
||||
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
|
||||
access_token):
|
||||
# TODO: Make this more efficient; some lectures are HTML articles, not videos, so we should check whether the extension is html
|
||||
index = lecture.get("index")
|
||||
lecture_index = lecture.get("lecture_index")
|
||||
@ -984,7 +1014,17 @@ def process_lecture(
|
||||
key=lambda x: abs(int(x.get("height")) - quality))
|
||||
try:
|
||||
url = source.get("download_url")
|
||||
download(url, lecture_path, lecture_title)
|
||||
source_type = source.get("type")
|
||||
if source_type == "hls":
|
||||
temp_filepath = lecture_path.replace(".mp4", "")
|
||||
temp_filepath = temp_filepath + ".hls-part.mp4"
|
||||
retVal = FFMPeg(None, url, access_token,
|
||||
temp_filepath).download()
|
||||
if retVal:
|
||||
os.rename(temp_filepath, lecture_path)
|
||||
print("> HLS Download success")
|
||||
else:
|
||||
download(url, lecture_path, lecture_title)
|
||||
except Exception as e:
|
||||
print(f"> Error downloading lecture: ", e)
|
||||
else:
|
||||
@ -995,7 +1035,7 @@ def process_lecture(
|
||||
|
||||
|
||||
def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
|
||||
caption_locale, keep_vtt):
|
||||
caption_locale, keep_vtt, access_token):
|
||||
total_chapters = _udemy.get("total_chapters")
|
||||
total_lectures = _udemy.get("total_lectures")
|
||||
print(f"Chapter(s) ({total_chapters})")
|
||||
@ -1040,7 +1080,7 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
|
||||
lecture_path = os.path.join(
|
||||
chapter_dir, "{}.mp4".format(sanitize(lecture_title)))
|
||||
process_lecture(lecture, lecture_index, lecture_path,
|
||||
chapter_dir, quality)
|
||||
chapter_dir, quality, access_token)
|
||||
|
||||
if dl_assets:
|
||||
assets = lecture.get("assets")
|
||||
@ -1148,6 +1188,13 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="If specified, .vtt files won't be removed",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-hls",
|
||||
dest="skip_hls",
|
||||
action="store_true",
|
||||
help=
|
||||
"If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm lectures)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--save-to-file",
|
||||
@ -1171,6 +1218,7 @@ if __name__ == "__main__":
|
||||
portal_name = None
|
||||
course_name = None
|
||||
keep_vtt = False
|
||||
skip_hls = False
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.download_assets:
|
||||
@ -1185,6 +1233,8 @@ if __name__ == "__main__":
|
||||
quality = args.quality
|
||||
if args.keep_vtt:
|
||||
keep_vtt = args.keep_vtt
|
||||
if args.skip_hls:
|
||||
skip_hls = args.skip_hls
|
||||
|
||||
if args.load_from_file:
|
||||
print(
|
||||
@ -1237,7 +1287,7 @@ if __name__ == "__main__":
|
||||
_udemy = json.loads(
|
||||
open(os.path.join(os.getcwd(), "saved", "_udemy.json")).read())
|
||||
parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
|
||||
caption_locale, keep_vtt)
|
||||
caption_locale, keep_vtt, access_token)
|
||||
else:
|
||||
_udemy = {}
|
||||
_udemy["access_token"] = access_token
|
||||
@ -1328,7 +1378,8 @@ if __name__ == "__main__":
|
||||
sources = data.get("Video")
|
||||
tracks = asset.get("captions")
|
||||
#duration = asset.get("time_estimation")
|
||||
sources = udemy._extract_sources(sources)
|
||||
sources = udemy._extract_sources(
|
||||
sources, skip_hls)
|
||||
subtitles = udemy._extract_subtitles(tracks)
|
||||
sources_count = len(sources)
|
||||
subtitle_count = len(subtitles)
|
||||
@ -1463,6 +1514,7 @@ if __name__ == "__main__":
|
||||
'w') as f:
|
||||
f.write(json.dumps(_udemy))
|
||||
f.close()
|
||||
print("Saved parsed data to json")
|
||||
|
||||
parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
|
||||
caption_locale, keep_vtt)
|
||||
caption_locale, keep_vtt, access_token)
|
||||
|
@ -6,3 +6,5 @@ python-dotenv
|
||||
protobuf
|
||||
webvtt-py
|
||||
pysrt
|
||||
m3u8
|
||||
colorama
|
Loading…
x
Reference in New Issue
Block a user