HLS parsing for 1080p+ quality

+ Added a new command-line argument ``--skip-hls`` to skip parsing HLS playlists
+ Updated README to reflect code changes
This commit is contained in:
Puyodead1 2021-05-27 17:38:29 -04:00
parent f0e06106fc
commit 5ffef4736e
4 changed files with 361 additions and 29 deletions

View File

@ -69,7 +69,7 @@ You can now run `python main.py` to start downloading. The course will download
```
usage: main.py [-h] -c COURSE_URL [-b BEARER_TOKEN] [-q QUALITY] [-l LANG] [--skip-lectures] [--download-assets] [--download-captions]
[--keep-vtt]
[--keep-vtt] [--skip-hls]
Udemy Downloader
@ -87,6 +87,8 @@ optional arguments:
--download-assets If specified, lecture assets will be downloaded
--download-captions If specified, captions will be downloaded
--keep-vtt If specified, .vtt files won't be removed
--skip-hls If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm
lectures)
```
- Passing a Bearer Token and Course ID as an argument
@ -112,6 +114,8 @@ optional arguments:
- `python main.py -c <Course URL> --skip-lectures --download-assets` - Downloads only assets
- Keep .VTT caption files:
- `python main.py -c <Course URL> --download-captions --keep-vtt`
- Skip parsing HLS Streams (HLS streams usually contain 1080p quality for Non-DRM lectures):
- `python main.py -c <Course URL> --skip-hls`
# Credits

274
ffmpeg.py Normal file
View File

@ -0,0 +1,274 @@
#!/usr/bin/python3
# pylint: disable=R,C,W,E
"""
Author : Nasir Khan (r0ot h3x49)
Github : https://github.com/r0oth3x49
License : MIT
Copyright (c) 2018-2025 Nasir Khan (r0ot h3x49)
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the
Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
import re, time, logging, subprocess, sys
from colorama import Fore, Style
class FFMPeg:
    """Download a stream with the ``ffmpeg`` executable and draw a progress bar.

    The stream at ``url`` is remuxed (``-c copy``) into ``filepath`` while
    ffmpeg's machine-readable ``-progress`` output is read from its stderr
    pipe and rendered on stdout.
    """

    # key=value pairs emitted by ffmpeg's "-progress" output.
    _PROGRESS_PATTERN = re.compile(
        r"(frame|fps|total_size|out_time|bitrate|speed|progress)\s*\=\s*(\S+)")
    # "Duration: HH:MM:SS.xx" from ffmpeg's input banner (compiled once).
    _DURATION_PATTERN = re.compile(
        r"Duration: (\d{2}):(\d{2}):(\d{2})\.\d{2}")
    # Lines that look like "key=value" progress/statistics output; used to
    # keep them out of the error message reported when ffmpeg fails.
    _KEY_VALUE_LINE = re.compile(r"^[a-z_]+\s*=")

    def __init__(self,
                 duration,
                 url,
                 token,
                 filepath,
                 quiet=False,
                 callback=lambda *x: None):
        """Store the download parameters; no I/O happens here.

        Args:
            duration: Fallback total duration in seconds, used until ffmpeg
                reports a ``Duration:`` line (may be ``None``).
            url: Input URL handed to ffmpeg's ``-i``.
            token: Bearer token sent in the ``Authorization`` header.
            filepath: Output file path.
            quiet: Stored on the instance; not read by this class.
            callback: Stored on the instance; not read by this class.
        """
        self.url = url
        self.filepath = filepath
        self.quiet = quiet
        self.duration = duration
        self.callback = callback
        self.token = token

    def _command(self):
        """
        Return the ffmpeg argument list, equivalent to:

        ffmpeg.exe -headers "Authorization: Bearer {token}" -i "" -c copy -bsf:a aac_adtstoasc out.mp4
        """
        command = [
            "ffmpeg",
            "-headers",
            f"Authorization: Bearer {self.token}",
            "-i",
            f"{self.url}",
            "-c",
            "copy",
            "-bsf:a",
            "aac_adtstoasc",
            f"{self.filepath}",
            "-y",
            "-progress",
            "pipe:2",
        ]
        return command

    def _fetch_total_duration(self, line):
        """Return the duration in seconds found in ``line``.

        Falls back to ``self.duration`` when ``line`` holds no
        ``Duration: HH:MM:SS.xx`` marker.
        """
        mobj = self._DURATION_PATTERN.search(line)
        if not mobj:
            return self.duration
        hours, mins, secs = (int(part) for part in mobj.groups())
        # Hours are worth 3600 seconds (the previous code used 60).
        return hours * 3600 + mins * 60 + secs

    def _fetch_current_duration_done(self, time_str):
        """Convert an ``HH:MM:SS[.ffffff]`` position into whole seconds.

        Returns 0 when the value is missing or unparsable (for example
        ``None`` or ``"N/A"``), so a single odd progress record cannot abort
        the download loop.
        """
        if not time_str:
            return 0
        parts = time_str.split(":")
        try:
            hours = int(parts[0])
            mins = int(parts[1])
            secs = int(parts[2].split(".")[0])
        except (IndexError, ValueError):
            return 0
        return hours * 3600 + mins * 60 + secs

    def _prepare_time_str(self, secs):
        """Format ``secs`` as ``MM:SSs`` or ``HH:MM:SSs``.

        Durations above 99 hours are rendered as ``--:--:--``.
        """
        (mins, secs) = divmod(secs, 60)
        (hours, mins) = divmod(mins, 60)
        if hours > 99:
            # Must be part of the same if/elif chain, otherwise the
            # placeholder is overwritten by the branches below.
            time_str = "--:--:--"
        elif hours == 0:
            time_str = "%02d:%02ds" % (mins, secs)
        else:
            time_str = "%02d:%02d:%02ds" % (hours, mins, secs)
        return time_str

    @staticmethod
    def _to_float(text, unit):
        """Parse a progress value such as ``"123.4kbits/s"``; 0 if unusable."""
        try:
            return float((text or "").lower().replace(unit, ""))
        except ValueError:
            # Covers "n/a" and empty/missing values.
            return 0

    @staticmethod
    def _format_size(bytesdone):
        """Return ``(value, suffix)`` for a byte count, e.g. ``("2.00", "KB")``."""
        if bytesdone <= 1048576:
            amount = round(float(bytesdone) / 1024.00, 2)
            small, large = "KB", "MB"
        else:
            amount = round(float(bytesdone) / 1048576, 2)
            small, large = "MB", "GB"
        if amount < 1024.00:
            return format(amount, ".2f"), small
        return format(amount / 1024.00, ".2f"), large

    @staticmethod
    def _format_eta(eta_secs):
        """Render the remaining time; ``None`` means it cannot be estimated."""
        if eta_secs is None:
            return "--:--:--"
        (mins, secs) = divmod(eta_secs, 60)
        (hours, mins) = divmod(mins, 60)
        if secs == 0:
            # A zero seconds component (notably eta == 0 on completion) ends
            # the progress line with a newline instead of an ETA.
            return "\n"
        if hours > 99:
            return "--:--:--"
        if hours == 0:
            return "eta %02d:%02ds" % (mins, secs)
        return "eta %02d:%02d:%02ds" % (hours, mins, secs)

    def _progress(self,
                  iterations,
                  total,
                  bytesdone,
                  speed,
                  elapsed,
                  bar_length=30,
                  fps=None):
        """Compute the progress figures and draw them via ``hls_progress``.

        Args:
            iterations: Seconds of media processed so far.
            total: Total media duration in seconds; ``None`` or 0 when it is
                not known yet (the bar then stays empty).
            bytesdone: Value of ffmpeg's ``total_size`` field.
            speed: Value of ffmpeg's ``bitrate`` field (``kbits/s`` stripped).
            elapsed: Wall-clock seconds since the download started.
            bar_length: Width of the bar in characters.
            fps: Optional frames-per-second figure appended to the rate.
        """
        total_secs = float(total) if total else 0.0
        fraction = iterations / total_secs if total_secs > 0 else 0.0
        filled_length = min(bar_length,
                            max(0, int(round(bar_length * fraction))))
        percents = format(100.00 * fraction, ".2f")

        _received, suffix_recvd = self._format_size(bytesdone)

        suffix_rate = "Kb/s" if speed < 1024.00 else "Mb/s"
        if fps:
            suffix_rate += f" {fps}/fps"
        rate = format(speed if speed < 1024.00 else speed / 1024.00, ".2f")

        if not elapsed:
            eta_secs = 0
        elif iterations > 0 and total_secs > 0:
            # remaining media seconds / (media seconds processed per second)
            eta_secs = (total_secs - iterations) * elapsed / iterations
        else:
            # Nothing processed yet or total unknown: no estimate possible.
            eta_secs = None
        eta = self._format_eta(eta_secs)

        total_time = self._prepare_time_str(total or 0)
        done_time = self._prepare_time_str(iterations)
        # NOTE(review): this renders "total/done"; "done/total" may have been
        # intended -- left unchanged, confirm the desired display.
        downloaded = f"{total_time}/{done_time}"
        received_bytes = str(_received) + str(suffix_recvd)
        percents = f"{received_bytes} {percents}"
        self.hls_progress(
            downloaded=downloaded,
            percents=percents,
            filled_length=filled_length,
            rate=str(rate) + str(suffix_rate),
            suffix=eta,
            bar_length=bar_length,
        )

    def hls_progress(self,
                     downloaded,
                     percents,
                     filled_length,
                     rate,
                     suffix,
                     bar_length=30):
        """Write one coloured progress line to stdout, replacing the current one."""
        bar = (Fore.CYAN + Style.DIM + "#" * filled_length + Fore.WHITE +
               Style.DIM + "-" * (bar_length - filled_length))
        sys.stdout.write(
            "\033[2K\033[1G\r\r{}{}[{}{}*{}{}] : {}{}{} {}% |{}{}{}| {} {}".
            format(
                Fore.CYAN,
                Style.DIM,
                Fore.MAGENTA,
                Style.BRIGHT,
                Fore.CYAN,
                Style.DIM,
                Fore.GREEN,
                Style.BRIGHT,
                downloaded,
                percents,
                bar,
                Fore.GREEN,
                Style.BRIGHT,
                rate,
                suffix,
            ))
        sys.stdout.flush()

    def _parse_progress(self, line):
        """Return the recognised ``key=value`` pairs of ``line`` as a dict.

        When a key occurs more than once, the last occurrence wins.
        """
        return dict(self._PROGRESS_PATTERN.findall(line))

    def _safe_progress(self, *args, **kwargs):
        """Draw the progress bar, never letting a display error stop the download."""
        try:
            self._progress(*args, **kwargs)
        except Exception:  # deliberate best effort; KeyboardInterrupt still propagates
            pass

    def download(self):
        """Run ffmpeg and follow its progress until the download finishes.

        Returns:
            ``{"status": "True", "msg": "download"}`` once ffmpeg reports
            ``progress=end``.

        Raises:
            RuntimeError: ffmpeg closed its stderr without reporting
                ``progress=end`` (it failed or was killed).
            KeyboardInterrupt: propagated unchanged when the user interrupts.
            OSError: propagated from ``subprocess.Popen`` (for example when
                the ``ffmpeg`` executable cannot be started).
        """
        total_time = None
        t0 = time.time()
        progress_lines = []
        bytes_done = 0
        download_speed = 0
        last_message = ""
        # ffmpeg's stdout is not used here; discard it instead of leaving an
        # unread pipe behind.
        with subprocess.Popen(self._command(),
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE) as proc:
            while True:
                raw = proc.stderr.readline()
                if not raw:
                    # EOF: ffmpeg exited without "progress=end". Without this
                    # check readline() keeps returning b"" and the loop would
                    # never terminate.
                    break
                elapsed = time.time() - t0
                # Never let an odd byte in ffmpeg's output abort the download.
                line = raw.decode("utf-8", errors="replace").strip()
                if not total_time:
                    total_time = self._fetch_total_duration(line)
                if "progress=end" in line:
                    self._safe_progress(
                        total_time,
                        total_time,
                        bytes_done,
                        download_speed,
                        elapsed,
                    )
                    return {"status": "True", "msg": "download"}
                if "progress" not in line:
                    # Collect everything up to the next "progress=..." marker.
                    progress_lines.append(line)
                    if line and not self._KEY_VALUE_LINE.match(line):
                        last_message = line
                    continue
                items = self._parse_progress("\n".join(progress_lines))
                progress_lines = []
                if not items:
                    continue
                secs = self._fetch_current_duration_done(
                    items.get("out_time"))
                bytes_done = self._to_float(items.get("total_size"), "kb")
                download_speed = self._to_float(items.get("bitrate"),
                                                "kbits/s")
                self._safe_progress(
                    secs,
                    total_time,
                    bytes_done,
                    download_speed,
                    elapsed,
                    fps=items.get("fps"),
                )
            returncode = proc.wait()
        detail = f": {last_message}" if last_message else ""
        raise RuntimeError(
            f"ffmpeg exited with code {returncode} before finishing{detail}")

108
main.py
View File

@ -1,14 +1,13 @@
import os, requests, json, glob, argparse, sys, re, time, asyncio, json
# from sanitize_filename import sanitize
import os, requests, json, glob, argparse, sys, re, time, asyncio, json, cloudscraper, m3u8
from tqdm import tqdm
from dotenv import load_dotenv
from mpegdash.parser import MPEGDASHParser
from utils import extract_kid
from vtt_to_srt import convert
import cloudscraper
from requests.exceptions import ConnectionError as conn_error
from html.parser import HTMLParser as compat_HTMLParser
from sanitize import sanitize, slugify, SLUG_OK
from ffmpeg import FFMPeg
home_dir = os.getcwd()
download_dir = os.path.join(os.getcwd(), "out_dir")
@ -167,7 +166,7 @@ class Udemy:
})
return _temp
def _extract_sources(self, sources):
def _extract_sources(self, sources, skip_hls):
_temp = []
if sources and isinstance(sources, list):
for source in sources:
@ -195,17 +194,21 @@ class Udemy:
else:
width = "256"
if (source.get("type") == "application/x-mpegURL"
or "m3u8" in download_url or height == "Audio"):
continue
_type = source.get("type")
_temp.append({
"type": "video",
"height": height,
"width": width,
"extension": _type.replace("video/", ""),
"download_url": download_url,
})
or "m3u8" in download_url):
if not skip_hls:
out = self._extract_m3u8(download_url)
print(out)
if out:
_temp.extend(out)
else:
_type = source.get("type")
_temp.append({
"type": "video",
"height": height,
"width": width,
"extension": _type.replace("video/", ""),
"download_url": download_url,
})
return _temp
def _extract_media_sources(self, sources):
@ -247,6 +250,38 @@ class Udemy:
})
return _temp
def _extract_m3u8(self, url):
"""extracts m3u8 streams"""
_temp = []
try:
resp = self.session._get(url)
resp.raise_for_status()
raw_data = resp.text
m3u8_object = m3u8.loads(raw_data)
playlists = m3u8_object.playlists
seen = set()
for pl in playlists:
resolution = pl.stream_info.resolution
codecs = pl.stream_info.codecs
if not resolution:
continue
if not codecs:
continue
width, height = resolution
download_url = pl.uri
if height not in seen:
seen.add(height)
_temp.append({
"type": "hls",
"height": height,
"width": width,
"extension": "mp4",
"download_url": download_url,
})
except Exception as error:
print(f"Udemy Says : '{error}' while fetching hls streams..")
return _temp
def _extract_mpd(self, url):
"""extract mpd streams"""
_video = []
@ -922,13 +957,8 @@ def process_caption(caption, lecture_title, lecture_dir, keep_vtt, tries=0):
print(f"> Error converting caption: {e}")
def process_lecture(
lecture,
lecture_index,
lecture_path,
lecture_dir,
quality,
):
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
access_token):
# TODO: Make this more efficient; some lectures are HTML articles, not videos, so we should check whether the extension is html
index = lecture.get("index")
lecture_index = lecture.get("lecture_index")
@ -984,7 +1014,17 @@ def process_lecture(
key=lambda x: abs(int(x.get("height")) - quality))
try:
url = source.get("download_url")
download(url, lecture_path, lecture_title)
source_type = source.get("type")
if source_type == "hls":
temp_filepath = lecture_path.replace(".mp4", "")
temp_filepath = temp_filepath + ".hls-part.mp4"
retVal = FFMPeg(None, url, access_token,
temp_filepath).download()
if retVal:
os.rename(temp_filepath, lecture_path)
print("> HLS Download success")
else:
download(url, lecture_path, lecture_title)
except Exception as e:
print(f"> Error downloading lecture: ", e)
else:
@ -995,7 +1035,7 @@ def process_lecture(
def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
caption_locale, keep_vtt):
caption_locale, keep_vtt, access_token):
total_chapters = _udemy.get("total_chapters")
total_lectures = _udemy.get("total_lectures")
print(f"Chapter(s) ({total_chapters})")
@ -1040,7 +1080,7 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
lecture_path = os.path.join(
chapter_dir, "{}.mp4".format(sanitize(lecture_title)))
process_lecture(lecture, lecture_index, lecture_path,
chapter_dir, quality)
chapter_dir, quality, access_token)
if dl_assets:
assets = lecture.get("assets")
@ -1148,6 +1188,13 @@ if __name__ == "__main__":
action="store_true",
help="If specified, .vtt files won't be removed",
)
parser.add_argument(
"--skip-hls",
dest="skip_hls",
action="store_true",
help=
"If specified, hls streams will be skipped (faster fetching) (hls streams usually contain 1080p quality for non-drm lectures)",
)
parser.add_argument(
"--save-to-file",
@ -1171,6 +1218,7 @@ if __name__ == "__main__":
portal_name = None
course_name = None
keep_vtt = False
skip_hls = False
args = parser.parse_args()
if args.download_assets:
@ -1185,6 +1233,8 @@ if __name__ == "__main__":
quality = args.quality
if args.keep_vtt:
keep_vtt = args.keep_vtt
if args.skip_hls:
skip_hls = args.skip_hls
if args.load_from_file:
print(
@ -1237,7 +1287,7 @@ if __name__ == "__main__":
_udemy = json.loads(
open(os.path.join(os.getcwd(), "saved", "_udemy.json")).read())
parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
caption_locale, keep_vtt)
caption_locale, keep_vtt, access_token)
else:
_udemy = {}
_udemy["access_token"] = access_token
@ -1328,7 +1378,8 @@ if __name__ == "__main__":
sources = data.get("Video")
tracks = asset.get("captions")
#duration = asset.get("time_estimation")
sources = udemy._extract_sources(sources)
sources = udemy._extract_sources(
sources, skip_hls)
subtitles = udemy._extract_subtitles(tracks)
sources_count = len(sources)
subtitle_count = len(subtitles)
@ -1463,6 +1514,7 @@ if __name__ == "__main__":
'w') as f:
f.write(json.dumps(_udemy))
f.close()
print("Saved parsed data to json")
parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
caption_locale, keep_vtt)
caption_locale, keep_vtt, access_token)

View File

@ -6,3 +6,5 @@ python-dotenv
protobuf
webvtt-py
pysrt
m3u8
colorama