New Experimental Downloader, bug fixes, and small updates

+ Updated cleanup function to remove the entire temporary lecture folder instead of just leaving behind tons of empty folders
+ Fixed typo in mux function
+ Segment count is now properly calculated from segment timeline
+ Manifest is now parsed from the URL instead of being downloaded, this should be better for downloading multiple courses at once.
+ Fixed a bug where audio content_type would try to find a max quality
+ New Downloader: Threaded Downloader uses multiple threads to download files, this should improve download speeds greatly. By default, the threaded downloader is not used, you can use the threaded downloader by passing ``--use-threaded-downloader``. By default, it only uses 10 threads, you can set a custom number of threads with the ``--threads`` option
This commit is contained in:
Puyodead1 2021-05-21 13:38:24 -04:00
parent 88a411d708
commit aab19bf66f
3 changed files with 344 additions and 96 deletions

View File

@ -68,8 +68,8 @@ You can now run `python main.py` to start downloading. The course will download
# Advanced Usage
```
usage: main.py [-h] -c COURSE_URL [-b BEARER_TOKEN] [-d] [-q QUALITY] [-l LANG] [--skip-lectures] [--download-assets]
[--download-captions]
usage: main.py [-h] -c COURSE_URL [-b BEARER_TOKEN] [-q QUALITY] [-t THREADS] [-l LANG] [--skip-lectures] [--download-assets]
[--download-captions] [--use-threaded-downloader] [-d]
Udemy Downloader
@ -79,13 +79,15 @@ optional arguments:
The URL of the course to download
-b BEARER_TOKEN, --bearer BEARER_TOKEN
The Bearer token to use
-d, --debug Use test_data.json rather than fetch from the udemy api.
-q QUALITY, --quality QUALITY
Download specific video quality. (144, 360, 480, 720, 1080)
-t THREADS, --threads THREADS
Max number of threads to use when using the threaded downloader (default 10)
-l LANG, --lang LANG The language to download for captions (Default is en)
--skip-lectures If specified, lectures won't be downloaded.
--download-assets If specified, lecture assets will be downloaded.
--download-captions If specified, captions will be downloaded.
--use-threaded-downloader If specified, the experimental threaded downloader will be used
```
- Passing a Bearer Token and Course ID as an argument
@ -107,8 +109,12 @@ optional arguments:
- `python main.py -c <Course URL> --download-captions -l all` - Downloads all subtitles
- etc
- Skip downloading lecture videos
- `python main.py --skip-lectures --download-captions` - Downloads only captions
- `python main.py --skip-lectures --download-assets` - Downloads only assets
- `python main.py -c <Course URL> --skip-lectures --download-captions` - Downloads only captions
- `python main.py -c <Course URL> --skip-lectures --download-assets` - Downloads only assets
- Use threaded downloader
- `python main.py -c <Course URL> --use-threaded-downloader`
- Use threaded downloader with custom max threads
- `python main.py -c <Course URL> --use-threaded-downloader --threads 15`
# Credits

120
downloader.py Normal file
View File

@ -0,0 +1,120 @@
import os, threading, requests
from tqdm import tqdm
class FileDownloader():
"""
@source: https://gist.github.com/stefanfortuin/9dbfe8618701507d0ef2b5515b165c5f
"""
def __init__(self, max_threads=10):
print("> Threaded downloader using {} threads.".format(
str(max_threads)))
self.sema = threading.Semaphore(value=max_threads)
self.headers = {
'user-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
self.block_size = 1024
def t_getfile(self, link, filepath, filename, bar, session):
"""
Threaded function that uses a semaphore
to not instantiate too many threads
"""
self.sema.acquire()
os.makedirs(os.path.dirname(filepath), exist_ok=True)
if not os.path.isfile(filepath):
headers = requests.head(link).headers
if 'content-length' not in headers:
print(f"server doesn't support content-length for {link}")
self.sema.release()
return
total_bytes = int(requests.head(link).headers['content-length'])
if not bar:
bar = tqdm(total=total_bytes,
initial=0,
unit='B',
unit_scale=True,
desc=filename)
self.download_new_file(link, filename, filepath, total_bytes, bar,
session)
else:
current_bytes = os.stat(filepath).st_size
headers = requests.head(link).headers
if 'content-length' not in headers:
print(f"server doesn't support content-length for {link}")
self.sema.release()
return
total_bytes = int(requests.head(link).headers['content-length'])
if not bar:
bar = tqdm(total=total_bytes,
initial=current_bytes,
unit='B',
unit_scale=True,
desc=filename)
if current_bytes < total_bytes:
self.continue_file_download(link, filename, filepath,
current_bytes, total_bytes, bar)
else:
# print(f"already done: {filename}")
if bar.unit == "B":
bar.update(self.block_size)
else:
bar.update(1)
self.sema.release()
def download_new_file(self, link, filename, filepath, total_bytes, bar,
session):
if session == None:
try:
request = requests.get(link,
headers=self.headers,
timeout=30,
stream=True)
self.write_file(request, filepath, 'wb', bar)
except requests.exceptions.RequestException as e:
print(e)
else:
request = session.get(link, stream=True)
self.write_file(request, filepath, 'wb', bar)
def continue_file_download(self, link, filename, filepath, current_bytes,
total_bytes, bar):
range_header = self.headers.copy()
range_header['Range'] = f"bytes={current_bytes}-{total_bytes}"
try:
request = requests.get(link,
headers=range_header,
timeout=30,
stream=True)
self.write_file(request, filepath, 'ab', bar)
except requests.exceptions.RequestException as e:
print(e)
def write_file(self, content, filepath, writemode, bar):
with open(filepath, writemode) as f:
for chunk in content.iter_content(chunk_size=self.block_size):
if chunk:
f.write(chunk)
if bar.unit == "B":
bar.update(self.block_size)
# print(f"completed file {filepath}", end='\n')
f.close()
bar.update(1)
def get_file(self, link, path, filename, bar=None, session=None):
""" Downloads the file"""
thread = threading.Thread(target=self.t_getfile,
args=(link, path, filename, bar, session))
thread.start()
return thread

304
main.py
View File

@ -5,6 +5,7 @@ from dotenv import load_dotenv
from mpegdash.parser import MPEGDASHParser
from utils import extract_kid
from vtt_to_srt import convert
from downloader import FileDownloader
download_dir = os.path.join(os.getcwd(), "out_dir")
working_dir = os.path.join(os.getcwd(), "working_dir")
@ -12,6 +13,7 @@ retry = 3
home_dir = os.getcwd()
keyfile_path = os.path.join(os.getcwd(), "keyfile.json")
valid_qualities = [144, 360, 480, 720, 1080]
downloader = None
if not os.path.exists(working_dir):
os.makedirs(working_dir)
@ -115,13 +117,12 @@ def cleanup(path):
@author Jayapraveen
"""
leftover_files = glob.glob(path + '/*.mp4', recursive=True)
mpd_files = glob.glob(path + '/*.mpd', recursive=True)
leftover_files = leftover_files + mpd_files
for file_list in leftover_files:
try:
os.remove(file_list)
except OSError:
print(f"Error deleting file: {file_list}")
os.removedirs(path)
def mux_process(video_title, lecture_working_dir, outfile):
@ -147,30 +148,30 @@ def decrypt(kid, filename, lecture_working_dir):
"""
try:
key = keyfile[kid.lower()]
except KeyError as error:
except KeyError:
exit("Key not found")
if (os.name == "nt"):
os.system("mp4decrypt --key 1:{} \"{}\" \"{}\"".format(
code = os.system("mp4decrypt --key 1:{0} \"{1}\" \"{2}\"".format(
key,
os.path.join(lecture_working_dir,
"encrypted_{}.mp4".format(filename)),
"encrypted_{0}.mp4".format(filename)),
os.path.join(lecture_working_dir,
"decrypted{}.mp4".format(filename))))
"decrypted_{0}.mp4".format(filename))))
else:
os.system("nice -n 7 mp4decrypt --key 1:{} \"{}\" \"{}\"".format(
os.system("nice -n 7 mp4decrypt --key 1:{0} \"{1}\" \"{2}\"".format(
key,
os.path.join(lecture_working_dir,
"encrypted_{}.mp4".format(filename)),
"encrypted_{0}.mp4".format(filename)),
os.path.join(lecture_working_dir,
"decrypted{}.mp4".format(filename))))
"decrypted_{0}.mp4".format(filename))))
def handle_irregular_segments(media_info, video_title, lecture_working_dir,
output_path):
def handle_segments(media_info, video_title, lecture_working_dir, output_path):
"""
@author Jayapraveen
"""
no_segment, video_url, video_init, video_extension, no_segment, audio_url, audio_init, audio_extension = media_info
no_segment += 10 # because the download_media function relies on hittin a 404 to know when to finish
download_media("video_0.seg.mp4", video_init, lecture_working_dir)
video_kid = extract_kid(
os.path.join(lecture_working_dir, "video_0.seg.mp4"))
@ -179,7 +180,7 @@ def handle_irregular_segments(media_info, video_title, lecture_working_dir,
audio_kid = extract_kid(
os.path.join(lecture_working_dir, "audio_0.seg.mp4"))
print("KID for audio file is: " + audio_kid)
for count in range(1, no_segment):
for count in range(1, no_segment + 4):
video_segment_url = video_url.replace("$Number$", str(count))
audio_segment_url = audio_url.replace("$Number$", str(count))
video_status = download_media(
@ -214,73 +215,138 @@ def handle_irregular_segments(media_info, video_title, lecture_working_dir,
decrypt(audio_kid, "audio", lecture_working_dir)
os.chdir(home_dir)
mux_process(video_title, lecture_working_dir, output_path)
cleanup(lecture_working_dir)
break
def manifest_parser(mpd_url):
def handle_segments_threaded(media_info, video_title, lecture_working_dir,
output_path):
"""
@author Jayapraveen
"""
no_segment, video_url, video_init, video_extension, no_segment, audio_url, audio_init, audio_extension = media_info
download_media("video_0.seg.mp4", video_init, lecture_working_dir)
video_kid = extract_kid(
os.path.join(lecture_working_dir, "video_0.seg.mp4"))
print("KID for video file is: " + video_kid)
download_media("audio_0.seg.mp4", audio_init, lecture_working_dir)
audio_kid = extract_kid(
os.path.join(lecture_working_dir, "audio_0.seg.mp4"))
print("KID for audio file is: " + audio_kid)
vbar = tqdm(total=no_segment,
initial=1,
unit='Video Segments',
desc=video_title + " (Video)")
abar = tqdm(total=no_segment,
initial=1,
unit='Audio Segments',
desc=video_title + " (Audio)")
threads = []
for count in range(1, no_segment):
video_filename = f"video_{str(count)}.seg.{video_extension}"
video_path = os.path.join(lecture_working_dir, video_filename)
video_segment_url = video_url.replace("$Number$", str(count))
video = downloader.get_file(video_segment_url, video_path,
video_filename, vbar)
threads.append(video)
for count in range(1, no_segment):
audio_filename = f"audio_{str(count)}.seg.{audio_extension}"
audio_path = os.path.join(lecture_working_dir, audio_filename)
audio_segment_url = audio_url.replace("$Number$", str(count))
audio = downloader.get_file(audio_segment_url, audio_path,
audio_filename, abar)
threads.append(audio)
for x in threads:
x.join()
os.chdir(lecture_working_dir)
if os.name == "nt":
video_concat_command = "copy /b " + "+".join(
[f"video_{i}.seg.{video_extension}"
for i in range(0, count)]) + " encrypted_video.mp4"
audio_concat_command = "copy /b " + "+".join(
[f"audio_{i}.seg.{audio_extension}"
for i in range(0, count)]) + " encrypted_audio.mp4"
else:
video_concat_command = "cat " + " ".join(
[f"video_{i}.seg.{video_extension}"
for i in range(0, count)]) + " > encrypted_video.mp4"
audio_concat_command = "cat " + " ".join(
[f"audio_{i}.seg.{audio_extension}"
for i in range(0, count)]) + " > encrypted_audio.mp4"
os.system(video_concat_command)
os.system(audio_concat_command)
decrypt(video_kid, "video", lecture_working_dir)
decrypt(audio_kid, "audio", lecture_working_dir)
os.chdir(home_dir)
mux_process(video_title, lecture_working_dir, output_path)
cleanup(lecture_working_dir)
def manifest_parser(mpd_url, quality):
"""
@author Jayapraveen
"""
video = []
audio = []
manifest = requests.get(mpd_url).text
with open("manifest.mpd", 'w') as manifest_handler:
manifest_handler.write(manifest)
mpd = MPEGDASHParser.parse("./manifest.mpd")
running_time = durationtoseconds(mpd.media_presentation_duration)
mpd = MPEGDASHParser.parse(mpd_url)
for period in mpd.periods:
for adapt_set in period.adaptation_sets:
print("Processing " + adapt_set.mime_type)
content_type = adapt_set.mime_type
if quality and content_type == "video/mp4":
print(adapt_set.representations[0].height, quality)
repr = next((x for x in adapt_set.representations
if x.height == quality), None)
if not repr:
qualities = []
for rep in adapt_set.representations:
qualities.append(rep.height)
print(quality, qualities)
if quality < qualities[0]:
# they want a lower quality than whats available
repr = adapt_set.representations[0] # Lowest Quality
elif quality > qualities[-1]:
# they want a higher quality than whats available
repr = adapt_set.representations[-1] # Max Quality
print(
"> Could not find video with requested quality, falling back to closest!"
)
print("> Using quality of %s" % repr.height)
if content_type == "video/mp4":
if quality:
repr = next((x for x in adapt_set.representations
if x.height == quality), None)
if not repr:
qualities = []
for rep in adapt_set.representations:
qualities.append(rep.height)
if quality < qualities[0]:
# they want a lower quality than whats available
repr = adapt_set.representations[
0] # Lowest Quality
elif quality > qualities[-1]:
# they want a higher quality than whats available
repr = adapt_set.representations[-1] # Max Quality
print(
"> Could not find video with requested quality, falling back to closest!"
)
print("> Using quality of %s" % repr.height)
else:
print("> Found MPD representation with quality %s" %
repr.height)
else:
print("> Found MPD representation with quality %s" %
repr.height)
else:
repr = adapt_set.representations[-1] # Max Quality
print("> Using max quality of %s" % repr.height)
for segment in repr.segment_templates:
if (segment.duration):
print("Media segments are of equal timeframe")
segment_time = segment.duration / segment.timescale
total_segments = running_time / segment_time
else:
print("Media segments are of inequal timeframe")
repr = adapt_set.representations[-1] # Max Quality
print("> Using max quality of %s" % repr.height)
segment_count = 0
segment = repr.segment_templates[0]
timeline = segment.segment_timelines[0]
segment_count += len(timeline.Ss)
for s in timeline.Ss:
if s.r:
segment_count += s.r
print("Expected No of segments:", segment_count)
if (content_type == "audio/mp4"):
segment_extension = segment.media.split(".")[-1]
audio.append(segment_count)
audio.append(segment.media)
audio.append(segment.initialization)
audio.append(segment_extension)
elif (content_type == "video/mp4"):
segment_extension = segment.media.split(".")[-1]
video.append(segment_count)
video.append(segment.media)
video.append(segment.initialization)
video.append(segment_extension)
approx_no_segments = round(
running_time /
6) + 10 # aproximate of 6 sec per segment
print("Expected No of segments:", approx_no_segments)
if (content_type == "audio/mp4"):
segment_extension = segment.media.split(".")[-1]
audio.append(approx_no_segments)
audio.append(segment.media)
audio.append(segment.initialization)
audio.append(segment_extension)
elif (content_type == "video/mp4"):
segment_extension = segment.media.split(".")[-1]
video.append(approx_no_segments)
video.append(segment.media)
video.append(segment.initialization)
video.append(segment_extension)
return video + audio
@ -316,6 +382,8 @@ def process_caption(caption,
lecture_index,
lecture_title,
lecture_dir,
use_threaded_downloader,
threads,
tries=0):
filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(lecture_title),
caption.get("locale_id"), caption.get("ext"))
@ -328,7 +396,12 @@ def process_caption(caption,
else:
print(f"> Downloading captions: '%s'" % filename)
try:
download(caption.get("url"), filepath, filename)
if use_threaded_downloader:
thread = downloader.get_file(caption.get("url"), filepath,
filename)
thread.join()
else:
download(caption.get("url"), filepath, filename)
except Exception as e:
if tries >= 3:
print(
@ -340,7 +413,8 @@ def process_caption(caption,
f"> Error downloading captions: {e}. Will retry {3-tries} more times."
)
process_caption(caption, lecture_index, lecture_title,
lecture_dir, tries + 1)
lecture_dir, use_threaded_downloader, threads,
tries + 1)
if caption.get("ext") == "vtt":
try:
print("> Converting captions to SRT format...")
@ -352,7 +426,8 @@ def process_caption(caption,
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
skip_lectures, dl_assets, dl_captions, caption_locale):
skip_lectures, dl_assets, dl_captions, caption_locale,
use_threaded_downloader):
lecture_title = lecture["title"]
lecture_asset = lecture["asset"]
if not skip_lectures:
@ -371,7 +446,12 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
if not os.path.isfile(lecture_path):
try:
download(lecture_url, lecture_path, lecture_title)
if use_threaded_downloader:
thread = downloader.get_file(lecture_url, lecture_path,
lecture_title)
thread.join()
else:
download(lecture_url, lecture_path, lecture_title)
except Exception as e:
# We could add a retry here
print(f"> Error downloading lecture: {e}. Skipping...")
@ -396,10 +476,13 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
"> Couldn't find dash url for lecture '%s', skipping...",
lecture_title)
return
media_info = manifest_parser(mpd_url)
handle_irregular_segments(media_info, lecture_title,
lecture_working_dir, lecture_path)
cleanup(lecture_working_dir)
media_info = manifest_parser(mpd_url, quality)
if use_threaded_downloader:
handle_segments_threaded(media_info, lecture_title,
lecture_working_dir, lecture_path)
else:
handle_segments(media_info, lecture_title,
lecture_working_dir, lecture_path)
else:
print("> Lecture '%s' is already downloaded, skipping..." %
lecture_title)
@ -418,9 +501,16 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
if x["label"] == "download"), None)
if download_url:
try:
download(download_url,
os.path.join(lecture_dir, asset_filename),
asset_filename)
if use_threaded_downloader:
thread = downloader.get_file(
download_url,
os.path.join(lecture_dir, asset_filename),
asset_filename)
thread.join()
else:
download(download_url,
os.path.join(lecture_dir, asset_filename),
asset_filename)
except Exception as e:
print(
f"> Error downloading lecture asset: {e}. Skipping"
@ -472,11 +562,12 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
})
for caption in captions:
process_caption(caption, lecture_index, lecture_title, lecture_dir)
process_caption(caption, lecture_index, lecture_title, lecture_dir,
use_threaded_downloader)
def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
quality, caption_locale):
quality, caption_locale, use_threaded_downloader):
course_dir = os.path.join(download_dir, course_name)
if not os.path.exists(course_dir):
os.mkdir(course_dir)
@ -498,9 +589,18 @@ def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
lecture_path = os.path.join(
course_dir, "{}. {}.mp4".format(lecture_index,
sanitize(obj["title"])))
process_lecture(obj, lecture_index, lecture_path, download_dir,
quality, skip_lectures, dl_assets, dl_captions,
caption_locale)
process_lecture(
obj,
lecture_index,
lecture_path,
download_dir,
quality,
skip_lectures,
dl_assets,
dl_captions,
caption_locale,
use_threaded_downloader,
)
for chapter in chapters:
chapter_dir = os.path.join(
@ -516,7 +616,7 @@ def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
sanitize(lecture["title"])))
process_lecture(lecture, lecture_index, lecture_path, chapter_dir,
quality, skip_lectures, dl_assets, dl_captions,
caption_locale)
caption_locale, use_threaded_downloader)
print("\n\n\n\n\n\n\n\n=====================")
print("All downloads completed for course!")
print("=====================")
@ -571,13 +671,6 @@ if __name__ == "__main__":
type=str,
help="The Bearer token to use",
)
parser.add_argument(
"-d",
"--debug",
dest="debug",
action="store_true",
help="Use test_data.json rather than fetch from the udemy api.",
)
parser.add_argument(
"-q",
"--quality",
@ -585,6 +678,14 @@ if __name__ == "__main__":
type=int,
help="Download specific video quality. (144, 360, 480, 720, 1080)",
)
parser.add_argument(
"-t",
"--threads",
dest="threads",
type=int,
help=
"Max number of threads to use when using the threaded downloader (default 10)",
)
parser.add_argument(
"-l",
"--lang",
@ -610,6 +711,19 @@ if __name__ == "__main__":
action="store_true",
help="If specified, captions will be downloaded.",
)
parser.add_argument(
"--use-threaded-downloader",
dest="use_threaded_downloader",
action="store_true",
help="If specified, the experimental threaded downloader will be used",
)
parser.add_argument(
"-d",
"--debug",
dest="debug",
action="store_true",
help="Use test_data.json rather than fetch from the udemy api.",
)
dl_assets = False
skip_lectures = False
@ -619,6 +733,8 @@ if __name__ == "__main__":
bearer_token = None
portal_name = None
course_name = None
use_threaded_downloader = False
threads = 10
args = parser.parse_args()
if args.download_assets:
@ -635,6 +751,11 @@ if __name__ == "__main__":
sys.exit(1)
else:
quality = args.quality
if args.use_threaded_downloader:
use_threaded_downloader = args.use_threaded_downloader
if args.threads:
threads = args.threads
downloader = FileDownloader(max_threads=threads)
load_dotenv()
if args.bearer_token:
@ -693,7 +814,8 @@ if __name__ == "__main__":
course_data = json.loads(f.read())
parse(course_data["results"], course_id, course_name,
skip_lectures, dl_assets, dl_captions, quality,
caption_locale)
caption_locale, use_threaded_downloader)
else:
parse(course_data["results"], course_id, course_name, skip_lectures,
dl_assets, dl_captions, quality, caption_locale)
dl_assets, dl_captions, quality, caption_locale,
use_threaded_downloader)