From ca40ff2b6d06e9e2c0c0aaed2185fbb282fea077 Mon Sep 17 00:00:00 2001 From: Puyodead1 <23562356riley@gmail.com> Date: Sun, 8 Aug 2021 14:03:24 -0400 Subject: [PATCH] Updates + Update sanitization regex to clean exclaimation points + Update _sanitize method to use _clean method + Moved lecture file exist check to parse method, this will now also check if html files exist + Update chapter folder titles to use - for separating number from name --- main.py | 88 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/main.py b/main.py index 7c76cbb..51f043a 100644 --- a/main.py +++ b/main.py @@ -42,15 +42,15 @@ COLLECTION_URL = "https://{portal_name}.udemy.com/api-2.0/users/me/subscribed-co def _clean(text): - ok = re.compile(r'[^\\/:*?"<>|]') + ok = re.compile(r'[^\\/:*?!"<>|]') text = "".join(x if ok.match(x) else "_" for x in text) text = re.sub(r"\.+$", "", text.strip()) return text def _sanitize(self, unsafetext): - text = sanitize( - slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")) + text = _clean(sanitize( + slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]"))) return text @@ -1028,22 +1028,17 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to if is_encrypted: if len(lecture_sources) > 0: - if not os.path.isfile(lecture_path): - source = lecture_sources[-1] # last index is the best quality - if isinstance(quality, int): - source = min( - lecture_sources, - key=lambda x: abs(int(x.get("height")) - quality)) - print(f" > Lecture '%s' has DRM, attempting to download" % - lecture_title) - handle_segments(source.get("download_url"), - source.get( - "format_id"), lecture_title, lecture_path, lecture_file_name, - concurrent_connections, chapter_dir) - else: - print( - " > Lecture '%s' is already downloaded, skipping..." % - lecture_title) + source = lecture_sources[-1] # last index is the best quality + if isinstance(quality, int): + source = min( + lecture_sources, + key=lambda x: abs(int(x.get("height")) - quality)) + print(f" > Lecture '%s' has DRM, attempting to download" % + lecture_title) + handle_segments(source.get("download_url"), + source.get( + "format_id"), lecture_title, lecture_path, lecture_file_name, + concurrent_connections, chapter_dir) else: print(f" > Lecture '%s' is missing media links" % lecture_title) @@ -1117,31 +1112,44 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions, for lecture in chapter.get("lectures"): lecture_title = lecture.get("lecture_title") lecture_index = lecture.get("lecture_index") + lecture_extension = lecture.get("extension") + extension = "mp4" # video lectures dont have an extension property, so we assume its mp4 + if lecture_extension != None: + # if the lecture extension property isnt none, set the extension to the lecture extension + extension = lecture_extension + lecture_file_name = sanitize(lecture_title + "." + extension) + lecture_path = os.path.join( + chapter_dir, + lecture_file_name) - extension = lecture.get("extension") print( f" > Processing lecture {lecture_index} of {total_lectures}") if not skip_lectures: - if extension == "html": - html_content = lecture.get("html_content").encode( - "ascii", "ignore").decode("utf8") - lecture_path = os.path.join( - chapter_dir, "{}.html".format(sanitize(lecture_title))) - try: - with open(lecture_path, 'w') as f: - f.write(html_content) - f.close() - except Exception as e: - print(" > Failed to write html file: ", e) - continue + print(lecture_file_name) + # Check if the lecture is already downloaded + if os.path.isfile(lecture_path): + print( + " > Lecture '%s' is already downloaded, skipping..." % + lecture_title) + continue else: - lecture_file_name = sanitize(lecture_title + ".mp4") - lecture_path = os.path.join( - chapter_dir, - lecture_file_name) - process_lecture(lecture, lecture_path, lecture_file_name, - quality, access_token, - concurrent_connections, chapter_dir) + # Check if the file is an html file + if extension == "html": + html_content = lecture.get("html_content").encode( + "ascii", "ignore").decode("utf8") + lecture_path = os.path.join( + chapter_dir, "{}.html".format(sanitize(lecture_title))) + try: + with open(lecture_path, 'w') as f: + f.write(html_content) + f.close() + except Exception as e: + print(" > Failed to write html file: ", e) + continue + else: + process_lecture(lecture, lecture_path, lecture_file_name, + quality, access_token, + concurrent_connections, chapter_dir) if dl_assets: assets = lecture.get("assets") @@ -1521,7 +1529,7 @@ if __name__ == "__main__": lecture_counter = 0 lectures = [] chapter_index = entry.get("object_index") - chapter_title = "{0:02d} ".format(chapter_index) + _clean( + chapter_title = "{0:02d} - ".format(chapter_index) + _clean( entry.get("title")) if chapter_title not in _udemy["chapters"]: