+ Update sanitization regex to clean exclamation points
+ Update _sanitize method to use _clean method
+ Moved the lecture-file existence check into the parse method; it now also checks whether HTML files exist
+ Update chapter folder titles to use " - " as the separator between the chapter number and the chapter name
This commit is contained in:
Puyodead1 2021-08-08 14:03:24 -04:00
parent 6137e44d76
commit ca40ff2b6d

88
main.py
View File

@ -42,15 +42,15 @@ COLLECTION_URL = "https://{portal_name}.udemy.com/api-2.0/users/me/subscribed-co
def _clean(text): def _clean(text):
ok = re.compile(r'[^\\/:*?"<>|]') ok = re.compile(r'[^\\/:*?!"<>|]')
text = "".join(x if ok.match(x) else "_" for x in text) text = "".join(x if ok.match(x) else "_" for x in text)
text = re.sub(r"\.+$", "", text.strip()) text = re.sub(r"\.+$", "", text.strip())
return text return text
def _sanitize(self, unsafetext): def _sanitize(self, unsafetext):
text = sanitize( text = _clean(sanitize(
slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")) slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")))
return text return text
@ -1028,22 +1028,17 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to
if is_encrypted: if is_encrypted:
if len(lecture_sources) > 0: if len(lecture_sources) > 0:
if not os.path.isfile(lecture_path): source = lecture_sources[-1] # last index is the best quality
source = lecture_sources[-1] # last index is the best quality if isinstance(quality, int):
if isinstance(quality, int): source = min(
source = min( lecture_sources,
lecture_sources, key=lambda x: abs(int(x.get("height")) - quality))
key=lambda x: abs(int(x.get("height")) - quality)) print(f" > Lecture '%s' has DRM, attempting to download" %
print(f" > Lecture '%s' has DRM, attempting to download" % lecture_title)
lecture_title) handle_segments(source.get("download_url"),
handle_segments(source.get("download_url"), source.get(
source.get( "format_id"), lecture_title, lecture_path, lecture_file_name,
"format_id"), lecture_title, lecture_path, lecture_file_name, concurrent_connections, chapter_dir)
concurrent_connections, chapter_dir)
else:
print(
" > Lecture '%s' is already downloaded, skipping..." %
lecture_title)
else: else:
print(f" > Lecture '%s' is missing media links" % print(f" > Lecture '%s' is missing media links" %
lecture_title) lecture_title)
@ -1117,31 +1112,44 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
for lecture in chapter.get("lectures"): for lecture in chapter.get("lectures"):
lecture_title = lecture.get("lecture_title") lecture_title = lecture.get("lecture_title")
lecture_index = lecture.get("lecture_index") lecture_index = lecture.get("lecture_index")
lecture_extension = lecture.get("extension")
extension = "mp4" # video lectures dont have an extension property, so we assume its mp4
if lecture_extension != None:
# if the lecture extension property isnt none, set the extension to the lecture extension
extension = lecture_extension
lecture_file_name = sanitize(lecture_title + "." + extension)
lecture_path = os.path.join(
chapter_dir,
lecture_file_name)
extension = lecture.get("extension")
print( print(
f" > Processing lecture {lecture_index} of {total_lectures}") f" > Processing lecture {lecture_index} of {total_lectures}")
if not skip_lectures: if not skip_lectures:
if extension == "html": print(lecture_file_name)
html_content = lecture.get("html_content").encode( # Check if the lecture is already downloaded
"ascii", "ignore").decode("utf8") if os.path.isfile(lecture_path):
lecture_path = os.path.join( print(
chapter_dir, "{}.html".format(sanitize(lecture_title))) " > Lecture '%s' is already downloaded, skipping..." %
try: lecture_title)
with open(lecture_path, 'w') as f: continue
f.write(html_content)
f.close()
except Exception as e:
print(" > Failed to write html file: ", e)
continue
else: else:
lecture_file_name = sanitize(lecture_title + ".mp4") # Check if the file is an html file
lecture_path = os.path.join( if extension == "html":
chapter_dir, html_content = lecture.get("html_content").encode(
lecture_file_name) "ascii", "ignore").decode("utf8")
process_lecture(lecture, lecture_path, lecture_file_name, lecture_path = os.path.join(
quality, access_token, chapter_dir, "{}.html".format(sanitize(lecture_title)))
concurrent_connections, chapter_dir) try:
with open(lecture_path, 'w') as f:
f.write(html_content)
f.close()
except Exception as e:
print(" > Failed to write html file: ", e)
continue
else:
process_lecture(lecture, lecture_path, lecture_file_name,
quality, access_token,
concurrent_connections, chapter_dir)
if dl_assets: if dl_assets:
assets = lecture.get("assets") assets = lecture.get("assets")
@ -1521,7 +1529,7 @@ if __name__ == "__main__":
lecture_counter = 0 lecture_counter = 0
lectures = [] lectures = []
chapter_index = entry.get("object_index") chapter_index = entry.get("object_index")
chapter_title = "{0:02d} ".format(chapter_index) + _clean( chapter_title = "{0:02d} - ".format(chapter_index) + _clean(
entry.get("title")) entry.get("title"))
if chapter_title not in _udemy["chapters"]: if chapter_title not in _udemy["chapters"]: