Updates

+ Update sanitization regex to clean exclamation points + Update _sanitize method to use _clean method + Move lecture file existence check to parse method; this now also checks whether html files exist + Update chapter folder titles to use - for separating number from name
2025-05-01 00:14:25 +02:00 · 2021-08-08 14:03:24 -04:00 · 2021-08-08 14:03:24 -04:00 · ca40ff2b6d
commit ca40ff2b6d
parent 6137e44d76
1 changed files with 48 additions and 40 deletions
--- a/main.py
+++ b/main.py
@ -42,15 +42,15 @@ COLLECTION_URL = "https://{portal_name}.udemy.com/api-2.0/users/me/subscribed-co
 def _clean(text):
-    ok = re.compile(r'[^\\/:*?"<>|]')
+    ok = re.compile(r'[^\\/:*?!"<>|]')
    text = "".join(x if ok.match(x) else "_" for x in text)
    text = re.sub(r"\.+$", "", text.strip())
    return text
 def _sanitize(self, unsafetext):
-    text = sanitize(
+    text = _clean(sanitize(
-        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]"))
+        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")))
    return text
@ -1028,7 +1028,6 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to
    if is_encrypted:
        if len(lecture_sources) > 0:
            if not os.path.isfile(lecture_path):
            source = lecture_sources[-1]  # last index is the best quality
            if isinstance(quality, int):
                source = min(
@ -1040,10 +1039,6 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to
                            source.get(
                                "format_id"), lecture_title, lecture_path, lecture_file_name,
                            concurrent_connections, chapter_dir)
            else:
                print(
                    "      > Lecture '%s' is already downloaded, skipping..." %
                    lecture_title)
        else:
            print(f"      > Lecture '%s' is missing media links" %
                  lecture_title)
@ -1117,11 +1112,28 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
        for lecture in chapter.get("lectures"):
            lecture_title = lecture.get("lecture_title")
            lecture_index = lecture.get("lecture_index")
            lecture_extension = lecture.get("extension")
            extension = "mp4"  # video lectures dont have an extension property, so we assume its mp4
            if lecture_extension != None:
                # if the lecture extension property isnt none, set the extension to the lecture extension
                extension = lecture_extension
            lecture_file_name = sanitize(lecture_title + "." + extension)
            lecture_path = os.path.join(
                chapter_dir,
                lecture_file_name)
            extension = lecture.get("extension")
            print(
                f"  > Processing lecture {lecture_index} of {total_lectures}")
            if not skip_lectures:
                print(lecture_file_name)
                # Check if the lecture is already downloaded
                if os.path.isfile(lecture_path):
                    print(
                        "      > Lecture '%s' is already downloaded, skipping..." %
                        lecture_title)
                    continue
                else:
                    # Check if the file is an html file
                    if extension == "html":
                        html_content = lecture.get("html_content").encode(
                            "ascii", "ignore").decode("utf8")
@ -1135,10 +1147,6 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
                            print("    > Failed to write html file: ", e)
                            continue
                    else:
                    lecture_file_name = sanitize(lecture_title + ".mp4")
                    lecture_path = os.path.join(
                        chapter_dir,
                        lecture_file_name)
                        process_lecture(lecture, lecture_path, lecture_file_name,
                                        quality, access_token,
                                        concurrent_connections, chapter_dir)
@ -1521,7 +1529,7 @@ if __name__ == "__main__":
                    lecture_counter = 0
                    lectures = []
                    chapter_index = entry.get("object_index")
-                    chapter_title = "{0:02d} ".format(chapter_index) + _clean(
+                    chapter_title = "{0:02d} - ".format(chapter_index) + _clean(
                        entry.get("title"))
                    if chapter_title not in _udemy["chapters"]: