Updates

+ Update sanitization regex to clean exclaimation points + Update _sanitize method to use _clean method + Moved lecture file exist check to parse method, this will now also check if html files exist + Update chapter folder titles to use - for separating number from name
2025-04-30 01:44:25 +02:00 · 2021-08-08 14:03:24 -04:00 · 2021-08-08 14:03:24 -04:00 · ca40ff2b6d
commit ca40ff2b6d
parent 6137e44d76
1 changed files with 48 additions and 40 deletions
--- a/main.py
+++ b/main.py
@ -42,15 +42,15 @@ COLLECTION_URL = "https://{portal_name}.udemy.com/api-2.0/users/me/subscribed-co


 def _clean(text):
-    ok = re.compile(r'[^\\/:*?"<>|]')
+    ok = re.compile(r'[^\\/:*?!"<>|]')
    text = "".join(x if ok.match(x) else "_" for x in text)
    text = re.sub(r"\.+$", "", text.strip())
    return text


 def _sanitize(self, unsafetext):
-    text = sanitize(
-        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]"))
+    text = _clean(sanitize(
+        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")))
    return text


@ -1028,22 +1028,17 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to

    if is_encrypted:
        if len(lecture_sources) > 0:
-            if not os.path.isfile(lecture_path):
-                source = lecture_sources[-1]  # last index is the best quality
-                if isinstance(quality, int):
-                    source = min(
-                        lecture_sources,
-                        key=lambda x: abs(int(x.get("height")) - quality))
-                print(f"      > Lecture '%s' has DRM, attempting to download" %
-                      lecture_title)
-                handle_segments(source.get("download_url"),
-                                source.get(
-                                    "format_id"), lecture_title, lecture_path, lecture_file_name,
-                                concurrent_connections, chapter_dir)
-            else:
-                print(
-                    "      > Lecture '%s' is already downloaded, skipping..." %
-                    lecture_title)
+            source = lecture_sources[-1]  # last index is the best quality
+            if isinstance(quality, int):
+                source = min(
+                    lecture_sources,
+                    key=lambda x: abs(int(x.get("height")) - quality))
+            print(f"      > Lecture '%s' has DRM, attempting to download" %
+                  lecture_title)
+            handle_segments(source.get("download_url"),
+                            source.get(
+                                "format_id"), lecture_title, lecture_path, lecture_file_name,
+                            concurrent_connections, chapter_dir)
        else:
            print(f"      > Lecture '%s' is missing media links" %
                  lecture_title)
@ -1117,31 +1112,44 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
        for lecture in chapter.get("lectures"):
            lecture_title = lecture.get("lecture_title")
            lecture_index = lecture.get("lecture_index")
+            lecture_extension = lecture.get("extension")
+            extension = "mp4"  # video lectures dont have an extension property, so we assume its mp4
+            if lecture_extension != None:
+                # if the lecture extension property isnt none, set the extension to the lecture extension
+                extension = lecture_extension
+            lecture_file_name = sanitize(lecture_title + "." + extension)
+            lecture_path = os.path.join(
+                chapter_dir,
+                lecture_file_name)

-            extension = lecture.get("extension")
            print(
                f"  > Processing lecture {lecture_index} of {total_lectures}")
            if not skip_lectures:
-                if extension == "html":
-                    html_content = lecture.get("html_content").encode(
-                        "ascii", "ignore").decode("utf8")
-                    lecture_path = os.path.join(
-                        chapter_dir, "{}.html".format(sanitize(lecture_title)))
-                    try:
-                        with open(lecture_path, 'w') as f:
-                            f.write(html_content)
-                            f.close()
-                    except Exception as e:
-                        print("    > Failed to write html file: ", e)
-                        continue
+                print(lecture_file_name)
+                # Check if the lecture is already downloaded
+                if os.path.isfile(lecture_path):
+                    print(
+                        "      > Lecture '%s' is already downloaded, skipping..." %
+                        lecture_title)
+                    continue
                else:
-                    lecture_file_name = sanitize(lecture_title + ".mp4")
-                    lecture_path = os.path.join(
-                        chapter_dir,
-                        lecture_file_name)
-                    process_lecture(lecture, lecture_path, lecture_file_name,
-                                    quality, access_token,
-                                    concurrent_connections, chapter_dir)
+                    # Check if the file is an html file
+                    if extension == "html":
+                        html_content = lecture.get("html_content").encode(
+                            "ascii", "ignore").decode("utf8")
+                        lecture_path = os.path.join(
+                            chapter_dir, "{}.html".format(sanitize(lecture_title)))
+                        try:
+                            with open(lecture_path, 'w') as f:
+                                f.write(html_content)
+                                f.close()
+                        except Exception as e:
+                            print("    > Failed to write html file: ", e)
+                            continue
+                    else:
+                        process_lecture(lecture, lecture_path, lecture_file_name,
+                                        quality, access_token,
+                                        concurrent_connections, chapter_dir)

            if dl_assets:
                assets = lecture.get("assets")
@ -1521,7 +1529,7 @@ if __name__ == "__main__":
                    lecture_counter = 0
                    lectures = []
                    chapter_index = entry.get("object_index")
-                    chapter_title = "{0:02d} ".format(chapter_index) + _clean(
+                    chapter_title = "{0:02d} - ".format(chapter_index) + _clean(
                        entry.get("title"))

                    if chapter_title not in _udemy["chapters"]: