update manifest downloading

2025-05-04 03:44:26 +02:00 · 2023-05-30 18:08:47 -04:00 · 2023-05-30 18:08:47 -04:00 · 06e295d2b6
commit 06e295d2b6
parent 340d4c6786
2 changed files with 55 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -128,4 +128,5 @@ cookies.txt
 selenium_test.py
 selenium_data/
 config.dev.toml
-temp/*.exe
+temp/
 *.exe
--- a/main.py
+++ b/main.py
@@ -483,24 +483,52 @@ class Udemy:
    def _extract_m3u8(self, url):
        """extracts m3u8 streams"""
        asset_id_re = re.compile(r"assets/(?P<id>\d+)/")
        _temp = []
        # get temp folder
        temp_path = Path(Path.cwd(), "temp")
        # ensure the folder exists
        temp_path.mkdir(parents=True, exist_ok=True)
        # # extract the asset id from the url
        asset_id = asset_id_re.search(url).group("id")
        m3u8_path = Path(temp_path, f"index_{asset_id}.m3u8")
        try:
-            resp = self.session._get(url)
+            r = self.session._get(url)
-            resp.raise_for_status()
+            r.raise_for_status()
-            raw_data = resp.text
+            raw_data = r.text
            # write to temp file for later
            with open(m3u8_path, "w") as f:
                f.write(r.text)
            m3u8_object = m3u8.loads(raw_data)
            playlists = m3u8_object.playlists
            seen = set()
            for pl in playlists:
                resolution = pl.stream_info.resolution
                codecs = pl.stream_info.codecs
                if not resolution:
                    continue
                if not codecs:
                    continue
                width, height = resolution
-                download_url = pl.uri
+                
-                if height not in seen:
+                if height in seen: continue
                # we need to save the individual playlists to disk also
                playlist_path = Path(temp_path, f"index_{asset_id}_{width}x{height}.m3u8")
                with open(playlist_path, "w") as f:
                    r = self.session._get(pl.uri)
                    r.raise_for_status()
                    f.write(r.text)
                seen.add(height)
                _temp.append(
                    {
@@ -508,7 +536,7 @@ class Udemy:
                        "height": height,
                        "width": width,
                        "extension": "mp4",
-                            "download_url": download_url,
+                        "download_url": playlist_path.as_uri(),
                    }
                )
        except Exception as error:
@@ -517,8 +545,7 @@ class Udemy:
    def _extract_mpd(self, url):
        """extracts mpd streams"""
-
+        asset_id_re = re.compile(r"assets/(?P<id>\d+)/")
        asset_id_re = re.compile(r"assets/(?P<id>\d+)/files")
        _temp = []
        # get temp folder
@@ -536,6 +563,7 @@ class Udemy:
        try:
            with open(mpd_path, "wb") as f:
                r = self.session._get(url)
                r.raise_for_status()
                f.write(r.content)
            ytdl = yt_dlp.YoutubeDL({"quiet": True, "no_warnings": True, "allow_unplayable_formats": True, "enable_file_urls": True})
@@ -1168,6 +1196,7 @@ def handle_segments(url, format_id, video_title, output_path, lecture_file_name,
    logger.info("> Downloading Lecture Tracks...")
    args = [
        "yt-dlp",
        "--enable-file-urls",
        "--force-generic-extractor",
        "--allow-unplayable-formats",
        "--concurrent-fragments",
@@ -1237,6 +1266,12 @@ def handle_segments(url, format_id, video_title, output_path, lecture_file_name,
        logger.exception(f"Error: ")
    finally:
        os.chdir(HOME_DIR)
        # if the url is a file url, we need to remove the file after we're done with it
        if url.startswith("file://"):
            try:
                os.unlink(url[7:])
            except:
                pass
 def check_for_aria():
@@ -1372,7 +1407,7 @@ def process_lecture(lecture, lecture_path, lecture_file_name, chapter_dir):
                    source_type = source.get("type")
                    if source_type == "hls":
                        temp_filepath = lecture_path.replace(".mp4", ".%(ext)s")
-                        cmd = ["yt-dlp", "--force-generic-extractor", "--concurrent-fragments", f"{concurrent_downloads}", "--downloader", "aria2c", "-o", f"{temp_filepath}", f"{url}"]
+                        cmd = ["yt-dlp",  "--enable-file-urls", "--force-generic-extractor", "--concurrent-fragments", f"{concurrent_downloads}", "--downloader", "aria2c", "-o", f"{temp_filepath}", f"{url}"]
                        if disable_ipv6:
                            cmd.append("--downloader-args")
                            cmd.append('aria2c:"--disable-ipv6"')