Small bug fixes and removed experimental downloader

+ Fixed bug with video being downloaded in place of audio + Minor tweaks - Removed the threaded downloader from main code, the current download system is the most reliable.
2025-05-05 03:04:26 +02:00 · 2021-05-21 19:50:39 -04:00 · 2021-05-21 19:50:39 -04:00 · 9a1a318f93
commit 9a1a318f93
parent aab19bf66f
2 changed files with 149 additions and 231 deletions
--- a/downloader.py
+++ b/downloader.py
@ -1,120 +1,62 @@
-import os, threading, requests
+import asyncio, aiohttp, os
-from tqdm import tqdm
+import tqdm
 r_semaphore = asyncio.Semaphore(10)
-class FileDownloader():
+# get content and write it to file
-    """
+def write_to_file(filename, content):
-    @source: https://gist.github.com/stefanfortuin/9dbfe8618701507d0ef2b5515b165c5f
+    f = open(filename, 'wb')
-    """
+    f.write(content)
-    def __init__(self, max_threads=10):
+    f.close()
        print("> Threaded downloader using {} threads.".format(
            str(max_threads)))
        self.sema = threading.Semaphore(value=max_threads)
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
        }
        self.block_size = 1024
    def t_getfile(self, link, filepath, filename, bar, session):
        """ 
        Threaded function that uses a semaphore 
        to not instantiate too many threads 
        """
-        self.sema.acquire()
+async def get(*args, **kwargs):
    response = await aiohttp.request('GET', *args, **kwargs)
    return response
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
-        if not os.path.isfile(filepath):
+async def head(*args, **kwargs):
-            headers = requests.head(link).headers
+    response = await aiohttp.request('HEAD', *args, **kwargs)
-            if 'content-length' not in headers:
+    return response
                print(f"server doesn't support content-length for {link}")
                self.sema.release()
                return
            total_bytes = int(requests.head(link).headers['content-length'])
-            if not bar:
+async def download_file(url, filepath, filename):
-                bar = tqdm(total=total_bytes,
+    async with r_semaphore:
-                           initial=0,
+        if os.path.isfile(filepath):
-                           unit='B',
+            # already downloaded
-                           unit_scale=True,
+            pass
                           desc=filename)
            self.download_new_file(link, filename, filepath, total_bytes, bar,
                                   session)
        else:
-            current_bytes = os.stat(filepath).st_size
+            async with aiohttp.request("GET", url, chunked=True) as media:
-
+                media_length = int(media.headers.get("content-length"))
-            headers = requests.head(link).headers
+                if media.status == 200:
-            if 'content-length' not in headers:
+                    if os.path.isfile(filepath) and os.path.getsize(
-                print(f"server doesn't support content-length for {link}")
+                            filepath >= media_length):
-                self.sema.release()
+                        # already downloaded
-                return
+                        pass
            total_bytes = int(requests.head(link).headers['content-length'])
            if not bar:
                bar = tqdm(total=total_bytes,
                           initial=current_bytes,
                           unit='B',
                           unit_scale=True,
                           desc=filename)
            if current_bytes < total_bytes:
                self.continue_file_download(link, filename, filepath,
                                            current_bytes, total_bytes, bar)
                    else:
                # print(f"already done: {filename}")
                if bar.unit == "B":
                    bar.update(self.block_size)
                else:
                    bar.update(1)
        self.sema.release()
    def download_new_file(self, link, filename, filepath, total_bytes, bar,
                          session):
        if session == None:
                        try:
-                request = requests.get(link,
+                            with open(filepath, 'wb') as f:
-                                       headers=self.headers,
+                                async for chunk in media.content.iter_chunked(
-                                       timeout=30,
+                                        1024):
                                       stream=True)
                self.write_file(request, filepath, 'wb', bar)
            except requests.exceptions.RequestException as e:
                print(e)
        else:
            request = session.get(link, stream=True)
            self.write_file(request, filepath, 'wb', bar)
    def continue_file_download(self, link, filename, filepath, current_bytes,
                               total_bytes, bar):
        range_header = self.headers.copy()
        range_header['Range'] = f"bytes={current_bytes}-{total_bytes}"
        try:
            request = requests.get(link,
                                   headers=range_header,
                                   timeout=30,
                                   stream=True)
            self.write_file(request, filepath, 'ab', bar)
        except requests.exceptions.RequestException as e:
            print(e)
    def write_file(self, content, filepath, writemode, bar):
        with open(filepath, writemode) as f:
            for chunk in content.iter_content(chunk_size=self.block_size):
                                    if chunk:
                                        f.write(chunk)
                    if bar.unit == "B":
                        bar.update(self.block_size)
        # print(f"completed file {filepath}", end='\n')
                                f.close()
-        bar.update(1)
+                                # success
                        except Exception as e:
                            raise e
-    def get_file(self, link, path, filename, bar=None, session=None):
+                    if os.path.getsize(filepath) >= media_length:
-        """ Downloads the file"""
+                        pass
-        thread = threading.Thread(target=self.t_getfile,
+                    else:
-                                  args=(link, path, filename, bar, session))
+                        print("Segment is corrupt")
-        thread.start()
+                elif media.status == 404:
-        return thread
+                    print("404")
                else:
                    print("Error fetching segment")
@asyncio.coroutine
 def wait_with_progressbar(coros):
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        yield from f
--- a/main.py
+++ b/main.py
@ -1,11 +1,11 @@
-import os, requests, json, glob, argparse, sys, re
+import os, requests, json, glob, argparse, sys, re, time, asyncio
 from sanitize_filename import sanitize
 from tqdm import tqdm
 from dotenv import load_dotenv
 from mpegdash.parser import MPEGDASHParser
 from utils import extract_kid
 from vtt_to_srt import convert
-from downloader import FileDownloader
+from downloader import download_file, wait_with_progressbar
 download_dir = os.path.join(os.getcwd(), "out_dir")
 working_dir = os.path.join(os.getcwd(), "working_dir")
@ -125,7 +125,7 @@ def cleanup(path):
    os.removedirs(path)
-def mux_process(video_title, lecture_working_dir, outfile):
+def mux_process(video_title, lecture_working_dir, output_path):
    """
    @author Jayapraveen
    """
@ -133,12 +133,12 @@ def mux_process(video_title, lecture_working_dir, outfile):
        command = "ffmpeg -y -i \"{}\" -i \"{}\" -acodec copy -vcodec copy -fflags +bitexact -map_metadata -1 -metadata title=\"{}\" \"{}\"".format(
            os.path.join(lecture_working_dir, "decrypted_audio.mp4"),
            os.path.join(lecture_working_dir, "decrypted_video.mp4"),
-            video_title, outfile)
+            video_title, output_path)
    else:
        command = "nice -n 7 ffmpeg -y -i \"{}\" -i \"{}\" -acodec copy -vcodec copy -fflags +bitexact -map_metadata -1 -metadata title=\"{}\" \"{}\"".format(
            os.path.join(lecture_working_dir, "decrypted_audio.mp4"),
            os.path.join(lecture_working_dir, "decrypted_video.mp4"),
-            video_title, outfile)
+            video_title, output_path)
    os.system(command)
@ -146,24 +146,26 @@ def decrypt(kid, filename, lecture_working_dir):
    """
    @author Jayapraveen
    """
    print("> Decrypting, this might take a minute...")
    try:
        key = keyfile[kid.lower()]
    except KeyError:
        exit("Key not found")
    if (os.name == "nt"):
-        code = os.system("mp4decrypt --key 1:{0} \"{1}\" \"{2}\"".format(
+        code = os.system(f"mp4decrypt --key 1:%s \"%s\" \"%s\"" %
-            key,
+                         (key,
                          os.path.join(lecture_working_dir,
-                         "encrypted_{0}.mp4".format(filename)),
+                                       "encrypted_{}.mp4".format(filename)),
                          os.path.join(lecture_working_dir,
-                         "decrypted_{0}.mp4".format(filename))))
+                                       "decrypted_{}.mp4".format(filename))))
    else:
-        os.system("nice -n 7 mp4decrypt --key 1:{0} \"{1}\" \"{2}\"".format(
+        os.system(f"nice -n 7 mp4decrypt --key 1:%s \"%s\" \"%s\"" %
-            key,
+                  (key,
                   os.path.join(lecture_working_dir,
-                         "encrypted_{0}.mp4".format(filename)),
+                                "encrypted_{}.mp4".format(filename)),
                   os.path.join(lecture_working_dir,
-                         "decrypted_{0}.mp4".format(filename))))
+                                "decrypted_{}.mp4".format(filename))))
    print("> Decryption complete")
 def handle_segments(media_info, video_title, lecture_working_dir, output_path):
@ -219,7 +221,7 @@ def handle_segments(media_info, video_title, lecture_working_dir, output_path):
            break
-def handle_segments_threaded(media_info, video_title, lecture_working_dir,
+def handle_segments_async(media_info, video_title, lecture_working_dir,
                          output_path):
    """
    @author Jayapraveen
@ -234,35 +236,42 @@ def handle_segments_threaded(media_info, video_title, lecture_working_dir,
        os.path.join(lecture_working_dir, "audio_0.seg.mp4"))
    print("KID for audio file is: " + audio_kid)
-    vbar = tqdm(total=no_segment,
+    # bar = tqdm(total=(no_segment * 2) - 2,
-                initial=1,
+    #            initial=0,
-                unit='Video Segments',
+    #            unit='Segments',
-                desc=video_title + " (Video)")
+    #            desc=video_title)
    abar = tqdm(total=no_segment,
                initial=1,
                unit='Audio Segments',
                desc=video_title + " (Audio)")
-    threads = []
+    urls = []
    for count in range(1, no_segment):
        video_filename = f"video_{str(count)}.seg.{video_extension}"
        video_path = os.path.join(lecture_working_dir, video_filename)
        video_segment_url = video_url.replace("$Number$", str(count))
        video = downloader.get_file(video_segment_url, video_path,
                                    video_filename, vbar)
        threads.append(video)
    for count in range(1, no_segment):
        audio_filename = f"audio_{str(count)}.seg.{audio_extension}"
        video_path = os.path.join(lecture_working_dir, video_filename)
        audio_path = os.path.join(lecture_working_dir, audio_filename)
        video_segment_url = video_url.replace("$Number$", str(count))
        audio_segment_url = audio_url.replace("$Number$", str(count))
        audio = downloader.get_file(audio_segment_url, audio_path,
                                    audio_filename, abar)
        threads.append(audio)
-    for x in threads:
+        urls.append({
-        x.join()
+            "url": audio_segment_url,
            "filename": audio_filename,
            "path": audio_path
        })
        urls.append({
            "url": video_segment_url,
            "filename": video_filename,
            "path": video_path
        })
    start_time = time.time()
    eloop = asyncio.get_event_loop()
    coroutines = [
        eloop.create_task(
            download_file(obj.get("url"), obj.get("path"),
                          obj.get("filename"))) for obj in urls
    ]
    eloop.run_until_complete(wait_with_progressbar(coroutines))
    duration = time.time() - start_time
    print(f"Downloaded {len(urls)} files in {duration} seconds")
    os.chdir(lecture_working_dir)
    if os.name == "nt":
@ -324,15 +333,16 @@ def manifest_parser(mpd_url, quality):
                else:
                    repr = adapt_set.representations[-1]  # Max Quality
                    print("> Using max quality of %s" % repr.height)
-            segment_count = 0
+            elif content_type == "audio/mp4":
                repr = adapt_set.representations[-1]  # Best
-            segment = repr.segment_templates[0]
+            for segment in repr.segment_templates:
                segment_count = 1
                timeline = segment.segment_timelines[0]
                segment_count += len(timeline.Ss)
                for s in timeline.Ss:
                    if s.r:
                        segment_count += s.r
                print("Expected No of segments:", segment_count)
                if (content_type == "audio/mp4"):
                    segment_extension = segment.media.split(".")[-1]
@ -382,7 +392,6 @@ def process_caption(caption,
                    lecture_index,
                    lecture_title,
                    lecture_dir,
                    use_threaded_downloader,
                    threads,
                    tries=0):
    filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(lecture_title),
@ -396,11 +405,6 @@ def process_caption(caption,
    else:
        print(f"> Downloading captions: '%s'" % filename)
        try:
            if use_threaded_downloader:
                thread = downloader.get_file(caption.get("url"), filepath,
                                             filename)
                thread.join()
            else:
            download(caption.get("url"), filepath, filename)
        except Exception as e:
            if tries >= 3:
@ -413,8 +417,7 @@ def process_caption(caption,
                    f"> Error downloading captions: {e}. Will retry {3-tries} more times."
                )
                process_caption(caption, lecture_index, lecture_title,
-                                lecture_dir, use_threaded_downloader, threads,
+                                lecture_dir, tries + 1)
                                tries + 1)
        if caption.get("ext") == "vtt":
            try:
                print("> Converting captions to SRT format...")
@ -425,9 +428,17 @@ def process_caption(caption,
                print(f"> Error converting captions: {e}")
-def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
+def process_lecture(
-                    skip_lectures, dl_assets, dl_captions, caption_locale,
+    lecture,
-                    use_threaded_downloader):
+    lecture_index,
    lecture_path,
    lecture_dir,
    quality,
    skip_lectures,
    dl_assets,
    dl_captions,
    caption_locale,
 ):
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]
    if not skip_lectures:
@ -446,11 +457,6 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
            if not os.path.isfile(lecture_path):
                try:
                    if use_threaded_downloader:
                        thread = downloader.get_file(lecture_url, lecture_path,
                                                     lecture_title)
                        thread.join()
                    else:
                    download(lecture_url, lecture_path, lecture_title)
                except Exception as e:
                    # We could add a retry here
@ -477,12 +483,8 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
                        lecture_title)
                    return
                media_info = manifest_parser(mpd_url, quality)
-                if use_threaded_downloader:
+                handle_segments(media_info, lecture_title, lecture_working_dir,
-                    handle_segments_threaded(media_info, lecture_title,
+                                lecture_path)
                                             lecture_working_dir, lecture_path)
                else:
                    handle_segments(media_info, lecture_title,
                                    lecture_working_dir, lecture_path)
            else:
                print("> Lecture '%s' is already downloaded, skipping..." %
                      lecture_title)
@ -501,13 +503,6 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
                                     if x["label"] == "download"), None)
                if download_url:
                    try:
                        if use_threaded_downloader:
                            thread = downloader.get_file(
                                download_url,
                                os.path.join(lecture_dir, asset_filename),
                                asset_filename)
                            thread.join()
                        else:
                        download(download_url,
                                 os.path.join(lecture_dir, asset_filename),
                                 asset_filename)
@ -562,12 +557,16 @@ def process_lecture(lecture, lecture_index, lecture_path, lecture_dir, quality,
                })
        for caption in captions:
-            process_caption(caption, lecture_index, lecture_title, lecture_dir,
+            process_caption(
-                            use_threaded_downloader)
+                caption,
                lecture_index,
                lecture_title,
                lecture_dir,
            )
 def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
-          quality, caption_locale, use_threaded_downloader):
+          quality, caption_locale):
    course_dir = os.path.join(download_dir, course_name)
    if not os.path.exists(course_dir):
        os.mkdir(course_dir)
@ -599,7 +598,6 @@ def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
                    dl_assets,
                    dl_captions,
                    caption_locale,
                    use_threaded_downloader,
                )
    for chapter in chapters:
@ -616,7 +614,7 @@ def parse(data, course_id, course_name, skip_lectures, dl_assets, dl_captions,
                                                 sanitize(lecture["title"])))
            process_lecture(lecture, lecture_index, lecture_path, chapter_dir,
                            quality, skip_lectures, dl_assets, dl_captions,
-                            caption_locale, use_threaded_downloader)
+                            caption_locale)
    print("\n\n\n\n\n\n\n\n=====================")
    print("All downloads completed for course!")
    print("=====================")
@ -678,14 +676,6 @@ if __name__ == "__main__":
        type=int,
        help="Download specific video quality. (144, 360, 480, 720, 1080)",
    )
    parser.add_argument(
        "-t",
        "--threads",
        dest="threads",
        type=int,
        help=
        "Max number of threads to use when using the threaded downloader (default 10)",
    )
    parser.add_argument(
        "-l",
        "--lang",
@ -711,12 +701,6 @@ if __name__ == "__main__":
        action="store_true",
        help="If specified, captions will be downloaded.",
    )
    parser.add_argument(
        "--use-threaded-downloader",
        dest="use_threaded_downloader",
        action="store_true",
        help="If specified, the experimental threaded downloader will be used",
    )
    parser.add_argument(
        "-d",
        "--debug",
@ -733,8 +717,6 @@ if __name__ == "__main__":
    bearer_token = None
    portal_name = None
    course_name = None
    use_threaded_downloader = False
    threads = 10
    args = parser.parse_args()
    if args.download_assets:
@ -751,11 +733,6 @@ if __name__ == "__main__":
            sys.exit(1)
        else:
            quality = args.quality
    if args.use_threaded_downloader:
        use_threaded_downloader = args.use_threaded_downloader
    if args.threads:
        threads = args.threads
    downloader = FileDownloader(max_threads=threads)
    load_dotenv()
    if args.bearer_token:
@ -814,8 +791,7 @@ if __name__ == "__main__":
            course_data = json.loads(f.read())
            parse(course_data["results"], course_id, course_name,
                  skip_lectures, dl_assets, dl_captions, quality,
-                  caption_locale, use_threaded_downloader)
+                  caption_locale)
    else:
        parse(course_data["results"], course_id, course_name, skip_lectures,
-              dl_assets, dl_captions, quality, caption_locale,
+              dl_assets, dl_captions, quality, caption_locale)
              use_threaded_downloader)