# udemy-downloader/main.py

import os,requests,shutil,json,glob,urllib.request
from sanitize_filename import sanitize
import urllib.request
from tqdm import tqdm
from dotenv import load_dotenv
from mpegdash.parser import MPEGDASHParser
from mpegdash.nodes import Descriptor
from mpegdash.utils import (
    parse_attr_value, parse_child_nodes, parse_node_value,
    write_attr_value, write_child_node, write_node_value
)
from utils import extract_kid

# Populate os.environ before the settings below are read.
load_dotenv()

course_id = os.getenv("UDEMY_COURSE_ID")  # the course id to download
# The bearer token can be found in the browser's network tab: it is a request
# header sent under Authorization / x-udemy-authorization.
bearer_token = os.getenv("UDEMY_BEARER")
if not bearer_token:
    # Fail with a readable message instead of a TypeError on the concatenation below.
    raise SystemExit("UDEMY_BEARER is not set; define it in the environment before running.")
header_bearer = "Bearer " + bearer_token

# Every location is anchored to the directory the script was started from.
home_dir = os.getcwd()
download_dir = os.path.join(home_dir, "out_dir")
working_dir = os.path.join(home_dir, "working_dir")  # folder that receives the segments of DRM videos
keyfile_path = os.path.join(home_dir, "keyfile.json")
retry = 3
debug = False  # keep this to False unless you can tell what its for

os.makedirs(working_dir, exist_ok=True)
os.makedirs(download_dir, exist_ok=True)

# Get the keys: `keyfile` ends up holding the parsed JSON content of keyfile.json.
with open(keyfile_path, 'r') as keyfile_handle:
    keyfile = json.load(keyfile_handle)

"""
@author Jayapraveen
"""
def durationtoseconds(period):
    """Convert a ``PTxDxHxMxS`` duration string into a number of seconds.

    Every component is optional and a missing one counts as zero. Only the
    seconds component may carry a decimal fraction, which is kept as written
    (``"PT5.05S"`` is 5.05 seconds, ``"PT1M30S"`` is 90.0 seconds).

    Args:
        period: Duration text such as ``"PT1H2M3.5S"``.

    Returns:
        The duration in seconds as a float, or ``None`` (after printing an
        error) when *period* is not a string starting with ``"PT"``.

    Raises:
        ValueError: If a component is present but is not a valid number.
    """
    if not isinstance(period, str) or period[:2] != "PT":
        print("Duration Format Error")
        return None

    # Peel the components off left to right; each unit letter ends its value.
    remainder = period[2:]
    components = {}
    for unit in "DHMS":
        value, marker, rest = remainder.partition(unit)
        if marker:
            components[unit] = value
            remainder = rest
        else:
            components[unit] = "0"

    day = int(components["D"])
    hour = int(components["H"])
    minute = int(components["M"])
    second = components["S"]
    print("Total time: " + str(day) + " days " + str(hour) + " hours " + str(minute) + " minutes and " + str(second) + " seconds")
    # Parse the seconds as one float so the fraction keeps its leading zeros
    # and whole seconds do not get a spurious fractional part.
    return float((day * 24 * 60 * 60) + (hour * 60 * 60) + (minute * 60)) + float(second)

def download_media(filename,url,lecture_working_dir,epoch = 0):
    """Download one media segment into *lecture_working_dir*.

    A segment that is already present in the directory is not fetched again.
    Failed attempts (connection errors, unexpected status codes, files shorter
    than the announced ``content-length``) are retried, and every retry path
    shares the same attempt limit.

    Args:
        filename: Name of the segment file to create inside lecture_working_dir.
        url: Address of the segment.
        lecture_working_dir: Directory that receives the segment.
        epoch: Number of retries already spent on this segment.

    Returns:
        ``True`` when the server answers 404, which callers treat as "there
        are no more segments"; ``False`` when the segment is on disk.

    Raises:
        SystemExit: When the segment still cannot be fetched once ``epoch``
            exceeds the module-level ``retry`` limit.
    """
    target_path = os.path.join(lecture_working_dir, filename)
    if os.path.isfile(target_path):
        print("Segment already downloaded.. skipping..")
        return False

    def try_again(message):
        # One shared limit, so a permanently failing segment cannot recurse forever.
        if epoch > retry:
            raise SystemExit("Error fetching segment, exceeded retry times.")
        print(message)
        return download_media(filename, url, lecture_working_dir, epoch + 1)

    def discard_partial():
        # A truncated file must not be mistaken for a finished segment next time.
        if os.path.isfile(target_path):
            os.remove(target_path)

    try:
        media = requests.get(url, stream=True)
    except requests.RequestException:
        return try_again("Connection error: Reattempting download of segment..")

    # Look at the status first: error responses need not carry a content-length.
    if media.status_code != 200:
        status_code = media.status_code
        media.close()
        if status_code == 404:
            print("Probably end hit!\n",url)
            return True #Probably hit the last of the file
        return try_again("Error fetching segment file.. Redownloading...")

    announced_length = media.headers.get("content-length")
    media_length = int(announced_length) if announced_length is not None else None
    try:
        with open(target_path, 'wb') as video_file, \
                tqdm(total=media_length, initial=0,unit='B', unit_scale=True, desc=filename) as pbar:
            for chunk in media.iter_content(chunk_size=1024):
                if chunk:
                    video_file.write(chunk)
                    pbar.update(len(chunk))
    except (requests.RequestException, OSError):
        transfer_failed = True
    else:
        transfer_failed = False
    finally:
        media.close()

    if transfer_failed:
        discard_partial()
        return try_again("Connection error: Reattempting download of segment..")

    if media_length is not None and os.path.getsize(target_path) < media_length:
        discard_partial()
        return try_again("Segment is faulty.. Redownloading...")

    print("Segment downloaded: " + filename)
    return False #Successfully downloaded the file

"""
@author Jayapraveen
"""
def cleanup(path):
    """Delete the ``.mp4`` and ``.mpd`` files located directly inside *path*.

    Both listings are taken before anything is removed; the ``.mp4`` files are
    removed first. A file that cannot be deleted is reported and skipped.

    Args:
        path: Directory whose leftover media and manifest files are removed.
    """
    doomed = []
    for suffix in ('/*.mp4', '/*.mpd'):
        doomed.extend(glob.glob(path + suffix, recursive=True))

    for stale_file in doomed:
        try:
            os.remove(stale_file)
        except OSError:
            print(f"Error deleting file: {stale_file}")

"""
@author Jayapraveen
"""
def mux_process(video_title,lecture_working_dir,outfile):
    """Mux the decrypted audio and video tracks into *outfile* with ffmpeg.

    The tracks are read from ``decrypted_audio.mp4`` and ``decrypted_video.mp4``
    inside *lecture_working_dir* and copied without re-encoding. Outside
    Windows the process is started through ``nice -n 7``.

    Args:
        video_title: Text stored as the ``title`` metadata of the output.
        lecture_working_dir: Directory holding the two decrypted tracks.
        outfile: Path of the muxed file to create (overwritten if present).
    """
    import subprocess  # function-scope import keeps this block self-contained

    # An argument list (no shell) keeps quotes or other special characters in
    # the title and in the paths from breaking or altering the command.
    command = [
        "ffmpeg", "-y",
        "-i", os.path.join(lecture_working_dir, "decrypted_audio.mp4"),
        "-i", os.path.join(lecture_working_dir, "decrypted_video.mp4"),
        "-acodec", "copy",
        "-vcodec", "copy",
        "-fflags", "+bitexact",
        "-map_metadata", "-1",
        "-metadata", f"title={video_title}",
        # NOTE(review): this is not a valid calendar timestamp; kept exactly as
        # it was - confirm that the value is intentional.
        "-metadata", "creation_time=2020-00-00T70:05:30.000000Z",
        outfile,
    ]
    if os.name != "nt":
        command = ["nice", "-n", "7"] + command

    try:
        completed = subprocess.run(command)
    except OSError as error:
        # Report and carry on, as a failed shell command used to do.
        print(f"Unable to run ffmpeg: {error}")
        return
    if completed.returncode != 0:
        print(f"ffmpeg exited with status {completed.returncode} for {outfile}")

"""
@author Jayapraveen
"""
def decrypt(kid,filename,lecture_working_dir):
    """Decrypt ``encrypted_<filename>.mp4`` into ``decrypted_<filename>.mp4``.

    The key is looked up in the module-level ``keyfile`` mapping under the
    lower-cased *kid* and handed to ``mp4decrypt``. Outside Windows the
    process is started through ``nice -n 7``.

    Args:
        kid: Key id of the track; matched case-insensitively.
        filename: Track name used in the file names, e.g. ``"video"``.
        lecture_working_dir: Directory holding the encrypted file; the
            decrypted file is written next to it.

    Raises:
        SystemExit: If no key is stored for *kid*.
    """
    import subprocess  # function-scope import keeps this block self-contained

    try:
        key = keyfile[kid.lower()]
    except KeyError:
        raise SystemExit("Key not found") from None

    # An argument list (no shell) is immune to special characters in the paths.
    command = [
        "mp4decrypt",
        "--key", f"1:{key}",
        os.path.join(lecture_working_dir, f"encrypted_{filename}.mp4"),
        os.path.join(lecture_working_dir, f"decrypted_{filename}.mp4"),
    ]
    if os.name != "nt":
        command = ["nice", "-n", "7"] + command

    try:
        completed = subprocess.run(command)
    except OSError as error:
        # Report and carry on, as a failed shell command used to do.
        print(f"Unable to run mp4decrypt: {error}")
        return
    if completed.returncode != 0:
        print(f"mp4decrypt exited with status {completed.returncode} for {filename}")

"""
@author Jayapraveen
"""
def handle_irregular_segments(media_info,video_title,lecture_working_dir,output_path):
    """Download, join, decrypt and mux a lecture whose segments vary in length.

    Segments are requested one after another until the video request reports
    the end of the stream. The downloaded parts of each track are then joined
    into ``encrypted_video.mp4`` / ``encrypted_audio.mp4``, decrypted and
    muxed into *output_path*.

    Args:
        media_info: Eight values - segment count, media URL template,
            initialization URL and segment extension for the video track,
            followed by the same four values for the audio track.
        video_title: Title stored in the muxed file.
        lecture_working_dir: Directory that receives all intermediate files.
        output_path: Path of the final muxed file.
    """
    if len(media_info) != 8:
        # Nothing sensible can be unpacked; skip the lecture instead of crashing.
        print("Unexpected media information, skipping lecture: " + str(video_title))
        return
    (video_segments, video_url, video_init, video_extension,
     audio_segments, audio_url, audio_init, audio_extension) = media_info
    # Both counts are estimates of the same stream; use the larger one as bound.
    no_segment = max(video_segments, audio_segments)
    # Absolute path, so changing the current directory below cannot shift it.
    lecture_working_dir = os.path.abspath(lecture_working_dir)

    def concatenate(track, extension, segment_total):
        # Join the initialization segment (always stored as <track>_0.seg.mp4)
        # with media segments 1..segment_total-1 into encrypted_<track>.mp4.
        part_names = [f"{track}_0.seg.mp4"]
        part_names += [f"{track}_{index}.seg.{extension}" for index in range(1, segment_total)]
        destination = os.path.join(lecture_working_dir, f"encrypted_{track}.mp4")
        with open(destination, "wb") as merged:
            for part_name in part_names:
                part_path = os.path.join(lecture_working_dir, part_name)
                if not os.path.isfile(part_path):
                    print("Missing segment, skipping: " + part_path)
                    continue
                with open(part_path, "rb") as part:
                    shutil.copyfileobj(part, merged)

    download_media("video_0.seg.mp4",video_init,lecture_working_dir)
    video_kid = extract_kid(os.path.join(lecture_working_dir, "video_0.seg.mp4"))
    print("KID for video file is: " + video_kid)
    download_media("audio_0.seg.mp4",audio_init,lecture_working_dir)
    audio_kid = extract_kid(os.path.join(lecture_working_dir, "audio_0.seg.mp4"))
    print("KID for audio file is: " + audio_kid)

    # Work from inside the lecture directory so bare segment names resolve
    # there, and always put the previous directory back, even on errors.
    previous_dir = os.getcwd()
    os.chdir(lecture_working_dir)
    try:
        for count in range(1,no_segment):
            video_segment_url = video_url.replace("$Number$",str(count))
            audio_segment_url = audio_url.replace("$Number$",str(count))
            video_status = download_media(f"video_{str(count)}.seg.{video_extension}",video_segment_url,lecture_working_dir)
            # Requested for its download only; the video request alone decides the end.
            download_media(f"audio_{str(count)}.seg.{audio_extension}",audio_segment_url,lecture_working_dir)
            if not video_status:
                continue
            # Segment `count` does not exist, so parts 0..count-1 form the stream.
            concatenate("video", video_extension, count)
            concatenate("audio", audio_extension, count)
            decrypt(video_kid,"video",lecture_working_dir)
            decrypt(audio_kid,"audio",lecture_working_dir)
            os.chdir(previous_dir)
            mux_process(video_title,lecture_working_dir,output_path)
            return
        print("End of stream not reached within the expected number of segments, nothing muxed for: " + str(video_title))
    finally:
        os.chdir(previous_dir)

"""
@author Jayapraveen
"""
def manifest_parser(mpd_url):
    """Fetch a DASH manifest and describe its variable-length segment templates.

    The manifest is saved as ``manifest.mpd`` in the current directory and
    parsed from that file. For every ``video/mp4`` or ``audio/mp4`` adaptation
    set the last representation is inspected, and each of its segment
    templates that has no fixed duration contributes four values: an estimated
    segment count, the media URL template, the initialization URL and the
    segment file extension.

    Args:
        mpd_url: Address of the manifest.

    Returns:
        A flat list with the collected video values followed by the collected
        audio values.

    Raises:
        requests.HTTPError: If the manifest request does not succeed.
    """
    from urllib.parse import urlparse  # function-scope import keeps this block self-contained

    def media_extension(template):
        # Take the extension from the URL path so a query string cannot leak
        # into it; fall back to the text after the last dot.
        extension = os.path.splitext(urlparse(template).path)[1].lstrip(".")
        return extension or template.split(".")[-1]

    video = []
    audio = []
    collected = {"video/mp4": video, "audio/mp4": audio}

    response = requests.get(mpd_url)
    # Fail here rather than handing an error page to the manifest parser.
    response.raise_for_status()
    with open("manifest.mpd",'w') as manifest_handler:
        manifest_handler.write(response.text)
    mpd = MPEGDASHParser.parse("./manifest.mpd")
    running_time = durationtoseconds(mpd.media_presentation_duration)
    for period in mpd.periods:
        for adapt_set in period.adaptation_sets:
            print("Processing " + adapt_set.mime_type)
            content_type = adapt_set.mime_type
            # Last representation; presumably the highest quality - verify
            # against the ordering used by the manifest.
            representation = adapt_set.representations[-1]
            for segment in representation.segment_templates:
                if segment.duration:
                    # NOTE(review): nothing is collected for fixed-duration
                    # templates, so such tracks are absent from the result.
                    print("Media segments are of equal timeframe")
                    continue
                print("Media segments are of inequal timeframe")

                approx_no_segments = round(running_time / 6) + 10 # approximation: 6 sec per segment plus a margin
                print("Expected No of segments:",approx_no_segments)
                target = collected.get(content_type)
                if target is not None:
                    target.extend([
                        approx_no_segments,
                        segment.media,
                        segment.initialization,
                        media_extension(segment.media),
                    ])
    return video + audio


"""
@author Puyodead1
"""
def download(url, path, filename):
    """Download *url* to *path*, resuming a partial file when possible.

    Args:
        url: Address of the file to download.
        path: Location of the file on disk; an existing shorter file is
            treated as a partial download and continued.
        filename: Label shown on the progress bar.

    Returns:
        The file size in bytes announced by the server, or 0 when the server
        refused the download (nothing is written in that case).
    """
    # requests.head() does not follow redirects unless asked to; without this
    # the size of a redirect response would be taken for the file size.
    head_response = requests.head(url, allow_redirects=True)
    file_size = int(head_response.headers["Content-Length"])
    if os.path.exists(path):
        print("file exists")
        first_byte = os.path.getsize(path)
    else:
        first_byte = 0
    if first_byte >= file_size:
        return file_size

    header = {"Range": "bytes=%s-" % first_byte}
    req = requests.get(url, headers=header, stream=True)
    try:
        if req.status_code not in (200, 206):
            # Do not store an error page as if it were the lecture.
            print("Download failed with HTTP status %s: %s" % (req.status_code, filename))
            return 0
        # 206 carries only the missing tail. A plain 200 means the Range header
        # was ignored and the whole file follows, so start the file over.
        resumed = req.status_code == 206
        if not resumed:
            first_byte = 0
        with open(path, 'ab' if resumed else 'wb') as f, \
                tqdm(total=file_size, initial=first_byte,
                     unit='B', unit_scale=True, desc=filename) as pbar:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    finally:
        req.close()
    return file_size

def process_lecture(lecture, lecture_path):
    """Download a single lecture to *lecture_path* unless it is already there.

    A lecture whose asset has no ``media_license_token`` is downloaded
    directly. Any other lecture is treated as DRM protected: its DASH
    manifest is parsed, its segments are processed inside a per-asset folder
    below ``working_dir`` and the leftovers are cleaned up afterwards.

    Args:
        lecture: Lecture object with ``title`` and ``asset`` entries.
        lecture_path: Destination of the finished video file.
    """
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]

    # Check first, so skipped lectures leave no empty working folder behind.
    if os.path.isfile(lecture_path):
        print("> Lecture '%s' is already downloaded, skipping..." % lecture_title)
        return

    if lecture_asset["media_license_token"] is None:
        # not encrypted
        lecture_url = lecture_asset["media_sources"][0]["src"] # best quality is the first index
        download(lecture_url, lecture_path, lecture_title)
        return

    # encrypted
    print("> Lecture '%s' has DRM, attempting to download" % lecture_title)
    # folder that receives the ephemeral files of this lecture
    lecture_working_dir = os.path.join(working_dir, str(lecture_asset["id"]))
    os.makedirs(lecture_working_dir, exist_ok=True)
    mpd_url = lecture_asset["media_sources"][1]["src"] # index 1 is the dash
    media_info = manifest_parser(mpd_url)
    handle_irregular_segments(media_info,lecture_title,lecture_working_dir,lecture_path)
    cleanup(lecture_working_dir)

def parse(data):
    """Walk the curriculum items and download every video lecture.

    Chapters become numbered folders inside ``download_dir`` and their video
    lectures become numbered files inside them. Video lectures that appear
    before the first chapter are downloaded straight into ``download_dir`` as
    they are met. Items of any other kind are ignored.

    Args:
        data: Curriculum items in course order; each item carries a
            ``_class`` entry, and chapter items get a ``lectures`` list added.
    """
    chapters = []
    lectures = []

    for obj in data:
        item_class = obj.get("_class")
        if item_class == "chapter":
            obj["lectures"] = []
            chapters.append(obj)
        elif item_class == "lecture" and (obj.get("asset") or {}).get("asset_type") == "Video":
            if chapters:
                chapters[-1]["lectures"].append(obj)
            else:
                # No chapter has started yet: number the lecture by its position
                # among the chapterless lectures and download it right away.
                lectures.append(obj)
                lecture_name = "%s. %s.mp4" % (len(lectures), sanitize(obj["title"]))
                process_lecture(obj, os.path.join(download_dir, lecture_name))

    # enumerate() provides the numbering directly; searching the list for each
    # item is quadratic and returns the wrong position for equal items.
    for chapter_number, chapter in enumerate(chapters, start=1):
        chapter_dir = os.path.join(download_dir, "%s. %s" % (chapter_number, sanitize(chapter["title"])))
        os.makedirs(chapter_dir, exist_ok=True)

        for lecture_number, lecture in enumerate(chapter["lectures"], start=1):
            lecture_name = "%s. %s.mp4" % (lecture_number, sanitize(lecture["title"]))
            process_lecture(lecture, os.path.join(chapter_dir, lecture_name))

if __name__ == "__main__":
    if debug:
        # this is for development purposes so we dont need to make tons of requests when testing
        # course data json is just stored and read from a file
        with open("test_data.json", encoding="utf8") as f:
            data = json.load(f)["results"]
        # The file is closed before the (long) download starts.
        parse(data)
    else:
        print("Fetching Course data, this may take a minute...")
        # The f-string already inserts the course id; no further formatting is needed.
        curriculum_url = (
            f"https://udemy.com/api-2.0/courses/{course_id}/cached-subscriber-curriculum-items"
            "?fields[asset]=results,title,external_url,time_estimation,download_urls,slide_urls,filename,asset_type,captions,media_license_token,course_is_drmed,media_sources,stream_urls,body"
            "&fields[chapter]=object_index,title,sort_order"
            "&fields[lecture]=id,title,object_index,asset,supplementary_assets,view_html"
            "&page_size=10000"
        )
        r = requests.get(curriculum_url, headers={"Authorization": header_bearer, "x-udemy-authorization": header_bearer})
        if r.status_code == 200:
            print("Course data retrieved!")
            data = r.json()
            parse(data["results"])
        else:
            print("An error occurred while trying to fetch the course data! " + r.text)