mirror of
https://cdm-project.com/Download-Tools/udemy-downloader.git
synced 2025-05-03 08:14:26 +02:00
Bug fixes
+ Support for large course content in subscription courses (#92) + Attempt to fix encoding problems with caption conversion (#92, #98, #97)
This commit is contained in:
parent
ad74eed395
commit
15b7d92afc
95
main.py
95
main.py
@ -780,22 +780,29 @@ class Udemy:
|
|||||||
raise Exception("[-] Cloudflare captcha detected!")
|
raise Exception("[-] Cloudflare captcha detected!")
|
||||||
|
|
||||||
# wait for page load
|
# wait for page load
|
||||||
WebDriverWait(selenium.driver, 60).until(EC.visibility_of_element_located((By.TAG_NAME, "pre")))
|
WebDriverWait(selenium.driver, 60).until(
|
||||||
|
EC.visibility_of_element_located((By.TAG_NAME, "body")))
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
# TODO: determine if the course content is large
|
body_text = selenium.driver.find_element(By.TAG_NAME, "body").text
|
||||||
|
if not body_text:
|
||||||
# get the text from the page
|
raise Exception("[-] Could not get page body text!")
|
||||||
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
|
if "502 Bad Gateway" in body_text:
|
||||||
if not page_text or not isinstance(page_text, str):
|
# its a large course, handle accordingly
|
||||||
raise Exception("[-] Could not get page text!")
|
logger.info("[+] Detected large course content, using large content extractor...")
|
||||||
page_json = json.loads(page_text)
|
return self._extract_large_course_content_sub(url=url, selenium=selenium)
|
||||||
if page_json:
|
|
||||||
return page_json
|
|
||||||
else:
|
else:
|
||||||
logger.error("[-] Failed to extract course json!")
|
# get the text from the page
|
||||||
time.sleep(0.8)
|
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
|
||||||
sys.exit(1)
|
if not page_text or not isinstance(page_text, str):
|
||||||
|
raise Exception("[-] Could not get page pre text!")
|
||||||
|
page_json = json.loads(page_text)
|
||||||
|
if page_json:
|
||||||
|
return page_json
|
||||||
|
else:
|
||||||
|
logger.error("[-] Failed to extract course json!")
|
||||||
|
time.sleep(0.8)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def _extract_large_course_content(self, url):
|
def _extract_large_course_content(self, url):
|
||||||
url = url.replace("10000", "50") if url.endswith("10000") else url
|
url = url.replace("10000", "50") if url.endswith("10000") else url
|
||||||
@ -823,6 +830,68 @@ class Udemy:
|
|||||||
data["results"].append(d)
|
data["results"].append(d)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def _extract_large_course_content_sub(self, url, selenium: Selenium):
    """Collect a course's paginated content JSON through a Selenium browser.

    The first page is loaded from ``url``; while a page carries a truthy
    ``"next"`` value, that URL is loaded as well and its ``"results"`` list
    is appended to the first page's ``"results"``.

    Args:
        url: URL of the first page. If it ends with ``"10000"`` that suffix
            is swapped for ``"50"`` (presumably the page-size query value;
            confirm against the caller that builds the URL).
        selenium: Wrapper exposing the WebDriver instance as ``.driver``.

    Returns:
        The parsed JSON of the first page, with the ``"results"`` of every
        follow-up page merged into its ``"results"`` list.

    Raises:
        Exception: If the page title contains "Attention" (treated as a
            Cloudflare captcha) or the page has no usable ``<pre>`` text.

    Exits the process with status 1 when ``conn_error`` is raised while
    loading a page.
    """
    # Only rewrite the trailing value. A blanket str.replace would also
    # alter any other "10000" that happens to appear earlier in the URL.
    oversized = "10000"
    if url.endswith(oversized):
        url = url[: -len(oversized)] + "50"

    def _load_json(page_url):
        # Navigate to page_url and return the JSON shown in its <pre> element.
        selenium.driver.get(page_url)
        time.sleep(2)

        if "Attention" in selenium.driver.title:
            # cloudflare captcha, panic
            raise Exception("[-] Cloudflare captcha detected!")

        # wait for page load
        WebDriverWait(selenium.driver, 60).until(
            EC.visibility_of_element_located((By.TAG_NAME, "body")))
        time.sleep(2)

        # get the text from the page
        page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
        if not page_text or not isinstance(page_text, str):
            raise Exception("[-] Could not get page pre text!")
        payload = json.loads(page_text)
        logger.debug(payload)
        return payload

    def _load_or_exit(page_url):
        # Same as _load_json, but a connection error terminates the program.
        try:
            return _load_json(page_url)
        except conn_error as error:
            logger.fatal(f"[-] Udemy Says: Connection error, {error}")
            time.sleep(0.8)
            sys.exit(1)

    data = _load_or_exit(url)

    _next = data.get("next")
    while _next:
        logger.info("> Downloading course information.. ")
        resp = _load_or_exit(_next)
        _next = resp.get("next")
        results = resp.get("results")
        if results and isinstance(results, list):
            # setdefault keeps merging safe when the first page had no
            # "results" key at all.
            data.setdefault("results", []).extend(results)
    return data
|
||||||
|
|
||||||
def _extract_course(self, response, course_name):
|
def _extract_course(self, response, course_name):
|
||||||
_temp = {}
|
_temp = {}
|
||||||
if response:
|
if response:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user