From 15b7d92afc6e6ccb0338e9dd6d31d7d1962e5f46 Mon Sep 17 00:00:00 2001 From: Puyodead1 <14828766+Puyodead1@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:51:11 -0500 Subject: [PATCH] Bug fixes + Support for large course content in subscription courses (#92) + Attempt to fix encoding problems with caption conversion (#92, #98, #97) --- main.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index e5cdef7..456c9c5 100644 --- a/main.py +++ b/main.py @@ -780,22 +780,29 @@ class Udemy: raise Exception("[-] Cloudflare captcha detected!") # wait for page load - WebDriverWait(selenium.driver, 60).until(EC.visibility_of_element_located((By.TAG_NAME, "pre"))) + WebDriverWait(selenium.driver, 60).until( + EC.visibility_of_element_located((By.TAG_NAME, "body"))) time.sleep(2) - # TODO: determine if the course content is large - - # get the text from the page - page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text - if not page_text or not isinstance(page_text, str): - raise Exception("[-] Could not get page text!") - page_json = json.loads(page_text) - if page_json: - return page_json + body_text = selenium.driver.find_element(By.TAG_NAME, "body").text + if not body_text: + raise Exception("[-] Could not get page body text!") + if "502 Bad Gateway" in body_text: + # its a large course, handle accordingly + logger.info("[+] Detected large course content, using large content extractor...") + return self._extract_large_course_content_sub(url=url, selenium=selenium) else: - logger.error("[-] Failed to extract course json!") - time.sleep(0.8) - sys.exit(1) + # get the text from the page + page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text + if not page_text or not isinstance(page_text, str): + raise Exception("[-] Could not get page pre text!") + page_json = json.loads(page_text) + if page_json: + return page_json + else: + logger.error("[-] Failed to extract course json!") + time.sleep(0.8) + sys.exit(1) def _extract_large_course_content(self, url): url = url.replace("10000", "50") if url.endswith("10000") else url @@ -823,6 +830,68 @@ class Udemy: data["results"].append(d) return data + def _extract_large_course_content_sub(self, url, selenium: Selenium): + url = url.replace("10000", "50") if url.endswith("10000") else url + try: + selenium.driver.get(url) + time.sleep(2) + + if "Attention" in selenium.driver.title: + # cloudflare captcha, panic + raise Exception("[-] Cloudflare captcha detected!") + + # wait for page load + WebDriverWait(selenium.driver, 60).until( + EC.visibility_of_element_located((By.TAG_NAME, "body"))) + time.sleep(2) + + # get the text from the page + page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text + if not page_text or not isinstance(page_text, str): + raise Exception("[-] Could not get page pre text!") + data = json.loads(page_text) + logger.debug(data) + + except conn_error as error: + logger.fatal(f"[-] Udemy Says: Connection error, {error}") + time.sleep(0.8) + sys.exit(1) + else: + _next = data.get("next") + while _next: + logger.info("> Downloading course information.. ") + try: + selenium.driver.get(_next) + time.sleep(2) + + if "Attention" in selenium.driver.title: + # cloudflare captcha, panic + raise Exception("[-] Cloudflare captcha detected!") + + # wait for page load + WebDriverWait(selenium.driver, 60).until( + EC.visibility_of_element_located((By.TAG_NAME, "body"))) + time.sleep(2) + + # get the text from the page + page_text = selenium.driver.find_element( + By.TAG_NAME, "pre").text + if not page_text or not isinstance(page_text, str): + raise Exception("[-] Could not get page pre text!") + resp = json.loads(page_text) + logger.debug(resp) + except conn_error as error: + logger.fatal(f"[-] Udemy Says: Connection error, {error}") + time.sleep(0.8) + sys.exit(1) + else: + _next = resp.get("next") + results = resp.get("results") + if results and isinstance(results, list): + for d in resp["results"]: + data["results"].append(d) + return data + def _extract_course(self, response, course_name): _temp = {} if response: