diff --git a/sources/browser.py b/sources/browser.py index 0351cb9..50f8526 100644 --- a/sources/browser.py +++ b/sources/browser.py @@ -27,53 +27,84 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from sources.utility import pretty_print, animate_thinking from sources.logger import Logger + def get_chrome_path() -> str: """Get the path to the Chrome executable.""" if sys.platform.startswith("win"): paths = [ "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", - os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe") # User install + os.path.join( + os.environ.get("LOCALAPPDATA", ""), + "Google\\Chrome\\Application\\chrome.exe", + ), # User install ] elif sys.platform.startswith("darwin"): # macOS - paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"] + paths = [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta", + ] else: # Linux - paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium", "/opt/chrome/chrome", "/usr/local/bin/chrome"] + paths = [ + "/usr/bin/google-chrome", + "/usr/bin/chromium-browser", + "/usr/bin/chromium", + "/opt/chrome/chrome", + "/usr/local/bin/chrome", + ] for path in paths: if os.path.exists(path) and os.access(path, os.X_OK): # Check if executable return path print("Looking for Google Chrome in these locations failed:") - print('\n'.join(paths)) + print("\n".join(paths)) chrome_path_env = os.environ.get("CHROME_EXECUTABLE_PATH") - if chrome_path_env and os.path.exists(chrome_path_env) and os.access(chrome_path_env, os.X_OK): + if ( + chrome_path_env + and os.path.exists(chrome_path_env) + and os.access(chrome_path_env, os.X_OK) + ): return chrome_path_env - path = input("Google Chrome not found. Please enter the path to the Chrome executable: ") + path = input( + "Google Chrome not found. Please enter the path to the Chrome executable: " + ) if os.path.exists(path) and os.access(path, os.X_OK): os.environ["CHROME_EXECUTABLE_PATH"] = path print(f"Chrome path saved to environment variable CHROME_EXECUTABLE_PATH") return path return None + def get_random_user_agent() -> str: """Get a random user agent string with associated vendor.""" user_agents = [ - {"ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36", "vendor": "Google Inc."}, - {"ua": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", "vendor": "Apple Inc."}, - {"ua": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0", "vendor": ""}, + { + "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36", + "vendor": "Google Inc.", + }, + { + "ua": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", + "vendor": "Apple Inc.", + }, + { + "ua": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0", + "vendor": "", + }, ] return random.choice(user_agents) -def create_driver(headless=False, stealth_mode=True, crx_path="./crx/nopecha.crx") -> webdriver.Chrome: + +def create_driver( + headless=False, stealth_mode=True, crx_path="./crx/nopecha.crx" +) -> webdriver.Chrome: """Create a Chrome WebDriver with specified options.""" chrome_options = Options() chrome_path = get_chrome_path() - + if not chrome_path: raise FileNotFoundError("Google Chrome not found. Please install it.") chrome_options.binary_location = chrome_path - + if headless: chrome_options.add_argument("--headless") chrome_options.add_argument("--disable-gpu") @@ -90,31 +121,40 @@ def create_driver(headless=False, stealth_mode=True, crx_path="./crx/nopecha.crx chrome_options.add_argument(f'user-agent={user_agent["ua"]}') resolutions = [(1920, 1080), (1366, 768), (1440, 900)] width, height = random.choice(resolutions) - chrome_options.add_argument(f'--window-size={width},{height}') + chrome_options.add_argument(f"--window-size={width},{height}") if not stealth_mode: # crx file can't be installed in stealth mode if not os.path.exists(crx_path): pretty_print(f"Anti-captcha CRX not found at {crx_path}.", color="failure") else: chrome_options.add_extension(crx_path) - - chromedriver_path = shutil.which("chromedriver") - if not chromedriver_path: + + chromedriver_path = "/usr/bin/chromedriver" + if not os.path.exists(chromedriver_path): chromedriver_path = chromedriver_autoinstaller.install() - + if not chromedriver_path: - raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.") - + raise FileNotFoundError( + "ChromeDriver not found. Please install it or add it to your PATH." + ) + service = Service(chromedriver_path) if stealth_mode: chrome_options.add_argument("--disable-blink-features=AutomationControlled") driver = uc.Chrome(service=service, options=chrome_options) - driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - chrome_version = driver.capabilities['browserVersion'] - stealth(driver, + driver.execute_script( + "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" + ) + chrome_version = driver.capabilities["browserVersion"] + stealth( + driver, languages=["en-US", "en"], vendor=user_agent["vendor"], - platform="Win64" if "Windows" in user_agent["ua"] else "MacIntel" if "Macintosh" in user_agent["ua"] else "Linux x86_64", + platform=( + "Win64" + if "Windows" in user_agent["ua"] + else "MacIntel" if "Macintosh" in user_agent["ua"] else "Linux x86_64" + ), webgl_vendor="Intel Inc.", renderer="Intel Iris OpenGL Engine", fix_hairline=True, @@ -127,13 +167,16 @@ def create_driver(headless=False, stealth_mode=True, crx_path="./crx/nopecha.crx } chrome_options.add_experimental_option("prefs", security_prefs) chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) - chrome_options.add_experimental_option('useAutomationExtension', False) + chrome_options.add_experimental_option("useAutomationExtension", False) return webdriver.Chrome(service=service, options=chrome_options) + class Browser: def __init__(self, driver, anticaptcha_manual_install=False): """Initialize the browser with optional AntiCaptcha installation.""" - self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/" + self.js_scripts_folder = ( + "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/" + ) self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related" self.logger = Logger("browser.log") self.screenshot_folder = os.path.join(os.getcwd(), ".screenshots") @@ -146,23 +189,26 @@ class Browser: self.setup_tabs() if anticaptcha_manual_install: self.load_anticatpcha_manually() - + def setup_tabs(self): self.tabs = self.driver.window_handles self.driver.get("https://www.google.com") self.screenshot() - + def switch_control_tab(self): self.logger.log("Switching to control tab.") self.driver.switch_to.window(self.tabs[0]) - + def load_anticatpcha_manually(self): - pretty_print("You might want to install the AntiCaptcha extension for captchas.", color="warning") + pretty_print( + "You might want to install the AntiCaptcha extension for captchas.", + color="warning", + ) self.driver.get(self.anticaptcha) - def go_to(self, url:str) -> bool: + def go_to(self, url: str) -> bool: """Navigate to a specified URL.""" - time.sleep(random.uniform(0.4, 2.5)) # more human behavior + time.sleep(random.uniform(0.4, 2.5)) # more human behavior try: initial_handles = self.driver.window_handles self.driver.get(url) @@ -170,12 +216,17 @@ class Browser: wait = WebDriverWait(self.driver, timeout=10) wait.until( lambda driver: ( - not any(keyword in driver.page_source.lower() for keyword in ["checking your browser", "captcha"]) + not any( + keyword in driver.page_source.lower() + for keyword in ["checking your browser", "captcha"] + ) ), - message="stuck on 'checking browser' or verification screen" + message="stuck on 'checking browser' or verification screen", ) except TimeoutException: - self.logger.warning("Timeout while waiting for page to bypass 'checking your browser'") + self.logger.warning( + "Timeout while waiting for page to bypass 'checking your browser'" + ) self.apply_web_safety() self.logger.log(f"Navigated to: {url}") return True @@ -189,30 +240,33 @@ class Browser: self.logger.error(f"Fatal error with go_to method on {url}:\n{str(e)}") raise e - def is_sentence(self, text:str) -> bool: + def is_sentence(self, text: str) -> bool: """Check if the text qualifies as a meaningful sentence or contains important error codes.""" text = text.strip() if any(c.isdigit() for c in text): return True - words = re.findall(r'\w+', text, re.UNICODE) + words = re.findall(r"\w+", text, re.UNICODE) word_count = len(words) - has_punctuation = any(text.endswith(p) for p in ['.', ',', ',', '!', '?', '。', '!', '?', '।', '۔']) + has_punctuation = any( + text.endswith(p) + for p in [".", ",", ",", "!", "?", "。", "!", "?", "।", "۔"] + ) is_long_enough = word_count > 4 - return (word_count >= 5 and (has_punctuation or is_long_enough)) + return word_count >= 5 and (has_punctuation or is_long_enough) def get_text(self) -> str | None: """Get page text as formatted Markdown""" try: - soup = BeautifulSoup(self.driver.page_source, 'html.parser') - for element in soup(['script', 'style', 'noscript', 'meta', 'link']): + soup = BeautifulSoup(self.driver.page_source, "html.parser") + for element in soup(["script", "style", "noscript", "meta", "link"]): element.decompose() markdown_converter = markdownify.MarkdownConverter( heading_style="ATX", - strip=['a'], + strip=["a"], autolinks=False, - bullets='•', - strong_em_symbol='*', + bullets="•", + strong_em_symbol="*", default_title=False, ) markdown_text = markdown_converter.convert(str(soup.body)) @@ -220,35 +274,43 @@ class Browser: for line in markdown_text.splitlines(): stripped = line.strip() if stripped and self.is_sentence(stripped): - cleaned = ' '.join(stripped.split()) + cleaned = " ".join(stripped.split()) lines.append(cleaned) result = "[Start of page]\n\n" + "\n\n".join(lines) + "\n\n[End of page]" - result = re.sub(r'!\[(.*?)\]\(.*?\)', r'[IMAGE: \1]', result) + result = re.sub(r"!\[(.*?)\]\(.*?\)", r"[IMAGE: \1]", result) self.logger.info(f"Extracted text: {result[:100]}...") self.logger.info(f"Extracted text length: {len(result)}") return result[:8192] except Exception as e: self.logger.error(f"Error getting text: {str(e)}") return None - - def clean_url(self, url:str) -> str: + + def clean_url(self, url: str) -> str: """Clean URL to keep only the part needed for navigation to the page""" - clean = url.split('#')[0] - parts = clean.split('?', 1) + clean = url.split("#")[0] + parts = clean.split("?", 1) base_url = parts[0] if len(parts) > 1: query = parts[1] essential_params = [] - for param in query.split('&'): - if param.startswith('_skw=') or param.startswith('q=') or param.startswith('s='): + for param in query.split("&"): + if ( + param.startswith("_skw=") + or param.startswith("q=") + or param.startswith("s=") + ): essential_params.append(param) - elif param.startswith('_') or param.startswith('hash=') or param.startswith('itmmeta='): + elif ( + param.startswith("_") + or param.startswith("hash=") + or param.startswith("itmmeta=") + ): break if essential_params: return f"{base_url}?{'&'.join(essential_params)}" return base_url - - def is_link_valid(self, url:str) -> bool: + + def is_link_valid(self, url: str) -> bool: """Check if a URL is a valid link (page, not related to icon or metadata).""" if len(url) > 72: self.logger.warning(f"URL too long: {url}") @@ -257,10 +319,10 @@ class Browser: if not parsed_url.scheme or not parsed_url.netloc: self.logger.warning(f"Invalid URL: {url}") return False - if re.search(r'/\d+$', parsed_url.path): + if re.search(r"/\d+$", parsed_url.path): return False - image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'] - metadata_extensions = ['.ico', '.xml', '.json', '.rss', '.atom'] + image_extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"] + metadata_extensions = [".ico", ".xml", ".json", ".rss", ".atom"] for ext in image_extensions + metadata_extensions: if url.lower().endswith(ext): return False @@ -271,18 +333,24 @@ class Browser: try: links = [] elements = self.driver.find_elements(By.TAG_NAME, "a") - + for element in elements: href = element.get_attribute("href") if href and href.startswith(("http", "https")): - links.append({ - "url": href, - "text": element.text.strip(), - "is_displayed": element.is_displayed() - }) - + links.append( + { + "url": href, + "text": element.text.strip(), + "is_displayed": element.is_displayed(), + } + ) + self.logger.info(f"Found {len(links)} navigable links") - return [self.clean_url(link['url']) for link in links if (link['is_displayed'] == True and self.is_link_valid(link['url']))] + return [ + self.clean_url(link["url"]) + for link in links + if (link["is_displayed"] == True and self.is_link_valid(link["url"])) + ] except Exception as e: self.logger.error(f"Error getting navigable links: {str(e)}") return [] @@ -297,7 +365,10 @@ class Browser: return False try: self.logger.error(f"Scrolling to element for click_element.") - self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", element) + self.driver.execute_script( + "arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", + element, + ) time.sleep(0.1) element.click() self.logger.info(f"Clicked element at {xpath}") @@ -311,13 +382,13 @@ class Browser: except Exception as e: self.logger.error(f"Unexpected error clicking element at {xpath}: {str(e)}") return False - + def load_js(self, file_name: str) -> str: """Load javascript from script folder to inject to page.""" path = os.path.join(self.js_scripts_folder, file_name) self.logger.info(f"Loading js at {path}") try: - with open(path, 'r') as f: + with open(path, "r") as f: return f.read() except FileNotFoundError as e: raise Exception(f"Could not find: {path}") from e @@ -349,17 +420,22 @@ class Browser: form_strings = [] for element in input_elements: input_type = element.get("type") or "text" - if input_type in ["hidden", "submit", "button", "image"] or not element["displayed"]: + if ( + input_type in ["hidden", "submit", "button", "image"] + or not element["displayed"] + ): continue input_name = element.get("text") or element.get("id") or input_type if input_type == "checkbox" or input_type == "radio": try: - checked_status = "checked" if element.is_selected() else "unchecked" + checked_status = ( + "checked" if element.is_selected() else "unchecked" + ) except Exception as e: continue form_strings.append(f"[{input_name}]({checked_status})") else: - form_strings.append(f"[{input_name}]("")") + form_strings.append(f"[{input_name}](" ")") return form_strings except Exception as e: @@ -369,13 +445,18 @@ class Browser: """ Find buttons and return their type and xpath. """ - buttons = self.driver.find_elements(By.TAG_NAME, "button") + \ - self.driver.find_elements(By.XPATH, "//input[@type='submit']") + buttons = self.driver.find_elements( + By.TAG_NAME, "button" + ) + self.driver.find_elements(By.XPATH, "//input[@type='submit']") result = [] for i, button in enumerate(buttons): if not button.is_displayed() or not button.is_enabled(): continue - text = (button.text or button.get_attribute("value") or "").lower().replace(' ', '') + text = ( + (button.text or button.get_attribute("value") or "") + .lower() + .replace(" ", "") + ) xpath = f"(//button | //input[@type='submit'])[{i + 1}]" result.append((text, xpath)) result.sort(key=lambda x: len(x[0])) @@ -389,8 +470,8 @@ class Browser: self.logger.info("Waiting for submission outcome...") wait = WebDriverWait(self.driver, timeout) wait.until( - lambda driver: driver.current_url != self.driver.current_url or - driver.find_elements(By.XPATH, "//*[contains(text(), 'success')]") + lambda driver: driver.current_url != self.driver.current_url + or driver.find_elements(By.XPATH, "//*[contains(text(), 'success')]") ) self.logger.info("Detected submission outcome") return True @@ -398,7 +479,7 @@ class Browser: self.logger.warning("No submission outcome detected") return False - def find_and_click_btn(self, btn_type: str = 'login', timeout: int = 5) -> bool: + def find_and_click_btn(self, btn_type: str = "login", timeout: int = 5) -> bool: """Find and click a submit button matching the specified type.""" buttons = self.get_buttons_xpath() if not buttons: @@ -406,24 +487,35 @@ class Browser: return False for button_text, xpath in buttons: - if btn_type.lower() in button_text.lower() or btn_type.lower() in xpath.lower(): + if ( + btn_type.lower() in button_text.lower() + or btn_type.lower() in xpath.lower() + ): try: wait = WebDriverWait(self.driver, timeout) element = wait.until( EC.element_to_be_clickable((By.XPATH, xpath)), - message=f"Button with XPath '{xpath}' not clickable within {timeout} seconds" + message=f"Button with XPath '{xpath}' not clickable within {timeout} seconds", ) if self.click_element(xpath): - self.logger.info(f"Clicked button '{button_text}' at XPath: {xpath}") + self.logger.info( + f"Clicked button '{button_text}' at XPath: {xpath}" + ) return True else: - self.logger.warning(f"Button '{button_text}' at XPath: {xpath} not clickable") + self.logger.warning( + f"Button '{button_text}' at XPath: {xpath} not clickable" + ) return False except TimeoutException: - self.logger.warning(f"Timeout waiting for '{button_text}' button at XPath: {xpath}") + self.logger.warning( + f"Timeout waiting for '{button_text}' button at XPath: {xpath}" + ) return False except Exception as e: - self.logger.error(f"Error clicking button '{button_text}' at XPath: {xpath} - {str(e)}") + self.logger.error( + f"Error clicking button '{button_text}' at XPath: {xpath} - {str(e)}" + ) return False self.logger.warning(f"No button matching '{btn_type}' found") return False @@ -434,7 +526,9 @@ class Browser: Returns True if successful, False if any issues occur. """ try: - checkboxes = self.driver.find_elements(By.XPATH, "//input[@type='checkbox']") + checkboxes = self.driver.find_elements( + By.XPATH, "//input[@type='checkbox']" + ) if not checkboxes: self.logger.info("No checkboxes found on the page") return True @@ -445,19 +539,24 @@ class Browser: EC.element_to_be_clickable(checkbox) ) self.driver.execute_script( - "arguments[0].scrollIntoView({block: 'center', inline: 'center'});", checkbox + "arguments[0].scrollIntoView({block: 'center', inline: 'center'});", + checkbox, ) if not checkbox.is_selected(): try: checkbox.click() self.logger.info(f"Ticked checkbox {index}") except ElementClickInterceptedException: - self.driver.execute_script("arguments[0].click();", checkbox) + self.driver.execute_script( + "arguments[0].click();", checkbox + ) self.logger.warning(f"Click checkbox {index} intercepted") else: self.logger.info(f"Checkbox {index} already ticked") except TimeoutException: - self.logger.warning(f"Timeout waiting for checkbox {index} to be clickable") + self.logger.warning( + f"Timeout waiting for checkbox {index} to be clickable" + ) continue except Exception as e: self.logger.error(f"Error ticking checkbox {index}: {str(e)}") @@ -468,16 +567,28 @@ class Browser: return False def find_and_click_submission(self, timeout: int = 10) -> bool: - possible_submissions = ["login", "submit", "register", "continue", "apply", - "ok", "confirm", "proceed", "accept", - "done", "finish", "start", "calculate"] + possible_submissions = [ + "login", + "submit", + "register", + "continue", + "apply", + "ok", + "confirm", + "proceed", + "accept", + "done", + "finish", + "start", + "calculate", + ] for submission in possible_submissions: if self.find_and_click_btn(submission, timeout): self.logger.info(f"Clicked on submission button: {submission}") return True self.logger.warning("No submission button found") return False - + def find_input_xpath_by_name(self, inputs, name: str) -> str | None: for field in inputs: if name in field["text"]: @@ -492,7 +603,7 @@ class Browser: inputs = self.find_all_inputs() try: for input_str in input_list: - match = re.match(r'\[(.*?)\]\((.*?)\)', input_str) + match = re.match(r"\[(.*?)\]\((.*?)\)", input_str) if not match: self.logger.warning(f"Invalid format for input: {input_str}") continue @@ -509,11 +620,17 @@ class Browser: EC.element_to_be_clickable((By.XPATH, xpath)) ) except TimeoutException: - self.logger.error(f"Timeout waiting for element '{name}' to be clickable") + self.logger.error( + f"Timeout waiting for element '{name}' to be clickable" + ) continue - self.driver.execute_script("arguments[0].scrollIntoView(true);", element) + self.driver.execute_script( + "arguments[0].scrollIntoView(true);", element + ) if not element.is_displayed() or not element.is_enabled(): - self.logger.warning(f"Element '{name}' is not interactable (not displayed or disabled)") + self.logger.warning( + f"Element '{name}' is not interactable (not displayed or disabled)" + ) continue input_type = (element.get_attribute("type") or "text").lower() if input_type in ["checkbox", "radio"]: @@ -531,7 +648,7 @@ class Browser: except Exception as e: self.logger.error(f"Error filling form inputs: {str(e)}") return False - + def fill_form(self, input_list: List[str]) -> bool: """Fill form inputs based on a list of [name](value) and submit.""" if not isinstance(input_list, list): @@ -571,11 +688,11 @@ class Browser: except Exception as e: self.logger.error(f"Error scrolling: {str(e)}") return False - + def get_screenshot(self) -> str: return self.screenshot_folder + "/updated_screen.png" - def screenshot(self, filename:str = 'updated_screen.png') -> bool: + def screenshot(self, filename: str = "updated_screen.png") -> bool: """Take a screenshot of the current page.""" self.logger.info("Taking screenshot...") time.sleep(0.1) @@ -598,18 +715,19 @@ class Browser: script = self.load_js("inject_safety_script.js") input_elements = self.driver.execute_script(script) + if __name__ == "__main__": driver = create_driver(headless=False, stealth_mode=True) browser = Browser(driver, anticaptcha_manual_install=True) - + input("press enter to continue") print("AntiCaptcha / Form Test") - #browser.go_to("https://www.browserscan.net/bot-detection") - #txt = browser.get_text() - #browser.go_to("https://www.google.com/recaptcha/api2/demo") + # browser.go_to("https://www.browserscan.net/bot-detection") + # txt = browser.get_text() + # browser.go_to("https://www.google.com/recaptcha/api2/demo") browser.go_to("https://home.openweathermap.org/users/sign_up") inputs_visible = browser.get_form_inputs() print("inputs:", inputs_visible) - #inputs_fill = ['[q](checked)', '[q](checked)', '[user[username]](mlg)', '[user[email]](mlg.fcu@gmail.com)', '[user[password]](placeholder_P@ssw0rd123)', '[user[password_confirmation]](placeholder_P@ssw0rd123)'] - #browser.fill_form(inputs_fill) + # inputs_fill = ['[q](checked)', '[q](checked)', '[user[username]](mlg)', '[user[email]](mlg.fcu@gmail.com)', '[user[password]](placeholder_P@ssw0rd123)', '[user[password_confirmation]](placeholder_P@ssw0rd123)'] + # browser.fill_form(inputs_fill) input("press enter to exit")