feat : stealth browser with selenium_stealth for captcha

This commit is contained in:
martin legrand 2025-03-30 15:20:43 +02:00
parent 69f276955a
commit 3dbef96cf0
5 changed files with 52 additions and 16 deletions

View File

@ -7,12 +7,14 @@ echo "Detecting operating system..."
OS_TYPE=$(uname -s) OS_TYPE=$(uname -s)
case "$OS_TYPE" in case "$OS_TYPE" in
"Linux"*) "Linux"*)
echo "Detected Linux OS" echo "Detected Linux OS"
if [ -f "$SCRIPTS_DIR/linux_install.sh" ]; then if [ -f "$SCRIPTS_DIR/linux_install.sh" ]; then
echo "Running Linux installation script..." echo "Running Linux installation script..."
bash "$SCRIPTS_DIR/linux_install.sh" bash "$SCRIPTS_DIR/linux_install.sh"
bash -c "wget https://github.com/Fosowl/fosowl.github.io/raw/refs/heads/main/usefull/anticaptcha.crx"
bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh" bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh"
else else
echo "Error: $SCRIPTS_DIR/linux_install.sh not found!" echo "Error: $SCRIPTS_DIR/linux_install.sh not found!"
@ -24,6 +26,7 @@ case "$OS_TYPE" in
if [ -f "$SCRIPTS_DIR/macos_install.sh" ]; then if [ -f "$SCRIPTS_DIR/macos_install.sh" ]; then
echo "Running macOS installation script..." echo "Running macOS installation script..."
bash "$SCRIPTS_DIR/macos_install.sh" bash "$SCRIPTS_DIR/macos_install.sh"
bash -c "wget https://github.com/Fosowl/fosowl.github.io/raw/refs/heads/main/usefull/anticaptcha.crx"
bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh" bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh"
else else
echo "Error: $SCRIPTS_DIR/macos_install.sh not found!" echo "Error: $SCRIPTS_DIR/macos_install.sh not found!"

View File

@ -29,6 +29,9 @@ distro>=1.7.0,<2
jiter>=0.4.0,<1 jiter>=0.4.0,<1
sniffio sniffio
tqdm>4 tqdm>4
fake_useragent>=2.1.0
selenium_stealth>=1.0.6
undetected-chromedriver>=3.5.5
# for api provider # for api provider
openai openai
# if use chinese # if use chinese

View File

@ -5,6 +5,9 @@ echo "Starting installation for macOS..."
# Install Python dependencies from requirements.txt # Install Python dependencies from requirements.txt
pip3 install -r requirements.txt pip3 install -r requirements.txt
# Make sure wget is installed (needed to download anticaptcha.crx)
brew install wget
# Install chromedriver using Homebrew # Install chromedriver using Homebrew
brew install --cask chromedriver brew install --cask chromedriver

View File

@ -41,6 +41,9 @@ setup(
"anyio>=3.5.0,<5", "anyio>=3.5.0,<5",
"distro>=1.7.0,<2", "distro>=1.7.0,<2",
"jiter>=0.4.0,<1", "jiter>=0.4.0,<1",
"fake_useragent>=2.1.0",
"selenium_stealth>=1.0.6",
"undetected-chromedriver>=3.5.5",
"sniffio", "sniffio",
"tqdm>4" "tqdm>4"
], ],

View File

@ -7,17 +7,21 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import List, Tuple from typing import List, Tuple
from fake_useragent import UserAgent
from selenium_stealth import stealth
import undetected_chromedriver as uc
import chromedriver_autoinstaller import chromedriver_autoinstaller
import time import time
import random
import os import os
import shutil import shutil
from bs4 import BeautifulSoup
import markdownify import markdownify
import logging import logging
import sys import sys
import re import re
from urllib.parse import urlparse
from sources.utility import pretty_print, animate_thinking from sources.utility import pretty_print, animate_thinking
@ -39,7 +43,8 @@ def get_chrome_path() -> str:
return path return path
return None return None
def create_driver(headless=False): def create_driver(headless=False, stealth_mode=True) -> webdriver.Chrome:
"""Create a Chrome WebDriver with specified options."""
chrome_options = Options() chrome_options = Options()
chrome_path = get_chrome_path() chrome_path = get_chrome_path()
@ -51,20 +56,19 @@ def create_driver(headless=False):
chrome_options.add_argument("--headless") chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu") chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-webgl") chrome_options.add_argument("--disable-webgl")
#ua = UserAgent()
#user_agent = ua.random # NOTE: sometimes returns a wrong user agent; investigate
#chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--autoplay-policy=user-gesture-required") chrome_options.add_argument("--autoplay-policy=user-gesture-required")
chrome_options.add_argument("--mute-audio") chrome_options.add_argument("--mute-audio")
chrome_options.add_argument("--disable-notifications") chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument('--window-size=1080,560') chrome_options.add_argument('--window-size=1080,560')
security_prefs = { try:
"profile.default_content_setting_values.media_stream": 2, chrome_options.add_extension("./anticaptcha.crx")
"profile.default_content_setting_values.geolocation": 2, except Exception as e:
"safebrowsing.enabled": True, print(f"Failed to load AntiCaptcha extension: {str(e)}")
}
chrome_options.add_experimental_option("prefs", security_prefs)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chromedriver_path = shutil.which("chromedriver") chromedriver_path = shutil.which("chromedriver")
if not chromedriver_path: if not chromedriver_path:
@ -74,10 +78,29 @@ def create_driver(headless=False):
raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.") raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
service = Service(chromedriver_path) service = Service(chromedriver_path)
if stealth_mode:
driver = uc.Chrome(service=service, options=chrome_options)
stealth(driver,
languages=["en-US", "en"],
vendor="Google Inc.",
platform="Win32",
webgl_vendor="Intel Inc.",
renderer="Intel Iris OpenGL Engine",
fix_hairline=True,
)
return driver
security_prefs = {
"profile.default_content_setting_values.media_stream": 2,
"profile.default_content_setting_values.geolocation": 2,
"safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", security_prefs)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
return webdriver.Chrome(service=service, options=chrome_options) return webdriver.Chrome(service=service, options=chrome_options)
class Browser: class Browser:
def __init__(self, driver, anticaptcha_install=True): def __init__(self, driver, anticaptcha_manual_install=False):
"""Initialize the browser with optional AntiCaptcha installation.""" """Initialize the browser with optional AntiCaptcha installation."""
self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/" self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related" self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
@ -88,10 +111,11 @@ class Browser:
self.logger.info("Browser initialized successfully") self.logger.info("Browser initialized successfully")
except Exception as e: except Exception as e:
raise Exception(f"Failed to initialize browser: {str(e)}") raise Exception(f"Failed to initialize browser: {str(e)}")
if anticaptcha_install: self.driver.get("https://www.google.com")
self.load_anticatpcha() if anticaptcha_manual_install:
self.load_anticatpcha_manually()
def load_anticatpcha(self): def load_anticatpcha_manually(self):
print("You might want to install the AntiCaptcha extension for captchas.") print("You might want to install the AntiCaptcha extension for captchas.")
self.driver.get(self.anticaptcha) self.driver.get(self.anticaptcha)
@ -129,11 +153,11 @@ class Browser:
for element in soup(['script', 'style']): for element in soup(['script', 'style']):
element.decompose() element.decompose()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines()) lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk)) text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk))
text = text[:4096]
#markdown_text = markdownify.markdownify(text, heading_style="ATX") #markdown_text = markdownify.markdownify(text, heading_style="ATX")
return "[Start of page]\n" + text + "\n[End of page]" return "[Start of page]\n" + text + "\n[End of page]"
except Exception as e: except Exception as e: