feat : stealth browser with selenium_stealth for captcha

This commit is contained in:
martin legrand 2025-03-30 15:20:43 +02:00
parent 69f276955a
commit 3dbef96cf0
5 changed files with 52 additions and 16 deletions

View File

@ -7,12 +7,14 @@ echo "Detecting operating system..."
OS_TYPE=$(uname -s)
case "$OS_TYPE" in
"Linux"*)
echo "Detected Linux OS"
if [ -f "$SCRIPTS_DIR/linux_install.sh" ]; then
echo "Running Linux installation script..."
bash "$SCRIPTS_DIR/linux_install.sh"
bash -c "wget https://github.com/Fosowl/fosowl.github.io/raw/refs/heads/main/usefull/anticaptcha.crx"
bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh"
else
echo "Error: $SCRIPTS_DIR/linux_install.sh not found!"
@ -24,6 +26,7 @@ case "$OS_TYPE" in
if [ -f "$SCRIPTS_DIR/macos_install.sh" ]; then
echo "Running macOS installation script..."
bash "$SCRIPTS_DIR/macos_install.sh"
bash -c "wget https://github.com/Fosowl/fosowl.github.io/raw/refs/heads/main/usefull/anticaptcha.crx"
bash -c "cd $LLM_ROUTER_DIR && ./dl_safetensors.sh"
else
echo "Error: $SCRIPTS_DIR/macos_install.sh not found!"

View File

@ -29,6 +29,9 @@ distro>=1.7.0,<2
jiter>=0.4.0,<1
sniffio
tqdm>4
fake_useragent>=2.1.0
selenium_stealth>=1.0.6
undetected-chromedriver>=3.5.5
# for api provider
openai
# if use chinese

View File

@ -5,6 +5,9 @@ echo "Starting installation for macOS..."
# Install Python dependencies from requirements.txt
pip3 install -r requirements.txt
# make sure wget installed
brew install wget
# Install chromedriver using Homebrew
brew install --cask chromedriver

View File

@ -41,6 +41,9 @@ setup(
"anyio>=3.5.0,<5",
"distro>=1.7.0,<2",
"jiter>=0.4.0,<1",
"fake_useragent>=2.1.0",
"selenium_stealth>=1.0.6",
"undetected-chromedriver>=3.5.5",
"sniffio",
"tqdm>4"
],

View File

@ -7,17 +7,21 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import List, Tuple
from fake_useragent import UserAgent
from selenium_stealth import stealth
import undetected_chromedriver as uc
import chromedriver_autoinstaller
import time
import random
import os
import shutil
from bs4 import BeautifulSoup
import markdownify
import logging
import sys
import re
from urllib.parse import urlparse
from sources.utility import pretty_print, animate_thinking
@ -39,7 +43,8 @@ def get_chrome_path() -> str:
return path
return None
def create_driver(headless=False):
def create_driver(headless=False, stealth_mode=True) -> webdriver.Chrome:
"""Create a Chrome WebDriver with specified options."""
chrome_options = Options()
chrome_path = get_chrome_path()
@ -51,20 +56,19 @@ def create_driver(headless=False):
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-webgl")
#ua = UserAgent()
#user_agent = ua.random # NOTE sometime return wrong user agent, investigate
#chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--autoplay-policy=user-gesture-required")
chrome_options.add_argument("--mute-audio")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument('--window-size=1080,560')
security_prefs = {
"profile.default_content_setting_values.media_stream": 2,
"profile.default_content_setting_values.geolocation": 2,
"safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", security_prefs)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
try:
chrome_options.add_extension("./anticaptcha.crx")
except Exception as e:
print(f"Failed to load AntiCaptcha extension: {str(e)}")
chromedriver_path = shutil.which("chromedriver")
if not chromedriver_path:
@ -74,10 +78,29 @@ def create_driver(headless=False):
raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
service = Service(chromedriver_path)
if stealth_mode:
driver = uc.Chrome(service=service, options=chrome_options)
stealth(driver,
languages=["en-US", "en"],
vendor="Google Inc.",
platform="Win32",
webgl_vendor="Intel Inc.",
renderer="Intel Iris OpenGL Engine",
fix_hairline=True,
)
return driver
security_prefs = {
"profile.default_content_setting_values.media_stream": 2,
"profile.default_content_setting_values.geolocation": 2,
"safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", security_prefs)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
return webdriver.Chrome(service=service, options=chrome_options)
class Browser:
def __init__(self, driver, anticaptcha_install=True):
def __init__(self, driver, anticaptcha_manual_install=False):
"""Initialize the browser with optional AntiCaptcha installation."""
self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
@ -88,10 +111,11 @@ class Browser:
self.logger.info("Browser initialized successfully")
except Exception as e:
raise Exception(f"Failed to initialize browser: {str(e)}")
if anticaptcha_install:
self.load_anticatpcha()
self.driver.get("https://www.google.com")
if anticaptcha_manual_install:
self.load_anticatpcha_manually()
def load_anticatpcha(self):
def load_anticatpcha_manually(self):
print("You might want to install the AntiCaptcha extension for captchas.")
self.driver.get(self.anticaptcha)
@ -129,11 +153,11 @@ class Browser:
for element in soup(['script', 'style']):
element.decompose()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk))
text = text[:4096]
#markdown_text = markdownify.markdownify(text, heading_style="ATX")
return "[Start of page]\n" + text + "\n[End of page]"
except Exception as e: