From 8c425f62b64c8562b72e55e491787e4ba89cf5ec Mon Sep 17 00:00:00 2001 From: martin legrand Date: Wed, 26 Mar 2025 14:17:52 +0100 Subject: [PATCH] fix : browser duplication, isolate driver creation --- main.py | 14 ++-- sources/agents/agent.py | 5 +- sources/agents/browser_agent.py | 6 +- sources/agents/casual_agent.py | 2 +- sources/agents/code_agent.py | 2 +- sources/agents/file_agent.py | 2 +- sources/agents/planner_agent.py | 7 +- sources/browser.py | 121 +++++++++++++++----------------- sources/interaction.py | 1 - sources/router.py | 1 + 10 files changed, 82 insertions(+), 79 deletions(-) diff --git a/main.py b/main.py index d6442c1..7208143 100755 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ import configparser from sources.llm_provider import Provider from sources.interaction import Interaction from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent +from sources.browser import Browser, create_driver import warnings warnings.filterwarnings("ignore") @@ -28,6 +29,8 @@ def main(): model=config["MAIN"]["provider_model"], server_address=config["MAIN"]["provider_server_address"]) + browser = Browser(create_driver(), headless=False) + agents = [ CasualAgent(name=config["MAIN"]["agent_name"], prompt_path="prompts/casual_agent.txt", @@ -40,16 +43,17 @@ def main(): provider=provider, verbose=False), BrowserAgent(name="Browser", prompt_path="prompts/browser_agent.txt", - provider=provider, verbose=False), + provider=provider, verbose=False, browser=browser), # Planner agent is experimental, might work poorly, especially with model < 32b PlannerAgent(name="Planner", prompt_path="prompts/planner_agent.txt", - provider=provider, verbose=False) + provider=provider, verbose=False, browser=browser) ] - interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'), - stt_enabled=config.getboolean('MAIN', 'listen'), - recover_last_session=config.getboolean('MAIN', 'recover_last_session')) + interaction = Interaction(agents, + tts_enabled=config.getboolean('MAIN', 'speak'), + stt_enabled=config.getboolean('MAIN', 'listen'), + recover_last_session=config.getboolean('MAIN', 'recover_last_session')) try: while interaction.is_active: interaction.get_user() diff --git a/sources/agents/agent.py b/sources/agents/agent.py index af7bf4e..636d6a5 100644 --- a/sources/agents/agent.py +++ b/sources/agents/agent.py @@ -34,7 +34,8 @@ class Agent(): prompt_path:str, provider, recover_last_session=True, - verbose=False) -> None: + verbose=False, + browser=None) -> None: """ Args: name (str): Name of the agent. @@ -42,9 +43,11 @@ class Agent(): provider: The provider for the LLM. recover_last_session (bool, optional): Whether to recover the last conversation. verbose (bool, optional): Enable verbose logging if True. Defaults to False. + browser: The browser class for web navigation (only for browser agent). """ self.agent_name = name + self.browser = browser self.role = None self.type = None self.current_directory = os.getcwd() diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py index 0f46956..cec1485 100644 --- a/sources/agents/browser_agent.py +++ b/sources/agents/browser_agent.py @@ -9,11 +9,11 @@ from datetime import date from typing import List, Tuple class BrowserAgent(Agent): - def __init__(self, name, prompt_path, provider, verbose=False): + def __init__(self, name, prompt_path, provider, verbose=False, browser=None): """ The Browser agent is an agent that navigate the web autonomously in search of answer """ - super().__init__(name, prompt_path, provider, verbose) + super().__init__(name, prompt_path, provider, verbose, browser) self.tools = { "web_search": searxSearch(), } @@ -24,7 +24,7 @@ class BrowserAgent(Agent): "es": "web" } self.type = "browser_agent" - self.browser = Browser() + self.browser = browser self.current_page = "" self.search_history = [] self.navigable_links = [] diff --git a/sources/agents/casual_agent.py b/sources/agents/casual_agent.py index a9d95f7..dca9be5 100644 --- a/sources/agents/casual_agent.py +++ b/sources/agents/casual_agent.py @@ -11,7 +11,7 @@ class CasualAgent(Agent): """ The casual agent is a special for casual talk to the user without specific tasks. """ - super().__init__(name, prompt_path, provider, verbose) + super().__init__(name, prompt_path, provider, verbose, None) self.tools = { } # No tools for the casual agent self.role = { diff --git a/sources/agents/code_agent.py b/sources/agents/code_agent.py index 1d9d681..1418f00 100644 --- a/sources/agents/code_agent.py +++ b/sources/agents/code_agent.py @@ -12,7 +12,7 @@ class CoderAgent(Agent): The code agent is an agent that can write and execute code. """ def __init__(self, name, prompt_path, provider, verbose=False): - super().__init__(name, prompt_path, provider, verbose) + super().__init__(name, prompt_path, provider, verbose, None) self.tools = { "bash": BashInterpreter(), "python": PyInterpreter(), diff --git a/sources/agents/file_agent.py b/sources/agents/file_agent.py index 0b8192d..b94e288 100644 --- a/sources/agents/file_agent.py +++ b/sources/agents/file_agent.py @@ -9,7 +9,7 @@ class FileAgent(Agent): """ The file agent is a special agent for file operations. """ - super().__init__(name, prompt_path, provider, verbose) + super().__init__(name, prompt_path, provider, verbose, None) self.tools = { "file_finder": FileFinder(), "bash": BashInterpreter() diff --git a/sources/agents/planner_agent.py b/sources/agents/planner_agent.py index 7e2f072..b25e854 100644 --- a/sources/agents/planner_agent.py +++ b/sources/agents/planner_agent.py @@ -7,19 +7,20 @@ from sources.agents.browser_agent import BrowserAgent from sources.tools.tools import Tools class PlannerAgent(Agent): - def __init__(self, name, prompt_path, provider, verbose=False): + def __init__(self, name, prompt_path, provider, verbose=False, browser=None): """ The planner agent is a special agent that divides and conquers the task. """ - super().__init__(name, prompt_path, provider, verbose) + super().__init__(name, prompt_path, provider, verbose, None) self.tools = { "json": Tools() } self.tools['json'].tag = "json" + self.browser = browser self.agents = { "coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False), "file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False), - "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False) + "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False, browser=browser) } self.role = { "en": "Research, setup and code", diff --git a/sources/browser.py b/sources/browser.py index 5da5711..0e67a2d 100644 --- a/sources/browser.py +++ b/sources/browser.py @@ -21,80 +21,75 @@ from urllib.parse import urlparse from sources.utility import pretty_print +def get_chrome_path() -> str: + if sys.platform.startswith("win"): + paths = [ + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", + os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe") # User install + ] + elif sys.platform.startswith("darwin"): # macOS + paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"] + else: # Linux + paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"] + + for path in paths: + if os.path.exists(path) and os.access(path, os.X_OK): # Check if executable + return path + return None + +def create_driver(headless=False): + chrome_options = Options() + chrome_path = get_chrome_path() + + if not chrome_path: + raise FileNotFoundError("Google Chrome not found. Please install it.") + chrome_options.binary_location = chrome_path + + if headless: + chrome_options.add_argument("--headless") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--autoplay-policy=user-gesture-required") + chrome_options.add_argument("--mute-audio") + chrome_options.add_argument("--disable-webgl") + chrome_options.add_argument("--disable-notifications") + security_prefs = { + "profile.default_content_setting_values.media_stream": 2, + "profile.default_content_setting_values.notifications": 2, + "profile.default_content_setting_values.popups": 2, + "profile.default_content_setting_values.geolocation": 2, + "safebrowsing.enabled": True, + } + chrome_options.add_experimental_option("prefs", security_prefs) + + chromedriver_path = shutil.which("chromedriver") + if not chromedriver_path: + chromedriver_path = chromedriver_autoinstaller.install() + + if not chromedriver_path: + raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.") + + service = Service(chromedriver_path) + return webdriver.Chrome(service=service, options=chrome_options) + class Browser: - def __init__(self, headless=False, anticaptcha_install=False): + def __init__(self, driver, headless=False, anticaptcha_install=True): """Initialize the browser with optional headless mode.""" - self.headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://www.google.com/', - } self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/" self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related" try: - chrome_options = Options() - chrome_path = self.get_chrome_path() - - if not chrome_path: - raise FileNotFoundError("Google Chrome not found. Please install it.") - chrome_options.binary_location = chrome_path - - if headless: - chrome_options.add_argument("--headless") - chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_options.add_argument("--autoplay-policy=user-gesture-required") - chrome_options.add_argument("--mute-audio") - chrome_options.add_argument("--disable-webgl") - chrome_options.add_argument("--disable-notifications") - security_prefs = { - "profile.default_content_setting_values.media_stream": 2, # Block webcam/mic - "profile.default_content_setting_values.notifications": 2, # Block notifications - "profile.default_content_setting_values.popups": 2, # Block pop-ups - "profile.default_content_setting_values.geolocation": 2, # Block geolocation - "safebrowsing.enabled": True, # Enable safe browsing - } - chrome_options.add_experimental_option("prefs", security_prefs) - - chromedriver_path = shutil.which("chromedriver") # system installed driver. - - #If not found, try auto-installing the correct version - if not chromedriver_path: - chromedriver_path = chromedriver_autoinstaller.install() - - if not chromedriver_path: - raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.") - - service = Service(chromedriver_path) - self.driver = webdriver.Chrome(service=service, options=chrome_options) + self.driver = driver self.wait = WebDriverWait(self.driver, 10) self.logger = logging.getLogger(__name__) self.logger.info("Browser initialized successfully") except Exception as e: raise Exception(f"Failed to initialize browser: {str(e)}") - self.load_anticatpcha() + if anticaptcha_install: + self.load_anticatpcha() - @staticmethod - def get_chrome_path() -> str: - if sys.platform.startswith("win"): - paths = [ - "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", - os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe") # User install - ] - elif sys.platform.startswith("darwin"): # macOS - paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"] - else: # Linux - paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"] - - for path in paths: - if os.path.exists(path) and os.access(path, os.X_OK): # Check if executable - return path - return None - def load_anticatpcha(self): print("You might want to install the AntiCaptcha extension for captchas.") self.driver.get(self.anticaptcha) diff --git a/sources/interaction.py b/sources/interaction.py index 02b0f3e..9b0e5f6 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -12,7 +12,6 @@ class Interaction: tts_enabled: bool = True, stt_enabled: bool = True, recover_last_session: bool = False): - self.tts_enabled = tts_enabled self.agents = agents self.current_agent = None self.router = AgentRouter(self.agents) diff --git a/sources/router.py b/sources/router.py index 75d1386..ef53443 100644 --- a/sources/router.py +++ b/sources/router.py @@ -142,6 +142,7 @@ class AgentRouter: ("i would like to setup a new AI project, index as mark2", "files"), ("Hey, can you find the old_project.zip file somewhere on my drive?", "files"), ("Tell me a funny story", "talk"), + ("can you make a snake game in python", "code"), ("Can you locate the backup folder I created last month on my system?", "files"), ("Share a random fun fact about space.", "talk"), ("Write a script to rename all files in a directory to lowercase.", "files"),