fix : browser duplication, isolate driver creation

2025-07-24 10:20:13 +00:00 · 2025-03-26 14:17:52 +01:00 · 2025-03-26 14:17:52 +01:00 · 8c425f62b6
commit 8c425f62b6
parent 9080697dc0
10 changed files with 82 additions and 79 deletions
--- a/main.py
+++ b/main.py
@ -8,6 +8,7 @@ import configparser
 from sources.llm_provider import Provider
 from sources.interaction import Interaction
 from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent
 from sources.browser import Browser, create_driver
 import warnings
 warnings.filterwarnings("ignore")
@ -28,6 +29,8 @@ def main():
                                   model=config["MAIN"]["provider_model"],
                                   server_address=config["MAIN"]["provider_server_address"])
    browser = Browser(create_driver(), headless=False)
    agents = [
        CasualAgent(name=config["MAIN"]["agent_name"],
                    prompt_path="prompts/casual_agent.txt",
@ -40,16 +43,17 @@ def main():
                  provider=provider, verbose=False),
        BrowserAgent(name="Browser",
                     prompt_path="prompts/browser_agent.txt",
-                     provider=provider, verbose=False),
+                     provider=provider, verbose=False, browser=browser),
        # Planner agent is experimental, might work poorly, especially with model < 32b
        PlannerAgent(name="Planner",
                     prompt_path="prompts/planner_agent.txt",
-                     provider=provider, verbose=False)
+                     provider=provider, verbose=False, browser=browser)
    ]
-    interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'),
+    interaction = Interaction(agents,
-                                      stt_enabled=config.getboolean('MAIN', 'listen'),
+                              tts_enabled=config.getboolean('MAIN', 'speak'),
-                                      recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
+                              stt_enabled=config.getboolean('MAIN', 'listen'),
                              recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
    try:
        while interaction.is_active:
            interaction.get_user()
--- a/sources/agents/agent.py
+++ b/sources/agents/agent.py
@ -34,7 +34,8 @@ class Agent():
                       prompt_path:str,
                       provider,
                       recover_last_session=True,
-                       verbose=False) -> None:
+                       verbose=False,
                       browser=None) -> None:
        """
        Args:
            name (str): Name of the agent.
@ -42,9 +43,11 @@ class Agent():
            provider: The provider for the LLM.
            recover_last_session (bool, optional): Whether to recover the last conversation. 
            verbose (bool, optional): Enable verbose logging if True. Defaults to False.
            browser: The browser class for web navigation (only for browser agent).
        """
        self.agent_name = name
        self.browser = browser
        self.role = None
        self.type = None
        self.current_directory = os.getcwd()
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@ -9,11 +9,11 @@ from datetime import date
 from typing import List, Tuple
 class BrowserAgent(Agent):
-    def __init__(self, name, prompt_path, provider, verbose=False):
+    def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
        """
        The Browser agent is an agent that navigate the web autonomously in search of answer
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, browser)
        self.tools = {
            "web_search": searxSearch(),
        }
@ -24,7 +24,7 @@ class BrowserAgent(Agent):
            "es": "web"
        }
        self.type = "browser_agent"
-        self.browser = Browser()
+        self.browser = browser
        self.current_page = ""
        self.search_history = []
        self.navigable_links = []
--- a/sources/agents/casual_agent.py
+++ b/sources/agents/casual_agent.py
@ -11,7 +11,7 @@ class CasualAgent(Agent):
        """
        The casual agent is a special for casual talk to the user without specific tasks.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
        } # No tools for the casual agent
        self.role = {
--- a/sources/agents/code_agent.py
+++ b/sources/agents/code_agent.py
@ -12,7 +12,7 @@ class CoderAgent(Agent):
    The code agent is an agent that can write and execute code.
    """
    def __init__(self, name, prompt_path, provider, verbose=False):
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "bash": BashInterpreter(),
            "python": PyInterpreter(),
--- a/sources/agents/file_agent.py
+++ b/sources/agents/file_agent.py
@ -9,7 +9,7 @@ class FileAgent(Agent):
        """
        The file agent is a special agent for file operations.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "file_finder": FileFinder(),
            "bash": BashInterpreter()
--- a/sources/agents/planner_agent.py
+++ b/sources/agents/planner_agent.py
@ -7,19 +7,20 @@ from sources.agents.browser_agent import BrowserAgent
 from sources.tools.tools import Tools
 class PlannerAgent(Agent):
-    def __init__(self, name, prompt_path, provider, verbose=False):
+    def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
        """
        The planner agent is a special agent that divides and conquers the task.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "json": Tools()
        }
        self.tools['json'].tag = "json"
        self.browser = browser
        self.agents = {
            "coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False),
            "file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False),
-            "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False)
+            "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False, browser=browser)
        }
        self.role = {
            "en": "Research, setup and code",
--- a/sources/browser.py
+++ b/sources/browser.py
@ -21,79 +21,74 @@ from urllib.parse import urlparse
 from sources.utility import pretty_print
 def get_chrome_path() -> str:
    """Locate a Chrome/Chromium executable in the usual per-platform locations.

    The candidate list is selected from ``sys.platform`` ("win*", "darwin",
    or anything else, which falls back to the Linux paths) and scanned in
    order.

    Returns:
        The first candidate path that exists and is executable. When no
        candidate qualifies the function returns None, even though the
        annotation says ``str``.
    """
    platform_name = sys.platform
    if platform_name.startswith("win"):
        candidates = (
            "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
            # Per-user install located under %LOCALAPPDATA%
            os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe"),
        )
    elif platform_name.startswith("darwin"):
        # macOS application bundles
        candidates = (
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
        )
    else:
        # Every other platform is treated as Linux
        candidates = (
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        )
    # Lazily test each candidate; the first existing, executable one wins.
    usable = (
        candidate
        for candidate in candidates
        if os.path.exists(candidate) and os.access(candidate, os.X_OK)
    )
    return next(usable, None)
 def create_driver(headless=False):
    """Build and return a Chrome webdriver configured with restrictive defaults.

    Args:
        headless: When true, the ``--headless`` argument is added before the
            other launch arguments.

    Returns:
        The object produced by ``webdriver.Chrome(service=..., options=...)``.

    Raises:
        FileNotFoundError: If ``get_chrome_path()`` finds no Chrome binary, or
            if no chromedriver is found on PATH and the auto-installer
            returns nothing.
    """
    chrome_options = Options()
    chrome_binary = get_chrome_path()
    if not chrome_binary:
        raise FileNotFoundError("Google Chrome not found. Please install it.")
    chrome_options.binary_location = chrome_binary

    # Launch arguments are applied in this exact order.
    launch_flags = ["--headless"] if headless else []
    launch_flags += [
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--autoplay-policy=user-gesture-required",
        "--mute-audio",
        "--disable-webgl",
        "--disable-notifications",
    ]
    for flag in launch_flags:
        chrome_options.add_argument(flag)

    # Profile preferences handed to Chrome through the "prefs" experimental option.
    chrome_options.add_experimental_option(
        "prefs",
        {
            "profile.default_content_setting_values.media_stream": 2,
            "profile.default_content_setting_values.notifications": 2,
            "profile.default_content_setting_values.popups": 2,
            "profile.default_content_setting_values.geolocation": 2,
            "safebrowsing.enabled": True,
        },
    )

    # Prefer a chromedriver already on PATH; otherwise try the auto-installer.
    driver_binary = shutil.which("chromedriver") or chromedriver_autoinstaller.install()
    if not driver_binary:
        raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
    return webdriver.Chrome(service=Service(driver_binary), options=chrome_options)
 class Browser:
-    def __init__(self, headless=False, anticaptcha_install=False):
+    def __init__(self, driver, headless=False, anticaptcha_install=True):
        """Initialize the browser with optional headless mode."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
        }
        self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
        self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
        try:
-            chrome_options = Options()
+            self.driver = driver
            chrome_path = self.get_chrome_path()
            if not chrome_path:
                raise FileNotFoundError("Google Chrome not found. Please install it.")
            chrome_options.binary_location = chrome_path
            if headless:
                chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--autoplay-policy=user-gesture-required")
            chrome_options.add_argument("--mute-audio")
            chrome_options.add_argument("--disable-webgl")
            chrome_options.add_argument("--disable-notifications")
            security_prefs = {
                "profile.default_content_setting_values.media_stream": 2,  # Block webcam/mic
                "profile.default_content_setting_values.notifications": 2,  # Block notifications
                "profile.default_content_setting_values.popups": 2,  # Block pop-ups
                "profile.default_content_setting_values.geolocation": 2,  # Block geolocation
                "safebrowsing.enabled": True,  # Enable safe browsing
            }
            chrome_options.add_experimental_option("prefs", security_prefs)
            chromedriver_path = shutil.which("chromedriver") # system installed driver.
            #If not found, try auto-installing the correct version
            if not chromedriver_path:
                chromedriver_path = chromedriver_autoinstaller.install()
            if not chromedriver_path:
                raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
            service = Service(chromedriver_path)
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            self.wait = WebDriverWait(self.driver, 10)
            self.logger = logging.getLogger(__name__)
            self.logger.info("Browser initialized successfully")
        except Exception as e:
            raise Exception(f"Failed to initialize browser: {str(e)}")
-        self.load_anticatpcha()
+        if anticaptcha_install:
-            
+            self.load_anticatpcha()
    @staticmethod
    def get_chrome_path() -> str:
        if sys.platform.startswith("win"):
            paths = [
                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
                os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe")  # User install
            ]
        elif sys.platform.startswith("darwin"):  # macOS
            paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                     "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"]
        else:  # Linux
            paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"]
        for path in paths:
            if os.path.exists(path) and os.access(path, os.X_OK):  # Check if executable
                return path
        return None
    def load_anticatpcha(self):
        print("You might want to install the AntiCaptcha extension for captchas.")
--- a/sources/interaction.py
+++ b/sources/interaction.py
@ -12,7 +12,6 @@ class Interaction:
                 tts_enabled: bool = True,
                 stt_enabled: bool = True,
                 recover_last_session: bool = False):
        self.tts_enabled = tts_enabled
        self.agents = agents
        self.current_agent = None
        self.router = AgentRouter(self.agents)
--- a/sources/router.py
+++ b/sources/router.py
@ -142,6 +142,7 @@ class AgentRouter:
            ("i would like to setup a new AI project, index as mark2", "files"),
            ("Hey, can you find the old_project.zip file somewhere on my drive?", "files"),
            ("Tell me a funny story", "talk"),
            ("can you make a snake game in python", "code"),
            ("Can you locate the backup folder I created last month on my system?", "files"),
            ("Share a random fun fact about space.", "talk"),
            ("Write a script to rename all files in a directory to lowercase.", "files"),