fix : browser duplication, isolate driver creation

2025-07-22 09:20:07 +00:00 · 2025-03-26 14:17:52 +01:00 · 2025-03-26 14:17:52 +01:00 · 8c425f62b6
commit 8c425f62b6
parent 9080697dc0
10 changed files with 82 additions and 79 deletions
--- a/main.py
+++ b/main.py
@ -8,6 +8,7 @@ import configparser
 from sources.llm_provider import Provider
 from sources.interaction import Interaction
 from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent
+from sources.browser import Browser, create_driver

 import warnings
 warnings.filterwarnings("ignore")
@ -28,6 +29,8 @@ def main():
                                   model=config["MAIN"]["provider_model"],
                                   server_address=config["MAIN"]["provider_server_address"])

+    browser = Browser(create_driver(), headless=False)
+
    agents = [
        CasualAgent(name=config["MAIN"]["agent_name"],
                    prompt_path="prompts/casual_agent.txt",
@ -40,16 +43,17 @@ def main():
                  provider=provider, verbose=False),
        BrowserAgent(name="Browser",
                     prompt_path="prompts/browser_agent.txt",
-                     provider=provider, verbose=False),
+                     provider=provider, verbose=False, browser=browser),
        # Planner agent is experimental, might work poorly, especially with model < 32b
        PlannerAgent(name="Planner",
                     prompt_path="prompts/planner_agent.txt",
-                     provider=provider, verbose=False)
+                     provider=provider, verbose=False, browser=browser)
    ]

-    interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'),
-                                      stt_enabled=config.getboolean('MAIN', 'listen'),
-                                      recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
+    interaction = Interaction(agents,
+                              tts_enabled=config.getboolean('MAIN', 'speak'),
+                              stt_enabled=config.getboolean('MAIN', 'listen'),
+                              recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
    try:
        while interaction.is_active:
            interaction.get_user()
--- a/sources/agents/agent.py
+++ b/sources/agents/agent.py
@ -34,7 +34,8 @@ class Agent():
                       prompt_path:str,
                       provider,
                       recover_last_session=True,
-                       verbose=False) -> None:
+                       verbose=False,
+                       browser=None) -> None:
        """
        Args:
            name (str): Name of the agent.
@ -42,9 +43,11 @@ class Agent():
            provider: The provider for the LLM.
            recover_last_session (bool, optional): Whether to recover the last conversation. 
            verbose (bool, optional): Enable verbose logging if True. Defaults to False.
+            browser: The browser class for web navigation (only for browser agent).
        """
            
        self.agent_name = name
+        self.browser = browser
        self.role = None
        self.type = None
        self.current_directory = os.getcwd()
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@ -9,11 +9,11 @@ from datetime import date
 from typing import List, Tuple

 class BrowserAgent(Agent):
-    def __init__(self, name, prompt_path, provider, verbose=False):
+    def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
        """
        The Browser agent is an agent that navigate the web autonomously in search of answer
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, browser)
        self.tools = {
            "web_search": searxSearch(),
        }
@ -24,7 +24,7 @@ class BrowserAgent(Agent):
            "es": "web"
        }
        self.type = "browser_agent"
-        self.browser = Browser()
+        self.browser = browser
        self.current_page = ""
        self.search_history = []
        self.navigable_links = []
--- a/sources/agents/casual_agent.py
+++ b/sources/agents/casual_agent.py
@ -11,7 +11,7 @@ class CasualAgent(Agent):
        """
        The casual agent is a special for casual talk to the user without specific tasks.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
        } # No tools for the casual agent
        self.role = {
--- a/sources/agents/code_agent.py
+++ b/sources/agents/code_agent.py
@ -12,7 +12,7 @@ class CoderAgent(Agent):
    The code agent is an agent that can write and execute code.
    """
    def __init__(self, name, prompt_path, provider, verbose=False):
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "bash": BashInterpreter(),
            "python": PyInterpreter(),
--- a/sources/agents/file_agent.py
+++ b/sources/agents/file_agent.py
@ -9,7 +9,7 @@ class FileAgent(Agent):
        """
        The file agent is a special agent for file operations.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "file_finder": FileFinder(),
            "bash": BashInterpreter()
--- a/sources/agents/planner_agent.py
+++ b/sources/agents/planner_agent.py
@ -7,19 +7,20 @@ from sources.agents.browser_agent import BrowserAgent
 from sources.tools.tools import Tools

 class PlannerAgent(Agent):
-    def __init__(self, name, prompt_path, provider, verbose=False):
+    def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
        """
        The planner agent is a special agent that divides and conquers the task.
        """
-        super().__init__(name, prompt_path, provider, verbose)
+        super().__init__(name, prompt_path, provider, verbose, None)
        self.tools = {
            "json": Tools()
        }
        self.tools['json'].tag = "json"
+        self.browser = browser
        self.agents = {
            "coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False),
            "file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False),
-            "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False)
+            "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False, browser=browser)
        }
        self.role = {
            "en": "Research, setup and code",
--- a/sources/browser.py
+++ b/sources/browser.py
@ -21,80 +21,75 @@ from urllib.parse import urlparse

 from sources.utility import pretty_print

+def get_chrome_path() -> str:
+    """
+    Locate a Chrome/Chromium executable in well-known install locations.
+
+    The candidate list is chosen from sys.platform: Windows ("win*"),
+    macOS ("darwin*"), and everything else is treated as Linux.
+
+    Returns:
+        str: The first candidate path that exists and is executable.
+        None is returned when no candidate matches, despite the `-> str`
+        annotation, so callers must check the result.
+    """
+    if sys.platform.startswith("win"):
+        paths = [
+            "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
+            # If LOCALAPPDATA is unset the join below yields a relative path.
+            os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe")  # User install
+        ]
+    elif sys.platform.startswith("darwin"):  # macOS
+        paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+                 "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"]
+    else:  # Linux
+        paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"]
+
+    # Candidates are tried in list order; the first usable one wins.
+    for path in paths:
+        if os.path.exists(path) and os.access(path, os.X_OK):  # Check if executable
+            return path
+    return None
+
+def create_driver(headless=False):
+    """
+    Create a Chrome WebDriver configured with the project's default options.
+
+    Args:
+        headless (bool, optional): Add the "--headless" flag when True. Defaults to False.
+
+    Returns:
+        The driver object returned by webdriver.Chrome(service=..., options=...).
+
+    Raises:
+        FileNotFoundError: If get_chrome_path() finds no Chrome binary, or if
+            chromedriver is neither on PATH nor provided by
+            chromedriver_autoinstaller.install().
+    """
+    chrome_options = Options()
+    chrome_path = get_chrome_path()
+    
+    if not chrome_path:
+        raise FileNotFoundError("Google Chrome not found. Please install it.")
+    chrome_options.binary_location = chrome_path
+    
+    if headless:
+        chrome_options.add_argument("--headless")
+    # The flags below are applied regardless of headless mode.
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--autoplay-policy=user-gesture-required")
+    chrome_options.add_argument("--mute-audio")
+    chrome_options.add_argument("--disable-webgl")
+    chrome_options.add_argument("--disable-notifications")
+    security_prefs = {
+        # Block webcam/mic
+        "profile.default_content_setting_values.media_stream": 2,
+        # Block notifications
+        "profile.default_content_setting_values.notifications": 2,
+        # Block pop-ups
+        "profile.default_content_setting_values.popups": 2,
+        # Block geolocation
+        "profile.default_content_setting_values.geolocation": 2,
+        # Enable safe browsing
+        "safebrowsing.enabled": True,
+    }
+    chrome_options.add_experimental_option("prefs", security_prefs)
+    
+    # Prefer a system-installed chromedriver found on PATH.
+    chromedriver_path = shutil.which("chromedriver")
+    # If not found, try auto-installing the correct version.
+    if not chromedriver_path:
+        chromedriver_path = chromedriver_autoinstaller.install()
+    
+    if not chromedriver_path:
+        raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
+    
+    service = Service(chromedriver_path)
+    return webdriver.Chrome(service=service, options=chrome_options)
+
 class Browser:
-    def __init__(self, headless=False, anticaptcha_install=False):
+    def __init__(self, driver, headless=False, anticaptcha_install=True):
        """Initialize the browser with optional headless mode."""
-        self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Referer': 'https://www.google.com/',
-        }
        self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
        self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
        try:
-            chrome_options = Options()
-            chrome_path = self.get_chrome_path()
-            
-            if not chrome_path:
-                raise FileNotFoundError("Google Chrome not found. Please install it.")
-            chrome_options.binary_location = chrome_path
-            
-            if headless:
-                chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-            chrome_options.add_argument("--no-sandbox")
-            chrome_options.add_argument("--disable-dev-shm-usage")
-            chrome_options.add_argument("--autoplay-policy=user-gesture-required")
-            chrome_options.add_argument("--mute-audio")
-            chrome_options.add_argument("--disable-webgl")
-            chrome_options.add_argument("--disable-notifications")
-            security_prefs = {
-                "profile.default_content_setting_values.media_stream": 2,  # Block webcam/mic
-                "profile.default_content_setting_values.notifications": 2,  # Block notifications
-                "profile.default_content_setting_values.popups": 2,  # Block pop-ups
-                "profile.default_content_setting_values.geolocation": 2,  # Block geolocation
-                "safebrowsing.enabled": True,  # Enable safe browsing
-            }
-            chrome_options.add_experimental_option("prefs", security_prefs)
-            
-            chromedriver_path = shutil.which("chromedriver") # system installed driver.
-            
-            #If not found, try auto-installing the correct version
-            if not chromedriver_path:
-                chromedriver_path = chromedriver_autoinstaller.install()
-          
-            if not chromedriver_path:
-                raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
-                
-            service = Service(chromedriver_path)
-            self.driver = webdriver.Chrome(service=service, options=chrome_options)
+            self.driver = driver
            self.wait = WebDriverWait(self.driver, 10)
            self.logger = logging.getLogger(__name__)
            self.logger.info("Browser initialized successfully")
        except Exception as e:
            raise Exception(f"Failed to initialize browser: {str(e)}")
-        self.load_anticatpcha()
+        if anticaptcha_install:
+            self.load_anticatpcha()
            
-    @staticmethod
-    def get_chrome_path() -> str:
-        if sys.platform.startswith("win"):
-            paths = [
-                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
-                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
-                os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe")  # User install
-            ]
-        elif sys.platform.startswith("darwin"):  # macOS
-            paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
-                     "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"]
-        else:  # Linux
-            paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"]
-
-        for path in paths:
-            if os.path.exists(path) and os.access(path, os.X_OK):  # Check if executable
-                return path
-        return None
-    
    def load_anticatpcha(self):
        print("You might want to install the AntiCaptcha extension for captchas.")
        self.driver.get(self.anticaptcha)
--- a/sources/interaction.py
+++ b/sources/interaction.py
@ -12,7 +12,6 @@ class Interaction:
                 tts_enabled: bool = True,
                 stt_enabled: bool = True,
                 recover_last_session: bool = False):
-        self.tts_enabled = tts_enabled
        self.agents = agents
        self.current_agent = None
        self.router = AgentRouter(self.agents)
--- a/sources/router.py
+++ b/sources/router.py
@ -142,6 +142,7 @@ class AgentRouter:
            ("i would like to setup a new AI project, index as mark2", "files"),
            ("Hey, can you find the old_project.zip file somewhere on my drive?", "files"),
            ("Tell me a funny story", "talk"),
+            ("can you make a snake game in python", "code"),
            ("Can you locate the backup folder I created last month on my system?", "files"),
            ("Share a random fun fact about space.", "talk"),
            ("Write a script to rename all files in a directory to lowercase.", "files"),