fix: browser duplication, isolate driver creation

This commit is contained in:
martin legrand 2025-03-26 14:17:52 +01:00
parent 9080697dc0
commit 8c425f62b6
10 changed files with 82 additions and 79 deletions

14
main.py
View File

@ -8,6 +8,7 @@ import configparser
from sources.llm_provider import Provider
from sources.interaction import Interaction
from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent
from sources.browser import Browser, create_driver
import warnings
warnings.filterwarnings("ignore")
@ -28,6 +29,8 @@ def main():
model=config["MAIN"]["provider_model"],
server_address=config["MAIN"]["provider_server_address"])
browser = Browser(create_driver(), headless=False)
agents = [
CasualAgent(name=config["MAIN"]["agent_name"],
prompt_path="prompts/casual_agent.txt",
@ -40,16 +43,17 @@ def main():
provider=provider, verbose=False),
BrowserAgent(name="Browser",
prompt_path="prompts/browser_agent.txt",
provider=provider, verbose=False),
provider=provider, verbose=False, browser=browser),
# Planner agent is experimental, might work poorly, especially with model < 32b
PlannerAgent(name="Planner",
prompt_path="prompts/planner_agent.txt",
provider=provider, verbose=False)
provider=provider, verbose=False, browser=browser)
]
interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'),
stt_enabled=config.getboolean('MAIN', 'listen'),
recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
interaction = Interaction(agents,
tts_enabled=config.getboolean('MAIN', 'speak'),
stt_enabled=config.getboolean('MAIN', 'listen'),
recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
try:
while interaction.is_active:
interaction.get_user()

View File

@ -34,7 +34,8 @@ class Agent():
prompt_path:str,
provider,
recover_last_session=True,
verbose=False) -> None:
verbose=False,
browser=None) -> None:
"""
Args:
name (str): Name of the agent.
@ -42,9 +43,11 @@ class Agent():
provider: The provider for the LLM.
recover_last_session (bool, optional): Whether to recover the last conversation.
verbose (bool, optional): Enable verbose logging if True. Defaults to False.
browser: The browser class for web navigation (only for browser agent).
"""
self.agent_name = name
self.browser = browser
self.role = None
self.type = None
self.current_directory = os.getcwd()

View File

@ -9,11 +9,11 @@ from datetime import date
from typing import List, Tuple
class BrowserAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False):
def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
"""
The Browser agent is an agent that navigates the web autonomously in search of answers.
"""
super().__init__(name, prompt_path, provider, verbose)
super().__init__(name, prompt_path, provider, verbose, browser)
self.tools = {
"web_search": searxSearch(),
}
@ -24,7 +24,7 @@ class BrowserAgent(Agent):
"es": "web"
}
self.type = "browser_agent"
self.browser = Browser()
self.browser = browser
self.current_page = ""
self.search_history = []
self.navigable_links = []

View File

@ -11,7 +11,7 @@ class CasualAgent(Agent):
"""
The casual agent is a special agent for casual conversation with the user, without specific tasks.
"""
super().__init__(name, prompt_path, provider, verbose)
super().__init__(name, prompt_path, provider, verbose, None)
self.tools = {
} # No tools for the casual agent
self.role = {

View File

@ -12,7 +12,7 @@ class CoderAgent(Agent):
The code agent is an agent that can write and execute code.
"""
def __init__(self, name, prompt_path, provider, verbose=False):
super().__init__(name, prompt_path, provider, verbose)
super().__init__(name, prompt_path, provider, verbose, None)
self.tools = {
"bash": BashInterpreter(),
"python": PyInterpreter(),

View File

@ -9,7 +9,7 @@ class FileAgent(Agent):
"""
The file agent is a special agent for file operations.
"""
super().__init__(name, prompt_path, provider, verbose)
super().__init__(name, prompt_path, provider, verbose, None)
self.tools = {
"file_finder": FileFinder(),
"bash": BashInterpreter()

View File

@ -7,19 +7,20 @@ from sources.agents.browser_agent import BrowserAgent
from sources.tools.tools import Tools
class PlannerAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False):
def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
"""
The planner agent is a special agent that divides and conquers the task.
"""
super().__init__(name, prompt_path, provider, verbose)
super().__init__(name, prompt_path, provider, verbose, None)
self.tools = {
"json": Tools()
}
self.tools['json'].tag = "json"
self.browser = browser
self.agents = {
"coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False),
"file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False),
"web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False)
"web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False, browser=browser)
}
self.role = {
"en": "Research, setup and code",

View File

@ -21,80 +21,75 @@ from urllib.parse import urlparse
from sources.utility import pretty_print
def get_chrome_path() -> "str | None":
    """Locate an installed Chrome/Chromium executable.

    The well-known install locations for the current platform are probed
    first, in a fixed order. If none of them holds an executable file, the
    directories listed in ``PATH`` are searched as a fallback, so installs
    in non-standard locations are still found.

    Returns:
        The path of the first executable found, or ``None`` when no Chrome
        or Chromium binary could be located.
    """
    if sys.platform.startswith("win"):
        candidates = [
            "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
        ]
        local_app_data = os.environ.get("LOCALAPPDATA")
        if local_app_data:
            # Per-user install. Skipped when the variable is unset, because
            # joining onto "" would yield a path relative to the current
            # working directory.
            candidates.append(
                os.path.join(local_app_data, "Google\\Chrome\\Application\\chrome.exe")
            )
    elif sys.platform.startswith("darwin"):  # macOS
        candidates = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
        ]
    else:  # Linux
        candidates = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]

    for candidate in candidates:
        # isfile() rather than exists(): a directory can also pass an X_OK check.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate

    # Fallback: look the usual binary names up on PATH.
    for binary_name in ("google-chrome", "google-chrome-stable",
                        "chromium-browser", "chromium", "chrome"):
        located = shutil.which(binary_name)
        if located:
            return located
    return None
def create_driver(headless=False):
    """Build a Selenium Chrome WebDriver with restrictive default settings.

    Args:
        headless (bool, optional): Run Chrome without a visible window when
            True. Defaults to False.

    Returns:
        The ``webdriver.Chrome`` instance created from the assembled options.

    Raises:
        FileNotFoundError: If no Chrome binary is found, or if no
            ChromeDriver can be located or auto-installed.
    """
    chrome_options = Options()

    browser_binary = get_chrome_path()
    if not browser_binary:
        raise FileNotFoundError("Google Chrome not found. Please install it.")
    chrome_options.binary_location = browser_binary

    # NOTE(review): the flattened source does not show where the headless
    # branch ends; only --headless/--disable-gpu are assumed to be
    # conditional here -- confirm against the repository.
    if headless:
        for flag in ("--headless", "--disable-gpu"):
            chrome_options.add_argument(flag)

    always_on_flags = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--autoplay-policy=user-gesture-required",
        "--mute-audio",
        "--disable-webgl",
        "--disable-notifications",
    )
    for flag in always_on_flags:
        chrome_options.add_argument(flag)

    # Content-setting preferences passed to Chrome through the "prefs"
    # experimental option.
    chrome_options.add_experimental_option(
        "prefs",
        {
            "profile.default_content_setting_values.media_stream": 2,
            "profile.default_content_setting_values.notifications": 2,
            "profile.default_content_setting_values.popups": 2,
            "profile.default_content_setting_values.geolocation": 2,
            "safebrowsing.enabled": True,
        },
    )

    # Prefer a chromedriver already on PATH; otherwise try auto-installing one.
    driver_binary = shutil.which("chromedriver") or chromedriver_autoinstaller.install()
    if not driver_binary:
        raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")

    return webdriver.Chrome(service=Service(driver_binary), options=chrome_options)
class Browser:
def __init__(self, headless=False, anticaptcha_install=False):
def __init__(self, driver, headless=False, anticaptcha_install=True):
"""Initialize the browser with optional headless mode."""
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://www.google.com/',
}
self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
try:
chrome_options = Options()
chrome_path = self.get_chrome_path()
if not chrome_path:
raise FileNotFoundError("Google Chrome not found. Please install it.")
chrome_options.binary_location = chrome_path
if headless:
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--autoplay-policy=user-gesture-required")
chrome_options.add_argument("--mute-audio")
chrome_options.add_argument("--disable-webgl")
chrome_options.add_argument("--disable-notifications")
security_prefs = {
"profile.default_content_setting_values.media_stream": 2, # Block webcam/mic
"profile.default_content_setting_values.notifications": 2, # Block notifications
"profile.default_content_setting_values.popups": 2, # Block pop-ups
"profile.default_content_setting_values.geolocation": 2, # Block geolocation
"safebrowsing.enabled": True, # Enable safe browsing
}
chrome_options.add_experimental_option("prefs", security_prefs)
chromedriver_path = shutil.which("chromedriver") # system installed driver.
#If not found, try auto-installing the correct version
if not chromedriver_path:
chromedriver_path = chromedriver_autoinstaller.install()
if not chromedriver_path:
raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
service = Service(chromedriver_path)
self.driver = webdriver.Chrome(service=service, options=chrome_options)
self.driver = driver
self.wait = WebDriverWait(self.driver, 10)
self.logger = logging.getLogger(__name__)
self.logger.info("Browser initialized successfully")
except Exception as e:
raise Exception(f"Failed to initialize browser: {str(e)}")
self.load_anticatpcha()
if anticaptcha_install:
self.load_anticatpcha()
@staticmethod
def get_chrome_path() -> str:
if sys.platform.startswith("win"):
paths = [
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe") # User install
]
elif sys.platform.startswith("darwin"): # macOS
paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"]
else: # Linux
paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"]
for path in paths:
if os.path.exists(path) and os.access(path, os.X_OK): # Check if executable
return path
return None
def load_anticatpcha(self):
print("You might want to install the AntiCaptcha extension for captchas.")
self.driver.get(self.anticaptcha)

View File

@ -12,7 +12,6 @@ class Interaction:
tts_enabled: bool = True,
stt_enabled: bool = True,
recover_last_session: bool = False):
self.tts_enabled = tts_enabled
self.agents = agents
self.current_agent = None
self.router = AgentRouter(self.agents)

View File

@ -142,6 +142,7 @@ class AgentRouter:
("i would like to setup a new AI project, index as mark2", "files"),
("Hey, can you find the old_project.zip file somewhere on my drive?", "files"),
("Tell me a funny story", "talk"),
("can you make a snake game in python", "code"),
("Can you locate the backup folder I created last month on my system?", "files"),
("Share a random fun fact about space.", "talk"),
("Write a script to rename all files in a directory to lowercase.", "files"),