fix : browser duplication, isolate driver creation

This commit is contained in:
martin legrand 2025-03-26 14:17:52 +01:00
parent 9080697dc0
commit 8c425f62b6
10 changed files with 82 additions and 79 deletions

14
main.py
View File

@ -8,6 +8,7 @@ import configparser
from sources.llm_provider import Provider from sources.llm_provider import Provider
from sources.interaction import Interaction from sources.interaction import Interaction
from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent
from sources.browser import Browser, create_driver
import warnings import warnings
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
@ -28,6 +29,8 @@ def main():
model=config["MAIN"]["provider_model"], model=config["MAIN"]["provider_model"],
server_address=config["MAIN"]["provider_server_address"]) server_address=config["MAIN"]["provider_server_address"])
browser = Browser(create_driver(), headless=False)
agents = [ agents = [
CasualAgent(name=config["MAIN"]["agent_name"], CasualAgent(name=config["MAIN"]["agent_name"],
prompt_path="prompts/casual_agent.txt", prompt_path="prompts/casual_agent.txt",
@ -40,16 +43,17 @@ def main():
provider=provider, verbose=False), provider=provider, verbose=False),
BrowserAgent(name="Browser", BrowserAgent(name="Browser",
prompt_path="prompts/browser_agent.txt", prompt_path="prompts/browser_agent.txt",
provider=provider, verbose=False), provider=provider, verbose=False, browser=browser),
# Planner agent is experimental, might work poorly, especially with model < 32b # Planner agent is experimental, might work poorly, especially with model < 32b
PlannerAgent(name="Planner", PlannerAgent(name="Planner",
prompt_path="prompts/planner_agent.txt", prompt_path="prompts/planner_agent.txt",
provider=provider, verbose=False) provider=provider, verbose=False, browser=browser)
] ]
interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'), interaction = Interaction(agents,
stt_enabled=config.getboolean('MAIN', 'listen'), tts_enabled=config.getboolean('MAIN', 'speak'),
recover_last_session=config.getboolean('MAIN', 'recover_last_session')) stt_enabled=config.getboolean('MAIN', 'listen'),
recover_last_session=config.getboolean('MAIN', 'recover_last_session'))
try: try:
while interaction.is_active: while interaction.is_active:
interaction.get_user() interaction.get_user()

View File

@ -34,7 +34,8 @@ class Agent():
prompt_path:str, prompt_path:str,
provider, provider,
recover_last_session=True, recover_last_session=True,
verbose=False) -> None: verbose=False,
browser=None) -> None:
""" """
Args: Args:
name (str): Name of the agent. name (str): Name of the agent.
@ -42,9 +43,11 @@ class Agent():
provider: The provider for the LLM. provider: The provider for the LLM.
recover_last_session (bool, optional): Whether to recover the last conversation. recover_last_session (bool, optional): Whether to recover the last conversation.
verbose (bool, optional): Enable verbose logging if True. Defaults to False. verbose (bool, optional): Enable verbose logging if True. Defaults to False.
browser: The browser class for web navigation (only for browser agent).
""" """
self.agent_name = name self.agent_name = name
self.browser = browser
self.role = None self.role = None
self.type = None self.type = None
self.current_directory = os.getcwd() self.current_directory = os.getcwd()

View File

@ -9,11 +9,11 @@ from datetime import date
from typing import List, Tuple from typing import List, Tuple
class BrowserAgent(Agent): class BrowserAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False): def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
""" """
The Browser agent is an agent that navigate the web autonomously in search of answer The Browser agent is an agent that navigate the web autonomously in search of answer
""" """
super().__init__(name, prompt_path, provider, verbose) super().__init__(name, prompt_path, provider, verbose, browser)
self.tools = { self.tools = {
"web_search": searxSearch(), "web_search": searxSearch(),
} }
@ -24,7 +24,7 @@ class BrowserAgent(Agent):
"es": "web" "es": "web"
} }
self.type = "browser_agent" self.type = "browser_agent"
self.browser = Browser() self.browser = browser
self.current_page = "" self.current_page = ""
self.search_history = [] self.search_history = []
self.navigable_links = [] self.navigable_links = []

View File

@ -11,7 +11,7 @@ class CasualAgent(Agent):
""" """
The casual agent is a special for casual talk to the user without specific tasks. The casual agent is a special for casual talk to the user without specific tasks.
""" """
super().__init__(name, prompt_path, provider, verbose) super().__init__(name, prompt_path, provider, verbose, None)
self.tools = { self.tools = {
} # No tools for the casual agent } # No tools for the casual agent
self.role = { self.role = {

View File

@ -12,7 +12,7 @@ class CoderAgent(Agent):
The code agent is an agent that can write and execute code. The code agent is an agent that can write and execute code.
""" """
def __init__(self, name, prompt_path, provider, verbose=False): def __init__(self, name, prompt_path, provider, verbose=False):
super().__init__(name, prompt_path, provider, verbose) super().__init__(name, prompt_path, provider, verbose, None)
self.tools = { self.tools = {
"bash": BashInterpreter(), "bash": BashInterpreter(),
"python": PyInterpreter(), "python": PyInterpreter(),

View File

@ -9,7 +9,7 @@ class FileAgent(Agent):
""" """
The file agent is a special agent for file operations. The file agent is a special agent for file operations.
""" """
super().__init__(name, prompt_path, provider, verbose) super().__init__(name, prompt_path, provider, verbose, None)
self.tools = { self.tools = {
"file_finder": FileFinder(), "file_finder": FileFinder(),
"bash": BashInterpreter() "bash": BashInterpreter()

View File

@ -7,19 +7,20 @@ from sources.agents.browser_agent import BrowserAgent
from sources.tools.tools import Tools from sources.tools.tools import Tools
class PlannerAgent(Agent): class PlannerAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False): def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
""" """
The planner agent is a special agent that divides and conquers the task. The planner agent is a special agent that divides and conquers the task.
""" """
super().__init__(name, prompt_path, provider, verbose) super().__init__(name, prompt_path, provider, verbose, None)
self.tools = { self.tools = {
"json": Tools() "json": Tools()
} }
self.tools['json'].tag = "json" self.tools['json'].tag = "json"
self.browser = browser
self.agents = { self.agents = {
"coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False), "coder": CoderAgent(name, "prompts/coder_agent.txt", provider, verbose=False),
"file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False), "file": FileAgent(name, "prompts/file_agent.txt", provider, verbose=False),
"web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False) "web": BrowserAgent(name, "prompts/browser_agent.txt", provider, verbose=False, browser=browser)
} }
self.role = { self.role = {
"en": "Research, setup and code", "en": "Research, setup and code",

View File

@ -21,79 +21,74 @@ from urllib.parse import urlparse
from sources.utility import pretty_print from sources.utility import pretty_print
def get_chrome_path() -> "str | None":
    """Locate an installed Chrome/Chromium executable for the current platform.

    A fixed list of well-known install locations is probed; the list is
    selected from ``sys.platform`` (Windows, macOS, anything else is treated
    as Linux).

    Returns:
        The first candidate path that exists and is executable, or ``None``
        when no candidate qualifies.
    """
    if sys.platform.startswith("win"):
        candidates = [
            "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
        ]
        # Per-user install. Skipped when LOCALAPPDATA is unset: joining onto an
        # empty string would yield a relative path that gets resolved against
        # the current working directory.
        local_app_data = os.environ.get("LOCALAPPDATA")
        if local_app_data:
            candidates.append(
                os.path.join(local_app_data, "Google\\Chrome\\Application\\chrome.exe")
            )
    elif sys.platform.startswith("darwin"):  # macOS
        candidates = [
            "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
        ]
    else:  # Linux
        candidates = [
            "/usr/bin/google-chrome",
            "/usr/bin/chromium-browser",
            "/usr/bin/chromium",
        ]
    for candidate in candidates:
        # The file must both exist and be executable to count as installed.
        if os.path.exists(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
def create_driver(headless=False):
    """Build a Selenium Chrome WebDriver configured with restrictive options.

    Args:
        headless: When true, Chrome is launched with ``--headless``.

    Returns:
        The ``webdriver.Chrome`` instance created from the located Chrome
        binary and ChromeDriver executable.

    Raises:
        FileNotFoundError: If no Chrome binary is found, or if ChromeDriver is
            neither on PATH nor obtainable through chromedriver_autoinstaller.
    """
    chrome_options = Options()
    browser_binary = get_chrome_path()
    if not browser_binary:
        raise FileNotFoundError("Google Chrome not found. Please install it.")
    chrome_options.binary_location = browser_binary

    # NOTE(review): the original indentation was lost, so the extent of the
    # `if headless:` body is inferred. It is assumed to cover --headless and
    # --disable-gpu only; confirm against the repository.
    if headless:
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")

    # Flags applied regardless of headless mode, in the original order.
    for flag in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--autoplay-policy=user-gesture-required",
        "--mute-audio",
        "--disable-webgl",
        "--disable-notifications",
    ):
        chrome_options.add_argument(flag)

    chrome_options.add_experimental_option(
        "prefs",
        {
            "profile.default_content_setting_values.media_stream": 2,  # Block webcam/mic
            "profile.default_content_setting_values.notifications": 2,  # Block notifications
            "profile.default_content_setting_values.popups": 2,  # Block pop-ups
            "profile.default_content_setting_values.geolocation": 2,  # Block geolocation
            "safebrowsing.enabled": True,  # Enable safe browsing
        },
    )

    # Prefer a system-installed driver; otherwise try auto-installing one.
    driver_binary = shutil.which("chromedriver") or chromedriver_autoinstaller.install()
    if not driver_binary:
        raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")

    return webdriver.Chrome(service=Service(driver_binary), options=chrome_options)
class Browser: class Browser:
def __init__(self, headless=False, anticaptcha_install=False): def __init__(self, driver, headless=False, anticaptcha_install=True):
"""Initialize the browser with optional headless mode.""" """Initialize the browser with optional headless mode."""
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Referer': 'https://www.google.com/',
}
self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/" self.js_scripts_folder = "./sources/web_scripts/" if not __name__ == "__main__" else "./web_scripts/"
self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related" self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
try: try:
chrome_options = Options() self.driver = driver
chrome_path = self.get_chrome_path()
if not chrome_path:
raise FileNotFoundError("Google Chrome not found. Please install it.")
chrome_options.binary_location = chrome_path
if headless:
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--autoplay-policy=user-gesture-required")
chrome_options.add_argument("--mute-audio")
chrome_options.add_argument("--disable-webgl")
chrome_options.add_argument("--disable-notifications")
security_prefs = {
"profile.default_content_setting_values.media_stream": 2, # Block webcam/mic
"profile.default_content_setting_values.notifications": 2, # Block notifications
"profile.default_content_setting_values.popups": 2, # Block pop-ups
"profile.default_content_setting_values.geolocation": 2, # Block geolocation
"safebrowsing.enabled": True, # Enable safe browsing
}
chrome_options.add_experimental_option("prefs", security_prefs)
chromedriver_path = shutil.which("chromedriver") # system installed driver.
#If not found, try auto-installing the correct version
if not chromedriver_path:
chromedriver_path = chromedriver_autoinstaller.install()
if not chromedriver_path:
raise FileNotFoundError("ChromeDriver not found. Please install it or add it to your PATH.")
service = Service(chromedriver_path)
self.driver = webdriver.Chrome(service=service, options=chrome_options)
self.wait = WebDriverWait(self.driver, 10) self.wait = WebDriverWait(self.driver, 10)
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self.logger.info("Browser initialized successfully") self.logger.info("Browser initialized successfully")
except Exception as e: except Exception as e:
raise Exception(f"Failed to initialize browser: {str(e)}") raise Exception(f"Failed to initialize browser: {str(e)}")
self.load_anticatpcha() if anticaptcha_install:
self.load_anticatpcha()
@staticmethod
def get_chrome_path() -> str:
if sys.platform.startswith("win"):
paths = [
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
os.path.join(os.environ.get("LOCALAPPDATA", ""), "Google\\Chrome\\Application\\chrome.exe") # User install
]
elif sys.platform.startswith("darwin"): # macOS
paths = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta"]
else: # Linux
paths = ["/usr/bin/google-chrome", "/usr/bin/chromium-browser", "/usr/bin/chromium"]
for path in paths:
if os.path.exists(path) and os.access(path, os.X_OK): # Check if executable
return path
return None
def load_anticatpcha(self): def load_anticatpcha(self):
print("You might want to install the AntiCaptcha extension for captchas.") print("You might want to install the AntiCaptcha extension for captchas.")

View File

@ -12,7 +12,6 @@ class Interaction:
tts_enabled: bool = True, tts_enabled: bool = True,
stt_enabled: bool = True, stt_enabled: bool = True,
recover_last_session: bool = False): recover_last_session: bool = False):
self.tts_enabled = tts_enabled
self.agents = agents self.agents = agents
self.current_agent = None self.current_agent = None
self.router = AgentRouter(self.agents) self.router = AgentRouter(self.agents)

View File

@ -142,6 +142,7 @@ class AgentRouter:
("i would like to setup a new AI project, index as mark2", "files"), ("i would like to setup a new AI project, index as mark2", "files"),
("Hey, can you find the old_project.zip file somewhere on my drive?", "files"), ("Hey, can you find the old_project.zip file somewhere on my drive?", "files"),
("Tell me a funny story", "talk"), ("Tell me a funny story", "talk"),
("can you make a snake game in python", "code"),
("Can you locate the backup folder I created last month on my system?", "files"), ("Can you locate the backup folder I created last month on my system?", "files"),
("Share a random fun fact about space.", "talk"), ("Share a random fun fact about space.", "talk"),
("Write a script to rename all files in a directory to lowercase.", "files"), ("Write a script to rename all files in a directory to lowercase.", "files"),