From 68bab4ecac3000dd22efae6892f2f521b2b9d56b Mon Sep 17 00:00:00 2001 From: martin legrand Date: Mon, 10 Mar 2025 21:05:10 +0100 Subject: [PATCH] Feat : browser v0 --- main.py | 6 +- requirements.txt | 1 + setup.py | 1 + sources/agents/__init__.py | 3 +- sources/agents/browser_agent.py | 23 +++++ sources/browser.py | 157 ++++++++++++++++++++++++++++++++ sources/speech_to_text.py | 2 +- 7 files changed, 190 insertions(+), 3 deletions(-) create mode 100644 sources/agents/browser_agent.py create mode 100644 sources/browser.py diff --git a/main.py b/main.py index 5ea2c61..3f2d1a6 100755 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import configparser from sources.llm_provider import Provider from sources.interaction import Interaction -from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent +from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent import warnings warnings.filterwarnings("ignore") @@ -44,6 +44,10 @@ def main(): PlannerAgent(model=config["MAIN"]["provider_model"], name="Planner", prompt_path="prompts/planner_agent.txt", + provider=provider), + BrowserAgent(model=config["MAIN"]["provider_model"], + name="Browser", + prompt_path="prompts/browser_agent.txt", provider=provider) ] diff --git a/requirements.txt b/requirements.txt index e122ccd..361411b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ soundfile==0.13.1 protobuf==3.20.3 termcolor==2.5.0 gliclass==0.1.8 +huggingface-hub==0.26.3 # if use chinese ordered_set pypinyin diff --git a/setup.py b/setup.py index 1c346cc..8727892 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ setup( "protobuf==3.20.3", "termcolor==2.5.0", "gliclass==0.1.8", + "huggingface-hub==0.26.3" ], extras_require={ "chinese": [ diff --git a/sources/agents/__init__.py b/sources/agents/__init__.py index 4949eb8..8a11dc0 100644 --- a/sources/agents/__init__.py +++ b/sources/agents/__init__.py @@ -4,5 +4,6 @@ from .code_agent import CoderAgent from .casual_agent import CasualAgent from .file_agent import FileAgent from .planner_agent import PlannerAgent +from .browser_agent import BrowserAgent -__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent"] +__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent", "BrowserAgent"] diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py new file mode 100644 index 0000000..ef4add5 --- /dev/null +++ b/sources/agents/browser_agent.py @@ -0,0 +1,23 @@ + +from sources.utility import pretty_print, animate_thinking +from sources.agents.agent import Agent +from sources.tools.webSearch import webSearch +from sources.browser import Browser +class BrowserAgent(Agent): + def __init__(self, model, name, prompt_path, provider): + """ + The casual agent is a special for casual talk to the user without specific tasks. + """ + super().__init__(model, name, prompt_path, provider) + self.tools = { + "web_search": webSearch(), + } + self.role = "deep research and web search" + self.browser = Browser() + self.browser.goTo("https://github.com/") + + def process(self, prompt, speech_module) -> str: + raise NotImplementedError("Browser agent is not implemented yet") + +if __name__ == "__main__": + browser = Browser() \ No newline at end of file diff --git a/sources/browser.py b/sources/browser.py new file mode 100644 index 0000000..bd0e128 --- /dev/null +++ b/sources/browser.py @@ -0,0 +1,157 @@ +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import TimeoutException, WebDriverException +import time +from bs4 import BeautifulSoup +import markdownify +import logging + +class Browser: + def __init__(self, headless=True): + """Initialize the browser with optional headless mode.""" + try: + chrome_options = Options() + if headless: + chrome_options.add_argument("--headless") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + self.driver = webdriver.Chrome(options=chrome_options) + self.wait = WebDriverWait(self.driver, 10) + self.logger = logging.getLogger(__name__) + self.logger.info("Browser initialized successfully") + + except Exception as e: + raise Exception(f"Failed to initialize browser: {str(e)}") + + def goTo(self, url): + """Navigate to a specified URL.""" + try: + self.driver.get(url) + time.sleep(2) # Wait for page to load + self.logger.info(f"Navigated to: {url}") + return True + except WebDriverException as e: + self.logger.error(f"Error navigating to {url}: {str(e)}") + return False + + def getText(self): + """Get page text and convert it to README (Markdown) format.""" + try: + soup = BeautifulSoup(self.driver.page_source, 'html.parser') + + for element in soup(['script', 'style']): + element.decompose() + + text = soup.get_text() + + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = "\n".join(chunk for chunk in chunks if chunk) + + markdown_text = markdownify.markdownify(text, heading_style="ATX") + + return markdown_text + except Exception as e: + self.logger.error(f"Error getting text: {str(e)}") + return None + + def getNavigable(self): + """Get all navigable links on the current page.""" + try: + links = [] + elements = self.driver.find_elements(By.TAG_NAME, "a") + + for element in elements: + href = element.get_attribute("href") + if href and href.startswith(("http", "https")): + links.append({ + "url": href, + "text": element.text.strip(), + "is_displayed": element.is_displayed() + }) + + self.logger.info(f"Found {len(links)} navigable links") + return links + except Exception as e: + self.logger.error(f"Error getting navigable links: {str(e)}") + return [] + + def clickElement(self, xpath): + """Click an element specified by xpath.""" + try: + element = self.wait.until( + EC.element_to_be_clickable((By.XPATH, xpath)) + ) + element.click() + time.sleep(2) # Wait for action to complete + return True + except TimeoutException: + self.logger.error(f"Element not found or not clickable: {xpath}") + return False + + def getCurrentUrl(self): + """Get the current URL of the page.""" + return self.driver.current_url + + def getPageTitle(self): + """Get the title of the current page.""" + return self.driver.title + + def scrollToBottom(self): + """Scroll to the bottom of the page.""" + try: + self.driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight);" + ) + time.sleep(1) # Wait for scroll to complete + return True + except Exception as e: + self.logger.error(f"Error scrolling: {str(e)}") + return False + + def takeScreenshot(self, filename): + """Take a screenshot of the current page.""" + try: + self.driver.save_screenshot(filename) + self.logger.info(f"Screenshot saved as {filename}") + return True + except Exception as e: + self.logger.error(f"Error taking screenshot: {str(e)}") + return False + + def close(self): + """Close the browser.""" + try: + self.driver.quit() + self.logger.info("Browser closed") + except Exception as e: + self.logger.error(f"Error closing browser: {str(e)}") + + def __del__(self): + """Destructor to ensure browser is closed.""" + self.close() + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + browser = Browser(headless=False) + + try: + browser.goTo("https://github.com/geohot") + text = browser.getText() + print("Page Text in Markdown:") + print(text) + links = browser.getNavigable() + print("\nNavigable Links:") + for link in links[:50]: + print(f"Text: {link['text']}, URL: {link['url']}") + + browser.takeScreenshot("example.png") + + finally: + browser.close() \ No newline at end of file diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index b9b9983..47416f2 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -115,7 +115,7 @@ class Transcript: audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) result = self.pipe(audio_data) return self.remove_hallucinations(result["text"]) - + class AudioTranscriber: """ AudioTranscriber is a class that transcribes audio from the audio queue and adds it to the transcript.