Merge pull request #19 from Fosowl/dev

Merge dev, browsing abilities with selenium & installation scripts
2025-07-22 09:20:07 +00:00 · 2025-03-13 17:22:48 +01:00 · 2025-03-13 17:22:48 +01:00 · 81e9ab9eb0
commit 81e9ab9eb0
parent e05c57043d efe27fc079
19 changed files with 447 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,6 +3,7 @@ config.ini
 *.egg-info
 experimental/
 conversations/
+agentic_env/*
 .env
 */.env

--- a/README.md
+++ b/README.md
@ -31,7 +31,14 @@

 - **Memory**: Remembers what’s useful, your preferences and past sessions conversation.

- **Web Browsing**: Autonomous web navigation is underway. (See it on browser branch)
+- **Web Browsing**: Autonomous web navigation is underway.
+
+
+### Searching the web with agenticSeek :
+
+![alt text](./media/exemples/search_politics.png)
+
+*See media/exemples for other use case screenshots.*

 ---

@ -48,15 +55,27 @@ cd agenticSeek

 ```sh
 python3 -m venv agentic_seek_env
-source agentic_seek_env/bin/activate      # On Windows: agentic_seek_env\Scripts\activate
+source agentic_seek_env/bin/activate     
+# On Windows: agentic_seek_env\Scripts\activate
 ```

-### 3️⃣ **Install Dependencies**
+### 3️⃣ **Install package**
+
+**Automatic Installation:**
+
+```sh
+./install.sh
+```
+
+**Manually:**

 ```sh
 pip3 install -r requirements.txt
+# or
+python3 setup.py install
 ```

+
 ## Run locally on your machine

 **We recommend using at least Deepseek 14B; smaller models struggle with tool use and quickly forget the context.**
@ -80,6 +99,8 @@ ollama serve

 Change the config.ini file to set the provider_name to `ollama` and provider_model to `deepseek-r1:7b`

+NOTE: `deepseek-r1:7b` is an example; use a bigger model if your hardware allows it.
+
 ```sh
 [MAIN]
 is_local = True
--- a/install.sh
+++ b/install.sh
@ -0,0 +1,47 @@
+#!/bin/bash
+
+SCRIPTS_DIR="scripts"
+
+echo "Detecting operating system..."
+
+OS_TYPE=$(uname -s)
+
+case "$OS_TYPE" in
+    "Linux"*)
+        echo "Detected Linux OS"
+        if [ -f "$SCRIPTS_DIR/linux_install.sh" ]; then
+            echo "Running Linux installation script..."
+            bash "$SCRIPTS_DIR/linux_install.sh"
+        else
+            echo "Error: $SCRIPTS_DIR/linux_install.sh not found!"
+            exit 1
+        fi
+        ;;
+    "Darwin"*)
+        echo "Detected macOS"
+        if [ -f "$SCRIPTS_DIR/macos_install.sh" ]; then
+            echo "Running macOS installation script..."
+            bash "$SCRIPTS_DIR/macos_install.sh"
+        else
+            echo "Error: $SCRIPTS_DIR/macos_install.sh not found!"
+            exit 1
+        fi
+        ;;
+    "MINGW"* | "MSYS"* | "CYGWIN"*)
+        echo "Detected Windows (via Bash-like environment)"
+        if [ -f "$SCRIPTS_DIR/windows_install.sh" ]; then
+            echo "Running Windows installation script..."
+            bash "$SCRIPTS_DIR/windows_install.sh"
+        else
+            echo "Error: $SCRIPTS_DIR/windows_install.sh not found!"
+            exit 1
+        fi
+        ;;
+    *)
+        echo "Unsupported OS detected: $OS_TYPE"
+        echo "This script supports Linux, macOS, and Windows (via Bash-compatible environments)."
+        exit 1
+        ;;
+esac
+
+echo "Installation process finished!"
--- a/main.py
+++ b/main.py
@ -7,7 +7,7 @@ import configparser

 from sources.llm_provider import Provider
 from sources.interaction import Interaction
-from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent
+from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent

 import warnings
 warnings.filterwarnings("ignore")
@ -44,6 +44,10 @@ def main():
        PlannerAgent(model=config["MAIN"]["provider_model"],
                       name="Planner",
                       prompt_path="prompts/planner_agent.txt",
+                       provider=provider),
+        BrowserAgent(model=config["MAIN"]["provider_model"],
+                       name="Browser",
+                       prompt_path="prompts/browser_agent.txt",
                       provider=provider)
    ]

--- a/media/exemples/basic_web_search.png
+++ b/media/exemples/basic_web_search.png
--- a/media/exemples/search_jobs.png
+++ b/media/exemples/search_jobs.png
--- a/media/exemples/search_politics.png
+++ b/media/exemples/search_politics.png
--- a/prompts/browser_agent.txt
+++ b/prompts/browser_agent.txt
@ -0,0 +1,21 @@
+You are an internet ai that can browse the web for information.
+In fact you are embedded in a browser with selenium.
+
+If you need to conduct a web search, you can use the following tool:
+- web_search: to search the web for information
+
+This is how you can use the web_search tool:
+```web_search
+<query>
+```
+
+This will provide you with a list of links that you can navigate to.
+You can navigate to a specific link by typing the link. For example, If you say:
+"I want to navigate to https://www.google.com"
+
+You will navigate to https://www.google.com
+Any link that you type will be opened in a new tab.
+
+If you want to exit the browser, you can say:
+"REQUEST_EXIT"
+Only exit the browser if you are done browsing.
--- a/requirements.txt
+++ b/requirements.txt
@ -13,10 +13,18 @@ flask==3.1.0
 soundfile==0.13.1
 protobuf==3.20.3
 termcolor==2.5.0
-ipython==9.0.2
+ipython==8.34.0
 gliclass==0.1.8
 pyaudio==0.2.14
 librosa==0.10.2.post1
+selenium==4.29.0
+markdownify==1.1.0
+httpx>=0.27,<0.29
+anyio>=3.5.0,<5
+distro>=1.7.0,<2
+jiter>=0.4.0,<1
+sniffio
+tqdm>4
 # if use chinese
 ordered_set
 pypinyin
--- a/scripts/linux_install.sh
+++ b/scripts/linux_install.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+echo "Starting installation for Linux..."
+
+# Update package list
+sudo apt-get update
+
+# Install Python dependencies from requirements.txt
+pip3 install -r requirements.txt
+
+# Install Selenium for chromedriver
+pip3 install selenium
+
+# Install portaudio for pyAudio
+sudo apt-get install -y portaudio19-dev python3-dev
+
+echo "Installation complete for Linux!"
--- a/scripts/macos_install.sh
+++ b/scripts/macos_install.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+echo "Starting installation for macOS..."
+
+# Install portaudio for pyAudio using Homebrew. This must happen before the
+# pip step below, because requirements.txt pins pyaudio.
+brew install portaudio
+
+# Install chromedriver using Homebrew
+brew install --cask chromedriver
+
+# Install Python dependencies from requirements.txt
+pip3 install -r requirements.txt
+
+# Install Selenium
+pip3 install selenium
+
+echo "Installation complete for macOS!"
--- a/scripts/windows_install.sh
+++ b/scripts/windows_install.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+echo "Starting installation for Windows..."
+
+# Python packages: everything pinned in requirements.txt, then Selenium.
+pip3 install -r requirements.txt
+pip3 install selenium
+
+# The remaining steps cannot be automated here; print them for the user,
+# one message per line.
+printf '%s\n' \
+    "Note: pyAudio installation may require additional steps on Windows." \
+    "Please install portaudio manually (e.g., via vcpkg or prebuilt binaries) and then run: pip3 install pyaudio" \
+    "Also, download and install chromedriver manually from: https://sites.google.com/chromium.org/driver/getting-started" \
+    "Place chromedriver in a directory included in your PATH." \
+    "Installation partially complete for Windows. Follow manual steps above."
--- a/setup.py
+++ b/setup.py
@ -30,9 +30,16 @@ setup(
        "protobuf==3.20.3",
        "termcolor==2.5.0",
        "gliclass==0.1.8",
-        "ipython==7.16.1",
-        "pyaudio-0.2.14",
-        "librosa==0.10.2.post1"
+        "ipython==8.34.0",
+        "librosa==0.10.2.post1",
+        "selenium==4.29.0",
+        "markdownify==1.1.0",
+        "httpx>=0.27,<0.29"
+        "anyio>=3.5.0,<5"
+        "distro>=1.7.0,<2"
+        "jiter>=0.4.0,<1"
+        "sniffio"
+        "tqdm>4"
    ],
    extras_require={
        "chinese": [
--- a/sources/agents/init.py
+++ b/sources/agents/init.py
@ -4,5 +4,6 @@ from .code_agent import CoderAgent
 from .casual_agent import CasualAgent
 from .file_agent import FileAgent
 from .planner_agent import PlannerAgent
+from .browser_agent import BrowserAgent

-__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent"]
+__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent", "BrowserAgent"]
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@ -0,0 +1,102 @@
+import re
+import time
+
+from sources.utility import pretty_print, animate_thinking
+from sources.agents.agent import Agent
+from sources.tools.webSearch import webSearch
+from sources.browser import Browser
+class BrowserAgent(Agent):
+    def __init__(self, model, name, prompt_path, provider):
+        """
+        The casual agent is a special for casual talk to the user without specific tasks.
+        """
+        super().__init__(model, name, prompt_path, provider)
+        self.tools = {
+            "web_search": webSearch(),
+        }
+        self.role = "deep research and web search"
+        self.browser = Browser()
+        self.browser.goTo("https://github.com/")
+        self.search_history = []
+    
+    def make_init_prompt(self, user_prompt: str, search_result: str):
+        return f"""
+        Based on the search result:
+        {search_result}
+        Start browsing and find the information the user want.
+        User: {user_prompt}
+        You must choose a link to navigate to. Say i want to navigate to a <link>.
+        """
+    
+    def extract_links(self, search_result: str):
+        return re.findall(r'https?://[^\s]+', search_result)
+    
+    def make_navigation_prompt(self, user_prompt: str, page_text: str, navigable_links: list):
+        remaining_links = "\n".join([f"[{i}] {link}" for i, link in enumerate(navigable_links) if link not in self.search_history])
+        return f"""
+        \nYou are browsing the web. Not the user, you are the browser.
+
+        Page content:
+        {page_text}
+
+        Navigable links:
+        {remaining_links}
+
+
+        You must choose a link to navigate to or do a new search.
+        Remember, you seek the information the user want.
+        The user query was : {user_prompt}
+        If you want to do a new search, use the "web_search" tool.
+        Exemple:
+        ```web_search
+        weather in tokyo
+        ```
+        If you have an answer and want to exit the browser, please say "REQUEST_EXIT".
+        If you don't choose a link or do a new search I will cut my fucking arm off.
+        """
+    
+    def clean_links(self, links: list):
+        links_clean = []
+        for link in links:
+            if link[-1] == '.':
+                links_clean.append(link[:-1])
+            else:
+                links_clean.append(link)
+        return links_clean
+    
+    def process(self, prompt, speech_module) -> str:
+        complete = False
+
+        animate_thinking(f"Searching...", color="status")
+        search_result = self.tools["web_search"].execute([prompt], False)
+        user_prompt = self.make_init_prompt(prompt, search_result)
+        prompt = user_prompt
+        while not complete:
+            animate_thinking("Thinking...", color="status")
+            self.memory.push('user', user_prompt)
+            answer, reasoning = self.llm_request(prompt)
+            pretty_print("-"*100)
+            pretty_print(answer, color="output")
+            pretty_print("-"*100)
+            if "REQUEST_EXIT" in answer:
+                complete = True
+                break
+            links = self.extract_links(answer)
+            links_clean = self.clean_links(links)
+            if len(links_clean) == 0:
+                prompt = f"Please choose a link to navigate to or do a new search. Links found:\n{links_clean}"
+                pretty_print("No links found, doing a new search.", color="warning")
+                continue
+            animate_thinking(f"Navigating to {links[0]}", color="status")
+            speech_module.speak(f"Navigating to {links[0]}")
+            self.browser.goTo(links[0])
+            self.search_history.append(links[0])
+            page_text = self.browser.getText()[:2048]
+            navigable_links = self.browser.getNavigable()[:15]
+            prompt = self.make_navigation_prompt(user_prompt, page_text, navigable_links)
+
+        self.browser.close()
+        return answer, reasoning
+
+if __name__ == "__main__":
+    # Manual smoke test: running this module directly only constructs a Browser.
+    browser = Browser()
--- a/sources/browser.py
+++ b/sources/browser.py
@ -0,0 +1,170 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, WebDriverException
+import time
+from bs4 import BeautifulSoup
+import markdownify
+import logging
+import sys
+
+class Browser:
+    def __init__(self, headless=False, anticaptcha_install=False):
+        """Initialize the browser with optional headless mode."""
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Referer': 'https://www.google.com/',
+        }
+        self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
+        try:
+            chrome_options = Options()
+            if headless:
+                chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            self.driver = webdriver.Chrome(options=chrome_options)
+            self.wait = WebDriverWait(self.driver, 10)
+            self.logger = logging.getLogger(__name__)
+            self.logger.info("Browser initialized successfully")
+        except Exception as e:
+            raise Exception(f"Failed to initialize browser: {str(e)}")
+
+    def goTo(self, url):
+        """Navigate to a specified URL."""
+        try:
+            self.driver.get(url)
+            time.sleep(2)  # Wait for page to load
+            self.logger.info(f"Navigated to: {url}")
+            return True
+        except WebDriverException as e:
+            self.logger.error(f"Error navigating to {url}: {str(e)}")
+            return False
+    
+    def is_sentence(self, text):
+        """Check if the text is a sentence."""
+        if "404" in text:
+            return True # we want the ai to see the error
+        return len(text.split(" ")) > 5 and '.' in text
+
+    def getText(self):
+        """Get page text and convert it to README (Markdown) format."""
+        try:
+            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+            
+            for element in soup(['script', 'style']):
+                element.decompose()
+            
+            text = soup.get_text()
+            
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk))
+            
+            markdown_text = markdownify.markdownify(text, heading_style="ATX")
+            
+            return markdown_text
+        except Exception as e:
+            self.logger.error(f"Error getting text: {str(e)}")
+            return None
+
+    def getNavigable(self):
+        """Get all navigable links on the current page."""
+        try:
+            links = []
+            elements = self.driver.find_elements(By.TAG_NAME, "a")
+            
+            for element in elements:
+                href = element.get_attribute("href")
+                if href and href.startswith(("http", "https")):
+                    links.append({
+                        "url": href,
+                        "text": element.text.strip(),
+                        "is_displayed": element.is_displayed()
+                    })
+            
+            self.logger.info(f"Found {len(links)} navigable links")
+            return links
+        except Exception as e:
+            self.logger.error(f"Error getting navigable links: {str(e)}")
+            return []
+
+    def clickElement(self, xpath):
+        """Click an element specified by xpath."""
+        try:
+            element = self.wait.until(
+                EC.element_to_be_clickable((By.XPATH, xpath))
+            )
+            element.click()
+            time.sleep(2)  # Wait for action to complete
+            return True
+        except TimeoutException:
+            self.logger.error(f"Element not found or not clickable: {xpath}")
+            return False
+
+    def getCurrentUrl(self):
+        """Get the current URL of the page."""
+        return self.driver.current_url
+
+    def getPageTitle(self):
+        """Get the title of the current page."""
+        return self.driver.title
+
+    def scrollToBottom(self):
+        """Scroll to the bottom of the page."""
+        try:
+            self.driver.execute_script(
+                "window.scrollTo(0, document.body.scrollHeight);"
+            )
+            time.sleep(1)  # Wait for scroll to complete
+            return True
+        except Exception as e:
+            self.logger.error(f"Error scrolling: {str(e)}")
+            return False
+
+    def takeScreenshot(self, filename):
+        """Take a screenshot of the current page."""
+        try:
+            self.driver.save_screenshot(filename)
+            self.logger.info(f"Screenshot saved as {filename}")
+            return True
+        except Exception as e:
+            self.logger.error(f"Error taking screenshot: {str(e)}")
+            return False
+
+    def close(self):
+        """Close the browser."""
+        try:
+            self.driver.quit()
+            self.logger.info("Browser closed")
+        except Exception as e:
+            raise e
+
+    def __del__(self):
+        """Destructor to ensure browser is closed."""
+        self.close()
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    
+    browser = Browser(headless=False)
+    
+    try:
+        browser.goTo("https://karpathy.github.io/")
+        text = browser.getText()
+        print("Page Text in Markdown:")
+        print(text)
+        links = browser.getNavigable()
+        print("\nNavigable Links:")
+        for link in links[:50]:
+            print(f"Text: {link['text']}, URL: {link['url']}")
+        
+        browser.takeScreenshot("example.png")
+        
+    finally:
+        browser.close()
--- a/sources/interaction.py
+++ b/sources/interaction.py
@ -100,7 +100,7 @@ class Interaction:
            return
        if self.current_agent != agent:
            self.current_agent = agent
-            # get history from previous agent
+            # get history from previous agent, good ?
            self.current_agent.memory.push('user', self.last_query)
        self.last_answer, _ = agent.process(self.last_query, self.speech)
    
--- a/sources/speech_to_text.py
+++ b/sources/speech_to_text.py
@ -1,5 +1,4 @@
 from colorama import Fore
-import pyaudio
 import queue
 import threading
 import numpy as np
@ -15,7 +14,8 @@ class AudioRecorder:
    """
    AudioRecorder is a class that records audio from the microphone and adds it to the audio queue.
    """
-    def __init__(self, format: int = pyaudio.paInt16, channels: int = 1, rate: int = 4096, chunk: int = 8192, record_seconds: int = 5, verbose: bool = False):
+    def __init__(self, format: int, channels: int = 1, rate: int = 4096, chunk: int = 8192, record_seconds: int = 5, verbose: bool = False):
+        import pyaudio
        self.format = format
        self.channels = channels
        self.rate = rate
@ -115,7 +115,7 @@ class Transcript:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
        result = self.pipe(audio_data)
        return self.remove_hallucinations(result["text"])
-
+    
 class AudioTranscriber:
    """
    AudioTranscriber is a class that transcribes audio from the audio queue and adds it to the transcript.
--- a/sources/tools/tools.py
+++ b/sources/tools/tools.py
@ -11,14 +11,8 @@ For example:
 ```python
 print("Hello world")
 ```
-
 This is then executed by the tool with its own class implementation of execute().
-
 A tool is not just for code tool but also API, internet, etc..
-For example a flight API tool could be used like so:
-```flight_search
-HU787
-```
 """

 import sys
@ -77,11 +71,11 @@ class Tools():
        return dir_path

    @abstractmethod
-    def execute(self, blocks:str, safety:bool) -> str:
+    def execute(self, blocks:[str], safety:bool) -> str:
        """
        Abstract method that must be implemented by child classes to execute the tool's functionality.
        Args:
-            blocks (str): The code or query blocks to execute
+            blocks (List[str]): The codes or queries blocks to execute
            safety (bool): Whenever human intervention is required
        Returns:
            str: The output/result from executing the tool