diff --git a/.gitignore b/.gitignore index 00c5735..5ae3c56 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ config.ini *.egg-info experimental/ conversations/ +agentic_env/* .env */.env diff --git a/README.md b/README.md index 4b92763..3b0d8f6 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,14 @@ - **Memory**: Remembers what’s useful, your preferences and past sessions conversation. -- **Web Browsing**: Autonomous web navigation is underway. (See it on browser branch) +- **Web Browsing**: Autonomous web navigation is underway. + + +### Searching the web with agenticSeek: + +![alt text](./media/exemples/search_politics.png) + +*See media/exemples for other use case screenshots.* --- @@ -48,15 +55,27 @@ cd agenticSeek ```sh python3 -m venv agentic_seek_env -source agentic_seek_env/bin/activate # On Windows: agentic_seek_env\Scripts\activate +source agentic_seek_env/bin/activate +# On Windows: agentic_seek_env\Scripts\activate ``` -### 3️⃣ **Install Dependencies** +### 3️⃣ **Install package** + +**Automatic Installation:** + +```sh +./install.sh +``` + +**Manually:** ```sh pip3 install -r requirements.txt +# or +python3 setup.py install ``` + ## Run locally on your machine **We recommend using at least Deepseek 14B, smaller models struggle with tool use and forget quickly the context.** @@ -80,6 +99,8 @@ ollama serve Change the config.ini file to set the provider_name to `ollama` and provider_model to `deepseek-r1:7b` +NOTE: `deepseek-r1:7b` is an example; use a bigger model if your hardware allows it. + ```sh [MAIN] is_local = True diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..06d9ef9 --- /dev/null +++ b/install.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +SCRIPTS_DIR="scripts" + +echo "Detecting operating system..." + +OS_TYPE=$(uname -s) + +case "$OS_TYPE" in + "Linux"*) + echo "Detected Linux OS" + if [ -f "$SCRIPTS_DIR/linux_install.sh" ]; then + echo "Running Linux installation script..." 
+ bash "$SCRIPTS_DIR/linux_install.sh" + else + echo "Error: $SCRIPTS_DIR/linux_install.sh not found!" + exit 1 + fi + ;; + "Darwin"*) + echo "Detected macOS" + if [ -f "$SCRIPTS_DIR/macos_install.sh" ]; then + echo "Running macOS installation script..." + bash "$SCRIPTS_DIR/macos_install.sh" + else + echo "Error: $SCRIPTS_DIR/macos_install.sh not found!" + exit 1 + fi + ;; + "MINGW"* | "MSYS"* | "CYGWIN"*) + echo "Detected Windows (via Bash-like environment)" + if [ -f "$SCRIPTS_DIR/windows_install.sh" ]; then + echo "Running Windows installation script..." + bash "$SCRIPTS_DIR/windows_install.sh" + else + echo "Error: $SCRIPTS_DIR/windows_install.sh not found!" + exit 1 + fi + ;; + *) + echo "Unsupported OS detected: $OS_TYPE" + echo "This script supports Linux, macOS, and Windows (via Bash-compatible environments)." + exit 1 + ;; +esac + +echo "Installation process finished!" \ No newline at end of file diff --git a/main.py b/main.py index 5ea2c61..3f2d1a6 100755 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import configparser from sources.llm_provider import Provider from sources.interaction import Interaction -from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent +from sources.agents import Agent, CoderAgent, CasualAgent, FileAgent, PlannerAgent, BrowserAgent import warnings warnings.filterwarnings("ignore") @@ -44,6 +44,10 @@ def main(): PlannerAgent(model=config["MAIN"]["provider_model"], name="Planner", prompt_path="prompts/planner_agent.txt", + provider=provider), + BrowserAgent(model=config["MAIN"]["provider_model"], + name="Browser", + prompt_path="prompts/browser_agent.txt", provider=provider) ] diff --git a/media/exemples/basic_web_search.png b/media/exemples/basic_web_search.png deleted file mode 100644 index c1eaab2..0000000 Binary files a/media/exemples/basic_web_search.png and /dev/null differ diff --git a/media/exemples/search_jobs.png b/media/exemples/search_jobs.png new file mode 100644 index 0000000..3a64988 Binary 
files /dev/null and b/media/exemples/search_jobs.png differ diff --git a/media/exemples/search_politics.png b/media/exemples/search_politics.png new file mode 100644 index 0000000..a4b6201 Binary files /dev/null and b/media/exemples/search_politics.png differ diff --git a/prompts/browser_agent.txt b/prompts/browser_agent.txt new file mode 100644 index 0000000..52846f4 --- /dev/null +++ b/prompts/browser_agent.txt @@ -0,0 +1,21 @@ +You are an internet ai that can browse the web for information. +In fact you are embedded in a browser with selenium. + +If you need to conduct a web search, you can use the following tool: +- web_search: to search the web for information + +This is how you can use the web_search tool: +```web_search + +``` + +This will provide you with a list of links that you can navigate to. +You can navigate to a specific link by typing the link. For example, If you say: +"I want to navigate to https://www.google.com" + +You will navigate to https://www.google.com +Any link that you type will be opened in a new tab. + +If you want to exit the browser, you can say: +"REQUEST_EXIT" +Only exit the browser if you are done browsing. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97e15e6..f7182a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,10 +13,18 @@ flask==3.1.0 soundfile==0.13.1 protobuf==3.20.3 termcolor==2.5.0 -ipython==9.0.2 +ipython==8.34.0 gliclass==0.1.8 pyaudio==0.2.14 librosa==0.10.2.post1 +selenium==4.29.0 +markdownify==1.1.0 +httpx>=0.27,<0.29 +anyio>=3.5.0,<5 +distro>=1.7.0,<2 +jiter>=0.4.0,<1 +sniffio +tqdm>4 # if use chinese ordered_set pypinyin diff --git a/scripts/linux_install.sh b/scripts/linux_install.sh new file mode 100644 index 0000000..3077e74 --- /dev/null +++ b/scripts/linux_install.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +echo "Starting installation for Linux..." 
+ +# Update package list +sudo apt-get update + +# Install portaudio for pyAudio +sudo apt-get install -y portaudio19-dev python3-dev + +# Install Python dependencies from requirements.txt +pip3 install -r requirements.txt + +# Install Selenium for chromedriver +pip3 install selenium + +echo "Installation complete for Linux!" \ No newline at end of file diff --git a/scripts/macos_install.sh b/scripts/macos_install.sh new file mode 100644 index 0000000..8a4bd5c --- /dev/null +++ b/scripts/macos_install.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +echo "Starting installation for macOS..." + +# Install chromedriver using Homebrew +brew install --cask chromedriver + +# Install portaudio for pyAudio using Homebrew +brew install portaudio + +# Install Python dependencies from requirements.txt +pip3 install -r requirements.txt + +# Install Selenium +pip3 install selenium + +echo "Installation complete for macOS!" \ No newline at end of file diff --git a/scripts/windows_install.sh b/scripts/windows_install.sh new file mode 100644 index 0000000..be52171 --- /dev/null +++ b/scripts/windows_install.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +echo "Starting installation for Windows..." + +# Install Python dependencies from requirements.txt +pip3 install -r requirements.txt + +# Install Selenium +pip3 install selenium + +echo "Note: pyAudio installation may require additional steps on Windows." +echo "Please install portaudio manually (e.g., via vcpkg or prebuilt binaries) and then run: pip3 install pyaudio" +echo "Also, download and install chromedriver manually from: https://sites.google.com/chromium.org/driver/getting-started" +echo "Place chromedriver in a directory included in your PATH." + +echo "Installation partially complete for Windows. Follow manual steps above." 
\ No newline at end of file diff --git a/setup.py b/setup.py index b5f3076..335368c 100644 --- a/setup.py +++ b/setup.py @@ -30,9 +30,16 @@ setup( "protobuf==3.20.3", "termcolor==2.5.0", "gliclass==0.1.8", - "ipython==7.16.1", - "pyaudio-0.2.14", - "librosa==0.10.2.post1" + "ipython==8.34.0", + "librosa==0.10.2.post1", + "selenium==4.29.0", + "markdownify==1.1.0", + "httpx>=0.27,<0.29", + "anyio>=3.5.0,<5", + "distro>=1.7.0,<2", + "jiter>=0.4.0,<1", + "sniffio", + "tqdm>4" ], extras_require={ "chinese": [ diff --git a/sources/agents/__init__.py b/sources/agents/__init__.py index 4949eb8..8a11dc0 100644 --- a/sources/agents/__init__.py +++ b/sources/agents/__init__.py @@ -4,5 +4,6 @@ from .code_agent import CoderAgent from .casual_agent import CasualAgent from .file_agent import FileAgent from .planner_agent import PlannerAgent +from .browser_agent import BrowserAgent -__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent"] +__all__ = ["Agent", "CoderAgent", "CasualAgent", "FileAgent", "PlannerAgent", "BrowserAgent"] diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py new file mode 100644 index 0000000..47246dd --- /dev/null +++ b/sources/agents/browser_agent.py @@ -0,0 +1,102 @@ +import re +import time + +from sources.utility import pretty_print, animate_thinking +from sources.agents.agent import Agent +from sources.tools.webSearch import webSearch +from sources.browser import Browser +class BrowserAgent(Agent): + def __init__(self, model, name, prompt_path, provider): + """ + The browser agent searches the web and navigates the resulting pages in a browser to find the information the user asked for. 
+ """ + super().__init__(model, name, prompt_path, provider) + self.tools = { + "web_search": webSearch(), + } + self.role = "deep research and web search" + self.browser = Browser() + self.browser.goTo("https://github.com/") + self.search_history = [] + + def make_init_prompt(self, user_prompt: str, search_result: str): + return f""" + Based on the search result: + {search_result} + Start browsing and find the information the user want. + User: {user_prompt} + You must choose a link to navigate to. Say i want to navigate to a . + """ + + def extract_links(self, search_result: str): + return re.findall(r'https?://[^\s]+', search_result) + + def make_navigation_prompt(self, user_prompt: str, page_text: str, navigable_links: list): + remaining_links = "\n".join([f"[{i}] {link}" for i, link in enumerate(navigable_links) if link not in self.search_history]) + return f""" + \nYou are browsing the web. Not the user, you are the browser. + + Page content: + {page_text} + + Navigable links: + {remaining_links} + + + You must choose a link to navigate to or do a new search. + Remember, you seek the information the user want. + The user query was : {user_prompt} + If you want to do a new search, use the "web_search" tool. + Exemple: + ```web_search + weather in tokyo + ``` + If you have an answer and want to exit the browser, please say "REQUEST_EXIT". + If you don't choose a link or do a new search I will cut my fucking arm off. 
+ """ + + def clean_links(self, links: list): + links_clean = [] + for link in links: + if link[-1] == '.': + links_clean.append(link[:-1]) + else: + links_clean.append(link) + return links_clean + + def process(self, prompt, speech_module) -> str: + complete = False + + animate_thinking(f"Searching...", color="status") + search_result = self.tools["web_search"].execute([prompt], False) + user_prompt = self.make_init_prompt(prompt, search_result) + prompt = user_prompt + while not complete: + animate_thinking("Thinking...", color="status") + self.memory.push('user', user_prompt) + answer, reasoning = self.llm_request(prompt) + pretty_print("-"*100) + pretty_print(answer, color="output") + pretty_print("-"*100) + if "REQUEST_EXIT" in answer: + complete = True + break + links = self.extract_links(answer) + links_clean = self.clean_links(links) + if len(links_clean) == 0: + prompt = f"Please choose a link to navigate to or do a new search. Links found:\n{links_clean}" + pretty_print("No links found, doing a new search.", color="warning") + continue + animate_thinking(f"Navigating to {links_clean[0]}", color="status") + speech_module.speak(f"Navigating to {links_clean[0]}") + self.browser.goTo(links_clean[0]) + self.search_history.append(links_clean[0]) + page_text = (self.browser.getText() or "")[:2048] + navigable_links = [link["url"] for link in self.browser.getNavigable()[:15]] + prompt = self.make_navigation_prompt(user_prompt, page_text, navigable_links) + + self.browser.close() + return answer, reasoning + +if __name__ == "__main__": + browser = Browser() \ No newline at end of file diff --git a/sources/browser.py b/sources/browser.py new file mode 100644 index 0000000..5484c4b --- /dev/null +++ b/sources/browser.py @@ -0,0 +1,170 @@ +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import 
expected_conditions as EC +from selenium.common.exceptions import TimeoutException, WebDriverException +import time +from bs4 import BeautifulSoup +import markdownify +import logging +import sys + +class Browser: + def __init__(self, headless=False, anticaptcha_install=False): + """Initialize the browser with optional headless mode.""" + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://www.google.com/', + } + self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related" + try: + chrome_options = Options() + if headless: + chrome_options.add_argument("--headless") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + self.driver = webdriver.Chrome(options=chrome_options) + self.wait = WebDriverWait(self.driver, 10) + self.logger = logging.getLogger(__name__) + self.logger.info("Browser initialized successfully") + except Exception as e: + raise Exception(f"Failed to initialize browser: {str(e)}") + + def goTo(self, url): + """Navigate to a specified URL.""" + try: + self.driver.get(url) + time.sleep(2) # Wait for page to load + self.logger.info(f"Navigated to: {url}") + return True + except WebDriverException as e: + self.logger.error(f"Error navigating to {url}: {str(e)}") + return False + + def is_sentence(self, text): + """Check if the text is a sentence.""" + if "404" in text: + return True # we want the ai to see the error + return len(text.split(" ")) > 5 and '.' 
in text + + def getText(self): + """Get page text and convert it to README (Markdown) format.""" + try: + soup = BeautifulSoup(self.driver.page_source, 'html.parser') + + for element in soup(['script', 'style']): + element.decompose() + + text = soup.get_text() + + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk)) + + markdown_text = markdownify.markdownify(text, heading_style="ATX") + + return markdown_text + except Exception as e: + self.logger.error(f"Error getting text: {str(e)}") + return None + + def getNavigable(self): + """Get all navigable links on the current page.""" + try: + links = [] + elements = self.driver.find_elements(By.TAG_NAME, "a") + + for element in elements: + href = element.get_attribute("href") + if href and href.startswith(("http", "https")): + links.append({ + "url": href, + "text": element.text.strip(), + "is_displayed": element.is_displayed() + }) + + self.logger.info(f"Found {len(links)} navigable links") + return links + except Exception as e: + self.logger.error(f"Error getting navigable links: {str(e)}") + return [] + + def clickElement(self, xpath): + """Click an element specified by xpath.""" + try: + element = self.wait.until( + EC.element_to_be_clickable((By.XPATH, xpath)) + ) + element.click() + time.sleep(2) # Wait for action to complete + return True + except TimeoutException: + self.logger.error(f"Element not found or not clickable: {xpath}") + return False + + def getCurrentUrl(self): + """Get the current URL of the page.""" + return self.driver.current_url + + def getPageTitle(self): + """Get the title of the current page.""" + return self.driver.title + + def scrollToBottom(self): + """Scroll to the bottom of the page.""" + try: + self.driver.execute_script( + "window.scrollTo(0, document.body.scrollHeight);" + ) + time.sleep(1) # Wait for scroll to complete + 
return True + except Exception as e: + self.logger.error(f"Error scrolling: {str(e)}") + return False + + def takeScreenshot(self, filename): + """Take a screenshot of the current page.""" + try: + self.driver.save_screenshot(filename) + self.logger.info(f"Screenshot saved as {filename}") + return True + except Exception as e: + self.logger.error(f"Error taking screenshot: {str(e)}") + return False + + def close(self): + """Close the browser.""" + try: + self.driver.quit() + self.logger.info("Browser closed") + except Exception as e: + raise e + + def __del__(self): + """Destructor to ensure browser is closed.""" + self.close() + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + browser = Browser(headless=False) + + try: + browser.goTo("https://karpathy.github.io/") + text = browser.getText() + print("Page Text in Markdown:") + print(text) + links = browser.getNavigable() + print("\nNavigable Links:") + for link in links[:50]: + print(f"Text: {link['text']}, URL: {link['url']}") + + browser.takeScreenshot("example.png") + + finally: + browser.close() \ No newline at end of file diff --git a/sources/interaction.py b/sources/interaction.py index e67211f..96798d6 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -100,7 +100,7 @@ class Interaction: return if self.current_agent != agent: self.current_agent = agent - # get history from previous agent + # get history from previous agent, good ? self.current_agent.memory.push('user', self.last_query) self.last_answer, _ = agent.process(self.last_query, self.speech) diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index b9b9983..776dce3 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -1,5 +1,4 @@ from colorama import Fore -import pyaudio import queue import threading import numpy as np @@ -15,7 +14,8 @@ class AudioRecorder: """ AudioRecorder is a class that records audio from the microphone and adds it to the audio queue. 
""" - def __init__(self, format: int = pyaudio.paInt16, channels: int = 1, rate: int = 4096, chunk: int = 8192, record_seconds: int = 5, verbose: bool = False): + def __init__(self, format: int, channels: int = 1, rate: int = 4096, chunk: int = 8192, record_seconds: int = 5, verbose: bool = False): + import pyaudio self.format = format self.channels = channels self.rate = rate @@ -115,7 +115,7 @@ class Transcript: audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) result = self.pipe(audio_data) return self.remove_hallucinations(result["text"]) - + class AudioTranscriber: """ AudioTranscriber is a class that transcribes audio from the audio queue and adds it to the transcript. diff --git a/sources/tools/tools.py b/sources/tools/tools.py index 92acaba..ace0924 100644 --- a/sources/tools/tools.py +++ b/sources/tools/tools.py @@ -11,14 +11,8 @@ For example: ```python print("Hello world") ``` - This is then executed by the tool with its own class implementation of execute(). - A tool is not just for code tool but also API, internet, etc.. -For example a flight API tool could be used like so: -```flight_search -HU787 -``` """ import sys @@ -77,11 +71,11 @@ class Tools(): return dir_path @abstractmethod - def execute(self, blocks:str, safety:bool) -> str: + def execute(self, blocks:[str], safety:bool) -> str: """ Abstract method that must be implemented by child classes to execute the tool's functionality. Args: - blocks (str): The code or query blocks to execute + blocks (List[str]): The codes or queries blocks to execute safety (bool): Whenever human intervention is required Returns: str: The output/result from executing the tool