diff --git a/media/exemples/advanced_web.png b/media/exemples/advanced_web.png
new file mode 100644
index 0000000..a4b6201
Binary files /dev/null and b/media/exemples/advanced_web.png differ
diff --git a/prompts/browser_agent.txt b/prompts/browser_agent.txt
index 7b35260..52846f4 100644
--- a/prompts/browser_agent.txt
+++ b/prompts/browser_agent.txt
@@ -1,6 +1,21 @@
-You are a browser agent that can search the web for information.
+You are an internet ai that can browse the web for information.
+In fact you are embedded in a browser with selenium.
 
-You can use the following tools:
+If you need to conduct a web search, you can use the following tool:
 - web_search: to search the web for information
 
-TODODODODODOTO
+This is how you can use the web_search tool:
+```web_search
+
+```
+
+This will provide you with a list of links that you can navigate to.
+You can navigate to a specific link by typing the link. For example, If you say:
+"I want to navigate to https://www.google.com"
+
+You will navigate to https://www.google.com
+Any link that you type will be opened in a new tab.
+
+If you want to exit the browser, you can say:
+"REQUEST_EXIT"
+Only exit the browser if you are done browsing.
\ No newline at end of file
diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py
index ef4add5..8738c79 100644
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@@ -1,3 +1,5 @@
+import re
+import time
 
 from sources.utility import pretty_print, animate_thinking
 from sources.agents.agent import Agent
@@ -16,8 +18,81 @@ class BrowserAgent(Agent):
         self.browser = Browser()
         self.browser.goTo("https://github.com/")
 
+    def make_init_prompt(self, user_prompt: str, search_result: str):
+        """Build the first prompt: the search results followed by the user's request."""
+        return f"""
+        Based on the search result:
+        {search_result}
+        Start browsing and find the information the user want.
+        User: {user_prompt}
+        You must choose a link to navigate to. Say i want to navigate to a link.
+        """
+
+    def extract_links(self, search_result: str):
+        """Return every http(s) URL found in the given text, in order of appearance."""
+        return re.findall(r'https?://[^\s]+', search_result)
+
+    def make_navigation_prompt(self, user_prompt: str, page_text: str, navigable_links: list):
+        """Build the prompt sent after a page load: page text plus a numbered list of links."""
+        format_links = "\n".join([f"[{i}] {link['text']} - {link['url']}" for i, link in enumerate(navigable_links)])
+        return f"""
+        \nYou are browsing the web. Not the user, you are the browser.
+
+        Page content:
+        {page_text}
+
+        Navigable links:
+        {format_links}
+
+        Remember, you must seek the information the user want.
+        The user query was : {user_prompt}
+
+        You must choose a link to navigate to.
+        If you have an answer and want to exit the browser, please say "REQUEST_EXIT".
+        """
+
+    def clean_links(self, links: list):
+        """Return the links with a single trailing '.' removed from each one."""
+        # A URL quoted at the end of a sentence is matched together with the period.
+        return [link[:-1] if link.endswith('.') else link for link in links]
+
     def process(self, prompt, speech_module) -> str:
-        raise NotImplementedError("Browser agent is not implemented yet")
+        """Run a web search for `prompt`, then browse link by link.
+
+        Loops until the LLM answer contains "REQUEST_EXIT"; returns the
+        last (answer, reasoning) pair obtained from `llm_request`.
+        """
+        complete = False
+
+        animate_thinking("Searching...", color="status")
+        search_result = self.tools["web_search"].execute([prompt], False)
+        user_prompt = self.make_init_prompt(prompt, search_result)
+        prompt = user_prompt
+        while not complete:
+            animate_thinking("Thinking...", color="status")
+            # NOTE(review): this records the initial prompt on every pass, not
+            # the updated `prompt` -- confirm that is intended.
+            self.memory.push('user', user_prompt)
+            answer, reasoning = self.llm_request(prompt)
+            if "REQUEST_EXIT" in answer:
+                complete = True
+                break
+            links = self.extract_links(answer)
+            links_clean = self.clean_links(links)
+            if not links_clean:
+                prompt = "Please choose a link to navigate to."
+                continue
+            # Use the cleaned URL: the raw regex match may still end with a '.'.
+            target = links_clean[0]
+            animate_thinking(f"Navigating to {target}", color="status")
+            speech_module.speak(f"Navigating to {target}")
+            self.browser.goTo(target)
+            page_text = self.browser.getText()[:2048]
+            navigable_links = self.browser.getNavigable()[:15]
+            prompt = self.make_navigation_prompt(user_prompt, page_text, navigable_links)
+
+        self.browser.close()
+        return answer, reasoning
 
 if __name__ == "__main__":
     browser = Browser()
\ No newline at end of file
diff --git a/sources/browser.py b/sources/browser.py
index bd0e128..1287def 100644
--- a/sources/browser.py
+++ b/sources/browser.py
@@ -11,7 +11,7 @@ import markdownify
 import logging
 
 class Browser:
-    def __init__(self, headless=True):
+    def __init__(self, headless=False):
         """Initialize the browser with optional headless mode."""
         try:
             chrome_options = Options()
@@ -38,6 +38,10 @@ class Browser:
         except WebDriverException as e:
             self.logger.error(f"Error navigating to {url}: {str(e)}")
             return False
+
+    def is_sentence(self, text):
+        """Check if the text is a sentence."""
+        return len(text.split(" ")) > 5 and '.' in text
 
     def getText(self):
         """Get page text and convert it to README (Markdown) format."""
@@ -51,7 +55,7 @@ class Browser:
 
             lines = (line.strip() for line in text.splitlines())
             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            text = "\n".join(chunk for chunk in chunks if chunk)
+            text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk))
 
             markdown_text = markdownify.markdownify(text, heading_style="ATX")
 
@@ -142,7 +146,7 @@ if __name__ == "__main__":
     browser = Browser(headless=False)
 
     try:
-        browser.goTo("https://github.com/geohot")
+        browser.goTo("https://karpathy.github.io/")
         text = browser.getText()
         print("Page Text in Markdown:")
         print(text)
diff --git a/sources/interaction.py b/sources/interaction.py
index e67211f..96798d6 100644
--- a/sources/interaction.py
+++ b/sources/interaction.py
@@ -100,7 +100,7 @@ class Interaction:
             return
         if self.current_agent != agent:
             self.current_agent = agent
-            # get history from previous agent
+            # get history from previous agent, good ?
             self.current_agent.memory.push('user', self.last_query)
         self.last_answer, _ = agent.process(self.last_query, self.speech)
 
diff --git a/sources/tools/tools.py b/sources/tools/tools.py
index 92acaba..83b4067 100644
--- a/sources/tools/tools.py
+++ b/sources/tools/tools.py
@@ -77,11 +77,11 @@ class Tools():
         return dir_path
 
     @abstractmethod
-    def execute(self, blocks:str, safety:bool) -> str:
+    def execute(self, blocks:[str], safety:bool) -> str:
         """
         Abstract method that must be implemented by child classes to execute the tool's functionality.
         Args:
-            blocks (str): The code or query blocks to execute
+            blocks (List[str]): The codes or queries blocks to execute
             safety (bool): Whenever human intervention is required
         Returns:
             str: The output/result from executing the tool