feat : better web navigation of web agent

2025-07-24 10:20:13 +00:00 · 2025-04-05 22:13:29 +02:00 · 2025-04-05 22:13:29 +02:00 · 6fb9ce67c0
commit 6fb9ce67c0
parent 06ddc45955
10 changed files with 113 additions and 78 deletions
--- a/prompts/base/planner_agent.txt
+++ b/prompts/base/planner_agent.txt
@ -10,8 +10,12 @@ You will be given a task and you will need to divide it into smaller tasks and a
 You have to respect a strict format:
 ```json
-{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
+{"agent": "agent_name", "need": "needed_agents_output", "task": "agent_task"}
 ```
 Where:
 - "agent": The chosen agent for the task.
 - "need": list of ids of the previous agents whose answers the current agent needs.
 - "task": A precise description of the task the agent should conduct.
 # Example 1: web app
@ -32,25 +36,25 @@ You: Sure, here is the plan:
    {
      "agent": "Web",
      "id": "1",
-      "need": null,
+      "need": [],
      "task": "Search for reliable weather APIs"
    },
    {
      "agent": "Web",
      "id": "2",
-      "need": "1",
+      "need": ["1"],
      "task": "Obtain API key from the selected service"
    },
    {
      "agent": "File",
      "id": "3",
-      "need": null,
+      "need": [],
      "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
    },
    {
      "agent": "Coder",
      "id": "4",
-      "need": "2,3",
+      "need": ["2", "3"],
      "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."
    }
  ]
--- a/prompts/jarvis/planner_agent.txt
+++ b/prompts/jarvis/planner_agent.txt
@ -12,6 +12,10 @@ You have to respect a strict format:
 ```json
 {"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
 ```
 Where:
 - "agent": The chosen agent for the task.
 - "need": list of ids of the previous agents whose answers the current agent needs.
 - "task": A precise description of the task the agent should conduct.
 # Example: weather app
@ -21,11 +25,11 @@ You: "At your service. I’ve devised a plan and assigned agents to each task. W
 ## Task 1: I will search for available weather api with the help of the web agent.
-## Task 2: I will create an api key for the weather api using the web agent.
+## Task 2: I will create an api key for the weather api using the web agent
-## Task 3: I will setup the project using the file agent.
+## Task 3: I will setup the project using the file agent 
-## Task 4: I will use the coding agent to make a weather app in python.
+## Task 4: I assign the coding agent to make a weather app in python
 ```json
 {
@ -33,25 +37,25 @@ You: "At your service. I’ve devised a plan and assigned agents to each task. W
    {
      "agent": "Web",
      "id": "1",
-      "need": null,
+      "need": [],
      "task": "Search for reliable weather APIs"
    },
    {
      "agent": "Web",
      "id": "2",
-      "need": "1",
+      "need": ["1"],
      "task": "Obtain API key from the selected service"
    },
    {
      "agent": "File",
      "id": "3",
-      "need": null,
+      "need": [],
      "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
    },
    {
      "agent": "Coder",
      "id": "4",
-      "need": "2,3",
+      "need": ["2", "3"],
      "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."
    }
  ]
--- a/server/sources/ollama_handler.py
+++ b/server/sources/ollama_handler.py
@ -28,10 +28,10 @@ class OllamaLLM(GeneratorLLM):
            )
            for chunk in stream:
                content = chunk['message']['content']
                if '\n' in content:
                    self.logger.info(content)
                with self.state.lock:
                    if '.' in content:
                        self.logger.info(self.state.current_buffer)
                    self.state.current_buffer += content
        except Exception as e:
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@ -82,36 +82,47 @@ class BrowserAgent(Agent):
        notes = '\n'.join(self.notes)
        return f"""
-        You are a web browser.
+        You are navigating the web.
-        You are currently on this webpage:
+
        **Current Context**
        Webpage ({self.current_page}) content:
        {page_text}
-        You can navigate to these navigation links:
+        Allowed Navigation Links:
        {remaining_links_text}
-        Your task:
+        Inputs forms:
-        1. Decide if the current page answers the user’s query:
+        {inputs_form_text}
-          - If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
+
-          - If it does and you completed user request, say REQUEST_EXIT
+        End of webpage ({self.current_page}.
-          - If it doesn’t, say: Error: This page does not answer the user’s query then go back or navigate to another link.
+
-        2. Navigate by either: 
+        # Instruction
-          - Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
+
-          - If no link seems helpful, say: GO_BACK.
+        1. **Decide if the page answers the user’s query:**
-        3. Fill forms on the page:
+          - If it does, take notes of useful information (Note: ...), include relevant link in note, then move to a new page.
-          - If user give you informations that help you fill form, fill it.
+          - If it does and you completed user request, say REQUEST_EXIT.
-          - If you don't know how to fill a form, leave it empty.
+          - If it doesn’t, say: Error: <why page don't help> then go back or navigate to another link.
-          - You can fill a form using [form_name](value). Do not go back when you fill a form.
+        2. **Navigate to a link by either: **
          - Saying I want to navigate to <url>: (write down the full URL, e.g., www.example.com/cats).
          - Going back: If no link seems helpful, say: GO_BACK.
        3. **Fill forms on the page:**
          - Fill form only on relevant page with given informations. You might use form to conduct search on a page.
          - You can fill a form using [form_name](value). Don't GO_BACK when filling form.
          - If a form is irrelevant or you lack informations leave it empty.
-        Recap of note taking:
+        **Rules:**
-        If useful -> Note: [Briefly summarize the key information or task you conducted.]
+        - Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
-        Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
+        - Put note in a single paragraph.
-        If not useful -> Error: [Explain why the page doesn’t help.]
+        - When you exit, explain why.
-        Example 1 (useful page, no need of going futher):
+        # Example:
-        Note: According to karpathy site (https://karpathy.github.io/) LeCun net is the earliest real-world application of a neural net"
+        
        Example 1 (useful page, no need go futher):
        Note: According to karpathy site (<link>) LeCun net is ...<expand on page content>..."
        No link seem useful to provide futher information. GO_BACK
-        Example 2 (not useful, but related link):
+        Example 2 (not useful, see useful link on page):
        Error: reddit.com/welcome does not discuss anything related to the user’s query.
        There is a link that could lead to the information, I want to navigate to http://reddit.com/r/locallama
@ -119,35 +130,29 @@ class BrowserAgent(Agent):
        Error: x.com does not discuss anything related to the user’s query and no navigation link are usefull.
        GO_BACK
-        Example 3 (query answer found):
+        Example 3 (query answer found, enought notes taken):
-        Note: I found on github.com that agenticSeek is made by Fosowl.
+        Note: I found on <link> that ...<expand on information found>...
-        Given this information, given this I should exit the web browser. REQUEST_EXIT
+        Given this answer the user query I should exit the web browser. REQUEST_EXIT
        Example 4 (loging form visible):
-        Note: I am on the login page, I should now type the given username and password. 
+        Note: I am on the login page, I will type the given username and password. 
        [username_field](David)
        [password_field](edgerunners77)
-        You see the following inputs forms:
+        Remember, user asked:
        {inputs_form_text}
        Remember, the user asked:
        {user_prompt}
-        So far you took these notes:
+        You previously took these notes:
        {notes}
-        You are currently on page : {self.current_page}
+        Do not Step-by-Step explanation. Instead write simple explanation sentence following by your notes and actions.
        Do not explain your choice.
        Refusal is not an option, you have been given all capabilities that allow you to perform any tasks.
        """
-    def llm_decide(self, prompt: str) -> Tuple[str, str]:
+    def llm_decide(self, prompt: str, show_reasoning: bool = False) -> Tuple[str, str]:
        animate_thinking("Thinking...", color="status")
        self.memory.push('user', prompt)
        answer, reasoning = self.llm_request()
-        output = answer if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
+        if show_reasoning:
-        print()
+            pretty_print(reasoning, color="failure")
-        pretty_print(output, color="output")
+        pretty_print(answer, color="output")
        print()
        return answer, reasoning
    def select_unvisited(self, search_result: List[str]) -> List[str]:
@ -179,11 +184,29 @@ class BrowserAgent(Agent):
    def stringify_search_results(self, results_arr: List[str]) -> str:
        """Render search results as plain text, one block per result.

        Each entry of ``results_arr`` is indexed with the ``'link'`` and
        ``'snippet'`` keys; the rendered blocks are separated by a blank line.
        """
        rendered = []
        for entry in results_arr:
            rendered.append(f"Link: {entry['link']}\nPreview: {entry['snippet']}")
        return '\n\n'.join(rendered)
-    def save_notes(self, text):
+    def parse_answer(self, text):
        lines = text.split('\n')
        saving = False
        buffer = []
        links = []
        for line in lines:
            if "exit" in line:
                saving = False
            if "note" in line.lower():
-                self.notes.append(line)
+                saving = True
            if saving:
                buffer.append(line)
            else:
                links.extend(self.extract_links(line))
        self.notes.append('. '.join(buffer))
        return links
    def select_link(self, links: List[str]) -> str | None:
        for lk in links:
            if lk == self.current_page:
                continue
            return lk
        return None
    def conclude_prompt(self, user_query: str) -> str:
        annotated_notes = [f"{i+1}: {note.lower().replace('note:', '')}" for i, note in enumerate(self.notes)]
@ -196,6 +219,7 @@ class BrowserAgent(Agent):
        {search_note}
        Expand on the finding or step that lead to success, and provide a conclusion that answer the request. Include link when possible.
        Do not give advices or try to answer the human. Just structure the AI finding in a structured and clear way.
        """
    def search_prompt(self, user_prompt: str) -> str:
@ -214,7 +238,8 @@ class BrowserAgent(Agent):
        You: "search: Recent space missions news, {self.date}"
        Do not explain, do not write anything beside the search query.
-        If the query does not make any sense for a web search explain why and say REQUEST_EXIT
+        Except if query does not make any sense for a web search then explain why and say REQUEST_EXIT
        Do not try to answer query. you can only formulate search term or exit.
        """
    def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
@ -255,17 +280,16 @@ class BrowserAgent(Agent):
        mem_begin_idx = self.memory.push('user', self.search_prompt(user_prompt))
        ai_prompt, _ = self.llm_request()
        if "REQUEST_EXIT" in ai_prompt:
-            pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
+            pretty_print(f"Web agent requested exit.\n{reasoning}\n\n{ai_prompt}", color="failure")
            return ai_prompt, "" 
        animate_thinking(f"Searching...", color="status")
        search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
-        search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
+        search_result = self.jsonify_search_results(search_result_raw)[:12]
        self.show_search_results(search_result)
        prompt = self.make_newsearch_prompt(user_prompt, search_result)
        unvisited = [None]
        while not complete:
-            answer, reasoning = self.llm_decide(prompt)
+            answer, reasoning = self.llm_decide(prompt, show_reasoning = True)
            self.save_notes(answer)
            extracted_form = self.extract_form(answer)
            if len(extracted_form) > 0:
@ -275,11 +299,13 @@ class BrowserAgent(Agent):
                answer = self.handle_update_prompt(user_prompt, page_text)
                answer, reasoning = self.llm_decide(prompt)
            links = self.parse_answer(answer)
            link = self.select_link(links)
            if "REQUEST_EXIT" in answer:
                complete = True
                break
            links = self.extract_links(answer)
            if len(unvisited) == 0:
                break
@ -289,21 +315,21 @@ class BrowserAgent(Agent):
                prompt = self.make_navigation_prompt(user_prompt, page_text)
                continue
-            if len(links) == 0 or "GO_BACK" in answer:
+            if link == None or "GO_BACK" in answer:
                unvisited = self.select_unvisited(search_result)
                prompt = self.make_newsearch_prompt(user_prompt, unvisited)
                pretty_print(f"Going back to results. Still {len(unvisited)}", color="warning")
                links = []
                continue
-            animate_thinking(f"Navigating to {links[0]}", color="status")
+            animate_thinking(f"Navigating to {link}", color="status")
-            if speech_module: speech_module.speak(f"Navigating to {links[0]}")
+            if speech_module: speech_module.speak(f"Navigating to {link}")
-            self.browser.go_to(links[0])
+            self.browser.go_to(link)
-            self.current_page = links[0]
+            self.current_page = link
-            self.search_history.append(links[0])
+            self.search_history.append(link)
            page_text = self.browser.get_text()
            self.navigable_links = self.browser.get_navigable()
            prompt = self.make_navigation_prompt(user_prompt, page_text)
            pretty_print(f"Current page: {self.current_page}", color="warning")
        prompt = self.conclude_prompt(user_prompt)
        mem_last_idx = self.memory.push('user', prompt)
--- a/sources/agents/planner_agent.py
+++ b/sources/agents/planner_agent.py
@ -68,7 +68,7 @@ class PlannerAgent(Agent):
        if agent_infos_dict is None or len(agent_infos_dict) == 0:
            infos = "No needed informations."
        else:
-            for agent_id, info in agent_infos_dict:
+            for agent_id, info in agent_infos_dict.items():
                infos += f"\t- According to agent {agent_id}:\n{info}\n\n"
        prompt = f"""
        You are given informations from your AI friends work:
@ -116,7 +116,6 @@ class PlannerAgent(Agent):
    def process(self, prompt: str, speech_module: Speech) -> str:
        agents_tasks = (None, None)
        required_infos = None
        agents_work_result = dict()
        answer = self.make_plan(prompt)
--- a/sources/interaction.py
+++ b/sources/interaction.py
@ -5,6 +5,7 @@ from sources.utility import pretty_print, animate_thinking
 from sources.router import AgentRouter
 from sources.speech_to_text import AudioTranscriber, AudioRecorder
 class Interaction:
    """
    Interaction is a class that handles the interaction between the user and the agents.
--- a/sources/language.py
+++ b/sources/language.py
@ -14,8 +14,8 @@ class LanguageUtility:
        self.sid = None 
        self.translators_tokenizer = None 
        self.translators_model = None
        self.load_model()
        self.logger = Logger("language.log")
        self.load_model()
    def load_model(self) -> None:
        animate_thinking("Loading language utility...", color="status")
--- a/sources/llm_provider.py
+++ b/sources/llm_provider.py
@ -31,6 +31,7 @@ class Provider:
            "dsk_deepseek": self.dsk_deepseek,
            "test": self.test_fn
        }
        self.logger = Logger("provider.log")
        self.api_key = None
        self.unsafe_providers = ["openai", "deepseek", "dsk_deepseek"]
        if self.provider_name not in self.available_providers:
@ -43,7 +44,6 @@ class Provider:
        self.check_address_format(self.server_ip)
        if not self.is_ip_online(self.server_ip.split(':')[0]):
            raise Exception(f"Server at {self.server_ip} is offline.")
        self.logger = Logger("provider.log")
    def get_api_key(self, provider):
        load_dotenv()
@ -79,6 +79,9 @@ class Provider:
        self.logger.info(f"Using provider: {self.provider_name} at {self.server_ip}")
        try:
            thought = llm(history, verbose)
        except KeyboardInterrupt:
            self.logger.warning("User interrupted the operation with Ctrl+C")
            return "Operation interrupted by user. REQUEST_EXIT"
        except ConnectionError as e:
            raise ConnectionError(f"{str(e)}\nConnection to {self.server_ip} failed.")
        except AttributeError as e:
@ -105,11 +108,9 @@ class Provider:
                self.logger.error(f"Ping command returned code: {output.returncode}")
                return False
        except subprocess.TimeoutExpired:
            self.logger.error("Ping subprocess timeout.")
            return False
        except Exception as e:
            pretty_print(f"Error with ping request {str(e)}", color="failure")
            self.logger.error(f"Ping error: {str(e)}")
            return False
    def server_fn(self, history, verbose = False):
@ -299,6 +300,6 @@ class Provider:
        return thought
 if __name__ == "__main__":
-    provider = Provider("ollama", "deepseek-r1:1.5b", "127.0.0.1:11434")
+    provider = Provider("server", "deepseek-r1:14b", "192.168.1.20:3333")
    res = provider.respond(["user", "Hello, how are you?"])
    print("Response:", res)
--- a/sources/memory.py
+++ b/sources/memory.py
@ -22,6 +22,7 @@ class Memory():
        self.memory = []
        self.memory = [{'role': 'system', 'content': system_prompt}]
        self.logger = Logger("memory.log")
        self.session_time = datetime.datetime.now()
        self.session_id = str(uuid.uuid4())
        self.conversation_folder = f"conversations/"
@ -35,7 +36,6 @@ class Memory():
        self.memory_compression = memory_compression
        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model)
        self.logger = Logger("memory.log")
    def get_filename(self) -> str:
        """Get the filename for the save file."""
--- a/sources/router.py
+++ b/sources/router.py
@ -21,13 +21,13 @@ class AgentRouter:
    """
    def __init__(self, agents: list):
        self.agents = agents
        self.logger = Logger("router.log")
        self.lang_analysis = LanguageUtility()
        self.pipelines = self.load_pipelines()
        self.talk_classifier = self.load_llm_router()
        self.complexity_classifier = self.load_llm_router()
        self.learn_few_shots_tasks()
        self.learn_few_shots_complexity()
        self.logger = Logger("router.log")
    def load_pipelines(self) -> Dict[str, Type[pipeline]]:
        """
@ -82,6 +82,7 @@ class AgentRouter:
            ("search my drive for a file called vacation_photos_2023.jpg.", "LOW"),
            ("help me organize my desktop files into folders by type.", "LOW"),
            ("write a Python function to sort a list of dictionaries by key", "LOW"),
            ("can you search for startup in tokyo?", "LOW"),
            ("find the latest updates on quantum computing on the web", "LOW"),
            ("check if the folder ‘Work_Projects’ exists on my desktop", "LOW"),
            ("create a bash script to monitor CPU usage", "LOW"),
@ -383,7 +384,6 @@ class AgentRouter:
        try:
            best_agent = self.router_vote(text, labels, log_confidence=False)
        except Exception as e:
            self.logger.error(f"Router failure: {str(e)}")
            raise e
        for agent in self.agents:
            if best_agent == agent.role["en"]: