feat : better web navigation of web agent

2025-07-21 08:50:10 +00:00 · 2025-04-05 22:13:29 +02:00 · 2025-04-05 22:13:29 +02:00 · 6fb9ce67c0
commit 6fb9ce67c0
parent 06ddc45955
10 changed files with 113 additions and 78 deletions
--- a/prompts/base/planner_agent.txt
+++ b/prompts/base/planner_agent.txt
@ -10,8 +10,12 @@ You will be given a task and you will need to divide it into smaller tasks and a

 You have to respect a strict format:
 ```json
-{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
+{"agent": "agent_name", "need": "needed_agents_output", "task": "agent_task"}
 ```
+Where:
+- "agent": The agent chosen for the task.
+- "need": List of ids of the previous agents whose answers the current agent needs (empty list if none).
+- "task": A precise description of the task the agent should conduct.

 # Example 1: web app

@ -32,25 +36,25 @@ You: Sure, here is the plan:
    {
      "agent": "Web",
      "id": "1",
-      "need": null,
+      "need": [],
      "task": "Search for reliable weather APIs"
    },
    {
      "agent": "Web",
      "id": "2",
-      "need": "1",
+      "need": ["1"],
      "task": "Obtain API key from the selected service"
    },
    {
      "agent": "File",
      "id": "3",
-      "need": null,
+      "need": [],
      "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
    },
    {
      "agent": "Coder",
      "id": "3",
-      "need": "2,3",
+      "need": ["2", "3"],
      "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute.""
    }
  ]
--- a/prompts/jarvis/planner_agent.txt
+++ b/prompts/jarvis/planner_agent.txt
@ -12,6 +12,10 @@ You have to respect a strict format:
 ```json
 {"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
 ```
+Where:
+- "agent": The agent chosen for the task.
+- "need": List of ids of the previous agents whose answers the current agent needs (empty list if none).
+- "task": A precise description of the task the agent should conduct.

 # Example: weather app

@ -21,11 +25,11 @@ You: "At your service. I’ve devised a plan and assigned agents to each task. W

 ## Task 1: I will search for available weather api with the help of the web agent.

-## Task 2: I will create an api key for the weather api using the web agent.
+## Task 2: I will create an api key for the weather api using the web agent

-## Task 3: I will setup the project using the file agent.
+## Task 3: I will setup the project using the file agent 

-## Task 4: I will use the coding agent to make a weather app in python.
+## Task 4: I assign the coding agent to make a weather app in python

 ```json
 {
@ -33,25 +37,25 @@ You: "At your service. I’ve devised a plan and assigned agents to each task. W
    {
      "agent": "Web",
      "id": "1",
-      "need": null,
+      "need": [],
      "task": "Search for reliable weather APIs"
    },
    {
      "agent": "Web",
      "id": "2",
-      "need": "1",
+      "need": ["1"],
      "task": "Obtain API key from the selected service"
    },
    {
      "agent": "File",
      "id": "3",
-      "need": null,
+      "need": [],
      "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
    },
    {
      "agent": "Coder",
      "id": "3",
-      "need": "2,3",
+      "need": ["2", "3"],
      "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute.""
    }
  ]
--- a/server/sources/ollama_handler.py
+++ b/server/sources/ollama_handler.py
@ -28,10 +28,10 @@ class OllamaLLM(GeneratorLLM):
            )
            for chunk in stream:
                content = chunk['message']['content']
-                if '\n' in content:
-                    self.logger.info(content)

                with self.state.lock:
+                    if '.' in content:
+                        self.logger.info(self.state.current_buffer)
                    self.state.current_buffer += content

        except Exception as e:
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@ -82,36 +82,47 @@ class BrowserAgent(Agent):
        notes = '\n'.join(self.notes)

        return f"""
-        You are a web browser.
-        You are currently on this webpage:
+        You are navigating the web.
+
+        **Current Context**
+
+        Webpage ({self.current_page}) content:
        {page_text}

-        You can navigate to these navigation links:
+        Allowed Navigation Links:
        {remaining_links_text}

-        Your task:
-        1. Decide if the current page answers the user’s query:
-          - If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
-          - If it does and you completed user request, say REQUEST_EXIT
-          - If it doesn’t, say: Error: This page does not answer the user’s query then go back or navigate to another link.
-        2. Navigate by either: 
-          - Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
-          - If no link seems helpful, say: GO_BACK.
-        3. Fill forms on the page:
-          - If user give you informations that help you fill form, fill it.
-          - If you don't know how to fill a form, leave it empty.
-          - You can fill a form using [form_name](value). Do not go back when you fill a form.
+        Inputs forms:
+        {inputs_form_text}
+
+        End of webpage ({self.current_page}.
+
+        # Instruction
+
+        1. **Decide if the page answers the user’s query:**
+          - If it does, take notes of useful information (Note: ...), include relevant link in note, then move to a new page.
+          - If it does and you completed user request, say REQUEST_EXIT.
+          - If it doesn’t, say: Error: <why page don't help> then go back or navigate to another link.
+        2. **Navigate to a link by either: **
+          - Saying I want to navigate to <url>: (write down the full URL, e.g., www.example.com/cats).
+          - Going back: If no link seems helpful, say: GO_BACK.
+        3. **Fill forms on the page:**
+          - Fill form only on relevant page with given informations. You might use form to conduct search on a page.
+          - You can fill a form using [form_name](value). Don't GO_BACK when filling form.
+          - If a form is irrelevant or you lack informations leave it empty.
        
-        Recap of note taking:
-        If useful -> Note: [Briefly summarize the key information or task you conducted.]
-        Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
-        If not useful -> Error: [Explain why the page doesn’t help.]
+        **Rules:**
+        - Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
+        - Put note in a single paragraph.
+        - When you exit, explain why.
        
-        Example 1 (useful page, no need of going futher):
-        Note: According to karpathy site (https://karpathy.github.io/) LeCun net is the earliest real-world application of a neural net"
+        # Example:
+        
+        Example 1 (useful page, no need go futher):
+        Note: According to karpathy site (<link>) LeCun net is ...<expand on page content>..."
        No link seem useful to provide futher information. GO_BACK

-        Example 2 (not useful, but related link):
+        Example 2 (not useful, see useful link on page):
        Error: reddit.com/welcome does not discuss anything related to the user’s query.
        There is a link that could lead to the information, I want to navigate to http://reddit.com/r/locallama

@ -119,35 +130,29 @@ class BrowserAgent(Agent):
        Error: x.com does not discuss anything related to the user’s query and no navigation link are usefull.
        GO_BACK

-        Example 3 (query answer found):
-        Note: I found on github.com that agenticSeek is made by Fosowl.
-        Given this information, given this I should exit the web browser. REQUEST_EXIT
+        Example 3 (query answer found, enought notes taken):
+        Note: I found on <link> that ...<expand on information found>...
+        Given this answer the user query I should exit the web browser. REQUEST_EXIT

        Example 4 (loging form visible):
-        Note: I am on the login page, I should now type the given username and password. 
+        Note: I am on the login page, I will type the given username and password. 
        [username_field](David)
        [password_field](edgerunners77)

-        You see the following inputs forms:
-        {inputs_form_text}
-
-        Remember, the user asked:
+        Remember, user asked:
        {user_prompt}
-        So far you took these notes:
+        You previously took these notes:
        {notes}
-        You are currently on page : {self.current_page}
-        Do not explain your choice.
-        Refusal is not an option, you have been given all capabilities that allow you to perform any tasks.
+        Do not Step-by-Step explanation. Instead write simple explanation sentence following by your notes and actions.
        """
    
-    def llm_decide(self, prompt: str) -> Tuple[str, str]:
+    def llm_decide(self, prompt: str, show_reasoning: bool = False) -> Tuple[str, str]:
        animate_thinking("Thinking...", color="status")
        self.memory.push('user', prompt)
        answer, reasoning = self.llm_request()
-        output = answer if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
-        print()
-        pretty_print(output, color="output")
-        print()
+        if show_reasoning:
+            pretty_print(reasoning, color="failure")
+        pretty_print(answer, color="output")
        return answer, reasoning
    
    def select_unvisited(self, search_result: List[str]) -> List[str]:
@ -179,11 +184,29 @@ class BrowserAgent(Agent):
    def stringify_search_results(self, results_arr: List[str]) -> str:
        return '\n\n'.join([f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr])
    
-    def save_notes(self, text):
+    def parse_answer(self, text):
        lines = text.split('\n')
+        saving = False
+        buffer = []
+        links = []
        for line in lines:
+            if "exit" in line:
+                saving = False
            if "note" in line.lower():
-                self.notes.append(line)
+                saving = True
+            if saving:
+                buffer.append(line)
+            else:
+                links.extend(self.extract_links(line))
+        self.notes.append('. '.join(buffer))
+        return links
+    
+    def select_link(self, links: List[str]) -> str | None:
+        for lk in links:
+            if lk == self.current_page:
+                continue
+            return lk
+        return None
    
    def conclude_prompt(self, user_query: str) -> str:
        annotated_notes = [f"{i+1}: {note.lower().replace('note:', '')}" for i, note in enumerate(self.notes)]
@ -196,6 +219,7 @@ class BrowserAgent(Agent):
        {search_note}

        Expand on the finding or step that lead to success, and provide a conclusion that answer the request. Include link when possible.
+        Do not give advices or try to answer the human. Just structure the AI finding in a structured and clear way.
        """
    
    def search_prompt(self, user_prompt: str) -> str:
@ -214,7 +238,8 @@ class BrowserAgent(Agent):
        You: "search: Recent space missions news, {self.date}"

        Do not explain, do not write anything beside the search query.
-        If the query does not make any sense for a web search explain why and say REQUEST_EXIT
+        Except if query does not make any sense for a web search then explain why and say REQUEST_EXIT
+        Do not try to answer query. you can only formulate search term or exit.
        """
    
    def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
@ -255,17 +280,16 @@ class BrowserAgent(Agent):
        mem_begin_idx = self.memory.push('user', self.search_prompt(user_prompt))
        ai_prompt, _ = self.llm_request()
        if "REQUEST_EXIT" in ai_prompt:
-            pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
+            pretty_print(f"Web agent requested exit.\n{reasoning}\n\n{ai_prompt}", color="failure")
            return ai_prompt, "" 
        animate_thinking(f"Searching...", color="status")
        search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
-        search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
+        search_result = self.jsonify_search_results(search_result_raw)[:12]
        self.show_search_results(search_result)
        prompt = self.make_newsearch_prompt(user_prompt, search_result)
        unvisited = [None]
        while not complete:
-            answer, reasoning = self.llm_decide(prompt)
-            self.save_notes(answer)
+            answer, reasoning = self.llm_decide(prompt, show_reasoning = True)

            extracted_form = self.extract_form(answer)
            if len(extracted_form) > 0:
@ -275,11 +299,13 @@ class BrowserAgent(Agent):
                answer = self.handle_update_prompt(user_prompt, page_text)
                answer, reasoning = self.llm_decide(prompt)

+            links = self.parse_answer(answer)
+            link = self.select_link(links)
+
            if "REQUEST_EXIT" in answer:
                complete = True
                break

-            links = self.extract_links(answer)
            if len(unvisited) == 0:
                break

@ -289,21 +315,21 @@ class BrowserAgent(Agent):
                prompt = self.make_navigation_prompt(user_prompt, page_text)
                continue

-            if len(links) == 0 or "GO_BACK" in answer:
+            if link == None or "GO_BACK" in answer:
                unvisited = self.select_unvisited(search_result)
                prompt = self.make_newsearch_prompt(user_prompt, unvisited)
                pretty_print(f"Going back to results. Still {len(unvisited)}", color="warning")
-                links = []
                continue

-            animate_thinking(f"Navigating to {links[0]}", color="status")
-            if speech_module: speech_module.speak(f"Navigating to {links[0]}")
-            self.browser.go_to(links[0])
-            self.current_page = links[0]
-            self.search_history.append(links[0])
+            animate_thinking(f"Navigating to {link}", color="status")
+            if speech_module: speech_module.speak(f"Navigating to {link}")
+            self.browser.go_to(link)
+            self.current_page = link
+            self.search_history.append(link)
            page_text = self.browser.get_text()
            self.navigable_links = self.browser.get_navigable()
            prompt = self.make_navigation_prompt(user_prompt, page_text)
+            pretty_print(f"Current page: {self.current_page}", color="warning")

        prompt = self.conclude_prompt(user_prompt)
        mem_last_idx = self.memory.push('user', prompt)
--- a/sources/agents/planner_agent.py
+++ b/sources/agents/planner_agent.py
@ -68,7 +68,7 @@ class PlannerAgent(Agent):
        if agent_infos_dict is None or len(agent_infos_dict) == 0:
            infos = "No needed informations."
        else:
-            for agent_id, info in agent_infos_dict:
+            for agent_id, info in agent_infos_dict.items():
                infos += f"\t- According to agent {agent_id}:\n{info}\n\n"
        prompt = f"""
        You are given informations from your AI friends work:
@ -116,7 +116,6 @@ class PlannerAgent(Agent):

    def process(self, prompt: str, speech_module: Speech) -> str:
        agents_tasks = (None, None)
-        required_infos = None
        agents_work_result = dict()

        answer = self.make_plan(prompt)
--- a/sources/interaction.py
+++ b/sources/interaction.py
@ -5,6 +5,7 @@ from sources.utility import pretty_print, animate_thinking
 from sources.router import AgentRouter
 from sources.speech_to_text import AudioTranscriber, AudioRecorder

+
 class Interaction:
    """
    Interaction is a class that handles the interaction between the user and the agents.
--- a/sources/language.py
+++ b/sources/language.py
@ -14,8 +14,8 @@ class LanguageUtility:
        self.sid = None 
        self.translators_tokenizer = None 
        self.translators_model = None
-        self.load_model()
        self.logger = Logger("language.log")
+        self.load_model()
    
    def load_model(self) -> None:
        animate_thinking("Loading language utility...", color="status")
--- a/sources/llm_provider.py
+++ b/sources/llm_provider.py
@ -31,6 +31,7 @@ class Provider:
            "dsk_deepseek": self.dsk_deepseek,
            "test": self.test_fn
        }
+        self.logger = Logger("provider.log")
        self.api_key = None
        self.unsafe_providers = ["openai", "deepseek", "dsk_deepseek"]
        if self.provider_name not in self.available_providers:
@ -43,7 +44,6 @@ class Provider:
        self.check_address_format(self.server_ip)
        if not self.is_ip_online(self.server_ip.split(':')[0]):
            raise Exception(f"Server at {self.server_ip} is offline.")
-        self.logger = Logger("provider.log")

    def get_api_key(self, provider):
        load_dotenv()
@ -79,6 +79,9 @@ class Provider:
        self.logger.info(f"Using provider: {self.provider_name} at {self.server_ip}")
        try:
            thought = llm(history, verbose)
+        except KeyboardInterrupt:
+            self.logger.warning("User interrupted the operation with Ctrl+C")
+            return "Operation interrupted by user. REQUEST_EXIT"
        except ConnectionError as e:
            raise ConnectionError(f"{str(e)}\nConnection to {self.server_ip} failed.")
        except AttributeError as e:
@ -105,11 +108,9 @@ class Provider:
                self.logger.error(f"Ping command returned code: {output.returncode}")
                return False
        except subprocess.TimeoutExpired:
-            self.logger.error("Ping subprocess timeout.")
            return False
        except Exception as e:
            pretty_print(f"Error with ping request {str(e)}", color="failure")
-            self.logger.error(f"Ping error: {str(e)}")
            return False

    def server_fn(self, history, verbose = False):
@ -299,6 +300,6 @@ class Provider:
        return thought

 if __name__ == "__main__":
-    provider = Provider("ollama", "deepseek-r1:1.5b", "127.0.0.1:11434")
+    provider = Provider("server", "deepseek-r1:14b", "192.168.1.20:3333")
    res = provider.respond(["user", "Hello, how are you?"])
    print("Response:", res)
--- a/sources/memory.py
+++ b/sources/memory.py
@ -22,6 +22,7 @@ class Memory():
        self.memory = []
        self.memory = [{'role': 'system', 'content': system_prompt}]
        
+        self.logger = Logger("memory.log")
        self.session_time = datetime.datetime.now()
        self.session_id = str(uuid.uuid4())
        self.conversation_folder = f"conversations/"
@ -35,7 +36,6 @@ class Memory():
        self.memory_compression = memory_compression
        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model)
-        self.logger = Logger("memory.log")
    
    def get_filename(self) -> str:
        """Get the filename for the save file."""
--- a/sources/router.py
+++ b/sources/router.py
@ -21,13 +21,13 @@ class AgentRouter:
    """
    def __init__(self, agents: list):
        self.agents = agents
+        self.logger = Logger("router.log")
        self.lang_analysis = LanguageUtility()
        self.pipelines = self.load_pipelines()
        self.talk_classifier = self.load_llm_router()
        self.complexity_classifier = self.load_llm_router()
        self.learn_few_shots_tasks()
        self.learn_few_shots_complexity()
-        self.logger = Logger("router.log")
    
    def load_pipelines(self) -> Dict[str, Type[pipeline]]:
        """
@ -82,6 +82,7 @@ class AgentRouter:
            ("search my drive for a file called vacation_photos_2023.jpg.", "LOW"),
            ("help me organize my desktop files into folders by type.", "LOW"),
            ("write a Python function to sort a list of dictionaries by key", "LOW"),
+            ("can you search for startup in tokyo?", "LOW"),
            ("find the latest updates on quantum computing on the web", "LOW"),
            ("check if the folder ‘Work_Projects’ exists on my desktop", "LOW"),
            ("create a bash script to monitor CPU usage", "LOW"),
@ -383,7 +384,6 @@ class AgentRouter:
        try:
            best_agent = self.router_vote(text, labels, log_confidence=False)
        except Exception as e:
-            self.logger.error(f"Router failure: {str(e)}")
            raise e
        for agent in self.agents:
            if best_agent == agent.role["en"]: