feat : better web navigation of web agent

This commit is contained in:
martin legrand 2025-04-05 22:13:29 +02:00
parent 06ddc45955
commit 6fb9ce67c0
10 changed files with 113 additions and 78 deletions

View File

@ -10,8 +10,12 @@ You will be given a task and you will need to divide it into smaller tasks and a
You have to respect a strict format: You have to respect a strict format:
```json ```json
{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"} {"agent": "agent_name", "need": "needed_agents_output", "task": "agent_task"}
``` ```
Where:
- "agent": The chosen agent for the task.
- "need": ids of the previous agents whose answers the current agent needs.
- "task": A precise description of the task the agent should conduct.
# Example 1: web app # Example 1: web app
@ -32,25 +36,25 @@ You: Sure, here is the plan:
{ {
"agent": "Web", "agent": "Web",
"id": "1", "id": "1",
"need": null, "need": [],
"task": "Search for reliable weather APIs" "task": "Search for reliable weather APIs"
}, },
{ {
"agent": "Web", "agent": "Web",
"id": "2", "id": "2",
"need": "1", "need": ["1"],
"task": "Obtain API key from the selected service" "task": "Obtain API key from the selected service"
}, },
{ {
"agent": "File", "agent": "File",
"id": "3", "id": "3",
"need": null, "need": [],
"task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute." "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
}, },
{ {
"agent": "Coder", "agent": "Coder",
"id": "3", "id": "3",
"need": "2,3", "need": ["2", "3"],
"task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."" "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute.""
} }
] ]

View File

@ -12,6 +12,10 @@ You have to respect a strict format:
```json ```json
{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"} {"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
``` ```
Where:
- "agent": The chosen agent for the task.
- "need": ids of the previous agents whose answers the current agent needs.
- "task": A precise description of the task the agent should conduct.
# Example: weather app # Example: weather app
@ -21,11 +25,11 @@ You: "At your service. Ive devised a plan and assigned agents to each task. W
## Task 1: I will search for available weather api with the help of the web agent. ## Task 1: I will search for available weather api with the help of the web agent.
## Task 2: I will create an api key for the weather api using the web agent. ## Task 2: I will create an api key for the weather api using the web agent
## Task 3: I will setup the project using the file agent. ## Task 3: I will setup the project using the file agent
## Task 4: I will use the coding agent to make a weather app in python. ## Task 4: I assign the coding agent to make a weather app in python
```json ```json
{ {
@ -33,25 +37,25 @@ You: "At your service. Ive devised a plan and assigned agents to each task. W
{ {
"agent": "Web", "agent": "Web",
"id": "1", "id": "1",
"need": null, "need": [],
"task": "Search for reliable weather APIs" "task": "Search for reliable weather APIs"
}, },
{ {
"agent": "Web", "agent": "Web",
"id": "2", "id": "2",
"need": "1", "need": ["1"],
"task": "Obtain API key from the selected service" "task": "Obtain API key from the selected service"
}, },
{ {
"agent": "File", "agent": "File",
"id": "3", "id": "3",
"need": null, "need": [],
"task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute." "task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
}, },
{ {
"agent": "Coder", "agent": "Coder",
"id": "3", "id": "3",
"need": "2,3", "need": ["2", "3"],
"task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."" "task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute.""
} }
] ]

View File

@ -28,10 +28,10 @@ class OllamaLLM(GeneratorLLM):
) )
for chunk in stream: for chunk in stream:
content = chunk['message']['content'] content = chunk['message']['content']
if '\n' in content:
self.logger.info(content)
with self.state.lock: with self.state.lock:
if '.' in content:
self.logger.info(self.state.current_buffer)
self.state.current_buffer += content self.state.current_buffer += content
except Exception as e: except Exception as e:

View File

@ -82,36 +82,47 @@ class BrowserAgent(Agent):
notes = '\n'.join(self.notes) notes = '\n'.join(self.notes)
return f""" return f"""
You are a web browser. You are navigating the web.
You are currently on this webpage:
**Current Context**
Webpage ({self.current_page}) content:
{page_text} {page_text}
You can navigate to these navigation links: Allowed Navigation Links:
{remaining_links_text} {remaining_links_text}
Your task: Inputs forms:
1. Decide if the current page answers the users query: {inputs_form_text}
- If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
- If it does and you completed user request, say REQUEST_EXIT End of webpage ({self.current_page}.
- If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link.
2. Navigate by either: # Instruction
- Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
- If no link seems helpful, say: GO_BACK. 1. **Decide if the page answers the users query:**
3. Fill forms on the page: - If it does, take notes of useful information (Note: ...), include relevant link in note, then move to a new page.
- If user give you informations that help you fill form, fill it. - If it does and you completed user request, say REQUEST_EXIT.
- If you don't know how to fill a form, leave it empty. - If it doesnt, say: Error: <why page don't help> then go back or navigate to another link.
- You can fill a form using [form_name](value). Do not go back when you fill a form. 2. **Navigate to a link by either: **
- Saying I want to navigate to <url>: (write down the full URL, e.g., www.example.com/cats).
- Going back: If no link seems helpful, say: GO_BACK.
3. **Fill forms on the page:**
- Fill form only on relevant page with given informations. You might use form to conduct search on a page.
- You can fill a form using [form_name](value). Don't GO_BACK when filling form.
- If a form is irrelevant or you lack informations leave it empty.
Recap of note taking: **Rules:**
If useful -> Note: [Briefly summarize the key information or task you conducted.] - Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer. - Put note in a single paragraph.
If not useful -> Error: [Explain why the page doesnt help.] - When you exit, explain why.
Example 1 (useful page, no need of going futher): # Example:
Note: According to karpathy site (https://karpathy.github.io/) LeCun net is the earliest real-world application of a neural net"
Example 1 (useful page, no need go futher):
Note: According to karpathy site (<link>) LeCun net is ...<expand on page content>..."
No link seem useful to provide futher information. GO_BACK No link seem useful to provide futher information. GO_BACK
Example 2 (not useful, but related link): Example 2 (not useful, see useful link on page):
Error: reddit.com/welcome does not discuss anything related to the users query. Error: reddit.com/welcome does not discuss anything related to the users query.
There is a link that could lead to the information, I want to navigate to http://reddit.com/r/locallama There is a link that could lead to the information, I want to navigate to http://reddit.com/r/locallama
@ -119,35 +130,29 @@ class BrowserAgent(Agent):
Error: x.com does not discuss anything related to the users query and no navigation link are usefull. Error: x.com does not discuss anything related to the users query and no navigation link are usefull.
GO_BACK GO_BACK
Example 3 (query answer found): Example 3 (query answer found, enought notes taken):
Note: I found on github.com that agenticSeek is made by Fosowl. Note: I found on <link> that ...<expand on information found>...
Given this information, given this I should exit the web browser. REQUEST_EXIT Given this answer the user query I should exit the web browser. REQUEST_EXIT
Example 4 (loging form visible): Example 4 (loging form visible):
Note: I am on the login page, I should now type the given username and password. Note: I am on the login page, I will type the given username and password.
[username_field](David) [username_field](David)
[password_field](edgerunners77) [password_field](edgerunners77)
You see the following inputs forms: Remember, user asked:
{inputs_form_text}
Remember, the user asked:
{user_prompt} {user_prompt}
So far you took these notes: You previously took these notes:
{notes} {notes}
You are currently on page : {self.current_page} Do not Step-by-Step explanation. Instead write simple explanation sentence following by your notes and actions.
Do not explain your choice.
Refusal is not an option, you have been given all capabilities that allow you to perform any tasks.
""" """
def llm_decide(self, prompt: str) -> Tuple[str, str]: def llm_decide(self, prompt: str, show_reasoning: bool = False) -> Tuple[str, str]:
animate_thinking("Thinking...", color="status") animate_thinking("Thinking...", color="status")
self.memory.push('user', prompt) self.memory.push('user', prompt)
answer, reasoning = self.llm_request() answer, reasoning = self.llm_request()
output = answer if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}" if show_reasoning:
print() pretty_print(reasoning, color="failure")
pretty_print(output, color="output") pretty_print(answer, color="output")
print()
return answer, reasoning return answer, reasoning
def select_unvisited(self, search_result: List[str]) -> List[str]: def select_unvisited(self, search_result: List[str]) -> List[str]:
@ -179,11 +184,29 @@ class BrowserAgent(Agent):
def stringify_search_results(self, results_arr: List[str]) -> str: def stringify_search_results(self, results_arr: List[str]) -> str:
return '\n\n'.join([f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr]) return '\n\n'.join([f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr])
def save_notes(self, text): def parse_answer(self, text):
lines = text.split('\n') lines = text.split('\n')
saving = False
buffer = []
links = []
for line in lines: for line in lines:
if "exit" in line:
saving = False
if "note" in line.lower(): if "note" in line.lower():
self.notes.append(line) saving = True
if saving:
buffer.append(line)
else:
links.extend(self.extract_links(line))
self.notes.append('. '.join(buffer))
return links
def select_link(self, links: List[str]) -> str | None:
for lk in links:
if lk == self.current_page:
continue
return lk
return None
def conclude_prompt(self, user_query: str) -> str: def conclude_prompt(self, user_query: str) -> str:
annotated_notes = [f"{i+1}: {note.lower().replace('note:', '')}" for i, note in enumerate(self.notes)] annotated_notes = [f"{i+1}: {note.lower().replace('note:', '')}" for i, note in enumerate(self.notes)]
@ -196,6 +219,7 @@ class BrowserAgent(Agent):
{search_note} {search_note}
Expand on the finding or step that lead to success, and provide a conclusion that answer the request. Include link when possible. Expand on the finding or step that lead to success, and provide a conclusion that answer the request. Include link when possible.
Do not give advices or try to answer the human. Just structure the AI finding in a structured and clear way.
""" """
def search_prompt(self, user_prompt: str) -> str: def search_prompt(self, user_prompt: str) -> str:
@ -214,7 +238,8 @@ class BrowserAgent(Agent):
You: "search: Recent space missions news, {self.date}" You: "search: Recent space missions news, {self.date}"
Do not explain, do not write anything beside the search query. Do not explain, do not write anything beside the search query.
If the query does not make any sense for a web search explain why and say REQUEST_EXIT Except if query does not make any sense for a web search then explain why and say REQUEST_EXIT
Do not try to answer query. you can only formulate search term or exit.
""" """
def handle_update_prompt(self, user_prompt: str, page_text: str) -> str: def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
@ -255,17 +280,16 @@ class BrowserAgent(Agent):
mem_begin_idx = self.memory.push('user', self.search_prompt(user_prompt)) mem_begin_idx = self.memory.push('user', self.search_prompt(user_prompt))
ai_prompt, _ = self.llm_request() ai_prompt, _ = self.llm_request()
if "REQUEST_EXIT" in ai_prompt: if "REQUEST_EXIT" in ai_prompt:
pretty_print(f"{reasoning}\n{ai_prompt}", color="output") pretty_print(f"Web agent requested exit.\n{reasoning}\n\n{ai_prompt}", color="failure")
return ai_prompt, "" return ai_prompt, ""
animate_thinking(f"Searching...", color="status") animate_thinking(f"Searching...", color="status")
search_result_raw = self.tools["web_search"].execute([ai_prompt], False) search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement search_result = self.jsonify_search_results(search_result_raw)[:12]
self.show_search_results(search_result) self.show_search_results(search_result)
prompt = self.make_newsearch_prompt(user_prompt, search_result) prompt = self.make_newsearch_prompt(user_prompt, search_result)
unvisited = [None] unvisited = [None]
while not complete: while not complete:
answer, reasoning = self.llm_decide(prompt) answer, reasoning = self.llm_decide(prompt, show_reasoning = True)
self.save_notes(answer)
extracted_form = self.extract_form(answer) extracted_form = self.extract_form(answer)
if len(extracted_form) > 0: if len(extracted_form) > 0:
@ -275,11 +299,13 @@ class BrowserAgent(Agent):
answer = self.handle_update_prompt(user_prompt, page_text) answer = self.handle_update_prompt(user_prompt, page_text)
answer, reasoning = self.llm_decide(prompt) answer, reasoning = self.llm_decide(prompt)
links = self.parse_answer(answer)
link = self.select_link(links)
if "REQUEST_EXIT" in answer: if "REQUEST_EXIT" in answer:
complete = True complete = True
break break
links = self.extract_links(answer)
if len(unvisited) == 0: if len(unvisited) == 0:
break break
@ -289,21 +315,21 @@ class BrowserAgent(Agent):
prompt = self.make_navigation_prompt(user_prompt, page_text) prompt = self.make_navigation_prompt(user_prompt, page_text)
continue continue
if len(links) == 0 or "GO_BACK" in answer: if link == None or "GO_BACK" in answer:
unvisited = self.select_unvisited(search_result) unvisited = self.select_unvisited(search_result)
prompt = self.make_newsearch_prompt(user_prompt, unvisited) prompt = self.make_newsearch_prompt(user_prompt, unvisited)
pretty_print(f"Going back to results. Still {len(unvisited)}", color="warning") pretty_print(f"Going back to results. Still {len(unvisited)}", color="warning")
links = []
continue continue
animate_thinking(f"Navigating to {links[0]}", color="status") animate_thinking(f"Navigating to {link}", color="status")
if speech_module: speech_module.speak(f"Navigating to {links[0]}") if speech_module: speech_module.speak(f"Navigating to {link}")
self.browser.go_to(links[0]) self.browser.go_to(link)
self.current_page = links[0] self.current_page = link
self.search_history.append(links[0]) self.search_history.append(link)
page_text = self.browser.get_text() page_text = self.browser.get_text()
self.navigable_links = self.browser.get_navigable() self.navigable_links = self.browser.get_navigable()
prompt = self.make_navigation_prompt(user_prompt, page_text) prompt = self.make_navigation_prompt(user_prompt, page_text)
pretty_print(f"Current page: {self.current_page}", color="warning")
prompt = self.conclude_prompt(user_prompt) prompt = self.conclude_prompt(user_prompt)
mem_last_idx = self.memory.push('user', prompt) mem_last_idx = self.memory.push('user', prompt)

View File

@ -68,7 +68,7 @@ class PlannerAgent(Agent):
if agent_infos_dict is None or len(agent_infos_dict) == 0: if agent_infos_dict is None or len(agent_infos_dict) == 0:
infos = "No needed informations." infos = "No needed informations."
else: else:
for agent_id, info in agent_infos_dict: for agent_id, info in agent_infos_dict.items():
infos += f"\t- According to agent {agent_id}:\n{info}\n\n" infos += f"\t- According to agent {agent_id}:\n{info}\n\n"
prompt = f""" prompt = f"""
You are given informations from your AI friends work: You are given informations from your AI friends work:
@ -116,7 +116,6 @@ class PlannerAgent(Agent):
def process(self, prompt: str, speech_module: Speech) -> str: def process(self, prompt: str, speech_module: Speech) -> str:
agents_tasks = (None, None) agents_tasks = (None, None)
required_infos = None
agents_work_result = dict() agents_work_result = dict()
answer = self.make_plan(prompt) answer = self.make_plan(prompt)

View File

@ -5,6 +5,7 @@ from sources.utility import pretty_print, animate_thinking
from sources.router import AgentRouter from sources.router import AgentRouter
from sources.speech_to_text import AudioTranscriber, AudioRecorder from sources.speech_to_text import AudioTranscriber, AudioRecorder
class Interaction: class Interaction:
""" """
Interaction is a class that handles the interaction between the user and the agents. Interaction is a class that handles the interaction between the user and the agents.

View File

@ -14,8 +14,8 @@ class LanguageUtility:
self.sid = None self.sid = None
self.translators_tokenizer = None self.translators_tokenizer = None
self.translators_model = None self.translators_model = None
self.load_model()
self.logger = Logger("language.log") self.logger = Logger("language.log")
self.load_model()
def load_model(self) -> None: def load_model(self) -> None:
animate_thinking("Loading language utility...", color="status") animate_thinking("Loading language utility...", color="status")

View File

@ -31,6 +31,7 @@ class Provider:
"dsk_deepseek": self.dsk_deepseek, "dsk_deepseek": self.dsk_deepseek,
"test": self.test_fn "test": self.test_fn
} }
self.logger = Logger("provider.log")
self.api_key = None self.api_key = None
self.unsafe_providers = ["openai", "deepseek", "dsk_deepseek"] self.unsafe_providers = ["openai", "deepseek", "dsk_deepseek"]
if self.provider_name not in self.available_providers: if self.provider_name not in self.available_providers:
@ -43,7 +44,6 @@ class Provider:
self.check_address_format(self.server_ip) self.check_address_format(self.server_ip)
if not self.is_ip_online(self.server_ip.split(':')[0]): if not self.is_ip_online(self.server_ip.split(':')[0]):
raise Exception(f"Server at {self.server_ip} is offline.") raise Exception(f"Server at {self.server_ip} is offline.")
self.logger = Logger("provider.log")
def get_api_key(self, provider): def get_api_key(self, provider):
load_dotenv() load_dotenv()
@ -79,6 +79,9 @@ class Provider:
self.logger.info(f"Using provider: {self.provider_name} at {self.server_ip}") self.logger.info(f"Using provider: {self.provider_name} at {self.server_ip}")
try: try:
thought = llm(history, verbose) thought = llm(history, verbose)
except KeyboardInterrupt:
self.logger.warning("User interrupted the operation with Ctrl+C")
return "Operation interrupted by user. REQUEST_EXIT"
except ConnectionError as e: except ConnectionError as e:
raise ConnectionError(f"{str(e)}\nConnection to {self.server_ip} failed.") raise ConnectionError(f"{str(e)}\nConnection to {self.server_ip} failed.")
except AttributeError as e: except AttributeError as e:
@ -105,11 +108,9 @@ class Provider:
self.logger.error(f"Ping command returned code: {output.returncode}") self.logger.error(f"Ping command returned code: {output.returncode}")
return False return False
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
self.logger.error("Ping subprocess timeout.")
return False return False
except Exception as e: except Exception as e:
pretty_print(f"Error with ping request {str(e)}", color="failure") pretty_print(f"Error with ping request {str(e)}", color="failure")
self.logger.error(f"Ping error: {str(e)}")
return False return False
def server_fn(self, history, verbose = False): def server_fn(self, history, verbose = False):
@ -299,6 +300,6 @@ class Provider:
return thought return thought
if __name__ == "__main__": if __name__ == "__main__":
provider = Provider("ollama", "deepseek-r1:1.5b", "127.0.0.1:11434") provider = Provider("server", "deepseek-r1:14b", "192.168.1.20:3333")
res = provider.respond(["user", "Hello, how are you?"]) res = provider.respond(["user", "Hello, how are you?"])
print("Response:", res) print("Response:", res)

View File

@ -22,6 +22,7 @@ class Memory():
self.memory = [] self.memory = []
self.memory = [{'role': 'system', 'content': system_prompt}] self.memory = [{'role': 'system', 'content': system_prompt}]
self.logger = Logger("memory.log")
self.session_time = datetime.datetime.now() self.session_time = datetime.datetime.now()
self.session_id = str(uuid.uuid4()) self.session_id = str(uuid.uuid4())
self.conversation_folder = f"conversations/" self.conversation_folder = f"conversations/"
@ -35,7 +36,6 @@ class Memory():
self.memory_compression = memory_compression self.memory_compression = memory_compression
self.tokenizer = AutoTokenizer.from_pretrained(self.model) self.tokenizer = AutoTokenizer.from_pretrained(self.model)
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model) self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model)
self.logger = Logger("memory.log")
def get_filename(self) -> str: def get_filename(self) -> str:
"""Get the filename for the save file.""" """Get the filename for the save file."""

View File

@ -21,13 +21,13 @@ class AgentRouter:
""" """
def __init__(self, agents: list): def __init__(self, agents: list):
self.agents = agents self.agents = agents
self.logger = Logger("router.log")
self.lang_analysis = LanguageUtility() self.lang_analysis = LanguageUtility()
self.pipelines = self.load_pipelines() self.pipelines = self.load_pipelines()
self.talk_classifier = self.load_llm_router() self.talk_classifier = self.load_llm_router()
self.complexity_classifier = self.load_llm_router() self.complexity_classifier = self.load_llm_router()
self.learn_few_shots_tasks() self.learn_few_shots_tasks()
self.learn_few_shots_complexity() self.learn_few_shots_complexity()
self.logger = Logger("router.log")
def load_pipelines(self) -> Dict[str, Type[pipeline]]: def load_pipelines(self) -> Dict[str, Type[pipeline]]:
""" """
@ -82,6 +82,7 @@ class AgentRouter:
("search my drive for a file called vacation_photos_2023.jpg.", "LOW"), ("search my drive for a file called vacation_photos_2023.jpg.", "LOW"),
("help me organize my desktop files into folders by type.", "LOW"), ("help me organize my desktop files into folders by type.", "LOW"),
("write a Python function to sort a list of dictionaries by key", "LOW"), ("write a Python function to sort a list of dictionaries by key", "LOW"),
("can you search for startup in tokyo?", "LOW"),
("find the latest updates on quantum computing on the web", "LOW"), ("find the latest updates on quantum computing on the web", "LOW"),
("check if the folder Work_Projects exists on my desktop", "LOW"), ("check if the folder Work_Projects exists on my desktop", "LOW"),
("create a bash script to monitor CPU usage", "LOW"), ("create a bash script to monitor CPU usage", "LOW"),
@ -383,7 +384,6 @@ class AgentRouter:
try: try:
best_agent = self.router_vote(text, labels, log_confidence=False) best_agent = self.router_vote(text, labels, log_confidence=False)
except Exception as e: except Exception as e:
self.logger.error(f"Router failure: {str(e)}")
raise e raise e
for agent in self.agents: for agent in self.agents:
if best_agent == agent.role["en"]: if best_agent == agent.role["en"]: