feat : better web navigation of web agent

This commit is contained in:
martin legrand 2025-04-05 22:13:29 +02:00
parent 06ddc45955
commit 6fb9ce67c0
10 changed files with 113 additions and 78 deletions

View File

@ -10,8 +10,12 @@ You will be given a task and you will need to divide it into smaller tasks and a
You have to respect a strict format:
```json
{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
{"agent": "agent_name", "need": "needed_agents_output", "task": "agent_task"}
```
Where:
- "agent": The chosen agent for the task.
- "need": ids of the previous agents whose answers the current agent needs.
- "task": A precise description of the task the agent should conduct.
# Example 1: web app
@ -32,25 +36,25 @@ You: Sure, here is the plan:
{
"agent": "Web",
"id": "1",
"need": null,
"need": [],
"task": "Search for reliable weather APIs"
},
{
"agent": "Web",
"id": "2",
"need": "1",
"need": ["1"],
"task": "Obtain API key from the selected service"
},
{
"agent": "File",
"id": "3",
"need": null,
"need": [],
"task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
},
{
"agent": "Coder",
"id": "4",
"need": "2,3",
"need": ["2", "3"],
"task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."
}
]

View File

@ -12,6 +12,10 @@ You have to respect a strict format:
```json
{"agent": "agent_name", "need": "needed_agent_output", "task": "agent_task"}
```
Where:
- "agent": The chosen agent for the task.
- "need": ids of the previous agents whose answers the current agent needs.
- "task": A precise description of the task the agent should conduct.
# Example: weather app
@ -21,11 +25,11 @@ You: "At your service. Ive devised a plan and assigned agents to each task. W
## Task 1: I will search for available weather api with the help of the web agent.
## Task 2: I will create an api key for the weather api using the web agent.
## Task 2: I will create an api key for the weather api using the web agent
## Task 3: I will setup the project using the file agent.
## Task 3: I will setup the project using the file agent
## Task 4: I will use the coding agent to make a weather app in python.
## Task 4: I assign the coding agent to make a weather app in python
```json
{
@ -33,25 +37,25 @@ You: "At your service. Ive devised a plan and assigned agents to each task. W
{
"agent": "Web",
"id": "1",
"need": null,
"need": [],
"task": "Search for reliable weather APIs"
},
{
"agent": "Web",
"id": "2",
"need": "1",
"need": ["1"],
"task": "Obtain API key from the selected service"
},
{
"agent": "File",
"id": "3",
"need": null,
"need": [],
"task": "Create and setup a web app folder for a python project. initialize as a git repo with all required file and a sources folder. You are forbidden from asking clarification, just execute."
},
{
"agent": "Coder",
"id": "4",
"need": "2,3",
"need": ["2", "3"],
"task": "Based on the project structure. Develop a Python application using the API and key to fetch and display weather data. You are forbidden from asking clarification, just execute."
}
]

View File

@ -28,10 +28,10 @@ class OllamaLLM(GeneratorLLM):
)
for chunk in stream:
content = chunk['message']['content']
if '\n' in content:
self.logger.info(content)
with self.state.lock:
if '.' in content:
self.logger.info(self.state.current_buffer)
self.state.current_buffer += content
except Exception as e:

View File

@ -82,36 +82,47 @@ class BrowserAgent(Agent):
notes = '\n'.join(self.notes)
return f"""
You are a web browser.
You are currently on this webpage:
You are navigating the web.
**Current Context**
Webpage ({self.current_page}) content:
{page_text}
You can navigate to these navigation links:
Allowed Navigation Links:
{remaining_links_text}
Your task:
1. Decide if the current page answers the users query:
- If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
- If it does and you completed user request, say REQUEST_EXIT
- If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link.
2. Navigate by either:
- Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
- If no link seems helpful, say: GO_BACK.
3. Fill forms on the page:
- If user give you informations that help you fill form, fill it.
- If you don't know how to fill a form, leave it empty.
- You can fill a form using [form_name](value). Do not go back when you fill a form.
Inputs forms:
{inputs_form_text}
End of webpage ({self.current_page}.
# Instruction
1. **Decide if the page answers the users query:**
- If it does, take notes of useful information (Note: ...), include relevant link in note, then move to a new page.
- If it does and you completed user request, say REQUEST_EXIT.
- If it doesnt, say: Error: <why page don't help> then go back or navigate to another link.
2. **Navigate to a link by either: **
- Saying I want to navigate to <url>: (write down the full URL, e.g., www.example.com/cats).
- Going back: If no link seems helpful, say: GO_BACK.
3. **Fill forms on the page:**
- Fill form only on relevant page with given informations. You might use form to conduct search on a page.
- You can fill a form using [form_name](value). Don't GO_BACK when filling form.
- If a form is irrelevant or you lack informations leave it empty.
Recap of note taking:
If useful -> Note: [Briefly summarize the key information or task you conducted.]
Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
If not useful -> Error: [Explain why the page doesnt help.]
**Rules:**
- Do not write "The page talk about ...", write your finding on the page and how they contribute to an answer.
- Put note in a single paragraph.
- When you exit, explain why.
Example 1 (useful page, no need of going futher):
Note: According to karpathy site (https://karpathy.github.io/) LeCun net is the earliest real-world application of a neural net"
# Example:
Example 1 (useful page, no need go futher):
Note: According to karpathy site (<link>) LeCun net is ...<expand on page content>..."
No link seem useful to provide futher information. GO_BACK
Example 2 (not useful, but related link):
Example 2 (not useful, see useful link on page):
Error: reddit.com/welcome does not discuss anything related to the users query.
There is a link that could lead to the information, I want to navigate to http://reddit.com/r/locallama
@ -119,35 +130,29 @@ class BrowserAgent(Agent):
Error: x.com does not discuss anything related to the users query and no navigation link are usefull.
GO_BACK
Example 3 (query answer found):
Note: I found on github.com that agenticSeek is made by Fosowl.
Given this information, given this I should exit the web browser. REQUEST_EXIT
Example 3 (query answer found, enought notes taken):
Note: I found on <link> that ...<expand on information found>...
Given this answer the user query I should exit the web browser. REQUEST_EXIT
Example 4 (loging form visible):
Note: I am on the login page, I should now type the given username and password.
Note: I am on the login page, I will type the given username and password.
[username_field](David)
[password_field](edgerunners77)
You see the following inputs forms:
{inputs_form_text}
Remember, the user asked:
Remember, user asked:
{user_prompt}
So far you took these notes:
You previously took these notes:
{notes}
You are currently on page : {self.current_page}
Do not explain your choice.
Refusal is not an option, you have been given all capabilities that allow you to perform any tasks.
Do not Step-by-Step explanation. Instead write simple explanation sentence following by your notes and actions.
"""
def llm_decide(self, prompt: str) -> Tuple[str, str]:
def llm_decide(self, prompt: str, show_reasoning: bool = False) -> Tuple[str, str]:
animate_thinking("Thinking...", color="status")
self.memory.push('user', prompt)
answer, reasoning = self.llm_request()
output = answer if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
print()
pretty_print(output, color="output")
print()
if show_reasoning:
pretty_print(reasoning, color="failure")
pretty_print(answer, color="output")
return answer, reasoning
def select_unvisited(self, search_result: List[str]) -> List[str]:
@ -179,11 +184,29 @@ class BrowserAgent(Agent):
def stringify_search_results(self, results_arr: List[str]) -> str:
    """Render search results as text blocks separated by a blank line.

    Args:
        results_arr: Search results. Each element is indexed with the keys
            ``'link'`` and ``'snippet'``.
            NOTE(review): the annotation says ``List[str]`` but the elements
            are subscripted by string keys, so they are presumably dicts;
            confirm against the caller.

    Returns:
        One ``Link: ...`` / ``Preview: ...`` block per result, joined by an
        empty line. An empty input yields an empty string.
    """
    blocks = []
    for entry in results_arr:
        blocks.append(f"Link: {entry['link']}\nPreview: {entry['snippet']}")
    return '\n\n'.join(blocks)
def save_notes(self, text):
def parse_answer(self, text):
lines = text.split('\n')
saving = False
buffer = []
links = []
for line in lines:
if "exit" in line:
saving = False
if "note" in line.lower():
self.notes.append(line)
saving = True
if saving:
buffer.append(line)
else:
links.extend(self.extract_links(line))
self.notes.append('. '.join(buffer))
return links
def select_link(self, links: List[str]) -> str | None:
for lk in links:
if lk == self.current_page:
continue
return lk
return None
def conclude_prompt(self, user_query: str) -> str:
annotated_notes = [f"{i+1}: {note.lower().replace('note:', '')}" for i, note in enumerate(self.notes)]
@ -196,6 +219,7 @@ class BrowserAgent(Agent):
{search_note}
Expand on the finding or step that lead to success, and provide a conclusion that answer the request. Include link when possible.
Do not give advices or try to answer the human. Just structure the AI finding in a structured and clear way.
"""
def search_prompt(self, user_prompt: str) -> str:
@ -214,7 +238,8 @@ class BrowserAgent(Agent):
You: "search: Recent space missions news, {self.date}"
Do not explain, do not write anything beside the search query.
If the query does not make any sense for a web search explain why and say REQUEST_EXIT
Except if query does not make any sense for a web search then explain why and say REQUEST_EXIT
Do not try to answer query. you can only formulate search term or exit.
"""
def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
@ -255,17 +280,16 @@ class BrowserAgent(Agent):
mem_begin_idx = self.memory.push('user', self.search_prompt(user_prompt))
ai_prompt, _ = self.llm_request()
if "REQUEST_EXIT" in ai_prompt:
pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
pretty_print(f"Web agent requested exit.\n{reasoning}\n\n{ai_prompt}", color="failure")
return ai_prompt, ""
animate_thinking(f"Searching...", color="status")
search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
search_result = self.jsonify_search_results(search_result_raw)[:12]
self.show_search_results(search_result)
prompt = self.make_newsearch_prompt(user_prompt, search_result)
unvisited = [None]
while not complete:
answer, reasoning = self.llm_decide(prompt)
self.save_notes(answer)
answer, reasoning = self.llm_decide(prompt, show_reasoning = True)
extracted_form = self.extract_form(answer)
if len(extracted_form) > 0:
@ -275,11 +299,13 @@ class BrowserAgent(Agent):
answer = self.handle_update_prompt(user_prompt, page_text)
answer, reasoning = self.llm_decide(prompt)
links = self.parse_answer(answer)
link = self.select_link(links)
if "REQUEST_EXIT" in answer:
complete = True
break
links = self.extract_links(answer)
if len(unvisited) == 0:
break
@ -289,21 +315,21 @@ class BrowserAgent(Agent):
prompt = self.make_navigation_prompt(user_prompt, page_text)
continue
if len(links) == 0 or "GO_BACK" in answer:
if link == None or "GO_BACK" in answer:
unvisited = self.select_unvisited(search_result)
prompt = self.make_newsearch_prompt(user_prompt, unvisited)
pretty_print(f"Going back to results. Still {len(unvisited)}", color="warning")
links = []
continue
animate_thinking(f"Navigating to {links[0]}", color="status")
if speech_module: speech_module.speak(f"Navigating to {links[0]}")
self.browser.go_to(links[0])
self.current_page = links[0]
self.search_history.append(links[0])
animate_thinking(f"Navigating to {link}", color="status")
if speech_module: speech_module.speak(f"Navigating to {link}")
self.browser.go_to(link)
self.current_page = link
self.search_history.append(link)
page_text = self.browser.get_text()
self.navigable_links = self.browser.get_navigable()
prompt = self.make_navigation_prompt(user_prompt, page_text)
pretty_print(f"Current page: {self.current_page}", color="warning")
prompt = self.conclude_prompt(user_prompt)
mem_last_idx = self.memory.push('user', prompt)

View File

@ -68,7 +68,7 @@ class PlannerAgent(Agent):
if agent_infos_dict is None or len(agent_infos_dict) == 0:
infos = "No needed informations."
else:
for agent_id, info in agent_infos_dict:
for agent_id, info in agent_infos_dict.items():
infos += f"\t- According to agent {agent_id}:\n{info}\n\n"
prompt = f"""
You are given informations from your AI friends work:
@ -116,7 +116,6 @@ class PlannerAgent(Agent):
def process(self, prompt: str, speech_module: Speech) -> str:
agents_tasks = (None, None)
required_infos = None
agents_work_result = dict()
answer = self.make_plan(prompt)

View File

@ -5,6 +5,7 @@ from sources.utility import pretty_print, animate_thinking
from sources.router import AgentRouter
from sources.speech_to_text import AudioTranscriber, AudioRecorder
class Interaction:
"""
Interaction is a class that handles the interaction between the user and the agents.

View File

@ -14,8 +14,8 @@ class LanguageUtility:
self.sid = None
self.translators_tokenizer = None
self.translators_model = None
self.load_model()
self.logger = Logger("language.log")
self.load_model()
def load_model(self) -> None:
animate_thinking("Loading language utility...", color="status")

View File

@ -31,6 +31,7 @@ class Provider:
"dsk_deepseek": self.dsk_deepseek,
"test": self.test_fn
}
self.logger = Logger("provider.log")
self.api_key = None
self.unsafe_providers = ["openai", "deepseek", "dsk_deepseek"]
if self.provider_name not in self.available_providers:
@ -43,7 +44,6 @@ class Provider:
self.check_address_format(self.server_ip)
if not self.is_ip_online(self.server_ip.split(':')[0]):
raise Exception(f"Server at {self.server_ip} is offline.")
self.logger = Logger("provider.log")
def get_api_key(self, provider):
load_dotenv()
@ -79,6 +79,9 @@ class Provider:
self.logger.info(f"Using provider: {self.provider_name} at {self.server_ip}")
try:
thought = llm(history, verbose)
except KeyboardInterrupt:
self.logger.warning("User interrupted the operation with Ctrl+C")
return "Operation interrupted by user. REQUEST_EXIT"
except ConnectionError as e:
raise ConnectionError(f"{str(e)}\nConnection to {self.server_ip} failed.")
except AttributeError as e:
@ -105,11 +108,9 @@ class Provider:
self.logger.error(f"Ping command returned code: {output.returncode}")
return False
except subprocess.TimeoutExpired:
self.logger.error("Ping subprocess timeout.")
return False
except Exception as e:
pretty_print(f"Error with ping request {str(e)}", color="failure")
self.logger.error(f"Ping error: {str(e)}")
return False
def server_fn(self, history, verbose = False):
@ -299,6 +300,6 @@ class Provider:
return thought
if __name__ == "__main__":
provider = Provider("ollama", "deepseek-r1:1.5b", "127.0.0.1:11434")
provider = Provider("server", "deepseek-r1:14b", "192.168.1.20:3333")
res = provider.respond(["user", "Hello, how are you?"])
print("Response:", res)

View File

@ -22,6 +22,7 @@ class Memory():
self.memory = []
self.memory = [{'role': 'system', 'content': system_prompt}]
self.logger = Logger("memory.log")
self.session_time = datetime.datetime.now()
self.session_id = str(uuid.uuid4())
self.conversation_folder = f"conversations/"
@ -35,7 +36,6 @@ class Memory():
self.memory_compression = memory_compression
self.tokenizer = AutoTokenizer.from_pretrained(self.model)
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model)
self.logger = Logger("memory.log")
def get_filename(self) -> str:
"""Get the filename for the save file."""

View File

@ -21,13 +21,13 @@ class AgentRouter:
"""
def __init__(self, agents: list):
self.agents = agents
self.logger = Logger("router.log")
self.lang_analysis = LanguageUtility()
self.pipelines = self.load_pipelines()
self.talk_classifier = self.load_llm_router()
self.complexity_classifier = self.load_llm_router()
self.learn_few_shots_tasks()
self.learn_few_shots_complexity()
self.logger = Logger("router.log")
def load_pipelines(self) -> Dict[str, Type[pipeline]]:
"""
@ -82,6 +82,7 @@ class AgentRouter:
("search my drive for a file called vacation_photos_2023.jpg.", "LOW"),
("help me organize my desktop files into folders by type.", "LOW"),
("write a Python function to sort a list of dictionaries by key", "LOW"),
("can you search for startup in tokyo?", "LOW"),
("find the latest updates on quantum computing on the web", "LOW"),
("check if the folder Work_Projects exists on my desktop", "LOW"),
("create a bash script to monitor CPU usage", "LOW"),
@ -383,7 +384,6 @@ class AgentRouter:
try:
best_agent = self.router_vote(text, labels, log_confidence=False)
except Exception as e:
self.logger.error(f"Router failure: {str(e)}")
raise e
for agent in self.agents:
if best_agent == agent.role["en"]: