fix: browser not properly handling web forms

This commit is contained in:
martin legrand 2025-04-01 13:09:16 +02:00
parent a3e95abfde
commit b3efd09fb3
2 changed files with 118 additions and 18 deletions

View File

@ -6,7 +6,8 @@ from sources.agents.agent import Agent
from sources.tools.searxSearch import searxSearch from sources.tools.searxSearch import searxSearch
from sources.browser import Browser from sources.browser import Browser
from datetime import date from datetime import date
from typing import List, Tuple from typing import List, Tuple, Type, Dict, Tuple
class BrowserAgent(Agent): class BrowserAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False, browser=None): def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
@ -92,7 +93,7 @@ class BrowserAgent(Agent):
Your task: Your task:
1. Decide if the current page answers the users query: {user_prompt} 1. Decide if the current page answers the users query: {user_prompt}
- If it does, take notes of the useful information, write down source, link or reference, then move to a new page. - If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
- If it does and you are 100% certain that it provide a definive answer, say REQUEST_EXIT - If it does and you completed use request, say REQUEST_EXIT
- If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link. - If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link.
2. Navigate by either: 2. Navigate by either:
- Navigate to a navigation links (write the full URL, e.g., www.example.com/cats). - Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
@ -100,7 +101,7 @@ class BrowserAgent(Agent):
3. Fill forms on the page: 3. Fill forms on the page:
- If user give you informations that help you fill form, fill it. - If user give you informations that help you fill form, fill it.
- If you don't know how to fill a form, leave it empty. - If you don't know how to fill a form, leave it empty.
- You can fill a form using [form_name](value). - You can fill a form using [form_name](value). Do not go back when you fill a form.
Recap of note taking: Recap of note taking:
If useful -> Note: [Briefly summarize the key information or task you conducted.] If useful -> Note: [Briefly summarize the key information or task you conducted.]
@ -125,8 +126,8 @@ class BrowserAgent(Agent):
Example 4 (loging form visible): Example 4 (loging form visible):
Note: I am on the login page, I should now type the given username and password. Note: I am on the login page, I should now type the given username and password.
[form_name_1](David) [username_field](David)
[form_name_2](edgerunners_2077) [password_field](edgerunners77)
You see the following inputs forms: You see the following inputs forms:
{inputs_form_text} {inputs_form_text}
@ -143,8 +144,9 @@ class BrowserAgent(Agent):
animate_thinking("Thinking...", color="status") animate_thinking("Thinking...", color="status")
self.memory.push('user', prompt) self.memory.push('user', prompt)
answer, reasoning = self.llm_request() answer, reasoning = self.llm_request()
output = f"Answer: {answer}" if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
pretty_print("-"*100) pretty_print("-"*100)
pretty_print(answer, color="output") pretty_print(output, color="output")
pretty_print("-"*100) pretty_print("-"*100)
return answer, reasoning return answer, reasoning
@ -175,7 +177,7 @@ class BrowserAgent(Agent):
return parsed_results return parsed_results
def stringify_search_results(self, results_arr: List[str]) -> str: def stringify_search_results(self, results_arr: List[str]) -> str:
return '\n\n'.join([f"Link: {res['link']}" for res in results_arr]) return '\n\n'.join([f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr])
def save_notes(self, text): def save_notes(self, text):
lines = text.split('\n') lines = text.split('\n')
@ -215,18 +217,49 @@ class BrowserAgent(Agent):
If the query does not make any sense for a web search explain why and say REQUEST_EXIT If the query does not make any sense for a web search explain why and say REQUEST_EXIT
""" """
def process(self, user_prompt, speech_module) -> str: def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
return f"""
You are a web browser.
You just filled a form on the page.
Now you should see the result of the form submission on the page:
Page text:
{page_text}
The user asked: {user_prompt}
Does the page answer the users query now?
If it does, take notes of the useful information, write down result and say FORM_FILLED.
If you were previously on a login form, no need to explain.
If it does and you completed user request, say REQUEST_EXIT
if it doesnt, say: Error: This page does not answer the users query then GO_BACK.
"""
def show_search_results(self, search_result: List[str]):
    """
    Print a short listing of the search results.
    Args:
        search_result: Iterable of result entries; each entry is indexed
            with the 'title' and 'link' keys.
    """
    pretty_print("\nSearch results:", color="output")
    # Lazy generator: each line is formatted right before it is printed,
    # so output order matches a plain loop over the entries.
    listing = (
        f"Title: {entry['title']} - Link: {entry['link']}"
        for entry in search_result
    )
    for line in listing:
        pretty_print(line, color="output")
def process(self, user_prompt: str, speech_module: type) -> Tuple[str, str]:
"""
Process the user prompt to conduct an autonomous web search.
Start with a google search with searxng using web_search tool.
Then enter a navigation logic to find the answer or conduct required actions.
Args:
user_prompt: The user's input query
speech_module: Optional speech output module
Returns:
tuple containing the final answer and reasoning
"""
complete = False complete = False
animate_thinking(f"Thinking...", color="status") animate_thinking(f"Thinking...", color="status")
self.memory.push('user', self.search_prompt(user_prompt)) self.memory.push('user', self.search_prompt(user_prompt))
ai_prompt, _ = self.llm_request() ai_prompt, _ = self.llm_request()
if "REQUEST_EXIT" in ai_prompt: if "REQUEST_EXIT" in ai_prompt:
# request make no sense, maybe wrong agent was allocated? pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
return ai_prompt, "" return ai_prompt, ""
animate_thinking(f"Searching...", color="status") animate_thinking(f"Searching...", color="status")
search_result_raw = self.tools["web_search"].execute([ai_prompt], False) search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
self.show_search_results(search_result)
prompt = self.make_newsearch_prompt(user_prompt, search_result) prompt = self.make_newsearch_prompt(user_prompt, search_result)
unvisited = [None] unvisited = [None]
while not complete: while not complete:
@ -236,7 +269,10 @@ class BrowserAgent(Agent):
extracted_form = self.extract_form(answer) extracted_form = self.extract_form(answer)
if len(extracted_form) > 0: if len(extracted_form) > 0:
self.browser.fill_form_inputs(extracted_form) self.browser.fill_form_inputs(extracted_form)
self.browser.find_and_click_submit() self.browser.find_and_click_submission()
page_text = self.browser.get_text()
answer = self.handle_update_prompt(user_prompt, page_text)
answer, reasoning = self.llm_decide(prompt)
if "REQUEST_EXIT" in answer: if "REQUEST_EXIT" in answer:
complete = True complete = True
@ -246,6 +282,12 @@ class BrowserAgent(Agent):
if len(unvisited) == 0: if len(unvisited) == 0:
break break
if "FORM_FILLED" in answer:
page_text = self.browser.get_text()
self.navigable_links = self.browser.get_navigable()
prompt = self.make_navigation_prompt(user_prompt, page_text)
continue
if len(links) == 0 or "GO_BACK" in answer: if len(links) == 0 or "GO_BACK" in answer:
unvisited = self.select_unvisited(search_result) unvisited = self.select_unvisited(search_result)
prompt = self.make_newsearch_prompt(user_prompt, unvisited) prompt = self.make_newsearch_prompt(user_prompt, unvisited)

View File

@ -6,10 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
from typing import List, Tuple from typing import List, Tuple, Type, Dict, Tuple
from fake_useragent import UserAgent from fake_useragent import UserAgent
from selenium_stealth import stealth from selenium_stealth import stealth
import undetected_chromedriver as uc import undetected_chromedriver as uc
@ -126,13 +125,26 @@ class Browser:
try: try:
initial_handles = self.driver.window_handles initial_handles = self.driver.window_handles
self.driver.get(url) self.driver.get(url)
time.sleep(1) wait = WebDriverWait(self.driver, timeout=30)
wait.until(
lambda driver: (
driver.execute_script("return document.readyState") == "complete" and
not any(keyword in driver.page_source.lower() for keyword in ["checking your browser", "verifying", "captcha"])
),
message="stuck on 'checking browser' or verification screen"
)
self.apply_web_safety() self.apply_web_safety()
self.logger.info(f"Navigated to: {url}") self.logger.info(f"Navigated to: {url}")
return True return True
except TimeoutException as e:
self.logger.error(f"Timeout waiting for {url} to load: {str(e)}")
return False
except WebDriverException as e: except WebDriverException as e:
self.logger.error(f"Error navigating to {url}: {str(e)}") self.logger.error(f"Error navigating to {url}: {str(e)}")
return False return False
except Exception as e:
self.logger.error(f"Fatal error with go_to method on {url}:\n{str(e)}")
raise e
def is_sentence(self, text:str) -> bool: def is_sentence(self, text:str) -> bool:
"""Check if the text qualifies as a meaningful sentence or contains important error codes.""" """Check if the text qualifies as a meaningful sentence or contains important error codes."""
@ -199,7 +211,7 @@ class Browser:
return False return False
return True return True
def get_navigable(self) -> [str]: def get_navigable(self) -> List[str]:
"""Get all navigable links on the current page.""" """Get all navigable links on the current page."""
try: try:
links = [] links = []
@ -301,13 +313,55 @@ class Browser:
result.sort(key=lambda x: len(x[0])) result.sort(key=lambda x: len(x[0]))
return result return result
def find_and_click_submit(self, btn_type:str = 'login') -> None: """
def find_and_click_submission(self, btn_type:str = 'login') -> None:
buttons = self.get_buttons_xpath() buttons = self.get_buttons_xpath()
if len(buttons) == 0: if len(buttons) == 0:
self.logger.warning(f"No visible buttons found") self.logger.warning(f"No visible buttons found")
for button in buttons: for button in buttons:
if button[0] == btn_type: if button[0] == btn_type:
self.click_element(button[1]) self.click_element(button[1])
"""
def find_and_click_submission(self, timeout: int = 10) -> bool:
    """
    Click the first submission-style button that can be found.
    The labels "login", "submit" and "register" are tried in that order,
    each one delegated to find_and_click_btn.
    Args:
        timeout: Value forwarded to find_and_click_btn for every label.
    Returns:
        bool: True as soon as one label results in a click, False when
        none of them does (a warning is logged in that case).
    """
    candidate_labels = ("login", "submit", "register")
    # any() short-circuits: no further label is tried after a success.
    clicked = any(
        self.find_and_click_btn(label, timeout) for label in candidate_labels
    )
    if not clicked:
        self.logger.warning("No submission button found")
    return clicked
def find_and_click_btn(self, btn_type: str = 'login', timeout: int = 10) -> bool:
    """
    Find and click a button whose text contains the specified type.
    Every button whose text matches is tried in the order returned by
    get_buttons_xpath; a candidate that never becomes clickable, or whose
    click fails, no longer aborts the search for the remaining candidates.
    Args:
        btn_type: The type of button to find (e.g., 'login', 'submit'),
            matched case-insensitively as a substring of the button text.
        timeout: Maximum time (in seconds) to wait for each candidate
            button to become clickable.
    Returns:
        bool: True if a matching button was found and clicked, False otherwise.
    """
    buttons = self.get_buttons_xpath()
    if not buttons:
        self.logger.warning("No visible buttons found")
        return False
    wanted = btn_type.lower()
    matched_any = False
    for button_text, xpath in buttons:
        if wanted not in button_text.lower():
            continue
        matched_any = True
        try:
            # Only the wait's side effect is needed; click_element locates
            # the element again from the XPath.
            WebDriverWait(self.driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, xpath)),
                message=f"Button with XPath '{xpath}' not clickable within {timeout} seconds"
            )
            if self.click_element(xpath):
                return True
        except TimeoutException:
            self.logger.warning(f"Timeout waiting for '{button_text}' button at XPath: {xpath}")
        # Fall through: try the next button whose text also matches.
    if not matched_any:
        self.logger.warning(f"No button matching '{btn_type}' found")
    return False
def find_input_xpath_by_name(self, inputs, name: str) -> str | None: def find_input_xpath_by_name(self, inputs, name: str) -> str | None:
for field in inputs: for field in inputs:
@ -315,8 +369,11 @@ class Browser:
return field["xpath"] return field["xpath"]
return None return None
def fill_form_inputs(self, input_list:[str]) -> bool: def fill_form_inputs(self, input_list: List[str]) -> bool:
"""Fill form inputs based on a list of [name](value) strings.""" """Fill form inputs based on a list of [name](value) strings."""
if not isinstance(input_list, list):
self.logger.error("input_list must be a list")
return False
inputs = self.find_all_inputs() inputs = self.find_all_inputs()
try: try:
for input_str in input_list: for input_str in input_list:
@ -389,7 +446,7 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
driver = create_driver() driver = create_driver()
browser = Browser(driver) browser = Browser(driver, anticaptcha_manual_install=True)
time.sleep(10) time.sleep(10)
print("AntiCaptcha Test") print("AntiCaptcha Test")
@ -400,4 +457,5 @@ if __name__ == "__main__":
inputs = browser.get_form_inputs() inputs = browser.get_form_inputs()
inputs = ['[username](student)', f'[password](Password123)', '[appOtp]()', '[backupOtp]()'] inputs = ['[username](student)', f'[password](Password123)', '[appOtp]()', '[backupOtp]()']
browser.fill_form_inputs(inputs) browser.fill_form_inputs(inputs)
browser.find_and_click_submit() browser.find_and_click_submission()
time.sleep(30)