fix: browser not properly handling web forms

This commit is contained in:
martin legrand 2025-04-01 13:09:16 +02:00
parent a3e95abfde
commit b3efd09fb3
2 changed files with 118 additions and 18 deletions

View File

@ -6,7 +6,8 @@ from sources.agents.agent import Agent
from sources.tools.searxSearch import searxSearch
from sources.browser import Browser
from datetime import date
from typing import List, Tuple
from typing import List, Tuple, Type, Dict, Tuple
class BrowserAgent(Agent):
def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
@ -92,7 +93,7 @@ class BrowserAgent(Agent):
Your task:
1. Decide if the current page answers the users query: {user_prompt}
- If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
- If it does and you are 100% certain that it provide a definive answer, say REQUEST_EXIT
- If it does and you completed use request, say REQUEST_EXIT
- If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link.
2. Navigate by either:
- Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
@ -100,7 +101,7 @@ class BrowserAgent(Agent):
3. Fill forms on the page:
- If user give you informations that help you fill form, fill it.
- If you don't know how to fill a form, leave it empty.
- You can fill a form using [form_name](value).
- You can fill a form using [form_name](value). Do not go back when you fill a form.
Recap of note taking:
If useful -> Note: [Briefly summarize the key information or task you conducted.]
@ -125,8 +126,8 @@ class BrowserAgent(Agent):
Example 4 (loging form visible):
Note: I am on the login page, I should now type the given username and password.
[form_name_1](David)
[form_name_2](edgerunners_2077)
[username_field](David)
[password_field](edgerunners77)
You see the following inputs forms:
{inputs_form_text}
@ -143,8 +144,9 @@ class BrowserAgent(Agent):
animate_thinking("Thinking...", color="status")
self.memory.push('user', prompt)
answer, reasoning = self.llm_request()
output = f"Answer: {answer}" if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
pretty_print("-"*100)
pretty_print(answer, color="output")
pretty_print(output, color="output")
pretty_print("-"*100)
return answer, reasoning
@ -175,7 +177,7 @@ class BrowserAgent(Agent):
return parsed_results
def stringify_search_results(self, results_arr: List[dict]) -> str:
    """
    Render search results as a text block for inclusion in a prompt.
    Each entry is indexed with the 'link' and 'snippet' keys and rendered as
    a "Link: ..." line followed by a "Preview: ..." line. Entries are
    separated by one blank line.
    Args:
        results_arr: Search result entries, each providing 'link' and 'snippet'.
    Returns:
        str: The rendered entries, or an empty string when there are none.
    """
    # Single return: a leftover link-only return used to shadow this one,
    # so the preview text never made it into the prompt.
    return '\n\n'.join(
        f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr
    )
def save_notes(self, text):
lines = text.split('\n')
@ -214,19 +216,50 @@ class BrowserAgent(Agent):
Do not explain, do not write anything beside the search query.
If the query does not make any sense for a web search explain why and say REQUEST_EXIT
"""
def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
    """
    Build the follow-up prompt sent to the model after a form was filled.
    Args:
        user_prompt: The user's original request.
        page_text: Text of the page as it reads after the form submission.
    Returns:
        str: Prompt asking the model to answer with FORM_FILLED, REQUEST_EXIT
        or GO_BACK depending on what the page now shows.
    """
    followup_prompt = f"""
You are a web browser.
You just filled a form on the page.
Now you should see the result of the form submission on the page:
Page text:
{page_text}
The user asked: {user_prompt}
Does the page answer the users query now?
If it does, take notes of the useful information, write down result and say FORM_FILLED.
If you were previously on a login form, no need to explain.
If it does and you completed user request, say REQUEST_EXIT
if it doesnt, say: Error: This page does not answer the users query then GO_BACK.
"""
    return followup_prompt
def show_search_results(self, search_result: List[str]):
    """
    Print a header followed by one "Title - Link" line per search result.
    Args:
        search_result: Search result entries, each indexed with 'title' and 'link'.
    """
    pretty_print("\nSearch results:", color="output")
    for entry in search_result:
        title = entry['title']
        link = entry['link']
        pretty_print(f"Title: {title} - Link: {link}", color="output")
def process(self, user_prompt, speech_module) -> str:
def process(self, user_prompt: str, speech_module: type) -> Tuple[str, str]:
"""
Process the user prompt to conduct an autonomous web search.
Start with a google search with searxng using web_search tool.
Then enter a navigation logic to find the answer or conduct required actions.
Args:
user_prompt: The user's input query
speech_module: Optional speech output module
Returns:
tuple containing the final answer and reasoning
"""
complete = False
animate_thinking(f"Thinking...", color="status")
self.memory.push('user', self.search_prompt(user_prompt))
ai_prompt, _ = self.llm_request()
if "REQUEST_EXIT" in ai_prompt:
# request make no sense, maybe wrong agent was allocated?
pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
return ai_prompt, ""
animate_thinking(f"Searching...", color="status")
search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
self.show_search_results(search_result)
prompt = self.make_newsearch_prompt(user_prompt, search_result)
unvisited = [None]
while not complete:
@ -236,7 +269,10 @@ class BrowserAgent(Agent):
extracted_form = self.extract_form(answer)
if len(extracted_form) > 0:
self.browser.fill_form_inputs(extracted_form)
self.browser.find_and_click_submit()
self.browser.find_and_click_submission()
page_text = self.browser.get_text()
answer = self.handle_update_prompt(user_prompt, page_text)
answer, reasoning = self.llm_decide(prompt)
if "REQUEST_EXIT" in answer:
complete = True
@ -246,6 +282,12 @@ class BrowserAgent(Agent):
if len(unvisited) == 0:
break
if "FORM_FILLED" in answer:
page_text = self.browser.get_text()
self.navigable_links = self.browser.get_navigable()
prompt = self.make_navigation_prompt(user_prompt, page_text)
continue
if len(links) == 0 or "GO_BACK" in answer:
unvisited = self.select_unvisited(search_result)
prompt = self.make_newsearch_prompt(user_prompt, unvisited)

View File

@ -6,10 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import List, Tuple
from typing import List, Tuple, Type, Dict, Tuple
from fake_useragent import UserAgent
from selenium_stealth import stealth
import undetected_chromedriver as uc
@ -126,13 +125,26 @@ class Browser:
try:
initial_handles = self.driver.window_handles
self.driver.get(url)
time.sleep(1)
wait = WebDriverWait(self.driver, timeout=30)
wait.until(
lambda driver: (
driver.execute_script("return document.readyState") == "complete" and
not any(keyword in driver.page_source.lower() for keyword in ["checking your browser", "verifying", "captcha"])
),
message="stuck on 'checking browser' or verification screen"
)
self.apply_web_safety()
self.logger.info(f"Navigated to: {url}")
return True
except TimeoutException as e:
self.logger.error(f"Timeout waiting for {url} to load: {str(e)}")
return False
except WebDriverException as e:
self.logger.error(f"Error navigating to {url}: {str(e)}")
return False
except Exception as e:
self.logger.error(f"Fatal error with go_to method on {url}:\n{str(e)}")
raise e
def is_sentence(self, text:str) -> bool:
"""Check if the text qualifies as a meaningful sentence or contains important error codes."""
@ -199,7 +211,7 @@ class Browser:
return False
return True
def get_navigable(self) -> [str]:
def get_navigable(self) -> List[str]:
"""Get all navigable links on the current page."""
try:
links = []
@ -301,13 +313,55 @@ class Browser:
result.sort(key=lambda x: len(x[0]))
return result
def find_and_click_submit(self, btn_type: str = 'login') -> None:
    """
    Backward-compatible entry point for callers still using the old name.
    The previous body was only a string literal holding disabled code, so
    calling this method did nothing. It now delegates to find_and_click_btn,
    which performs the button lookup and click.
    Args:
        btn_type: Text to look for in the button label (e.g., 'login').
    Returns:
        None. Use find_and_click_btn or find_and_click_submission when the
        success flag is needed.
    """
    self.find_and_click_btn(btn_type)
def find_and_click_submission(self, timeout: int = 10) -> bool:
    """
    Click the first submission-style button that can be found on the page.
    The labels "login", "submit" and "register" are tried in that order via
    find_and_click_btn; the search stops at the first successful click.
    Args:
        timeout: Wait time in seconds forwarded to find_and_click_btn.
    Returns:
        bool: True if a button was clicked, False otherwise (a warning is logged).
    """
    labels = ("login", "submit", "register")
    # any() short-circuits, so later labels are not tried after a success.
    if any(self.find_and_click_btn(label, timeout) for label in labels):
        return True
    self.logger.warning("No submission button found")
    return False
def find_and_click_btn(self, btn_type: str = 'login', timeout: int = 10) -> bool:
    """
    Find and click a button whose text contains the given label.
    Every button whose text matches is tried in the order returned by
    get_buttons_xpath. A candidate that never becomes clickable, or whose
    click fails, is skipped so that a later matching button can still be used.
    Args:
        btn_type: Label to look for (e.g., 'login', 'submit'); matched
            case-insensitively as a substring of the button text.
        timeout: Maximum time (in seconds) to wait for each candidate button
            to become clickable.
    Returns:
        bool: True if a matching button was clicked, False otherwise.
    """
    buttons = self.get_buttons_xpath()
    if not buttons:
        self.logger.warning("No visible buttons found")
        return False
    wanted = btn_type.lower()
    candidates = [
        (button_text, xpath)
        for button_text, xpath in buttons
        if wanted in button_text.lower()
    ]
    if not candidates:
        self.logger.warning(f"No button matching '{btn_type}' found")
        return False
    for button_text, xpath in candidates:
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, xpath)),
                message=f"Button with XPath '{xpath}' not clickable within {timeout} seconds"
            )
        except TimeoutException:
            self.logger.warning(f"Timeout waiting for '{button_text}' button at XPath: {xpath}")
            continue
        if self.click_element(xpath):
            return True
        # The click was refused; fall through to the next matching button.
        self.logger.warning(f"Could not click '{button_text}' button at XPath: {xpath}")
    return False
def find_input_xpath_by_name(self, inputs, name: str) -> str | None:
for field in inputs:
@ -315,8 +369,11 @@ class Browser:
return field["xpath"]
return None
def fill_form_inputs(self, input_list:[str]) -> bool:
def fill_form_inputs(self, input_list: List[str]) -> bool:
"""Fill form inputs based on a list of [name](value) strings."""
if not isinstance(input_list, list):
self.logger.error("input_list must be a list")
return False
inputs = self.find_all_inputs()
try:
for input_str in input_list:
@ -389,7 +446,7 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
driver = create_driver()
browser = Browser(driver)
browser = Browser(driver, anticaptcha_manual_install=True)
time.sleep(10)
print("AntiCaptcha Test")
@ -400,4 +457,5 @@ if __name__ == "__main__":
inputs = browser.get_form_inputs()
inputs = ['[username](student)', f'[password](Password123)', '[appOtp]()', '[backupOtp]()']
browser.fill_form_inputs(inputs)
browser.find_and_click_submit()
browser.find_and_click_submission()
time.sleep(30)