diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py
index 66929b0..e476b35 100644
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@@ -6,7 +6,8 @@ from sources.agents.agent import Agent
 from sources.tools.searxSearch import searxSearch
 from sources.browser import Browser
 from datetime import date
-from typing import List, Tuple
+from typing import List, Tuple, Type, Dict
+
 
 class BrowserAgent(Agent):
     def __init__(self, name, prompt_path, provider, verbose=False, browser=None):
@@ -92,7 +93,7 @@ class BrowserAgent(Agent):
         Your task:
         1. Decide if the current page answers the user’s query: {user_prompt}
           - If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
-          - If it does and you are 100% certain that it provide a definive answer, say REQUEST_EXIT
+          - If it does and you completed user request, say REQUEST_EXIT
           - If it doesn’t, say: Error: This page does not answer the user’s query then go back or navigate to another link.
         2. Navigate by either:
           - Navigate to a navigation links (write the full URL, e.g., www.example.com/cats).
@@ -100,7 +101,7 @@ class BrowserAgent(Agent):
         3. Fill forms on the page:
           - If user give you informations that help you fill form, fill it.
           - If you don't know how to fill a form, leave it empty.
-          - You can fill a form using [form_name](value).
+          - You can fill a form using [form_name](value). Do not go back when you fill a form.
 
         Recap of note taking:
         If useful -> Note: [Briefly summarize the key information or task you conducted.]
@@ -125,8 +126,8 @@ class BrowserAgent(Agent):
         Example 4 (loging form visible):
 
         Note: I am on the login page, I should now type the given username and password. 
-        [form_name_1](David)
-        [form_name_2](edgerunners_2077)
+        [username_field](David)
+        [password_field](edgerunners77)
 
         You see the following inputs forms:
         {inputs_form_text}
@@ -143,8 +144,9 @@ class BrowserAgent(Agent):
         animate_thinking("Thinking...", color="status")
         self.memory.push('user', prompt)
         answer, reasoning = self.llm_request()
+        output = f"Answer: {answer}" if len(answer) > 16 else f"Action: {answer}\nReasoning: {reasoning}"
         pretty_print("-"*100)
-        pretty_print(answer, color="output")
+        pretty_print(output, color="output")
         pretty_print("-"*100)
         return answer, reasoning
 
@@ -175,7 +177,7 @@ class BrowserAgent(Agent):
         return parsed_results
 
     def stringify_search_results(self, results_arr: List[str]) -> str:
-        return '\n\n'.join([f"Link: {res['link']}" for res in results_arr])
+        return '\n\n'.join([f"Link: {res['link']}\nPreview: {res['snippet']}" for res in results_arr])
 
     def save_notes(self, text):
         lines = text.split('\n')
@@ -214,19 +216,52 @@ class BrowserAgent(Agent):
         Do not explain, do not write anything beside the search query.
         If the query does not make any sense for a web search explain why and say REQUEST_EXIT
         """
+
+    def handle_update_prompt(self, user_prompt: str, page_text: str) -> str:
+        """Build the prompt asking the LLM to assess the page shown after a form submission."""
+        return f"""
+        You are a web browser.
+        You just filled a form on the page.
+        Now you should see the result of the form submission on the page:
+        Page text:
+        {page_text}
+        The user asked: {user_prompt}
+        Does the page answer the user’s query now?
+        If it does, take notes of the useful information, write down result and say FORM_FILLED.
+        If you were previously on a login form, no need to explain.
+        If it does and you completed user request, say REQUEST_EXIT
+        if it doesn’t, say: Error: This page does not answer the user’s query then GO_BACK.
+        """
+
+    def show_search_results(self, search_result: List[Dict[str, str]]):
+        """Print the title and link of every search result."""
+        pretty_print("\nSearch results:", color="output")
+        for res in search_result:
+            pretty_print(f"Title: {res['title']} - Link: {res['link']}", color="output")
 
-    def process(self, user_prompt, speech_module) -> str:
+    def process(self, user_prompt: str, speech_module: type) -> Tuple[str, str]:
+        """
+        Process the user prompt to conduct an autonomous web search.
+        Start with a google search with searxng using web_search tool.
+        Then enter a navigation logic to find the answer or conduct required actions.
+        Args:
+          user_prompt: The user's input query
+          speech_module: Optional speech output module
+        Returns:
+            tuple containing the final answer and reasoning
+        """
         complete = False
 
         animate_thinking(f"Thinking...", color="status")
         self.memory.push('user', self.search_prompt(user_prompt))
-        ai_prompt, _ = self.llm_request()
+        ai_prompt, reasoning = self.llm_request()
         if "REQUEST_EXIT" in ai_prompt:
-            # request make no sense, maybe wrong agent was allocated?
+            pretty_print(f"{reasoning}\n{ai_prompt}", color="output")
             return ai_prompt, ""
         animate_thinking(f"Searching...", color="status")
         search_result_raw = self.tools["web_search"].execute([ai_prompt], False)
         search_result = self.jsonify_search_results(search_result_raw)[:12] # until futher improvement
+        self.show_search_results(search_result)
         prompt = self.make_newsearch_prompt(user_prompt, search_result)
         unvisited = [None]
         while not complete:
@@ -236,7 +271,10 @@ class BrowserAgent(Agent):
             extracted_form = self.extract_form(answer)
             if len(extracted_form) > 0:
                 self.browser.fill_form_inputs(extracted_form)
-                self.browser.find_and_click_submit()
+                self.browser.find_and_click_submission()
+                page_text = self.browser.get_text()
+                prompt = self.handle_update_prompt(user_prompt, page_text)
+                answer, reasoning = self.llm_decide(prompt)
 
             if "REQUEST_EXIT" in answer:
                 complete = True
@@ -246,6 +284,12 @@ class BrowserAgent(Agent):
             if len(unvisited) == 0:
                 break
 
+            if "FORM_FILLED" in answer:
+                page_text = self.browser.get_text()
+                self.navigable_links = self.browser.get_navigable()
+                prompt = self.make_navigation_prompt(user_prompt, page_text)
+                continue
+
             if len(links) == 0 or "GO_BACK" in answer:
                 unvisited = self.select_unvisited(search_result)
                 prompt = self.make_newsearch_prompt(user_prompt, unvisited)
diff --git a/sources/browser.py b/sources/browser.py
index b6f7651..6c68b77 100644
--- a/sources/browser.py
+++ b/sources/browser.py
@@ -6,10 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import TimeoutException, WebDriverException
 from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-from typing import List, Tuple
+from typing import List, Tuple, Type, Dict
 from fake_useragent import UserAgent
 from selenium_stealth import stealth
 import undetected_chromedriver as uc
@@ -126,13 +125,26 @@ class Browser:
         try:
             initial_handles = self.driver.window_handles
             self.driver.get(url)
-            time.sleep(1)
+            wait = WebDriverWait(self.driver, timeout=30)
+            wait.until(
+                lambda driver: (
+                    driver.execute_script("return document.readyState") == "complete" and
+                    not any(keyword in driver.page_source.lower() for keyword in ["checking your browser", "verifying", "captcha"])
+                ),
+                message="stuck on 'checking browser' or verification screen"
+            )
             self.apply_web_safety()
             self.logger.info(f"Navigated to: {url}")
             return True
+        except TimeoutException as e:
+            self.logger.error(f"Timeout waiting for {url} to load: {str(e)}")
+            return False
         except WebDriverException as e:
             self.logger.error(f"Error navigating to {url}: {str(e)}")
             return False
+        except Exception as e:
+            self.logger.error(f"Fatal error with go_to method on {url}:\n{str(e)}")
+            raise
 
     def is_sentence(self, text:str) -> bool:
         """Check if the text qualifies as a meaningful sentence or contains important error codes."""
@@ -199,7 +211,7 @@ class Browser:
             return False
         return True
 
-    def get_navigable(self) -> [str]:
+    def get_navigable(self) -> List[str]:
         """Get all navigable links on the current page."""
         try:
             links = []
@@ -301,13 +313,46 @@ class Browser:
         result.sort(key=lambda x: len(x[0]))
         return result
 
-    def find_and_click_submit(self, btn_type:str = 'login') -> None:
-        buttons = self.get_buttons_xpath()
-        if len(buttons) == 0:
-            self.logger.warning(f"No visible buttons found")
-        for button in buttons:
-            if button[0] == btn_type:
-                self.click_element(button[1])
+    def find_and_click_submission(self, timeout: int = 10) -> bool:
+        """Try to click a login, submit or register button, in that order; return True once one is clicked."""
+        possible_submissions = ["login", "submit", "register"]
+        for submission in possible_submissions:
+            if self.find_and_click_btn(submission, timeout):
+                return True
+        self.logger.warning("No submission button found")
+        return False
+
+    def find_and_click_btn(self, btn_type: str = 'login', timeout: int = 10) -> bool:
+        """
+        Find and click a submit button matching the specified type.
+        Args:
+            btn_type: The type of button to find (e.g., 'login', 'submit'), matched against button text.
+            timeout: Maximum time (in seconds) to wait for the button to appear.
+        Returns:
+            bool: True if the button was found and clicked, False otherwise.
+        """
+        buttons = self.get_buttons_xpath()
+        if not buttons:
+            self.logger.warning("No visible buttons found")
+            return False
+
+        for button_text, xpath in buttons:
+            if btn_type.lower() in button_text.lower():
+                try:
+                    wait = WebDriverWait(self.driver, timeout)
+                    wait.until(
+                        EC.element_to_be_clickable((By.XPATH, xpath)),
+                        message=f"Button with XPath '{xpath}' not clickable within {timeout} seconds"
+                    )
+                    if self.click_element(xpath):
+                        return True
+                    else:
+                        return False
+                except TimeoutException:
+                    self.logger.warning(f"Timeout waiting for '{button_text}' button at XPath: {xpath}")
+                    return False
+        self.logger.warning(f"No button matching '{btn_type}' found")
+        return False
 
     def find_input_xpath_by_name(self, inputs, name: str) -> str | None:
         for field in inputs:
@@ -315,8 +360,11 @@ class Browser:
                 return field["xpath"]
         return None
 
-    def fill_form_inputs(self, input_list:[str]) -> bool:
+    def fill_form_inputs(self, input_list: List[str]) -> bool:
         """Fill form inputs based on a list of [name](value) strings."""
+        if not isinstance(input_list, list):
+            self.logger.error("input_list must be a list")
+            return False
         inputs = self.find_all_inputs()
         try:
             for input_str in input_list:
@@ -389,7 +437,7 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
 
     driver = create_driver()
-    browser = Browser(driver)
+    browser = Browser(driver, anticaptcha_manual_install=True)
     time.sleep(10)
 
     print("AntiCaptcha Test")
@@ -400,4 +448,5 @@ if __name__ == "__main__":
     inputs = browser.get_form_inputs()
     inputs = ['[username](student)', f'[password](Password123)', '[appOtp]()', '[backupOtp]()']
     browser.fill_form_inputs(inputs)
-    browser.find_and_click_submit()
+    browser.find_and_click_submission()
+    time.sleep(30)