feat: better handling of numerical values on web pages

This commit is contained in:
martin legrand 2025-04-04 11:56:11 +02:00
parent 4f7e30b498
commit ff9c1576b6
2 changed files with 16 additions and 11 deletions

View File

@ -90,7 +90,7 @@ class BrowserAgent(Agent):
{remaining_links_text} {remaining_links_text}
Your task: Your task:
1. Decide if the current page answers the users query: {user_prompt} 1. Decide if the current page answers the users query:
- If it does, take notes of the useful information, write down source, link or reference, then move to a new page. - If it does, take notes of the useful information, write down source, link or reference, then move to a new page.
- If it does and you completed user request, say REQUEST_EXIT - If it does and you completed user request, say REQUEST_EXIT
- If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link. - If it doesnt, say: Error: This page does not answer the users query then go back or navigate to another link.
@ -120,7 +120,7 @@ class BrowserAgent(Agent):
GO_BACK GO_BACK
Example 3 (query answer found): Example 3 (query answer found):
Note: I found on github.com that agenticSeek is Fosowl. Note: I found on github.com that agenticSeek is made by Fosowl.
Given this information, given this I should exit the web browser. REQUEST_EXIT Given this information, given this I should exit the web browser. REQUEST_EXIT
Example 4 (loging form visible): Example 4 (loging form visible):
@ -131,7 +131,8 @@ class BrowserAgent(Agent):
You see the following inputs forms: You see the following inputs forms:
{inputs_form_text} {inputs_form_text}
Remember, the user asked: {user_prompt} Remember, the user asked:
{user_prompt}
So far you took these notes: So far you took these notes:
{notes} {notes}
You are currently on page : {self.current_page} You are currently on page : {self.current_page}

View File

@ -22,7 +22,10 @@ import logging
import sys import sys
import re import re
from sources.utility import pretty_print, animate_thinking if __name__ == "__main__":
from utility import pretty_print, animate_thinking
else:
from sources.utility import pretty_print, animate_thinking
logging.basicConfig(filename='browser.log', level=logging.ERROR, logging.basicConfig(filename='browser.log', level=logging.ERROR,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@ -155,13 +158,12 @@ class Browser:
"""Check if the text qualifies as a meaningful sentence or contains important error codes.""" """Check if the text qualifies as a meaningful sentence or contains important error codes."""
text = text.strip() text = text.strip()
error_codes = ["404", "403", "500", "502", "503"] if any(c.isdigit() for c in text):
if any(code in text for code in error_codes):
return True return True
words = re.findall(r'\w+', text, re.UNICODE) words = re.findall(r'\w+', text, re.UNICODE)
word_count = len(words) word_count = len(words)
has_punctuation = any(text.endswith(p) for p in ['.', '', ',', '!', '?', '', '', '', '', '۔']) has_punctuation = any(text.endswith(p) for p in ['.', '', ',', '!', '?', '', '', '', '', '۔'])
is_long_enough = word_count > 5 is_long_enough = word_count > 4
return (word_count >= 5 and (has_punctuation or is_long_enough)) return (word_count >= 5 and (has_punctuation or is_long_enough))
def get_text(self) -> str | None: def get_text(self) -> str | None:
@ -173,9 +175,8 @@ class Browser:
element.decompose() element.decompose()
text = soup.get_text() text = soup.get_text()
lines = (line.strip() for line in text.splitlines()) lines = (f"{line.strip()}\n" for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) text = "\n".join(chunk for chunk in lines if chunk and self.is_sentence(chunk))
text = "\n".join(chunk for chunk in chunks if chunk and self.is_sentence(chunk))
text = text[:4096] text = text[:4096]
#markdown_text = markdownify.markdownify(text, heading_style="ATX") #markdown_text = markdownify.markdownify(text, heading_style="ATX")
return "[Start of page]\n" + text + "\n[End of page]" return "[Start of page]\n" + text + "\n[End of page]"
@ -448,6 +449,9 @@ if __name__ == "__main__":
browser = Browser(driver, anticaptcha_manual_install=True) browser = Browser(driver, anticaptcha_manual_install=True)
time.sleep(10) time.sleep(10)
#browser.go_to("https://coinmarketcap.com/")
#txt = browser.get_text()
#print(txt)
print("AntiCaptcha / Form Test") print("AntiCaptcha / Form Test")
browser.go_to("https://www.google.com/recaptcha/api2/demo") browser.go_to("https://www.google.com/recaptcha/api2/demo")
#browser.go_to("https://practicetestautomation.com/practice-test-login/") #browser.go_to("https://practicetestautomation.com/practice-test-login/")
@ -456,4 +460,4 @@ if __name__ == "__main__":
inputs = ['[input1](Martin)', f'[input2](Test)', '[input3](test@gmail.com)'] inputs = ['[input1](Martin)', f'[input2](Test)', '[input3](test@gmail.com)']
browser.fill_form_inputs(inputs) browser.fill_form_inputs(inputs)
browser.find_and_click_submission() browser.find_and_click_submission()
time.sleep(30) time.sleep(10)