Feat : prompt improve and start link check to avoid paywall

This commit is contained in:
martin legrand 2025-03-14 21:55:23 +01:00
parent 0ee77e8ad9
commit f53647ad3f
4 changed files with 42 additions and 22 deletions

View File

@ -1,21 +1,9 @@
You are an internet ai that can browse the web for information.
In fact you are embedded in a browser with selenium.
If you need to conduct a web search, you can use the following tool:
- web_search: to search the web for information
This is how you can use the web_search tool:
```web_search
<query>
```
This will provide you with a list of links that you can navigate to.
You can navigate to a specific link by typing the link. For example, if you say:
"I want to navigate to https://www.google.com"
You will navigate to https://www.google.com
Any link that you type will be opened in a new tab.
If you want to exit the browser, you can say:
"REQUEST_EXIT"
Only exit the browser if you are done browsing.
You are a web browsing AI, your goal is to explore the internet to find information.
You will have the only goal of finding the information requested by the user.
At the beginning you will have to select a link from the Google search results.
You will choose a link by simply typing it.
This will automatically make you browse to the link.
Once on a webpage you will see the page content and be given further navigation options.
You can type a link to navigate further on the page, go back to the search results, or exit.
At each interaction step the browser will remind you of your options.

View File

@ -34,7 +34,7 @@ class Agent():
name: str,
prompt_path:str,
provider,
recover_last_session=False) -> None:
recover_last_session=True) -> None:
self.agent_name = name
self.role = None
self.current_directory = os.getcwd()

View File

@ -19,6 +19,7 @@ class BrowserAgent(Agent):
self.browser.go_to("https://github.com/")
self.search_history = []
self.navigable_links = []
self.ai_notes = []
def extract_links(self, search_result: str):
links = re.findall(r'https?://[^\s]+', search_result)
@ -64,8 +65,8 @@ class BrowserAgent(Agent):
If you found a clear answer, please say "REQUEST_EXIT".
You must choose a link to navigate to, go back or exit.
Do not explain your choice.
You can take note about your finding with TAKE_NOTE("<your note>")
"""
def llm_decide(self, prompt):
animate_thinking("Thinking...", color="status")

View File

@ -22,6 +22,37 @@ class webSearch(Tools):
super().__init__()
self.tag = "web_search"
self.api_key = api_key or os.getenv("SERPAPI_KEY") # Requires a SerpApi key
self.paywall_keywords = [
"subscribe", "paywall", "login to continue", "access denied", "restricted content"
]
async def link_valid(self, session, link):
    """Asynchronously probe a link and report whether it looks usable.

    Args:
        session: Client session used to issue the GET request (an
            ``aiohttp.ClientSession`` when called from ``check_all_links``).
        link: The URL to check.

    Returns:
        A human-readable status string:
        - "Status: Invalid URL" when the link does not start with "http";
        - "Status: Accessible" or "Status: Possible Paywall" for a 200
          response, depending on whether the first 1000 characters of the
          body contain one of ``self.paywall_keywords``;
        - "Status: 404 Not Found", "Status: 403 Forbidden", or
          "Status: <code> <reason>" for any other HTTP status;
        - "Error: <details>" when the request itself fails.
    """
    if not link.startswith("http"):
        return "Status: Invalid URL"
    try:
        async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
            status = response.status
            if status == 200:
                # Await the body first, THEN slice. Writing
                # `await response.text(...)[:1000]` subscripts the un-awaited
                # coroutine (await binds looser than subscription) and raises
                # TypeError, which made every 200 response report an error.
                body = await response.text(encoding='utf-8', errors='ignore')
                # Only the head of the page is inspected; lowercase it once
                # instead of once per keyword.
                snippet = body[:1000].lower()
                if any(keyword in snippet for keyword in self.paywall_keywords):
                    return "Status: Possible Paywall"
                return "Status: Accessible"
            elif status == 404:
                return "Status: 404 Not Found"
            elif status == 403:
                return "Status: 403 Forbidden"
            else:
                return f"Status: {status} {response.reason}"
    except Exception as e:
        # Best-effort check: never propagate, always return a status string.
        # Some exceptions (e.g. timeouts) have an empty message, so fall back
        # to the exception class name to keep the report informative.
        return f"Error: {str(e) or type(e).__name__}"
async def check_all_links(self, links):
    """Run ``link_valid`` on every link concurrently over one shared session.

    Args:
        links: Iterable of URL strings to check.

    Returns:
        The list produced by ``asyncio.gather``: one status string per
        link, in the same order as ``links``.
    """
    # Send a browser-like User-Agent with every request made by the session.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    async with aiohttp.ClientSession(headers={"User-Agent": user_agent}) as http:
        pending = []
        for url in links:
            pending.append(self.link_valid(http, url))
        statuses = await asyncio.gather(*pending)
        return statuses
def execute(self, blocks: str, safety: bool = True) -> str:
if self.api_key is None: