From f53647ad3fcd6e68d7f899ab1785c60f2938e161 Mon Sep 17 00:00:00 2001
From: martin legrand
Date: Fri, 14 Mar 2025 21:55:23 +0100
Subject: [PATCH] Feat: improve prompt and start link check to avoid paywall

---
 prompts/browser_agent.txt       | 28 ++++++++--------------------
 sources/agents/agent.py         |  2 +-
 sources/agents/browser_agent.py |  3 ++-
 sources/tools/webSearch.py      | 31 +++++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/prompts/browser_agent.txt b/prompts/browser_agent.txt
index 52846f4..77c6a9e 100644
--- a/prompts/browser_agent.txt
+++ b/prompts/browser_agent.txt
@@ -1,21 +1,9 @@
-You are an internet ai that can browse the web for information.
-In fact you are embedded in a browser with selenium.
-If you need to conduct a web search, you can use the following tool:
-- web_search: to search the web for information
-
-This is how you can use the web_search tool:
-```web_search
-
-```
-
-This will provide you with a list of links that you can navigate to.
-You can navigate to a specific link by typing the link. For example, If you say:
-"I want to navigate to https://www.google.com"
-
-You will navigate to https://www.google.com
-Any link that you type will be opened in a new tab.
-
-If you want to exit the browser, you can say:
-"REQUEST_EXIT"
-Only exit the browser if you are done browsing.
\ No newline at end of file
+You are a web browsing AI; your goal is to explore the internet to find information.
+Your only goal is to find the information requested by the user.
+At the beginning you will have to select a link from the Google search results.
+You will choose a link by simply typing it.
+This will automatically make you browse to that link.
+Once on a webpage you will see the page content and be given further navigation options.
+You can type a link to navigate further on the page, go back to the search results, or exit.
+At each interaction step the browser will remind you of your options.
diff --git a/sources/agents/agent.py b/sources/agents/agent.py
index 37205c3..af2559a 100644
--- a/sources/agents/agent.py
+++ b/sources/agents/agent.py
@@ -34,7 +34,7 @@ class Agent():
                  name: str,
                  prompt_path:str,
                  provider,
-                 recover_last_session=False) -> None:
+                 recover_last_session=True) -> None:
         self.agent_name = name
         self.role = None
         self.current_directory = os.getcwd()
diff --git a/sources/agents/browser_agent.py b/sources/agents/browser_agent.py
index 7ccb8c9..daf3e88 100644
--- a/sources/agents/browser_agent.py
+++ b/sources/agents/browser_agent.py
@@ -19,6 +19,7 @@ class BrowserAgent(Agent):
         self.browser.go_to("https://github.com/")
         self.search_history = []
         self.navigable_links = []
+        self.ai_notes = []

     def extract_links(self, search_result: str):
         links = re.findall(r'https?://[^\s]+', search_result)
@@ -64,8 +65,8 @@ class BrowserAgent(Agent):
         If you found a clear answer, please say "REQUEST_EXIT".
         You must choose a link to navigate to, go back or exit. Do not explain your choice.
+        You can take notes about your findings with TAKE_NOTE("")
         """
-
     def llm_decide(self, prompt):
         animate_thinking("Thinking...", color="status")
diff --git a/sources/tools/webSearch.py b/sources/tools/webSearch.py
index 4544d1f..00f219b 100644
--- a/sources/tools/webSearch.py
+++ b/sources/tools/webSearch.py
@@ -22,6 +22,37 @@ class webSearch(Tools):
         super().__init__()
         self.tag = "web_search"
         self.api_key = api_key or os.getenv("SERPAPI_KEY") # Requires a SerpApi key
+        self.paywall_keywords = [
+            "subscribe", "paywall", "login to continue", "access denied", "restricted content"
+        ]
+
+    async def link_valid(self, session, link):
+        """Asynchronously check whether a link is reachable or likely behind a paywall."""
+        if not link.startswith("http"):
+            return "Status: Invalid URL"
+        try:
+            async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
+                status = response.status
+                if status == 200:
+                    content = (await response.text(encoding='utf-8', errors='ignore'))[:1000]
+                    if any(keyword in content.lower() for keyword in self.paywall_keywords):
+                        return "Status: Possible Paywall"
+                    return "Status: Accessible"
+                elif status == 404:
+                    return "Status: 404 Not Found"
+                elif status == 403:
+                    return "Status: 403 Forbidden"
+                else:
+                    return f"Status: {status} {response.reason}"
+        except Exception as e:
+            return f"Error: {str(e)}"
+
+    async def check_all_links(self, links):
+        """Check all links asynchronously using a single session."""
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
+        async with aiohttp.ClientSession(headers=headers) as session:
+            tasks = [self.link_valid(session, link) for link in links]
+            return await asyncio.gather(*tasks)

     def execute(self, blocks: str, safety: bool = True) -> str:
         if self.api_key is None:
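Note for reviewers: the patch adds `check_all_links` but the hunk does not show it being called yet, nor the import section of webSearch.py. Below is a minimal sketch of how the new coroutine might be driven from the synchronous `execute()` path; `filter_accessible_links` is a hypothetical helper, and it assumes `aiohttp` and `asyncio` are imported at the top of the file.

```python
# Sketch only, not part of the patch: possible wiring of the new async link check.
import asyncio

def filter_accessible_links(search_tool, links: list[str]) -> list[str]:
    """Drop links that the async check flags as paywalled, broken, or unreachable."""
    # asyncio.run() lets the synchronous execute() path drive the coroutine once per search.
    statuses = asyncio.run(search_tool.check_all_links(links))
    return [link for link, status in zip(links, statuses) if status == "Status: Accessible"]
```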
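Similarly, the TAKE_NOTE("") prompt line and the new `self.ai_notes` list are introduced but not yet connected. A possible way to extract notes from the model's reply is sketched below; the regex and helper name are assumptions, not part of the patch.

```python
# Sketch only: collecting TAKE_NOTE("...") calls from the LLM reply into BrowserAgent.ai_notes.
import re

def extract_notes(answer: str) -> list[str]:
    """Return the content of every TAKE_NOTE("...") occurrence in the model's reply."""
    return re.findall(r'TAKE_NOTE\("([^"]*)"\)', answer)

# e.g. inside BrowserAgent, after llm_decide():
# self.ai_notes.extend(extract_notes(answer))
```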