Feat: web crawl to validate links for web search

martin legrand 2025-03-15 10:46:50 +01:00
parent f53647ad3f
commit e909d9d84c
2 changed files with 44 additions and 30 deletions

View File

@@ -21,13 +21,18 @@ class BrowserAgent(Agent):
        self.navigable_links = []
        self.ai_notes = []

    def extract_links(self, search_result: str):
        links = re.findall(r'https?://[^\s]+', search_result)
        return self.clean_links(links)
        pattern = r'(https?://\S+|www\.\S+)'
        matches = re.findall(pattern, search_result)
        trailing_punct = ".,!?;:"
        cleaned_links = [link.rstrip(trailing_punct) for link in matches]
        return self.clean_links(cleaned_links)

    def clean_links(self, links: list):
        links_clean = []
        for link in links:
            link = link.strip()
            if link[-1] == '.':
                links_clean.append(link[:-1])
            else:
@@ -65,7 +70,6 @@ class BrowserAgent(Agent):
        If you found a clear answer, please say "REQUEST_EXIT".
        You must choose a link to navigate to, go back or exit.
        Do not explain your choice.
        You can take a note about your findings with TAKE_NOTE("<your note>")
        """

    def llm_decide(self, prompt):

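For reference, the new extraction logic is easy to exercise on its own. A minimal standalone sketch of the added code path (the clean_links deduplication step is omitted here):

import re

def extract_links(search_result: str) -> list:
    # Match bare http(s) URLs as well as www.-prefixed ones.
    pattern = r'(https?://\S+|www\.\S+)'
    matches = re.findall(pattern, search_result)
    # Strip punctuation that commonly trails a URL in prose.
    trailing_punct = ".,!?;:"
    return [link.rstrip(trailing_punct) for link in matches]

print(extract_links("See https://example.com/docs, or www.example.org."))
# ['https://example.com/docs', 'www.example.org']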
View File

@@ -23,36 +23,41 @@ class webSearch(Tools):
        self.tag = "web_search"
        self.api_key = api_key or os.getenv("SERPAPI_KEY")  # Requires a SerpApi key
        self.paywall_keywords = [
            "subscribe", "paywall", "login to continue", "access denied", "restricted content"
            "subscribe", "login to continue", "access denied", "restricted content", "404", "this page is not working"
        ]
    async def link_valid(self, session, link):
        """Asynchronously check if a link is dead."""

    def link_valid(self, link):
        """Check if a link is valid."""
        if not link.startswith("http"):
            return "Status: Invalid URL"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        try:
            async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
                status = response.status
                if status == 200:
                    content = await response.text(encoding='utf-8', errors='ignore')[:1000]
                    if any(keyword in content.lower() for keyword in self.paywall_keywords):
                        return "Status: Possible Paywall"
                    return "Status: Accessible"
                elif status == 404:
                    return "Status: 404 Not Found"
                elif status == 403:
                    return "Status: 403 Forbidden"
                else:
                    return f"Status: {status} {response.reason}"
        except Exception as e:
            response = requests.get(link, headers=headers, timeout=5)
            status = response.status_code
            if status == 200:
                content = response.text[:1000].lower()
                if any(keyword in content for keyword in self.paywall_keywords):
                    return "Status: Possible Paywall"
                return "Status: OK"
            elif status == 404:
                return "Status: 404 Not Found"
            elif status == 403:
                return "Status: 403 Forbidden"
            else:
                return f"Status: {status} {response.reason}"
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}"
    async def check_all_links(self, links):
        """Check all links asynchronously using a single session."""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        async with aiohttp.ClientSession(headers=headers) as session:
            tasks = [self.link_valid(session, link) for link in links]
            return await asyncio.gather(*tasks)

    def check_all_links(self, links):
        """Check all links, one by one."""
        # TODO: make this asynchronous
        statuses = []
        print("Workers started, crawling the web...")
        for link in links:
            status = self.link_valid(link)
            statuses.append(status)
        return statuses
    def execute(self, blocks: str, safety: bool = True) -> str:
        if self.api_key is None:
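One way to address the TODO above without reintroducing aiohttp is to fan the synchronous link_valid calls out over a thread pool. A sketch under that assumption (check_all_links_concurrent is a hypothetical helper, not part of this commit; pool.map preserves input order, which the zip in execute relies on):

from concurrent.futures import ThreadPoolExecutor

def check_all_links_concurrent(link_valid, links: list, max_workers: int = 10) -> list:
    """Run a synchronous link checker over `links` in parallel threads."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # pool.map returns results in the same order as the input links
        return list(pool.map(link_valid, links))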
@@ -68,7 +73,7 @@ class webSearch(Tools):
        params = {
            "q": query,
            "api_key": self.api_key,
            "num": 100,
            "num": 50,
            "output": "json"
        }
        response = requests.get(url, params=params)
@@ -77,7 +82,12 @@ class webSearch(Tools):
        data = response.json()
        results = []
        if "organic_results" in data and len(data["organic_results"]) > 0:
            for result in data["organic_results"][:50]:
            organic_results = data["organic_results"][:50]
            links = [result.get("link", "No link available") for result in organic_results]
            statuses = self.check_all_links(links)
            for result, status in zip(organic_results, statuses):
                if "OK" not in status:
                    continue
                title = result.get("title", "No title")
                snippet = result.get("snippet", "No snippet available")
                link = result.get("link", "No link available")
@@ -104,5 +114,5 @@ if __name__ == "__main__":
    search_tool = webSearch(api_key=os.getenv("SERPAPI_KEY"))
    query = "when did covid start"
    result = search_tool.execute([query], safety=True)
    feedback = search_tool.interpreter_feedback(result)
    print(feedback)
    output = search_tool.interpreter_feedback(result)
    print(output)
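The validator can also be smoke-tested in isolation before a full search run; a small sketch (the URLs are placeholders, and the snippet assumes it runs in this file's __main__ context):

import os

tool = webSearch(api_key=os.getenv("SERPAPI_KEY"))
print(tool.link_valid("https://example.com"))  # e.g. "Status: OK"
print(tool.link_valid("not-a-url"))            # "Status: Invalid URL"

Since link_valid returns early for non-http inputs, the second call never touches the network and is safe to run offline.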