Mirror of https://github.com/tcsenpai/agenticSeek.git, synced 2025-06-06 11:05:26 +00:00
Feat: web crawl to validate links for web search
parent f53647ad3f
commit e909d9d84c
@@ -21,13 +21,18 @@ class BrowserAgent(Agent):
        self.navigable_links = []
        self.ai_notes = []

    def extract_links(self, search_result: str):
        links = re.findall(r'https?://[^\s]+', search_result)
        return self.clean_links(links)
        pattern = r'(https?://\S+|www\.\S+)'
        matches = re.findall(pattern, search_result)
        trailing_punct = ".,!?;:"
        cleaned_links = [link.rstrip(trailing_punct) for link in matches]
        return self.clean_links(cleaned_links)

    def clean_links(self, links: list):
        links_clean = []
        for link in links:
            link = link.strip()
            if link[-1] == '.':
                links_clean.append(link[:-1])
            else:
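For reference, a minimal standalone sketch of the extraction-and-cleanup flow in this hunk. The function names mirror the diff; the empty-string guard and the example call at the end are additions, since link[-1] would raise an IndexError on an empty entry.

import re

def extract_links(search_result: str) -> list:
    # Match http(s) URLs as well as bare www. links, then strip trailing punctuation.
    pattern = r'(https?://\S+|www\.\S+)'
    matches = re.findall(pattern, search_result)
    trailing_punct = ".,!?;:"
    return clean_links(link.rstrip(trailing_punct) for link in matches)

def clean_links(links) -> list:
    links_clean = []
    for link in links:
        link = link.strip()
        if not link:  # guard added in this sketch: an empty entry would make link[-1] fail
            continue
        links_clean.append(link[:-1] if link[-1] == '.' else link)
    return links_clean

print(extract_links("See https://example.com/docs. and www.example.org,"))
# ['https://example.com/docs', 'www.example.org']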
@@ -65,7 +70,6 @@ class BrowserAgent(Agent):
        If you found a clear answer, please say "REQUEST_EXIT".
        You must choose a link to navigate to, go back or exit.
        Do not explain your choice.
        You can take note about your finding with TAKE_NOTE("<your note>")
        """

    def llm_decide(self, prompt):
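The prompt above defines a small response protocol: the model answers with a link, "REQUEST_EXIT", or TAKE_NOTE("<your note>"). A hypothetical sketch of how a caller could pull those signals out of the model's answer; the helper name and return shape are illustrative and not part of this commit.

import re

def parse_agent_answer(answer: str) -> dict:
    # Collect every note the model took, whether it asked to exit, and any links it chose.
    notes = re.findall(r'TAKE_NOTE\("(.*?)"\)', answer)
    wants_exit = "REQUEST_EXIT" in answer
    links = re.findall(r'https?://\S+', answer)
    return {"notes": notes, "exit": wants_exit, "links": links}

print(parse_agent_answer('TAKE_NOTE("covid started in 2019") REQUEST_EXIT'))
# {'notes': ['covid started in 2019'], 'exit': True, 'links': []}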
@@ -23,36 +23,41 @@ class webSearch(Tools):
        self.tag = "web_search"
        self.api_key = api_key or os.getenv("SERPAPI_KEY")  # Requires a SerpApi key
        self.paywall_keywords = [
            "subscribe", "paywall", "login to continue", "access denied", "restricted content"
            "subscribe", "login to continue", "access denied", "restricted content", "404", "this page is not working"
        ]

    async def link_valid(self, session, link):
        """asyncronously check if a link is shit."""
    def link_valid(self, link):
        """check if a link is valid."""
        if not link.startswith("http"):
            return "Status: Invalid URL"

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        try:
            async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
                status = response.status
                if status == 200:
                    content = await response.text(encoding='utf-8', errors='ignore')[:1000]
                    if any(keyword in content.lower() for keyword in self.paywall_keywords):
                        return "Status: Possible Paywall"
                    return "Status: Accessible"
                elif status == 404:
                    return "Status: 404 Not Found"
                elif status == 403:
                    return "Status: 403 Forbidden"
                else:
                    return f"Status: {status} {response.reason}"
        except Exception as e:
            response = requests.get(link, headers=headers, timeout=5)
            status = response.status_code
            if status == 200:
                content = response.text[:1000].lower()
                if any(keyword in content for keyword in self.paywall_keywords):
                    return "Status: Possible Paywall"
                return "Status: OK"
            elif status == 404:
                return "Status: 404 Not Found"
            elif status == 403:
                return "Status: 403 Forbidden"
            else:
                return f"Status: {status} {response.reason}"
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}"

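One detail worth flagging in the aiohttp variant above: await response.text(...)[:1000] applies the slice to the coroutine object before awaiting it, which raises a TypeError at runtime. A corrected form, as a sketch, awaits first and slices afterwards:

text = await response.text(encoding='utf-8', errors='ignore')  # await the coroutine first
content = text[:1000].lower()                                   # then slice and normalise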
    async def check_all_links(self, links):
        """Check all links asynchronously using a single session."""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        async with aiohttp.ClientSession(headers=headers) as session:
            tasks = [self.link_valid(session, link) for link in links]
            return await asyncio.gather(*tasks)
    def check_all_links(self, links):
        """Check all links, one by one."""
        # TODO Make it asyncromous or smth
        statuses = []
        print("Workers started, scrawling the web...")
        for i, link in enumerate(links):
            status = self.link_valid(link)
            statuses.append(status)
        return statuses

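The # TODO above asks for the sequential loop to become asynchronous. A minimal sketch of what that could look like with aiohttp and asyncio.gather, assuming aiohttp is available and keeping the same status strings so execute() can still filter on "OK"; the module-level function names are illustrative only.

import asyncio
import aiohttp

PAYWALL_KEYWORDS = ["subscribe", "login to continue", "access denied",
                    "restricted content", "404", "this page is not working"]

async def link_valid_async(session: aiohttp.ClientSession, link: str) -> str:
    # Returns the same status strings as the synchronous checker in the diff.
    if not link.startswith("http"):
        return "Status: Invalid URL"
    try:
        async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
            if response.status != 200:
                return f"Status: {response.status} {response.reason}"
            text = await response.text(encoding="utf-8", errors="ignore")
            if any(keyword in text[:1000].lower() for keyword in PAYWALL_KEYWORDS):
                return "Status: Possible Paywall"
            return "Status: OK"
    except Exception as e:
        return f"Error: {str(e)}"

async def check_all_links_async(links: list) -> list:
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession(headers=headers) as session:
        return await asyncio.gather(*(link_valid_async(session, link) for link in links))

# statuses = asyncio.run(check_all_links_async(["https://example.com", "https://example.org/missing"]))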
    def execute(self, blocks: str, safety: bool = True) -> str:
        if self.api_key is None:
@@ -68,7 +73,7 @@ class webSearch(Tools):
        params = {
            "q": query,
            "api_key": self.api_key,
            "num": 100,
            "num": 50,
            "output": "json"
        }
        response = requests.get(url, params=params)
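For context, a small standalone sketch of the request these params feed. The endpoint URL and the timeout are assumptions (the url variable is defined outside this hunk); the parameters mirror the ones above.

import os
import requests

url = "https://serpapi.com/search"  # assumed SerpApi endpoint; not shown in this hunk
params = {
    "q": "when did covid start",
    "api_key": os.getenv("SERPAPI_KEY"),
    "num": 50,
    "output": "json",
}
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()  # surface HTTP errors before trying to parse JSON
data = response.json()
print(len(data.get("organic_results", [])), "organic results")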
@@ -77,7 +82,12 @@ class webSearch(Tools):
        data = response.json()
        results = []
        if "organic_results" in data and len(data["organic_results"]) > 0:
            for result in data["organic_results"][:50]:
            organic_results = data["organic_results"][:50]
            links = [result.get("link", "No link available") for result in organic_results]
            statuses = self.check_all_links(links)
            for result, status in zip(organic_results, statuses):
                if not "OK" in status:
                    continue
                title = result.get("title", "No title")
                snippet = result.get("snippet", "No snippet available")
                link = result.get("link", "No link available")
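The zip/filter step above drops any result whose status string does not contain "OK", so paywalled and dead links never reach the summary. A tiny illustration with hard-coded sample data (the status strings mirror the ones link_valid returns; the result dicts are made up for the example):

organic_results = [
    {"title": "Good page", "link": "https://example.com/a"},
    {"title": "Paywalled page", "link": "https://example.com/b"},
    {"title": "Dead page", "link": "https://example.com/c"},
]
statuses = ["Status: OK", "Status: Possible Paywall", "Status: 404 Not Found"]

kept = [result for result, status in zip(organic_results, statuses) if "OK" in status]
print([r["link"] for r in kept])  # only https://example.com/a survives

Note that 'if "OK" not in status: continue' is the more idiomatic spelling of the membership test used in the diff.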
@@ -104,5 +114,5 @@ if __name__ == "__main__":
    search_tool = webSearch(api_key=os.getenv("SERPAPI_KEY"))
    query = "when did covid start"
    result = search_tool.execute([query], safety=True)
    feedback = search_tool.interpreter_feedback(result)
    print(feedback)
    output = search_tool.interpreter_feedback(result)
    print(output)