Mirror of https://github.com/tcsenpai/agenticSeek.git (synced 2025-06-06 19:15:28 +00:00)

Feat: web crawl to validate links for web search

This commit is contained in:
parent f53647ad3f
commit e909d9d84c
@@ -21,13 +21,18 @@ class BrowserAgent(Agent):
         self.navigable_links = []
         self.ai_notes = []
 
     def extract_links(self, search_result: str):
-        links = re.findall(r'https?://[^\s]+', search_result)
-        return self.clean_links(links)
+        pattern = r'(https?://\S+|www\.\S+)'
+        matches = re.findall(pattern, search_result)
+        trailing_punct = ".,!?;:"
+        cleaned_links = [link.rstrip(trailing_punct) for link in matches]
+        return self.clean_links(cleaned_links)
 
     def clean_links(self, links: list):
         links_clean = []
         for link in links:
+            link = link.strip()
             if link[-1] == '.':
                 links_clean.append(link[:-1])
             else:
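The new extraction logic is easy to sanity-check in isolation. Below is a minimal sketch of the same regex and trailing-punctuation stripping outside the class; the demo function name is illustrative, not from the commit:

import re

def extract_links_demo(text: str) -> list:
    # Same pattern and rstrip step as the new extract_links
    pattern = r'(https?://\S+|www\.\S+)'
    matches = re.findall(pattern, text)
    return [m.rstrip(".,!?;:") for m in matches]

print(extract_links_demo("See https://example.com/page. Also www.example.org!"))
# expected: ['https://example.com/page', 'www.example.org']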
@@ -65,7 +70,6 @@ class BrowserAgent(Agent):
         If you found a clear answer, please say "REQUEST_EXIT".
         You must choose a link to navigate to, go back or exit.
         Do not explain your choice.
-        You can take note about your finding with TAKE_NOTE("<your note>")
         """
 
     def llm_decide(self, prompt):
@@ -23,36 +23,41 @@ class webSearch(Tools):
         self.tag = "web_search"
         self.api_key = api_key or os.getenv("SERPAPI_KEY")  # Requires a SerpApi key
         self.paywall_keywords = [
-            "subscribe", "paywall", "login to continue", "access denied", "restricted content"
+            "subscribe", "login to continue", "access denied", "restricted content", "404", "this page is not working"
         ]
 
-    async def link_valid(self, session, link):
-        """asyncronously check if a link is shit."""
+    def link_valid(self, link):
+        """Check if a link is valid."""
         if not link.startswith("http"):
             return "Status: Invalid URL"
 
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
         try:
-            async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
-                status = response.status
+            response = requests.get(link, headers=headers, timeout=5)
+            status = response.status_code
             if status == 200:
-                content = await response.text(encoding='utf-8', errors='ignore')[:1000]
-                if any(keyword in content.lower() for keyword in self.paywall_keywords):
+                content = response.text[:1000].lower()
+                if any(keyword in content for keyword in self.paywall_keywords):
                     return "Status: Possible Paywall"
-                return "Status: Accessible"
+                return "Status: OK"
             elif status == 404:
                 return "Status: 404 Not Found"
             elif status == 403:
                 return "Status: 403 Forbidden"
             else:
                 return f"Status: {status} {response.reason}"
-        except Exception as e:
+        except requests.exceptions.RequestException as e:
             return f"Error: {str(e)}"
 
-    async def check_all_links(self, links):
-        """Check all links asynchronously using a single session."""
-        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
-        async with aiohttp.ClientSession(headers=headers) as session:
-            tasks = [self.link_valid(session, link) for link in links]
-            return await asyncio.gather(*tasks)
+    def check_all_links(self, links):
+        """Check all links, one by one."""
+        # TODO: make this asynchronous
+        statuses = []
+        print("Workers started, crawling the web...")
+        for i, link in enumerate(links):
+            status = self.link_valid(link)
+            statuses.append(status)
+        return statuses
 
     def execute(self, blocks: str, safety: bool = True) -> str:
        if self.api_key is None:
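The removed aiohttp variant hints at what the new TODO could look like once concurrency comes back. Below is a minimal sketch assuming aiohttp is installed; the function names and the reduced status mapping are illustrative, not part of the commit:

import asyncio
import aiohttp

async def link_valid_async(session: aiohttp.ClientSession, link: str) -> str:
    # Hypothetical async counterpart of link_valid with a simplified status mapping
    if not link.startswith("http"):
        return "Status: Invalid URL"
    try:
        async with session.get(link, timeout=aiohttp.ClientTimeout(total=5)) as response:
            if response.status == 200:
                return "Status: OK"
            return f"Status: {response.status} {response.reason}"
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        return f"Error: {str(e)}"

async def check_all_links_async(links: list) -> list:
    # One shared session, all requests in flight at once
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession(headers=headers) as session:
        return await asyncio.gather(*(link_valid_async(session, link) for link in links))

# usage: statuses = asyncio.run(check_all_links_async(["https://example.com"]))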
@@ -68,7 +73,7 @@ class webSearch(Tools):
         params = {
             "q": query,
             "api_key": self.api_key,
-            "num": 100,
+            "num": 50,
             "output": "json"
         }
         response = requests.get(url, params=params)
@@ -77,7 +82,12 @@ class webSearch(Tools):
         data = response.json()
         results = []
         if "organic_results" in data and len(data["organic_results"]) > 0:
-            for result in data["organic_results"][:50]:
+            organic_results = data["organic_results"][:50]
+            links = [result.get("link", "No link available") for result in organic_results]
+            statuses = self.check_all_links(links)
+            for result, status in zip(organic_results, statuses):
+                if not "OK" in status:
+                    continue
                 title = result.get("title", "No title")
                 snippet = result.get("snippet", "No snippet available")
                 link = result.get("link", "No link available")
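Note that the keep-or-skip decision hinges on the literal "OK" substring emitted by link_valid, which is why the return-string change from "Status: Accessible" to "Status: OK" matters. A toy illustration of the same zip-and-filter pattern, with data made up for the example:

results = [{"link": "https://a.example"}, {"link": "https://b.example"}]
statuses = ["Status: OK", "Status: 404 Not Found"]
kept = [r for r, s in zip(results, statuses) if "OK" in s]
print(kept)  # only the first result survives the filter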
@@ -104,5 +114,5 @@ if __name__ == "__main__":
     search_tool = webSearch(api_key=os.getenv("SERPAPI_KEY"))
     query = "when did covid start"
     result = search_tool.execute([query], safety=True)
-    feedback = search_tool.interpreter_feedback(result)
-    print(feedback)
+    output = search_tool.interpreter_feedback(result)
+    print(output)