mirror of
https://github.com/tcsenpai/agenticSeek.git
synced 2025-06-06 19:15:28 +00:00
170 lines
6.2 KiB
Python
170 lines
6.2 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
import markdownify
|
|
import logging
|
|
import sys
|
|
|
|
class Browser:
    """Small convenience wrapper around a Selenium Chrome driver.

    Provides navigation, text extraction (as Markdown), link discovery,
    clicking, scrolling and screenshots. Most operations log failures and
    return a falsy value instead of raising.
    """

    def __init__(self, headless=False, anticaptcha_install=False):
        """Initialize the browser with optional headless mode.

        Args:
            headless: When true, Chrome is started with headless-friendly flags.
            anticaptcha_install: Accepted for interface compatibility; this
                constructor does not read it.

        Raises:
            Exception: If the Chrome driver (or its wait helper) cannot be
                created. The original error is chained as the cause.
        """
        # Assigned before anything that can fail, so close()/__del__ remain
        # safe on a partially constructed instance.
        self.logger = logging.getLogger(__name__)
        self.driver = None
        self._closed = False
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
        }
        self.anticaptcha = "https://chrome.google.com/webstore/detail/nopecha-captcha-solver/dknlfmjaanfblgfdfebhijalfmhmjjjo/related"
        try:
            chrome_options = Options()
            if headless:
                # NOTE(review): assumed that all four flags are headless-only;
                # confirm against the upstream file.
                chrome_options.add_argument("--headless")
                chrome_options.add_argument("--disable-gpu")
                chrome_options.add_argument("--no-sandbox")
                chrome_options.add_argument("--disable-dev-shm-usage")
            self.driver = webdriver.Chrome(options=chrome_options)
            self.wait = WebDriverWait(self.driver, 10)
            self.logger.info("Browser initialized successfully")
        except Exception as e:
            raise Exception(f"Failed to initialize browser: {str(e)}") from e

    def goTo(self, url):
        """Navigate to a specified URL.

        Returns:
            True when navigation succeeded, False when the driver raised a
            WebDriverException (the error is logged).
        """
        try:
            self.driver.get(url)
            time.sleep(2)  # Wait for page to load
            self.logger.info("Navigated to: %s", url)
            return True
        except WebDriverException as e:
            self.logger.error("Error navigating to %s: %s", url, e)
            return False

    def is_sentence(self, text):
        """Check if the text is a sentence.

        A fragment counts as a sentence when it has more than five
        space-separated words and contains a period. Fragments containing
        "404" are always kept.
        """
        if "404" in text:
            return True  # we want the ai to see the error
        return len(text.split(" ")) > 5 and '.' in text

    def _clean_text(self, text):
        """Return the sentence-like fragments of *text*, one per line.

        Each line is stripped and then split on runs of two spaces, so that a
        fragment is a phrase rather than a single word; splitting on a single
        space would make the word-count test in ``is_sentence`` impossible to
        satisfy. Empty fragments and fragments rejected by ``is_sentence`` are
        dropped.
        """
        stripped_lines = (line.strip() for line in text.splitlines())
        fragments = (
            phrase.strip()
            for line in stripped_lines
            for phrase in line.split("  ")
        )
        return "\n".join(
            fragment
            for fragment in fragments
            if fragment and self.is_sentence(fragment)
        )

    def getText(self):
        """Get page text and convert it to README (Markdown) format.

        Returns:
            The filtered page text passed through markdownify, or None when
            any step fails (the error is logged).
        """
        try:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Script and style bodies are not human-readable page content.
            for element in soup(['script', 'style']):
                element.decompose()

            text = self._clean_text(soup.get_text())
            return markdownify.markdownify(text, heading_style="ATX")
        except Exception as e:
            self.logger.error("Error getting text: %s", e)
            return None

    def getNavigable(self):
        """Get all navigable links on the current page.

        Returns:
            A list of dicts with keys "url", "text" and "is_displayed", one
            per anchor whose href is an http(s) URL. Returns an empty list
            when the lookup fails (the error is logged).
        """
        try:
            links = []
            elements = self.driver.find_elements(By.TAG_NAME, "a")

            for element in elements:
                href = element.get_attribute("href")
                # Require the scheme separator so only real http(s) URLs match.
                if href and href.startswith(("http://", "https://")):
                    links.append({
                        "url": href,
                        "text": element.text.strip(),
                        "is_displayed": element.is_displayed(),
                    })

            self.logger.info("Found %d navigable links", len(links))
            return links
        except Exception as e:
            self.logger.error("Error getting navigable links: %s", e)
            return []

    def clickElement(self, xpath):
        """Click an element specified by xpath.

        Returns:
            True when the element became clickable and was clicked, False
            when the wait timed out or the click itself failed.
        """
        try:
            element = self.wait.until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            element.click()
            time.sleep(2)  # Wait for action to complete
            return True
        except TimeoutException:
            self.logger.error(f"Element not found or not clickable: {xpath}")
            return False
        except WebDriverException as e:
            # Consistent with goTo(): driver-level failures are reported via
            # the boolean return value rather than propagated.
            self.logger.error("Error clicking element %s: %s", xpath, e)
            return False

    def getCurrentUrl(self):
        """Get the current URL of the page."""
        return self.driver.current_url

    def getPageTitle(self):
        """Get the title of the current page."""
        return self.driver.title

    def scrollToBottom(self):
        """Scroll to the bottom of the page.

        Returns:
            True on success, False when the script could not be executed.
        """
        try:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(1)  # Wait for scroll to complete
            return True
        except Exception as e:
            self.logger.error("Error scrolling: %s", e)
            return False

    def takeScreenshot(self, filename):
        """Take a screenshot of the current page.

        Returns:
            True when the driver call completed without raising, False
            otherwise (the error is logged).
        """
        try:
            self.driver.save_screenshot(filename)
            self.logger.info("Screenshot saved as %s", filename)
            return True
        except Exception as e:
            self.logger.error("Error taking screenshot: %s", e)
            return False

    def close(self):
        """Close the browser.

        Safe to call more than once and on an instance whose driver was never
        created; only the first call on a live driver quits it. Errors raised
        by the driver while quitting propagate to the caller.
        """
        driver = getattr(self, "driver", None)
        if driver is None or getattr(self, "_closed", False):
            return
        # Mark as closed first so a failing quit() is not retried by __del__.
        self._closed = True
        driver.quit()
        self.logger.info("Browser closed")

    def __del__(self):
        """Destructor to ensure browser is closed."""
        try:
            self.close()
        except Exception:
            # Best effort only: a finalizer must not raise, and the driver
            # may already be gone during interpreter shutdown.
            pass
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: open a page, dump its text and links, save a screenshot.
    logging.basicConfig(level=logging.INFO)

    browser = Browser(headless=False)

    try:
        browser.goTo("https://karpathy.github.io/")

        page_markdown = browser.getText()
        print("Page Text in Markdown:", page_markdown, sep="\n")

        found_links = browser.getNavigable()
        print("\nNavigable Links:")
        # Only the first 50 links are printed to keep the output readable.
        for entry in found_links[:50]:
            print(f"Text: {entry['text']}, URL: {entry['url']}")

        browser.takeScreenshot("example.png")
    finally:
        # Always release the Chrome session, even if a step above failed.
        browser.close()