mirror of
https://github.com/tcsenpai/agenticSeek.git
synced 2025-06-06 19:15:28 +00:00
Fix: browser not supporting non-alphabetic languages
This commit is contained in:
parent
dfcbacd464
commit
292623ab52
@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
|
|||||||
import markdownify
|
import markdownify
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
class Browser:
|
class Browser:
|
||||||
def __init__(self, headless=False, anticaptcha_install=False):
|
def __init__(self, headless=False, anticaptcha_install=False):
|
||||||
@ -91,6 +92,19 @@ class Browser:
|
|||||||
has_letters = any(word.isalpha() for word in words)
|
has_letters = any(word.isalpha() for word in words)
|
||||||
return (word_count >= 5 and (has_punctuation or is_long_enough) and has_letters)
|
return (word_count >= 5 and (has_punctuation or is_long_enough) and has_letters)
|
||||||
|
|
||||||
|
def is_sentence(self, text):
    """Check if the text is a meaningful sentence or mentions an HTTP error code.

    Args:
        text: Text fragment to evaluate; surrounding whitespace is ignored.

    Returns:
        True when the text contains one of the tracked error codes
        (404, 403, 500, 502, 503) as a standalone number, when it has more
        than five words, or when it has exactly five words and ends with
        sentence punctuation. False otherwise.
    """
    text = text.strip()

    # Only accept a code that is not part of a longer digit run, so numbers
    # such as "15000" or "14041" are not mistaken for an error page.
    if re.search(r'(?<!\d)(?:404|403|500|502|503)(?!\d)', text):
        return True

    # NOTE(review): \w+ treats an unbroken run of word characters as a single
    # word, so scripts written without spaces (e.g. Chinese, Japanese) are
    # likely under-counted here -- confirm whether that matters to callers.
    words = re.findall(r'\w+', text)
    word_count = len(words)

    # ASCII, full-width CJK, Devanagari danda and Urdu full stop endings.
    sentence_endings = ('.', ',', '!', '?', '。', ',', '!', '?', '।', '۔')
    has_punctuation = text.endswith(sentence_endings)

    # Five words need closing punctuation; anything longer qualifies as is.
    return word_count > 5 or (word_count == 5 and has_punctuation)
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
"""Get page text and convert it to README (Markdown) format."""
|
"""Get page text and convert it to README (Markdown) format."""
|
||||||
try:
|
try:
|
||||||
@ -145,7 +159,7 @@ class Browser:
|
|||||||
})
|
})
|
||||||
|
|
||||||
self.logger.info(f"Found {len(links)} navigable links")
|
self.logger.info(f"Found {len(links)} navigable links")
|
||||||
return [self.clean_url(link['url']) for link in links if link['is_displayed'] == True and len(link) < 256]
|
return [self.clean_url(link['url']) for link in links if (link['is_displayed'] == True and len(link['url']) < 64)]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error getting navigable links: {str(e)}")
|
self.logger.error(f"Error getting navigable links: {str(e)}")
|
||||||
return []
|
return []
|
||||||
@ -211,7 +225,7 @@ if __name__ == "__main__":
|
|||||||
browser = Browser(headless=False)
|
browser = Browser(headless=False)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
browser.go_to("https://karpathy.github.io/")
|
browser.go_to("https://www.seoul.co.kr/")
|
||||||
text = browser.get_text()
|
text = browser.get_text()
|
||||||
print("Page Text in Markdown:")
|
print("Page Text in Markdown:")
|
||||||
print(text)
|
print(text)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user