# 19.04.24

import time
import logging
from urllib.parse import quote_plus, urlparse, parse_qs
from typing import Generator, Optional

# External library
from bs4 import BeautifulSoup

# Internal utilities
from Src.Lib.Request import requests
def filter_result(link: str) -> Optional[str]:
    """
    Filters search result links to remove unwanted ones.

    Args:
        - link (str): The URL of the search result.

    Returns:
        Optional[str]: The filtered URL if valid, None otherwise.
    """
    try:
        logging.info(f"Filter url: {link}")

        if link.startswith('/url?'):
            # Extract the actual URL from Google's redirect link
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        o = urlparse(link, 'http')

        # Keep only links that point outside Google's own domains
        if o.netloc and 'google' not in o.netloc:
            return link

    except Exception:
        # Malformed or unparsable links are treated as invalid
        pass
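
# Example of what filter_result does (hypothetical values): a Google redirect link
# such as "/url?q=https://www.python.org/&sa=U" is unwrapped to
# "https://www.python.org/", while a link whose host contains "google"
# (e.g. "https://maps.google.com/") yields None.
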

def search(query: str, num: int = 10, stop: Optional[int] = None, pause: float = 2.0) -> Generator[str, None, None]:
    """
    Performs a Google search and yields the URLs of search results.

    Args:
        - query (str): The search query.
        - num (int): Number of results to fetch per request. Default is 10.
        - stop (int, optional): Total number of results to retrieve. Default is None (no limit).
        - pause (float): Pause duration in seconds between requests. Default is 2.0.

    Yields:
        str: The URL of a search result.

    Example:
        >>> for url in search("Python tutorials", num=5, stop=10):
        ...     print(url)
        ...
        https://www.python.org/about/gettingstarted/
    """

    # Set to store hashes of URLs already yielded
    hashes = set()

    # Counter for the number of yielded URLs
    count = 0

    # Encode the query for use in the URL
    query = quote_plus(query)
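    # For example, "Python tutorials" becomes "Python+tutorials",
    # which is safe to embed in the search URL built below.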

    while not stop or count < stop:
        last_count = count

        # Construct the Google search URL; num and start allow paging through results
        url = (
            f"https://www.google.com/search?client=opera&q={query}"
            f"&sourceid=opera&oe=UTF-8&num={num}&start={count}"
        )

        # Pause before making the request to avoid being rate limited
        time.sleep(pause)

        # Fetch the HTML content of the search results page
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'html.parser')

        try:
            # Find all anchor tags inside the main search results container
            anchors = soup.find(id='search').find_all('a')
        except AttributeError:
            # Fall back to scanning the whole page when the usual container is missing
            gbar = soup.find(id='gbar')
            if gbar:
                gbar.clear()
            anchors = soup.find_all('a')

        # Iterate over each anchor tag
        for a in anchors:
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter out unwanted links
            link = filter_result(link)
            if not link:
                continue

            # Skip duplicate URLs
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the valid URL
            yield link

            # Increment the counter
            count += 1

            # Stop once the desired number of URLs has been reached
            if stop and count >= stop:
                return

        # Break the loop if this page produced no new URLs
        if last_count == count:
            break
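

# Minimal usage sketch, assuming this module is importable from the project and that
# Src.Lib.Request.requests exposes a requests-like get() whose response has a .text
# attribute (as the code above already relies on). The query and limits are examples.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Print the first five result URLs for a sample query.
    for result_url in search("Python tutorials", num=5, stop=5):
        print(result_url)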