Add light and strong search domain @inklook

This commit is contained in:
Ghost 2024-04-29 20:23:11 +02:00
parent a60dcb2b79
commit 02edd4d5cd
17 changed files with 390 additions and 66 deletions

View File

@ -18,6 +18,7 @@ You can chat, help improve this repo, or just hang around for some fun in the **
* [CONFIGURATION](#Configuration)
* [DOCKER](#docker)
* [TUTORIAL](#tutorial)
* [TO DO](#to-do)
## Requirement
@ -232,3 +233,7 @@ docker run -it -p 8000:8000 -v /path/to/download:/app/Video streaming-community-
## Tutorial
For a detailed walkthrough, refer to the [video tutorial](https://www.youtube.com/watch?v=Ok7hQCgxqLg&ab_channel=Nothing)
## To do
- Add a GUI
- Add a website API

View File

@ -1,6 +1,5 @@
# 01.03.24
import requests
import re
import json
import binascii
@ -9,12 +8,12 @@ from urllib.parse import urljoin, urlencode, quote
# External libraries
import requests
from bs4 import BeautifulSoup
# Internal utilities
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from .SeriesType import TitleManager
from .EpisodeType import EpisodeManager
from .WindowType import WindowVideo, WindowParameter

Src/Api/Util/__init__.py Normal file
View File

@ -0,0 +1,3 @@
# 29.04.24
from .extract_domain import grab_sc_top_level_domain as get_sc_domain

View File

@ -0,0 +1,128 @@
# 29.04.24
import threading
import logging
import os
# Internal utilities
from Src.Lib.Google import search as google_search
from Src.Lib.Request import requests
def check_url_for_content(url: str, content: str) -> bool:
"""
Check if a URL contains specific content.
Args:
url (str): The URL to check.
content (str): The content to search for in the response.
Returns:
bool: True if the content is found, False otherwise.
"""
try:
r = requests.get(url, timeout = 1)
if r.status_code == 200 and content in r.text:
return True
except Exception:
pass
return False
def grab_top_level_domain(base_url: str, target_content: str) -> str:
"""
Get the top-level domain (TLD) by probing candidate URLs built from the entries in 'tld_list.txt'.
Args:
base_url (str): The base URL to construct complete URLs.
target_content (str): The content to search for in the response.
Returns:
str: The found TLD, if any.
"""
results = []
threads = []
def url_checker(url: str):
if check_url_for_content(url, target_content):
results.append(url.split(".")[-1])
if not os.path.exists("tld_list.txt"):
raise FileNotFoundError("The file 'tld_list.txt' does not exist.")
urls = [f"{base_url}.{x.strip().lower()}" for x in open("tld_list.txt", "r")]
for url in urls:
thread = threading.Thread(target=url_checker, args=(url,))
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
if results:
return results[-1]
def grab_top_level_domain_light(query: str) -> str:
"""
Get the top-level domain (TLD) using a light method via Google search.
Args:
query (str): The search query for Google search.
Returns:
str: The found TLD, if any.
"""
for result in google_search(query, num=1, stop=1, pause=2):
return result.split(".", 2)[-1].replace("/", "")
def grab_sc_top_level_domain(method: str) -> str:
"""
Get the top-level domain (TLD) for the streaming community.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
str: The found TLD, if any.
"""
if method == "light":
return grab_top_level_domain_light("streaming community")
elif method == "strong":
return grab_top_level_domain("https://streamingcommunity", '<meta name="author" content="StreamingCommunity">')
def grab_au_top_level_domain(method: str) -> str:
"""
Get the top-level domain (TLD) for Anime Unity.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
str: The found TLD, if any.
"""
if method == "light":
return grab_top_level_domain_light("animeunity")
elif method == "strong":
return grab_top_level_domain("https://www.animeunity", '<meta name="author" content="AnimeUnity Staff">')
def compose_both_top_level_domains(method: str) -> dict:
"""
Compose TLDs for both the streaming community and Anime Unity.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
dict: A dictionary containing the TLDs for the streaming community and Anime Unity.
"""
sc_tld = grab_sc_top_level_domain(method)
au_tld = grab_au_top_level_domain(method)
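# Fall back to the slower but more reliable "strong" probe for any site the light lookup could not resolve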
if not sc_tld:
sc_tld = grab_sc_top_level_domain("strong")
if not au_tld:
au_tld = grab_au_top_level_domain("strong")
return {"streaming_community": sc_tld, "anime_unity": au_tld}

View File

@ -4,13 +4,10 @@ import os
import logging
# External libraries
import requests
# Internal utilities
from Src.Util.console import console, msg
from Src.Util.config import config_manager
from Src.Lib.Request import requests
from Src.Lib.FFmpeg.my_m3u8 import Downloader
from Src.Util.message import start_message
from .Class import VideoSource

View File

@ -7,15 +7,16 @@ from typing import Tuple
# External libraries
import requests
from bs4 import BeautifulSoup
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from Src.Util.console import console
from Src.Util.config import config_manager
from .Util import get_sc_domain
from .Class import MediaManager, MediaItem
@ -109,10 +110,6 @@ def get_moment_titles(domain: str, version: str, prefix: str):
return None
def get_domain() -> str:
pass
def test_site(domain: str) -> str:
"""
Tests the availability of a website.
@ -180,7 +177,7 @@ def get_version_and_domain() -> Tuple[str, str]:
response_test_site = test_site(config_domain)
if response_test_site is None:
config_domain = get_domain()
config_domain = get_sc_domain('light')
response_test_site = test_site(config_domain)
if response_test_site:

View File

@ -14,7 +14,6 @@ warnings.filterwarnings("ignore", category=UserWarning, module="cryptography")
# External libraries
import requests
from tqdm.rich import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
@ -22,6 +21,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
# Internal utilities
from Src.Util.console import console
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from Src.Util.config import config_manager
from Src.Util.os import (
remove_folder,

View File

@ -5,10 +5,10 @@ import logging
# Internal utilities
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
# External libraries
import requests
from m3u8 import M3U8

View File

@ -0,0 +1,3 @@
# 29.04.24
from .page import search

Src/Lib/Google/page.py Normal file
View File

@ -0,0 +1,140 @@
# 29.04.24
import time
import ssl
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, urlparse, parse_qs
from bs4 import BeautifulSoup
from typing import Generator, Optional
# Internal utilities
from Src.Lib.Request import requests
def get_page(url: str) -> str:
"""
Fetches the HTML content of a webpage given its URL.
Args:
url (str): The URL of the webpage.
Returns:
str: The HTML content of the webpage.
"""
response = requests.get(url)
return response.text
def filter_result(link: str) -> Optional[str]:
"""
Filters search result links to remove unwanted ones.
Args:
link (str): The URL of the search result.
Returns:
Optional[str]: The filtered URL if valid, None otherwise.
"""
try:
if link.startswith('/url?'):
# Extract the actual URL from Google's redirect link
o = urlparse(link, 'http')
link = parse_qs(o.query)['q'][0]
o = urlparse(link, 'http')
# Filter out Google links
if o.netloc and 'google' not in o.netloc:
return link
except Exception:
pass
def search(query: str, num: int = 10, stop: Optional[int] = None, pause: float = 2.0) -> Generator[str, None, None]:
"""
Performs a Google search and yields the URLs of search results.
Args:
query (str): The search query.
num (int): Number of results to fetch per request. Default is 10.
stop (int, optional): Total number of results to retrieve. Default is None.
pause (float): Pause duration between requests. Default is 2.0.
Yields:
str: The URL of a search result.
Example:
>>> for url in search("Python tutorials", num=5, stop=10):
... print(url)
...
https://www.python.org/about/gettingstarted/
"""
# Set to store unique URLs
hashes = set()
# Counter for the number of fetched URLs
count = 0
# Encode the query for URL
query = quote_plus(query)
while not stop or count < stop:
last_count = count
# Construct the Google search URL
url = f"https://www.google.com/search?client=opera&q={query}&sourceid=opera&oe=UTF-8"
# Pause before making the request
time.sleep(pause)
# Fetch the HTML content of the search page
html = get_page(url)
soup = BeautifulSoup(html, 'html.parser')
try:
# Find all anchor tags containing search result links
anchors = soup.find(id='search').findAll('a')
except AttributeError:
# Handle cases where search results are not found in the usual div
gbar = soup.find(id='gbar')
if gbar:
gbar.clear()
anchors = soup.findAll('a')
# Iterate over each anchor tag
for a in anchors:
try:
link = a['href']
except KeyError:
continue
# Filter out unwanted links
link = filter_result(link)
if not link:
continue
# Check for duplicate URLs
h = hash(link)
if h in hashes:
continue
hashes.add(h)
# Yield the valid URL
yield link
# Increment the counter
count += 1
# Check if the desired number of URLs is reached
if stop and count >= stop:
return
# Break the loop if no new URLs are found
if last_count == count:
break
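This generator is what the new "light" lookup builds on: it asks for a single result and reduces the returned URL to its trailing TLD. A short sketch of that consumption, mirroring grab_top_level_domain_light in Src/Api/Util/extract_domain.py (the example domain is hypothetical):

from Src.Lib.Google import search

for url in search("streaming community", num=1, stop=1, pause=2):
    # e.g. "https://streamingcommunity.example/" -> "example"
    print(url.split(".", 2)[-1].replace("/", ""))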

View File

@ -0,0 +1,3 @@
# 21.04.24
from .my_requests import requests

View File

@ -9,7 +9,8 @@ import re
import urllib.parse
import urllib.request
import urllib.error
from typing import Dict, Optional, Union
from typing import Dict, Optional, Union, TypedDict, Any
try:
from typing import Unpack
@ -22,9 +23,13 @@ except ImportError:
"Please make sure you have the necessary libraries installed.")
# External library
from bs4 import BeautifulSoup
# Constants
HTTP_TIMEOUT = 4
HTTP_RETRIES = 2
HTTP_TIMEOUT = 3
HTTP_RETRIES = 1
HTTP_DELAY = 1
@ -95,12 +100,43 @@ class Response:
raise RequestError(f"Request failed with status code {self.status_code}")
def json(self):
"""Return the response content as JSON if it is JSON."""
"""
Return the response content as JSON if it is JSON.
Returns:
dict or list or None: A Python dictionary or list parsed from JSON if the response content is JSON, otherwise None.
"""
if self.is_json:
return json.loads(self.text)
else:
return None
def get_redirects(self):
"""
Extracts unique site URLs from HTML <link> elements within the <head> section.
Returns:
list or None: A list of unique site URLs if found, otherwise None.
"""
site_find = []
if self.text:
soup = BeautifulSoup(self.text, "html.parser")
for links in soup.find("head").find_all('link'):
if links is not None:
parsed_url = urllib.parse.urlparse(links.get('href'))
site = parsed_url.scheme + "://" + parsed_url.netloc
if site not in site_find:
site_find.append(site)
if site_find:
return site_find
else:
return None
class ManageRequests:
"""Class for managing HTTP requests."""
@ -116,6 +152,7 @@ class ManageRequests:
auth: Optional[tuple] = None,
proxy: Optional[str] = None,
cookies: Optional[Dict[str, str]] = None,
json_data: Optional[Dict[str, Any]] = None,
redirection_handling: bool = True,
):
"""
@ -136,7 +173,7 @@ class ManageRequests:
"""
self.url = url
self.method = method
self.headers = headers or {'User-Agent': 'Mozilla/5.0'}
self.headers = headers or {}
self.timeout = timeout
self.retries = retries
self.params = params
@ -144,6 +181,7 @@ class ManageRequests:
self.auth = auth
self.proxy = proxy
self.cookies = cookies
self.json_data = json_data
self.redirection_handling = redirection_handling
def add_header(self, key: str, value: str) -> None:
@ -152,6 +190,7 @@ class ManageRequests:
def send(self) -> Response:
"""Send the HTTP request."""
start_time = time.time()
self.attempt = 0
redirect_url = None
@ -160,24 +199,53 @@ class ManageRequests:
try:
req = self._build_request()
response = self._perform_request(req)
return self._process_response(response, start_time, redirect_url)
except (urllib.error.URLError, urllib.error.HTTPError) as e:
self._handle_error(e)
attempt += 1
self.attempt += 1
def _build_request(self) -> urllib.request.Request:
"""Build the urllib Request object."""
headers = self.headers.copy()
if self.params:
url = self.url + '?' + urllib.parse.urlencode(self.params)
else:
url = self.url
req = urllib.request.Request(url, headers=headers, method=self.method)
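# Optional JSON body: set the JSON content type and serialise the payload onto the request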
if self.json_data:
req.add_header('Content-Type', 'application/json')
req.body = json.dumps(self.json_data).encode('utf-8')
else:
req = urllib.request.Request(url, headers=headers, method=self.method)
if self.auth:
req.add_header('Authorization', 'Basic ' + base64.b64encode(f"{self.auth[0]}:{self.auth[1]}".encode()).decode())
if self.cookies:
cookie_str = '; '.join([f"{name}={value}" for name, value in self.cookies.items()])
req.add_header('Cookie', cookie_str)
if self.headers:
for key, value in self.headers.items():
req.add_header(key, value)
# Add a default user agent when the caller has not supplied one
there_is_agent = any(str(key).lower() == 'user-agent' for key in self.headers)
if not there_is_agent:
req.add_header('user-agent', 'Mozilla/5.0')
return req
def _perform_request(self, req: urllib.request.Request) -> urllib.response.addinfourl:
@ -186,26 +254,30 @@ class ManageRequests:
proxy_handler = urllib.request.ProxyHandler({'http': self.proxy, 'https': self.proxy})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
if not self.verify_ssl:
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
response = urllib.request.urlopen(req, timeout=self.timeout, context=ssl_context)
else:
response = urllib.request.urlopen(req, timeout=self.timeout)
return response
def _process_response(self, response: urllib.response.addinfourl, start_time: float, redirect_url: Optional[str]) -> Response:
"""Process the HTTP response."""
response_data = response.read()
content_type = response.headers.get('Content-Type', '').lower()
is_response_api = "json" in content_type
if self.redirection_handling and response.status in (301, 302, 303, 307, 308):
location = response.headers.get('Location')
logging.info(f"Redirecting to: {location}")
redirect_url = location
self.url = location
return self.send()
return self._build_response(response, response_data, start_time, redirect_url, content_type)
def _build_response(self, response: urllib.response.addinfourl, response_data: bytes, start_time: float, redirect_url: Optional[str], content_type: str) -> Response:
@ -216,7 +288,7 @@ class ManageRequests:
for cookie in response.headers.get_all('Set-Cookie', []):
cookie_parts = cookie.split(';')
cookie_name, cookie_value = cookie_parts[0].split('=')
cookie_name, cookie_value = cookie_parts[0].split('=', 1)
response_cookies[cookie_name.strip()] = cookie_value.strip()
return Response(
@ -234,9 +306,11 @@ class ManageRequests:
def _handle_error(self, e: Union[urllib.error.URLError, urllib.error.HTTPError]) -> None:
"""Handle request error."""
logging.error(f"Request failed for URL '{self.url}': {str(e)}")
if self.attempt < self.retries:
logging.info(f"Retrying request for URL '{self.url}' (attempt {self.attempt}/{self.retries})")
time.sleep(HTTP_DELAY)
else:
logging.error(f"Maximum retries reached for URL '{self.url}'")
raise RequestError(str(e))
@ -251,9 +325,7 @@ class ValidateRequest:
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or IP
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', re.IGNORECASE)
return re.match(url_regex, url) is not None
@staticmethod
@ -287,14 +359,14 @@ class SSLHandler:
ssl_context.verify_mode = ssl.CERT_NONE
class KwargsRequest():
"""Class representing keyword arguments for a request."""
class KwargsRequest(TypedDict, total = False):
url: str
headers: Optional[Dict[str, str]] = None
timeout: float = HTTP_TIMEOUT
retries: int = HTTP_RETRIES
params: Optional[Dict[str, str]] = None
cookies: Optional[Dict[str, str]] = None
json_data: Optional[Dict[str, Any]] = None
class Request:
@ -302,7 +374,7 @@ class Request:
def __init__(self) -> None:
pass
def get(self, url: str, **kwargs: Unpack[KwargsRequest]):
def get(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""
Send a GET request.
@ -315,7 +387,7 @@ class Request:
"""
return self._send_request(url, 'GET', **kwargs)
def post(self, url: str, **kwargs: Unpack[KwargsRequest]):
def post(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""
Send a POST request.
@ -328,35 +400,8 @@ class Request:
"""
return self._send_request(url, 'POST', **kwargs)
def put(self, url: str, **kwargs: Unpack[KwargsRequest]):
"""
Send a PUT request.
Args:
url (str): The URL to which the request will be sent.
**kwargs: Additional keyword arguments for the request.
Returns:
Response: The response object.
"""
return self._send_request(url, 'PUT', **kwargs)
def delete(self, url: str, **kwargs: Unpack[KwargsRequest]):
"""
Send a DELETE request.
Args:
url (str): The URL to which the request will be sent.
**kwargs: Additional keyword arguments for the request.
Returns:
Response: The response object.
"""
return self._send_request(url, 'DELETE', **kwargs)
def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]):
def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""Send an HTTP request."""
# Add validation checks for URL and headers
if not ValidateRequest.validate_url(url):
raise ValueError("Invalid URL format")
@ -365,6 +410,5 @@ class Request:
return ManageRequests(url, method, **kwargs).send()
# Out
request = Request()
requests: Request = Request()
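With these changes the shared wrapper accepts a json_data payload and injects a Mozilla/5.0 User-Agent whenever the caller does not set one. A hedged usage sketch (placeholder URLs, not part of the commit):

from Src.Lib.Request import requests

# GET with an explicit timeout; a default User-Agent is added automatically.
response = requests.get("https://example.com", timeout=3)
if response.status_code == 200:
    print(response.text)

# POST a JSON body through the new json_data keyword.
response = requests.post("https://example.com/api", json_data={"query": "test"})
print(response.json())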

View File

@ -0,0 +1,3 @@
# 21.04.24
from .user_agent import ua

View File

@ -7,10 +7,12 @@ import random
import threading
import json
import tempfile
from typing import Dict, List
# Internal utilities
from .my_requests import request
from ..Request import requests
def get_browser_user_agents_online(browser: str) -> List[str]:
@ -28,7 +30,7 @@ def get_browser_user_agents_online(browser: str) -> List[str]:
try:
# Make request and find all user agents
html = request.get(url).text
html = requests.get(url).text
browser_user_agents = re.findall(r"<a href=\'/.*?>(.+?)</a>", html, re.UNICODE)
return [ua for ua in browser_user_agents if "more" not in ua.lower()]
@ -103,4 +105,4 @@ class UserAgentManager:
# Output
ua = UserAgentManager()
ua: UserAgentManager = UserAgentManager()

View File

@ -1,13 +1,13 @@
# 01.03.2023
import os
import requests
import time
# Internal utilities
from .version import __version__
from Src.Util.console import console
from Src.Lib.Request import requests
# Variable

View File

@ -4,7 +4,7 @@ import logging
# Internal utilities
from Src.Lib.Request.user_agent import ua
from Src.Lib.UserAgent import ua
def get_headers() -> str:

Binary file not shown.