Add automatic detection of new site domains.

Lovi 2024-06-18 23:37:55 +02:00
parent 678e39cc46
commit 6036bbeb20
7 changed files with 1707 additions and 73 deletions

View File

@@ -0,0 +1,192 @@
# 18.06.24

import os
import sys
import time
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# External libraries
import httpx
import psutil
from tqdm import tqdm

# Internal utilities
from Src.Util.color import Colors
from Src.Util.headers import get_headers
from Src.Util.console import console
from Src.Util._jsonConfig import config_manager


def check_url_for_content(url: str, content: str, timeout: int = 1) -> bool:
    """
    Check if a URL contains specific content.

    Args:
        - url (str): The URL to check.
        - content (str): The content to search for in the response.
        - timeout (int): Timeout for the request in seconds.

    Returns:
        bool: True if the content is found, False otherwise.
    """
    try:
        response = httpx.get(url, timeout=timeout, headers={'user-agent': get_headers()})
        logging.info(f"Testing site to extract domain: {url}, response: {response.status_code}")

        # Raise an error if the status is not successful
        response.raise_for_status()

        # Check if the target content is in the response text
        if content in response.text:
            return True

    except httpx.RequestError as e:
        logging.warning(f"Request error for {url}: {e}")
    except httpx.HTTPStatusError as e:
        logging.warning(f"HTTP status error for {url}: {e}")
    except Exception as e:
        logging.warning(f"Error for {url}: {e}")

    return False
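
# Hypothetical usage (URL and marker string are illustrative, not from the commit):
#   check_url_for_content("https://example.com", "<title>Example", timeout=2)
# Returns True only when the request succeeds (2xx) and the marker appears in the body.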
def get_top_level_domain(base_url: str, target_content: str, max_workers: int = os.cpu_count(), timeout: int = 2, retries: int = 1) -> str:
    """
    Find the top-level domain (TLD) by testing candidate URLs built from a TLD list.

    Args:
        - base_url (str): The base URL to construct complete URLs.
        - target_content (str): The content to search for in the response.
        - max_workers (int): Maximum number of threads.
        - timeout (int): Timeout for the request in seconds.
        - retries (int): Number of retries for failed requests.

    Returns:
        str: The found TLD, if any.
    """
    results = []
    failed_urls = []

    path_file = os.path.join("Test", "data", "TLD", "tld_list_complete.txt")
    logging.info(f"Loading file: {path_file}")

    if not os.path.exists(path_file):
        raise FileNotFoundError("The file 'tld_list_complete.txt' does not exist.")

    # Read TLDs from file and create URLs to test
    with open(path_file, "r") as file:
        urls = [f"{base_url}.{x.strip().lower()}" for x in file]
    urls = list(set(urls))  # Remove duplicates

    start_time = time.time()
    bar_format = f"{Colors.YELLOW}Testing URLS{Colors.WHITE}: {Colors.RED}{{percentage:.2f}}% {Colors.MAGENTA}{{bar}} {Colors.WHITE}[ {Colors.YELLOW}{{n_fmt}}{Colors.WHITE} / {Colors.RED}{{total_fmt}} {Colors.WHITE}] {Colors.YELLOW}{{elapsed}} {Colors.WHITE}< {Colors.CYAN}{{remaining}}{Colors.GREEN}{{postfix}} {Colors.WHITE}]"
    progress_bar = tqdm(
        total=len(urls),
        unit='url',
        ascii='░▒█',
        bar_format=bar_format
    )

    # Event to signal when to stop checking URLs
    stop_event = threading.Event()

    def url_checker(url: str):
        for attempt in range(retries):
            if stop_event.is_set():
                return None

            if check_url_for_content(url, target_content, timeout):
                stop_event.set()
                progress_bar.update(1)
                return url.split(".")[-1]

            logging.info(f"Retrying {url} ({attempt+1}/{retries})")

        failed_urls.append(url)
        progress_bar.update(1)
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(url_checker, url): url for url in urls}

        for future in as_completed(futures):
            tld = future.result()
            if tld:
                results.append(tld)
                if stop_event.is_set():
                    break

            # Update the progress bar with CPU usage info
            progress_bar.set_postfix(cpu_usage=f"{psutil.cpu_percent()}%")

    progress_bar.close()

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_url = total_time / len(urls) if urls else 0

    logging.info(f"Tested {len(urls)} URLs: {len(results)} passed, {len(failed_urls)} failed.")
    logging.info(f"Total time: {total_time:.2f} seconds, Average time per URL: {avg_time_per_url:.2f} seconds.")

    if results:
        return results[-1]
    else:
        return None


def search_domain(site_name: str, target_content: str, base_url: str):
    """
    Search for a valid domain for the given site name and base URL.

    Args:
        - site_name (str): The name of the site to search the domain for.
        - target_content (str): The content to search for in the response.
        - base_url (str): The base URL to construct complete URLs.

    Returns:
        tuple: The found domain and the complete URL.
    """
    # Extract config domain
    domain = config_manager.get("SITE", site_name)
    console.print(f"[cyan]Test site[white]: [red]{base_url}.{domain}")

    try:
        # Test the current domain
        response = httpx.get(f"{base_url}.{domain}", headers={'user-agent': get_headers()}, timeout=2)
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code}")
        response.raise_for_status()

        # Return config domain
        console.print(f"[cyan]Use domain: [red]{domain}")
        return domain, f"{base_url}.{domain}"

    except Exception:
        # If the current domain fails, find a new one
        print()
        console.print("[red]Extract new DOMAIN from TLD list.")
        new_domain = get_top_level_domain(base_url=base_url, target_content=target_content)

        if new_domain is not None:
            # Update domain in config.json
            config_manager.set_key('SITE', site_name, new_domain)
            config_manager.write_config()

            # Return new config domain
            console.print(f"[cyan]Use domain: [red]{new_domain}")
            return new_domain, f"{base_url}.{new_domain}"

        else:
            logging.error(f"Failed to find a new domain for: {base_url}")
            sys.exit(0)

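The core of get_top_level_domain above is a cancel-on-first-hit thread pool: every worker shares a threading.Event, the first worker that finds the target content sets it, and all remaining workers return immediately. A minimal standalone sketch of that pattern (function and variable names here are illustrative, not part of the commit):

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

def find_first(candidates, predicate, max_workers=8):
    stop_event = threading.Event()

    def worker(item):
        # Bail out early once another thread has already found a match
        if stop_event.is_set():
            return None
        if predicate(item):
            stop_event.set()
            return item
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(worker, c) for c in candidates]
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                return result
    return None

# Example: find_first(["com", "net", "org"], lambda tld: tld == "net") returns "net"

Note that returning from inside the with block still waits for the pool to shut down, but pending workers exit almost immediately because they check the event before doing any network work.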
View File

@@ -0,0 +1 @@
from .Util.get_domain import search_domain

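The one-line __init__ above re-exports the helper so each site package can reach it with a relative import, exactly as the diffs below do:

from ..Template import search_domain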
View File

@@ -11,9 +11,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.console import console
from Src.Util.headers import get_headers
from Src.Util.console import console
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -37,9 +38,12 @@ def title_search(title_search: str) -> int:
    Returns:
        int: The number of titles found.
    """
    # Find a new domain if the previous one doesn't work
    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="generator" content="altadefinizione">', f"https://{SITE_NAME}")

    # Send request to search for titles
    response = httpx.get(f"https://{SITE_NAME}.{DOMAIN_NOW}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
    response = httpx.get(f"https://{SITE_NAME}.{domain_to_use}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
    response.raise_for_status()

    # Create soup and find table

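The search query above is normalized with unidecode before being sent, so accented titles still match the site's ASCII index. A quick illustration with a hypothetical title:

from unidecode import unidecode

title_search = "Città incantata"
query = unidecode(title_search.replace(' ', '+'))
print(query)  # prints "Citta+incantata"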
View File

@@ -11,9 +11,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.console import console
from Src.Util._jsonConfig import config_manager
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -65,28 +66,6 @@ def get_token(site_name: str, domain: str) -> dict:
    }

def update_domain():
    """
    Update the domain for the anime streaming site.

    This function tests the accessibility of the current anime streaming site.
    If the current domain is inaccessible, it attempts to obtain and set a new domain.
    It uses the 'light' method to extract a new domain from Anime Unity.
    """

    # Test current site's accessibility
    try:
        console.log(f"[cyan]Test site: [red]https://{SITE_NAME}.{DOMAIN_NOW}")
        response = httpx.get(f"https://www.{SITE_NAME}.{DOMAIN_NOW}")
        response.raise_for_status()

    except Exception as e:
        console.log("[red]Update domain")
        sys.exit(0)
def get_real_title(record):
    """
    Get the real title from a record.
@@ -122,12 +101,9 @@ def title_search(title: str) -> int:
        - int: A number containing the length of media search manager.
    """

    # Update domain
    update_domain()

    # Get token and session value from configuration
    url_domain = config_manager.get('SITE', SITE_NAME)
    data = get_token(SITE_NAME, url_domain)
    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="author" content="AnimeUnity Staff">', f"https://www.{SITE_NAME}")
    data = get_token(SITE_NAME, domain_to_use)

    # Prepare cookies to be used in the request
    cookies = {
@@ -148,7 +124,7 @@ def title_search(title: str) -> int:
    }

    # Send a POST request to the API endpoint for live search
    response = httpx.post(f'https://www.{SITE_NAME}.{url_domain}/livesearch', cookies=cookies, headers=headers, json=json_data)
    response = httpx.post(f'https://www.{SITE_NAME}.{domain_to_use}/livesearch', cookies=cookies, headers=headers, json=json_data)
    response.raise_for_status()

    # Process each record returned in the response

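Note that for AnimeUnity the base URL handed to search_domain carries the www. prefix, matching the livesearch request above. Schematically (SITE_NAME is the module constant; the marker string is taken from the diff):

domain_to_use, full_url = search_domain(
    SITE_NAME,
    '<meta name="author" content="AnimeUnity Staff">',
    f"https://www.{SITE_NAME}"
)
# full_url is "https://www.<site>.<tld>", ready for the /livesearch POST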
View File

@@ -15,9 +15,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.headers import get_headers
from Src.Util._jsonConfig import config_manager
from Src.Util.console import console
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -75,45 +76,15 @@ def get_version(text: str) -> tuple[str, list]:
        raise

def get_version_and_domain(new_domain = None) -> Tuple[str, str]:
    """
    Retrieves the version and domain of the streaming website.
def get_version_and_domain() -> Tuple[str, str]:

    This function retrieves the version and domain of the streaming website.
    It first checks the accessibility of the current site.
    If the site is accessible, it extracts the version from the response.
    If configured to do so, it also scrapes and prints the titles of the moments.
    If the site is inaccessible, it attempts to obtain a new domain using the 'insta' method.

    # Find a new domain if the previous one doesn't work
    domain_to_use, base_url = search_domain(SITE_NAME, '<meta name="author" content="StreamingCommunity">', f"https://{SITE_NAME}")

    Returns:
        Tuple[str, str]: A tuple containing the version and domain.
    """

    # Get the current domain from the configuration
    if new_domain is None:
        config_domain = config_manager.get('SITE', SITE_NAME)
    else:
        config_domain = new_domain

    # Extract version from the response
    version, list_title_top_10 = get_version(httpx.get(base_url, headers={'user-agent': get_headers()}).text)

    # Test the accessibility of the current site
    try:
        # Make requests to site to get text
        console.print(f"[cyan]Test site[white]: [red]https://{SITE_NAME}.{config_domain}")
        response = httpx.get(f"https://{SITE_NAME}.{config_domain}")
        response.raise_for_status()
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code} \n")

        # Extract version from the response
        version, list_title_top_10 = get_version(response.text)

        return version, config_domain

    except:
        console.log("[red]Update domain.")
        sys.exit(0)

    return version, domain_to_use

def title_search(title_search: str, domain: str) -> int:

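After this refactor the caller no longer validates the domain itself; search_domain does that. A sketch of the resulting call flow (assuming get_version still parses the homepage HTML as before):

version, domain = get_version_and_domain()
# domain has already been validated, or replaced and persisted to config.json,
# so it can be passed straight to title_search(query, domain)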
View File

@@ -1,12 +1,11 @@
# 29.04.24
import httpx
import json
from bs4 import BeautifulSoup
# URL of the webpage containing the table
url = 'https://icannwiki.org/New_gTLD_Generic_Applications'
url = 'https://icannwiki.org/All_New_gTLD_Applications'
# List to store scraped data
@@ -78,8 +77,9 @@ def main():
    print(len(data))

    # Write the scraped data to a JSON file
    with open('data.json', 'w') as json_file:
        json.dump(data, json_file)
    # Write the scraped application IDs to a text file, one per line
    with open('data.txt', 'w') as json_file:
        for find_tld in data:
            json_file.write(find_tld['application_id'] + "\n")
if __name__ == '__main__':

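The scraper now emits one application_id per line instead of a JSON dump. A sketch of consuming such a file the way get_top_level_domain reads Test/data/TLD/tld_list_complete.txt (whether data.txt is what populates that file is an assumption):

base_url = "https://example-site"  # hypothetical base URL
with open('data.txt', 'r') as file:
    urls = sorted({f"{base_url}.{line.strip().lower()}" for line in file if line.strip()})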
File diff suppressed because it is too large.