Update get_domain

2025-06-06 19:45:24 +00:00 · 2025-01-06 22:17:24 +01:00 · 2025-01-06 22:17:24 +01:00 · 7b94b17d2d
commit 7b94b17d2d
parent 5a091d94d7
12 changed files with 121 additions and 130 deletions
--- a/README.md
+++ b/README.md
@@ -406,7 +406,7 @@ The `run-container` command mounts also the `config.json` file, so any change to
 | [Ilcorsaronero](https://ilcorsaronero.link/) |   ✅   |
 | [CB01New](https://cb01new.quest/) |   ✅   |
 | [DDLStreamItaly](https://ddlstreamitaly.co/) |   ✅   |
-| [GuardaSerie](https://guardaserie.com/) |   ✅   |
+| [GuardaSerie](https://guardaserie.academy/) |   ✅   |
 | [MostraGuarda](https://mostraguarda.stream/) |   ✅   |
 | [StreamingCommunity](https://streamingcommunity.prof/) |   ✅   |
--- a/StreamingCommunity/Api/Site/1337xx/site.py
+++ b/StreamingCommunity/Api/Site/1337xx/site.py
@@ -43,7 +43,7 @@ def title_search(word_to_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    # Construct the full site URL and load the search page
    try:
--- a/StreamingCommunity/Api/Site/altadefinizione/site.py
+++ b/StreamingCommunity/Api/Site/altadefinizione/site.py
@@ -43,7 +43,7 @@ def title_search(title_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    # Send request to search for title
    client = httpx.Client()
--- a/StreamingCommunity/Api/Site/animeunity/site.py
+++ b/StreamingCommunity/Api/Site/animeunity/site.py
@@ -110,7 +110,7 @@ def title_search(title: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://www.{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://www.{SITE_NAME}.{DOMAIN_NOW}")
    data = get_token(SITE_NAME, domain_to_use)
--- a/StreamingCommunity/Api/Site/cb01new/site.py
+++ b/StreamingCommunity/Api/Site/cb01new/site.py
@@ -42,7 +42,7 @@ def title_search(word_to_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    response = httpx.get(
        url=f"https://{SITE_NAME}.{domain_to_use}/?s={word_to_search}",
--- a/StreamingCommunity/Api/Site/ddlstreamitaly/site.py
+++ b/StreamingCommunity/Api/Site/ddlstreamitaly/site.py
@@ -46,7 +46,7 @@ def title_search(word_to_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    # Send request to search for titles
    try:
--- a/StreamingCommunity/Api/Site/guardaserie/site.py
+++ b/StreamingCommunity/Api/Site/guardaserie/site.py
@@ -43,7 +43,7 @@ def title_search(word_to_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    # Send request to search for titles
    try:
--- a/StreamingCommunity/Api/Site/ilcorsaronero/site.py
+++ b/StreamingCommunity/Api/Site/ilcorsaronero/site.py
@@ -38,7 +38,7 @@ async def title_search(word_to_search: str) -> int:
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    # Create scraper and collect result
    print("\n")
--- a/StreamingCommunity/Api/Site/streamingcommunity/site.py
+++ b/StreamingCommunity/Api/Site/streamingcommunity/site.py
@@ -81,7 +81,7 @@ def get_version_and_domain():
    domain_to_use = DOMAIN_NOW
    if not disable_searchDomain:
-        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}")
+        domain_to_use, base_url = search_domain(SITE_NAME, f"https://{SITE_NAME}.{DOMAIN_NOW}")
    version = get_version(domain_to_use)
--- a/StreamingCommunity/Api/Template/Util/get_domain.py
+++ b/StreamingCommunity/Api/Template/Util/get_domain.py
@@ -1,5 +1,8 @@
 # 18.06.24
 import ssl
 import time
 import certifi
 from urllib.parse import urlparse, unquote
@@ -26,158 +29,146 @@ base_headers = {
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
-    'user-agent': None
+    'user-agent': ''
 }
 def get_tld(url_str):
-    """Extract the TLD (Top-Level Domain) from the URL without using external libraries."""
+    """Extract the TLD (Top-Level Domain) from the URL."""
-    url_str = unquote(url_str)
+    try:
-    
+        url_str = unquote(url_str)
-    parsed = urlparse(url_str)
+        parsed = urlparse(url_str)
-    domain = parsed.netloc.lower()
+        domain = parsed.netloc.lower()
-    if domain.startswith('www.'):
+        if domain.startswith('www.'):
-        domain = domain[4:]
+            domain = domain[4:]
-    
+        parts = domain.split('.')
-    parts = domain.split('.')
+        return parts[-1] if len(parts) >= 2 else None
-    
+    except Exception:
-    if len(parts) >= 2:
+        return None
        return parts[-1]
    return None
 def get_base_domain(url_str):
-    """Extract base domain without protocol, www and path"""
+    """Extract base domain without protocol, www and path."""
-    parsed = urlparse(url_str)
+    try:
-    domain = parsed.netloc.lower()
+        parsed = urlparse(url_str)
-    if domain.startswith('www.'):
+        domain = parsed.netloc.lower()
-        domain = domain[4:]
+        if domain.startswith('www.'):
-    return domain.split('.')[0]
+            domain = domain[4:]
        # Check if domain has multiple parts separated by dots
        parts = domain.split('.')
        if len(parts) > 2:  # Handle subdomains
            return '.'.join(parts[:-1])  # Return everything except TLD
        return parts[0]  # Return base domain
    except Exception:
        return None
 def validate_url(url, base_url, max_timeout, max_retries=5):
-    """
+    """Validate if URL is accessible and matches expected base domain."""
    Validate if URL is accessible and matches expected base domain, with retry mechanism for 403 errors.
    """
    console.print(f"\n[cyan]Starting validation for URL[white]: [yellow]{url}")
    # Verify URL structure matches base_url structure
    base_domain = get_base_domain(base_url)
    url_domain = get_base_domain(url)
    base_headers['user-agent'] = get_headers()
    if base_domain != url_domain:
        console.print(f"[red]Domain structure mismatch: {url_domain} != {base_domain}")
        return False, None
    # Count dots to ensure we don't have extra subdomains
    base_dots = base_url.count('.')
    url_dots = url.count('.')
    if url_dots > base_dots + 1:  # Allow for one extra dot for TLD change
        console.print(f"[red]Too many subdomains in URL")
        return False, None
-    def check_response(response, check_num):
+    client = httpx.Client(
-        if response.status_code == 403:
+        verify=certifi.where(),
-            console.print(f"[red]Check {check_num} failed: Access forbidden (403)")
+        headers=base_headers,
-            return False
+        timeout=max_timeout
-        if response.status_code >= 400:
+    )
            console.print(f"[red]Check {check_num} failed: HTTP {response.status_code}")
            return False
        console.print(f"[green]Check {check_num} passed: HTTP {response.status_code}")
        return True
-    retries = 0
+    for retry in range(max_retries):
    while retries < max_retries:
        try:
-            # Check 1: Initial request without following redirects
+            time.sleep(2)  # Add delay between retries
-            #console.print("[cyan]Performing initial connection check...")
+            
-            base_headers['user-agent'] = get_headers()
+            # Initial check without redirects
-
+            response = client.get(url, follow_redirects=False)
-            with httpx.Client(
+            if response.status_code == 403:
-                headers=base_headers,
+                console.print(f"[red]Check failed (403) - Attempt {retry + 1}/{max_retries}")
-                follow_redirects=False,
+                continue
-                timeout=max_timeout
+                
-            ) as client:
+            if response.status_code >= 400:
-                response = client.get(url)
+                console.print(f"[red]Check failed: HTTP {response.status_code}")
-                if not check_response(response, 1):
+                return False, None
-                    if response.status_code == 403:
+                
-                        retries += 1
+            # Follow redirects and verify final domain
-                        console.print(f"[yellow]Retrying... Attempt {retries}/{max_retries}")
+            final_response = client.get(url, follow_redirects=True)
-                        continue  # Retry on 403 error
+            final_domain = get_base_domain(str(final_response.url))
-                    return False, None
+            console.print(f"[cyan]Redirect url: [red]{final_response.url}")
-
+            
-            # Check 2: Follow redirects and verify final domain
+            if final_domain != base_domain:
-            #console.print("[cyan]Checking redirect destination...")
+                console.print(f"[red]Final domain mismatch: {final_domain} != {base_domain}")
-            with httpx.Client(
+                return False, None
-                headers=base_headers,
+                
-                follow_redirects=True,
+            new_tld = get_tld(str(final_response.url))
-                timeout=max_timeout
+            if new_tld != get_tld(url):
-            ) as client:
+                return True, new_tld
-                response = client.get(url)
+                
-                if not check_response(response, 2):
+            return True, None
-                    return False, None
+            
-
+        except (httpx.RequestError, ssl.SSLError) as e:
-                # Compare base domains
+            console.print(f"[red]Connection error: {str(e)}")
-                original_base = get_base_domain(url)
+            time.sleep(2)  # Add delay after error
-                final_base = get_base_domain(str(response.url))
+            continue
-
+            
                """console.print(f"[cyan]Comparing domains:")
                console.print(f"Original base domain: [yellow]{original_base}.{get_tld(str(url))}")
                console.print(f"Final base domain: [yellow]{final_base}.{get_tld(str(response.url))}")"""
                if original_base != final_base:
                    return False, None
                expected_base = get_base_domain(base_url)
                if final_base != expected_base:
                    return False, None
                if get_tld(str(url)) != get_tld(str(response.url)):
                    return True, get_tld(str(response.url))
                #console.print(f"[green]All checks passed: URL is valid and matches expected domain")
                return True, None
        except Exception as e:
            console.print(f"[red]Error during validation: {str(e)}")
            return False, None
    console.print(f"[red]Maximum retries reached for URL: {url}")
    return False, None
 def search_domain(site_name: str, base_url: str, get_first: bool = False):
-    """
+    """Search for valid domain matching site name and base URL."""
    Search for valid domain matching site name and base URL.
    """
    max_timeout = config_manager.get_int("REQUESTS", "timeout")
    domain = str(config_manager.get_dict("SITE", site_name)['domain'])
-
+    
    # Test initial URL
    try:
-        is_correct, redirect_tld = validate_url(base_url, base_url, max_timeout, max_retries=5)
+        is_correct, redirect_tld = validate_url(base_url, base_url, max_timeout)
        if is_correct and redirect_tld is not None:
            config_manager.config['SITE'][site_name]['domain'] = redirect_tld
            config_manager.write_config()
            console.print(f"[green]Successfully validated initial URL")
            return redirect_tld, base_url
        if is_correct:
-            parsed_url = urlparse(base_url)
+            tld = redirect_tld or get_tld(base_url)
            tld = parsed_url.netloc.split('.')[-1]
            config_manager.config['SITE'][site_name]['domain'] = tld
            config_manager.write_config()
            console.print(f"[green]Successfully validated initial URL")
            return tld, base_url
    except Exception as e:
        console.print(f"[red]Error testing initial URL: {str(e)}")
    # Google search phase
-    query = base_url.split("/")[-1]
+    base_domain = get_base_domain(base_url)
-    console.print(f"\n[cyan]Performing Google search for[white]: [yellow]{query}")
+    console.print(f"\n[cyan]Searching for alternate domains for[white]: [yellow]{base_domain}")
-    search_results = list(search(query, num_results=20, lang="it"))
+    
-
+    try:
-    for idx, result_url in enumerate(search_results, 1):
+        search_results = list(search(base_domain, num_results=20, lang="it"))
-        if get_base_domain(result_url) == get_base_domain(base_url):
+        filtered_results = [
-            console.print(f"\n[cyan]Checking Google result {idx}/20[white]: [yellow]{result_url}")
+            url for url in search_results 
-
+            if get_base_domain(url) == base_domain 
-            if validate_url(result_url, base_url, max_timeout):
+            and url.count('.') <= base_url.count('.') + 1
-                parsed_result = urlparse(result_url)
+        ]
                new_domain = parsed_result.netloc.split(".")[-1]
        for idx, result_url in enumerate(filtered_results, 1):
            console.print(f"\n[cyan]Checking result {idx}/{len(filtered_results)}[white]: [yellow]{result_url}")
            is_valid, new_tld = validate_url(result_url, base_url, max_timeout)
            if is_valid:
                final_tld = new_tld or get_tld(result_url)
                if get_first or msg.ask(
-                    f"\n[cyan]Do you want to update site[white] [red]'{site_name}'[cyan] with domain[white] [red]'{new_domain}'",
+                    f"\n[cyan]Update site[white] [red]'{site_name}'[cyan] with domain[white] [red]'{final_tld}'",
                    choices=["y", "n"],
                    default="y"
                ).lower() == "y":
-
+                    config_manager.config['SITE'][site_name]['domain'] = final_tld
                    config_manager.config['SITE'][site_name]['domain'] = new_domain
                    config_manager.write_config()
-                    return new_domain, f"{base_url}.{new_domain}"
+                    return final_tld, f"{base_url}.{final_tld}"
    except Exception as e:
        console.print(f"[red]Error during search: {str(e)}")
    console.print("[bold red]No valid URLs found matching the base URL.")
    return domain, f"{base_url}.{domain}"
--- a/Test/call_updateDomain.py
+++ b/Test/call_updateDomain.py
@@ -123,4 +123,4 @@ if __name__ == "__main__":
            update_readme(alias, domain_to_use)
            print("------------------------------------")
-            time.sleep(3)
+            time.sleep(2)
--- a/config.json
+++ b/config.json
@@ -64,7 +64,7 @@
            "domain": "prof"
        },
        "guardaserie": {
-            "domain": "com"
+            "domain": "academy"
        },
        "mostraguarda": {
            "domain": "stream"