From 3727b5dea79f6e9505875c9ff6e135622927ef4e Mon Sep 17 00:00:00 2001
From: Lovi <62809003+Lovi-0@users.noreply.github.com>
Date: Fri, 3 Jan 2025 11:51:28 +0100
Subject: [PATCH] Update get_domain

---
 StreamingCommunity/Api/Site/1337xx/site.py    |   5 +-
 .../Api/Site/altadefinizione/site.py          |  30 ++-
 .../Api/Site/animeunity/site.py               |  26 ++-
 StreamingCommunity/Api/Site/cb01new/site.py   |  23 +-
 .../Api/Site/ddlstreamitaly/site.py           |   4 +-
 .../Api/Site/guardaserie/site.py              |   6 +-
 .../Api/Site/ilcorsaronero/site.py            |  23 +-
 .../Api/Site/streamingcommunity/site.py       |  22 +-
 .../Api/Template/Util/get_domain.py           | 209 ++++++------------
 Test/call_updateDomain.py                     |   5 +-
 10 files changed, 152 insertions(+), 201 deletions(-)

diff --git a/StreamingCommunity/Api/Site/1337xx/site.py b/StreamingCommunity/Api/Site/1337xx/site.py
index 31b375b..69192f8 100644
--- a/StreamingCommunity/Api/Site/1337xx/site.py
+++ b/StreamingCommunity/Api/Site/1337xx/site.py
@@ -57,7 +57,6 @@ def title_search(word_to_search: str) -> int:
 
     # Create soup and find table
     soup = BeautifulSoup(response.text, "html.parser")
-    # Scrape div film in table on single page
     for tr in soup.find_all('tr'):
         try:
 
@@ -72,8 +71,8 @@ def title_search(word_to_search: str) -> int:
 
             media_search_manager.add_media(title_info)
 
-        except:
-            continue
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/altadefinizione/site.py b/StreamingCommunity/Api/Site/altadefinizione/site.py
index 70fc10f..227831a 100644
--- a/StreamingCommunity/Api/Site/altadefinizione/site.py
+++ b/StreamingCommunity/Api/Site/altadefinizione/site.py
@@ -62,21 +62,27 @@ def title_search(title_search: str) -> int:
 
     # Create soup and find table
     soup = BeautifulSoup(response.text, "html.parser")
-    table_content = soup.find('div', id="dle-content")
 
-    # Scrape div film in table on single page
-    for film_div in table_content.find_all('div', class_='col-lg-3'):
-        title = film_div.find('h2', class_='titleFilm').get_text(strip=True)
-        link = film_div.find('h2', class_='titleFilm').find('a')['href']
-        imdb_rating = film_div.find('div', class_='imdb-rate').get_text(strip=True).split(":")[-1]
+    for row in soup.find_all('div', class_='col-lg-3 col-md-3 col-xs-4'):
+        try:
+
+            title_element = row.find('h2', class_='titleFilm').find('a')
+            title = title_element.get_text(strip=True)
+            link = title_element['href']
 
-        film_info = {
-            'name': title,
-            'url': link,
-            'score': imdb_rating
-        }
+            imdb_element = row.find('div', class_='imdb-rate')
+            imdb_rating = imdb_element.get_text(strip=True).split(":")[-1]
 
-        media_search_manager.add_media(film_info)
+            film_info = {
+                'name': title,
+                'url': link,
+                'score': imdb_rating
+            }
+
+            media_search_manager.add_media(film_info)
+
+        except AttributeError as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/animeunity/site.py b/StreamingCommunity/Api/Site/animeunity/site.py
index 514e455..f37a20f 100644
--- a/StreamingCommunity/Api/Site/animeunity/site.py
+++ b/StreamingCommunity/Api/Site/animeunity/site.py
@@ -140,21 +140,23 @@ def title_search(title: str) -> int:
     except Exception as e:
         console.print(f"Site: {SITE_NAME}, request search error: {e}")
 
-    # Process each record returned in the response
     for dict_title in response.json()['records']:
+        try:
 
-        # Rename keys for consistency
-        dict_title['name'] = get_real_title(dict_title)
+            dict_title['name'] = get_real_title(dict_title)
 
-        # Add the record to media search manager if the name is not None
-        media_search_manager.add_media({
-            'id': dict_title.get('id'),
-            'slug': dict_title.get('slug'),
-            'name': dict_title.get('name'),
-            'type': dict_title.get('type'),
-            'score': dict_title.get('score'),
-            'episodes_count': dict_title.get('episodes_count')
-        })
+            media_search_manager.add_media({
+                'id': dict_title.get('id'),
+                'slug': dict_title.get('slug'),
+                'name': dict_title.get('name'),
+                'type': dict_title.get('type'),
+                'score': dict_title.get('score'),
+                'episodes_count': dict_title.get('episodes_count')
+            })
+
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the length of media search manager
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/cb01new/site.py b/StreamingCommunity/Api/Site/cb01new/site.py
index ddaa40a..559e232 100644
--- a/StreamingCommunity/Api/Site/cb01new/site.py
+++ b/StreamingCommunity/Api/Site/cb01new/site.py
@@ -50,20 +50,23 @@ def title_search(word_to_search: str) -> int:
 
     # Create soup and find table
     soup = BeautifulSoup(response.text, "html.parser")
-    # For all element in table
    for div in soup.find_all("div", class_ = "card-content"):
+        try:
 
-        url = div.find("h3").find("a").get("href")
-        title = div.find("h3").find("a").get_text(strip=True)
-        desc = div.find("p").find("strong").text
+            url = div.find("h3").find("a").get("href")
+            title = div.find("h3").find("a").get_text(strip=True)
+            desc = div.find("p").find("strong").text
 
-        title_info = {
-            'name': title,
-            'desc': desc,
-            'url': url
-        }
+            title_info = {
+                'name': title,
+                'desc': desc,
+                'url': url
+            }
 
-        media_search_manager.add_media(title_info)
+            media_search_manager.add_media(title_info)
+
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/ddlstreamitaly/site.py b/StreamingCommunity/Api/Site/ddlstreamitaly/site.py
index 0caaffc..aa429aa 100644
--- a/StreamingCommunity/Api/Site/ddlstreamitaly/site.py
+++ b/StreamingCommunity/Api/Site/ddlstreamitaly/site.py
@@ -75,9 +75,9 @@ def title_search(word_to_search: str) -> int:
             }
 
             media_search_manager.add_media(title_info)
-            
+
         except Exception as e:
-            logging.error(f"Error processing title div: {e}")
+            print(f"Error parsing a film entry: {e}")
 
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/guardaserie/site.py b/StreamingCommunity/Api/Site/guardaserie/site.py
index 982b4aa..e1a12f7 100644
--- a/StreamingCommunity/Api/Site/guardaserie/site.py
+++ b/StreamingCommunity/Api/Site/guardaserie/site.py
@@ -58,8 +58,8 @@ def title_search(word_to_search: str) -> int:
     table_content = soup.find('div', class_="mlnew-list")
 
     for serie_div in table_content.find_all('div', class_='mlnew'):
-
         try:
+
             title = serie_div.find('div', class_='mlnh-2').find("h2").get_text(strip=True)
             link = serie_div.find('div', class_='mlnh-2').find('a')['href']
             imdb_rating = serie_div.find('span', class_='mlnh-imdb').get_text(strip=True)
@@ -72,8 +72,8 @@ def title_search(word_to_search: str) -> int:
 
             media_search_manager.add_media(serie_info)
 
-        except:
-            pass
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/ilcorsaronero/site.py b/StreamingCommunity/Api/Site/ilcorsaronero/site.py
index 210ef91..a9458c6 100644
--- a/StreamingCommunity/Api/Site/ilcorsaronero/site.py
+++ b/StreamingCommunity/Api/Site/ilcorsaronero/site.py
@@ -39,18 +39,21 @@ async def title_search(word_to_search: str) -> int:
 
     scraper = IlCorsaroNeroScraper(f"https://{SITE_NAME}.{domain_to_use}/", 1)
     results = await scraper.search(word_to_search)
 
-    # Add all result to media manager
     for i, torrent in enumerate(results):
-        media_search_manager.add_media({
-            'name': torrent['name'],
-            'type': torrent['type'],
-            'seed': torrent['seed'],
-            'leech': torrent['leech'],
-            'size': torrent['size'],
-            'date': torrent['date'],
-            'url': torrent['url']
-        })
+        try:
+
+            media_search_manager.add_media({
+                'name': torrent['name'],
+                'type': torrent['type'],
+                'seed': torrent['seed'],
+                'leech': torrent['leech'],
+                'size': torrent['size'],
+                'date': torrent['date'],
+                'url': torrent['url']
+            })
+
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Site/streamingcommunity/site.py b/StreamingCommunity/Api/Site/streamingcommunity/site.py
index ecb549e..3a9b4af 100644
--- a/StreamingCommunity/Api/Site/streamingcommunity/site.py
+++ b/StreamingCommunity/Api/Site/streamingcommunity/site.py
@@ -116,16 +116,20 @@ def title_search(title_search: str, domain: str) -> int:
     except Exception as e:
         console.print(f"Site: {SITE_NAME}, request search error: {e}")
 
-    # Add found titles to media search manager
     for dict_title in response.json()['data']:
-        media_search_manager.add_media({
-            'id': dict_title.get('id'),
-            'slug': dict_title.get('slug'),
-            'name': dict_title.get('name'),
-            'type': dict_title.get('type'),
-            'date': dict_title.get('last_air_date'),
-            'score': dict_title.get('score')
-        })
+        try:
+
+            media_search_manager.add_media({
+                'id': dict_title.get('id'),
+                'slug': dict_title.get('slug'),
+                'name': dict_title.get('name'),
+                'type': dict_title.get('type'),
+                'date': dict_title.get('last_air_date'),
+                'score': dict_title.get('score')
+            })
+
+        except Exception as e:
+            print(f"Error parsing a film entry: {e}")
 
     # Return the number of titles found
     return media_search_manager.get_length()
diff --git a/StreamingCommunity/Api/Template/Util/get_domain.py b/StreamingCommunity/Api/Template/Util/get_domain.py
index 2e7dd73..04e1553 100644
--- a/StreamingCommunity/Api/Template/Util/get_domain.py
+++ b/StreamingCommunity/Api/Template/Util/get_domain.py
@@ -1,6 +1,5 @@
 # 18.06.24
 
-import sys
 from urllib.parse import urlparse
 
 
@@ -15,43 +14,34 @@ from StreamingCommunity.Util.console import console, msg
 from StreamingCommunity.Util._jsonConfig import config_manager
 
 
-def google_search(query):
-    """
-    Perform a Google search and return the first result.
-
-    Args:
-        query (str): Search query to execute
-
-    Returns:
-        str: First URL from search results, None if no results found
-    """
+def get_base_domain(url_str):
+    """Extract base domain without protocol, www and path"""
+    parsed = urlparse(url_str)
+    domain = parsed.netloc.lower()
+    if domain.startswith('www.'):
+        domain = domain[4:]
+    return domain.split('.')[0]
 
-    # Perform search with single result limit
-    search_results = search(query, num_results=1)
-    first_result = next(search_results, None)
+
+def validate_url(url, base_url, max_timeout):
+    """
+    Validate if URL is accessible and matches expected base domain
+    """
+    console.print(f"\n[cyan]Starting validation for URL[white]: [yellow]{url}")
 
-    if not first_result:
-        console.print("[red]No results found.[/red]")
-
-    return first_result
+    def check_response(response, check_num):
+        if response.status_code == 403:
+            console.print(f"[red]Check {check_num} failed: Access forbidden (403)")
+            return False
+        if response.status_code >= 400:
+            console.print(f"[red]Check {check_num} failed: HTTP {response.status_code}")
+            return False
+        console.print(f"[green]Check {check_num} passed: HTTP {response.status_code}")
+        return True
 
-def validate_url(url, max_timeout):
-    """
-    Validate if a URL is accessible and check if its redirect destination is significantly different.
-
-    Args:
-        url (str): URL to validate
-        max_timeout (int): Maximum timeout for request
-
-    Returns:
-        bool: True if URL is valid, accessible and redirect destination is acceptable
-    """
-    def get_domain_parts(url_str):
-        parsed = urlparse(url_str)
-        return parsed.netloc.lower().split('.')[-2:]  # Get last two parts of domain
-
     try:
-        # First check without following redirects
+
+        # Check 1: Initial request without following redirects
+        console.print("[cyan]Performing initial connection check...")
         with httpx.Client(
             headers={
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                 'accept-language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
                 'User-Agent': get_headers()
             },
             timeout=max_timeout
         ) as client:
             response = client.get(url)
-            if response.status_code == 403:
+            if not check_response(response, 1):
                 return False
-            response.raise_for_status()
-
-        # Then check with redirects enabled
+
+        # Check 2: Follow redirects and verify final domain
+        console.print("[cyan]Checking redirect destination...")
         with httpx.Client(
             headers={
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                 'accept-language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7',
                 'User-Agent': get_headers()
             },
             follow_redirects=True,
             timeout=max_timeout
         ) as client:
             response = client.get(url)
-            if response.status_code == 403:
+            if not check_response(response, 2):
                 return False
-            response.raise_for_status()
 
-            # Compare original and final URLs
-            original_domain = get_domain_parts(url)
-            final_domain = get_domain_parts(str(response.url))
+            # Compare base domains
+            original_base = get_base_domain(url)
+            final_base = get_base_domain(str(response.url))
 
-            # Check if domains are significantly different
-            if original_domain != final_domain:
-                console.print(f"[yellow]Warning: URL redirects to different domain: {response.url}[/yellow]")
+            console.print(f"[cyan]Comparing domains:")
+            console.print(f"Original base domain: [yellow]{original_base}")
+            console.print(f"Final base domain: [yellow]{final_base}")
+
+            if original_base != final_base:
+                console.print(f"[red]Domain mismatch: Redirected to different base domain")
+                return False
+
+            # Verify against expected base_url
+            expected_base = get_base_domain(base_url)
+            if final_base != expected_base:
console.print(f"[red]Domain mismatch: Final domain does not match expected base URL") + console.print(f"Expected: [yellow]{expected_base}") return False + console.print(f"[green]All checks passed: URL is valid and matches expected domain") return True - except Exception: - return False - -def get_final_redirect_url(initial_url, max_timeout): - """ - Follow all redirects for a URL and return final destination. - - Args: - initial_url (str): Starting URL to follow redirects from - max_timeout (int): Maximum timeout for request - - Returns: - str: Final URL after all redirects, None if error occurs - """ - try: - with httpx.Client( - headers={ - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'accept-language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7', - 'User-Agent': get_headers() - }, - follow_redirects=True, - timeout=max_timeout - ) as client: - - # Follow redirects and get response - response = client.get(initial_url) - - if response.status_code == 403: - console.print("[bold red]The owner of this website has banned your IP[/bold red]") - raise - - response.raise_for_status() - return response.url - except Exception as e: - console.print(f"\n[cyan]Test url[white]: [red]{initial_url}, [cyan]error[white]: [red]{e}") - return None + console.print(f"[red]Error during validation: {str(e)}") + return False def search_domain(site_name: str, base_url: str, get_first: bool = False): """ Search for valid domain matching site name and base URL. - - Args: - site_name (str): Name of site to find domain for - base_url (str): Base URL to construct complete URLs - get_first (bool): Auto-update config with first valid match if True - - Returns: - tuple: (found_domain, complete_url) """ - # Get configuration values max_timeout = config_manager.get_int("REQUESTS", "timeout") domain = str(config_manager.get_dict("SITE", site_name)['domain']) test_url = f"{base_url}.{domain}" + console.print(f"\n[cyan]Testing initial URL[white]: [yellow]{test_url}") + try: - if validate_url(test_url, max_timeout): + if validate_url(test_url, base_url, max_timeout): parsed_url = urlparse(test_url) tld = parsed_url.netloc.split('.')[-1] config_manager.config['SITE'][site_name]['domain'] = tld config_manager.write_config() + console.print(f"[green]Successfully validated initial URL") return tld, test_url + except Exception as e: + console.print(f"[red]Error testing initial URL: {str(e)}") - except Exception: - pass - - # Perform Google search if current domain fails + # Google search phase query = base_url.split("/")[-1] + console.print(f"\n[cyan]Performing Google search for[white]: [yellow]{query}") search_results = list(search(query, num_results=15, lang="it")) - console.print(f"Google search: {search_results}") - - def normalize_for_comparison(url): - """Normalize URL by removing protocol, www, and trailing slashes""" - url = url.lower() - url = url.replace("https://", "").replace("http://", "") - url = url.replace("www.", "") - return url.rstrip("/") - - target_url = normalize_for_comparison(base_url) - - # Check each search result - for result_url in search_results: - #console.print(f"[green]Checking url[white]: [red]{result_url}") - - # Skip invalid URLs - if not validate_url(result_url, max_timeout): - #console.print(f"[red]URL validation failed for: {result_url}") - continue - - parsed_result = urlparse(result_url) - result_domain = normalize_for_comparison(parsed_result.netloc) - - # Check if domain matches target - if 
-        if result_domain.startswith(target_url.split("/")[-1]):
-            final_url = get_final_redirect_url(result_url, max_timeout)
+
+    for idx, result_url in enumerate(search_results, 1):
+        console.print(f"\n[cyan]Checking Google result {idx}/15[white]: [yellow]{result_url}")
+
+        if validate_url(result_url, base_url, max_timeout):
+            parsed_result = urlparse(result_url)
+            new_domain = parsed_result.netloc.split(".")[-1]
 
-            if final_url is not None:
-                new_domain = urlparse(str(final_url)).netloc.split(".")[-1]
+            if get_first or msg.ask(
+                f"\n[cyan]Do you want to update site[white] [red]'{site_name}'[cyan] with domain[white] [red]'{new_domain}'",
+                choices=["y", "n"],
+                default="y"
+            ).lower() == "y":
+                config_manager.config['SITE'][site_name]['domain'] = new_domain
+                config_manager.write_config()
+                return new_domain, f"{base_url}.{new_domain}"
 
-                # Update config if auto-update enabled or user confirms
-                if get_first or msg.ask(
-                    f"\n[cyan]Do you want to auto update site[white] [red]'{site_name}'[cyan] with domain[white] [red]'{new_domain}'.",
-                    choices=["y", "n"],
-                    default="y"
-                ).lower() == "y":
-                    config_manager.config['SITE'][site_name]['domain'] = new_domain
-                    config_manager.write_config()
-                    return new_domain, f"{base_url}.{new_domain}"
-
-    # Return original domain if no valid matches found
-    console.print("[bold red]No valid URL found matching the base URL.[/bold red]")
+    console.print("[bold red]No valid URLs found matching the base URL.")
     return domain, f"{base_url}.{domain}"
\ No newline at end of file
diff --git a/Test/call_updateDomain.py b/Test/call_updateDomain.py
index 4e9ccba..c8e7557 100644
--- a/Test/call_updateDomain.py
+++ b/Test/call_updateDomain.py
@@ -79,7 +79,7 @@ def load_site_names():
     return site_names
 
 
-def update_readme(site_names):
+def update_readme(site_names, domain_to_use):
     if not os.path.exists(README_PATH):
         console.print(f"[red]README file not found at {README_PATH}")
         return
@@ -95,7 +95,6 @@ def update_readme(site_names):
             alias = f"{site_name.lower()}"
 
             if alias in site_names:
-                domain_to_use, _ = search_domain(site_name=alias, base_url=f"https://{alias}", get_first=True)
                 print("Update line: ", line)
 
                 if site_name == "animeunity":
@@ -126,4 +125,4 @@ if __name__ == "__main__":
     # Update readme
     print("\n")
     print("Return domain: ", domain_to_use)
-    update_readme(alias)
\ No newline at end of file
+    update_readme(alias, domain_to_use)
\ No newline at end of file