github: Update domains

Lovi committed 2025-05-31 11:28:38 +02:00
parent 884bcf656c
commit 1776538c6c
2 changed files with 180 additions and 169 deletions

File: .github/.domain/domain_update.py

@@ -1,17 +1,14 @@
-# 20.04.2024
-import os
 import re
 import time
+import os
 import json
 from datetime import datetime
 from urllib.parse import urlparse, urlunparse
 import httpx
 import ua_generator
 JSON_FILE_PATH = os.path.join(".github", ".domain", "domains.json")
@@ -50,69 +47,137 @@ def get_new_tld(full_url):
     return None

-def try_url_with_retries(url_to_try, headers, timeout=15, retries=3, backoff_factor=0.5):
-    for attempt in range(retries):
-        try:
-            with httpx.Client(headers=headers, timeout=timeout, follow_redirects=True) as client:
-                response = client.get(url_to_try)
-                response.raise_for_status()
-                return response
-        except (httpx.TimeoutException, httpx.ConnectError) as e:
-            print(f" [!] Attempt {attempt + 1}/{retries} for {url_to_try}: Network error ({type(e).__name__}). Retrying in {backoff_factor * (2 ** attempt)}s...")
-            if attempt + 1 == retries:
-                print(f" [!] Failed all {retries} attempts for {url_to_try} due to {type(e).__name__}.")
-                return None
-            time.sleep(backoff_factor * (2 ** attempt))
-        except httpx.HTTPStatusError as http_err:
-            if http_err.response.status_code in [403, 429, 503]:
-                print(f" [!] HTTP error {http_err.response.status_code} for {url_to_try}. Suspected Cloudflare, checking for <base href>...")
-                try:
-                    with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as cf_client:
-                        cf_page_response = cf_client.get(url_to_try)
-                        if cf_page_response.status_code != http_err.response.status_code and not (200 <= cf_page_response.status_code < 300):
-                            cf_page_response.raise_for_status()
-                        match = re.search(r'<base\s+href="([^"]+)"', cf_page_response.text, re.IGNORECASE)
-                        if match:
-                            base_href_url = match.group(1)
-                            parsed_base_href = urlparse(base_href_url)
-                            if not parsed_base_href.scheme or not parsed_base_href.netloc:
-                                original_parsed_url = urlparse(url_to_try)
-                                base_href_url = urlunparse(original_parsed_url._replace(path=base_href_url if base_href_url.startswith('/') else '/' + base_href_url, query='', fragment=''))
-                            print(f" [+] Found <base href>: {base_href_url}")
-                            try:
-                                print(f" [] Attempting request to <base href> URL: {base_href_url}")
-                                with httpx.Client(headers=headers, timeout=timeout, follow_redirects=True) as base_client:
-                                    final_response_from_base = base_client.get(base_href_url)
-                                    final_response_from_base.raise_for_status()
-                                    print(f" [+] Successfully fetched from <base href> URL.")
-                                    return final_response_from_base
-                            except httpx.RequestError as base_req_e:
-                                print(f" [!] Error requesting <base href> URL {base_href_url}: {base_req_e}")
-                                return None
-                        else:
-                            print(f" [!] No <base href> found in page content for {url_to_try}.")
-                            return None
-                except httpx.RequestError as cf_req_e:
-                    print(f" [!] Error fetching Cloudflare-like page content for {url_to_try}: {cf_req_e}")
-                    return None
-            else:
-                print(f" [!] HTTP error {http_err.response.status_code} for {url_to_try}. No retry.")
-                return None
-        except httpx.RequestError as e:
-            print(f" [!] Generic error for {url_to_try}: {e}. No retry.")
-            return None
+def extract_domain_from_response(response, original_url):
+    if 'location' in response.headers:
+        return response.headers['location']
+
+    if str(response.url) != original_url:
+        return str(response.url)
+
+    try:
+        content_type = response.headers.get('content-type', '').lower()
+        if 'text/html' in content_type or 'text/plain' in content_type:
+            response_text = response.text
+
+            js_redirect_patterns = [
+                r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
+                r'window\.location\s*=\s*["\']([^"\']+)["\']',
+                r'location\.href\s*=\s*["\']([^"\']+)["\']',
+                r'document\.location\s*=\s*["\']([^"\']+)["\']'
+            ]
+            for pattern in js_redirect_patterns:
+                js_match = re.search(pattern, response_text, re.IGNORECASE)
+                if js_match:
+                    return js_match.group(1)
+
+            meta_patterns = [
+                r'<meta[^>]*http-equiv=["\']?refresh["\']?[^>]*content=["\'][^"\']*url=([^"\'>\s]+)',
+                r'<meta[^>]*content=["\'][^"\']*url=([^"\'>\s]+)[^>]*http-equiv=["\']?refresh["\']?'
+            ]
+            for pattern in meta_patterns:
+                meta_match = re.search(pattern, response_text, re.IGNORECASE)
+                if meta_match:
+                    return meta_match.group(1)
+
+            canonical_match = re.search(r'<link[^>]*rel=["\']?canonical["\']?[^>]*href=["\']([^"\']+)["\']', response_text, re.IGNORECASE)
+            if canonical_match:
+                return canonical_match.group(1)
+
+            base_match = re.search(r'<base[^>]*href=["\']([^"\']+)["\']', response_text, re.IGNORECASE)
+            if base_match:
+                return base_match.group(1)
+
+            error_redirect_patterns = [
+                r'[Rr]edirect(?:ed)?\s+to:?\s*([^\s<>"\']+)',
+                r'[Nn]ew\s+[Uu][Rr][Ll]:?\s*([^\s<>"\']+)',
+                r'[Mm]oved\s+to:?\s*([^\s<>"\']+)',
+                r'[Ff]ound\s+at:?\s*([^\s<>"\']+)'
+            ]
+            for pattern in error_redirect_patterns:
+                error_match = re.search(pattern, response_text)
+                if error_match:
+                    potential_url = error_match.group(1)
+                    if potential_url.startswith(('http://', 'https://', '//')):
+                        return potential_url
+
+    except Exception as e:
+        print(f" [!] Error extracting from response content: {e}")
+
+    return None
+
+def try_url(url_to_try, headers, timeout=15):
+    try:
+        with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as client:
+            response = client.get(url_to_try)
+
+            if response.status_code in [301, 302, 303, 307, 308]:
+                location = response.headers.get('location')
+                if location:
+                    print(f" [+] Found redirect ({response.status_code}) to: {location}")
+                    try:
+                        final_response = client.get(location)
+                        if 200 <= final_response.status_code < 400:
+                            return final_response
+                        else:
+                            return httpx.Response(
+                                status_code=200,
+                                headers={"location": location},
+                                content=b"",
+                                request=response.request
+                            )
+                    except Exception:
+                        return httpx.Response(
+                            status_code=200,
+                            headers={"location": location},
+                            content=b"",
+                            request=response.request
+                        )
+
+            elif response.status_code in [403, 409, 429, 503]:
+                print(f" [!] HTTP {response.status_code} - attempting to extract redirect info")
+                location = response.headers.get('location')
+                if location:
+                    print(f" [+] Found location header in error response: {location}")
+                    return httpx.Response(
+                        status_code=200,
+                        headers={"location": location},
+                        content=b"",
+                        request=response.request
+                    )
+
+                new_url = extract_domain_from_response(response, url_to_try)
+                if new_url and new_url != url_to_try:
+                    print(f" [+] Found redirect URL in error response content: {new_url}")
+                    return httpx.Response(
+                        status_code=200,
+                        headers={"location": new_url},
+                        content=b"",
+                        request=response.request
+                    )
+
+            if 200 <= response.status_code < 400:
+                return response
+
+            print(f" [!] HTTP {response.status_code} for {url_to_try}")
+
+    except httpx.HTTPStatusError as http_err:
+        new_url = extract_domain_from_response(http_err.response, url_to_try)
+        if new_url:
+            print(f" [+] Found new URL from HTTPStatusError response: {new_url}")
+            return httpx.Response(
+                status_code=200,
+                headers={"location": new_url},
+                content=b"",
+                request=http_err.request
+            )
+    except Exception as e:
+        print(f" [!] Error for {url_to_try}: {type(e).__name__}")
+
+    return None

 def update_domain_entries(data):
     if not data:
@@ -135,100 +200,47 @@ def update_domain_entries(data):
         print(f" [] Stored URL: {original_full_url}")
         if original_domain_in_entry:
             print(f" [] Stored Domain (TLD): {original_domain_in_entry}")

-        potential_urls_to_try = []
-        potential_urls_to_try.append(("Original", original_full_url))
-        try:
-            parsed_original = urlparse(original_full_url)
-            current_netloc = parsed_original.netloc
-            if current_netloc.startswith("www."):
-                varied_netloc = current_netloc[4:]
-                potential_urls_to_try.append(("Without www", urlunparse(parsed_original._replace(netloc=varied_netloc))))
-            else:
-                varied_netloc = "www." + current_netloc
-                potential_urls_to_try.append(("With www", urlunparse(parsed_original._replace(netloc=varied_netloc))))
-            current_path = parsed_original.path
-            if not current_path:
-                potential_urls_to_try.append(("With trailing slash", urlunparse(parsed_original._replace(path='/'))))
-            elif current_path.endswith('/'):
-                potential_urls_to_try.append(("Without trailing slash", urlunparse(parsed_original._replace(path=current_path[:-1]))))
-            else:
-                potential_urls_to_try.append(("With trailing slash", urlunparse(parsed_original._replace(path=current_path + '/'))))
-        except Exception as e:
-            print(f" [!] Error generating URL variations: {e}")
-
-        entry_updated_in_this_run = False
-        seen_urls_for_entry = set()
-        unique_potential_urls = []
-        for label, url_val in potential_urls_to_try:
-            if url_val not in seen_urls_for_entry:
-                unique_potential_urls.append((label, url_val))
-                seen_urls_for_entry.add(url_val)
-        parsed_original_for_http_check = urlparse(original_full_url)
-        if parsed_original_for_http_check.scheme == 'https':
-            http_url = urlunparse(parsed_original_for_http_check._replace(scheme='http'))
-            if http_url not in seen_urls_for_entry:
-                unique_potential_urls.append(("HTTP Fallback", http_url))
-
-        for label, url_to_check in unique_potential_urls:
-            if entry_updated_in_this_run:
-                break
-            print(f" [] Testing URL ({label}): {url_to_check}")
-            response = try_url_with_retries(url_to_check, current_headers)
-            if response:
-                final_url_from_request = str(response.url)
-                print(f" [+] Redirect/Response to: {final_url_from_request}")
-                parsed_final_url = urlparse(final_url_from_request)
-                normalized_full_url = urlunparse(parsed_final_url._replace(path='/', params='', query='', fragment=''))
-                if parsed_final_url.path == '' and not normalized_full_url.endswith('/'):
-                    normalized_full_url += '/'
-                if normalized_full_url != final_url_from_request:
-                    print(f" [+] Normalized URL: {normalized_full_url}")
-                if normalized_full_url != original_full_url:
-                    new_tld_val = get_new_tld(final_url_from_request)
-                    if new_tld_val:
-                        entry["full_url"] = normalized_full_url
-                        if new_tld_val != original_domain_in_entry:
-                            print(f" [-] Domain TLD Changed: '{original_domain_in_entry}' -> '{new_tld_val}'")
-                            entry["old_domain"] = original_domain_in_entry if original_domain_in_entry else entry.get("old_domain", "")
-                            entry["domain"] = new_tld_val
-                            entry["time_change"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-                            print(f" [-] Domain & URL Updated: New TLD '{new_tld_val}', New URL '{normalized_full_url}'")
-                        else:
-                            entry["domain"] = new_tld_val
-                            print(f" [-] URL Updated (TLD Unchanged '{new_tld_val}'): New URL '{normalized_full_url}'")
-                        updated_count += 1
-                        entry_updated_in_this_run = True
-                    else:
-                        print(f" [!] Could not extract TLD from {final_url_from_request}. URL not updated despite potential change.")
-                else:
-                    if final_url_from_request != original_full_url:
-                        print(f" [] Same Domain (after normalization): {final_url_from_request} -> {normalized_full_url}")
-                    else:
-                        print(f" [] Same Domain: {final_url_from_request}")
-                    if label == "Original" or normalized_full_url == original_full_url:
-                        entry_updated_in_this_run = True
-
-        if not entry_updated_in_this_run:
-            print(f" [-] No Update for {key} after {len(unique_potential_urls)} attempts.")
+        print(f" [] Testing URL: {original_full_url}")
+        response = try_url(original_full_url, current_headers)
+
+        if response:
+            final_url_from_request = str(response.url)
+            print(f" [+] Redirect/Response to: {final_url_from_request}")
+            parsed_final_url = urlparse(final_url_from_request)
+            normalized_full_url = urlunparse(parsed_final_url._replace(path='/', params='', query='', fragment=''))
+            if parsed_final_url.path == '' and not normalized_full_url.endswith('/'):
+                normalized_full_url += '/'
+            if normalized_full_url != final_url_from_request:
+                print(f" [+] Normalized URL: {normalized_full_url}")
+
+            if normalized_full_url != original_full_url:
+                new_tld_val = get_new_tld(final_url_from_request)
+                if new_tld_val:
+                    entry["full_url"] = normalized_full_url
+                    if new_tld_val != original_domain_in_entry:
+                        print(f" [-] Domain TLD Changed: '{original_domain_in_entry}' -> '{new_tld_val}'")
+                        entry["old_domain"] = original_domain_in_entry if original_domain_in_entry else entry.get("old_domain", "")
+                        entry["domain"] = new_tld_val
+                        entry["time_change"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                        print(f" [-] Domain & URL Updated: New TLD '{new_tld_val}', New URL '{normalized_full_url}'")
+                    else:
+                        entry["domain"] = new_tld_val
+                        print(f" [-] URL Updated (TLD Unchanged '{new_tld_val}'): New URL '{normalized_full_url}'")
+                    updated_count += 1
+                else:
+                    print(f" [!] Could not extract TLD from {final_url_from_request}. URL not updated despite potential change.")
+            else:
+                print(f" [] Same Domain: {final_url_from_request}")
+        else:
+            print(f" [-] No response for {key}")

     return updated_count > 0
@@ -240,10 +252,8 @@ def main():
         if update_domain_entries(domain_data):
             save_domains(JSON_FILE_PATH, domain_data)
             print("\nUpdate complete. Some entries were modified.")
         else:
             print("\nUpdate complete. No domains were modified.")
     else:
         print("\nCannot proceed without domain data.")

File: .github/workflows/ (scheduled domain-update workflow)

@@ -1,8 +1,8 @@
-name: Aggiorna Domini Periodicamente
+name: Update domains

 on:
   schedule:
-    - cron: "*/45 * * * *"
+    - cron: "0 */2 * * *"
   workflow_dispatch:

 jobs:
@@ -12,37 +12,38 @@ jobs:
       contents: write
     steps:
-      - name: Checkout del codice
+      - name: Checkout code
         uses: actions/checkout@v4

       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.12'

-      - name: Installa dipendenze
-        run: pip install httpx ua-generator
+      - name: Install dependencies
+        run: |
+          pip install httpx ua-generator requests
+          pip install --upgrade pip setuptools wheel

-      - name: Configura DNS
+      - name: Configure DNS
         run: |
           sudo sh -c 'echo "nameserver 9.9.9.9" > /etc/resolv.conf'
           sudo sh -c 'echo "nameserver 149.112.112.122" >> /etc/resolv.conf'
           cat /etc/resolv.conf

-      - name: Esegui lo script di aggiornamento domini
-        run: python domain_updater.py
+      - name: Execute domain update script
+        run: python .github/.domain/domain_update.py

-      - name: Commit e Push delle modifiche (se presenti)
+      - name: Commit and push changes (if any)
         run: |
           git config --global user.name 'github-actions[bot]'
           git config --global user.email 'github-actions[bot]@users.noreply.github.com'
-          # Controlla se domain.json è stato modificato
-          if ! git diff --quiet domain.json; then
-            git add domain.json
-            git commit -m "Aggiornamento automatico domini [skip ci]"
-            echo "Modifiche committate. Tentativo di push..."
+          # Check if domains.json was modified
+          if ! git diff --quiet .github/.domain/domains.json; then
+            git add .github/.domain/domains.json
+            git commit -m "Automatic domain update [skip ci]"
+            echo "Changes committed. Attempting to push..."
             git push
           else
-            echo "Nessuna modifica a domain.json da committare."
-          fi
+            echo "No changes to .github/.domain/domains.json to commit."
+          fi
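
The final step commits only when the tracked JSON actually changed: git diff --quiet exits 0 if the file is unmodified and 1 if it differs, so no-op runs skip the commit and push entirely. A rough local equivalent in Python, handy for dry-running the updater outside CI; the path and commit message come from the workflow, the rest is illustrative:

import subprocess

DOMAINS_JSON = ".github/.domain/domains.json"

# git diff --quiet: exit code 0 = unchanged, non-zero = modified.
changed = subprocess.run(["git", "diff", "--quiet", DOMAINS_JSON]).returncode != 0

if changed:
    subprocess.run(["git", "add", DOMAINS_JSON], check=True)
    subprocess.run(["git", "commit", "-m", "Automatic domain update [skip ci]"], check=True)
    subprocess.run(["git", "push"], check=True)
else:
    print(f"No changes to {DOMAINS_JSON} to commit.")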