github: Update domains

Lovi 2025-05-31 11:28:38 +02:00
parent 884bcf656c
commit 1776538c6c
2 changed files with 180 additions and 169 deletions

View File

@@ -1,17 +1,14 @@
 # 20.04.2024
-import os
 import re
-import time
+import os
 import json
 from datetime import datetime
 from urllib.parse import urlparse, urlunparse
 import httpx
 import ua_generator
 JSON_FILE_PATH = os.path.join(".github", ".domain", "domains.json")
@@ -50,70 +47,138 @@ def get_new_tld(full_url):
     return None
-def try_url_with_retries(url_to_try, headers, timeout=15, retries=3, backoff_factor=0.5):
-    for attempt in range(retries):
-        try:
-            with httpx.Client(headers=headers, timeout=timeout, follow_redirects=True) as client:
-                response = client.get(url_to_try)
-                response.raise_for_status()
-                return response
-        except (httpx.TimeoutException, httpx.ConnectError) as e:
-            print(f" [!] Attempt {attempt + 1}/{retries} for {url_to_try}: Network error ({type(e).__name__}). Retrying in {backoff_factor * (2 ** attempt)}s...")
-            if attempt + 1 == retries:
-                print(f" [!] Failed all {retries} attempts for {url_to_try} due to {type(e).__name__}.")
-                return None
-            time.sleep(backoff_factor * (2 ** attempt))
-        except httpx.HTTPStatusError as http_err:
-            if http_err.response.status_code in [403, 429, 503]:
-                print(f" [!] HTTP error {http_err.response.status_code} for {url_to_try}. Suspected Cloudflare, checking for <base href>...")
-                try:
-                    with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as cf_client:
-                        cf_page_response = cf_client.get(url_to_try)
-                        if cf_page_response.status_code != http_err.response.status_code and not (200 <= cf_page_response.status_code < 300):
-                            cf_page_response.raise_for_status()
-                        match = re.search(r'<base\s+href="([^"]+)"', cf_page_response.text, re.IGNORECASE)
-                        if match:
-                            base_href_url = match.group(1)
-                            parsed_base_href = urlparse(base_href_url)
-                            if not parsed_base_href.scheme or not parsed_base_href.netloc:
-                                original_parsed_url = urlparse(url_to_try)
-                                base_href_url = urlunparse(original_parsed_url._replace(path=base_href_url if base_href_url.startswith('/') else '/' + base_href_url, query='', fragment=''))
-                            print(f" [+] Found <base href>: {base_href_url}")
-                            try:
-                                print(f" [] Attempting request to <base href> URL: {base_href_url}")
-                                with httpx.Client(headers=headers, timeout=timeout, follow_redirects=True) as base_client:
-                                    final_response_from_base = base_client.get(base_href_url)
-                                    final_response_from_base.raise_for_status()
-                                    print(f" [+] Successfully fetched from <base href> URL.")
-                                    return final_response_from_base
-                            except httpx.RequestError as base_req_e:
-                                print(f" [!] Error requesting <base href> URL {base_href_url}: {base_req_e}")
-                                return None
-                        else:
-                            print(f" [!] No <base href> found in page content for {url_to_try}.")
-                            return None
-                except httpx.RequestError as cf_req_e:
-                    print(f" [!] Error fetching Cloudflare-like page content for {url_to_try}: {cf_req_e}")
-                    return None
-            else:
-                print(f" [!] HTTP error {http_err.response.status_code} for {url_to_try}. No retry.")
-                return None
-        except httpx.RequestError as e:
-            print(f" [!] Generic error for {url_to_try}: {e}. No retry.")
-            return None
-    return None
+def extract_domain_from_response(response, original_url):
+    if 'location' in response.headers:
+        return response.headers['location']
+    if str(response.url) != original_url:
+        return str(response.url)
+    try:
+        content_type = response.headers.get('content-type', '').lower()
+        if 'text/html' in content_type or 'text/plain' in content_type:
+            response_text = response.text
+            js_redirect_patterns = [
+                r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
+                r'window\.location\s*=\s*["\']([^"\']+)["\']',
+                r'location\.href\s*=\s*["\']([^"\']+)["\']',
+                r'document\.location\s*=\s*["\']([^"\']+)["\']'
+            ]
+            for pattern in js_redirect_patterns:
+                js_match = re.search(pattern, response_text, re.IGNORECASE)
+                if js_match:
+                    return js_match.group(1)
+            meta_patterns = [
+                r'<meta[^>]*http-equiv=["\']?refresh["\']?[^>]*content=["\'][^"\']*url=([^"\'>\s]+)',
+                r'<meta[^>]*content=["\'][^"\']*url=([^"\'>\s]+)[^>]*http-equiv=["\']?refresh["\']?'
+            ]
+            for pattern in meta_patterns:
+                meta_match = re.search(pattern, response_text, re.IGNORECASE)
+                if meta_match:
+                    return meta_match.group(1)
+            canonical_match = re.search(r'<link[^>]*rel=["\']?canonical["\']?[^>]*href=["\']([^"\']+)["\']', response_text, re.IGNORECASE)
+            if canonical_match:
+                return canonical_match.group(1)
+            base_match = re.search(r'<base[^>]*href=["\']([^"\']+)["\']', response_text, re.IGNORECASE)
+            if base_match:
+                return base_match.group(1)
+            error_redirect_patterns = [
+                r'[Rr]edirect(?:ed)?\s+to:?\s*([^\s<>"\']+)',
+                r'[Nn]ew\s+[Uu][Rr][Ll]:?\s*([^\s<>"\']+)',
+                r'[Mm]oved\s+to:?\s*([^\s<>"\']+)',
+                r'[Ff]ound\s+at:?\s*([^\s<>"\']+)'
+            ]
+            for pattern in error_redirect_patterns:
+                error_match = re.search(pattern, response_text)
+                if error_match:
+                    potential_url = error_match.group(1)
+                    if potential_url.startswith(('http://', 'https://', '//')):
+                        return potential_url
+    except Exception as e:
+        print(f" [!] Error extracting from response content: {e}")
+    return None
+
+def try_url(url_to_try, headers, timeout=15):
+    try:
+        with httpx.Client(headers=headers, timeout=timeout, follow_redirects=False) as client:
+            response = client.get(url_to_try)
+            if response.status_code in [301, 302, 303, 307, 308]:
+                location = response.headers.get('location')
+                if location:
+                    print(f" [+] Found redirect ({response.status_code}) to: {location}")
+                    try:
+                        final_response = client.get(location)
+                        if 200 <= final_response.status_code < 400:
+                            return final_response
+                        else:
+                            return httpx.Response(
+                                status_code=200,
+                                headers={"location": location},
+                                content=b"",
+                                request=response.request
+                            )
+                    except Exception:
+                        return httpx.Response(
+                            status_code=200,
+                            headers={"location": location},
+                            content=b"",
+                            request=response.request
+                        )
+            elif response.status_code in [403, 409, 429, 503]:
+                print(f" [!] HTTP {response.status_code} - attempting to extract redirect info")
+                location = response.headers.get('location')
+                if location:
+                    print(f" [+] Found location header in error response: {location}")
+                    return httpx.Response(
+                        status_code=200,
+                        headers={"location": location},
+                        content=b"",
+                        request=response.request
+                    )
+                new_url = extract_domain_from_response(response, url_to_try)
+                if new_url and new_url != url_to_try:
+                    print(f" [+] Found redirect URL in error response content: {new_url}")
+                    return httpx.Response(
+                        status_code=200,
+                        headers={"location": new_url},
+                        content=b"",
+                        request=response.request
+                    )
+            if 200 <= response.status_code < 400:
+                return response
+            print(f" [!] HTTP {response.status_code} for {url_to_try}")
+    except httpx.HTTPStatusError as http_err:
+        new_url = extract_domain_from_response(http_err.response, url_to_try)
+        if new_url:
+            print(f" [+] Found new URL from HTTPStatusError response: {new_url}")
+            return httpx.Response(
+                status_code=200,
+                headers={"location": new_url},
+                content=b"",
+                request=http_err.request
+            )
+    except Exception as e:
+        print(f" [!] Error for {url_to_try}: {type(e).__name__}")
+    return None

 def update_domain_entries(data):
     if not data:
         return False
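Aside (not part of the commit): a minimal sketch of how the new extract_domain_from_response() falls through its heuristics when a blocked response has no Location header and the URL did not change; it then scans the body, here matching the meta-refresh pattern. It assumes the function above (and its import re) is in scope, e.g. pasted into a REPL.

import httpx

# Synthetic 403 whose body hides the new domain in a meta refresh tag.
url = "https://example.com/"
blocked = httpx.Response(
    status_code=403,
    headers={"content-type": "text/html"},
    content=b'<meta http-equiv="refresh" content="0; url=https://example.org/">',
    request=httpx.Request("GET", url),
)

# No 'location' header and str(blocked.url) == url, so the function scans
# the HTML and returns the meta-refresh target.
print(extract_domain_from_response(blocked, url))  # https://example.org/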
@@ -136,52 +201,8 @@ def update_domain_entries(data):
         if original_domain_in_entry:
             print(f" [] Stored Domain (TLD): {original_domain_in_entry}")
-        potential_urls_to_try = []
-        potential_urls_to_try.append(("Original", original_full_url))
-        try:
-            parsed_original = urlparse(original_full_url)
-            current_netloc = parsed_original.netloc
-            if current_netloc.startswith("www."):
-                varied_netloc = current_netloc[4:]
-                potential_urls_to_try.append(("Without www", urlunparse(parsed_original._replace(netloc=varied_netloc))))
-            else:
-                varied_netloc = "www." + current_netloc
-                potential_urls_to_try.append(("With www", urlunparse(parsed_original._replace(netloc=varied_netloc))))
-            current_path = parsed_original.path
-            if not current_path:
-                potential_urls_to_try.append(("With trailing slash", urlunparse(parsed_original._replace(path='/'))))
-            elif current_path.endswith('/'):
-                potential_urls_to_try.append(("Without trailing slash", urlunparse(parsed_original._replace(path=current_path[:-1]))))
-            else:
-                potential_urls_to_try.append(("With trailing slash", urlunparse(parsed_original._replace(path=current_path + '/'))))
-        except Exception as e:
-            print(f" [!] Error generating URL variations: {e}")
-        entry_updated_in_this_run = False
-        seen_urls_for_entry = set()
-        unique_potential_urls = []
-        for label, url_val in potential_urls_to_try:
-            if url_val not in seen_urls_for_entry:
-                unique_potential_urls.append((label, url_val))
-                seen_urls_for_entry.add(url_val)
-        parsed_original_for_http_check = urlparse(original_full_url)
-        if parsed_original_for_http_check.scheme == 'https':
-            http_url = urlunparse(parsed_original_for_http_check._replace(scheme='http'))
-            if http_url not in seen_urls_for_entry:
-                unique_potential_urls.append(("HTTP Fallback", http_url))
-        for label, url_to_check in unique_potential_urls:
-            if entry_updated_in_this_run:
-                break
-            print(f" [] Testing URL ({label}): {url_to_check}")
-            response = try_url_with_retries(url_to_check, current_headers)
+        print(f" [] Testing URL: {original_full_url}")
+        response = try_url(original_full_url, current_headers)

         if response:
             final_url_from_request = str(response.url)
@@ -207,28 +228,19 @@ def update_domain_entries(data):
                         entry["domain"] = new_tld_val
                         entry["time_change"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                         print(f" [-] Domain & URL Updated: New TLD '{new_tld_val}', New URL '{normalized_full_url}'")
                     else:
                         entry["domain"] = new_tld_val
                         print(f" [-] URL Updated (TLD Unchanged '{new_tld_val}'): New URL '{normalized_full_url}'")
                     updated_count += 1
-                    entry_updated_in_this_run = True
                 else:
                     print(f" [!] Could not extract TLD from {final_url_from_request}. URL not updated despite potential change.")
-            else:
-                if final_url_from_request != original_full_url:
-                    print(f" [] Same Domain (after normalization): {final_url_from_request} -> {normalized_full_url}")
-                else:
-                    print(f" [] Same Domain: {final_url_from_request}")
-                if label == "Original" or normalized_full_url == original_full_url:
-                    entry_updated_in_this_run = True
-        if not entry_updated_in_this_run:
-            print(f" [-] No Update for {key} after {len(unique_potential_urls)} attempts.")
+            else:
+                print(f" [] Same Domain: {final_url_from_request}")
+        else:
+            print(f" [-] No response for {key}")

     return updated_count > 0
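For reference, a condensed sketch of the probing this hunk removes: the old loop also tried a www/non-www toggle, a trailing-slash toggle, and an http:// fallback before giving up, whereas the new code tests only the stored URL once. Reconstructed from the deleted lines above; illustrative, not the script's exact code.

from urllib.parse import urlparse, urlunparse

def old_style_variations(url):
    # Mirror of the deleted generator: original, www-toggled, slash-toggled,
    # plus an http:// fallback for https URLs, deduplicated in order.
    p = urlparse(url)
    variants = [("Original", url)]
    toggled = p.netloc[4:] if p.netloc.startswith("www.") else "www." + p.netloc
    variants.append(("www toggled", urlunparse(p._replace(netloc=toggled))))
    if not p.path:
        variants.append(("With trailing slash", urlunparse(p._replace(path='/'))))
    elif p.path.endswith('/'):
        variants.append(("Without trailing slash", urlunparse(p._replace(path=p.path[:-1]))))
    else:
        variants.append(("With trailing slash", urlunparse(p._replace(path=p.path + '/'))))
    if p.scheme == 'https':
        variants.append(("HTTP Fallback", urlunparse(p._replace(scheme='http'))))
    seen = set()
    return [(label, u) for label, u in variants if not (u in seen or seen.add(u))]

print(old_style_variations("https://www.example.com"))
# [('Original', 'https://www.example.com'), ('www toggled', 'https://example.com'),
#  ('With trailing slash', 'https://www.example.com/'), ('HTTP Fallback', 'http://www.example.com')]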
@@ -240,10 +252,8 @@ def main():
         if update_domain_entries(domain_data):
             save_domains(JSON_FILE_PATH, domain_data)
             print("\nUpdate complete. Some entries were modified.")
         else:
             print("\nUpdate complete. No domains were modified.")
     else:
         print("\nCannot proceed without domain data.")

View File

@@ -1,8 +1,8 @@
-name: Aggiorna Domini Periodicamente
+name: Update domains

 on:
   schedule:
-    - cron: "*/45 * * * *"
+    - cron: "0 */2 * * *"
   workflow_dispatch:

 jobs:
@@ -12,7 +12,7 @@ jobs:
       contents: write
     steps:
-      - name: Checkout del codice
+      - name: Checkout code
         uses: actions/checkout@v4

       - name: Setup Python
@@ -20,29 +20,30 @@ jobs:
         with:
           python-version: '3.12'

-      - name: Installa dipendenze
-        run: pip install httpx ua-generator
+      - name: Install dependencies
+        run: |
+          pip install httpx ua-generator requests
+          pip install --upgrade pip setuptools wheel

-      - name: Configura DNS
+      - name: Configure DNS
         run: |
           sudo sh -c 'echo "nameserver 9.9.9.9" > /etc/resolv.conf'
+          sudo sh -c 'echo "nameserver 149.112.112.122" >> /etc/resolv.conf'
           cat /etc/resolv.conf

-      - name: Esegui lo script di aggiornamento domini
-        run: python domain_updater.py
+      - name: Execute domain update script
+        run: python .github/.domain/domain_update.py

-      - name: Commit e Push delle modifiche (se presenti)
+      - name: Commit and push changes (if any)
         run: |
           git config --global user.name 'github-actions[bot]'
           git config --global user.email 'github-actions[bot]@users.noreply.github.com'
-          # Controlla se domain.json è stato modificato
-          if ! git diff --quiet domain.json; then
-            git add domain.json
-            git commit -m "Aggiornamento automatico domini [skip ci]"
-            echo "Modifiche committate. Tentativo di push..."
+          # Check if domains.json was modified
+          if ! git diff --quiet .github/.domain/domains.json; then
+            git add .github/.domain/domains.json
+            git commit -m "Automatic domain update [skip ci]"
+            echo "Changes committed. Attempting to push..."
             git push
           else
-            echo "Nessuna modifica a domain.json da committare."
+            echo "No changes to .github/.domain/domains.json to commit."
           fi
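A side note on the schedule change: in cron, the minute field "*/45" matches minutes divisible by 45 within 0-59, so "*/45 * * * *" fires at :00 and :45 of every hour rather than every 45 minutes; "0 */2 * * *" fires once on each even hour. The sketch below illustrates this with the third-party croniter package, which is an assumption for demonstration only, not a dependency of this repo.

from datetime import datetime
from croniter import croniter  # assumption: pip install croniter

base = datetime(2025, 5, 31)
for expr in ("*/45 * * * *", "0 */2 * * *"):
    it = croniter(expr, base)
    ticks = [it.get_next(datetime).strftime("%H:%M") for _ in range(4)]
    print(expr, "->", ", ".join(ticks))
# */45 * * * * -> 00:45, 01:00, 01:45, 02:00
# 0 */2 * * *  -> 02:00, 04:00, 06:00, 08:00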