Add automatic detection of new site domains.

Lovi 2024-06-18 23:37:55 +02:00
parent 678e39cc46
commit 6036bbeb20
7 changed files with 1707 additions and 73 deletions

View File

@@ -0,0 +1,192 @@
# 18.06.24

import os
import sys
import time
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# External libraries
import httpx
import psutil
from tqdm import tqdm

# Internal utilities
from Src.Util.color import Colors
from Src.Util.headers import get_headers
from Src.Util.console import console
from Src.Util._jsonConfig import config_manager


def check_url_for_content(url: str, content: str, timeout: int = 1) -> bool:
    """
    Check if a URL contains specific content.

    Args:
        - url (str): The URL to check.
        - content (str): The content to search for in the response.
        - timeout (int): Timeout for the request in seconds.

    Returns:
        bool: True if the content is found, False otherwise.
    """
    try:
        response = httpx.get(url, timeout=timeout, headers={'user-agent': get_headers()})
        logging.info(f"Testing site to extract domain: {url}, response: {response.status_code}")

        # Raise an error if the status is not successful
        response.raise_for_status()

        # Check if the target content is in the response text
        if content in response.text:
            return True

    except httpx.RequestError as e:
        logging.warning(f"Request error for {url}: {e}")
    except httpx.HTTPStatusError as e:
        logging.warning(f"HTTP status error for {url}: {e}")
    except Exception as e:
        logging.warning(f"Error for {url}: {e}")

    return False
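
# Hypothetical usage (URL and marker string are illustrative, not from the commit):
#   check_url_for_content("https://example.com", "<title>Example", timeout=2)
# Returns True only when the request succeeds (2xx) and the marker appears in the body.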
def get_top_level_domain(base_url: str, target_content: str, max_workers: int = os.cpu_count(), timeout: int = 2, retries: int = 1) -> str:
    """
    Find the top-level domain (TLD) by testing candidate URLs built from a TLD list.

    Args:
        - base_url (str): The base URL to construct complete URLs.
        - target_content (str): The content to search for in the response.
        - max_workers (int): Maximum number of threads.
        - timeout (int): Timeout for the request in seconds.
        - retries (int): Number of retries for failed requests.

    Returns:
        str: The found TLD, if any.
    """
    results = []
    failed_urls = []

    path_file = os.path.join("Test", "data", "TLD", "tld_list_complete.txt")
    logging.info(f"Loading file: {path_file}")

    if not os.path.exists(path_file):
        raise FileNotFoundError("The file 'tld_list_complete.txt' does not exist.")

    # Read TLDs from file and create URLs to test
    with open(path_file, "r") as file:
        urls = [f"{base_url}.{x.strip().lower()}" for x in file]
    urls = list(set(urls))  # Remove duplicates

    start_time = time.time()
    bar_format = f"{Colors.YELLOW}Testing URLS{Colors.WHITE}: {Colors.RED}{{percentage:.2f}}% {Colors.MAGENTA}{{bar}} {Colors.WHITE}[ {Colors.YELLOW}{{n_fmt}}{Colors.WHITE} / {Colors.RED}{{total_fmt}} {Colors.WHITE}] {Colors.YELLOW}{{elapsed}} {Colors.WHITE}< {Colors.CYAN}{{remaining}}{Colors.GREEN}{{postfix}} {Colors.WHITE}]"
    progress_bar = tqdm(
        total=len(urls),
        unit='url',
        ascii='░▒█',
        bar_format=bar_format
    )

    # Event to signal when to stop checking URLs
    stop_event = threading.Event()

    def url_checker(url: str):
        for attempt in range(retries):
            if stop_event.is_set():
                return None

            if check_url_for_content(url, target_content, timeout):
                stop_event.set()
                progress_bar.update(1)
                return url.split(".")[-1]

            logging.info(f"Retrying {url} ({attempt+1}/{retries})")

        failed_urls.append(url)
        progress_bar.update(1)
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(url_checker, url): url for url in urls}

        for future in as_completed(futures):
            tld = future.result()
            if tld:
                results.append(tld)
                if stop_event.is_set():
                    break

            # Update the progress bar with CPU usage info
            progress_bar.set_postfix(cpu_usage=f"{psutil.cpu_percent()}%")

    progress_bar.close()

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_url = total_time / len(urls) if urls else 0

    logging.info(f"Tested {len(urls)} URLs: {len(results)} passed, {len(failed_urls)} failed.")
    logging.info(f"Total time: {total_time:.2f} seconds, Average time per URL: {avg_time_per_url:.2f} seconds.")

    if results:
        return results[-1]
    else:
        return None


def search_domain(site_name: str, target_content: str, base_url: str):
    """
    Search for a valid domain for the given site name and base URL.

    Args:
        - site_name (str): The name of the site to search the domain for.
        - target_content (str): The content to search for in the response.
        - base_url (str): The base URL to construct complete URLs.

    Returns:
        tuple: The found domain and the complete URL.
    """
    # Extract config domain
    domain = config_manager.get("SITE", site_name)
    console.print(f"[cyan]Test site[white]: [red]{base_url}.{domain}")

    try:
        # Test the current domain
        response = httpx.get(f"{base_url}.{domain}", headers={'user-agent': get_headers()}, timeout=2)
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code}")
        response.raise_for_status()

        # Return config domain
        console.print(f"[cyan]Use domain: [red]{domain}")
        return domain, f"{base_url}.{domain}"

    except Exception:
        # If the current domain fails, find a new one
        print()
        console.print("[red]Extract new DOMAIN from TLD list.")
        new_domain = get_top_level_domain(base_url=base_url, target_content=target_content)

        if new_domain is not None:
            # Update domain in config.json
            config_manager.set_key('SITE', site_name, new_domain)
            config_manager.write_config()

            # Return new config domain
            console.print(f"[cyan]Use domain: [red]{new_domain}")
            return new_domain, f"{base_url}.{new_domain}"

        else:
            logging.error(f"Failed to find a new domain for: {base_url}")
            sys.exit(0)

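The core of get_top_level_domain above is a cancel-on-first-hit thread pool: every worker shares a threading.Event, the first worker that finds the target content sets it, and all remaining workers return immediately. A minimal standalone sketch of that pattern (function and variable names here are illustrative, not part of the commit):

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

def find_first(candidates, predicate, max_workers=8):
    stop_event = threading.Event()

    def worker(item):
        # Bail out early once another thread has already found a match
        if stop_event.is_set():
            return None
        if predicate(item):
            stop_event.set()
            return item
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(worker, c) for c in candidates]
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                return result
    return None

# Example: find_first(["com", "net", "org"], lambda tld: tld == "net") returns "net"

Note that returning from inside the with block still waits for the pool to shut down, but pending workers exit almost immediately because they check the event before doing any network work.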
View File

@@ -0,0 +1 @@
from .Util.get_domain import search_domain

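The one-line __init__ above re-exports the helper so each site package can reach it with a relative import, exactly as the diffs below do:

from ..Template import search_domain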
View File

@@ -11,9 +11,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.console import console
from Src.Util.headers import get_headers
from Src.Util.console import console
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -37,9 +38,12 @@ def title_search(title_search: str) -> int:
    Returns:
        int: The number of titles found.
    """
    # Find a new domain if the previous one doesn't work
    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="generator" content="altadefinizione">', f"https://{SITE_NAME}")

    # Send request to search for titles
    response = httpx.get(f"https://{SITE_NAME}.{DOMAIN_NOW}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
    response = httpx.get(f"https://{SITE_NAME}.{domain_to_use}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
    response.raise_for_status()

    # Create soup and find table

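The search query above is normalized with unidecode before being sent, so accented titles still match the site's ASCII index. A quick illustration with a hypothetical title:

from unidecode import unidecode

title_search = "Città incantata"
query = unidecode(title_search.replace(' ', '+'))
print(query)  # prints "Citta+incantata"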
View File

@@ -11,9 +11,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.console import console
from Src.Util._jsonConfig import config_manager
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -65,28 +66,6 @@ def get_token(site_name: str, domain: str) -> dict:
    }

def update_domain():
    """
    Update the domain for the anime streaming site.

    This function tests the accessibility of the current anime streaming site.
    If the current domain is inaccessible, it attempts to obtain and set a new domain.
    It uses the 'light' method to extract a new domain from Anime Unity.
    """

    # Test current site's accessibility
    try:
        console.log(f"[cyan]Test site: [red]https://{SITE_NAME}.{DOMAIN_NOW}")
        response = httpx.get(f"https://www.{SITE_NAME}.{DOMAIN_NOW}")
        response.raise_for_status()

    except Exception as e:
        console.log("[red]Update domain")
        sys.exit(0)
def get_real_title(record):
    """
    Get the real title from a record.
@@ -122,12 +101,9 @@ def title_search(title: str) -> int:
        - int: A number containing the length of media search manager.
    """

    # Update domain
    update_domain()

    # Get token and session value from configuration
    url_domain = config_manager.get('SITE', SITE_NAME)
    data = get_token(SITE_NAME, url_domain)
    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="author" content="AnimeUnity Staff">', f"https://www.{SITE_NAME}")
    data = get_token(SITE_NAME, domain_to_use)

    # Prepare cookies to be used in the request
    cookies = {
@@ -148,7 +124,7 @@ def title_search(title: str) -> int:
    }

    # Send a POST request to the API endpoint for live search
    response = httpx.post(f'https://www.{SITE_NAME}.{url_domain}/livesearch', cookies=cookies, headers=headers, json=json_data)
    response = httpx.post(f'https://www.{SITE_NAME}.{domain_to_use}/livesearch', cookies=cookies, headers=headers, json=json_data)
    response.raise_for_status()

    # Process each record returned in the response

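Note that for AnimeUnity the base URL handed to search_domain carries the www. prefix, matching the livesearch request above. Schematically (SITE_NAME is the module constant; the marker string is taken from the diff):

domain_to_use, full_url = search_domain(
    SITE_NAME,
    '<meta name="author" content="AnimeUnity Staff">',
    f"https://www.{SITE_NAME}"
)
# full_url is "https://www.<site>.<tld>", ready for the /livesearch POST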
View File

@@ -15,9 +15,10 @@ from unidecode import unidecode
# Internal utilities
from Src.Util.headers import get_headers
from Src.Util._jsonConfig import config_manager
from Src.Util.console import console
from Src.Util.table import TVShowManager
from ..Template import search_domain
# Logic class
@@ -75,45 +76,15 @@ def get_version(text: str) -> tuple[str, list]:
        raise

def get_version_and_domain(new_domain = None) -> Tuple[str, str]:
    """
    Retrieves the version and domain of the streaming website.
def get_version_and_domain() -> Tuple[str, str]:

    This function retrieves the version and domain of the streaming website.
    It first checks the accessibility of the current site.
    If the site is accessible, it extracts the version from the response.
    If configured to do so, it also scrapes and prints the titles of the moments.
    If the site is inaccessible, it attempts to obtain a new domain using the 'insta' method.

    # Find a new domain if the previous one doesn't work
    domain_to_use, base_url = search_domain(SITE_NAME, '<meta name="author" content="StreamingCommunity">', f"https://{SITE_NAME}")

    Returns:
        Tuple[str, str]: A tuple containing the version and domain.
    """

    # Get the current domain from the configuration
    if new_domain is None:
        config_domain = config_manager.get('SITE', SITE_NAME)
    else:
        config_domain = new_domain

    # Extract version from the response
    version, list_title_top_10 = get_version(httpx.get(base_url, headers={'user-agent': get_headers()}).text)

    # Test the accessibility of the current site
    try:
        # Make requests to site to get text
        console.print(f"[cyan]Test site[white]: [red]https://{SITE_NAME}.{config_domain}")
        response = httpx.get(f"https://{SITE_NAME}.{config_domain}")
        response.raise_for_status()
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code} \n")

        # Extract version from the response
        version, list_title_top_10 = get_version(response.text)

        return version, config_domain

    except:
        console.log("[red]Update domain.")
        sys.exit(0)

    return version, domain_to_use

def title_search(title_search: str, domain: str) -> int:

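After this refactor the caller no longer validates the domain itself; search_domain does that. A sketch of the resulting call flow (assuming get_version still parses the homepage HTML as before):

version, domain = get_version_and_domain()
# domain has already been validated, or replaced and persisted to config.json,
# so it can be passed straight to title_search(query, domain)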
View File

@@ -1,12 +1,11 @@
# 29.04.24
import httpx
import json
from bs4 import BeautifulSoup
# URL of the webpage containing the table
url = 'https://icannwiki.org/New_gTLD_Generic_Applications'
url = 'https://icannwiki.org/All_New_gTLD_Applications'
# List to store scraped data
@@ -78,8 +77,9 @@ def main():
    print(len(data))

    # Write the scraped data to a JSON file
    with open('data.json', 'w') as json_file:
        json.dump(data, json_file)
    # Write the scraped application IDs to a text file, one per line
    with open('data.txt', 'w') as json_file:
        for find_tld in data:
            json_file.write(find_tld['application_id'] + "\n")
if __name__ == '__main__':

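The scraper now emits one application_id per line instead of a JSON dump. A sketch of consuming such a file the way get_top_level_domain reads Test/data/TLD/tld_list_complete.txt (whether data.txt is what populates that file is an assumption):

base_url = "https://example-site"  # hypothetical base URL
with open('data.txt', 'r') as file:
    urls = sorted({f"{base_url}.{line.strip().lower()}" for line in file if line.strip()})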
File diff suppressed because it is too large.