mirror of https://github.com/Arrowar/StreamingCommunity.git (synced 2025-06-07 12:05:35 +00:00)
Add automatic discovery of new domains.
This commit is contained in:
parent 678e39cc46
commit 6036bbeb20
192  Src/Api/Template/Util/get_domain.py  Normal file
@@ -0,0 +1,192 @@
# 18.06.24

import os
import sys
import time
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed


# External libraries
import httpx
import psutil
from tqdm import tqdm


# Internal utilities
from Src.Util.color import Colors
from Src.Util.headers import get_headers
from Src.Util.console import console
from Src.Util._jsonConfig import config_manager


def check_url_for_content(url: str, content: str, timeout: int = 1) -> bool:
    """
    Check if a URL contains specific content.

    Args:
        - url (str): The URL to check.
        - content (str): The content to search for in the response.
        - timeout (int): Timeout for the request in seconds.

    Returns:
        bool: True if the content is found, False otherwise.
    """
    try:
        response = httpx.get(url, timeout=timeout, headers={'user-agent': get_headers()})
        logging.info(f"Testing site to extract domain: {url}, response: {response.status_code}")

        # Raise an error if the status is not successful
        response.raise_for_status()

        # Check if the target content is in the response text
        if content in response.text:
            return True

    except httpx.RequestError as e:
        logging.warning(f"Request error for {url}: {e}")

    except httpx.HTTPStatusError as e:
        logging.warning(f"HTTP status error for {url}: {e}")

    except Exception as e:
        logging.warning(f"Error for {url}: {e}")

    return False


def get_top_level_domain(base_url: str, target_content: str, max_workers: int = os.cpu_count(), timeout: int = 2, retries: int = 1) -> str:
    """
    Find the top-level domain (TLD) whose page contains the target content.

    Args:
        - base_url (str): The base URL used to construct the complete URLs.
        - target_content (str): The content to search for in the response.
        - max_workers (int): Maximum number of threads.
        - timeout (int): Timeout for each request in seconds.
        - retries (int): Number of retries for failed requests.

    Returns:
        str: The TLD that was found, or None if no URL matched.
    """
    results = []
    failed_urls = []
    path_file = os.path.join("Test", "data", "TLD", "tld_list_complete.txt")
    logging.info(f"Loading file: {path_file}")

    if not os.path.exists(path_file):
        raise FileNotFoundError("The file 'tld_list_complete.txt' does not exist.")

    # Read TLDs from file and create URLs to test
    with open(path_file, "r") as file:
        urls = [f"{base_url}.{x.strip().lower()}" for x in file]
        urls = list(set(urls))  # Remove duplicates

    start_time = time.time()

    bar_format = f"{Colors.YELLOW}Testing URLS{Colors.WHITE}: {Colors.RED}{{percentage:.2f}}% {Colors.MAGENTA}{{bar}} {Colors.WHITE}[ {Colors.YELLOW}{{n_fmt}}{Colors.WHITE} / {Colors.RED}{{total_fmt}} {Colors.WHITE}] {Colors.YELLOW}{{elapsed}} {Colors.WHITE}< {Colors.CYAN}{{remaining}}{Colors.GREEN}{{postfix}} {Colors.WHITE}]"
    progress_bar = tqdm(
        total=len(urls),
        unit='url',
        ascii='░▒█',
        bar_format=bar_format
    )

    # Event to signal when to stop checking URLs
    stop_event = threading.Event()

    def url_checker(url: str):
        for attempt in range(retries):
            if stop_event.is_set():
                return None

            if check_url_for_content(url, target_content, timeout):
                stop_event.set()
                progress_bar.update(1)
                return url.split(".")[-1]

            logging.info(f"Retrying {url} ({attempt+1}/{retries})")

        failed_urls.append(url)
        progress_bar.update(1)
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(url_checker, url): url for url in urls}

        for future in as_completed(futures):
            tld = future.result()

            if tld:
                results.append(tld)
                if stop_event.is_set():
                    break

            # Update the progress bar with CPU usage info
            progress_bar.set_postfix(cpu_usage=f"{psutil.cpu_percent()}%")

    progress_bar.close()

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_url = total_time / len(urls) if urls else 0

    logging.info(f"Tested {len(urls)} URLs: {len(results)} passed, {len(failed_urls)} failed.")
    logging.info(f"Total time: {total_time:.2f} seconds, Average time per URL: {avg_time_per_url:.2f} seconds.")

    if results:
        return results[-1]
    else:
        return None


def search_domain(site_name: str, target_content: str, base_url: str):
    """
    Search for a valid domain for the given site name and base URL.

    Args:
        - site_name (str): The name of the site to search the domain for.
        - target_content (str): The content to search for in the response.
        - base_url (str): The base URL used to construct the complete URLs.

    Returns:
        tuple: The found domain and the complete URL.
    """

    # Extract the domain currently stored in the config
    domain = config_manager.get("SITE", site_name)
    console.print(f"[cyan]Test site[white]: [red]{base_url}.{domain}")

    try:
        # Test the current domain
        response = httpx.get(f"{base_url}.{domain}", headers={'user-agent': get_headers()}, timeout=2)
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code}")
        response.raise_for_status()

        # Return the domain from the config
        console.print(f"[cyan]Use domain: [red]{domain}")
        return domain, f"{base_url}.{domain}"

    except Exception:
        # If the current domain fails, find a new one
        print()
        console.print("[red]Extract new DOMAIN from TLD list.")
        new_domain = get_top_level_domain(base_url=base_url, target_content=target_content)

        if new_domain is not None:
            # Update the domain in config.json
            config_manager.set_key('SITE', site_name, new_domain)
            config_manager.write_config()

            # Return the new config domain
            console.print(f"[cyan]Use domain: [red]{new_domain}")
            return new_domain, f"{base_url}.{new_domain}"

        else:
            logging.error(f"Failed to find a new domain for: {base_url}")
            sys.exit(0)
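Note: a minimal usage sketch of the new helper, assuming Src.Api.Template is importable as set up in the __init__.py below; the site name and marker string are placeholders, not values from this commit.

# Hypothetical call site; "examplesite" and the meta marker are placeholders.
from Src.Api.Template import search_domain

# search_domain() first tries the domain stored under SITE -> "examplesite"
# in config.json; if that host fails, it probes every TLD in
# Test/data/TLD/tld_list_complete.txt until a page contains the marker.
domain, full_url = search_domain(
    site_name="examplesite",
    target_content='<meta name="generator" content="examplesite">',
    base_url="https://examplesite",
)
print(domain, full_url)  # e.g. "com", "https://examplesite.com"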
1  Src/Api/Template/__init__.py  Normal file
@@ -0,0 +1 @@
from .Util.get_domain import search_domain
@@ -11,9 +11,10 @@ from unidecode import unidecode


 # Internal utilities
-from Src.Util.table import TVShowManager
-from Src.Util.console import console
 from Src.Util.headers import get_headers
+from Src.Util.console import console
+from Src.Util.table import TVShowManager
+from ..Template import search_domain


 # Logic class
@@ -37,9 +38,12 @@ def title_search(title_search: str) -> int:
     Returns:
         int: The number of titles found.
     """

+    # Find a new domain if the previous one no longer works
+    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="generator" content="altadefinizione">', f"https://{SITE_NAME}")
+
     # Send request to search for titles
-    response = httpx.get(f"https://{SITE_NAME}.{DOMAIN_NOW}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
+    response = httpx.get(f"https://{SITE_NAME}.{domain_to_use}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
     response.raise_for_status()

     # Create soup and find table
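Note: the target_content marker is what separates the real site from parked domains that still resolve. A hedged sketch of probing one candidate host with the lower-level helper; the host below is a placeholder, and the deep import path assumes the Util package is importable.

# Probe a single candidate; check_url_for_content() returns False on any
# request or HTTP error, so dead TLDs are skipped quietly.
from Src.Api.Template.Util.get_domain import check_url_for_content

found = check_url_for_content(
    url="https://altadefinizione.example",  # placeholder host
    content='<meta name="generator" content="altadefinizione">',
    timeout=2,
)
print(found)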
@@ -11,9 +11,10 @@ from unidecode import unidecode


 # Internal utilities
-from Src.Util.table import TVShowManager
 from Src.Util.console import console
 from Src.Util._jsonConfig import config_manager
+from Src.Util.table import TVShowManager
+from ..Template import search_domain


 # Logic class
@@ -65,28 +66,6 @@ def get_token(site_name: str, domain: str) -> dict:
     }


-def update_domain():
-    """
-    Update the domain for the anime streaming site.
-
-    This function tests the accessibility of the current anime streaming site.
-    If the current domain is inaccessible, it attempts to obtain and set a new domain.
-    It uses the 'light' method to extract a new domain from Anime Unity.
-    """
-
-    # Test current site's accessibility
-    try:
-        console.log(f"[cyan]Test site: [red]https://{SITE_NAME}.{DOMAIN_NOW}")
-        response = httpx.get(f"https://www.{SITE_NAME}.{DOMAIN_NOW}")
-        response.raise_for_status()
-
-    except Exception as e:
-        console.log("[red]Upload domain")
-        sys.exit(0)
-
-
 def get_real_title(record):
     """
     Get the real title from a record.
@@ -122,12 +101,9 @@ def title_search(title: str) -> int:
     - int: A number containing the length of media search manager.
     """

-    # Update domain
-    update_domain()
-
     # Get token and session value from configuration
-    url_domain = config_manager.get('SITE', SITE_NAME)
-    data = get_token(SITE_NAME, url_domain)
+    domain_to_use, _ = search_domain(SITE_NAME, '<meta name="author" content="AnimeUnity Staff">', f"https://www.{SITE_NAME}")
+    data = get_token(SITE_NAME, domain_to_use)

     # Prepare cookies to be used in the request
     cookies = {
@@ -148,7 +124,7 @@ def title_search(title: str) -> int:
     }

     # Send a POST request to the API endpoint for live search
-    response = httpx.post(f'https://www.{SITE_NAME}.{url_domain}/livesearch', cookies=cookies, headers=headers, json=json_data)
+    response = httpx.post(f'https://www.{SITE_NAME}.{domain_to_use}/livesearch', cookies=cookies, headers=headers, json=json_data)
     response.raise_for_status()

     # Process each record returned in the response
@@ -15,9 +15,10 @@ from unidecode import unidecode

 # Internal utilities
 from Src.Util.headers import get_headers
-from Src.Util._jsonConfig import config_manager
 from Src.Util.console import console
 from Src.Util.table import TVShowManager
+from ..Template import search_domain
+


 # Logic class
@@ -75,45 +76,15 @@ def get_version(text: str) -> tuple[str, list]:
         raise


-def get_version_and_domain(new_domain = None) -> Tuple[str, str]:
-    """
-    Retrieves the version and domain of the streaming website.
-
-    This function retrieves the version and domain of the streaming website.
-    It first checks the accessibility of the current site.
-    If the site is accessible, it extracts the version from the response.
-    If configured to do so, it also scrapes and prints the titles of the moments.
-    If the site is inaccessible, it attempts to obtain a new domain using the 'insta' method.
-
-    Returns:
-        Tuple[str, str]: A tuple containing the version and domain.
-    """
-
-    # Get the current domain from the configuration
-    if new_domain is None:
-        config_domain = config_manager.get('SITE', SITE_NAME)
-    else:
-        config_domain = new_domain
-
-    # Test the accessibility of the current site
-    try:
-
-        # Make requests to site to get text
-        console.print(f"[cyan]Test site[white]: [red]https://{SITE_NAME}.{config_domain}")
-        response = httpx.get(f"https://{SITE_NAME}.{config_domain}")
-        response.raise_for_status()
-
-        console.print(f"[cyan]Test respost site[white]: [red]{response.status_code} \n")
-
-        # Extract version from the response
-        version, list_title_top_10 = get_version(response.text)
-
-        return version, config_domain
-
-    except:
-
-        console.log("[red]Upload domain.")
-        sys.exit(0)
+def get_version_and_domain() -> Tuple[str, str]:
+
+    # Find a new domain if the previous one no longer works
+    domain_to_use, base_url = search_domain(SITE_NAME, '<meta name="author" content="StreamingCommunity">', f"https://{SITE_NAME}")
+
+    # Extract version from the response
+    version, list_title_top_10 = get_version(httpx.get(base_url, headers={'user-agent': get_headers()}).text)
+
+    return version, domain_to_use


 def title_search(title_search: str, domain: str) -> int:
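Note: with the rewritten signature, callers no longer pass in or probe a config domain themselves. A minimal sketch of the new call site (the print formatting is illustrative):

# The old new_domain parameter and the manual try/except probe are gone;
# search_domain() handles both the happy path and the TLD scan internally.
version, domain = get_version_and_domain()
print(f"version={version} domain={domain}")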
@@ -1,12 +1,11 @@
 # 29.04.24

 import httpx
-import json
 from bs4 import BeautifulSoup


 # URL of the webpage containing the table
-url = 'https://icannwiki.org/New_gTLD_Generic_Applications'
+url = 'https://icannwiki.org/All_New_gTLD_Applications'


 # List to store scraped data
@@ -78,8 +77,9 @@ def main():
     print(len(data))

     # Write the scraped data to a JSON file
-    with open('data.json', 'w') as json_file:
-        json.dump(data, json_file)
+    with open('data.txt', 'w') as json_file:
+        for find_tld in data:
+            json_file.write(find_tld['application_id'] + "\n")


 if __name__ == '__main__':
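Note: the scraper now writes one TLD per line, which is exactly the shape get_top_level_domain() reads back. A minimal sketch of that read path, mirroring the commit's own logic ("examplesite" is a placeholder):

import os

# One TLD per line, lowercased and deduplicated, then appended to the base URL.
path_file = os.path.join("Test", "data", "TLD", "tld_list_complete.txt")
with open(path_file, "r") as file:
    urls = list({f"https://examplesite.{x.strip().lower()}" for x in file})
print(urls[:3])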
1490  Test/data/TLD/tld_list_complete.txt  Normal file
File diff suppressed because it is too large