mirror of
https://github.com/Arrowar/StreamingCommunity.git
synced 2025-06-07 20:15:24 +00:00
Add new auto find new domain.
This commit is contained in:
parent
678e39cc46
commit
6036bbeb20
192
Src/Api/Template/Util/get_domain.py
Normal file
192
Src/Api/Template/Util/get_domain.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
# 18.06.24
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
|
# External libraries
|
||||||
|
import httpx
|
||||||
|
import psutil
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# Internal utilities
|
||||||
|
from Src.Util.color import Colors
|
||||||
|
from Src.Util.headers import get_headers
|
||||||
|
from Src.Util.console import console
|
||||||
|
from Src.Util._jsonConfig import config_manager
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def check_url_for_content(url: str, content: str, timeout: int = 1) -> bool:
    """
    Check whether the response body of a URL contains a given string.

    Args:
        url (str): The URL to check.
        content (str): The content to search for in the response.
        timeout (int): Timeout for the request in seconds.

    Returns:
        bool: True if the request succeeds and the content is found,
        False on any request/HTTP error or when the content is absent.
    """
    try:
        response = httpx.get(url, timeout=timeout, headers={'user-agent': get_headers()})
        logging.info(f"Testing site to extract domain: {url}, response: {response.status_code}")

        # Raise an error if the status is not successful
        response.raise_for_status()

        # The membership test already yields the boolean we need —
        # no `if ...: return True` fall-through required.
        return content in response.text

    except httpx.RequestError as e:
        # Network-level failure (DNS, connect, timeout, ...)
        logging.warning(f"Request error for {url}: {e}")

    except httpx.HTTPStatusError as e:
        # Non-2xx response raised by raise_for_status()
        logging.warning(f"HTTP status error for {url}: {e}")

    except Exception as e:
        # Deliberate best-effort catch-all: a probe failure must never
        # abort the whole TLD scan.
        logging.warning(f"Error for {url}: {e}")

    return False
|
||||||
|
|
||||||
|
def get_top_level_domain(base_url: str, target_content: str, max_workers: int = os.cpu_count(), timeout: int = 2, retries: int = 1) -> str:
    """
    Probe `base_url` combined with every TLD from a local list and return the
    first TLD whose page contains `target_content`.

    Args:
        - base_url (str): The base URL (scheme + host, no TLD) used to construct complete URLs.
        - target_content (str): The content to search for in the response.
        - max_workers (int): Maximum number of threads.
          NOTE: the default is evaluated once at import time via os.cpu_count().
        - timeout (int): Timeout for each request in seconds.
        - retries (int): Number of attempts per URL before marking it failed.

    Returns:
        str: The found TLD, or None when no candidate matched
        (annotation says `str`, but the final branch returns None).

    Raises:
        FileNotFoundError: if Test/data/TLD/tld_list_complete.txt is missing.
    """
    results = []          # TLDs that matched target_content
    failed_urls = []      # URLs that exhausted all retries (appended from worker threads)
    path_file = os.path.join("Test", "data", "TLD", "tld_list_complete.txt")
    logging.info(f"Loading file: {path_file}")

    if not os.path.exists(path_file):
        raise FileNotFoundError("The file 'tld_list_complete.txt' does not exist.")

    # Read TLDs from file and create candidate URLs to test
    with open(path_file, "r") as file:
        urls = [f"{base_url}.{x.strip().lower()}" for x in file]
        urls = list(set(urls))  # Remove duplicates

    start_time = time.time()

    # tqdm format string built with ANSI color placeholders from Colors
    bar_format=f"{Colors.YELLOW}Testing URLS{Colors.WHITE}: {Colors.RED}{{percentage:.2f}}% {Colors.MAGENTA}{{bar}} {Colors.WHITE}[ {Colors.YELLOW}{{n_fmt}}{Colors.WHITE} / {Colors.RED}{{total_fmt}} {Colors.WHITE}] {Colors.YELLOW}{{elapsed}} {Colors.WHITE}< {Colors.CYAN}{{remaining}}{Colors.GREEN}{{postfix}} {Colors.WHITE}]"
    progress_bar = tqdm(
        total=len(urls),
        unit='url',
        ascii='░▒█',
        bar_format=bar_format
    )

    # Event to signal all workers to stop once a match is found
    stop_event = threading.Event()

    def url_checker(url: str):
        # Worker run in the thread pool; closes over stop_event, progress_bar,
        # failed_urls. Returns the matched TLD or None.
        for attempt in range(retries):
            # Another worker already found a match — bail out early.
            # NOTE(review): this early return does not update the progress bar,
            # so the bar may not reach 100% after a hit.
            if stop_event.is_set():
                return None

            if check_url_for_content(url, target_content, timeout):
                stop_event.set()
                progress_bar.update(1)
                # TLD is everything after the last dot of the probed URL
                return url.split(".")[-1]

            logging.info(f"Retrying {url} ({attempt+1}/{retries})")

        # All retries exhausted; list.append is atomic under the GIL,
        # so this is safe from multiple threads.
        failed_urls.append(url)
        progress_bar.update(1)
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(url_checker, url): url for url in urls}

        for future in as_completed(futures):
            tld = future.result()

            if tld:
                results.append(tld)
                # A match was found — stop consuming results; remaining workers
                # exit quickly via the stop_event check.
                if stop_event.is_set():
                    break

            # Update the progress bar with CPU usage info
            progress_bar.set_postfix(cpu_usage=f"{psutil.cpu_percent()}%")

    progress_bar.close()

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_url = total_time / len(urls) if urls else 0

    logging.info(f"Tested {len(urls)} URLs: {len(results)} passed, {len(failed_urls)} failed.")
    logging.info(f"Total time: {total_time:.2f} seconds, Average time per URL: {avg_time_per_url:.2f} seconds.")

    if results:
        # In practice at most one result is collected before the break above.
        return results[-1]
    else:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def search_domain(site_name: str, target_content: str, base_url: str):
    """
    Search for a valid domain for the given site name and base URL.

    Tries the domain stored in config.json first; if it is unreachable,
    scans the TLD list for a working replacement and persists it.

    Args:
        site_name (str): The name of the site to search the domain for.
        target_content (str): The content to search for in the response.
        base_url (str): The base URL to construct complete URLs.

    Returns:
        tuple: The found domain and the complete URL.
        Exits the process when no working domain can be found.
    """
    # Extract config domain
    domain = config_manager.get("SITE", site_name)
    console.print(f"[cyan]Test site[white]: [red]{base_url}.{domain}")

    try:
        # Test the current domain
        response = httpx.get(f"{base_url}.{domain}", headers={'user-agent': get_headers()}, timeout=2)
        console.print(f"[cyan]Test response site[white]: [red]{response.status_code}")
        response.raise_for_status()

        # Return config domain
        console.print(f"[cyan]Use domain: [red]{domain}")
        return domain, f"{base_url}.{domain}"

    except Exception as e:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrow it and record why the domain failed.
        logging.info(f"Configured domain check failed for {base_url}.{domain}: {e}")

        # If the current domain fails, find a new one
        print()
        console.print("[red]Extract new DOMAIN from TLD list.")
        new_domain = get_top_level_domain(base_url=base_url, target_content=target_content)

        if new_domain is not None:

            # Update domain in config.json
            config_manager.set_key('SITE', site_name, new_domain)
            config_manager.write_config()

            # Return new config domain
            console.print(f"[cyan]Use domain: [red]{new_domain}")
            return new_domain, f"{base_url}.{new_domain}"

        else:
            logging.error(f"Failed to find a new domain for: {base_url}")
            # NOTE(review): exits with code 0 on failure — a nonzero code would
            # better signal the error to callers; kept for compatibility.
            sys.exit(0)
|
1
Src/Api/Template/__init__.py
Normal file
1
Src/Api/Template/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from .Util.get_domain import search_domain
|
@ -11,9 +11,10 @@ from unidecode import unidecode
|
|||||||
|
|
||||||
|
|
||||||
# Internal utilities
|
# Internal utilities
|
||||||
from Src.Util.table import TVShowManager
|
|
||||||
from Src.Util.console import console
|
|
||||||
from Src.Util.headers import get_headers
|
from Src.Util.headers import get_headers
|
||||||
|
from Src.Util.console import console
|
||||||
|
from Src.Util.table import TVShowManager
|
||||||
|
from ..Template import search_domain
|
||||||
|
|
||||||
|
|
||||||
# Logic class
|
# Logic class
|
||||||
@ -38,8 +39,11 @@ def title_search(title_search: str) -> int:
|
|||||||
int: The number of titles found.
|
int: The number of titles found.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Find new domain if prev dont work
|
||||||
|
domain_to_use, _ = search_domain(SITE_NAME, '<meta name="generator" content="altadefinizione">', f"https://{SITE_NAME}")
|
||||||
|
|
||||||
# Send request to search for titles
|
# Send request to search for titles
|
||||||
response = httpx.get(f"https://{SITE_NAME}.{DOMAIN_NOW}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
|
response = httpx.get(f"https://{SITE_NAME}.{domain_to_use}/page/1/?story={unidecode(title_search.replace(' ', '+'))}&do=search&subaction=search&titleonly=3", headers={'user-agent': get_headers()})
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# Create soup and find table
|
# Create soup and find table
|
||||||
|
@ -11,9 +11,10 @@ from unidecode import unidecode
|
|||||||
|
|
||||||
|
|
||||||
# Internal utilities
|
# Internal utilities
|
||||||
from Src.Util.table import TVShowManager
|
|
||||||
from Src.Util.console import console
|
from Src.Util.console import console
|
||||||
from Src.Util._jsonConfig import config_manager
|
from Src.Util._jsonConfig import config_manager
|
||||||
|
from Src.Util.table import TVShowManager
|
||||||
|
from ..Template import search_domain
|
||||||
|
|
||||||
|
|
||||||
# Logic class
|
# Logic class
|
||||||
@ -65,28 +66,6 @@ def get_token(site_name: str, domain: str) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def update_domain():
|
|
||||||
"""
|
|
||||||
Update the domain for the anime streaming site.
|
|
||||||
|
|
||||||
This function tests the accessibility of the current anime streaming site.
|
|
||||||
If the current domain is inaccessible, it attempts to obtain and set a new domain.
|
|
||||||
It uses the 'light' method to extract a new domain from Anime Unity.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Test current site's accessibility
|
|
||||||
try:
|
|
||||||
|
|
||||||
console.log(f"[cyan]Test site: [red]https://{SITE_NAME}.{DOMAIN_NOW}")
|
|
||||||
response = httpx.get(f"https://www.{SITE_NAME}.{DOMAIN_NOW}")
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
|
|
||||||
console.log("[red]Upload domain")
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
|
|
||||||
def get_real_title(record):
|
def get_real_title(record):
|
||||||
"""
|
"""
|
||||||
Get the real title from a record.
|
Get the real title from a record.
|
||||||
@ -122,12 +101,9 @@ def title_search(title: str) -> int:
|
|||||||
- int: A number containing the length of media search manager.
|
- int: A number containing the length of media search manager.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Update domain
|
|
||||||
update_domain()
|
|
||||||
|
|
||||||
# Get token and session value from configuration
|
# Get token and session value from configuration
|
||||||
url_domain = config_manager.get('SITE', SITE_NAME)
|
domain_to_use, _ = search_domain(SITE_NAME, '<meta name="author" content="AnimeUnity Staff">', f"https://www.{SITE_NAME}")
|
||||||
data = get_token(SITE_NAME, url_domain)
|
data = get_token(SITE_NAME, domain_to_use)
|
||||||
|
|
||||||
# Prepare cookies to be used in the request
|
# Prepare cookies to be used in the request
|
||||||
cookies = {
|
cookies = {
|
||||||
@ -148,7 +124,7 @@ def title_search(title: str) -> int:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Send a POST request to the API endpoint for live search
|
# Send a POST request to the API endpoint for live search
|
||||||
response = httpx.post(f'https://www.{SITE_NAME}.{url_domain}/livesearch', cookies=cookies, headers=headers, json=json_data)
|
response = httpx.post(f'https://www.{SITE_NAME}.{domain_to_use}/livesearch', cookies=cookies, headers=headers, json=json_data)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# Process each record returned in the response
|
# Process each record returned in the response
|
||||||
|
@ -15,9 +15,10 @@ from unidecode import unidecode
|
|||||||
|
|
||||||
# Internal utilities
|
# Internal utilities
|
||||||
from Src.Util.headers import get_headers
|
from Src.Util.headers import get_headers
|
||||||
from Src.Util._jsonConfig import config_manager
|
|
||||||
from Src.Util.console import console
|
from Src.Util.console import console
|
||||||
from Src.Util.table import TVShowManager
|
from Src.Util.table import TVShowManager
|
||||||
|
from ..Template import search_domain
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Logic class
|
# Logic class
|
||||||
@ -75,45 +76,15 @@ def get_version(text: str) -> tuple[str, list]:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
def get_version_and_domain(new_domain = None) -> Tuple[str, str]:
|
def get_version_and_domain() -> Tuple[str, str]:
|
||||||
"""
|
|
||||||
Retrieves the version and domain of the streaming website.
|
|
||||||
|
|
||||||
This function retrieves the version and domain of the streaming website.
|
# Find new domain if prev dont work
|
||||||
It first checks the accessibility of the current site.
|
domain_to_use, base_url = search_domain(SITE_NAME, '<meta name="author" content="StreamingCommunity">', f"https://{SITE_NAME}")
|
||||||
If the site is accessible, it extracts the version from the response.
|
|
||||||
If configured to do so, it also scrapes and prints the titles of the moments.
|
|
||||||
If the site is inaccessible, it attempts to obtain a new domain using the 'insta' method.
|
|
||||||
|
|
||||||
Returns:
|
# Extract version from the response
|
||||||
Tuple[str, str]: A tuple containing the version and domain.
|
version, list_title_top_10 = get_version(httpx.get(base_url, headers={'user-agent': get_headers()}).text)
|
||||||
"""
|
|
||||||
|
|
||||||
# Get the current domain from the configuration
|
return version, domain_to_use
|
||||||
if new_domain is None:
|
|
||||||
config_domain = config_manager.get('SITE', SITE_NAME)
|
|
||||||
else:
|
|
||||||
config_domain = new_domain
|
|
||||||
|
|
||||||
# Test the accessibility of the current site
|
|
||||||
try:
|
|
||||||
|
|
||||||
# Make requests to site to get text
|
|
||||||
console.print(f"[cyan]Test site[white]: [red]https://{SITE_NAME}.{config_domain}")
|
|
||||||
response = httpx.get(f"https://{SITE_NAME}.{config_domain}")
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
console.print(f"[cyan]Test respost site[white]: [red]{response.status_code} \n")
|
|
||||||
|
|
||||||
# Extract version from the response
|
|
||||||
version, list_title_top_10 = get_version(response.text)
|
|
||||||
|
|
||||||
return version, config_domain
|
|
||||||
|
|
||||||
except:
|
|
||||||
|
|
||||||
console.log("[red]Upload domain.")
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
|
|
||||||
def title_search(title_search: str, domain: str) -> int:
|
def title_search(title_search: str, domain: str) -> int:
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
# 29.04.24
|
# 29.04.24
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
# URL of the webpage containing the table
|
# URL of the webpage containing the table
|
||||||
url = 'https://icannwiki.org/New_gTLD_Generic_Applications'
|
url = 'https://icannwiki.org/All_New_gTLD_Applications'
|
||||||
|
|
||||||
|
|
||||||
# List to store scraped data
|
# List to store scraped data
|
||||||
@ -78,8 +77,9 @@ def main():
|
|||||||
print(len(data))
|
print(len(data))
|
||||||
|
|
||||||
# Write the scraped data to a JSON file
|
# Write the scraped data to a JSON file
|
||||||
with open('data.json', 'w') as json_file:
|
with open('data.txt', 'w') as json_file:
|
||||||
json.dump(data, json_file)
|
for find_tld in data:
|
||||||
|
json_file.write(find_tld['application_id'] + "\n")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
1490
Test/data/TLD/tld_list_complete.txt
Normal file
1490
Test/data/TLD/tld_list_complete.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user