diff --git a/README.md b/README.md
index 13d647f..e04796f 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ You can chat, help improve this repo, or just hang around for some fun in the **
 * [CONFIGURATION](#Configuration)
 * [DOCKER](#docker)
 * [TUTORIAL](#tutorial)
+* [TO DO](#to-do)
 
 ## Requirement
 
@@ -232,3 +233,7 @@ docker run -it -p 8000:8000 -v /path/to/download:/app/Video streaming-community-
 
 ## Tutorial
 For a detailed walkthrough, refer to the [video tutorial](https://www.youtube.com/watch?v=Ok7hQCgxqLg&ab_channel=Nothing)
+
+## To do
+- Add a GUI
+- Add a website API
\ No newline at end of file
diff --git a/Src/Api/Class/Video.py b/Src/Api/Class/Video.py
index f12634a..74002c5 100644
--- a/Src/Api/Class/Video.py
+++ b/Src/Api/Class/Video.py
@@ -1,6 +1,5 @@
 # 01.03.24
 
-import requests
 import re
 import json
 import binascii
@@ -9,12 +8,12 @@ from urllib.parse import urljoin, urlencode, quote
 
 
 # External libraries
-import requests
 from bs4 import BeautifulSoup
 
 
 # Internal utilities
 from Src.Util.headers import get_headers
+from Src.Lib.Request import requests
 from .SeriesType import TitleManager
 from .EpisodeType import EpisodeManager
 from .WindowType import WindowVideo, WindowParameter
diff --git a/Src/Api/Util/__init__.py b/Src/Api/Util/__init__.py
new file mode 100644
index 0000000..777fc93
--- /dev/null
+++ b/Src/Api/Util/__init__.py
@@ -0,0 +1,3 @@
+# 29.04.24
+
+from .extract_domain import grab_sc_top_level_domain as get_sc_domain
\ No newline at end of file
diff --git a/Src/Api/Util/extract_domain.py b/Src/Api/Util/extract_domain.py
new file mode 100644
index 0000000..d3aa0f4
--- /dev/null
+++ b/Src/Api/Util/extract_domain.py
@@ -0,0 +1,128 @@
+# 29.04.24
+
+import threading
+import logging
+import os
+
+
+# Internal utilities
+from Src.Lib.Google import search as google_search
+from Src.Lib.Request import requests
+
+
+def check_url_for_content(url: str, content: str) -> bool:
+    """
+    Check if a URL contains specific content.
+
+    Args:
+        url (str): The URL to check.
+        content (str): The content to search for in the response.
+
+    Returns:
+        bool: True if the content is found, False otherwise.
+    """
+    try:
+        r = requests.get(url, timeout = 1)
+        if r.status_code == 200 and content in r.text:
+            return True
+    except Exception as e:
+        pass
+    return False
+
+def grab_top_level_domain(base_url: str, target_content: str) -> str:
+    """
+    Find a working top-level domain (TLD) for a base URL by probing a list of candidate TLDs.
+
+    Args:
+        base_url (str): The base URL used to construct the complete URLs.
+        target_content (str): The content to search for in the response.
+
+    Returns:
+        str: The found TLD, if any.
+    """
+    results = []
+    threads = []
+
+    def url_checker(url: str):
+        if check_url_for_content(url, target_content):
+            results.append(url.split(".")[-1])
+
+    if not os.path.exists("tld_list.txt"):
+        raise FileNotFoundError("The file 'tld_list.txt' does not exist.")
+
+    urls = [f"{base_url}.{x.strip().lower()}" for x in open("tld_list.txt", "r")]
+
+    for url in urls:
+        thread = threading.Thread(target=url_checker, args=(url,))
+        thread.start()
+        threads.append(thread)
+
+    for thread in threads:
+        thread.join()
+
+    if results:
+        return results[-1]
+
+def grab_top_level_domain_light(query: str) -> str:
+    """
+    Get the top-level domain (TLD) using a light method via Google search.
+
+    Args:
+        query (str): The search query for Google search.
+
+    Returns:
+        str: The found TLD, if any.
+ """ + for result in google_search(query, num=1, stop=1, pause=2): + return result.split(".", 2)[-1].replace("/", "") + +def grab_sc_top_level_domain(method: str) -> str: + """ + Get the top-level domain (TLD) for the streaming community. + + Args: + method (str): The method to use to obtain the TLD ("light" or "strong"). + + Returns: + str: The found TLD, if any. + """ + if method == "light": + return grab_top_level_domain_light("streaming community") + elif method == "strong": + return grab_top_level_domain("https://streamingcommunity", '') + +def grab_au_top_level_domain(method: str) -> str: + """ + Get the top-level domain (TLD) for Anime Unity. + + Args: + method (str): The method to use to obtain the TLD ("light" or "strong"). + + Returns: + str: The found TLD, if any. + """ + if method == "light": + return grab_top_level_domain_light("animeunity") + elif method == "strong": + return grab_top_level_domain("https://www.animeunity", '') + +def compose_both_top_level_domains(method: str) -> dict: + """ + Compose TLDs for both the streaming community and Anime Unity. + + Args: + method (str): The method to use to obtain the TLD ("light" or "strong"). + + Returns: + dict: A dictionary containing the TLDs for the streaming community and Anime Unity. + """ + sc_tld = grab_sc_top_level_domain(method) + au_tld = grab_au_top_level_domain(method) + + if not sc_tld: + sc_tld = grab_sc_top_level_domain("strong") + + if not au_tld: + au_tld = grab_au_top_level_domain("strong") + + return {"streaming_community": sc_tld, "anime_unity": au_tld} \ No newline at end of file diff --git a/Src/Api/anime.py b/Src/Api/anime.py index a0200cb..7456d60 100644 --- a/Src/Api/anime.py +++ b/Src/Api/anime.py @@ -4,13 +4,10 @@ import os import logging -# External libraries -import requests - - # Internal utilities from Src.Util.console import console, msg from Src.Util.config import config_manager +from Src.Lib.Request import requests from Src.Lib.FFmpeg.my_m3u8 import Downloader from Src.Util.message import start_message from .Class import VideoSource diff --git a/Src/Api/site.py b/Src/Api/site.py index a773bac..b02793d 100644 --- a/Src/Api/site.py +++ b/Src/Api/site.py @@ -7,15 +7,16 @@ from typing import Tuple # External libraries -import requests from bs4 import BeautifulSoup # Internal utilities from Src.Util.table import TVShowManager from Src.Util.headers import get_headers +from Src.Lib.Request import requests from Src.Util.console import console from Src.Util.config import config_manager +from .Util import get_sc_domain from .Class import MediaManager, MediaItem @@ -109,10 +110,6 @@ def get_moment_titles(domain: str, version: str, prefix: str): return None -def get_domain() -> str: - pass - - def test_site(domain: str) -> str: """ Tests the availability of a website. 
@@ -180,7 +177,7 @@ def get_version_and_domain() -> Tuple[str, str]:
     response_test_site = test_site(config_domain)
 
     if response_test_site is None:
-        config_domain = get_domain()
+        config_domain = get_sc_domain('light')
         response_test_site = test_site(config_domain)
 
     if response_test_site:
diff --git a/Src/Lib/FFmpeg/my_m3u8.py b/Src/Lib/FFmpeg/my_m3u8.py
index 2333b4e..54f48b0 100644
--- a/Src/Lib/FFmpeg/my_m3u8.py
+++ b/Src/Lib/FFmpeg/my_m3u8.py
@@ -14,7 +14,6 @@ warnings.filterwarnings("ignore", category=UserWarning, module="cryptography")
 
 
 # External libraries
-import requests
 from tqdm.rich import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -22,6 +21,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 # Internal utilities
 from Src.Util.console import console
 from Src.Util.headers import get_headers
+from Src.Lib.Request import requests
 from Src.Util.config import config_manager
 from Src.Util.os import (
     remove_folder,
diff --git a/Src/Lib/FFmpeg/util/parser.py b/Src/Lib/FFmpeg/util/parser.py
index a9fb229..280a743 100644
--- a/Src/Lib/FFmpeg/util/parser.py
+++ b/Src/Lib/FFmpeg/util/parser.py
@@ -5,10 +5,10 @@
 import logging
 
 
 # Internal utilities
 from Src.Util.headers import get_headers
+from Src.Lib.Request import requests
 
 
 # External libraries
-import requests
 from m3u8 import M3U8
diff --git a/Src/Lib/Google/__init__.py b/Src/Lib/Google/__init__.py
new file mode 100644
index 0000000..a9d17bb
--- /dev/null
+++ b/Src/Lib/Google/__init__.py
@@ -0,0 +1,3 @@
+# 29.04.24
+
+from .page import search
\ No newline at end of file
diff --git a/Src/Lib/Google/page.py b/Src/Lib/Google/page.py
new file mode 100644
index 0000000..6452652
--- /dev/null
+++ b/Src/Lib/Google/page.py
@@ -0,0 +1,140 @@
+# 29.04.24
+
+import time
+import ssl
+from urllib.request import Request, urlopen
+from urllib.parse import quote_plus, urlparse, parse_qs
+from bs4 import BeautifulSoup
+
+from typing import Generator, Optional
+
+
+# Internal utilities
+from Src.Lib.Request import requests
+
+
+
+def get_page(url: str) -> str:
+    """
+    Fetches the HTML content of a webpage given its URL.
+
+    Args:
+        url (str): The URL of the webpage.
+
+    Returns:
+        str: The HTML content of the webpage.
+    """
+
+    response = requests.get(url)
+    return response.text
+
+
+def filter_result(link: str) -> Optional[str]:
+    """
+    Filters search result links to remove unwanted ones.
+
+    Args:
+        link (str): The URL of the search result.
+
+    Returns:
+        Optional[str]: The filtered URL if valid, None otherwise.
+    """
+    try:
+        if link.startswith('/url?'):
+
+            # Extract the actual URL from Google's redirect link
+            o = urlparse(link, 'http')
+            link = parse_qs(o.query)['q'][0]
+
+        o = urlparse(link, 'http')
+
+        # Filter out Google links
+        if o.netloc and 'google' not in o.netloc:
+            return link
+
+    except Exception:
+        pass
+
+
+def search(query: str, num: int = 10, stop: Optional[int] = None, pause: float = 2.0) -> Generator[str, None, None]:
+    """
+    Performs a Google search and yields the URLs of search results.
+
+    Args:
+        query (str): The search query.
+        num (int): Number of results to fetch per request. Default is 10.
+        stop (int, optional): Total number of results to retrieve. Default is None.
+        pause (float): Pause duration between requests. Default is 2.0.
+
+    Yields:
+        str: The URL of a search result.
+
+    Example:
+        >>> for url in search("Python tutorials", num=5, stop=10):
+        ...     print(url)
+        ...
+        https://www.python.org/about/gettingstarted/
+    """
+
+    # Set to store unique URLs
+    hashes = set()
+
+    # Counter for the number of fetched URLs
+    count = 0
+
+    # Encode the query for URL
+    query = quote_plus(query)
+
+    while not stop or count < stop:
+        last_count = count
+
+        # Construct the Google search URL
+        url = f"https://www.google.com/search?client=opera&q={query}&sourceid=opera&oe=UTF-8"
+
+        # Pause before making the request
+        time.sleep(pause)
+
+        # Fetch the HTML content of the search page
+        html = get_page(url)
+        soup = BeautifulSoup(html, 'html.parser')
+
+        try:
+            # Find all anchor tags containing search result links
+            anchors = soup.find(id='search').findAll('a')
+        except AttributeError:
+            # Handle cases where search results are not found in the usual div
+            gbar = soup.find(id='gbar')
+            if gbar:
+                gbar.clear()
+            anchors = soup.findAll('a')
+
+        # Iterate over each anchor tag
+        for a in anchors:
+            try:
+                link = a['href']
+            except KeyError:
+                continue
+
+            # Filter out unwanted links
+            link = filter_result(link)
+            if not link:
+                continue
+
+            # Check for duplicate URLs
+            h = hash(link)
+            if h in hashes:
+                continue
+            hashes.add(h)
+
+            # Yield the valid URL
+            yield link
+
+            # Increment the counter
+            count += 1
+            # Check if the desired number of URLs is reached
+            if stop and count >= stop:
+                return
+
+        # Break the loop if no new URLs are found
+        if last_count == count:
+            break
diff --git a/Src/Lib/Request/__init__.py b/Src/Lib/Request/__init__.py
new file mode 100644
index 0000000..ca582dc
--- /dev/null
+++ b/Src/Lib/Request/__init__.py
@@ -0,0 +1,3 @@
+# 21.04.24
+
+from .my_requests import requests
\ No newline at end of file
diff --git a/Src/Lib/Request/my_requests.py b/Src/Lib/Request/my_requests.py
index 8148d0a..8ebcfb8 100644
--- a/Src/Lib/Request/my_requests.py
+++ b/Src/Lib/Request/my_requests.py
@@ -9,7 +9,8 @@ import re
 import urllib.parse
 import urllib.request
 import urllib.error
-from typing import Dict, Optional, Union
+
+from typing import Dict, Optional, Union, TypedDict, Any
 
 try:
     from typing import Unpack
@@ -22,9 +23,13 @@ except ImportError:
         "Please make sure you have the necessary libraries installed.")
 
 
+# External library
+from bs4 import BeautifulSoup
+
+
 # Constants
-HTTP_TIMEOUT = 4
-HTTP_RETRIES = 2
+HTTP_TIMEOUT = 3
+HTTP_RETRIES = 1
 HTTP_DELAY = 1
 
 
@@ -95,11 +100,42 @@ class Response:
             raise RequestError(f"Request failed with status code {self.status_code}")
 
     def json(self):
-        """Return the response content as JSON if it is JSON."""
+        """
+        Return the response content as JSON if it is JSON.
+
+        Returns:
+            dict or list or None: A Python dictionary or list parsed from JSON if the response content is JSON, otherwise None.
+        """
         if self.is_json:
             return json.loads(self.text)
         else:
             return None
+
+    def get_redirects(self):
+        """
+        Extracts unique site URLs from the <link> elements in the HTML <head> section.
+
+        Returns:
+            list or None: A list of unique site URLs if found, otherwise None.
+ """ + + site_find = [] + + if self.text: + soup = BeautifulSoup(self.text, "html.parser") + + for links in soup.find("head").find_all('link'): + if links is not None: + parsed_url = urllib.parse.urlparse(links.get('href')) + site = parsed_url.scheme + "://" + parsed_url.netloc + + if site not in site_find: + site_find.append(site) + + if site_find: + return site_find + else: + return None class ManageRequests: @@ -116,6 +152,7 @@ class ManageRequests: auth: Optional[tuple] = None, proxy: Optional[str] = None, cookies: Optional[Dict[str, str]] = None, + json_data: Optional[Dict[str, Any]] = None, redirection_handling: bool = True, ): """ @@ -136,7 +173,7 @@ class ManageRequests: """ self.url = url self.method = method - self.headers = headers or {'User-Agent': 'Mozilla/5.0'} + self.headers = headers or {} self.timeout = timeout self.retries = retries self.params = params @@ -144,6 +181,7 @@ class ManageRequests: self.auth = auth self.proxy = proxy self.cookies = cookies + self.json_data = json_data self.redirection_handling = redirection_handling def add_header(self, key: str, value: str) -> None: @@ -152,6 +190,7 @@ class ManageRequests: def send(self) -> Response: """Send the HTTP request.""" + start_time = time.time() self.attempt = 0 redirect_url = None @@ -160,24 +199,53 @@ class ManageRequests: try: req = self._build_request() response = self._perform_request(req) + return self._process_response(response, start_time, redirect_url) + except (urllib.error.URLError, urllib.error.HTTPError) as e: self._handle_error(e) - attempt += 1 + self.attempt += 1 def _build_request(self) -> urllib.request.Request: """Build the urllib Request object.""" headers = self.headers.copy() + if self.params: url = self.url + '?' + urllib.parse.urlencode(self.params) else: url = self.url + req = urllib.request.Request(url, headers=headers, method=self.method) + + if self.json_data: + req.add_header('Content-Type', 'application/json') + req.body = json.dumps(self.json_data).encode('utf-8') + else: + req = urllib.request.Request(url, headers=headers, method=self.method) + if self.auth: req.add_header('Authorization', 'Basic ' + base64.b64encode(f"{self.auth[0]}:{self.auth[1]}".encode()).decode()) + if self.cookies: cookie_str = '; '.join([f"{name}={value}" for name, value in self.cookies.items()]) req.add_header('Cookie', cookie_str) + + if self.headers: + for key, value in self.headers.items(): + req.add_header(key, value) + + # Add default user agent + if True: + there_is_agent = False + + for key, value in self.headers.items(): + if str(key).lower() == 'user-agent': + there_is_agent = True + + if not there_is_agent: + default_user_agent = 'Mozilla/5.0' + req.add_header('user-agent', default_user_agent) + return req def _perform_request(self, req: urllib.request.Request) -> urllib.response.addinfourl: @@ -186,26 +254,30 @@ class ManageRequests: proxy_handler = urllib.request.ProxyHandler({'http': self.proxy, 'https': self.proxy}) opener = urllib.request.build_opener(proxy_handler) urllib.request.install_opener(opener) + if not self.verify_ssl: ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE response = urllib.request.urlopen(req, timeout=self.timeout, context=ssl_context) + else: response = urllib.request.urlopen(req, timeout=self.timeout) + return response def _process_response(self, response: urllib.response.addinfourl, start_time: float, redirect_url: Optional[str]) -> Response: """Process the HTTP response.""" response_data = 
         response_data = response.read()
         content_type = response.headers.get('Content-Type', '').lower()
-        is_response_api = "json" in content_type
+
         if self.redirection_handling and response.status in (301, 302, 303, 307, 308):
             location = response.headers.get('Location')
             logging.info(f"Redirecting to: {location}")
             redirect_url = location
             self.url = location
             return self.send()
+
         return self._build_response(response, response_data, start_time, redirect_url, content_type)
 
     def _build_response(self, response: urllib.response.addinfourl, response_data: bytes, start_time: float, redirect_url: Optional[str], content_type: str) -> Response:
@@ -216,7 +288,7 @@
 
         for cookie in response.headers.get_all('Set-Cookie', []):
             cookie_parts = cookie.split(';')
-            cookie_name, cookie_value = cookie_parts[0].split('=')
+            cookie_name, cookie_value = cookie_parts[0].split('=', 1)
             response_cookies[cookie_name.strip()] = cookie_value.strip()
 
         return Response(
@@ -234,9 +306,11 @@
     def _handle_error(self, e: Union[urllib.error.URLError, urllib.error.HTTPError]) -> None:
         """Handle request error."""
         logging.error(f"Request failed for URL '{self.url}': {str(e)}")
+
         if self.attempt < self.retries:
             logging.info(f"Retrying request for URL '{self.url}' (attempt {self.attempt}/{self.retries})")
             time.sleep(HTTP_DELAY)
+
         else:
             logging.error(f"Maximum retries reached for URL '{self.url}'")
             raise RequestError(str(e))
@@ -251,9 +325,7 @@
             r'^(?:http|ftp)s?://' # http:// or https://
             r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain...
             r'localhost|' # localhost...
-            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or IP
-            r'(?::\d+)?' # optional port
-            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', re.IGNORECASE)
         return re.match(url_regex, url) is not None
 
     @staticmethod
@@ -287,14 +359,14 @@ class SSLHandler:
             ssl_context.verify_mode = ssl.CERT_NONE
 
 
-class KwargsRequest():
-    """Class representing keyword arguments for a request."""
+class KwargsRequest(TypedDict, total = False):
     url: str
     headers: Optional[Dict[str, str]] = None
     timeout: float = HTTP_TIMEOUT
     retries: int = HTTP_RETRIES
     params: Optional[Dict[str, str]] = None
     cookies: Optional[Dict[str, str]] = None
+    json_data: Optional[Dict[str, Any]] = None
 
 
 class Request:
@@ -302,7 +374,7 @@
     def __init__(self) -> None:
        pass
 
-    def get(self, url: str, **kwargs: Unpack[KwargsRequest]):
+    def get(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
         """
         Send a GET request.
 
@@ -315,7 +387,7 @@
         """
         return self._send_request(url, 'GET', **kwargs)
 
-    def post(self, url: str, **kwargs: Unpack[KwargsRequest]):
+    def post(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
         """
         Send a POST request.
 
@@ -327,36 +399,9 @@
             Response: The response object.
         """
         return self._send_request(url, 'POST', **kwargs)
-
-    def put(self, url: str, **kwargs: Unpack[KwargsRequest]):
-        """
-        Send a PUT request.
-
-        Args:
-            url (str): The URL to which the request will be sent.
-            **kwargs: Additional keyword arguments for the request.
-
-        Returns:
-            Response: The response object.
-        """
-        return self._send_request(url, 'PUT', **kwargs)
-
-    def delete(self, url: str, **kwargs: Unpack[KwargsRequest]):
-        """
-        Send a DELETE request.
-
-        Args:
-            url (str): The URL to which the request will be sent.
-            **kwargs: Additional keyword arguments for the request.
-
-        Returns:
-            Response: The response object.
- """ - return self._send_request(url, 'DELETE', **kwargs) - - def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]): + + def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]) -> 'Response': """Send an HTTP request.""" - # Add validation checks for URL and headers if not ValidateRequest.validate_url(url): raise ValueError("Invalid URL format") @@ -364,7 +409,6 @@ class Request: raise ValueError("Invalid header values") return ManageRequests(url, method, **kwargs).send() - - + # Out -request = Request() \ No newline at end of file +requests: Request = Request() \ No newline at end of file diff --git a/Src/Lib/UserAgent/__init__.py b/Src/Lib/UserAgent/__init__.py new file mode 100644 index 0000000..72a0157 --- /dev/null +++ b/Src/Lib/UserAgent/__init__.py @@ -0,0 +1,3 @@ +# 21.04.24 + +from .user_agent import ua \ No newline at end of file diff --git a/Src/Lib/Request/user_agent.py b/Src/Lib/UserAgent/user_agent.py similarity index 96% rename from Src/Lib/Request/user_agent.py rename to Src/Lib/UserAgent/user_agent.py index 46c3f7f..d4489d5 100644 --- a/Src/Lib/Request/user_agent.py +++ b/Src/Lib/UserAgent/user_agent.py @@ -7,10 +7,12 @@ import random import threading import json import tempfile + from typing import Dict, List + # Internal utilities -from .my_requests import request +from ..Request import requests def get_browser_user_agents_online(browser: str) -> List[str]: @@ -28,7 +30,7 @@ def get_browser_user_agents_online(browser: str) -> List[str]: try: # Make request and find all user agents - html = request.get(url).text + html = requests.get(url).text browser_user_agents = re.findall(r"(.+?)", html, re.UNICODE) return [ua for ua in browser_user_agents if "more" not in ua.lower()] @@ -103,4 +105,4 @@ class UserAgentManager: # Output -ua = UserAgentManager() \ No newline at end of file +ua: UserAgentManager = UserAgentManager() \ No newline at end of file diff --git a/Src/Upload/update.py b/Src/Upload/update.py index 9d9b4ae..9394359 100644 --- a/Src/Upload/update.py +++ b/Src/Upload/update.py @@ -1,13 +1,13 @@ # 01.03.2023 import os -import requests import time # Internal utilities from .version import __version__ from Src.Util.console import console +from Src.Lib.Request import requests # Variable diff --git a/Src/Util/headers.py b/Src/Util/headers.py index 381d192..ff978fc 100644 --- a/Src/Util/headers.py +++ b/Src/Util/headers.py @@ -4,7 +4,7 @@ import logging # Internal utilities -from Src.Lib.Request.user_agent import ua +from Src.Lib.UserAgent import ua def get_headers() -> str: diff --git a/requirements.txt b/requirements.txt index de6588c..3d213d5 100644 Binary files a/requirements.txt and b/requirements.txt differ