Add light and strong search domain @inklook

This commit is contained in:
Ghost 2024-04-29 20:23:11 +02:00
parent a60dcb2b79
commit 02edd4d5cd
17 changed files with 390 additions and 66 deletions

View File

@ -18,6 +18,7 @@ You can chat, help improve this repo, or just hang around for some fun in the **
* [CONFIGURATION](#Configuration)
* [DOCKER](#docker)
* [TUTORIAL](#tutorial)
* [TO DO](#to-do)
## Requirement
@ -232,3 +233,7 @@ docker run -it -p 8000:8000 -v /path/to/download:/app/Video streaming-community-
## Tutorial
For a detailed walkthrough, refer to the [video tutorial](https://www.youtube.com/watch?v=Ok7hQCgxqLg&ab_channel=Nothing)
## To do
- Add a GUI
- Add a website API

View File

@ -1,6 +1,5 @@
# 01.03.24
import requests
import re
import json
import binascii
@ -9,12 +8,12 @@ from urllib.parse import urljoin, urlencode, quote
# External libraries
import requests
from bs4 import BeautifulSoup
# Internal utilities
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from .SeriesType import TitleManager
from .EpisodeType import EpisodeManager
from .WindowType import WindowVideo, WindowParameter

Src/Api/Util/__init__.py Normal file
View File

@ -0,0 +1,3 @@
# 29.04.24
from .extract_domain import grab_sc_top_level_domain as get_sc_domain

View File

@ -0,0 +1,128 @@
# 29.04.24
import threading
import logging
import os
# Internal utilities
from Src.Lib.Google import search as google_search
from Src.Lib.Request import requests
def check_url_for_content(url: str, content: str) -> bool:
"""
Check if a URL contains specific content.
Args:
url (str): The URL to check.
content (str): The content to search for in the response.
Returns:
bool: True if the content is found, False otherwise.
"""
try:
r = requests.get(url, timeout = 1)
if r.status_code == 200 and content in r.text:
return True
except Exception:
pass
return False
def grab_top_level_domain(base_url: str, target_content: str) -> str:
"""
Get the top-level domain (TLD) by probing candidate URLs built from the entries in 'tld_list.txt'.
Args:
base_url (str): The base URL to construct complete URLs.
target_content (str): The content to search for in the response.
Returns:
str: The found TLD, if any.
"""
results = []
threads = []
def url_checker(url: str):
if check_url_for_content(url, target_content):
results.append(url.split(".")[-1])
if not os.path.exists("tld_list.txt"):
raise FileNotFoundError("The file 'tld_list.txt' does not exist.")
urls = [f"{base_url}.{x.strip().lower()}" for x in open("tld_list.txt", "r")]
for url in urls:
thread = threading.Thread(target=url_checker, args=(url,))
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
if results:
return results[-1]
def grab_top_level_domain_light(query: str) -> str:
"""
Get the top-level domain (TLD) using a light method via Google search.
Args:
query (str): The search query for Google search.
Returns:
str: The found TLD, if any.
"""
for result in google_search(query, num=1, stop=1, pause=2):
return result.split(".", 2)[-1].replace("/", "")
def grab_sc_top_level_domain(method: str) -> str:
"""
Get the top-level domain (TLD) for the streaming community.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
str: The found TLD, if any.
"""
if method == "light":
return grab_top_level_domain_light("streaming community")
elif method == "strong":
return grab_top_level_domain("https://streamingcommunity", '<meta name="author" content="StreamingCommunity">')
def grab_au_top_level_domain(method: str) -> str:
"""
Get the top-level domain (TLD) for Anime Unity.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
str: The found TLD, if any.
"""
if method == "light":
return grab_top_level_domain_light("animeunity")
elif method == "strong":
return grab_top_level_domain("https://www.animeunity", '<meta name="author" content="AnimeUnity Staff">')
def compose_both_top_level_domains(method: str) -> dict:
"""
Compose TLDs for both the streaming community and Anime Unity.
Args:
method (str): The method to use to obtain the TLD ("light" or "strong").
Returns:
dict: A dictionary containing the TLDs for the streaming community and Anime Unity.
"""
sc_tld = grab_sc_top_level_domain(method)
au_tld = grab_au_top_level_domain(method)
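# Fall back to the slower but more reliable "strong" probe for any site the light lookup could not resolve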
if not sc_tld:
sc_tld = grab_sc_top_level_domain("strong")
if not au_tld:
au_tld = grab_au_top_level_domain("strong")
return {"streaming_community": sc_tld, "anime_unity": au_tld}

View File

@ -4,13 +4,10 @@ import os
import logging
# External libraries
import requests
# Internal utilities
from Src.Util.console import console, msg
from Src.Util.config import config_manager
from Src.Lib.Request import requests
from Src.Lib.FFmpeg.my_m3u8 import Downloader
from Src.Util.message import start_message
from .Class import VideoSource

View File

@ -7,15 +7,16 @@ from typing import Tuple
# External libraries
import requests
from bs4 import BeautifulSoup
# Internal utilities
from Src.Util.table import TVShowManager
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from Src.Util.console import console
from Src.Util.config import config_manager
from .Util import get_sc_domain
from .Class import MediaManager, MediaItem
@ -109,10 +110,6 @@ def get_moment_titles(domain: str, version: str, prefix: str):
return None
def get_domain() -> str:
pass
def test_site(domain: str) -> str:
"""
Tests the availability of a website.
@ -180,7 +177,7 @@ def get_version_and_domain() -> Tuple[str, str]:
response_test_site = test_site(config_domain)
if response_test_site is None:
config_domain = get_domain()
config_domain = get_sc_domain('light')
response_test_site = test_site(config_domain)
if response_test_site:

View File

@ -14,7 +14,6 @@ warnings.filterwarnings("ignore", category=UserWarning, module="cryptography")
# External libraries
import requests
from tqdm.rich import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
@ -22,6 +21,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
# Internal utilities
from Src.Util.console import console
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
from Src.Util.config import config_manager
from Src.Util.os import (
remove_folder,

View File

@ -5,10 +5,10 @@ import logging
# Internal utilities
from Src.Util.headers import get_headers
from Src.Lib.Request import requests
# External libraries
import requests
from m3u8 import M3U8

View File

@ -0,0 +1,3 @@
# 29.04.24
from .page import search

Src/Lib/Google/page.py Normal file
View File

@ -0,0 +1,140 @@
# 29.04.24
import time
import ssl
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, urlparse, parse_qs
from bs4 import BeautifulSoup
from typing import Generator, Optional
# Internal utilities
from Src.Lib.Request import requests
def get_page(url: str) -> str:
"""
Fetches the HTML content of a webpage given its URL.
Args:
url (str): The URL of the webpage.
Returns:
str: The HTML content of the webpage.
"""
response = requests.get(url)
return response.text
def filter_result(link: str) -> Optional[str]:
"""
Filters search result links to remove unwanted ones.
Args:
link (str): The URL of the search result.
Returns:
Optional[str]: The filtered URL if valid, None otherwise.
"""
try:
if link.startswith('/url?'):
# Extract the actual URL from Google's redirect link
o = urlparse(link, 'http')
link = parse_qs(o.query)['q'][0]
o = urlparse(link, 'http')
# Filter out Google links
if o.netloc and 'google' not in o.netloc:
return link
except Exception:
pass
def search(query: str, num: int = 10, stop: Optional[int] = None, pause: float = 2.0) -> Generator[str, None, None]:
"""
Performs a Google search and yields the URLs of search results.
Args:
query (str): The search query.
num (int): Number of results to fetch per request. Default is 10.
stop (int, optional): Total number of results to retrieve. Default is None.
pause (float): Pause duration between requests. Default is 2.0.
Yields:
str: The URL of a search result.
Example:
>>> for url in search("Python tutorials", num=5, stop=10):
... print(url)
...
https://www.python.org/about/gettingstarted/
"""
# Set to store unique URLs
hashes = set()
# Counter for the number of fetched URLs
count = 0
# Encode the query for URL
query = quote_plus(query)
while not stop or count < stop:
last_count = count
# Construct the Google search URL
url = f"https://www.google.com/search?client=opera&q={query}&sourceid=opera&oe=UTF-8"
# Pause before making the request
time.sleep(pause)
# Fetch the HTML content of the search page
html = get_page(url)
soup = BeautifulSoup(html, 'html.parser')
try:
# Find all anchor tags containing search result links
anchors = soup.find(id='search').findAll('a')
except AttributeError:
# Handle cases where search results are not found in the usual div
gbar = soup.find(id='gbar')
if gbar:
gbar.clear()
anchors = soup.findAll('a')
# Iterate over each anchor tag
for a in anchors:
try:
link = a['href']
except KeyError:
continue
# Filter out unwanted links
link = filter_result(link)
if not link:
continue
# Check for duplicate URLs
h = hash(link)
if h in hashes:
continue
hashes.add(h)
# Yield the valid URL
yield link
# Increment the counter
count += 1
# Check if the desired number of URLs is reached
if stop and count >= stop:
return
# Break the loop if no new URLs are found
if last_count == count:
break
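This generator is what the new "light" lookup builds on: it asks for a single result and reduces the returned URL to its trailing TLD. A short sketch of that consumption, mirroring grab_top_level_domain_light in Src/Api/Util/extract_domain.py (the example domain is hypothetical):

from Src.Lib.Google import search

for url in search("streaming community", num=1, stop=1, pause=2):
    # e.g. "https://streamingcommunity.example/" -> "example"
    print(url.split(".", 2)[-1].replace("/", ""))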

View File

@ -0,0 +1,3 @@
# 21.04.24
from .my_requests import requests

View File

@ -9,7 +9,8 @@ import re
import urllib.parse
import urllib.request
import urllib.error
from typing import Dict, Optional, Union
from typing import Dict, Optional, Union, TypedDict, Any
try:
from typing import Unpack
@ -22,9 +23,13 @@ except ImportError:
"Please make sure you have the necessary libraries installed.")
# External library
from bs4 import BeautifulSoup
# Constants
HTTP_TIMEOUT = 4
HTTP_RETRIES = 2
HTTP_TIMEOUT = 3
HTTP_RETRIES = 1
HTTP_DELAY = 1
@ -95,12 +100,43 @@ class Response:
raise RequestError(f"Request failed with status code {self.status_code}")
def json(self):
"""Return the response content as JSON if it is JSON."""
"""
Return the response content as JSON if it is JSON.
Returns:
dict or list or None: A Python dictionary or list parsed from JSON if the response content is JSON, otherwise None.
"""
if self.is_json:
return json.loads(self.text)
else:
return None
def get_redirects(self):
"""
Extracts unique site URLs from HTML <link> elements within the <head> section.
Returns:
list or None: A list of unique site URLs if found, otherwise None.
"""
site_find = []
if self.text:
soup = BeautifulSoup(self.text, "html.parser")
for links in soup.find("head").find_all('link'):
if links is not None:
parsed_url = urllib.parse.urlparse(links.get('href'))
site = parsed_url.scheme + "://" + parsed_url.netloc
if site not in site_find:
site_find.append(site)
if site_find:
return site_find
else:
return None
class ManageRequests:
"""Class for managing HTTP requests."""
@ -116,6 +152,7 @@ class ManageRequests:
auth: Optional[tuple] = None,
proxy: Optional[str] = None,
cookies: Optional[Dict[str, str]] = None,
json_data: Optional[Dict[str, Any]] = None,
redirection_handling: bool = True,
):
"""
@ -136,7 +173,7 @@ class ManageRequests:
"""
self.url = url
self.method = method
self.headers = headers or {'User-Agent': 'Mozilla/5.0'}
self.headers = headers or {}
self.timeout = timeout
self.retries = retries
self.params = params
@ -144,6 +181,7 @@ class ManageRequests:
self.auth = auth
self.proxy = proxy
self.cookies = cookies
self.json_data = json_data
self.redirection_handling = redirection_handling
def add_header(self, key: str, value: str) -> None:
@ -152,6 +190,7 @@ class ManageRequests:
def send(self) -> Response:
"""Send the HTTP request."""
start_time = time.time()
self.attempt = 0
redirect_url = None
@ -160,24 +199,53 @@ class ManageRequests:
try:
req = self._build_request()
response = self._perform_request(req)
return self._process_response(response, start_time, redirect_url)
except (urllib.error.URLError, urllib.error.HTTPError) as e:
self._handle_error(e)
attempt += 1
self.attempt += 1
def _build_request(self) -> urllib.request.Request:
"""Build the urllib Request object."""
headers = self.headers.copy()
if self.params:
url = self.url + '?' + urllib.parse.urlencode(self.params)
else:
url = self.url
req = urllib.request.Request(url, headers=headers, method=self.method)
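# Optional JSON body: set the JSON content type and serialise the payload onto the request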
if self.json_data:
req.add_header('Content-Type', 'application/json')
req.body = json.dumps(self.json_data).encode('utf-8')
else:
req = urllib.request.Request(url, headers=headers, method=self.method)
if self.auth:
req.add_header('Authorization', 'Basic ' + base64.b64encode(f"{self.auth[0]}:{self.auth[1]}".encode()).decode())
if self.cookies:
cookie_str = '; '.join([f"{name}={value}" for name, value in self.cookies.items()])
req.add_header('Cookie', cookie_str)
if self.headers:
for key, value in self.headers.items():
req.add_header(key, value)
# Add a default user agent when the caller has not supplied one
there_is_agent = any(str(key).lower() == 'user-agent' for key in self.headers)
if not there_is_agent:
req.add_header('user-agent', 'Mozilla/5.0')
return req
def _perform_request(self, req: urllib.request.Request) -> urllib.response.addinfourl:
@ -186,26 +254,30 @@ class ManageRequests:
proxy_handler = urllib.request.ProxyHandler({'http': self.proxy, 'https': self.proxy})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)
if not self.verify_ssl:
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
response = urllib.request.urlopen(req, timeout=self.timeout, context=ssl_context)
else:
response = urllib.request.urlopen(req, timeout=self.timeout)
return response
def _process_response(self, response: urllib.response.addinfourl, start_time: float, redirect_url: Optional[str]) -> Response:
"""Process the HTTP response."""
response_data = response.read()
content_type = response.headers.get('Content-Type', '').lower()
is_response_api = "json" in content_type
if self.redirection_handling and response.status in (301, 302, 303, 307, 308):
location = response.headers.get('Location')
logging.info(f"Redirecting to: {location}")
redirect_url = location
self.url = location
return self.send()
return self._build_response(response, response_data, start_time, redirect_url, content_type)
def _build_response(self, response: urllib.response.addinfourl, response_data: bytes, start_time: float, redirect_url: Optional[str], content_type: str) -> Response:
@ -216,7 +288,7 @@ class ManageRequests:
for cookie in response.headers.get_all('Set-Cookie', []):
cookie_parts = cookie.split(';')
cookie_name, cookie_value = cookie_parts[0].split('=')
cookie_name, cookie_value = cookie_parts[0].split('=', 1)
response_cookies[cookie_name.strip()] = cookie_value.strip()
return Response(
@ -234,9 +306,11 @@ class ManageRequests:
def _handle_error(self, e: Union[urllib.error.URLError, urllib.error.HTTPError]) -> None:
"""Handle request error."""
logging.error(f"Request failed for URL '{self.url}': {str(e)}")
if self.attempt < self.retries:
logging.info(f"Retrying request for URL '{self.url}' (attempt {self.attempt}/{self.retries})")
time.sleep(HTTP_DELAY)
else:
logging.error(f"Maximum retries reached for URL '{self.url}'")
raise RequestError(str(e))
@ -251,9 +325,7 @@ class ValidateRequest:
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or IP
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', re.IGNORECASE)
return re.match(url_regex, url) is not None
@staticmethod
@ -287,14 +359,14 @@ class SSLHandler:
ssl_context.verify_mode = ssl.CERT_NONE
class KwargsRequest():
"""Class representing keyword arguments for a request."""
class KwargsRequest(TypedDict, total = False):
url: str
headers: Optional[Dict[str, str]] = None
timeout: float = HTTP_TIMEOUT
retries: int = HTTP_RETRIES
params: Optional[Dict[str, str]] = None
cookies: Optional[Dict[str, str]] = None
json_data: Optional[Dict[str, Any]] = None
class Request:
@ -302,7 +374,7 @@ class Request:
def __init__(self) -> None:
pass
def get(self, url: str, **kwargs: Unpack[KwargsRequest]):
def get(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""
Send a GET request.
@ -315,7 +387,7 @@ class Request:
"""
return self._send_request(url, 'GET', **kwargs)
def post(self, url: str, **kwargs: Unpack[KwargsRequest]):
def post(self, url: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""
Send a POST request.
@ -328,35 +400,8 @@ class Request:
"""
return self._send_request(url, 'POST', **kwargs)
def put(self, url: str, **kwargs: Unpack[KwargsRequest]):
"""
Send a PUT request.
Args:
url (str): The URL to which the request will be sent.
**kwargs: Additional keyword arguments for the request.
Returns:
Response: The response object.
"""
return self._send_request(url, 'PUT', **kwargs)
def delete(self, url: str, **kwargs: Unpack[KwargsRequest]):
"""
Send a DELETE request.
Args:
url (str): The URL to which the request will be sent.
**kwargs: Additional keyword arguments for the request.
Returns:
Response: The response object.
"""
return self._send_request(url, 'DELETE', **kwargs)
def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]):
def _send_request(self, url: str, method: str, **kwargs: Unpack[KwargsRequest]) -> 'Response':
"""Send an HTTP request."""
# Add validation checks for URL and headers
if not ValidateRequest.validate_url(url):
raise ValueError("Invalid URL format")
@ -365,6 +410,5 @@ class Request:
return ManageRequests(url, method, **kwargs).send()
# Out
request = Request()
requests: Request = Request()
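With these changes the shared wrapper accepts a json_data payload and injects a Mozilla/5.0 User-Agent whenever the caller does not set one. A hedged usage sketch (placeholder URLs, not part of the commit):

from Src.Lib.Request import requests

# GET with an explicit timeout; a default User-Agent is added automatically.
response = requests.get("https://example.com", timeout=3)
if response.status_code == 200:
    print(response.text)

# POST a JSON body through the new json_data keyword.
response = requests.post("https://example.com/api", json_data={"query": "test"})
print(response.json())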

View File

@ -0,0 +1,3 @@
# 21.04.24
from .user_agent import ua

View File

@ -7,10 +7,12 @@ import random
import threading
import json
import tempfile
from typing import Dict, List
# Internal utilities
from .my_requests import request
from ..Request import requests
def get_browser_user_agents_online(browser: str) -> List[str]:
@ -28,7 +30,7 @@ def get_browser_user_agents_online(browser: str) -> List[str]:
try:
# Make request and find all user agents
html = request.get(url).text
html = requests.get(url).text
browser_user_agents = re.findall(r"<a href=\'/.*?>(.+?)</a>", html, re.UNICODE)
return [ua for ua in browser_user_agents if "more" not in ua.lower()]
@ -103,4 +105,4 @@ class UserAgentManager:
# Output
ua = UserAgentManager()
ua: UserAgentManager = UserAgentManager()

View File

@ -1,13 +1,13 @@
# 01.03.2023
import os
import requests
import time
# Internal utilities
from .version import __version__
from Src.Util.console import console
from Src.Lib.Request import requests
# Variable

View File

@ -4,7 +4,7 @@ import logging
# Internal utilities
from Src.Lib.Request.user_agent import ua
from Src.Lib.UserAgent import ua
def get_headers() -> str:

Binary file not shown.