diff --git a/Dockerfile b/Dockerfile index 3d8ea35..32a80ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11-slim-bullseye as builder +FROM debian:trixie-slim AS builder # Build dummy packages to skip installing them and their dependencies RUN apt-get update \ @@ -10,41 +10,38 @@ RUN apt-get update \ && equivs-control adwaita-icon-theme \ && printf 'Section: misc\nPriority: optional\nStandards-Version: 3.9.2\nPackage: adwaita-icon-theme\nVersion: 99.0.0\nDescription: Dummy package for adwaita-icon-theme\n' >> adwaita-icon-theme \ && equivs-build adwaita-icon-theme \ - && mv adwaita-icon-theme_*.deb /adwaita-icon-theme.deb + && mv adwaita-icon-theme_*.deb /adwaita-icon-theme.deb \ + && apt-get purge -y --auto-remove equivs \ + && rm -rf /var/lib/apt/lists/* -FROM python:3.11-slim-bullseye +FROM debian:trixie-slim # Copy dummy packages -COPY --from=builder /*.deb / +COPY --from=builder /libgl1-mesa-dri.deb /adwaita-icon-theme.deb / # Install dependencies and create flaresolverr user -# You can test Chromium running this command inside the container: -# xvfb-run -s "-screen 0 1600x1200x24" chromium --no-sandbox -# The error traces is like this: "*** stack smashing detected ***: terminated" -# To check the package versions available you can use this command: -# apt-cache madison chromium WORKDIR /app +COPY requirements.txt . 
+RUN apt-get update \ # Install dummy packages -RUN dpkg -i /libgl1-mesa-dri.deb \ + && dpkg -i /libgl1-mesa-dri.deb \ && dpkg -i /adwaita-icon-theme.deb \ + && apt-get install -f -y --no-install-recommends \ # Install dependencies - && apt-get update \ - && apt-get install -y --no-install-recommends chromium chromium-common chromium-driver xvfb dumb-init \ - procps curl vim xauth \ + && apt-get install -y --no-install-recommends chromium xvfb dumb-init \ + procps curl vim xauth python3 python3-pip \ # Remove temporary files and hardware decoding libraries && rm -rf /var/lib/apt/lists/* \ && rm -f /usr/lib/x86_64-linux-gnu/libmfxhw* \ && rm -f /usr/lib/x86_64-linux-gnu/mfx/* \ # Create flaresolverr user && useradd --home-dir /app --shell /bin/sh flaresolverr \ - && mv /usr/bin/chromedriver chromedriver \ - && chown -R flaresolverr:flaresolverr . - -# Install Python dependencies -COPY requirements.txt . -RUN pip install -r requirements.txt \ + && chown -R flaresolverr:flaresolverr . \ + # Set up Python and install dependencies + && ln -s /usr/bin/python3 /usr/local/bin/python \ + && pip install --no-cache-dir --break-system-packages -r requirements.txt \ # Remove temporary files - && rm -rf /root/.cache + && rm -rf /root/.cache /tmp/* USER flaresolverr diff --git a/README.md b/README.md index 8f97158..f6d2e57 100644 --- a/README.md +++ b/README.md @@ -185,9 +185,10 @@ session. When you no longer need to use a session you should make sure to close | session | Optional. Will send the request from and existing browser instance. If one is not sent it will create a temporary instance that will be destroyed immediately after the request is completed. | | session_ttl_minutes | Optional. FlareSolverr will automatically rotate expired sessions based on the TTL provided in minutes. | | maxTimeout | Optional, default value 60000. Max timeout to solve the challenge in milliseconds. | +| userAgent | Optional. Used for the current request and does not affect subsequent ones. | | cookies | Optional. 
Will be used by the headless browser. Eg: `"cookies": [{"name": "cookie1", "value": "value1"}, {"name": "cookie2", "value": "value2"}]`. | | returnOnlyCookies | Optional, default false. Only returns the cookies. Response data, headers and other parts of the response are removed. | -| proxy | Optional, default disabled. Eg: `"proxy": {"url": "http://127.0.0.1:8888"}`. You must include the proxy schema in the URL: `http://`, `socks4://` or `socks5://`. Authorization (username/password) is not supported. (When the `session` parameter is set, the proxy is ignored; a session specific proxy can be set in `sessions.create`.) | +| proxy | Optional, default disabled. E.g. `"proxy": {"url": "http://127.0.0.1:8888"}`. You must include the proxy scheme in the URL: `http://`, `socks4://` or `socks5://`. Authorization (username/password) is supported. (When the `session` parameter is set, the proxy is ignored; a session-specific proxy can be set in `sessions.create`.) | > **Warning** > If you want to use Cloudflare clearance cookie in your scripts, make sure you use the FlareSolverr User-Agent too. If they don't match you will see the challenge. 
diff --git a/requirements.txt b/requirements.txt index d633aeb..1297d50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ bottle==0.12.25 waitress==2.1.2 -selenium==4.15.2 +DrissionPage==4.1.0.0b19 func-timeout==4.3.5 prometheus-client==0.17.1 # required by undetected_chromedriver diff --git a/src/dtos.py b/src/dtos.py index 1e9aace..de5bb20 100644 --- a/src/dtos.py +++ b/src/dtos.py @@ -35,7 +35,7 @@ class V1RequestBase(object): session: str = None session_ttl_minutes: int = None headers: list = None # deprecated v2.0.0, not used - userAgent: str = None # deprecated v2.0.0, not used + userAgent: str = None # V1Request url: str = None diff --git a/src/flaresolverr.py b/src/flaresolverr.py index 3596fe1..cbf29f7 100644 --- a/src/flaresolverr.py +++ b/src/flaresolverr.py @@ -114,6 +114,9 @@ if __name__ == "__main__": prometheus_plugin.setup() app.install(prometheus_plugin.prometheus_plugin) + webdriver_data = utils.get_webdriver_data_path() + utils.remove_all_subfolders(webdriver_data) + # start webserver # default server 'wsgiref' does not support concurrent requests # https://github.com/FlareSolverr/FlareSolverr/issues/680 diff --git a/src/flaresolverr_service.py b/src/flaresolverr_service.py index cfc2088..72b4b01 100644 --- a/src/flaresolverr_service.py +++ b/src/flaresolverr_service.py @@ -7,13 +7,8 @@ from html import escape from urllib.parse import unquote, quote from func_timeout import FunctionTimedOut, func_timeout -from selenium.common import TimeoutException -from selenium.webdriver.chrome.webdriver import WebDriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.expected_conditions import ( - presence_of_element_located, staleness_of, title_is) -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support.wait import WebDriverWait +from DrissionPage import ChromiumPage +from DrissionPage._units.listener import DataPacket import utils from dtos import (STATUS_ERROR, 
STATUS_OK, ChallengeResolutionResultT, @@ -116,8 +111,6 @@ def _controller_v1_handler(req: V1RequestBase) -> V1ResponseBase: raise Exception("Request parameter 'cmd' is mandatory.") if req.headers is not None: logging.warning("Request parameter 'headers' was removed in FlareSolverr v2.") - if req.userAgent is not None: - logging.warning("Request parameter 'userAgent' was removed in FlareSolverr v2.") # set default values if req.maxTimeout is None or int(req.maxTimeout) < 1: @@ -223,6 +216,7 @@ def _cmd_sessions_destroy(req: V1RequestBase) -> V1ResponseBase: def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT: timeout = int(req.maxTimeout) / 1000 driver = None + user_data_path = None try: if req.session: session_id = req.session @@ -237,7 +231,8 @@ def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT: driver = session.driver else: - driver = utils.get_webdriver(req.proxy) + user_data_path = utils.get_user_data_path() + driver = utils.get_webdriver(req.proxy, user_data_path) logging.debug('New instance of webdriver has been created to perform the request') return func_timeout(timeout, _evil_logic, (req, driver, method)) except FunctionTimedOut: @@ -250,179 +245,142 @@ def _resolve_challenge(req: V1RequestBase, method: str) -> ChallengeResolutionT: driver.close() driver.quit() logging.debug('A used instance of webdriver has been destroyed') + if user_data_path: + utils.remove_user_data(user_data_path) - -def click_verify(driver: WebDriver): +def click_verify(driver: ChromiumPage) -> DataPacket: try: - logging.debug("Try to find the Cloudflare verify checkbox...") - iframe = driver.find_element(By.XPATH, "//iframe[starts-with(@id, 'cf-chl-widget-')]") - driver.switch_to.frame(iframe) - checkbox = driver.find_element( - by=By.XPATH, - value='//*[@id="content"]/div/div/label/input', + bde = ( + driver + .ele("@Style=border: 0px; margin: 0px; padding: 0px;", timeout=10) + .shadow_root + .ele("tag:iframe", timeout=10) 
+ .ele('tag:body', timeout=10) + .shadow_root ) - if checkbox: - actions = ActionChains(driver) - actions.move_to_element_with_offset(checkbox, 5, 7) - actions.click(checkbox) - actions.perform() - logging.debug("Cloudflare verify checkbox found and clicked!") - except Exception: - logging.debug("Cloudflare verify checkbox not found on the page.") - finally: - driver.switch_to.default_content() + ve = bde.ele("text:Verify you are human", timeout=10) - try: - logging.debug("Try to find the Cloudflare 'Verify you are human' button...") - button = driver.find_element( - by=By.XPATH, - value="//input[@type='button' and @value='Verify you are human']", - ) - if button: - actions = ActionChains(driver) - actions.move_to_element_with_offset(button, 5, 7) - actions.click(button) - actions.perform() - logging.debug("The Cloudflare 'Verify you are human' button found and clicked!") - except Exception: - logging.debug("The Cloudflare 'Verify you are human' button not found on the page.") + driver.listen.resume() + ve.click() + data = driver.listen.wait(count=1,timeout=5) - time.sleep(2) + if isinstance(data, DataPacket): + return data + + return None + + except Exception as e: + logging.debug("Cloudflare verify checkbox not found on the page. %s", repr(e)) -def get_correct_window(driver: WebDriver) -> WebDriver: - if len(driver.window_handles) > 1: - for window_handle in driver.window_handles: - driver.switch_to.window(window_handle) - current_url = driver.current_url - if not current_url.startswith("devtools://devtools"): - return driver - return driver +def search_challenge(driver: ChromiumPage) -> bool: + page_title = driver.title.lower() + + # find challenge by title + for title in CHALLENGE_TITLES: + if title.lower() == page_title: + logging.debug("Challenge detected. 
Title found: %s", page_title) + return True + # find challenge by selectors + if driver.wait.eles_loaded(locators=CHALLENGE_SELECTORS, timeout=SHORT_TIMEOUT, any_one=True): + logging.debug("Challenge detected. One of selectors found") + return True + return False -def access_page(driver: WebDriver, url: str) -> None: - driver.get(url) - driver.start_session() - driver.start_session() # required to bypass Cloudflare - - -def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> ChallengeResolutionT: +def _evil_logic(req: V1RequestBase, driver: ChromiumPage, method: str) -> ChallengeResolutionT: res = ChallengeResolutionT({}) res.status = STATUS_OK res.message = "" + old_user_agent = utils.get_user_agent(driver) + if req.userAgent is not None and req.userAgent != "": + driver.set.user_agent(ua=req.userAgent) # navigate to the page - logging.debug(f'Navigating to... {req.url}') + logging.debug('Navigating to... %s', req.url) + driver.listen.start(req.url) if method == 'POST': _post_request(req, driver) else: - access_page(driver, req.url) - driver = get_correct_window(driver) + driver.get(req.url) + data = driver.listen.wait(count=1,timeout=5) + driver.listen.pause() # set cookies if required if req.cookies is not None and len(req.cookies) > 0: - logging.debug(f'Setting cookies...') + logging.debug('Setting cookies...') for cookie in req.cookies: - driver.delete_cookie(cookie['name']) - driver.add_cookie(cookie) + driver.set.cookies.remove(cookie['name']) + driver.set.cookies(cookie) # reload the page + driver.listen.resume() if method == 'POST': _post_request(req, driver) else: - access_page(driver, req.url) - driver = get_correct_window(driver) + driver.get(req.url) + data = driver.listen.wait(count=1, timeout=5) + driver.listen.pause() # wait for the page if utils.get_config_log_html(): - logging.debug(f"Response HTML:\n{driver.page_source}") - html_element = driver.find_element(By.TAG_NAME, "html") - page_title = driver.title + logging.debug("Response 
HTML:\n%s", driver.html) + page_title = driver.title # find access denied titles for title in ACCESS_DENIED_TITLES: if title == page_title: raise Exception('Cloudflare has blocked this request. ' 'Probably your IP is banned for this site, check in your web browser.') # find access denied selectors - for selector in ACCESS_DENIED_SELECTORS: - found_elements = driver.find_elements(By.CSS_SELECTOR, selector) - if len(found_elements) > 0: - raise Exception('Cloudflare has blocked this request. ' - 'Probably your IP is banned for this site, check in your web browser.') - - # find challenge by title - challenge_found = False - for title in CHALLENGE_TITLES: - if title.lower() == page_title.lower(): - challenge_found = True - logging.info("Challenge detected. Title found: " + page_title) - break - if not challenge_found: - # find challenge by selectors - for selector in CHALLENGE_SELECTORS: - found_elements = driver.find_elements(By.CSS_SELECTOR, selector) - if len(found_elements) > 0: - challenge_found = True - logging.info("Challenge detected. Selector found: " + selector) - break + if driver.wait.eles_loaded(locators=ACCESS_DENIED_SELECTORS, timeout=SHORT_TIMEOUT, any_one=True): + raise Exception('Cloudflare has blocked this request. 
' + 'Probably your IP is banned for this site, check in your web browser.') attempt = 0 - if challenge_found: - while True: - try: - attempt = attempt + 1 - # wait until the title changes - for title in CHALLENGE_TITLES: - logging.debug("Waiting for title (attempt " + str(attempt) + "): " + title) - WebDriverWait(driver, SHORT_TIMEOUT).until_not(title_is(title)) + challenge_found = True + while challenge_found: + try: + attempt += 1 - # then wait until all the selectors disappear - for selector in CHALLENGE_SELECTORS: - logging.debug("Waiting for selector (attempt " + str(attempt) + "): " + selector) - WebDriverWait(driver, SHORT_TIMEOUT).until_not( - presence_of_element_located((By.CSS_SELECTOR, selector))) + if search_challenge(driver): + if attempt == 1: + logging.info("Challenge detected.") - # all elements not found + data = click_verify(driver) + else: + if attempt == 1: + logging.info("Challenge not detected!") + res.message = "Challenge not detected!" + else: + logging.info("Challenge solved!") + res.message = "Challenge solved!" break - except TimeoutException: - logging.debug("Timeout waiting for selector") + except Exception as e: + logging.debug("Cloudflare check exception") + raise e - click_verify(driver) - - # update the html (cloudflare reloads the page every 5 s) - html_element = driver.find_element(By.TAG_NAME, "html") - - # waits until cloudflare redirection ends - logging.debug("Waiting for redirect") - # noinspection PyBroadException - try: - WebDriverWait(driver, SHORT_TIMEOUT).until(staleness_of(html_element)) - except Exception: - logging.debug("Timeout waiting for redirect") - - logging.info("Challenge solved!") - res.message = "Challenge solved!" - else: - logging.info("Challenge not detected!") - res.message = "Challenge not detected!" 
challenge_res = ChallengeResolutionResultT({}) - challenge_res.url = driver.current_url - challenge_res.status = 200 # todo: fix, selenium not provides this info - challenge_res.cookies = driver.get_cookies() - challenge_res.userAgent = utils.get_user_agent(driver) + challenge_res.url = driver.url + if isinstance(data, DataPacket) and data.response is not None: # Fixed #1162; data may be a non-packet value when the listener timed out + challenge_res.status = data.response.status + if not req.returnOnlyCookies: + challenge_res.response = data.response.body + challenge_res.headers = dict(data.response.headers) - if not req.returnOnlyCookies: - challenge_res.headers = {} # todo: fix, selenium not provides this info - challenge_res.response = driver.page_source + challenge_res.cookies = driver.cookies(all_info=True) + challenge_res.userAgent = req.userAgent or utils.get_user_agent(driver) + + if old_user_agent: + driver.set.user_agent(ua=old_user_agent) res.result = challenge_res return res -def _post_request(req: V1RequestBase, driver: WebDriver): +def _post_request(req: V1RequestBase, driver: ChromiumPage): post_form = f'