diff --git a/src/providers/cloudflare.ts b/src/providers/cloudflare.ts
new file mode 100644
index 0000000..de0e57d
--- /dev/null
+++ b/src/providers/cloudflare.ts
@@ -0,0 +1,173 @@
+import {Response} from 'puppeteer'
+import {Page} from "puppeteer-extra/dist/puppeteer";
+import {TimeoutError} from "puppeteer/Errors";
+
+import log from "../log";
+import getCaptchaSolver, {CaptchaType} from "../captcha";
+
+/**
+ * This module contains the logic to solve protections provided by Cloudflare.
+ */
+
+// Selectors whose presence marks the page as a Cloudflare challenge page
+const CHALLENGE_SELECTORS = ['#trk_jschal_js', '.ray_id', '.attack-box'];
+// Names of the textareas that receive the solved captcha token
+const TOKEN_INPUT_NAMES = ['g-recaptcha-response', 'h-captcha-response'];
+
+/**
+ * Detects Cloudflare protection on an already loaded page and tries to get past it.
+ *
+ * When the response was not served by Cloudflare it is returned untouched. Otherwise the
+ * function keeps reloading the page until the challenge element is gone and, if a captcha
+ * form is present, asks the configured captcha solver for a token and submits the form.
+ *
+ * @param url URL of the page; it is handed to the captcha solver
+ * @param page Page on which the challenge is looked for and solved
+ * @param response Response of the navigation that loaded the page
+ * @returns The response of the last navigation, or `response` itself if none was needed
+ * @throws Error if the request is blocked, no challenge selector is found, the captcha
+ *         cannot be solved or no captcha solver is configured
+ */
+export default async function resolveChallenge(url: string, page: Page, response: Response): Promise<Response> {
+
+  // look for challenge and return fast if not detected
+  // (a missing 'server' header means the response did not come from Cloudflare either)
+  const server = response.headers().server
+  if (!server || !server.startsWith('cloudflare')) {
+    log.info('Cloudflare not detected');
+    return response;
+  }
+  log.info('Cloudflare detected');
+
+  if (await page.$('.cf-error-code')) {
+    throw new Error('Cloudflare has blocked this request (Code 1020 Detected).')
+  }
+
+  if (response.status() > 400) {
+    // detect cloudflare wait 5s
+    let selectorFoundCount = 0
+    for (const selector of CHALLENGE_SELECTORS) {
+      const cfChallengeElem = await page.$(selector)
+      if (cfChallengeElem) {
+        selectorFoundCount++
+        log.debug(`'${selector}' challenge element detected.`)
+        log.debug('Waiting for Cloudflare challenge...')
+
+        // poll until the challenge element is gone; there is no other exit condition here
+        while (true) {
+          await page.waitFor(1000)
+          try {
+            // catch exception timeout in waitForNavigation
+            response = await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 })
+          } catch (error) {
+            // best effort: ignore the error and check the page again below
+          }
+
+          try {
+            // catch Execution context was destroyed
+            const remainingElem = await page.$(selector)
+            if (!remainingElem) { break }
+            log.debug('Found challenge element again...')
+          } catch (error) {
+            // best effort: the page could not be queried, reload it and retry
+          }
+
+          response = await page.reload({ waitUntil: 'domcontentloaded' })
+          log.debug('Reloaded page...')
+          log.html(await page.content())
+        }
+
+        log.debug('Validating HTML code...')
+        break
+      } else {
+        log.debug(`No '${selector}' challenge element detected.`)
+      }
+    }
+    log.debug("Number of selector found: " + selectorFoundCount + ", total selector: " + CHALLENGE_SELECTORS.length)
+    if (selectorFoundCount === 0) {
+      throw new Error('No challenge selectors found, unable to proceed')
+    }
+  }
+
+  // it seems some captcha pages return 200 sometimes
+  const captchaTypeElm = await page.$('input[name="cf_captcha_kind"]')
+  if (captchaTypeElm) {
+    const captchaSolver = getCaptchaSolver()
+    if (captchaSolver) {
+      const captchaStartTimestamp = Date.now()
+      const challengeForm = await page.$('#challenge-form')
+      if (challengeForm) {
+        // reuse the handle found above instead of querying the input a second time
+        const cfCaptchaType: string = await captchaTypeElm.evaluate((e: any) => e.value)
+        const captchaType: CaptchaType = (CaptchaType as any)[cfCaptchaType]
+        if (!captchaType) {
+          throw new Error('Unknown captcha type!');
+        }
+
+        let sitekey = null
+        if (captchaType != 'hCaptcha' && process.env.CAPTCHA_SOLVER != 'hcaptcha-solver') {
+          const sitekeyElem = await page.$('*[data-sitekey]')
+          if (!sitekeyElem) {
+            throw new Error('Could not find sitekey!');
+          }
+          sitekey = await sitekeyElem.evaluate((e) => e.getAttribute('data-sitekey'))
+        }
+
+        log.info('Waiting to receive captcha token to bypass challenge...')
+        const token = await captchaSolver({
+          url,
+          sitekey,
+          type: captchaType
+        })
+
+        if (!token) {
+          throw new Error('Token solver failed to return a token.')
+        }
+
+        for (const name of TOKEN_INPUT_NAMES) {
+          const input = await page.$(`textarea[name="${name}"]`)
+          if (input) { await input.evaluate((e: HTMLTextAreaElement, value) => { e.value = value }, token) }
+        }
+
+        // ignore preset event listeners on the form
+        await page.evaluate(() => {
+          window.addEventListener('submit', (e) => { e.stopPropagation() }, true)
+        })
+
+        // it seems some sites obfuscate their challenge forms
+        // TODO: look into how they do it and come up with a more solid solution
+        try {
+          // this element is added with js and we want to wait for all the js to load before submitting
+          await page.waitForSelector('#challenge-form [type=submit]', { timeout: 5000 })
+        } catch (err) {
+          if (err instanceof TimeoutError) {
+            log.debug(`No '#challenge-form [type=submit]' element detected.`)
+          }
+        }
+
+        // calculates the time it took to solve the captcha
+        const captchaSolveTotalTime = Date.now() - captchaStartTimestamp
+
+        // generates a random wait time (between 10 and 29 seconds)
+        const randomWaitTime = (Math.floor(Math.random() * 20) + 10) * 1000
+
+        // waits, if any, time remaining to appear human but stay as fast as possible
+        const timeLeft = randomWaitTime - captchaSolveTotalTime
+        if (timeLeft > 0) { await page.waitFor(timeLeft) }
+
+        // submit captcha response; the navigation listener is registered together with
+        // the submit so that a fast navigation cannot be missed
+        const [submitResponse] = await Promise.all([
+          page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
+          challengeForm.evaluate((e: HTMLFormElement) => e.submit())
+        ])
+        response = submitResponse
+
+      }
+    } else {
+      throw new Error('Captcha detected but no automatic solver is configured.');
+    }
+  }
+
+  return response;
+}
diff --git a/src/routes.ts b/src/routes.ts
index 8ecec05..8fcca22 100644
--- a/src/routes.ts
+++ b/src/routes.ts
@@ -1,12 +1,12 @@
 import { v1 as UUIDv1 } from 'uuid'
+import { SetCookie, Request, Response, Headers, HttpMethod, Overrides } from 'puppeteer'
+import { Page, Browser } from "puppeteer-extra/dist/puppeteer";
+const Timeout = require('await-timeout');
+
+import log from './log'
 import sessions, { SessionsCacheItem } from './session'
 import { RequestContext } from './types'
-import log from './log'
-import { SetCookie, Request, Headers, HttpMethod, Overrides, Cookie } from 'puppeteer'
-import { TimeoutError } from 'puppeteer/Errors'
-import getCaptchaSolver, { CaptchaType } from './captcha'
-import * as Puppeteer from "puppeteer-extra/dist/puppeteer";
-const Timeout = require('await-timeout');
+import cloudflareProvider from './providers/cloudflare';
 
 export interface BaseAPICall {
   cmd: string
@@ -69,12 +69,10 @@
type OverridesProps = 'postData' | 'headers' -// We always set a Windows User-Agent because ARM builds are detected by CloudFlare +// We always set a Windows User-Agent because ARM builds are detected by Cloudflare const DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36" -const CHALLENGE_SELECTORS = ['#trk_jschal_js', '.ray_id', '.attack-box'] -const TOKEN_INPUT_NAMES = ['g-recaptcha-response', 'h-captcha-response'] -async function resolveChallengeWithTimeout(ctx: RequestContext, params: BaseRequestAPICall, page: Puppeteer.Page) { +async function resolveChallengeWithTimeout(ctx: RequestContext, params: BaseRequestAPICall, page: Page) { const maxTimeout = params.maxTimeout || 60000 const timer = new Timeout(); try { @@ -88,7 +86,7 @@ async function resolveChallengeWithTimeout(ctx: RequestContext, params: BaseRequ } } -async function resolveChallenge(ctx: RequestContext, { url, proxy, download, returnOnlyCookies }: BaseRequestAPICall, page: Puppeteer.Page): Promise { +async function resolveChallenge(ctx: RequestContext, { url, proxy, download, returnOnlyCookies }: BaseRequestAPICall, page: Page): Promise { let status = 'ok' let message = '' @@ -100,137 +98,15 @@ async function resolveChallenge(ctx: RequestContext, { url, proxy, download, ret } log.debug(`Navigating to... 
${url}`) - let response = await page.goto(url, { waitUntil: 'domcontentloaded' }) - + let response: Response = await page.goto(url, { waitUntil: 'domcontentloaded' }) log.html(await page.content()) - // look for challenge - if (response.headers().server.startsWith('cloudflare')) { - log.info('Cloudflare detected') - - if (await page.$('.cf-error-code')) { - await page.close() - return ctx.errorResponse('Cloudflare has blocked this request (Code 1020 Detected).') - } - - if (response.status() > 400) { - // detect cloudflare wait 5s - let selectorFoundCount = 0 - for (const selector of CHALLENGE_SELECTORS) { - const cfChallengeElem = await page.$(selector) - if (cfChallengeElem) { - selectorFoundCount++ - log.debug(`'${selector}' challenge element detected.`) - log.debug('Waiting for Cloudflare challenge...') - - while (true) { - await page.waitFor(1000) - try { - // catch exception timeout in waitForNavigation - response = await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 5000 }) - } catch (error) { } - - try { - // catch Execution context was destroyed - const cfChallengeElem = await page.$(selector) - if (!cfChallengeElem) { break } - log.debug('Found challenge element again...') - } catch (error) - { } - - response = await page.reload({ waitUntil: 'domcontentloaded' }) - log.debug('Reloaded page...') - log.html(await page.content()) - } - - log.debug('Validating HTML code...') - break - } else { - log.debug(`No '${selector}' challenge element detected.`) - } - } - log.debug("Number of selector found: " + selectorFoundCount + ", total selector: " + CHALLENGE_SELECTORS.length) - if (selectorFoundCount == 0) - { - await page.close() - return ctx.errorResponse('No challenge selectors found, unable to proceed') - } - } - - // it seems some captcha pages return 200 sometimes - if (await page.$('input[name="cf_captcha_kind"]')) { - const captchaSolver = getCaptchaSolver() - if (captchaSolver) { - const captchaStartTimestamp = Date.now() - const 
challengeForm = await page.$('#challenge-form') - if (challengeForm) { - const captchaTypeElm = await page.$('input[name="cf_captcha_kind"]') - const cfCaptchaType: string = await captchaTypeElm.evaluate((e: any) => e.value) - const captchaType: CaptchaType = (CaptchaType as any)[cfCaptchaType] - if (!captchaType) { return ctx.errorResponse('Unknown captcha type!') } - - let sitekey = null - if (captchaType != 'hCaptcha' && process.env.CAPTCHA_SOLVER != 'hcaptcha-solver') { - const sitekeyElem = await page.$('*[data-sitekey]') - if (!sitekeyElem) { return ctx.errorResponse('Could not find sitekey!') } - sitekey = await sitekeyElem.evaluate((e) => e.getAttribute('data-sitekey')) - } - - log.info('Waiting to receive captcha token to bypass challenge...') - const token = await captchaSolver({ - url, - sitekey, - type: captchaType - }) - - if (!token) { - await page.close() - return ctx.errorResponse('Token solver failed to return a token.') - } - - for (const name of TOKEN_INPUT_NAMES) { - const input = await page.$(`textarea[name="${name}"]`) - if (input) { await input.evaluate((e: HTMLTextAreaElement, token) => { e.value = token }, token) } - } - - // ignore preset event listeners on the form - await page.evaluate(() => { - window.addEventListener('submit', (e) => { event.stopPropagation() }, true) - }) - - // it seems some sites obfuscate their challenge forms - // TODO: look into how they do it and come up with a more solid solution - try { - // this element is added with js and we want to wait for all the js to load before submitting - await page.waitForSelector('#challenge-form [type=submit]', { timeout: 5000 }) - } catch (err) { - if (err instanceof TimeoutError) { - log.debug(`No '#challenge-form [type=submit]' element detected.`) - } - } - - // calculates the time it took to solve the captcha - const captchaSolveTotalTime = Date.now() - captchaStartTimestamp - - // generates a random wait time - const randomWaitTime = (Math.floor(Math.random() * 20) + 10) * 
1000 - - // waits, if any, time remaining to appear human but stay as fast as possible - const timeLeft = randomWaitTime - captchaSolveTotalTime - if (timeLeft > 0) { await page.waitFor(timeLeft) } - - // submit captcha response - challengeForm.evaluate((e: HTMLFormElement) => e.submit()) - response = await page.waitForNavigation({ waitUntil: 'domcontentloaded' }) - - } - } else { - status = 'warning' - message = 'Captcha detected but no automatic solver is configured.' - } - } - - log.debug("Response is: " + response.status()) + // Detect protection services and solve challenges + try { + response = await cloudflareProvider(url, page, response); + } catch (e) { + status = "error"; + message = "Cloudflare " + e.toString(); } const payload: ChallengeResolutionT = { @@ -278,7 +154,7 @@ function mergeSessionWithParams({ defaults }: SessionsCacheItem, params: BaseReq return copy } -async function setupPage(ctx: RequestContext, params: BaseRequestAPICall, browser: Puppeteer.Browser): Promise { +async function setupPage(ctx: RequestContext, params: BaseRequestAPICall, browser: Browser): Promise { const page = await browser.newPage() // merge session defaults with params