From a5b3e08e1f8b4ed1858efbd3ba907c6a662b7c8b Mon Sep 17 00:00:00 2001 From: ngosang Date: Sun, 17 Oct 2021 20:43:36 +0200 Subject: [PATCH] Code clean up, remove returnRawHtml, download, headers params --- README.md | 18 +-- src/controllers/v1.ts | 36 ++++-- src/services/sessions.ts | 22 ++-- src/services/solver.ts | 239 ++++++++++++++------------------------- src/services/utils.ts | 10 -- src/tests/app.test.ts | 56 +++++++++ 6 files changed, 173 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index e9c6d17..3bc4bfa 100644 --- a/README.md +++ b/README.md @@ -86,10 +86,7 @@ curl -L -X POST 'http://localhost:8191/v1' \ --data-raw '{ "cmd": "request.get", "url":"http://www.google.com/", - "maxTimeout": 60000, - "headers": { - "X-Test": "Testing 123..." - } + "maxTimeout": 60000 }' ``` @@ -140,11 +137,9 @@ Parameter | Notes |--|--| url | Mandatory session | Optional. Will send the request from and existing browser instance. If one is not sent it will create a temporary instance that will be destroyed immediately after the request is completed. -headers | Optional. To specify user headers. maxTimeout | Optional, default value 60000. Max timeout to solve the challenge in milliseconds. cookies | Optional. Will be used by the headless browser. Follow [this](https://github.com/puppeteer/puppeteer/blob/v3.3.0/docs/api.md#pagesetcookiecookies) format. returnOnlyCookies | Optional, default false. Only returns the cookies. Response data, headers and other parts of the response are removed. -returnRawHtml | Optional, default false. The response data will be returned without JS processing. This is useful for JSON or plain text content. Example response from running the `curl` above: @@ -211,16 +206,7 @@ This is the same as `request.get` but it takes one more param: Parameter | Notes |--|--| -postData | Must be a string. 
If you want to POST a form, don't forget to set the `Content-Type` header to `application/x-www-form-urlencoded` or the server might not understand your request. - -### Download small files - -If you need to access an image/pdf or small file, you should pass the `download` parameter to `request.get` setting it -to `true`. Rather than access the html and return text it will return the buffer **base64** encoded which you will be -able to decode and save the image/pdf. - -This method isn't recommended for videos or anything larger. As that should be streamed back to the client and at the -moment there is nothing setup to do so. If this is something you need feel free to create an issue and/or submit a PR. +postData | Must be a string. To POST a form, encode the body in `application/x-www-form-urlencoded` format (e.g. `a=b&c=d`). ## Environment variables diff --git a/src/controllers/v1.ts b/src/controllers/v1.ts index 3eca33e..c733fd2 100644 --- a/src/controllers/v1.ts +++ b/src/controllers/v1.ts @@ -15,11 +15,11 @@ interface V1Routes { export interface V1RequestBase { cmd: string cookies?: SetCookie[], - headers?: Headers maxTimeout?: number proxy?: any// TODO: use interface not any session: string - userAgent?: string // deprecated, not used + headers?: Headers // deprecated v2, not used + userAgent?: string // deprecated v2, not used } interface V1RequestSession extends V1RequestBase { @@ -29,9 +29,9 @@ export interface V1Request extends V1RequestBase { url: string method?: HttpMethod postData?: string - download?: boolean returnOnlyCookies?: boolean - returnRawHtml?: boolean + download?: boolean // deprecated v2, not used + returnRawHtml?: boolean // deprecated v2, not used } export interface V1ResponseBase { @@ -59,7 +59,6 @@ export const routes: V1Routes = { const options: SessionCreateOptions = { oneTimeSession: false, cookies: params.cookies, - headers: params.headers, maxTimeout: params.maxTimeout, proxy: params.proxy } @@ -87,12 +86,15 @@ export const routes: V1Routes = { }, 
'request.get': async (params: V1Request, response: V1ResponseSolution): Promise => { params.method = 'GET' - if (params.userAgent) { - log.warn('Request parameter "userAgent" was removed in FlareSolverr v2.') - } if (params.postData) { throw Error('Cannot use "postBody" when sending a GET request.') } + if (params.returnRawHtml) { + log.warn("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.") + } + if (params.download) { + log.warn("Request parameter 'download' was removed in FlareSolverr v2.") + } const result: ChallengeResolutionT = await browserRequest(params) response.status = result.status; @@ -104,12 +106,15 @@ export const routes: V1Routes = { }, 'request.post': async (params: V1Request, response: V1ResponseSolution): Promise => { params.method = 'POST' - if (params.userAgent) { - log.warn('Request parameter "userAgent" was removed in FlareSolverr v2.') - } if (!params.postData) { throw Error('Must send param "postBody" when sending a POST request.') } + if (params.returnRawHtml) { + log.warn("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.") + } + if (params.download) { + log.warn("Request parameter 'download' was removed in FlareSolverr v2.") + } const result: ChallengeResolutionT = await browserRequest(params) response.status = result.status; @@ -133,8 +138,15 @@ export async function controllerV1(req: Request, res: Response): Promise { try { const params: V1RequestBase = req.body if (!params.cmd) { - throw Error("Parameter 'cmd' is mandatory.") + throw Error("Request parameter 'cmd' is mandatory.") } + if (params.headers) { + log.warn("Request parameter 'headers' was removed in FlareSolverr v2.") + } + if (params.userAgent) { + log.warn("Request parameter 'userAgent' was removed in FlareSolverr v2.") + } + const route = routes[params.cmd] if (route) { await route(params, response) diff --git a/src/services/sessions.ts b/src/services/sessions.ts index 9a35921..60291c2 100644 --- a/src/services/sessions.ts +++ 
b/src/services/sessions.ts @@ -2,22 +2,17 @@ import {v1 as UUIDv1} from 'uuid' import * as os from 'os' import * as path from 'path' import * as fs from 'fs' -import {LaunchOptions, Headers, SetCookie, Browser} from 'puppeteer' +import {LaunchOptions, SetCookie, Browser} from 'puppeteer' import log from './log' -import {deleteFolderRecursive, sleep, removeEmptyFields} from './utils' +import {deleteFolderRecursive, sleep} from './utils' const puppeteer = require('puppeteer'); -interface SessionPageDefaults { - headers?: Headers -} - export interface SessionsCacheItem { sessionId: string browser: Browser userDataDir?: string - defaults: SessionPageDefaults } interface SessionsCache { @@ -27,7 +22,6 @@ interface SessionsCache { export interface SessionCreateOptions { oneTimeSession: boolean cookies?: SetCookie[], - headers?: Headers maxTimeout?: number proxy?: any// TODO: use interface not any } @@ -79,6 +73,8 @@ export async function testWebBrowserInstallation(): Promise { export async function create(session: string, options: SessionCreateOptions): Promise { const sessionId = session || UUIDv1() + // todo: cookies can't be set in the session, you need to open the page first + // todo: these args are only supported in chrome let args = [ '--no-sandbox', @@ -126,18 +122,14 @@ export async function create(session: string, options: SessionCreateOptions): Pr } } - if (!browser) { throw Error(`Failed to launch browser 3 times in a row.`) } - - if (options.cookies) { - const page = await browser.newPage() - await page.setCookie(...options.cookies) + if (!browser) { + throw Error(`Failed to launch browser 3 times in a row.`) } sessionCache[sessionId] = { sessionId: sessionId, browser: browser, - userDataDir: puppeteerOptions.userDataDir, - defaults: removeEmptyFields(options) // todo: review + userDataDir: puppeteerOptions.userDataDir } return sessionCache[sessionId] diff --git a/src/services/solver.ts b/src/services/solver.ts index d9e5cca..c4cb688 100644 --- 
a/src/services/solver.ts +++ b/src/services/solver.ts @@ -1,8 +1,8 @@ -import {Response, Headers, Page, Browser} from 'puppeteer' +import {Response, Headers, Page} from 'puppeteer' const Timeout = require('await-timeout'); import log from './log' -import {SessionsCacheItem} from "./sessions"; +import {SessionCreateOptions, SessionsCacheItem} from "./sessions"; import {V1Request} from "../controllers/v1"; import cloudflareProvider from '../providers/cloudflare'; @@ -23,22 +23,11 @@ export interface ChallengeResolutionT { result: ChallengeResolutionResultT } -// interface OverrideResolvers { -// method?: (request: Request) => HttpMethod, -// postData?: (request: Request) => string, -// headers?: (request: Request) => Headers -// } -// -// type OverridesProps = -// 'method' | -// 'postData' | -// 'headers' - -async function resolveChallengeWithTimeout(params: V1Request, page: Page) { +async function resolveChallengeWithTimeout(params: V1Request, session: SessionsCacheItem) { const maxTimeout = params.maxTimeout || 60000 const timer = new Timeout(); try { - const promise = resolveChallenge(params, page); + const promise = resolveChallenge(params, session); return await Promise.race([ promise, timer.set(maxTimeout, `Maximum timeout reached. maxTimeout=${maxTimeout} (ms)`) @@ -48,167 +37,107 @@ async function resolveChallengeWithTimeout(params: V1Request, page: Page) { } } -async function resolveChallenge({ url, proxy, download, returnOnlyCookies, returnRawHtml }: V1Request, - page: Page): Promise { - - let status = 'ok' - let message = '' - - if (proxy) { - log.debug("Apply proxy"); - if (proxy.username) - await page.authenticate({ username: proxy.username, password: proxy.password }); - } - - log.debug(`Navigating to... 
${url}`) - let response: Response = await page.goto(url, { waitUntil: 'domcontentloaded' }) - log.html(await page.content()) - - // Detect protection services and solve challenges +async function resolveChallenge(params: V1Request, session: SessionsCacheItem): Promise { try { - response = await cloudflareProvider(url, page, response); - } catch (e) { - status = "error"; - message = "Cloudflare " + e.toString(); - } + let status = 'ok' + let message = '' - const payload: ChallengeResolutionT = { - status, - message, - result: { - url: page.url(), - status: response.status(), - headers: response.headers(), - response: null, - cookies: await page.cookies(), - userAgent: await page.evaluate(() => navigator.userAgent) + const page: Page = await session.browser.newPage() + + // the user-agent is changed just for linux arm build + await page.setUserAgent(sessions.getUserAgent()) + + // todo: review + if (params.proxy) { + log.debug("Apply proxy"); + if (params.proxy.username) { + await page.authenticate({ + username: params.proxy.username, + password: params.proxy.password + }); + } } - } - if (returnOnlyCookies) { - payload.result.headers = null; - payload.result.userAgent = null; - } else { - if (download) { - // for some reason we get an error unless we reload the page - // has something to do with a stale buffer and this is the quickest - // fix since I am short on time - response = await page.goto(url, { waitUntil: 'domcontentloaded' }) - payload.result.response = (await response.buffer()).toString('base64') + log.debug(`Navigating to... 
${params.url}`) + let response: Response = await page.goto(params.url, { waitUntil: 'domcontentloaded' }) - // todo: review this functionality - // } else if (returnRawHtml) { - // payload.result.response = await response.text() + // set cookies + if (params.cookies) { + for (const cookie of params.cookies) { + // the other fields in the cookie can cause issues + await page.setCookie({ + "name": cookie.name, + "value": cookie.value + }) + } + // reload the page + response = await page.goto(params.url, { waitUntil: 'domcontentloaded' }) + } + + // log html in debug mode + log.html(await page.content()) + + // Detect protection services and solve challenges + try { + response = await cloudflareProvider(params.url, page, response); + } catch (e) { + status = "error"; + message = "Cloudflare " + e.toString(); + } + + const payload: ChallengeResolutionT = { + status, + message, + result: { + url: page.url(), + status: response.status(), + headers: response.headers(), + response: null, + cookies: await page.cookies(), + userAgent: sessions.getUserAgent() + } + } + + if (params.returnOnlyCookies) { + payload.result.headers = null; + payload.result.userAgent = null; } else { payload.result.response = await page.content() } + + // make sure the page is closed because if it isn't and error will be thrown + // when a user uses a temporary session, the browser make be quit before + // the page is properly closed. + await page.close() + + return payload + } catch (e) { + log.error("Unexpected error: " + e); + throw e; } - - // Add final url in result - payload.result.url = page.url(); - - // make sure the page is closed because if it isn't and error will be thrown - // when a user uses a temporary session, the browser make be quit before - // the page is properly closed. 
- await page.close() - - return payload -} - -function mergeSessionWithParams({ defaults }: SessionsCacheItem, params: V1Request): V1Request { - const copy = { ...defaults, ...params } - - // custom merging logic - copy.headers = { ...defaults.headers || {}, ...params.headers || {} } || null - - return copy -} - -async function setupPage(params: V1Request, browser: Browser): Promise { - const page = await browser.newPage() - - // merge session defaults with params - const { method, postData, headers, cookies } = params - - // the user-agent is changed just for linux arm build - await page.setUserAgent(sessions.getUserAgent()) - - // todo: redo all functionality - - // let overrideResolvers: OverrideResolvers = {} - // - // if (method !== 'GET') { - // log.debug(`Setting method to ${method}`) - // overrideResolvers.method = request => method - // } - // - // if (postData) { - // log.debug(`Setting body data to ${postData}`) - // overrideResolvers.postData = request => postData - // } - // - // if (headers) { - // log.debug(`Adding custom headers: ${JSON.stringify(headers)}`) - // overrideResolvers.headers = request => Object.assign(request.headers(), headers) - // } - // - // if (cookies) { - // log.debug(`Setting custom cookies: ${JSON.stringify(cookies)}`) - // await page.setCookie(...cookies) - // } - // - // // if any keys have been set on the object - // if (Object.keys(overrideResolvers).length > 0) { - // let callbackRunOnce = false - // const callback = (request: Request) => { - // - // // avoid loading resources to speed up page load - // if(request.resourceType() == 'stylesheet' || request.resourceType() == 'font' || request.resourceType() == 'image') { - // request.abort() - // return - // } - // - // if (callbackRunOnce || !request.isNavigationRequest()) { - // request.continue() - // return - // } - // - // callbackRunOnce = true - // const overrides: Overrides = {} - // - // Object.keys(overrideResolvers).forEach((key: OverridesProps) => { - // // 
@ts-ignore - // overrides[key] = overrideResolvers[key](request) - // }); - // - // log.debug(`Overrides: ${JSON.stringify(overrides)}`) - // request.continue(overrides) - // } - // - // await page.setRequestInterception(true) - // page.on('request', callback) - // } - - return page } export async function browserRequest(params: V1Request): Promise { const oneTimeSession = params.session === undefined; + + const options: SessionCreateOptions = { + oneTimeSession: oneTimeSession, + cookies: params.cookies, + maxTimeout: params.maxTimeout, + proxy: params.proxy + } + const session: SessionsCacheItem = oneTimeSession - ? await sessions.create(null, { - oneTimeSession: true - }) + ? await sessions.create(null, options) : sessions.get(params.session) if (!session) { throw Error('This session does not exist. Use \'list_sessions\' to see all the existing sessions.') } - params = mergeSessionWithParams(session, params) - try { - const page = await setupPage(params, session.browser) - return await resolveChallengeWithTimeout(params, page) + // const page = await setupPage(params, session.browser) + return await resolveChallengeWithTimeout(params, session) } catch (error) { throw Error("Unable to process browser request. 
Error: " + error) } finally { diff --git a/src/services/utils.ts b/src/services/utils.ts index da0da3e..e2636ac 100644 --- a/src/services/utils.ts +++ b/src/services/utils.ts @@ -19,13 +19,3 @@ export function deleteFolderRecursive(path: string) { fs.rmdirSync(path) } } - -export const removeEmptyFields = (o: Record): typeof o => { - const r: typeof o = {} - for (const k in o) { - if (o[k] !== undefined) { - r[k] = o[k] - } - } - return r -} \ No newline at end of file diff --git a/src/tests/app.test.ts b/src/tests/app.test.ts index 9356083..587de59 100644 --- a/src/tests/app.test.ts +++ b/src/tests/app.test.ts @@ -7,6 +7,7 @@ import {testWebBrowserInstallation} from "../services/sessions"; const request = require("supertest"); const app = require("../app"); const version: string = require('../../package.json').version + const googleUrl = "https://www.google.com"; const cfUrl = "https://pirateiro.com/torrents/?search=s"; const cfCaptchaUrl = "https://idope.se" @@ -136,6 +137,60 @@ describe("Test '/v1' path", () => { expect(apiResponse.solution.url).toContain(cfCaptchaUrl) }); + test("Cmd 'request.get' should return OK with 'cookies' param", async () => { + const payload = { + "cmd": "request.get", + "url": googleUrl, + "cookies": [ + { + "name": "testcookie1", + "value": "testvalue1" + }, + { + "name": "testcookie2", + "value": "testvalue2" + } + ] + } + const response: Response = await request(app).post("/v1").send(payload); + expect(response.statusCode).toBe(200); + + const apiResponse: V1ResponseSolution = response.body; + expect(apiResponse.status).toBe("ok"); + + const solution = apiResponse.solution; + expect(solution.url).toContain(googleUrl) + expect(Object.keys(solution.cookies).length).toBeGreaterThan(1) + const cookie1: string = (solution.cookies as any[]).filter(function(cookie) { + return cookie.name == "testcookie1"; + })[0].value + expect(cookie1).toBe("testvalue1") + const cookie2: string = (solution.cookies as any[]).filter(function(cookie) { + 
return cookie.name == "testcookie2"; + })[0].value + expect(cookie2).toBe("testvalue2") + }); + + test("Cmd 'request.get' should return OK with 'returnOnlyCookies' param", async () => { + const payload = { + "cmd": "request.get", + "url": googleUrl, + "returnOnlyCookies": true + } + const response: Response = await request(app).post("/v1").send(payload); + expect(response.statusCode).toBe(200); + + const apiResponse: V1ResponseSolution = response.body; + + const solution = apiResponse.solution; + expect(solution.url).toContain(googleUrl) + expect(solution.status).toBe(200); + expect(solution.headers).toBe(null) + expect(solution.response).toBe(null) + expect(Object.keys(solution.cookies).length).toBeGreaterThan(0) + expect(solution.userAgent).toBe(null) + }); + test("Cmd 'request.get' should return timeout", async () => { const payload = { "cmd": "request.get", @@ -306,4 +361,5 @@ describe("Test '/v1' path", () => { expect(cfCookie2.length).toBeGreaterThan(30) expect(cfCookie2).toBe(cfCookie) }); + });