Code clean up, remove returnRawHtml, download, headers params

This commit is contained in:
ngosang 2021-10-17 20:43:36 +02:00
parent a0e897067a
commit a5b3e08e1f
6 changed files with 173 additions and 208 deletions

View File

@ -86,10 +86,7 @@ curl -L -X POST 'http://localhost:8191/v1' \
--data-raw '{
"cmd": "request.get",
"url":"http://www.google.com/",
"maxTimeout": 60000,
"headers": {
"X-Test": "Testing 123..."
}
"maxTimeout": 60000
}'
```
@ -140,11 +137,9 @@ Parameter | Notes
|--|--|
url | Mandatory
session | Optional. Will send the request from an existing browser instance. If one is not sent, it will create a temporary instance that will be destroyed immediately after the request is completed.
headers | Optional. To specify user headers.
maxTimeout | Optional, default value 60000. Max timeout to solve the challenge in milliseconds.
cookies | Optional. Will be used by the headless browser. Follow [this](https://github.com/puppeteer/puppeteer/blob/v3.3.0/docs/api.md#pagesetcookiecookies) format.
returnOnlyCookies | Optional, default false. Only returns the cookies. Response data, headers and other parts of the response are removed.
returnRawHtml | Optional, default false. The response data will be returned without JS processing. This is useful for JSON or plain text content.
Example response from running the `curl` above:
@ -211,16 +206,7 @@ This is the same as `request.get` but it takes one more param:
Parameter | Notes
|--|--|
postData | Must be a string. If you want to POST a form, don't forget to set the `Content-Type` header to `application/x-www-form-urlencoded` or the server might not understand your request.
### Download small files
If you need to access an image/pdf or small file, you should pass the `download` parameter to `request.get` setting it
to `true`. Rather than access the html and return text it will return the buffer **base64** encoded which you will be
able to decode and save the image/pdf.
This method isn't recommended for videos or anything larger, as that should be streamed back to the client and at the
moment there is nothing set up to do so. If this is something you need, feel free to create an issue and/or submit a PR.
postData | Must be a string. To POST a form, encode the data in `application/x-www-form-urlencoded` format (e.g. `a=b&c=d`).
## Environment variables

View File

@ -15,11 +15,11 @@ interface V1Routes {
export interface V1RequestBase {
cmd: string
cookies?: SetCookie[],
headers?: Headers
maxTimeout?: number
proxy?: any// TODO: use interface not any
session: string
userAgent?: string // deprecated, not used
headers?: Headers // deprecated v2, not used
userAgent?: string // deprecated v2, not used
}
interface V1RequestSession extends V1RequestBase {
@ -29,9 +29,9 @@ export interface V1Request extends V1RequestBase {
url: string
method?: HttpMethod
postData?: string
download?: boolean
returnOnlyCookies?: boolean
returnRawHtml?: boolean
download?: boolean // deprecated v2, not used
returnRawHtml?: boolean // deprecated v2, not used
}
export interface V1ResponseBase {
@ -59,7 +59,6 @@ export const routes: V1Routes = {
const options: SessionCreateOptions = {
oneTimeSession: false,
cookies: params.cookies,
headers: params.headers,
maxTimeout: params.maxTimeout,
proxy: params.proxy
}
@ -87,12 +86,15 @@ export const routes: V1Routes = {
},
'request.get': async (params: V1Request, response: V1ResponseSolution): Promise<void> => {
params.method = 'GET'
if (params.userAgent) {
log.warn('Request parameter "userAgent" was removed in FlareSolverr v2.')
}
if (params.postData) {
throw Error('Cannot use "postBody" when sending a GET request.')
}
if (params.returnRawHtml) {
log.warn("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
}
if (params.download) {
log.warn("Request parameter 'download' was removed in FlareSolverr v2.")
}
const result: ChallengeResolutionT = await browserRequest(params)
response.status = result.status;
@ -104,12 +106,15 @@ export const routes: V1Routes = {
},
'request.post': async (params: V1Request, response: V1ResponseSolution): Promise<void> => {
params.method = 'POST'
if (params.userAgent) {
log.warn('Request parameter "userAgent" was removed in FlareSolverr v2.')
}
if (!params.postData) {
throw Error('Must send param "postBody" when sending a POST request.')
}
if (params.returnRawHtml) {
log.warn("Request parameter 'returnRawHtml' was removed in FlareSolverr v2.")
}
if (params.download) {
log.warn("Request parameter 'download' was removed in FlareSolverr v2.")
}
const result: ChallengeResolutionT = await browserRequest(params)
response.status = result.status;
@ -133,8 +138,15 @@ export async function controllerV1(req: Request, res: Response): Promise<void> {
try {
const params: V1RequestBase = req.body
if (!params.cmd) {
throw Error("Parameter 'cmd' is mandatory.")
throw Error("Request parameter 'cmd' is mandatory.")
}
if (params.headers) {
log.warn("Request parameter 'headers' was removed in FlareSolverr v2.")
}
if (params.userAgent) {
log.warn("Request parameter 'userAgent' was removed in FlareSolverr v2.")
}
const route = routes[params.cmd]
if (route) {
await route(params, response)

View File

@ -2,22 +2,17 @@ import {v1 as UUIDv1} from 'uuid'
import * as os from 'os'
import * as path from 'path'
import * as fs from 'fs'
import {LaunchOptions, Headers, SetCookie, Browser} from 'puppeteer'
import {LaunchOptions, SetCookie, Browser} from 'puppeteer'
import log from './log'
import {deleteFolderRecursive, sleep, removeEmptyFields} from './utils'
import {deleteFolderRecursive, sleep} from './utils'
const puppeteer = require('puppeteer');
interface SessionPageDefaults {
headers?: Headers
}
export interface SessionsCacheItem {
sessionId: string
browser: Browser
userDataDir?: string
defaults: SessionPageDefaults
}
interface SessionsCache {
@ -27,7 +22,6 @@ interface SessionsCache {
export interface SessionCreateOptions {
oneTimeSession: boolean
cookies?: SetCookie[],
headers?: Headers
maxTimeout?: number
proxy?: any// TODO: use interface not any
}
@ -79,6 +73,8 @@ export async function testWebBrowserInstallation(): Promise<void> {
export async function create(session: string, options: SessionCreateOptions): Promise<SessionsCacheItem> {
const sessionId = session || UUIDv1()
// todo: cookies can't be set in the session, you need to open the page first
// todo: these args are only supported in chrome
let args = [
'--no-sandbox',
@ -126,18 +122,14 @@ export async function create(session: string, options: SessionCreateOptions): Pr
}
}
if (!browser) { throw Error(`Failed to launch browser 3 times in a row.`) }
if (options.cookies) {
const page = await browser.newPage()
await page.setCookie(...options.cookies)
if (!browser) {
throw Error(`Failed to launch browser 3 times in a row.`)
}
sessionCache[sessionId] = {
sessionId: sessionId,
browser: browser,
userDataDir: puppeteerOptions.userDataDir,
defaults: removeEmptyFields(options) // todo: review
userDataDir: puppeteerOptions.userDataDir
}
return sessionCache[sessionId]

View File

@ -1,8 +1,8 @@
import {Response, Headers, Page, Browser} from 'puppeteer'
import {Response, Headers, Page} from 'puppeteer'
const Timeout = require('await-timeout');
import log from './log'
import {SessionsCacheItem} from "./sessions";
import {SessionCreateOptions, SessionsCacheItem} from "./sessions";
import {V1Request} from "../controllers/v1";
import cloudflareProvider from '../providers/cloudflare';
@ -23,22 +23,11 @@ export interface ChallengeResolutionT {
result: ChallengeResolutionResultT
}
// interface OverrideResolvers {
// method?: (request: Request) => HttpMethod,
// postData?: (request: Request) => string,
// headers?: (request: Request) => Headers
// }
//
// type OverridesProps =
// 'method' |
// 'postData' |
// 'headers'
async function resolveChallengeWithTimeout(params: V1Request, page: Page) {
async function resolveChallengeWithTimeout(params: V1Request, session: SessionsCacheItem) {
const maxTimeout = params.maxTimeout || 60000
const timer = new Timeout();
try {
const promise = resolveChallenge(params, page);
const promise = resolveChallenge(params, session);
return await Promise.race([
promise,
timer.set(maxTimeout, `Maximum timeout reached. maxTimeout=${maxTimeout} (ms)`)
@ -48,25 +37,49 @@ async function resolveChallengeWithTimeout(params: V1Request, page: Page) {
}
}
async function resolveChallenge({ url, proxy, download, returnOnlyCookies, returnRawHtml }: V1Request,
page: Page): Promise<ChallengeResolutionT | void> {
async function resolveChallenge(params: V1Request, session: SessionsCacheItem): Promise<ChallengeResolutionT | void> {
try {
let status = 'ok'
let message = ''
if (proxy) {
const page: Page = await session.browser.newPage()
// the user-agent is changed just for linux arm build
await page.setUserAgent(sessions.getUserAgent())
// todo: review
if (params.proxy) {
log.debug("Apply proxy");
if (proxy.username)
await page.authenticate({ username: proxy.username, password: proxy.password });
if (params.proxy.username) {
await page.authenticate({
username: params.proxy.username,
password: params.proxy.password
});
}
}
log.debug(`Navigating to... ${url}`)
let response: Response = await page.goto(url, { waitUntil: 'domcontentloaded' })
log.debug(`Navigating to... ${params.url}`)
let response: Response = await page.goto(params.url, { waitUntil: 'domcontentloaded' })
// set cookies
if (params.cookies) {
for (const cookie of params.cookies) {
// the other fields in the cookie can cause issues
await page.setCookie({
"name": cookie.name,
"value": cookie.value
})
}
// reload the page
response = await page.goto(params.url, { waitUntil: 'domcontentloaded' })
}
// log html in debug mode
log.html(await page.content())
// Detect protection services and solve challenges
try {
response = await cloudflareProvider(url, page, response);
response = await cloudflareProvider(params.url, page, response);
} catch (e) {
status = "error";
message = "Cloudflare " + e.toString();
@ -81,31 +94,16 @@ async function resolveChallenge({ url, proxy, download, returnOnlyCookies, retur
headers: response.headers(),
response: null,
cookies: await page.cookies(),
userAgent: await page.evaluate(() => navigator.userAgent)
userAgent: sessions.getUserAgent()
}
}
if (returnOnlyCookies) {
if (params.returnOnlyCookies) {
payload.result.headers = null;
payload.result.userAgent = null;
} else {
if (download) {
// for some reason we get an error unless we reload the page
// has something to do with a stale buffer and this is the quickest
// fix since I am short on time
response = await page.goto(url, { waitUntil: 'domcontentloaded' })
payload.result.response = (await response.buffer()).toString('base64')
// todo: review this functionality
// } else if (returnRawHtml) {
// payload.result.response = await response.text()
} else {
payload.result.response = await page.content()
}
}
// Add final url in result
payload.result.url = page.url();
// make sure the page is closed because if it isn't an error will be thrown
// when a user uses a temporary session, the browser may be quit before
@ -113,102 +111,33 @@ async function resolveChallenge({ url, proxy, download, returnOnlyCookies, retur
await page.close()
return payload
}
/**
 * Builds the effective request parameters for a session.
 *
 * The session's stored `defaults` are used as a base and the fields of the
 * incoming request are spread on top of them, so request values win on
 * conflicts. Headers are merged key by key (request headers override the
 * session's default headers) rather than being replaced wholesale.
 *
 * @param session - cache entry whose `defaults` supply the base values
 * @param params - parameters of the current request
 * @returns a new object; neither input is modified
 */
function mergeSessionWithParams({ defaults }: SessionsCacheItem, params: V1Request): V1Request {
  const merged = { ...defaults, ...params }

  // headers need a per-key merge: a missing side counts as "no headers"
  const sessionHeaders = defaults.headers || {}
  const requestHeaders = params.headers || {}
  merged.headers = { ...sessionHeaders, ...requestHeaders }

  return merged
}
async function setupPage(params: V1Request, browser: Browser): Promise<Page> {
const page = await browser.newPage()
// merge session defaults with params
const { method, postData, headers, cookies } = params
// the user-agent is changed just for linux arm build
await page.setUserAgent(sessions.getUserAgent())
// todo: redo all functionality
// let overrideResolvers: OverrideResolvers = {}
//
// if (method !== 'GET') {
// log.debug(`Setting method to ${method}`)
// overrideResolvers.method = request => method
// }
//
// if (postData) {
// log.debug(`Setting body data to ${postData}`)
// overrideResolvers.postData = request => postData
// }
//
// if (headers) {
// log.debug(`Adding custom headers: ${JSON.stringify(headers)}`)
// overrideResolvers.headers = request => Object.assign(request.headers(), headers)
// }
//
// if (cookies) {
// log.debug(`Setting custom cookies: ${JSON.stringify(cookies)}`)
// await page.setCookie(...cookies)
// }
//
// // if any keys have been set on the object
// if (Object.keys(overrideResolvers).length > 0) {
// let callbackRunOnce = false
// const callback = (request: Request) => {
//
// // avoid loading resources to speed up page load
// if(request.resourceType() == 'stylesheet' || request.resourceType() == 'font' || request.resourceType() == 'image') {
// request.abort()
// return
// }
//
// if (callbackRunOnce || !request.isNavigationRequest()) {
// request.continue()
// return
// }
//
// callbackRunOnce = true
// const overrides: Overrides = {}
//
// Object.keys(overrideResolvers).forEach((key: OverridesProps) => {
// // @ts-ignore
// overrides[key] = overrideResolvers[key](request)
// });
//
// log.debug(`Overrides: ${JSON.stringify(overrides)}`)
// request.continue(overrides)
// }
//
// await page.setRequestInterception(true)
// page.on('request', callback)
// }
return page
} catch (e) {
log.error("Unexpected error: " + e);
throw e;
}
}
export async function browserRequest(params: V1Request): Promise<ChallengeResolutionT> {
const oneTimeSession = params.session === undefined;
const options: SessionCreateOptions = {
oneTimeSession: oneTimeSession,
cookies: params.cookies,
maxTimeout: params.maxTimeout,
proxy: params.proxy
}
const session: SessionsCacheItem = oneTimeSession
? await sessions.create(null, {
oneTimeSession: true
})
? await sessions.create(null, options)
: sessions.get(params.session)
if (!session) {
throw Error('This session does not exist. Use \'list_sessions\' to see all the existing sessions.')
}
params = mergeSessionWithParams(session, params)
try {
const page = await setupPage(params, session.browser)
return await resolveChallengeWithTimeout(params, page)
// const page = await setupPage(params, session.browser)
return await resolveChallengeWithTimeout(params, session)
} catch (error) {
throw Error("Unable to process browser request. Error: " + error)
} finally {

View File

@ -19,13 +19,3 @@ export function deleteFolderRecursive(path: string) {
fs.rmdirSync(path)
}
}
/**
 * Returns a shallow copy of `o` without the properties whose value is
 * `undefined`.
 *
 * Only `undefined` counts as "empty": `null`, `0`, `''` and `false` are kept.
 * The input object is not modified. Because the keys are walked with
 * `for...in`, inherited enumerable properties are copied onto the result too.
 *
 * @param o - object to clean
 * @returns a new plain object holding every defined property of `o`
 */
export const removeEmptyFields = (o: Record<string, any>): typeof o => {
  const cleaned: typeof o = {}

  for (const key in o) {
    const value = o[key]
    if (value === undefined) {
      continue
    }
    cleaned[key] = value
  }

  return cleaned
}

View File

@ -7,6 +7,7 @@ import {testWebBrowserInstallation} from "../services/sessions";
const request = require("supertest");
const app = require("../app");
const version: string = require('../../package.json').version
const googleUrl = "https://www.google.com";
const cfUrl = "https://pirateiro.com/torrents/?search=s";
const cfCaptchaUrl = "https://idope.se"
@ -136,6 +137,60 @@ describe("Test '/v1' path", () => {
expect(apiResponse.solution.url).toContain(cfCaptchaUrl)
});
test("Cmd 'request.get' should return OK with 'cookies' param", async () => {
const payload = {
"cmd": "request.get",
"url": googleUrl,
"cookies": [
{
"name": "testcookie1",
"value": "testvalue1"
},
{
"name": "testcookie2",
"value": "testvalue2"
}
]
}
const response: Response = await request(app).post("/v1").send(payload);
expect(response.statusCode).toBe(200);
const apiResponse: V1ResponseSolution = response.body;
expect(apiResponse.status).toBe("ok");
const solution = apiResponse.solution;
expect(solution.url).toContain(googleUrl)
expect(Object.keys(solution.cookies).length).toBeGreaterThan(1)
const cookie1: string = (solution.cookies as any[]).filter(function(cookie) {
return cookie.name == "testcookie1";
})[0].value
expect(cookie1).toBe("testvalue1")
const cookie2: string = (solution.cookies as any[]).filter(function(cookie) {
return cookie.name == "testcookie2";
})[0].value
expect(cookie2).toBe("testvalue2")
});
test("Cmd 'request.get' should return OK with 'returnOnlyCookies' param", async () => {
const payload = {
"cmd": "request.get",
"url": googleUrl,
"returnOnlyCookies": true
}
const response: Response = await request(app).post("/v1").send(payload);
expect(response.statusCode).toBe(200);
const apiResponse: V1ResponseSolution = response.body;
const solution = apiResponse.solution;
expect(solution.url).toContain(googleUrl)
expect(solution.status).toBe(200);
expect(solution.headers).toBe(null)
expect(solution.response).toBe(null)
expect(Object.keys(solution.cookies).length).toBeGreaterThan(0)
expect(solution.userAgent).toBe(null)
});
test("Cmd 'request.get' should return timeout", async () => {
const payload = {
"cmd": "request.get",
@ -306,4 +361,5 @@ describe("Test '/v1' path", () => {
expect(cfCookie2.length).toBeGreaterThan(30)
expect(cfCookie2).toBe(cfCookie)
});
});