@letsscrapedata/controller
v0.0.71
Published
Unified browser / HTML controller interfaces that support patchright, camoufox, playwright, puppeteer and cheerio
Maintainers
Readme
For help and discussion on how to scrape a website, please use the Discord server, where responses are quick. Issues are better submitted on GitHub so that they can be tracked.
Features
This package is used by @letsscrapedata/scraper to make it easy to switch between different types of browser controllers and to support new anti-bot browser controllers without modifying existing programs.
- Same interface of patchright, camoufox, playwright, puppeteer, cheerio: easy to switch between them
- Web browsing automation: goto(open) / click / input / hover / select / scroll
- State data management: cookies, localStorage, HTTP Headers, custom session data
- Request and response interception management: data and HTTP headers
- Elements selection by CSS selectors or XPath: whether in frames or not
- Element's attributes: innerHtml, innerText, outerHtml, textContent, etc
- Automatic file saving: such as screenshot, pdf
- CDP message
- Page evaluate
- Fills in functions that individual browser controllers do not support, and provides workarounds for known issues
Install
npm install @letsscrapedata/controller
Examples
import { controller } from "@letsscrapedata/controller";

// Launch a browser through the unified controller: the first argument selects the
// browser controller ("patchright" here), the second the browser type.
const browser = await controller.launch("patchright", "chromium", { headless: false });
// const browser = await controller.launch("camoufox", "firefox", { headless: false });
try {
  // newBrowserContext() and getPage() are declared as returning `... | null`,
  // so both results are checked before use.
  const browserContext = await browser.newBrowserContext();
  if (!browserContext) {
    throw new Error("failed to create a browser context");
  }
  const page = await browserContext.getPage();
  if (!page) {
    throw new Error("failed to get a free page");
  }
  await page.goto("https://www.letsscrapedata.com/pages/listexample1.html");
  await page.screenshot({ path: "screenshot.png" });
} finally {
  // Close the browser even when navigation or the screenshot fails.
  await browser.close();
}
Same interfaces
- LsdElement
- LsdPage
- LsdBrowserContext
- LsdBrowser
- LsdBrowserController
LsdPage
/**
 * Page interface exposed by the controller package.
 * Some members are only valid for specific underlying controllers, as noted on each member.
 */
export interface LsdPage extends EventEmitter {
  /**
   * Get the LsdApiContext associated with this page's LsdBrowserContext
   * * only valid in playwright
   */
  apiContext(): LsdApiContext;
  /**
   * NOTE(review): presumably brings this page to the foreground; the meaning of the
   * boolean result is not documented here - confirm against the implementation.
   */
  bringToFront(): Promise<boolean>;
  /**
   * @returns the LsdBrowserContext of this page
   */
  browserContext(): LsdBrowserContext;
  /**
   * Clear the cookies of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearCookies(): Promise<boolean>;
  /**
   * Clear the localStorage of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearLocalStorage(): Promise<boolean>;
  /**
   * Clear all request interceptions on the page
   */
  clearRequestInterceptions(): Promise<boolean>;
  /**
   * Clear all response interceptions on the page
   */
  clearResponseInterceptions(): Promise<boolean>;
  /**
   * Clear the stateData of the current page(url):
   * * stateData: cookies, localStorage, indexedDB
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearStateData(): Promise<boolean>;
  /**
   * Close this page. Only a free page can be closed!
   */
  close(): Promise<boolean>;
  /**
   * Get the full HTML content of the page or a descendant frame
   * @param iframeOptions default [], selectors of descendant frames
   */
  content(iframeOptions?: IframeOption[]): Promise<string>;
  /**
   * @returns cookie items; NOTE(review): presumably those of the current page(url) - confirm
   */
  cookies(): Promise<CookieItem[]>;
  /**
   * Page evaluate.
   * NOTE(review): the method name is spelled `evalute` in this declaration; it is kept
   * unchanged here so that existing callers keep working.
   * @param fun function to evaluate; presumably executed in the page context - confirm
   * @param args optional arguments; presumably passed to fun - confirm
   */
  evalute(fun: Function, args?: any[]): Promise<any>;
  /**
   * @returns the first element matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement | null>;
  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   */
  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement[]>;
  /**
   * Free a busy page. All request and response interceptions will be cleared.
   */
  free(): Promise<boolean>;
  /**
   * Open the given url in this page.
   * @param url the url to open
   * @param options optional GotoOptions
   * @returns a boolean; NOTE(review): presumably whether the navigation succeeded - confirm
   */
  goto(url: string, options?: GotoOptions): Promise<boolean>;
  /**
   * @returns the id of this page
   */
  id(): string;
  /**
   * @returns whether this page is free (see use() and free())
   */
  isFree(): boolean;
  /**
   * valid only in CheerioPage
   * @param html
   * @param isHtml default true
   */
  load(html: string, isHtml?: boolean): boolean;
  /**
   * NOTE(review): the method name is spelled `localStroage` in this declaration; kept unchanged.
   * @returns localStorage data grouped as LocalStorageOrigin items
   */
  localStroage(): Promise<LocalStorageOrigin[]>;
  mainFrame(): AllFrame;
  maximizeViewport(): Promise<boolean>;
  pageHeight(): Promise<number>;
  pageInfo(): PageInfo;
  pageWidth(): Promise<number>;
  pdf(options?: PDFOptions): Promise<Buffer>;
  screenshot(options?: ScreenshotOptions): Promise<Buffer>;
  scrollBy(x: number, y: number): Promise<boolean>;
  scrollTo(x: number, y: number): Promise<boolean>;
  /**
   * Send a CDP message over the current(not detached) or new CDP session
   * @param method protocol method name
   * @param params default null(ignored), method parameters
   * @param detach default true, whether to detach the CDPSession from target
   */
  sendCDPMessage(method: string, params?: object | null, detach?: boolean): Promise<any>;
  setCookies(cookies: CookieItem[]): Promise<boolean>;
  setExtraHTTPHeaders(headers: Record<string, string>): Promise<boolean>;
  /**
   * Set localStorage on the current web page(page.url())
   * NOTE(review): the method name is spelled `setLocalStroage` in this declaration; kept unchanged.
   * @param localStorageItems
   */
  setLocalStroage(localStorageItems: LocalStorageItem[]): Promise<boolean>;
  setPageInfo(pageInfo: UpdatablePageInfo): boolean;
  /**
   * Intercept requests that meet the conditions(requestMatch) to perform an action(action and fulfill).
   * @param options one option or an array of options
   */
  setRequestInterception(options: RequestInterceptionOption | RequestInterceptionOption[]): Promise<boolean>;
  /**
   * Intercept responses that meet the conditions(requestMatch and responseMatch) to perform actions(cacheArray and handler)
   * @param options one option or an array of options
   */
  setResponseInterception(options: ResponseInterceptionOption | ResponseInterceptionOption[]): Promise<boolean>;
  /**
   * Shortcut for LsdPage.browserContext().setStateData(stateData)
   * @param stateData
   */
  setStateData(stateData: BrowserStateData): Promise<boolean>;
  /**
   * valid only in puppeteer
   * @param userAgent
   */
  setUserAgent(userAgent: string): Promise<boolean>;
  setViewportSize(viewPortSize: ViewportSize): Promise<boolean>;
  stateData(): Promise<BrowserStateData>;
  status(): PageStatus;
  title(): Promise<string>;
  url(): string;
  /**
   * Start to use this free page
   */
  use(): boolean;
  /**
   * @param selector CSS selector, not XPath
   * @param options
   */
  waitForElement(selector: string, options?: WaitElementOptions): Promise<boolean>;
  /**
   * @param options
   */
  waitForNavigation(options: WaitNavigationOptions): Promise<boolean>;
  /**
   * obj=window?.[key1]...?.[keyn]
   * @return obj ? JSON.stringify(obj) : ""
   * @param keys
   */
  windowMember(keys: string[]): Promise<string>;
  /**
   * NOTE(review): presumably the underlying controller-specific page object - confirm
   */
  _origPage(): AllPage;
}
LsdElement
/**
 * Element interface exposed by the controller package: methods to extract data from
 * an element, plus methods to operate it (the latter only valid for browsers).
 */
export interface LsdElement {
  /////////////////////////////////////////////////////////////////////////////// methods used to extract data from the element
  /**
   * @return the value of a specified attribute on the element
   * @param attributeName
   */
  attribute(attributeName: string): Promise<string>;
  /**
   * @returns the attribute names of the element
   */
  attributeNames(): Promise<string[]>;
  /**
   * @returns the first element matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement | null>;
  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement[]>;
  /**
   * @returns whether the element has the specified attribute or not
   * @param attributeName
   */
  hasAttribute(attributeName: string): Promise<boolean>;
  /**
   * @returns the HTML or XML markup contained within the element
   */
  innerHtml(): Promise<string>;
  /**
   * @returns innerText of element
   * @param onlyChild default false, whether to include only the text of the child text nodes
   */
  innerText(onlyChild?: boolean): Promise<string>;
  /**
   * @returns the serialized HTML fragment describing the element including its descendants
   */
  outerHtml(): Promise<string>;
  /**
   * @returns textContent of element
   */
  textContent(): Promise<string>;
  /////////////////////////////////////////////////////////////////////////////// methods to operate the element(only valid for browser)
  /**
   * Click this element.
   * @param options default {button: "left", count: 1, delay: 0, modifies: []}
   */
  click(options?: MouseClickOptions): Promise<boolean>;
  focus(): Promise<boolean>;
  hover(): Promise<boolean>;
  /**
   * Input a value into this element.
   * * playwright: fill
   * * puppeteer: type
   * @param value the value to input
   * @param options optional InputOptions
   */
  input(value: string, options?: InputOptions): Promise<boolean>;
  /**
   * @param key the key to press
   * @param options required in this declaration (unlike the optional options of click/input)
   */
  press(key: KeyInput, options: KeyPressOptions): Promise<boolean>;
  screenshot(options?: ScreenshotOptions): Promise<Buffer>;
  scrollIntoView(): Promise<boolean>;
  select(options: SelectOptions): Promise<boolean>;
  setAttribute(attributeName: string, newValue: string): Promise<boolean>;
  /**
   * NOTE(review): presumably the underlying controller-specific element object - confirm
   */
  _origElement(): AllElement;
}
LsdBrowserContext
/**
* Browser context interface exposed by the controller package; it hands out pages
* (see getPage()) and carries state data (see setStateData()).
*/
export interface LsdBrowserContext extends EventEmitter {
/**
* Get the LsdApiContext associated with this LsdBrowserContext
* * only valid in playwright
*/
apiContext(): LsdApiContext;
/**
* @returns the LsdBrowser of this browser context
*/
browser(): LsdBrowser;
close(): Promise<boolean>;
/**
* Close pages that have been free for more than maxPageFreeSeconds if maxPageFreeSeconds > 0
* * but the last page in the browserContext will not be closed
* @param maxPageFreeSeconds default 0, meaning the default maxPageFreeSeconds of the browserContext will be used
*/
closeFreePages(maxPageFreeSeconds?: number): Promise<boolean>;
/**
* Does this browser context meet browserContextRequirements (incognitos ignored in browser)?
* @param browserContextRequirements
*/
doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
/**
* Get a free page from current pages or by creating a new page
* @param always NOTE(review): not documented in this declaration - confirm its meaning against the implementation
* @returns a page, or null (declared return type is LsdPage | null)
*/
getPage(always?: boolean): Promise<LsdPage | null>;
/**
* Whether a number of free page(s) can be obtained
* * refer to getPage()
* @param pageNum default 1, the number of free pages
*/
hasFreePage(pageNum?: number): boolean;
id(): string;
isIncognito(): boolean;
page(pageIdx: number): LsdPage | null;
pages(): LsdPage[];
proxy(): ProxyInController | null; // spare / reserved (translated from the original comment)
setStateData(stateData: BrowserStateData): Promise<boolean>;
/**
* NOTE(review): presumably the underlying controller-specific browser context object - confirm
*/
_origBrowserContext(): AllBrowserContext;
}
LsdBrowser
/**
 * Browser interface exposed by the controller package; instances are obtained from
 * LsdBrowserController.launch/connect.
 */
export interface LsdBrowser extends EventEmitter {
  // By default, constructor can be called in LsdBrowserController.launch/connect to create new instance
  // main methods
  /**
   * Create a new browser context.
   * @param options optional LsdBrowserContextOptions
   * @returns the new browser context, or null (declared return type is LsdBrowserContext | null)
   */
  newBrowserContext(options?: LsdBrowserContextOptions): Promise<LsdBrowserContext | null>;
  /**
   * 1. launched: close all browserContexts and this browser
   * 2. connected:
   * * in puppeteer: close all browserContexts and this browser???
   * * in playwright: only browserContexts created by newContext will be closed, browser is disconnected and browser will not be closed
   */
  close(): Promise<boolean>;
  // other methods
  browserContexts(): LsdBrowserContext[];
  browserControllerType(): BrowserControllerType;
  browserCreationMethod(): BrowserCreationMethod;
  browserType(): LsdBrowserType;
  /**
   * Does this browser meet browserContextRequirements (incognitos ignored in browser)?
   * @param browserContextRequirements
   */
  doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
  /**
   * @returns
   * 1. launched: actual executable path
   * 2. connected: executablePath in LsdConnectOptions, default ""(unknown)
   */
  executablePath(): string;
  id(): string;
  isConnected(): boolean;
  isHeadless(): boolean;
  /**
   * @returns the LsdLaunchOptions or LsdConnectOptions of this browser
   */
  options(): LsdLaunchOptions | LsdConnectOptions;
  /**
   * * puppeteer: return pid of connected or launched browser
   * * playwright: return pid of connected browser that is launched manually or using launchServer, or else return 0
   */
  pid(): number;
  /**
   * Get the cpu utility(%) and memory usage(MB) of browser processes if pid is greater than 0 (refer to pid())
   */
  pidUsage(): Promise<{ cpu: number, memory: number }>;
  version(): Promise<string>; // playwright: sync; puppeteer: async
  /**
   * NOTE(review): presumably the underlying controller-specific browser object - confirm
   */
  _origBrowser(): AllBrowser;
}
LsdBrowserController
/**
* Entry-point interface of the controller package: launches or connects to browsers
* and creates LsdApiContext instances.
*/
export interface LsdBrowserController {
/**
* Launch a new browser using the related browser controller
* @param browserControllerType
* @param browserType
* @param options
*/
launch(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdLaunchOptions): Promise<LsdBrowser>;
/**
* Connect to the current browser using the related browser controller
* @param browserControllerType
* @param browserType
* @param options
*/
connect(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdConnectOptions): Promise<LsdBrowser>;
/**
* @param puppeteer null means use puppeteer-extra-plugin-stealth based on puppeteer-extra
*/
setPuppeteerNode(puppeteer: PuppeteerNode | null): boolean;
/**
* @param browserType NOTE(review): presumably the browser type that playwrightBrowserType applies to - confirm
* @param playwrightBrowserType null means use puppeteer-extra-plugin-stealth based on playwright-extra
*/
setPlaywrightBrowserType(browserType: LsdBrowserType, playwrightBrowserType: BrowserType | null): boolean;
/**
* Create a new LsdApiContext, valid in playwright;
* @param options optional LsdApiContextOptions
*/
newApiContext(options?: LsdApiContextOptions): Promise<LsdApiContext>;
}