@letsscrapedata/scraper
v0.0.90
Web scraper that scrapes web pages using LetsScrapeData XML templates.
Please ask for help and discuss how to scrape a website on the Discord server, which responds quickly. For better tracking, please submit issues on GitHub.
Features
- Template driven web scraping
- You can quickly design templates for scraping different websites.
- The templates are intuitive and easy to maintain.
- Browser operations supported by the controller package
- The same interface for playwright, patchright, camoufox, puppeteer, and cheerio: easy to switch between them
- Web browsing automation: goto(open) / click / input / hover / select / scroll
- Automatic captcha solver: reCAPTCHA (v2 & v3), Cloudflare Turnstile, GeeTest (v3 & v4), image/text, coordinate
- State data management: cookies, localStorage, HTTP headers, custom session data
- Element selection by CSS selectors or XPath, whether in frames or not
- Automatic file saving: screenshots, PDF, MHTML, downloaded directly or by clicking
- API request
- Both the browser and the API can be used at the same time, sharing cookies and headers.
- HTTP headers: intercepted, generated automatically or by browser automation, obtained via API, or others
- Fingerprint management:
- Automatically generate fingerprints of the latest common browsers
- Simple rate limits: automatic flow control, such as interval / max concurrency / times per period
- Simple proxy management: multiple "static" proxies to increase concurrency
- Subtasks: complex tasks can be split into multiple simple subtasks for better maintenance and increased concurrency
- Data export
Install
npm install @letsscrapedata/scraper
Examples
- Example with default ScraperConfig:
// javascript
import { scraper } from "@letsscrapedata/scraper";
/**
* tid: ID of the template to be executed, such as the template for scraping one list of the example page "https://www.letsscrapedata.com/pages/listexample1.html"
* parasstrs: input parameters of the tasks, such as "1"
* this example will execute five tasks using template 10001, each of which scrapes the data of one page.
*/
const newTasks = [{ tid: 10001, parasstrs: ["1", "2", "3", "4", "5"] }];
/* The following line can do the same thing using subtasks, scraping the data in the first five pages */
// const newTasks = [{ tid: 10002, parasstrs: ["5"] }];
await scraper(newTasks);
- Example with ScraperConfig:
// typescript
import { scraper, TemplateTasks, ScraperConfig } from "@letsscrapedata/scraper";
const scraperConfig: ScraperConfig = {
browserConfigs: [
/* launch a chromium browser using puppeteer, no proxy */
{ browserControllerType: "puppeteer", proxyUrl: "" },
/* launch a chromium browser using playwright, proxy */
{ browserControllerType: "playwright", proxyUrl: "http://proxyId:port" },
/* connect to the current browser using patchright */
{ browserUrl: "http://localhost:9222/" },
],
// exitWhenCompleted: true,
// lsdLaunchOptions: { headless: true },
// loadUnfinishedTasks: true,
// loadFailedTasksInterval: 5
// captcha: { clientKey: "xxx" } // to solve captchas using 2captcha
};
const newTasks: TemplateTasks[] = [{ tid: 10002, parasstrs: ["9"] }];
await scraper(newTasks, scraperConfig);
ScraperConfig
Common configurations:
- Proxies and browsers: browserConfigs; by default a browser is launched using browserControllerType/browserType, without a proxy
- Browser launch options: lsdLaunchOptions, default {headless: false}
- Whether to load unfinished tasks: loadUnfinishedTasks, default false
- Whether to exit when completed: exitWhenCompleted, default false
- File format of scraped data: dataFileFormat, default "jsonl"
- API Key of captcha solver: captcha.clientKey
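For example, a configuration that combines these common options might look like the following sketch (all values are illustrative, not recommendations):
// typescript
import { scraper, TemplateTasks, ScraperConfig } from "@letsscrapedata/scraper";
/* a minimal sketch combining the common options above; all values are illustrative */
const scraperConfig: ScraperConfig = {
  /* launch one playwright browser, no proxy */
  browserConfigs: [{ browserControllerType: "playwright", proxyUrl: "" }],
  /* launch the browser headless */
  lsdLaunchOptions: { headless: true },
  /* resume tasks left unfinished by a previous run */
  loadUnfinishedTasks: true,
  /* exit the process once all tasks are completed */
  exitWhenCompleted: true,
  /* write scraped data as csv instead of the default "jsonl" */
  dataFileFormat: "csv",
  /* API key of the 2captcha solver; "xxx" is a placeholder */
  captcha: { clientKey: "xxx" },
};
const newTasks: TemplateTasks[] = [{ tid: 10001, parasstrs: ["1"] }];
await scraper(newTasks, scraperConfig);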
Complete configurations:
export interface ScraperConfig {
/**
* @default false
*/
exitWhenCompleted?: boolean;
/**
* whether to use the parasstr in the XML if a task's parasstr is ""
* @default false
*/
useParasstrInXmlIfNeeded?: boolean;
/**
* whether to load unfinished tasks
* @default false
*/
loadUnfinishedTasks?: boolean;
//////////////////////////////////////////////////////////////////////////// directory
/**
* @default "", which will use current directory of process + "/data/"
* if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
*/
baseDir?: string;
/**
* the filename in action_setvar_get/get_file must include inputFileDirPart for security.
* @default "LetsScrapeData"
*/
inputFileDirPart?: string;
//////////////////////////////////////////////////////////////////////////// browser
/**
* whether to use puppeteer-extra-plugin-stealth; consider using patchright instead
* @default false
*/
useStealthPlugin?: boolean;
/**
* default browserControllerType of BrowserConfig
* @default "patchright"
*/
browserControllerType?: BrowserControllerType;
/**
* default browserType of BrowserConfig
* @default "chromium"
*/
browserType?: LsdBrowserType;
/**
* @default { headless: false, geoip: true }
*/
lsdLaunchOptions?: LsdLaunchOptions;
/**
* @default {browserUrl: ""}
*/
lsdConnectOptions?: LsdConnectOptions;
/**
* Important: browsers to be launched (using proxyUrl) or connected (using browserUrl)
* @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
*/
browserConfigs?: BrowserConfig[];
//////////////////////////////////////////////////////////////////////////// captcha
captcha?: {
/**
* clientKey of 2captcha
*/
clientKey: string;
// if you need to solve captchas in camoufox, please contact the administrator
},
//////////////////////////////////////////////////////////////////////////// template
/**
* the default maximum number of concurrent tasks that can execute the same template in a browserContext
* @default 1
*/
maxConcurrency?: number;
/**
* @default ""
*/
readCode?: string;
/**
* @default []
*/
templateParas?: TemplatePara[];
//////////////////////////////////////////////////////////////////////////// scheduler
/**
* @default 10
*/
totalMaxConcurrency?: number;
/**
* minimum milliseconds between two tasks of the same template
* @default 2000
*/
minMiliseconds?: number;
//////////////////////////////////////////////////////////////////////////// data
/**
* whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
* @default false
*/
moveDataWhenStart?: boolean;
/**
** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
* @default "jsonl"
*/
dataFileFormat?: DataFileFormat;
/**
* valid only when dataFileFormat is "txt"
*/
columnSeperator?: string;
}
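As a sketch of how the scheduler and data options above fit together (illustrative values only):
// typescript
import { ScraperConfig } from "@letsscrapedata/scraper";
/* illustrative values only: throttle task execution and write tab-separated text output */
const throttledConfig: ScraperConfig = {
  /* at most 2 concurrent tasks per template in a browserContext */
  maxConcurrency: 2,
  /* at most 5 concurrent tasks overall */
  totalMaxConcurrency: 5,
  /* wait at least 3000 ms between two tasks of the same template */
  minMiliseconds: 3000,
  /* plain-text output; columnSeperator takes effect only with "txt" */
  dataFileFormat: "txt",
  columnSeperator: "\t",
};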
/**
* Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
*/
export interface BrowserConfig {
browserControllerType?: BrowserControllerType;
/**
* url used to connect to the current browser
** url starts with "http://", such as "http://localhost:9222/"
** browserUrl can be used when a manual login is done in advance.
*/
browserUrl?: string;
/**
* proxy
** no proxy will be used if proxyUrl is ""
** valid only if !browserUrl
*/
proxyUrl?: string;
/**
* type of browser to be launched
* valid only if !browserUrl
* @default "chromium"
*/
browserType?: LsdBrowserType;
}
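For example (a sketch; the URLs are placeholders), the priority rule works like this:
// typescript
import { BrowserConfig } from "@letsscrapedata/scraper";
const browserConfigs: BrowserConfig[] = [
  /* launch a chromium browser through a proxy */
  { browserControllerType: "playwright", proxyUrl: "http://proxyHost:port" },
  /* connect to a running browser; proxyUrl is ignored because browserUrl has higher priority */
  { browserUrl: "http://localhost:9222/", proxyUrl: "http://ignored:port" },
  /* launch a browser of the default type, no proxy */
  { browserType: "chromium", proxyUrl: "" },
];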