@letsscrapedata/proxy
v0.0.27
Published
proxy manager used to scrape data
Maintainers
Readme
For help and for discussion about how to scrape a website, please use the Discord server, where questions are answered quickly. For bugs and feature requests, please submit issues on GitHub so that they can be tracked.
Features
This package is used by LetsScrapeData App to manage different types of proxies from different vendors using the same interfaces.
- Same interface for managing proxies from different vendors.
- Proxy Manager: manage the different proxies according to requirements.
- Most ProxyAccountManagers require a username/password or key or token to manage proxies.
Install
npm install @letsscrapedata/proxy
Examples
// Example: load ISP proxies from a text file into a GeneralAccount, register the
// account with a ProxyManager, and request proxies that match some requirements.
// Top-level `await` is used, so run this as an ES module.
import { setTimeout as sleep } from "node:timers/promises";
import { ProxyManager, GeneralAccount, AddPackageData, ProxyRequirements } from "@letsscrapedata/proxy";

// File from which the proxies are added (see startRefresh below).
const filename = "isp-proxies.txt";
const generalAccount = new GeneralAccount();

// Describes the proxy package that the proxies in `filename` belong to.
const addPackageData: AddPackageData = {
  proxyIpType: "isp",
  proxyDurationType: "static",
  proxySharedType: "dedicated",
  billingModel: "period",
  vendorName: "unkown", // important for some proxy vendors (spelling kept as published - confirm the value the library expects)
  // priority: 10, // the smaller the higher
  maxProxyDuration: 3600 * 24, // seconds
  // maxSessionDuraion: 3600, // for rotating proxies
  maxUsersPerIp: 4,
  maxConcurrencyPerUser: 10,
  bandwidth: 0, // unknown
  filename
};

// update() is declared to return a Promise, so wait for the package to be
// added before starting to refresh proxies from it.
await generalAccount.update("addPackage", addPackageData);

// add new proxies from filename regularly
// NOTE(review): ProxyAccountManager.startRefresh declares a RefreshOptions
// argument; confirm that GeneralAccount accepts a call without one.
await generalAccount.startRefresh();

const proxyManager = new ProxyManager();
proxyManager.addProxyAccount(generalAccount);
// more different ProxyAccounts can be added

// get proxies according to requirements
const proxyRequirements: ProxyRequirements = {
  proxyIpSharedTypes: ["rd", "id"], // residential-dedicated or isp-dedicated proxy
  minProxyValidSeconds: 3600
};
// Ask for at most 2 proxies that match the requirements.
const proxies = await proxyManager.getProxies(proxyRequirements, 2);

// Placeholder for the real work done with `proxies`: wait 20 seconds (the delay is in milliseconds).
await sleep(20000);
await proxyManager.close();
interface ProxyAccountManager
A new ProxyAccountManager implementation should provide information that is as complete and accurate as possible, so that the crawler scheduler can accurately determine whether a proxy is applicable and can control the number of users and the access concurrency.
/**
* A proxy account manager manages the proxies that an account has purchased from a proxy vendor. Basic concepts:
* * ProxyAccount: an account that usually manages the proxies provided by one proxy vendor. A proxyAccount may purchase 0 or many proxyPackages. A joint proxy account manages proxies provided by many vendors.
* * ProxyPackage: a package that you buy from a proxy vendor. Each proxy package usually contains many proxies of the same type, which will expire later.
* * Proxy: a network proxy that can be used to scrape data
* * NewProxyEvent: event emitted when new proxies are added
*
* NOTE(review): many methods return a boolean; presumably true means success - confirm against the implementations.
*/
export interface ProxyAccountManager extends EventEmitter {
/**
* Constructor contract (kept as a comment; it is not part of this interface).
* Each type of proxyAccountManager decides which parameters are required or optional:
* * the constructor should throw an error if the parameters are invalid.
*/
// new(options?: ProxyAccountManagerOptions): ProxyAccountManager;
/**
* Set new options for this manager. Each type of proxyAccountManager decides which options can be updated.
* @param options
*/
setOptions(options: ProxyAccountManagerOptions): boolean;
/**
* Start refreshing proxies:
* * refresh once immediately if options.refreshNow is true
* * refresh periodically if refreshIntervalSecs of the ProxyAccountManager is greater than 0
* @param options
*/
startRefresh(options: RefreshOptions): Promise<boolean>;
/**
* Stop refreshing periodically.
*/
stopRefresh(): boolean;
/**
* Get the proxy packages that meet the conditions.
* * used only by ProxyManager
* @param proxyRequirements
*/
_getProxyPackages(proxyRequirements: ProxyRequirements): ProxyPackageInfo[];
/**
* Get proxies that meet the conditions from a package.
* * used only by ProxyManager; users should use getProxies()
* @param proxyPackageInfo the package to get proxies from
* @param proxyRequirements
* @param proxyNum default 1
* @param onlyApplied default false, whether to get proxies only from applied proxies
* @param onlyApply default false; false - apply and allocate, true - apply but do not allocate
*/
_getProxiesFromPackage(
proxyPackageInfo: ProxyPackageInfo,
proxyRequirements: ProxyRequirements,
proxyNum?: number,
onlyApplied?: boolean,
onlyApply?: boolean
): Promise<Proxy[]>;
/**
* Get the applied proxies that meet proxyRequirements.
* (The declaration below is commented out, so this method is not part of the interface.)
* @param proxyRequirements
* @param proxyNum default 0, <=0: all, >0: the number of proxies to get
* @param onlyApply default false; false - apply and allocate, true - apply but do not allocate
*/
// getAppliedProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApply?: boolean): Promise<Proxy[]>;
/**
* Get proxies that meet the conditions, can be used now, and have the highest priority (and the minimum expireTime if the priority is the same).
* * The number of proxies returned may be less than proxyNum.
* * Returns [] if there are no proxies that meet the requirements.
* * Throws an error if there is an exception.
* @param proxyRequirements
* @param proxyNum default 1, the number of proxies to get
* @param onlyApplied default false, whether to get proxies only from applied proxies
*/
getProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApplied?: boolean): Promise<Proxy[]>;
/**
* Discard a proxy that is expired or invalid. This proxy will not be used again.
* @param proxy
*/
discardProxy(proxy: Proxy): boolean;
/**
* Free a busy proxy for later use.
* @param proxy
*/
freeProxy(proxy: Proxy): boolean;
/**
* Lock a proxy that cannot be used now but can be used again later (usually used to lock a **static** proxy):
* * Free this proxy if it is busy (in use)
* * Lock this proxy by updating lastAbnormalTime of this proxy (the status of the proxy is "idle")
* * Please don't lock a proxy if it is used to access many different websites
* @param proxy
*/
lockProxy(proxy: Proxy): boolean;
/**
* Optional method, launched manually and used by some types of proxyAccountManager; for example, GeneralAccount uses it to add a new package or new proxies.
* * It's best to refresh periodically in the startRefresh method.
* * Use this method only when it is required.
* @param opType
* @param data
*/
update(opType?: string, data?: object): Promise<boolean>;
/**
* Adjust the priority of packages.
* @param priority
* @param packageNames
*/
adjustPriorityOfPackages(priority: number, packageNames: PackageName[]): boolean;
/**
* Enable packages.
* @param packageNames
*/
enablePackages(packageNames: PackageName[]): boolean;
/**
* Disable packages.
* @param packageNames
*/
disablePackages(packageNames: PackageName[]): boolean;
/**
* Enable this proxyAccount.
*/
enable(): boolean;
/**
* Disable this proxyAccount.
*/
disable(): boolean;
// Accessors for the id, status, account, packages and proxies of this manager.
// NOTE(review): whether the returned objects/arrays are copies or live references is not visible here - confirm before mutating them.
proxyAccountId(): ProxyAccountId;
status(): ProxyAccountStatus;
proxyAccount(): ProxyAccount;
proxyPackages(): ProxyPackage[];
proxies(): Proxy[];
/**
* Close this proxy account manager (proxyAccount); it cannot be used again afterwards.
*/
close(): Promise<boolean>;
}