wildspider

v1.2.23

Published

2 years ago

`import { CrawCallbackParam, Spider, Schedule } from 'wildspider'`

Downloads

0High
0Medium
0Low

xujif

开发

示例爬虫

import { CrawCallbackParam, Spider, Schedule } from 'wildspider'

// Example spider: starts from the money.163.com front page, follows the links
// in its "#ln_list1" list and returns one article record per followed page.
export default class Example extends Spider {
    // Declares the project name
    project = 'example'

    // Entry point, run on the schedule given by the cron expression below
    @Schedule.cron('0 */1 * * * *')
    start () {
        const startUrl = 'http://money.163.com/'
        // dispatch queues the next task; its first argument is the method
        // that will handle the fetched page
        this.dispatch(this.index, {
            url: startUrl
        })
    }

    // Handles the front page: queues a `detail` task for every news link.
    // age is the validity period of a url: an identical url is not crawled
    // again within that time
    // @Schedule.age({ minute: 2 })
    index ({ res }: CrawCallbackParam) {
        // res.doc() gives a cheerio handle that is queried like jQuery
        const $ = res.doc()
        $('#ln_list1 li a').each((_index, a) => {
            const href = $(a).attr('href')
            // An anchor without an href has nothing to crawl; dispatching it
            // would queue a task whose url is undefined
            if (!href) {
                return
            }
            this.dispatch(this.detail, {
                url: href
            })
        })
    }

    // Handles one article page and returns the extracted record, which the
    // returnResult decorator saves under 'article'.
    // A very large age means the page is effectively never crawled again
    @Schedule.age({ day: 1000000 })
    @Schedule.returnResult('article')
    async detail ({ req, res }: CrawCallbackParam) {
        const $ = res.doc()
        // Text of the node that carries both the publish time and the source
        const timeAndSrcText = $('#epContentLeft .post_time_source').text()
        const timeMatch = timeAndSrcText.match(/\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/)
        const fromMatch = timeAndSrcText.match(/(?:来源\:)\s*(.+)\s*/)

        const article: Record<string, unknown> = {
            url: req.url,
            title: $('#epContentLeft h1').text()
        }
        // datetime is only set when a "YYYY-MM-DD HH:MM:SS" stamp is present
        if (timeMatch && timeMatch.length > 0) {
            article.datetime = timeMatch[0]
        }
        // The greedy (.+) swallows trailing blanks before the final \s* can,
        // so the captured source name is trimmed explicitly
        const source = fromMatch && fromMatch.length > 1 ? fromMatch[1].trim() : ''
        // Fall back to '163' when no usable source name was found
        article.from = source !== '' ? source : '163'
        article.content = $('#endText').html()
        return article
    }
}

Spider方法

方法需要的参数参考代码

this.dispatch 分配下一个任务
this.save 保存结果
this.saveFile 保存文件，需传入内容
this.downLoadAndSaveFile 使用爬虫爬取文件并保存
this.sendMessage 给其他爬虫发消息（暂未使用）

装饰器说明：

装饰器出现在爬虫的方法上，改变其默认的行为配置

@Schedule.cron('0 */1 * * * *')
爬虫需要定时执行的方法，
只支持在start方法上
2.0 将支持任何方法（未部署）
@Schedule.returnResult('article') 将方法的返回值作为结果保存，参数为需要保存的【队列】；支持生成器，可通过 yield 返回多个结果
@Schedule.age()
爬虫的有效期，有效期内相同的请求不会被重复爬取
@Schedule.faildDelay(number:second)
爬取失败后，等待 x 秒后重试
@Schedule.reqInterval(number:second)
每个任务之间，需要间隔 x秒
@Schedule.noWait()
每个任务之间，不需要等待，尽快执行，相当于 reqInterval(0)
@Schedule.handleNon200(codes?:number[]) 接受非200返回，默认只有返回200才认为成功，否则认为请求失败，不会进入方法体
@Schedule.timeout(timeout:number) 请求的最大超时
@Schedule.ignoreSslError() 忽略ssl证书错误
@Schedule.priority(priority:number) 任务的优先级，支持0-32，任务优先级越高，越先爬取，默认为8

Published

Vulnerabilities

Links

Maintainers

Keywords

Readme

开发

示例爬虫

Spider方法

装饰器说明：