@vector341/wget-js

v1.0.2

Published

a year ago

JavaScript implementation of wget

0High
0Medium
0Low

wget 的 JavaScript 实现

实现 wget 的 -k（--convert-links）选项，将 HTML 文档中的外部链接转化为本地链接。这个过程涉及解析 HTML 内容（jsdom），查找所有的外部链接（<a>、<iframe>、<link rel="stylesheet">、<script> 等），并将它们替换为本地路径/代理路径。

wget 保存网站以用于本地浏览

# 实用
wget -E -H -k -K -p http://<site>/<document>
# 镜像整个网站，包括所有资源
wget -m -k -K -E https://example.com

选项解释：

-m 开启镜像模式，等价于 -r -N -l inf --no-remove-listing. 使用该选项往往会复制整个网站，包括很多不需要的资源
-k 转换链接以适用于本地浏览：已下载资源的链接会被改写为相对路径；未下载资源的链接会被改写为带主机名的绝对 URL，例如 /bar/img.gif -> http://hostname/bar/img.gif
-p, --page-requisites 下载正确显示页面所必需的所有元素，例如图片、样式等. 不包括外部文档链接（即在 <A> 标签、<AREA> 标签或 <LINK> 标签（除了 <LINK REL="stylesheet"> 之外）中指定的 URL）

注：wget 的 -k 选项考虑了某个外部资源是否被下载，因此链接的转换发生在下载所有资源之后.

wget 底层实现

源码见 https://ftp.gnu.org/gnu/wget/ 。wget 实现了自己的 HTML 解析器（src/html-parse.c），用于提取 HTML 文档中的标签和属性；再通过 interesting tag 和 interesting attribute 的列表，来决定哪些标签和属性需要被处理（src/html-url.c）。具体列表如下：

/* Table of the HTML tags this module recognizes.  Each entry pairs a
   tag id and its tag name with the function used to handle that tag.
   The generic tag_find_urls, which just harvests URLs, covers most
   tags; base, form, img/source, link and meta get dedicated
   handlers.  */
static struct known_tag {
  int tagid;
  const char *name;
  tag_handler_t handler;
} known_tags[] = {
  { .tagid = TAG_A,       .name = "a",       .handler = tag_find_urls },
  { .tagid = TAG_APPLET,  .name = "applet",  .handler = tag_find_urls },
  { .tagid = TAG_AREA,    .name = "area",    .handler = tag_find_urls },
  { .tagid = TAG_BASE,    .name = "base",    .handler = tag_handle_base },
  { .tagid = TAG_BGSOUND, .name = "bgsound", .handler = tag_find_urls },
  { .tagid = TAG_BODY,    .name = "body",    .handler = tag_find_urls },
  { .tagid = TAG_EMBED,   .name = "embed",   .handler = tag_find_urls },
  { .tagid = TAG_FIG,     .name = "fig",     .handler = tag_find_urls },
  { .tagid = TAG_FORM,    .name = "form",    .handler = tag_handle_form },
  { .tagid = TAG_FRAME,   .name = "frame",   .handler = tag_find_urls },
  { .tagid = TAG_IFRAME,  .name = "iframe",  .handler = tag_find_urls },
  /* tag_handle_img: tag_find_urls() plus handling of "srcset".  */
  { .tagid = TAG_IMG,     .name = "img",     .handler = tag_handle_img },
  { .tagid = TAG_INPUT,   .name = "input",   .handler = tag_find_urls },
  { .tagid = TAG_LAYER,   .name = "layer",   .handler = tag_find_urls },
  { .tagid = TAG_LINK,    .name = "link",    .handler = tag_handle_link },
  { .tagid = TAG_META,    .name = "meta",    .handler = tag_handle_meta },
  { .tagid = TAG_OBJECT,  .name = "object",  .handler = tag_find_urls },
  { .tagid = TAG_OVERLAY, .name = "overlay", .handler = tag_find_urls },
  { .tagid = TAG_SCRIPT,  .name = "script",  .handler = tag_find_urls },
  { .tagid = TAG_TABLE,   .name = "table",   .handler = tag_find_urls },
  { .tagid = TAG_TD,      .name = "td",      .handler = tag_find_urls },
  { .tagid = TAG_TH,      .name = "th",      .handler = tag_find_urls },
  { .tagid = TAG_VIDEO,   .name = "video",   .handler = tag_find_urls },
  { .tagid = TAG_AUDIO,   .name = "audio",   .handler = tag_find_urls },
  /* tag_handle_img: tag_find_urls() plus handling of "srcset".  */
  { .tagid = TAG_SOURCE,  .name = "source",  .handler = tag_handle_img }
};

/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Bit values for the FLAGS of a URL attribute. */

/* Set for "inline" links: resources that have to be retrieved for
   the document to render correctly, such as inlined images,
   stylesheets and children frames.  */
#define ATTR_INLINE     (1 << 0)

/* Set when the link is expected to yield HTML contents.  HTML reached
   through e.g. <img src="..."> must not be followed, whatever its
   content-type: an "image" that answers with a non-404 error page
   linking back to the same image would otherwise cause an infinite
   loop.  */
#define ATTR_HTML       (1 << 1)

/* Attributes whose values are URLs to download, listed per tag, for
   the tags processed by tag_find_urls.  A tag may appear more than
   once, with one entry per URL-bearing attribute.  The flags member
   holds ATTR_INLINE, ATTR_HTML, or both OR-ed together.  */
static struct {
  int tagid;
  const char *attr_name;
  int flags;
} tag_url_attributes[] = {
  { .tagid = TAG_A,       .attr_name = "href",       .flags = ATTR_HTML },
  { .tagid = TAG_APPLET,  .attr_name = "code",       .flags = ATTR_INLINE },
  { .tagid = TAG_AREA,    .attr_name = "href",       .flags = ATTR_HTML },
  { .tagid = TAG_BGSOUND, .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_BODY,    .attr_name = "background", .flags = ATTR_INLINE },
  { .tagid = TAG_EMBED,   .attr_name = "href",       .flags = ATTR_HTML },
  { .tagid = TAG_EMBED,   .attr_name = "src",        .flags = ATTR_INLINE | ATTR_HTML },
  { .tagid = TAG_FIG,     .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_FRAME,   .attr_name = "src",        .flags = ATTR_INLINE | ATTR_HTML },
  { .tagid = TAG_IFRAME,  .attr_name = "src",        .flags = ATTR_INLINE | ATTR_HTML },
  { .tagid = TAG_IMG,     .attr_name = "href",       .flags = ATTR_INLINE },
  { .tagid = TAG_IMG,     .attr_name = "lowsrc",     .flags = ATTR_INLINE },
  { .tagid = TAG_IMG,     .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_INPUT,   .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_LAYER,   .attr_name = "src",        .flags = ATTR_INLINE | ATTR_HTML },
  { .tagid = TAG_OBJECT,  .attr_name = "data",       .flags = ATTR_INLINE },
  { .tagid = TAG_OVERLAY, .attr_name = "src",        .flags = ATTR_INLINE | ATTR_HTML },
  { .tagid = TAG_SCRIPT,  .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_TABLE,   .attr_name = "background", .flags = ATTR_INLINE },
  { .tagid = TAG_TD,      .attr_name = "background", .flags = ATTR_INLINE },
  { .tagid = TAG_TH,      .attr_name = "background", .flags = ATTR_INLINE },
  { .tagid = TAG_VIDEO,   .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_VIDEO,   .attr_name = "poster",     .flags = ATTR_INLINE },
  { .tagid = TAG_AUDIO,   .attr_name = "src",        .flags = ATTR_INLINE },
  { .tagid = TAG_AUDIO,   .attr_name = "poster",     .flags = ATTR_INLINE },
  { .tagid = TAG_SOURCE,  .attr_name = "src",        .flags = ATTR_INLINE },
};

/* Attribute names that have to be added by hand.  The lists of
   interesting tags and attributes are built dynamically from the
   tables in this file, but some places in the code refer to
   attributes those tables do not mention.  Entries are grouped by
   the code that uses them.  */
static const char *additional_attributes[] = {
  /* tag_handle_link */
  "rel",
  "type",

  /* tag_handle_meta */
  "http-equiv",
  "name",
  "content",

  /* tag_handle_form */
  "action",

  /* check_style_attr */
  "style",

  /* tag_handle_img */
  "srcset",
};

对于文本为主的静态页面，大部分情况我们仅需要关注 TAG_A 和 TAG_LINK 即可。

Pkg
Stats

Discover Tips

General search

Package details

User packages

Sponsor

About

Twitter

GitHub

Twitter

GitHub

Site

Open Software & Tools

Framework

Server

Data Store

Caching

CSS / Styling

Typeface

Avatars

Data Viz

Date formatting

Infinite scrolling

Markdown rendering

Repository url parsing

User data

Compiling

Types

Odds & Ends

@vector341/wget-js

v1.0.2

Published

Vulnerabilities

Links

Maintainers

Keywords

Readme

wget 的 JavaScript 实现

wget 保存网站以用于本地浏览

wget 底层实现