@vector341/wget-js
v1.0.2
Published
JavaScript implementation of wget
Downloads
4
Readme
wget 的 JavaScript 实现
实现 wget 的 -k 选项,将 HTML 文档中的外部链接转化为本地链接。这个过程涉及到解析 HTML 内容(jsdom),查找所有的外部链接(<a>、<iframe>、<link rel="stylesheet">、<script> 等),并将它们替换为本地路径/代理路径。
wget 保存网站以用于本地浏览
# 实用
wget -E -H -k -K -p http://<site>/<document>
# 镜像整个网站,包括所有资源
wget -m -k -K -E https://example.com
选项解释:
-m 开启镜像模式,等价于 -r -N -l inf --no-remove-listing。使用该选项往往会复制整个网站,包括很多不需要的资源。
-k 转换链接以适用于本地浏览。对于未下载的资源,链接会被转换为绝对地址,例如 /bar/img.gif -> http://hostname/bar/img.gif。
-p, --page-requisites 下载正确显示页面所需的所有元素,例如图片、样式等。不包括外部文档链接(即在 <a> 标签、<area> 标签或 <link> 标签(<link rel="stylesheet"> 除外)中指定的 URL)。
注:wget 的 -k 选项考虑了某个外部资源是否被下载,因此链接的转换发生在下载所有资源之后.
wget 底层实现
源码见 https://ftp.gnu.org/gnu/wget/ 。wget 实现了自己的 HTML 解析器(src/html-parse.c),用于提取 HTML 文档中的标签和属性。
再通过 interesting tag 和 interesting attribute 的列表,来决定哪些标签和属性需要被处理(src/html-url.c)。具体列表如下:
/* The list of known tags and functions used for handling them. Most
tags are simply harvested for URLs.

Each entry pairs a TAG_* identifier and the tag's name with the
handler registered for it. Tags registered with tag_find_urls have
their URL-bearing attributes listed in tag_url_attributes; "base",
"form", "img", "link", "meta" and "source" are registered with
dedicated handlers instead. */
static struct known_tag {
int tagid; /* TAG_* identifier for this tag */
const char *name; /* tag name string (lowercase in this table) */
tag_handler_t handler; /* function used for handling this tag */
} known_tags[] = {
{ TAG_A, "a", tag_find_urls },
{ TAG_APPLET, "applet", tag_find_urls },
{ TAG_AREA, "area", tag_find_urls },
{ TAG_BASE, "base", tag_handle_base },
{ TAG_BGSOUND, "bgsound", tag_find_urls },
{ TAG_BODY, "body", tag_find_urls },
{ TAG_EMBED, "embed", tag_find_urls },
{ TAG_FIG, "fig", tag_find_urls },
{ TAG_FORM, "form", tag_handle_form },
{ TAG_FRAME, "frame", tag_find_urls },
{ TAG_IFRAME, "iframe", tag_find_urls },
{ TAG_IMG, "img", tag_handle_img }, // tag_find_urls() plus handling "srcset"
{ TAG_INPUT, "input", tag_find_urls },
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
{ TAG_META, "meta", tag_handle_meta },
{ TAG_OBJECT, "object", tag_find_urls },
{ TAG_OVERLAY, "overlay", tag_find_urls },
{ TAG_SCRIPT, "script", tag_find_urls },
{ TAG_TABLE, "table", tag_find_urls },
{ TAG_TD, "td", tag_find_urls },
{ TAG_TH, "th", tag_find_urls },
{ TAG_VIDEO, "video", tag_find_urls },
{ TAG_AUDIO, "audio", tag_find_urls },
{ TAG_SOURCE, "source", tag_handle_img } // tag_find_urls() plus handling "srcset"
};
/* tag_url_attributes documents which attributes of which tags contain
URLs to harvest. It is used by tag_find_urls. */
/* Defines for the FLAGS. The two values are distinct bits, so a single
table entry may carry both (written ATTR_INLINE | ATTR_HTML). */
/* The link is "inline", i.e. needs to be retrieved for this document
to be correctly rendered. Inline links include inlined images,
stylesheets, children frames, etc. */
#define ATTR_INLINE 1
/* The link is expected to yield HTML contents. It's important not to
try to follow HTML obtained by following e.g. <img src="...">
regardless of content-type. Doing this causes infinite loops for
"images" that return non-404 error pages with links to the same
image. */
#define ATTR_HTML 2
/* For tags handled by tag_find_urls: attributes that contain URLs to
download.

One row per (tag, attribute) pair; a tag with several URL-bearing
attributes appears in several rows. */
static struct {
int tagid; /* TAG_* identifier of the tag the attribute belongs to */
const char *attr_name; /* name of the attribute holding the URL */
int flags; /* ATTR_INLINE and/or ATTR_HTML */
} tag_url_attributes[] = {
{ TAG_A, "href", ATTR_HTML },
{ TAG_APPLET, "code", ATTR_INLINE },
{ TAG_AREA, "href", ATTR_HTML },
{ TAG_BGSOUND, "src", ATTR_INLINE },
{ TAG_BODY, "background", ATTR_INLINE },
{ TAG_EMBED, "href", ATTR_HTML },
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_FIG, "src", ATTR_INLINE },
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_IMG, "href", ATTR_INLINE },
{ TAG_IMG, "lowsrc", ATTR_INLINE },
{ TAG_IMG, "src", ATTR_INLINE },
{ TAG_INPUT, "src", ATTR_INLINE },
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_OBJECT, "data", ATTR_INLINE },
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
{ TAG_TD, "background", ATTR_INLINE },
{ TAG_TH, "background", ATTR_INLINE },
{ TAG_VIDEO, "src", ATTR_INLINE },
{ TAG_VIDEO, "poster", ATTR_INLINE },
{ TAG_AUDIO, "src", ATTR_INLINE },
{ TAG_AUDIO, "poster", ATTR_INLINE },
{ TAG_SOURCE, "src", ATTR_INLINE },
};
/* The lists of interesting tags and attributes are built dynamically,
from the information above. However, some places in the code refer
to the attributes not mentioned here. We add them manually.

Each string is an attribute name; the trailing comment on each row
names the function that uses it. */
static const char *additional_attributes[] = {
"rel", /* used by tag_handle_link */
"type", /* used by tag_handle_link */
"http-equiv", /* used by tag_handle_meta */
"name", /* used by tag_handle_meta */
"content", /* used by tag_handle_meta */
"action", /* used by tag_handle_form */
"style", /* used by check_style_attr */
"srcset", /* used by tag_handle_img */
};对于文本为主的静态页面,大部分情况我们仅需要关注 TAG_A 和 TAG_LINK 即可。
