npm package discovery and stats viewer.

Discover Tips

  • General search

    [free text search, go nuts!]

  • Package details

    pkg:[package-name]

  • User packages

    @[username]

Sponsor

Optimize Toolset

I’ve always been into building performant and accessible sites, but lately I’ve been taking it extremely seriously. So much so that I’ve been building a tool to help me optimize and monitor the sites that I build to make sure that I’m making an attempt to offer the best experience to those who visit them. If you’re into performant, accessible and SEO friendly sites, you might like it too! You can check it out at Optimize Toolset.

About

Hi, 👋, I’m Ryan Hefner  and I built this site for me, and you! The goal of this site was to provide an easy way for me to check the stats on my npm packages, both for prioritizing issues and updates, and to give me a little kick in the pants to keep up on stuff.

As I was building it, I realized that I was actually using the tool to build the tool, and figured I might as well put this out there and hopefully others will find it to be a fast and useful way to search and browse npm packages as I have.

If you’re interested in other things I’m working on, follow me on Twitter or check out the open source projects I’ve been publishing on GitHub.

I am also working on a Twitter bot for this site to tweet the most popular, newest, random packages from npm. Please follow that account now and it will start sending out packages soon–ish.

Open Software & Tools

This site wouldn’t be possible without the immense generosity and tireless efforts from the people who make contributions to the world and share their work via open source initiatives. Thank you 🙏

© 2024 – Pkg Stats / Ryan Hefner

html-parser

v0.11.0

Published

HTML/XML parser with less explosions

Downloads

10,532

Readme

html-parser

Build Status NPM version

Now with less explosions!

The purpose of this library is not to be the best XML parsing library ever conceived. Because it's not. It's meant to be an HTML/XML parser that doesn't require valid HTML/XML. It's also meant to act as a sanitizer, which is the main reason for it's existence.

For example, you can just shove a blob of text into it, and it will happily parse as if it were valid XML.

Licensed under MIT.

Installation

npm install html-parser

Callback based parsing

var htmlParser = require('html-parser');

var html = '<!doctype html><html><body onload="alert(\'hello\');">Hello<br />world</body></html>';
htmlParser.parse(html, {
	openElement: function(name) { console.log('open: %s', name); },
	closeOpenedElement: function(name, token, unary) { console.log('token: %s, unary: %s', token, unary); },
	closeElement: function(name) { console.log('close: %s', name); },
	comment: function(value) { console.log('comment: %s', value); },
	cdata: function(value) { console.log('cdata: %s', value); },
	attribute: function(name, value) { console.log('attribute: %s=%s', name, value); },
	docType: function(value) { console.log('doctype: %s', value); },
	text: function(value) { console.log('text: %s', value); }
});

/*
doctype: html
open: html
close token: >
open: body
attribute: onload=alert('hello');
close token: >
text: Hello
open: br
close token: />, unary: true
text: world
close: body
close: html
*/

Sanitization

var htmlParser = require('html-parser');

var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>';
var sanitized = htmlParser.sanitize(html, {
	elements: [ 'script' ],
	attributes: [ 'onclick' ],
	comments: true
});

console.log(sanitized);
//<p>blah blah</p>

Using callbacks

var htmlParser = require('html-parser');

var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>';
var sanitized = htmlParser.sanitize(html, {
	elements: function(name) {
		return name === 'script';
	},
	attributes: function(name, value) {
		return /^on/i.test(name) || /^javascript:/i.test(value);
	},
	comments: true
});

console.log(sanitized);
//<p>blah blah</p>

Custom data elements

You can parser custom data elements like php code or underscore templates with regex.dataElements config

helpers.parseString('<div><?= "<div>$var</div>" ?></div>', {
    openElement: function(name) {
        console.log(name); // 'div'
    },
    closeElement: function(name) {
        console.log(name); // 'div'
    },
    phpEcho: function(value) {
        console.log(value); // {length: 61, someProperty: ' "<div>$var</div>" '}
    }
}, {
    dataElements: {
        phpEcho: {
            start: '<?=',
            data: function (string) {
                var index = string.indexOf('?>'),
                    code = string.slice(0, index);

                return code;
                // or
                return {
                    length: code.length, // required field
                    someProperty: code
                };
            },
            end: '?>'
        }
    }
});

API

/**
 * Parses the given string o' HTML, executing each callback when it
 * encounters a token.
 *
 * @param {String} htmlString A string o' HTML
 * @param {Object} [callbacks] Callbacks for each token
 * @param {Function} [callbacks.attribute] Takes the name of the attribute and its value
 * @param {Function} [callbacks.openElement] Takes the tag name of the element
 * @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to
 * close it (">", "/>", "?>")
 * @param {Function} [callbacks.closeElement] Takes the name of the element
 * @param {Function} [callbacks.comment] Takes the content of the comment
 * @param {Function} [callbacks.docType] Takes the content of the document type declaration
 * @param {Function} [callbacks.cdata] Takes the content of the CDATA
 * @param {Function} [callbacks.xmlProlog] Takes no arguments
 * @param {Function} [callbacks.text] Takes the value of the text node
 * @param {Object} [regex]
 * @param {RegExp} [regex.name] Regex for element name. Default is [a-zA-Z_][\w:\-\.]*
 * @param {RegExp} [regex.attribute] Regex for attribute name. Default is [a-zA-Z_][\w:\-\.]*
 * @param {Object.<callbackName,DataElementConfig>} [regex.dataElements] Config of data elements like docType, comment and your own custom data elements
 */
parse(htmlString, callbacks, regex)

/**
 * @typedef {Object} DataElementConfig
 * @property {String|RegExp|Function} start - start of data element, for example '<%' or /^<\?=/ or function(string){return string.slice(0, 2) === '<%' ? 2 : -1;}
 * @property {RegExp|Function} data - content of data element, for example /^[^\s]+/ or function(string){return string.match(/^[^\s]+/)[0];}
 * @property {String|RegExp|Function} end - end of data element, for example '%>' or /^\?>/ or function(string){return 2;}
 */

/**
 * Parses the HTML contained in the given file asynchronously.
 *
 * Note that this is merely a convenience function, it will still read the entire
 * contents of the file into memory.
 *
 * @param {String} fileName Name of the file to parse
 * @param {String} [encoding] Optional encoding to read the file in, defaults to utf8
 * @param {Object} [callbacks] Callbacks to pass to parse()
 * @param {Function} [callback]
 */
parseFile(fileName, encoding, callbacks, callback)

/**
 * Sanitizes an HTML string.
 *
 * If removalCallbacks is not given, it will simply reformat the HTML
 * (i.e. converting all tags to lowercase, etc.). Note that this function
 * assumes that the HTML is decently formatted and kind of valid. It
 * may exhibit undefined or unexpected behavior if your HTML is trash.
 *
 * @param {String} htmlString A string o' HTML
 * @param {Object} [removalCallbacks] Callbacks for each token type
 * @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip
 * @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip
 * @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments
 * @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations
 * @return {String} The sanitized HTML
 */
sanitize(htmlString, removalCallbacks)

Development

git clone https://github.com/tmont/html-parser.git
cd html-parser
npm link
npm test