@jerp/xml-stream-js

v1.1.8

Published

4 years ago

XML Stream parser (native js - no dependency)

0High
0Medium
0Low

jerp

xml-stream-js

A simple js parser for XML for nodejs and the browser.

It smoothly handles line breaks, spaces, chunks (from a stream), namespaces...

It is fast and optimized for Buffer or Uint8Array of encoded strings (utf8) but will take strings too.

Using the parser

Parses some (or all) XML elements. Uses the tokenizer below but facilitates state management (in between 2 writes).

Predefined parsers

To get started.

Example 1

const aString = '<a id="a0"><aa/>some text a0<ab><abChild/></ab>more text</a>'
const docParser = new DocumentParser(new Tokenizer())
// parsing the whole document (this might not be the best use case for this library)
docParser.onRoot(XmlElementParser()) // parser preserving child node order
docParser.write(aString)
const a = docParser.next() // XmlElement
a.toString() // produces back this xml string
a.getAttribute('id') === 'a0'

Example 2

const a0String = '<a id="a0"><aa id="aa0"/><aa id="aa1"/>some text a0<ab><abChild/></ab> &amp; more text</a>'
const a1String = '<a id="a1"><aa/>some text a1</a>'
const b0String = '<b id="b0">some text b0</b>'
const docParser = new DocumentParser(new Tokenizer())
docParser.on('root/a', XmlToObject())
docParser.on('root/b', XmlElementParser())
docParser.write(`<root>${a0String}${a1String}${b0String}</root>`)
const a0 = docParser.next() // object
const a1 = docParser.next() // object
const b = docParser.next() // XmlElement
const u = docParser.next() // undefined
a0.id === 'a0'
a0.aa[0].id === 'aa0'

Custom parsers

const xmlStr = '<root><a title="..."><aa id="aa00" label="item aa00"/><aa id="aa01" label="item aa01"/></a><b>...</b></root>'
const docParser = new DocumentParser(new Tokenizer())
docParser.on('root/a', {
  onStart(startTag) {
    // object representing `a`
    return {
      title: startTag.getAttribute('title'),
      items: [],
    }
  },
  onText(text, a) {
    if (!a.firstText) a.firstText = text.textContent.trim()
  },
  onEnd: (a) => a, // this is the object returned by `docParser.next()`
  onChild(startTag) {
    switch (startTag.tagName) {
      case 'aa': {
        // returning a new parser for 'aa'
        return {
          onStart(startTag, parentCtx) {
            parentCtx.items.push({
              name: startTag.getAttribute('id'),
              label: startTag.getAttribute('label'),
              type: 'aa',
            })
            return false // skipping child nodes of `aa`
          },
        }
      }
      // only interested in `aa` children of `a`
      default:
        return false
    }
  },
})
docParser.write(xmlStr)
console.log(docParser.next())
{
  title: '...',
  firstText: 'some text a0',
  items: [
    { type: 'aa', name: 'aa00', label: 'item aa00' },
    { type: 'aa', name: 'aa01', label: 'item aa01' },
  ],
}

Using the tokenizer

import { Tokenizer } from '@jerp/xml-stream-js'
const tokenizer = new Tokenizer()
let token // any of StartTag | EndTag | Text | CDATA | undefined (undefined meaning end-of-chunk)
const tokens = [] // collected tokens
try {
  // write the first chunk of the xml string
  tokenizer.write('<a><b b1="value b1"><c/>some<d>inn')
  while ((token = tokenizer.nextToken())) {
    tokens.push(token)
  }
  // write the last chunk of the xml string
  tokenizer.write('er</d>text</b></a>')
  while ((token = tokenizer.nextToken())) {
    tokens.push(token)
  }
} catch (e) {
  // will not happen in this case, but will if xml string is corrupted
}
tokens[0].tagName === 'a'
tokens[1].getAttribute('b1') === 'value b1'
tokens.join('') === '<a><b b1="value b1"><c/>some<d>inner</d>text</b></a>'
tokenizer.exhausted === true // the whole string has been consumed
tokens.join('') // === '<a><b b1="value b1"><c/>some<d>inner</d>text</b></a>'

Published

Vulnerabilities

Links

Maintainers

Keywords

Readme