TokenizeThis

Quickstart

It turns a string into tokens. (Throughout the examples below, equals stands in for a deep-equality assertion such as Node's assert.deepEqual.)

var tokenizer = new TokenizeThis();
var str = 'Tokenize this!';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Tokenize', 'this', '!']);

By default, it can tokenize math-based strings.

var tokenizer = new TokenizeThis();
var str = '5 + 6 -(4/2) + gcd(10, 5)';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});

equals(tokens, [5, '+', 6, '-', '(', 4, '/', 2, ')', '+', 'gcd', '(', 10, ',', 5, ')']);

...Or SQL.

var tokenizer = new TokenizeThis();
var str = 'SELECT COUNT(id), 5+6 FROM `users` WHERE name = "shaun persad" AND hobby IS NULL';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    
    if (surroundedBy) {
        tokens.push(surroundedBy+token+surroundedBy);
    } else {
        tokens.push(token);
    }
});
equals(tokens, [
    'SELECT',
    'COUNT', '(', 'id', ')',
    ',',
    5, '+', 6,
    'FROM', '`users`',
    'WHERE',
    'name', '=', '"shaun persad"',
    'AND',
    'hobby', 'IS', null
]);

Installation

npm install tokenize-this

// or if in the browser: <script src="tokenize-this/tokenize-this.min.js"></script>

Usage

require it, create a new instance, then call tokenize.

// var TokenizeThis = require('tokenize-this');
// OR
// var TokenizeThis = require('tokenize-this/tokenize-this.min.js'); // for node.js < 4.0
// OR
// <script src="tokenize-this/tokenize-this.min.js"></script> <!-- if in browser -->

var tokenizer = new TokenizeThis();

var str = 'Hi!, I want to add 5+6';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Hi', '!', ',', 'I', 'want', 'to', 'add', 5, '+', 6]);

Advanced Usage

Supplying a config object to the constructor

See the .defaultConfig section below for all of the available options.

This can be used to tokenize many forms of data, like JSON into key-value pairs.

var jsonConfig = {
    shouldTokenize: ['{', '}', '[', ']'],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t", ':', ','],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(jsonConfig);
var str = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['[', '{', 'name', 'Shaun Persad', 'id', 5, '}', '{', 'gender', null, '}', ']']);
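
To turn that flat token stream into actual key-value pairs, you still have to group the tokens yourself. Here's a rough follow-up sketch (not part of tokenize-this) that assumes the simple alternating key/value shape produced by the example above:

var objects = [];
var current = null;
var pendingKey = null;

tokens.forEach(function(token) {
    if (token === '[' || token === ']') {
        return; // ignore the surrounding array markers
    }
    if (token === '{') {
        current = {};
        return;
    }
    if (token === '}') {
        objects.push(current);
        current = null;
        return;
    }
    if (pendingKey === null) {
        pendingKey = token;          // first token of a pair is the key
    } else {
        current[pendingKey] = token; // second token is the value
        pendingKey = null;
    }
});

equals(objects, [
    { name: 'Shaun Persad', id: 5 },
    { gender: null }
]);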

Here it is tokenizing XML like a boss.

var xmlConfig = {
    shouldTokenize: ['<?', '?>', '<!', '<', '</', '>', '/>', '='],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(xmlConfig);
var str = `
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
   <product description="Cardigan Sweater" product_image="cardigan.jpg">
      <size description="Large" />
      <color_swatch image="red_cardigan.jpg">
        Red
      </color_swatch>
   </product>
</catalog>                
`;
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens,
    [
        '<?', 'xml-stylesheet', 'href', '=', 'catalog.xsl', 'type', '=', 'text/xsl', '?>',
        '<!', 'DOCTYPE', 'catalog', 'SYSTEM', 'catalog.dtd', '>',
        '<', 'catalog', '>',
        '<', 'product', 'description', '=', 'Cardigan Sweater', 'product_image', '=', 'cardigan.jpg', '>',
        '<', 'size', 'description', '=', 'Large', '/>',
        '<', 'color_swatch', 'image', '=', 'red_cardigan.jpg', '>',
        'Red',
        '</', 'color_swatch', '>',
        '</', 'product', '>',
        '</', 'catalog', '>'
    ]
);

The above examples are the first steps in writing parsers for those formats. The next would be parsing the stream of tokens based on the format-specific rules, e.g. SQL.
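
As a tiny illustration of that next step (just a sketch, not something the library provides), here's one way you might read the WHERE clause back out of the SQL token array produced earlier, turning it into simple field/operator/value conditions. It only handles the flat pattern from that example:

// sqlTokens is the token array from the SQL example above
function parseWhere(sqlTokens) {
    var conditions = [];
    var i = sqlTokens.indexOf('WHERE') + 1;

    while (i < sqlTokens.length) {
        conditions.push({
            field: sqlTokens[i],
            operator: sqlTokens[i + 1],
            value: sqlTokens[i + 2]
        });
        i += 3;
        if (sqlTokens[i] === 'AND' || sqlTokens[i] === 'OR') {
            i += 1; // skip the connective and read the next condition
        }
    }
    return conditions;
}

equals(parseWhere(sqlTokens), [
    { field: 'name', operator: '=', value: '"shaun persad"' },
    { field: 'hobby', operator: 'IS', value: null }
]);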

API

Methods

#tokenize(str:String, forEachToken:Function)

Sends each token to the forEachToken(token:String, surroundedBy:String, index:Integer) callback.

var tokenizer = new TokenizeThis();
var str = 'Tokenize "this"!';

var tokens = [];
var indices = [];
var forEachToken = function(token, surroundedBy, index) {

    tokens.push(surroundedBy+token+surroundedBy);
    indices.push(index);
};

tokenizer.tokenize(str, forEachToken);

equals(tokens, ['Tokenize', '"this"', '!']);
equals(indices, [8, 14, 15]);

By default, it converts true, false, null, and numbers into their literal versions.

var tokenizer = new TokenizeThis();
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, [true, false, null, true, false, null, 1, 2, 3.4, 5.6789]);

.defaultConfig:Object

The default config object used when no config is supplied.

var config = {
    shouldTokenize: ['(', ')', ',', '*', '/', '%', '+', '-', '=', '!=', '!', '<', '>', '<=', '>=', '^'],
    shouldMatch: ['"', "'", '`'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true,
    escapeCharacter: "\\"
};
equals(TokenizeThis.defaultConfig, config);
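
The examples below pass only the options they want to change, which suggests that any unspecified options fall back to these defaults. Having defaultConfig exposed is also handy when you want to extend one of the default lists rather than replace it wholesale. A possible sketch (not from the README itself):

// Hypothetical example: keep all the defaults, but also tokenize ';'.
// Assumes Object.assign is available (any modern runtime).
var myConfig = Object.assign({}, TokenizeThis.defaultConfig, {
    shouldTokenize: TokenizeThis.defaultConfig.shouldTokenize.concat([';'])
});
var tokenizer = new TokenizeThis(myConfig);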

You can turn this literal conversion off with the convertLiterals config option.

var config = {
    convertLiterals: false
};
var tokenizer = new TokenizeThis(config);
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['true', 'false', 'null', 'TRUE', 'FALSE', 'NULL', '1', '2', '3.4', '5.6789']);

Any strings surrounded by the quotes specified in the shouldMatch option are treated as whole tokens.

var config = {
    shouldMatch: ['"', '`', '#']
};
var tokenizer = new TokenizeThis(config);
var str = '"hi there" `this is a test` #of quotes#';
var tokens = [];
var tokensQuoted = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
    tokensQuoted.push(surroundedBy+token+surroundedBy);
});
equals(tokens, ['hi there', 'this is a test', 'of quotes']);
equals(tokensQuoted, ['"hi there"', '`this is a test`', '#of quotes#']);

Quotes can be escaped via a backslash.

var tokenizer = new TokenizeThis();
var str = 'These are "\\"quotes\\""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});

equals(tokens, ['These', 'are', '"quotes"']);

The escape character can be specified with the escapeCharacter option.

var config = {
    escapeCharacter: '#'
};
var tokenizer = new TokenizeThis(config);
var str = 'These are "#"quotes#""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['These', 'are', '"quotes"']);