Skip to content

Commit

Permalink
Add ignore_merges option to BPE tokenizers (huggingface#716)
Browse files: browse the repository at this point in the history.
  • Loading branch information
xenova authored Apr 18, 2024
1 parent 6427431 commit 6d5901e
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -630,10 +630,12 @@ class BPE extends TokenizerModel {
* Create a BPE instance.
* @param {Object} config The configuration object for BPE.
* @param {Object} config.vocab A mapping of tokens to ids.
* @param {string[]} config.merges An array of BPE merges as strings.
* @param {string} config.unk_token The unknown token used for out of vocabulary words.
* @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
* @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
* @param {Array} config.merges An array of BPE merges as strings.
* @param {boolean} [config.byte_fallback=false] Whether to use the spm byte-fallback trick (defaults to `false`).
* @param {boolean} [config.ignore_merges=false] Whether or not to match tokens with the vocab before using merges.
*/
constructor(config) {
super(config);
Expand Down Expand Up @@ -665,6 +667,8 @@ class BPE extends TokenizerModel {
this.text_encoder = new TextEncoder();
}

this.ignore_merges = this.config.ignore_merges ?? false;

/** @type {Map<string, string[]>} */
this.cache = new Map();
}
Expand Down Expand Up @@ -826,6 +830,10 @@ class BPE extends TokenizerModel {
const outputTokens = [];

for (const token of tokens) {
if (this.ignore_merges && this.tokens_to_ids.has(token)) {
outputTokens.push(token);
continue;
}
const bpe_token_list = this.bpe(token);

for (const t of bpe_token_list) {
Expand Down

0 comments on commit 6d5901e

Please sign in to comment.