Add NFD normalizer #1211

Merged: 2 commits (Feb 26, 2025)
Changes from all commits
70 changes: 42 additions & 28 deletions src/tokenizers.js
@@ -995,6 +995,8 @@ class Normalizer extends Callable {
                 return new Replace(config);
             case 'NFC':
                 return new NFC(config);
+            case 'NFD':
+                return new NFD(config);
             case 'NFKC':
                 return new NFKC(config);
             case 'NFKD':
@@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
 }
 
 /**
- * A normalizer that applies Unicode normalization form C (NFC) to the input text.
+ * A normalizer that applies Unicode normalization to the input text.
  * @extends Normalizer
+ * @abstract
  */
-class NFC extends Normalizer {
+class UnicodeNormalizer extends Normalizer {
+    /**
+     * @type {string} The Unicode normalization form to apply.
+     * Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'.
+     */
+    form = undefined;
+
     /**
-     * Normalize the input text by applying Unicode normalization form C (NFC).
+     * Normalize the input text by applying Unicode normalization.
      * @param {string} text The input text to be normalized.
      * @returns {string} The normalized text.
     */
     normalize(text) {
-        text = text.normalize('NFC')
+        text = text.normalize(this.form)
         return text;
     }
 }
 
 /**
- * NFKC Normalizer.
- * @extends Normalizer
+ * A normalizer that applies Unicode normalization form C (NFC) to the input text.
+ * Canonical Decomposition, followed by Canonical Composition.
+ * @extends UnicodeNormalizer
  */
-class NFKC extends Normalizer {
-    /**
-     * Normalize text using NFKC normalization.
-     * @param {string} text The text to be normalized.
-     * @returns {string} The normalized text.
-     */
-    normalize(text) {
-        text = text.normalize('NFKC')
-        return text;
-    }
+class NFC extends UnicodeNormalizer {
+    form = 'NFC';
 }
 
 /**
- * NFKD Normalizer.
- * @extends Normalizer
+ * A normalizer that applies Unicode normalization form D (NFD) to the input text.
+ * Canonical Decomposition.
+ * @extends UnicodeNormalizer
  */
-class NFKD extends Normalizer {
-    /**
-     * Normalize text using NFKD normalization.
-     * @param {string} text The text to be normalized.
-     * @returns {string} The normalized text.
-     */
-    normalize(text) {
-        text = text.normalize('NFKD')
-        return text;
-    }
+class NFD extends UnicodeNormalizer {
+    form = 'NFD';
 }
 
+/**
+ * A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
+ * Compatibility Decomposition, followed by Canonical Composition.
+ * @extends UnicodeNormalizer
+ */
+class NFKC extends UnicodeNormalizer {
+    form = 'NFKC';
+}
+
+/**
+ * A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
+ * Compatibility Decomposition.
+ * @extends UnicodeNormalizer
+ */
+class NFKD extends UnicodeNormalizer {
+    form = 'NFKD';
+}
+
 /**
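For context, the refactor collapses the four per-form normalizer classes into subclasses that only set a `form` field, with the shared `normalize()` delegating to the built-in `String.prototype.normalize()`. The standalone sketch below illustrates the resulting behaviour; it mirrors the names in the diff but is not taken verbatim from the library:

// Minimal sketch of the refactored hierarchy (illustrative, not library code).
class UnicodeNormalizer {
    // One of 'NFC', 'NFD', 'NFKC', 'NFKD'.
    form = undefined;

    normalize(text) {
        // Delegates to the built-in String.prototype.normalize().
        return text.normalize(this.form);
    }
}

class NFD extends UnicodeNormalizer {
    form = 'NFD';
}

const nfd = new NFD();
// "Amélie" with a precomposed é (6 code points) decomposes to 7 under NFD,
// since é becomes "e" followed by U+0301 COMBINING ACUTE ACCENT.
console.log([...nfd.normalize("Am\u00e9lie")].length); // 7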
29 changes: 28 additions & 1 deletion tests/models/bert/test_tokenization_bert.js
@@ -1,5 +1,5 @@
 import { BertTokenizer } from "../../../src/tokenizers.js";
-import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
+import { BASE_TEST_STRINGS, BERT_TEST_STRINGS, NORMALIZATION_TEST_STRINGS } from "../test_strings.js";
 
 export const TOKENIZER_CLASS = BertTokenizer;
 export const TEST_CONFIG = {
@@ -1341,4 +1341,31 @@ export const TEST_CONFIG = {
       decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
     },
   },
+  // NFD normalizer
+  "onnx-community/language_detection-ONNX": {
+    DEFAULT_EXAMPLE: {
+      text: NORMALIZATION_TEST_STRINGS.DEFAULT_EXAMPLE,
+      tokens: ["ame", "##lie", "|", "ame", "##lie"],
+      ids: [1, 21947, 31933, 70, 21947, 31933, 2],
+      decoded: "[CLS] amelie | amelie [SEP]",
+    },
+    CANONICAL_EQUIVALENCE_NORMALIZATION: {
+      text: NORMALIZATION_TEST_STRINGS.CANONICAL_EQUIVALENCE_NORMALIZATION,
+      tokens: ["n", "|", "n"],
+      ids: [1, 56, 70, 56, 2],
+      decoded: "[CLS] n | n [SEP]",
+    },
+    COMPATIBILITY_NORMALIZATION: {
+      text: NORMALIZATION_TEST_STRINGS.COMPATIBILITY_NORMALIZATION,
+      tokens: ["[UNK]", "|", "ff"],
+      ids: [1, 0, 70, 40133, 2],
+      decoded: "[CLS] [UNK] | ff [SEP]",
+    },
+    COMBINED_EXAMPLE: {
+      text: NORMALIZATION_TEST_STRINGS.COMBINED_EXAMPLE,
+      tokens: ["ſ", "|", "ſ", "|", "ſ", "|", "s", "|", "s"],
+      ids: [1, 121, 70, 121, 70, 121, 70, 61, 70, 61, 2],
+      decoded: "[CLS] ſ | ſ | ſ | s | s [SEP]",
+    },
+  },
 };
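If one wanted to spot-check the new DEFAULT_EXAMPLE case outside the test harness, something along these lines should work, assuming the `@huggingface/transformers` package and its `AutoTokenizer.from_pretrained` / `tokenize` APIs; treat it as a sketch rather than part of the PR:

// Hypothetical spot-check of the DEFAULT_EXAMPLE case (not part of the test suite).
import { AutoTokenizer } from "@huggingface/transformers";

const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/language_detection-ONNX");

// Precomposed "Amélie" vs. decomposed "Ame\u0301lie"; the NFD normalizer should
// make both spellings tokenize identically.
const text = "\u0041\u006d\u00e9\u006c\u0069\u0065 | \u0041\u006d\u0065\u0301\u006c\u0069\u0065";
console.log(tokenizer.tokenize(text));
// Expected, per the config above: ["ame", "##lie", "|", "ame", "##lie"]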
10 changes: 10 additions & 0 deletions tests/models/test_strings.js
@@ -113,3 +113,13 @@ export const M2M_100_TEST_STRINGS = {
   HIDNI_TEXT: "जीवन एक चॉकलेट बॉक्स की तरह है।",
   CHINESE_TEXT: "生活就像一盒巧克力。",
 };
+
+// Test strings adapted from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
+export const NORMALIZATION_TEST_STRINGS = {
+  DEFAULT_EXAMPLE: "\u0041\u006d\u00e9\u006c\u0069\u0065 | \u0041\u006d\u0065\u0301\u006c\u0069\u0065",
+  CANONICAL_EQUIVALENCE_NORMALIZATION: "\u00F1 | \u006E\u0303",
+  COMPATIBILITY_NORMALIZATION: "\uFB00 | \u0066\u0066",
+
+  // Original | NFC | NFD | NFKC | NFKD
+  COMBINED_EXAMPLE: "\u1E9B\u0323 | \u1E9B\u0323 | \u017F\u0323\u0307 | \u1E69 | \u0073\u0323\u0307",
+};
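The "Original | NFC | NFD | NFKC | NFKD" comment on COMBINED_EXAMPLE can be verified directly with the built-in normalizer; a quick sanity check (illustrative only):

// Quick sanity check of the "Original | NFC | NFD | NFKC | NFKD" comment above.
const original = "\u1E9B\u0323"; // long s with dot above + combining dot below
for (const form of ["NFC", "NFD", "NFKC", "NFKD"]) {
    const codes = [...original.normalize(form)]
        .map((c) => c.codePointAt(0).toString(16).padStart(4, "0"));
    console.log(form, codes.join(" "));
}
// NFC  1e9b 0323
// NFD  017f 0323 0307
// NFKC 1e69
// NFKD 0073 0323 0307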