fork from https://github.com/mozilla/pdf.js.git
This commit is contained in:
194
web/pdf_find_utils.js
Normal file
194
web/pdf_find_utils.js
Normal file
@@ -0,0 +1,194 @@
|
||||
/* Copyright 2018 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
||||
|
||||
import { FeatureTest } from "pdfjs-lib";
|
||||
|
||||
/**
 * Character categories returned by `getCharacterType`.
 *
 * The numeric values are part of the module's exported surface, so they are
 * kept exactly as they were.
 */
const CharacterType = {
  SPACE: 0,
  ALPHA_LETTER: 1,
  PUNCT: 2,
  HAN_LETTER: 3,
  KATAKANA_LETTER: 4,
  HIRAGANA_LETTER: 5,
  HALFWIDTH_KATAKANA_LETTER: 6,
  THAI_LETTER: 7,
};
|
||||
|
||||
/**
 * Tells whether a code unit lies strictly below 0x2e80, the threshold
 * `getCharacterType` uses to separate its first (ASCII / Thai / NBSP)
 * branch from the Han / Kana checks.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `charCode < 0x2e80`.
 */
function isAlphabeticalScript(charCode) {
  return charCode < 0x2e80;
}
|
||||
|
||||
/**
 * Tells whether a 16-bit code unit is in the ASCII range (0x00-0x7f).
 *
 * The mask 0xff80 selects bits 7 through 15, so the result is `true` exactly
 * when none of those bits is set.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when bits 7-15 of `charCode` are all zero.
 */
function isAscii(charCode) {
  return (charCode & 0xff80) === 0;
}
|
||||
|
||||
/**
 * Tells whether a character code is an ASCII letter (`a`-`z` or `A`-`Z`).
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` for 0x61-0x7a and 0x41-0x5a, inclusive.
 */
function isAsciiAlpha(charCode) {
  return (
    (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7a) ||
    (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5a)
  );
}
|
||||
|
||||
/**
 * Tells whether a character code is an ASCII decimal digit (`0`-`9`).
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` for 0x30-0x39, inclusive.
 */
function isAsciiDigit(charCode) {
  return charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39;
}
|
||||
|
||||
/**
 * Tells whether a character code is one of the four ASCII whitespace
 * characters recognised here: space, tab, carriage return or line feed.
 *
 * Other whitespace (e.g. vertical tab, form feed, NBSP) is not matched.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` for 0x20, 0x09, 0x0d and 0x0a only.
 */
function isAsciiSpace(charCode) {
  return (
    charCode === /* SPACE = */ 0x20 ||
    charCode === /* TAB = */ 0x09 ||
    charCode === /* CR = */ 0x0d ||
    charCode === /* LF = */ 0x0a
  );
}
|
||||
|
||||
/**
 * Tells whether a character code falls in one of the two ranges treated as
 * Han here: 0x3400-0x9fff or 0xf900-0xfaff (both inclusive).
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `charCode` is inside either range.
 */
function isHan(charCode) {
  return (
    (charCode >= 0x3400 && charCode <= 0x9fff) ||
    (charCode >= 0xf900 && charCode <= 0xfaff)
  );
}
|
||||
|
||||
/**
 * Tells whether a character code is in the range 0x30a0-0x30ff (inclusive),
 * which this module treats as Katakana.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isKatakana(charCode) {
  return charCode >= 0x30a0 && charCode <= 0x30ff;
}
|
||||
|
||||
/**
 * Tells whether a character code is in the range 0x3040-0x309f (inclusive),
 * which this module treats as Hiragana.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isHiragana(charCode) {
  return charCode >= 0x3040 && charCode <= 0x309f;
}
|
||||
|
||||
/**
 * Tells whether a character code is in the range 0xff60-0xff9f (inclusive),
 * which this module treats as halfwidth Katakana.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isHalfwidthKatakana(charCode) {
  return charCode >= 0xff60 && charCode <= 0xff9f;
}
|
||||
|
||||
/**
 * Tells whether a 16-bit code unit is in the range 0x0e00-0x0e7f, which
 * this module treats as Thai.
 *
 * Masking with 0xff80 discards the low seven bits, so the comparison holds
 * for every value whose bits 7-15 equal those of 0x0e00.
 *
 * @param {number} charCode - Numeric character code to test.
 * @returns {boolean} `true` when `(charCode & 0xff80) === 0x0e00`.
 */
function isThai(charCode) {
  return (charCode & 0xff80) === 0x0e00;
}
|
||||
|
||||
/**
 * Classifies a character code into one of the `CharacterType` categories.
 *
 * This function is based on the word-break detection implemented in:
 * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp
 *
 * Decision order:
 *  - Codes below 0x2e80: ASCII is split into SPACE, ALPHA_LETTER (letters,
 *    digits and underscore) and PUNCT (everything else); outside ASCII,
 *    Thai codes are THAI_LETTER, NBSP is SPACE and the rest is ALPHA_LETTER.
 *  - Codes from 0x2e80 upwards: Han, Katakana, Hiragana and halfwidth
 *    Katakana get their own category; anything else is ALPHA_LETTER.
 *
 * @param {number} charCode - Numeric character code to classify.
 * @returns {number} One of the `CharacterType` values.
 */
function getCharacterType(charCode) {
  if (isAlphabeticalScript(charCode)) {
    if (isAscii(charCode)) {
      if (isAsciiSpace(charCode)) {
        return CharacterType.SPACE;
      } else if (
        isAsciiAlpha(charCode) ||
        isAsciiDigit(charCode) ||
        charCode === /* UNDERSCORE = */ 0x5f
      ) {
        return CharacterType.ALPHA_LETTER;
      }
      // Any remaining ASCII code (symbols, control characters, ...).
      return CharacterType.PUNCT;
    } else if (isThai(charCode)) {
      return CharacterType.THAI_LETTER;
    } else if (charCode === /* NBSP = */ 0xa0) {
      return CharacterType.SPACE;
    }
    return CharacterType.ALPHA_LETTER;
  }

  if (isHan(charCode)) {
    return CharacterType.HAN_LETTER;
  } else if (isKatakana(charCode)) {
    return CharacterType.KATAKANA_LETTER;
  } else if (isHiragana(charCode)) {
    return CharacterType.HIRAGANA_LETTER;
  } else if (isHalfwidthKatakana(charCode)) {
    return CharacterType.HALFWIDTH_KATAKANA_LETTER;
  }
  // Fallback for every code at or above 0x2e80 not matched above.
  return CharacterType.ALPHA_LETTER;
}
|
||||
|
||||
/** Memoized result of `getNormalizeWithNFKC`; `undefined` until first use. */
let NormalizeWithNFKC;

/**
 * Returns a string listing every BMP character that must be replaced by its
 * NFKC counterpart during a search.
 *
 * The string is formatted like the inside of a regular-expression character
 * class: isolated characters are emitted as-is and runs of consecutive code
 * units are emitted as `first-last`. A character is listed when its NFKC
 * normalization differs from itself and it is not a combining mark
 * (`\p{M}`). Surrogate code units are skipped because they are not valid
 * Unicode scalar values.
 *
 * The table is always derived at runtime and cached after the first call.
 * A hard-coded copy of the table is deliberately not kept in the source:
 * such a literal consists solely of characters that change under NFKC
 * (one of them normalizes to a backtick), so any text normalization applied
 * to the file silently corrupts it and can even break the template literal.
 *
 * @returns {string} The concatenated characters and ranges.
 */
function getNormalizeWithNFKC() {
  if (NormalizeWithNFKC !== undefined) {
    return NormalizeWithNFKC;
  }

  const diacriticsRegex = /^\p{M}$/u;
  const ranges = [];
  // Bounds (inclusive) of the run currently being collected; -1 = no run.
  let first = -1;
  let last = -1;

  const flush = () => {
    if (first < 0) {
      return;
    }
    const head = String.fromCharCode(first);
    ranges.push(
      first === last ? head : `${head}-${String.fromCharCode(last)}`
    );
  };

  for (let i = 0; i < 65536; i++) {
    if (i >= 0xd800 && i <= 0xdfff) {
      continue; // Skip surrogates since they're not valid Unicode scalar values.
    }
    const c = String.fromCharCode(i);
    if (c.normalize("NFKC") === c || diacriticsRegex.test(c)) {
      continue;
    }
    if (first >= 0 && last + 1 === i) {
      last = i; // Extends the current run.
      continue;
    }
    flush();
    first = last = i;
  }
  // The run still pending when the loop ends must be emitted as well,
  // otherwise the highest matching characters would be missing.
  flush();

  NormalizeWithNFKC = ranges.join("");
  return NormalizeWithNFKC;
}
|
||||
|
||||
export { CharacterType, getCharacterType, getNormalizeWithNFKC };
|
||||
Reference in New Issue
Block a user