fork from https://github.com/mozilla/pdf.js.git

2026-06-08 13:26:57 +02:00
parent be60be5b03
commit 141762041b
2444 changed files with 1179392 additions and 15 deletions
--- a/web/pdf_find_utils.js
+++ b/web/pdf_find_utils.js
@@ -0,0 +1,194 @@
+/* Copyright 2018 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import { FeatureTest } from "pdfjs-lib";
+
+/**
+ * Numeric codes for the character classes returned by `getCharacterType`.
+ *
+ * The object is frozen so that consumers of this exported enum cannot
+ * accidentally reassign, add or delete members.
+ *
+ * @enum {number}
+ */
+const CharacterType = Object.freeze({
+  SPACE: 0,
+  ALPHA_LETTER: 1,
+  PUNCT: 2,
+  HAN_LETTER: 3,
+  KATAKANA_LETTER: 4,
+  HIRAGANA_LETTER: 5,
+  HALFWIDTH_KATAKANA_LETTER: 6,
+  THAI_LETTER: 7,
+});
+
+/**
+ * Tell whether a character code lies below 0x2e80, the threshold used by
+ * `getCharacterType` to pick its "alphabetical script" branch.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode` is strictly less than 0x2e80.
+ */
+function isAlphabeticalScript(charCode) {
+  const upperBoundExclusive = 0x2e80;
+  return charCode < upperBoundExclusive;
+}
+
+/**
+ * Tell whether a character code has none of the bits selected by the
+ * 0xff80 mask set, i.e. only its low seven bits are used within that mask.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode & 0xff80` is zero.
+ */
+function isAscii(charCode) {
+  const maskedHighBits = charCode & 0xff80;
+  return maskedHighBits === 0;
+}
+
+/**
+ * Tell whether a character code is an ASCII letter, upper or lower case.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` for codes in 0x41-0x5a or 0x61-0x7a.
+ */
+function isAsciiAlpha(charCode) {
+  if (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5a) {
+    return true;
+  }
+  return charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7a;
+}
+
+/**
+ * Tell whether a character code is one of the ASCII digits `0` to `9`.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` for codes in 0x30-0x39.
+ */
+function isAsciiDigit(charCode) {
+  const digitZero = 0x30;
+  const digitNine = 0x39;
+  return digitZero <= charCode && charCode <= digitNine;
+}
+
+/**
+ * Tell whether a character code is one of the four ASCII whitespace
+ * characters recognised here: space, tab, carriage return or line feed.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` for 0x20, 0x09, 0x0d or 0x0a.
+ */
+function isAsciiSpace(charCode) {
+  switch (charCode) {
+    case /* SPACE = */ 0x20:
+    case /* TAB = */ 0x09:
+    case /* CR = */ 0x0d:
+    case /* LF = */ 0x0a:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Tell whether a character code falls in one of the two ranges treated as
+ * Han here: 0x3400-0x9fff or 0xf900-0xfaff (both inclusive).
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode` is inside either range.
+ */
+function isHan(charCode) {
+  const inFirstRange = charCode >= 0x3400 && charCode <= 0x9fff;
+  if (inFirstRange) {
+    return true;
+  }
+  return charCode >= 0xf900 && charCode <= 0xfaff;
+}
+
+/**
+ * Tell whether a character code falls in the range treated as Katakana
+ * here: 0x30a0-0x30ff (inclusive).
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode` is inside the range.
+ */
+function isKatakana(charCode) {
+  const rangeStart = 0x30a0;
+  const rangeEnd = 0x30ff;
+  return rangeStart <= charCode && charCode <= rangeEnd;
+}
+
+/**
+ * Tell whether a character code falls in the range treated as Hiragana
+ * here: 0x3040-0x309f (inclusive).
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode` is inside the range.
+ */
+function isHiragana(charCode) {
+  if (charCode >= 0x3040) {
+    return charCode <= 0x309f;
+  }
+  return false;
+}
+
+/**
+ * Tell whether a character code falls in the range treated as halfwidth
+ * Katakana here: 0xff60-0xff9f (inclusive).
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `charCode` is inside the range.
+ */
+function isHalfwidthKatakana(charCode) {
+  const lowest = 0xff60;
+  const highest = 0xff9f;
+  return lowest <= charCode && charCode <= highest;
+}
+
+/**
+ * Tell whether a character code, once masked with 0xff80, equals 0x0e00,
+ * i.e. whether it lies in the 128-code window starting at 0x0e00.
+ *
+ * @param {number} charCode - The character code to test.
+ * @returns {boolean} `true` when `(charCode & 0xff80)` is 0x0e00.
+ */
+function isThai(charCode) {
+  const windowBase = charCode & 0xff80;
+  return windowBase === 0x0e00;
+}
+
+/**
+ * Classify a character code into one of the `CharacterType` values.
+ *
+ * This function is based on the word-break detection implemented in:
+ * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp
+ *
+ * @param {number} charCode - The character code to classify.
+ * @returns {number} The matching `CharacterType` member; anything that no
+ *   specific rule recognises is reported as `CharacterType.ALPHA_LETTER`.
+ */
+function getCharacterType(charCode) {
+  // Codes at or above the "alphabetical script" threshold: only the CJK
+  // related ranges get a dedicated type.
+  if (!isAlphabeticalScript(charCode)) {
+    if (isHan(charCode)) {
+      return CharacterType.HAN_LETTER;
+    }
+    if (isKatakana(charCode)) {
+      return CharacterType.KATAKANA_LETTER;
+    }
+    if (isHiragana(charCode)) {
+      return CharacterType.HIRAGANA_LETTER;
+    }
+    if (isHalfwidthKatakana(charCode)) {
+      return CharacterType.HALFWIDTH_KATAKANA_LETTER;
+    }
+    return CharacterType.ALPHA_LETTER;
+  }
+
+  // Non-ASCII codes below the threshold: Thai and the no-break space are
+  // special-cased, everything else counts as a letter.
+  if (!isAscii(charCode)) {
+    if (isThai(charCode)) {
+      return CharacterType.THAI_LETTER;
+    }
+    return charCode === /* NBSP = */ 0xa0
+      ? CharacterType.SPACE
+      : CharacterType.ALPHA_LETTER;
+  }
+
+  // ASCII: whitespace first, then word characters (letters, digits and the
+  // underscore); whatever remains is punctuation.
+  if (isAsciiSpace(charCode)) {
+    return CharacterType.SPACE;
+  }
+  const isWordCharacter =
+    isAsciiAlpha(charCode) ||
+    isAsciiDigit(charCode) ||
+    charCode === /* UNDERSCORE = */ 0x5f;
+  return isWordCharacter ? CharacterType.ALPHA_LETTER : CharacterType.PUNCT;
+}
+
+// Module-level cache for the string built/returned by `getNormalizeWithNFKC`.
+let NormalizeWithNFKC;
+
+/**
+ * Return a string listing the characters (single characters and `a-b`
+ * ranges) in the 0x0000-0xFFFF span, surrogates excluded, whose NFKC
+ * normalization differs from the character itself and which do not match
+ * `\p{M}`.
+ *
+ * Depending on the `PDFJSDev`/`FeatureTest` checks below, the string is
+ * either taken from a hardcoded literal, generated at runtime, or both (in
+ * which case the two are compared). The result is kept in the module-level
+ * `NormalizeWithNFKC` variable.
+ *
+ * NOTE(review): the `a-b` range syntax suggests the string is meant to be
+ * embedded in a regular-expression character class - confirm against the
+ * callers.
+ *
+ * @returns {string} The characters/ranges string.
+ * @throws {Error} When a hardcoded string is present and differs from the
+ *   freshly generated one; the first difference is logged beforehand.
+ */
+function getNormalizeWithNFKC() {
+  if (
+    (typeof PDFJSDev === "undefined" && FeatureTest.platform.isFirefox) ||
+    (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL"))
+  ) {
+    // The `NormalizeWithNFKC` string is generated with the for loop below.
+    // Because of a small difference between Chrome and Firefox, the string is
+    // only hardcoded for Firefox, and Chrome (or others) will generate it at
+    // runtime.
+    // In order to detect if the string is up to date, a check is performed in
+    // the loop below, and if a difference is detected, an error is thrown.
+    // NOTE(review): as rendered here the literal contains a raw backtick
+    // (right after the Greek range near its middle), which would terminate
+    // the template literal early; the text looks as if it was
+    // Unicode-normalised in transit. Verify this line against the upstream
+    // file before relying on this copy.
+    /* eslint-disable no-irregular-whitespace */
+    NormalizeWithNFKC ||= `\xA0¨ª¯²-µ¸-º¼-¾Ĳ-ĳĿ-ŀŉſǄ-ǌǱ-ǳʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´-῾ - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉-ℓℕ-№ℙ-ℝ℠-™ℤΩℨK-ℭℯ-ℱℳ-ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕　〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰ꟱-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ﬀ-ﬆﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ！-ﾾￂ-ￇￊ-ￏￒ-ￗￚ-ￜ￠-￦`;
+  }
+
+  if (
+    typeof PDFJSDev === "undefined" ||
+    PDFJSDev.test("TESTING") ||
+    (!PDFJSDev.test("MOZCENTRAL") && !NormalizeWithNFKC)
+  ) {
+    const ranges = [];
+    // Current run of consecutive matching codes, as [first, last]; stays
+    // empty until the first match is seen.
+    const range = [];
+    const diacriticsRegex = /^\p{M}$/u;
+    // Some chars must be replaced by their NFKC counterpart during a search.
+    // NOTE(review): a run is only pushed to `ranges` when a later,
+    // non-adjacent match is found, so the last run is never emitted. The
+    // hardcoded string above is compared against this exact output, hence
+    // changing that behaviour requires regenerating the string as well.
+    for (let i = 0; i < 65536; i++) {
+      if (i >= 0xd800 && i <= 0xdfff) {
+        continue; // Skip surrogates since they're not valid Unicode scalar values.
+      }
+      const c = String.fromCharCode(i);
+      if (c.normalize("NFKC") !== c && !diacriticsRegex.test(c)) {
+        if (range.length !== 2) {
+          range[0] = range[1] = i;
+          continue;
+        }
+        if (range[1] + 1 !== i) {
+          if (range[0] === range[1]) {
+            ranges.push(String.fromCharCode(range[0]));
+          } else {
+            ranges.push(
+              `${String.fromCharCode(range[0])}-${String.fromCharCode(
+                range[1]
+              )}`
+            );
+          }
+          range[0] = range[1] = i;
+        } else {
+          range[1] = i;
+        }
+      }
+    }
+
+    const rangesStr = ranges.join("");
+    if (!NormalizeWithNFKC) {
+      NormalizeWithNFKC = rangesStr;
+    } else if (rangesStr !== NormalizeWithNFKC) {
+      // Render the code unit at `index` as U+XXXX, or say that the string
+      // has already ended at that position.
+      const formatCodeUnit = (str, index) =>
+        index < str.length
+          ? `U+${str.charCodeAt(index).toString(16).toUpperCase().padStart(4, "0")}`
+          : "<end of string>";
+      // Scan from index 0 up to the longer of the two lengths, so that a
+      // mismatch in the very first character, or a pure length difference,
+      // is reported as well.
+      const maxLength = Math.max(rangesStr.length, NormalizeWithNFKC.length);
+      for (let i = 0; i < maxLength; i++) {
+        if (rangesStr[i] !== NormalizeWithNFKC[i]) {
+          console.log(
+            `Difference at index ${i}: ` +
+              `${formatCodeUnit(rangesStr, i)} !== ` +
+              `${formatCodeUnit(NormalizeWithNFKC, i)}`
+          );
+          break;
+        }
+      }
+      throw new Error(
+        "getNormalizeWithNFKC - update the `NormalizeWithNFKC` string."
+      );
+    }
+  }
+  return NormalizeWithNFKC;
+}
+
+export { CharacterType, getCharacterType, getNormalizeWithNFKC };