From 2e41568313620ab564425be7e5d777581e2db5d4 Mon Sep 17 00:00:00 2001
From: Brian Vaughn <bvaughn@fb.com>
Date: Mon, 27 Sep 2021 13:34:39 -0400
Subject: [PATCH] =?UTF-8?q?DevTools=20encoding=20supports=20multibyte=20ch?=
 =?UTF-8?q?aracters=20(e.g.=20"=F0=9F=9F=A9")=20(#22424)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes our text encoding approach to properly support multibyte characters following this algorithm. Based on benchmarking, this new approach is roughly equivalent in terms of performance (sometimes slightly faster, sometimes slightly slower).

I also considered using TextEncoder/TextDecoder for this, but it was much slower (~85%).
---
 .../src/__tests__/store-test.js               | 13 ++++
 .../src/backend/renderer.js                   | 59 +++++++++++++------
 packages/react-devtools-shared/src/utils.js   | 26 +++++++-
 3 files changed, 76 insertions(+), 22 deletions(-)
diff --git a/packages/react-devtools-shared/src/__tests__/store-test.js b/packages/react-devtools-shared/src/__tests__/store-test.js
index 02cea3d925..53a531d51b 100644
--- a/packages/react-devtools-shared/src/__tests__/store-test.js
+++ b/packages/react-devtools-shared/src/__tests__/store-test.js
@@ -101,6 +101,19 @@ describe('Store', () => {
     `);
   });
 
+  it('should handle multibyte character strings', () => {
+    const Component = () => null;
+    Component.displayName = '🟩💜🔵';
+
+    const container = document.createElement('div');
+
+    act(() => legacyRender(<Component />, container));
+    expect(store).toMatchInlineSnapshot(`
+      [root]
+          <🟩💜🔵>
+    `);
+  });
+
   describe('collapseNodesByDefault:false', () => {
     beforeEach(() => {
       store.collapseNodesByDefault = false;
diff --git a/packages/react-devtools-shared/src/backend/renderer.js b/packages/react-devtools-shared/src/backend/renderer.js
index 24a00583e7..a2469577a0 100644
--- a/packages/react-devtools-shared/src/backend/renderer.js
+++ b/packages/react-devtools-shared/src/backend/renderer.js
@@ -1514,11 +1514,16 @@ export function attach(
 
   type OperationsArray = Array<number>;
 
+  type StringTableEntry = {|
+    encodedString: Array<number>,
+    id: number,
+  |};
+
   const pendingOperations: OperationsArray = [];
   const pendingRealUnmountedIDs: Array<number> = [];
   const pendingSimulatedUnmountedIDs: Array<number> = [];
   let pendingOperationsQueue: Array<OperationsArray> | null = [];
-  const pendingStringTable: Map<string, number> = new Map();
+  const pendingStringTable: Map<string, StringTableEntry> = new Map();
   let pendingStringTableLength: number = 0;
   let pendingUnmountedRootID: number | null = null;
 
@@ -1736,13 +1741,19 @@ export function attach(
     // Now fill in the string table.
     // [stringTableLength, str1Length, ...str1, str2Length, ...str2, ...]
     operations[i++] = pendingStringTableLength;
-    pendingStringTable.forEach((value, key) => {
-      operations[i++] = key.length;
-      const encodedKey = utfEncodeString(key);
-      for (let j = 0; j < encodedKey.length; j++) {
-        operations[i + j] = encodedKey[j];
+    pendingStringTable.forEach((entry, stringKey) => {
+      const encodedString = entry.encodedString;
+
+      // Don't use the string length.
+      // It won't work for multibyte characters (like emoji).
+      const length = encodedString.length;
+
+      operations[i++] = length;
+      for (let j = 0; j < length; j++) {
+        operations[i + j] = encodedString[j];
       }
-      i += key.length;
+
+      i += length;
     });
 
     if (numUnmountIDs > 0) {
@@ -1789,21 +1800,31 @@ export function attach(
     pendingStringTableLength = 0;
   }
 
-  function getStringID(str: string | null): number {
-    if (str === null) {
+  function getStringID(string: string | null): number {
+    if (string === null) {
       return 0;
     }
-    const existingID = pendingStringTable.get(str);
-    if (existingID !== undefined) {
-      return existingID;
+    const existingEntry = pendingStringTable.get(string);
+    if (existingEntry !== undefined) {
+      return existingEntry.id;
     }
-    const stringID = pendingStringTable.size + 1;
-    pendingStringTable.set(str, stringID);
-    // The string table total length needs to account
-    // both for the string length, and for the array item
-    // that contains the length itself. Hence + 1.
-    pendingStringTableLength += str.length + 1;
-    return stringID;
+
+    const id = pendingStringTable.size + 1;
+    const encodedString = utfEncodeString(string);
+
+    pendingStringTable.set(string, {
+      encodedString,
+      id,
+    });
+
+    // The string table total length needs to account both for the string length,
+    // and for the array item that contains the length itself.
+    //
+    // Don't use string length for this table.
+    // It won't work for multibyte characters (like emoji).
+    pendingStringTableLength += encodedString.length + 1;
+
+    return id;
   }
 
   function recordMount(fiber: Fiber, parentFiber: Fiber | null) {
diff --git a/packages/react-devtools-shared/src/utils.js b/packages/react-devtools-shared/src/utils.js
index 9ec19e3afd..771c18688a 100644
--- a/packages/react-devtools-shared/src/utils.js
+++ b/packages/react-devtools-shared/src/utils.js
@@ -138,17 +138,37 @@ export function utfDecodeString(array: Array<number>): string {
   return string;
 }
 
+function surrogatePairToCodePoint(
+  charCode1: number,
+  charCode2: number,
+): number {
+  return ((charCode1 & 0x3ff) << 10) + (charCode2 & 0x3ff) + 0x10000;
+}
+
+// Credit for this encoding approach goes to Tim Down:
+// https://stackoverflow.com/questions/4877326/how-can-i-tell-if-a-string-contains-multibyte-characters-in-javascript
 export function utfEncodeString(string: string): Array<number> {
   const cached = encodedStringCache.get(string);
   if (cached !== undefined) {
     return cached;
   }
 
-  const encoded = new Array(string.length);
-  for (let i = 0; i < string.length; i++) {
-    encoded[i] = string.codePointAt(i);
+  const encoded = [];
+  let i = 0;
+  let charCode;
+  while (i < string.length) {
+    charCode = string.charCodeAt(i);
+    // Handle multibyte unicode characters (like emoji).
+    if ((charCode & 0xf800) === 0xd800) {
+      encoded.push(surrogatePairToCodePoint(charCode, string.charCodeAt(++i)));
+    } else {
+      encoded.push(charCode);
+    }
+    ++i;
   }
+
   encodedStringCache.set(string, encoded);
+
   return encoded;
 }