<p>Missing character identifier: <input type="text" id="identifierElm" value="?" required pattern="[ -~]" />(← This represents one missing byte)</p>
<p lang="ja">
  漢字:
  <input type="checkbox" name="jis1cb" id="jis1cb" checked />
  <label for="jis1cb">JIS 第一水準漢字</label>
  <input type="checkbox" name="jis2cb" id="jis2cb" checked />
  <label for="jis2cb">JIS 第二水準漢字</label>
  <input type="checkbox" name="jis3cb" id="jis3cb" checked />
  <label for="jis3cb">JIS 第三水準漢字</label>
  <input type="checkbox" name="jis4cb" id="jis4cb" checked />
  <label for="jis4cb">JIS 第四水準漢字</label>
</p>
<p id="identifierElmIssue">The missing character identifier must be exactly one printable ASCII character.</p>
<p>Input: (Convert one &#xfffd; into <button id="conv1">one missing byte</button> or <button id="conv2">two missing bytes</button>) </p>
<textarea name="inputElm" id="inputElm" cols="30" rows="10">譁?蟄怜喧縺代@縺滓律譛ャ隱槭?ョ繝?繧ュ繧ケ繝医?ョ萓?</textarea>
<p>Output:</p>
<pre id="outputElm" lang="ja">(斂斃文斈斉斌斎斐斑斗料斛斜斟)字化けした日本語(の)(ダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミム)キスト(の)(侃來侈侊例侍侏侑侒侔侖侘侚供依侠)</pre>
/* Preserve line breaks in <pre> elements while still wrapping long lines. */
pre {
  white-space: pre-wrap;
}
/* The validation hint is hidden by default. */
#identifierElmIssue {
  display: none;
}

/* Reveal the hint, in red, while #identifierElm fails its validation constraints. */
:root:has(#identifierElm:invalid) #identifierElmIssue {
  display: block;
  color: red;
}
import {
  TextEncoder,
  TextDecoder,
  EncodingIndexes,
  getEncoding
} from "https://esm.sh/text-decoding";

/**
 * Encode a string into bytes using the given character encoding.
 *
 * @param {string} text - Text to encode.
 * @param {string} codec - Encoding label handed to the TextEncoder constructor.
 * @returns {Uint8Array} The encoded bytes.
 */
function encode(text, codec) {
  // Opt in to legacy (non-UTF-8) target encodings, as the option name says.
  const options = { NONSTANDARD_allowLegacyEncoding: true };
  const encoder = new TextEncoder(codec, options);
  return encoder.encode(text);
}
/**
 * Decode bytes into a string using the given character encoding.
 *
 * @param {Uint8Array} byteArr - Bytes to decode.
 * @param {string} [codec] - Encoding label handed to the TextDecoder constructor.
 * @returns {string} The decoded text.
 */
function decode(byteArr, codec) {
  const decoder = new TextDecoder(codec);
  return decoder.decode(byteArr);
}

// Marker character that stands for one missing byte in the input,
// together with its byte value.
let identifier = "?";
let identifierByte = 63;
identifierElm.addEventListener("input", (e) => {
  const { value } = e.target;
  // Mirror the field's own pattern: exactly one printable ASCII character.
  // Other values (empty, several characters) are ignored so that
  // identifierByte never becomes undefined or disagrees with identifier.
  if (!/^[ -~]$/.test(value)) return;
  identifier = value;
  // For a single ASCII character the UTF-16 code unit equals its UTF-8 byte.
  identifierByte = value.charCodeAt(0);
  // The marker changes how the input is interpreted, so refresh the output.
  solve();
});

// Flags consulted when filtering candidate characters: one per JIS kanji
// level, all enabled initially.
let jis1 = true;
let jis2 = true;
let jis3 = true;
let jis4 = true;

/**
 * Guess how many bytes at the start of `byteArr` make up the next UTF-8
 * character.
 *
 * When the first byte is a real lead byte, its high bits give the length.
 * When the first byte is the missing-byte marker (`identifierByte`), the
 * length bits are lost, so the run of following marker / continuation bytes
 * is taken instead.
 *
 * @param {Uint8Array} byteArr - Pending bytes; must not be empty.
 * @returns {number} Number of leading bytes attributed to one character.
 * @throws {Error} If `byteArr` is empty, starts with a continuation byte,
 *   or starts with a byte that matches no UTF-8 lead pattern.
 */
function guessSize(byteArr) {
  if (!byteArr.length) throw new Error("bytearr is empty!");
  const first = byteArr[0];

  if (first === identifierByte) {
    let ptr = 1;
    // Bounds check comes first so the last byte of the buffer can still be
    // counted (the previous `ptr + 1 < length` guard always dropped it).
    while (
      ptr < byteArr.length &&
      (byteArr[ptr] === identifierByte ||
        (byteArr[ptr] & 0b11000000) === 0b10000000)
    ) {
      ptr++;
    }
    return ptr;
  }

  if ((first & 0b10000000) === 0) return 1; // ASCII byte
  if ((first & 0b11000000) === 0b10000000) {
    throw new Error(`Starting from mid byte! ${first.toString(16)}`);
  }
  if ((first & 0b11100000) === 0b11000000) return 2;
  if ((first & 0b11110000) === 0b11100000) return 3;
  if ((first & 0b11111000) === 0b11110000) return 4;
  throw new Error(`Unidentified byte pattern ${first}`);
}

/**
 * Enumerate every byte sequence `byteArr` could stand for, by replacing each
 * missing-byte marker (`identifierByte`) with all values allowed at that
 * position of a UTF-8 sequence.
 *
 * A marker at offset 0 is expanded to the lead-byte range matching the
 * sequence length (or 0x20-0x7f for a single byte); a marker at any later
 * offset is expanded to the continuation range 0x80-0xbf. Other bytes are
 * kept as they are.
 *
 * @param {Uint8Array} byteArr - Bytes of one character, markers included.
 * @param {number} offset - Position of `byteArr[0]` within the character;
 *   callers start with 0.
 * @returns {Uint8Array[]} All candidate sequences, ordered by ascending
 *   value of the earliest expanded byte first.
 * @throws {Error} If a marker is at offset 0 and `byteArr` is longer than 4.
 */
function possibleChoices(byteArr, offset) {
  // Nothing to expand; this also keeps the recursion below from never ending.
  if (byteArr.length === 0) return [];

  let prefix = [byteArr[0]];
  if (byteArr[0] === identifierByte) {
    let start = 0b10000000;
    let end = 0b10111111;
    if (offset === 0) {
      if (byteArr.length === 1) {
        start = 0b00100000;
        end = 0b01111111;
      } else if (byteArr.length === 2) {
        start = 0b11000000;
        end = 0b11011111;
      } else if (byteArr.length === 3) {
        start = 0b11100000;
        end = 0b11101111;
      } else if (byteArr.length === 4) {
        start = 0b11110000;
        end = 0b11110111;
      } else {
        throw new Error(
          `invalid permute length (${byteArr.length}): ${byteArr}`
        );
      }
    }
    prefix = Array.from({ length: end - start + 1 }, (_, idx) => start + idx);
  }
  if (byteArr.length === 1) return prefix.map((v) => new Uint8Array([v]));

  // The tail expansion does not depend on the chosen prefix byte, so compute
  // it once instead of once per prefix value.
  const tails = possibleChoices(byteArr.slice(1), offset + 1);
  return prefix.flatMap((p) => tails.map((c) => new Uint8Array([p, ...c])));
}

/**
 * List the characters that a byte sequence with missing bytes could represent.
 *
 * Candidates come from possibleChoices() and are narrowed by three filters:
 * the UTF-8 decoding must be a single UTF-16 code unit, the character must
 * pass the Shift-JIS / JIS-level checks below, and (unless `prevMissing`)
 * the candidate must still produce a decoding failure when decoded as
 * Shift-JIS after `floatingBytes`.
 *
 * @param {Uint8Array} byteArr - Bytes of one character, markers included.
 * @param {Uint8Array} floatingBytes - Bytes placed in front of each candidate
 *   before it is decoded as Shift-JIS.
 * @param {boolean} prevMissing - When truthy, the floating-bytes filter is skipped.
 * @returns {string} The surviving candidates joined inside parentheses, or an
 *   "(error: ...)" string if anything in the process throws.
 */
function solveBytes(byteArr, floatingBytes, prevMissing) {
  try {
    let choices = possibleChoices(byteArr, 0);

    // decoded UTF-8 must be one char
    // (string length counts UTF-16 code units, so a candidate that decodes
    // to more than one unit is dropped)
    choices = choices.filter((ba) => decode(ba, "utf8").length === 1);

    choices = choices.filter((ba) => {
      const d = encode(decode(ba, "utf8"), "shift_jis");

      if (d.length === 2) {
        // keep choices where encode_shiftjis(decode_utf8(i)) is valid
        // i.e. char is encoded in Shift-JIS
        if (d[0] === 128 && d[1] === 63) return false; // [128, 63] represents unencoded char

        // JIS level 1 kanji
        if (
          (d[0] === 0x88 && 0x9f <= d[1] && d[1] <= 0xfc) ||
          (0x89 <= d[0] && d[0] <= 0x97 && 0x40 <= d[1] && d[1] <= 0xfc) ||
          (d[0] === 0x98 && 0x40 <= d[1] && d[1] <= 0x9e)
        )
          return jis1;

        // Ranges gated by the jis2 flag
        if (
          (d[0] === 0x98 && 0x9f <= d[1] && d[1] <= 0xfc) ||
          (0x99 <= d[0] && d[0] <= 0xe9 && 0x40 <= d[1] && d[1] <= 0xfc) ||
          (d[0] === 0xea && 0x40 <= d[1] && d[1] <= 0xa4)
        )
          return jis2;

        // Both bytes combined into one 16-bit value for the range checks below
        const twoByte = (d[0] << 8) + d[1];

        // Ranges gated by the jis3 flag.
        // NOTE(review): 0x9873-0x989e is already matched by the third jis1
        // condition above, so that middle range is never reached here.
        if (
          (0x879f <= twoByte && twoByte <= 0x889e) ||
          (0x9873 <= twoByte && twoByte <= 0x989e) ||
          (0xeaa5 <= twoByte && twoByte <= 0xeffc)
        )
          return jis3;
        // Range gated by the jis4 flag
        if (0xf040 <= twoByte && twoByte <= 0xfcf4) return jis4;
      }
      // Anything not classified above (including non-two-byte encodings) is kept
      return true;
    });

    // This is only valid if previous done byte is a confirmed byte
    if (!prevMissing) {
      // When re-encoded with prepending floating bytes, the outcome should have \ufffd
      // i.e. resulting mojibake should not be conclusive
      choices = choices.filter((ba) =>
        decode(new Uint8Array([...floatingBytes, ...ba]), "shift_jis")
          .replaceAll("\x80", "\ufffd") // text-decoding decodes byte 0x80 to U+0080 although 0x80 is not a valid Shift-JIS byte, so count it as a failure
          .includes("\ufffd")
      );
    }
    return "(" + choices.map((ba) => decode(ba, "utf8")).join("") + ")";
  } catch (e) {
    // Report the failure inline together with the raw UTF-8 decoding
    return `(error: ${e}, ${decode(byteArr, "utf8")})`;
  }
}

/**
 * Return the tail of `byteArr` that lies beyond its cleanly decodable part.
 *
 * The bytes are decoded as Shift-JIS, every U+FFFD is removed, and the
 * remaining text is encoded back to Shift-JIS; the length of that encoding
 * decides how many leading bytes are skipped.
 *
 * @param {Uint8Array} byteArr - Bytes processed so far.
 * @returns {Uint8Array} `byteArr` from the re-encoded length onwards.
 */
function getFloatingBytes(byteArr) {
  const cleanText = decode(byteArr, "shift_jis").split("\ufffd").join("");
  const confirmedLength = encode(cleanText, "shift_jis").length;
  return byteArr.slice(confirmedLength);
}

/**
 * Recover text from one line of mojibake (UTF-8 bytes that were displayed as
 * Shift-JIS), listing candidate characters wherever bytes are missing.
 *
 * The line is encoded back to bytes as Shift-JIS and decoded as UTF-8. If
 * that succeeds without U+FFFD it is returned directly. Otherwise the bytes
 * are consumed left to right: cleanly decodable text is copied to the output
 * and each broken character is replaced by the result of solveBytes().
 *
 * @param {string} input - One line of mojibake text.
 * @returns {string} The recovered text with parenthesised candidate lists.
 * @throws {Error} Whatever guessSize() throws for an unclassifiable byte.
 */
function mainDecode(input) {
  const inputBytes = encode(input, "shift_jis");

  // try plain decode
  const plainDecode = decode(inputBytes, "utf8");
  if (!plainDecode.includes("\ufffd")) {
    return plainDecode;
  }

  // need to guess bytes
  let output = "";
  let pendingBytes = inputBytes;
  let doneBytes = new Uint8Array([]);
  while (pendingBytes.length) {
    // purge decodable bytes first
    const decoded = decode(pendingBytes, "utf8");
    // A leading marker byte means the next character lost its lead byte.
    // Compare against the configured marker byte (as guessSize does) rather
    // than a hard-coded "?".
    const failurePos =
      pendingBytes[0] === identifierByte ? 0 : decoded.indexOf("\ufffd");
    if (failurePos < 0) {
      // all remaining text is safe
      output += decoded;
      pendingBytes = new Uint8Array([]);
    } else {
      const safeSeg = decoded.substring(0, failurePos);
      const safeBytes = encode(safeSeg, "utf8").length;
      output += safeSeg;
      doneBytes = new Uint8Array([
        ...doneBytes,
        ...pendingBytes.slice(0, safeBytes)
      ]);
      pendingBytes = pendingBytes.slice(safeBytes);

      // guess next char size
      const size = guessSize(pendingBytes);
      const floatingBytes = getFloatingBytes(doneBytes);
      const prevMissing =
        doneBytes.length > 0 &&
        doneBytes[doneBytes.length - 1] === identifierByte;
      output += solveBytes(
        pendingBytes.slice(0, size),
        floatingBytes,
        prevMissing
      );
      doneBytes = new Uint8Array([
        ...doneBytes,
        ...pendingBytes.slice(0, size)
      ]);
      pendingBytes = pendingBytes.slice(size);
    }
  }
  return output;
}

// Current textarea content; starts with the same sample as the HTML.
let input = "譁?蟄怜喧縺代@縺滓律譛ャ隱槭?ョ繝?繧ュ繧ケ繝医?ョ萓?";

/**
 * Decode every line of `input` with mainDecode() and write the result to the
 * output element. A line that fails is replaced by an error message showing
 * the error, the original line and its plain UTF-8 reinterpretation.
 */
function solve() {
  outputElm.innerText = input
    .split("\n")
    .map((v) => {
      try {
        return mainDecode(v);
      } catch (e) {
        // "utf8" belongs to decode(); encode() takes only (text, codec).
        return `error: ${e} (${v}) (${decode(encode(v, "shift_jis"), "utf8")})`;
      }
    })
    .join("\n");
}
// Keep `input` in sync with the textarea and refresh the output on every edit.
inputElm.addEventListener("input", ({ target }) => {
  input = target.value;
  solve();
});

/**
 * Replace every U+FFFD in the textarea with `count` copies of the current
 * missing-byte identifier, then re-run the decoder on the new content.
 *
 * @param {number} count - How many identifier characters stand in for one U+FFFD.
 */
function replaceUnknownChars(count) {
  const replacement = identifier.repeat(count);
  inputElm.value = inputElm.value.replaceAll("\ufffd", replacement);
  input = inputElm.value;
  solve();
}

conv1.addEventListener("click", () => {
  replaceUnknownChars(1);
});

conv2.addEventListener("click", () => {
  replaceUnknownChars(2);
});

// Each checkbox is paired with a setter for its JIS-level flag; toggling a
// checkbox stores the new state and refreshes the output.
const jisToggles = [
  [jis1cb, (checked) => { jis1 = checked; }],
  [jis2cb, (checked) => { jis2 = checked; }],
  [jis3cb, (checked) => { jis3 = checked; }],
  [jis4cb, (checked) => { jis4 = checked; }]
];
for (const [checkbox, setFlag] of jisToggles) {
  checkbox.addEventListener("change", (evt) => {
    setFlag(evt.target.checked);
    solve();
  });
}
Run Pen

External CSS

This Pen doesn't use any external CSS resources.

External JavaScript

This Pen doesn't use any external JavaScript resources.