<p>Missing character identifier: <input type="text" id="identifierElm" value="?" required pattern="[ -~]" />(← This represents one missing byte)</p>
<p lang="ja">
漢字:
<input type="checkbox" name="jis1cb" id="jis1cb" checked />
<label for="jis1cb">JIS 第一水準漢字</label>
<input type="checkbox" name="jis2cb" id="jis2cb" checked />
<label for="jis2cb">JIS 第二水準漢字</label>
<input type="checkbox" name="jis3cb" id="jis3cb" checked />
<label for="jis3cb">JIS 第三水準漢字</label>
<input type="checkbox" name="jis4cb" id="jis4cb" checked />
<label for="jis4cb">JIS 第四水準漢字</label>
</p>
<p id="identifierElmIssue">Missing character identifier has to be one single ASCII character.</p>
<p>Input: (Convert one � into <button id="conv1">one missing byte</button> or <button id="conv2">two missing bytes</button>) </p>
<textarea name="inputElm" id="inputElm" cols="30" rows="10">譁?蟄怜喧縺代@縺滓律譛ャ隱槭?ョ繝?繧ュ繧ケ繝医?ョ萓?</textarea>
<p>Output:</p>
<pre id="outputElm" lang="ja">(斂斃文斈斉斌斎斐斑斗料斛斜斟)字化けした日本語(の)(ダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミム)キスト(の)(侃來侈侊例侍侏侑侒侔侖侘侚供依侠)</pre>
/* Let long output lines wrap while still honouring the line breaks in the text. */
pre {
white-space: pre-wrap;
}
/* The validation hint is hidden by default. */
#identifierElmIssue {
display: none;
}
/* Show the hint in red whenever the #identifierElm input is in the :invalid state. */
:root:has(#identifierElm:invalid) #identifierElmIssue {
display: block;
color: red;
}
import {
TextEncoder,
TextDecoder,
EncodingIndexes,
getEncoding
} from "https://esm.sh/text-decoding";
/**
 * Encode a string into bytes with the requested codec.
 *
 * The encoder is created with `NONSTANDARD_allowLegacyEncoding` so that
 * callers in this file can request non-UTF-8 codecs such as "shift_jis".
 *
 * @param {string} text - Text to encode.
 * @param {string} codec - Encoding label passed to the TextEncoder constructor.
 * @returns {Uint8Array} The encoded bytes.
 */
function encode(text, codec) {
  const encoderOptions = { NONSTANDARD_allowLegacyEncoding: true };
  const encoder = new TextEncoder(codec, encoderOptions);
  return encoder.encode(text);
}
/**
 * Decode a byte sequence into a string with the requested codec.
 *
 * @param {Uint8Array} byteArr - Bytes to decode.
 * @param {string} codec - Encoding label passed to the TextDecoder constructor.
 * @returns {string} The decoded text.
 */
function decode(byteArr, codec) {
  const decoder = new TextDecoder(codec);
  return decoder.decode(byteArr);
}
// Character the user writes in place of one lost byte.
let identifier = "?";
// First UTF-8 byte of `identifier`; 63 is the byte for "?".
let identifierByte = 63;
identifierElm.addEventListener("input", (e) => {
  // The field is `required` with a one-character ASCII `pattern`. While the
  // browser reports it invalid (empty, several characters, non-ASCII), keep the
  // last good marker instead of storing a value the byte logic cannot use.
  if (!e.target.validity.valid) return;
  identifier = e.target.value;
  identifierByte = encode(identifier, "utf8")[0];
  // The marker changes how the input is interpreted, so refresh the output
  // just like the other controls do.
  solve();
});
// Switches consulted by solveBytes when filtering candidates by Shift-JIS
// range; each one is updated by the matching checkbox's change handler.
let jis1 = true;
let jis2 = true;
let jis3 = true;
let jis4 = true;
/**
 * Guess how many leading bytes of `byteArr` belong to one UTF-8 character.
 *
 * If the first byte is a known byte, the size is read from its UTF-8 lead-byte
 * bit pattern. If the first byte is the missing-byte marker (`identifierByte`),
 * the character is taken to span the marker plus every directly following
 * marker or continuation byte (10xxxxxx), up to and including the last byte
 * of the array.
 *
 * @param {Uint8Array} byteArr - Pending bytes; must not be empty.
 * @returns {number} Number of bytes attributed to the first character.
 * @throws {Error} If `byteArr` is empty, starts with a continuation byte, or
 *   starts with a byte that matches no UTF-8 lead pattern.
 */
function guessSize(byteArr) {
  if (!byteArr.length) throw new Error("bytearr is empty!");
  const first = byteArr[0];
  if (first !== identifierByte) {
    // ASCII byte
    if ((first & 0b10000000) === 0) return 1;
    if ((first & 0b11000000) === 0b10000000) {
      throw new Error(`Starting from mid byte! ${first.toString(16)}`);
    }
    if ((first & 0b11100000) === 0b11000000) return 2;
    if ((first & 0b11110000) === 0b11100000) return 3;
    if ((first & 0b11111000) === 0b11110000) return 4;
    throw new Error(`Unidentified byte pattern ${first}`);
  }
  // Unknown lead byte: extend over the run of markers/continuation bytes.
  // The bound is `size < length` so that a trailing continuation byte is
  // counted too; stopping one byte early would leave it orphaned and make the
  // next call fail with "Starting from mid byte!".
  let size = 1;
  while (
    size < byteArr.length &&
    (byteArr[size] === identifierByte ||
      (byteArr[size] & 0b11000000) === 0b10000000)
  ) {
    size++;
  }
  return size;
}
/**
 * Enumerate every byte sequence obtained by substituting the missing-byte
 * marker (`identifierByte`) in `byteArr` with a candidate byte.
 *
 * A marker at position 0 of the character is replaced by the lead-byte range
 * that matches the character's length (printable ASCII 0x20-0x7f for one byte,
 * 0xc0-0xdf, 0xe0-0xef, 0xf0-0xf7 for two to four bytes). A marker at any
 * later position is replaced by the continuation range 0x80-0xbf. Known bytes
 * are kept as they are.
 *
 * @param {Uint8Array} byteArr - Bytes of one character, possibly with markers.
 * @param {number} offset - Position of `byteArr[0]` inside the character;
 *   callers outside this function pass 0.
 * @returns {Uint8Array[]} All candidate sequences; empty for empty input.
 * @throws {Error} If the character starts with a marker and is longer than
 *   four bytes.
 */
function possibleChoices(byteArr, offset) {
  // Nothing to permute; also keeps the recursion below from running forever.
  if (byteArr.length === 0) return [];

  let heads = [byteArr[0]];
  if (byteArr[0] === identifierByte) {
    // Default: the unknown byte is a continuation byte.
    let start = 0b10000000;
    let end = 0b10111111;
    if (offset === 0) {
      // Unknown lead byte: its range is dictated by the character length.
      switch (byteArr.length) {
        case 1:
          start = 0b00100000;
          end = 0b01111111;
          break;
        case 2:
          start = 0b11000000;
          end = 0b11011111;
          break;
        case 3:
          start = 0b11100000;
          end = 0b11101111;
          break;
        case 4:
          start = 0b11110000;
          end = 0b11110111;
          break;
        default:
          throw new Error(
            `invalid permute length (${byteArr.length}): ${byteArr}`
          );
      }
    }
    heads = Array.from({ length: end - start + 1 }, (_, idx) => start + idx);
  }

  if (byteArr.length === 1) return heads.map((v) => new Uint8Array([v]));

  // The expansions of the remaining bytes do not depend on the head byte, so
  // compute them once instead of once per head candidate.
  const tails = possibleChoices(byteArr.slice(1), offset + 1);
  return heads.flatMap((head) =>
    tails.map((tail) => new Uint8Array([head, ...tail]))
  );
}
/**
 * List the characters that one damaged UTF-8 sequence could have been.
 *
 * Candidates come from `possibleChoices` and are narrowed in three steps:
 * they must decode to a string of length 1, they must pass the Shift-JIS
 * range filter controlled by `jis1`..`jis4`, and (only when the previous byte
 * was not itself missing) the Shift-JIS decoding of `floatingBytes` followed
 * by the candidate must still contain U+FFFD.
 *
 * @param {Uint8Array} byteArr - Bytes of the character, with missing-byte markers.
 * @param {Uint8Array} floatingBytes - Bytes prepended before the Shift-JIS check.
 * @param {boolean} prevMissing - True when the last processed byte was a marker.
 * @returns {string} The surviving characters wrapped in parentheses, or an
 *   "(error: ...)" string if any step throws.
 */
function solveBytes(byteArr, floatingBytes, prevMissing) {
  const between = (value, low, high) => low <= value && value <= high;

  // Decide whether a candidate's Shift-JIS encoding is allowed by the toggles.
  // Anything that is not exactly two bytes long is always kept.
  const allowedByJisLevel = (sjis) => {
    if (sjis.length !== 2) return true;
    const lead = sjis[0];
    const trail = sjis[1];
    // [128, 63] represents a character that could not be encoded.
    if (lead === 128 && trail === 63) return false;

    // JIS level 1 kanji
    const inLevel1 =
      (lead === 0x88 && between(trail, 0x9f, 0xfc)) ||
      (between(lead, 0x89, 0x97) && between(trail, 0x40, 0xfc)) ||
      (lead === 0x98 && between(trail, 0x40, 0x9e));
    if (inLevel1) return jis1;

    // Ranges gated by the jis2 switch
    const inLevel2 =
      (lead === 0x98 && between(trail, 0x9f, 0xfc)) ||
      (between(lead, 0x99, 0xe9) && between(trail, 0x40, 0xfc)) ||
      (lead === 0xea && between(trail, 0x40, 0xa4));
    if (inLevel2) return jis2;

    // Remaining ranges are compared on the combined 16-bit code.
    const code = (lead << 8) + trail;
    const inLevel3 =
      between(code, 0x879f, 0x889e) ||
      between(code, 0x9873, 0x989e) ||
      between(code, 0xeaa5, 0xeffc);
    if (inLevel3) return jis3;
    if (between(code, 0xf040, 0xfcf4)) return jis4;
    return true;
  };

  // True when the mojibake for floatingBytes + candidate stays inconclusive,
  // i.e. its Shift-JIS decoding still contains U+FFFD. U+0080 is treated as
  // U+FFFD because the decoder lets the invalid byte 0x80 through.
  const staysInconclusive = (ba) => {
    const joined = new Uint8Array([...floatingBytes, ...ba]);
    return decode(joined, "shift_jis")
      .replaceAll("\x80", "\ufffd")
      .includes("\ufffd");
  };

  try {
    // decoded UTF-8 must be one char
    const singleChars = possibleChoices(byteArr, 0).filter(
      (ba) => decode(ba, "utf8").length === 1
    );
    const levelFiltered = singleChars.filter((ba) =>
      allowedByJisLevel(encode(decode(ba, "utf8"), "shift_jis"))
    );
    // The inconclusiveness check is only valid if the previous done byte is
    // a confirmed byte.
    const survivors = prevMissing
      ? levelFiltered
      : levelFiltered.filter(staysInconclusive);
    const guesses = survivors.map((ba) => decode(ba, "utf8")).join("");
    return `(${guesses})`;
  } catch (e) {
    return `(error: ${e}, ${decode(byteArr, "utf8")})`;
  }
}
/**
 * Return the tail of `byteArr` that is left over after removing the bytes
 * accounted for by its Shift-JIS decoding.
 *
 * The bytes are decoded as Shift-JIS, every U+FFFD is dropped, and the rest is
 * re-encoded; the length of that re-encoding is how many leading bytes are
 * treated as confirmed.
 *
 * @param {Uint8Array} byteArr - Bytes processed so far.
 * @returns {Uint8Array} The bytes after the confirmed prefix.
 */
function getFloatingBytes(byteArr) {
  const sjisText = decode(byteArr, "shift_jis");
  const cleanText = sjisText.replaceAll("\ufffd", "");
  const confirmedLength = encode(cleanText, "shift_jis").length;
  return byteArr.slice(confirmedLength);
}
/**
 * Recover readable text from one line of mojibake.
 *
 * The line is encoded as Shift-JIS to get a byte stream, which is then read as
 * UTF-8. If that decodes without U+FFFD it is returned directly. Otherwise the
 * bytes are consumed left to right: decodable text is copied through, and each
 * damaged character is replaced by the parenthesised candidate list produced
 * by `solveBytes`.
 *
 * @param {string} input - One line of garbled text, with `identifier` standing
 *   in for each lost byte.
 * @returns {string} The reconstructed text.
 * @throws {Error} Propagates errors from `guessSize` (for example when the
 *   damaged position starts on a continuation byte).
 */
function mainDecode(input) {
  // Encode once; "utf8" belongs to decode(), not to encode().
  const rawBytes = encode(input, "shift_jis");

  // try plain decode
  const plainDecode = decode(rawBytes, "utf8");
  if (!plainDecode.includes("\ufffd")) {
    return plainDecode;
  }

  // need to guess bytes
  let output = "";
  let pendingBytes = rawBytes;
  let doneBytes = new Uint8Array([]);
  while (pendingBytes.length) {
    // purge decodable bytes first
    const decoded = decode(pendingBytes, "utf8");
    // A leading missing-byte marker is a damaged position in its own right.
    // Compare against the configured marker byte rather than a literal "?",
    // so a user-chosen marker is honoured as well.
    // NOTE(review): a marker that follows decodable text is still copied into
    // the safe segment, so a missing lead byte is only recognised when it is
    // the first pending byte.
    const failurePos =
      pendingBytes[0] === identifierByte ? 0 : decoded.indexOf("\ufffd");
    if (failurePos < 0) {
      // all remaining text is safe
      output += decoded;
      pendingBytes = new Uint8Array([]);
      continue;
    }

    const safeSeg = decoded.substring(0, failurePos);
    const safeBytes = encode(safeSeg, "utf8").length;
    output += safeSeg;
    doneBytes = new Uint8Array([
      ...doneBytes,
      ...pendingBytes.slice(0, safeBytes)
    ]);
    pendingBytes = pendingBytes.slice(safeBytes);

    // guess next char size
    const size = guessSize(pendingBytes);
    const damaged = pendingBytes.slice(0, size);
    const floatingBytes = getFloatingBytes(doneBytes);
    const prevMissing =
      doneBytes.length > 0 &&
      doneBytes[doneBytes.length - 1] === identifierByte;
    output += solveBytes(damaged, floatingBytes, prevMissing);
    doneBytes = new Uint8Array([...doneBytes, ...damaged]);
    pendingBytes = pendingBytes.slice(size);
  }
  return output;
}
// Text currently being decoded; kept in sync with the textarea by its handlers.
let input = "譁?蟄怜喧縺代@縺滓律譛ャ隱槭?ョ繝?繧ュ繧ケ繝医?ョ萓?";
/**
 * Decode `input` line by line and write the result into #outputElm.
 *
 * A line whose decoding throws is replaced by an error message that shows the
 * error, the original line, and the line's plain Shift-JIS -> UTF-8 reading.
 */
function solve() {
  outputElm.innerText = input
    .split("\n")
    .map((line) => {
      try {
        return mainDecode(line);
      } catch (e) {
        // "utf8" is the codec for decode(); it was previously passed to
        // encode() as an ignored third argument.
        const plain = decode(encode(line, "shift_jis"), "utf8");
        return `error: ${e} (${line}) (${plain})`;
      }
    })
    .join("\n");
}
// Re-run the solver on every edit of the textarea.
inputElm.addEventListener("input", (evt) => {
  input = evt.target.value;
  solve();
});

/**
 * Replace every U+FFFD in the textarea with `replacement`, copy the textarea
 * value back into `input`, and re-run the solver.
 *
 * @param {string} replacement - Text substituted for each U+FFFD.
 */
function replaceUnknownMarks(replacement) {
  const rewritten = inputElm.value.replaceAll("\ufffd", replacement);
  inputElm.value = rewritten;
  input = inputElm.value;
  solve();
}

// One U+FFFD stands for one missing byte ...
conv1.addEventListener("click", () => {
  replaceUnknownMarks(identifier);
});
// ... or for two missing bytes.
conv2.addEventListener("click", () => {
  replaceUnknownMarks(identifier + identifier);
});
// Each checkbox mirrors its checked state into the matching flag and then
// refreshes the output.
const jisToggles = [
  [jis1cb, (on) => { jis1 = on; }],
  [jis2cb, (on) => { jis2 = on; }],
  [jis3cb, (on) => { jis3 = on; }],
  [jis4cb, (on) => { jis4 = on; }]
];
for (const [checkbox, setFlag] of jisToggles) {
  checkbox.addEventListener("change", (evt) => {
    setFlag(evt.target.checked);
    solve();
  });
}
This Pen doesn't use any external CSS resources.
This Pen doesn't use any external JavaScript resources.