Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"@dnd-kit/modifiers": "^9.0.0",
"@dnd-kit/sortable": "^10.0.0",
"@dnd-kit/utilities": "^3.2.2",
"chardet": "^2.1.1",
"cron": "^3.2.1",
"crypto-js": "^4.2.0",
"dayjs": "^1.11.13",
Expand Down
23 changes: 23 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions src/pages/install/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import { cacheInstance } from "@App/app/cache";
import { formatBytes, prettyUrl } from "@App/pkg/utils/utils";
import { ScriptIcons } from "../options/routes/utils";
import { detectEncoding } from "@App/pkg/utils/encoding";

const backgroundPromptShownKey = "background_prompt_shown";

Expand Down Expand Up @@ -102,11 +103,6 @@
onProgress?.({ receivedLength });
}

// 检查 Content-Type 中的 charset
const contentType = response.headers.get("content-type") || "";
const charsetMatch = contentType.match(/charset=([^;]+)/i);
const charset = charsetMatch ? charsetMatch[1].toLowerCase() : "utf-8";

// 合并分片(chunks)
const chunksAll = new Uint8Array(receivedLength);
let position = 0;
Expand All @@ -115,12 +111,18 @@
position += chunk.length;
}

// 检测编码:优先使用 Content-Type,回退到 chardet(仅检测前16KB)
const contentType = response.headers.get("content-type");
const encode = detectEncoding(chunksAll, contentType);

// 使用检测到的 charset 解码
let code;
try {
code = new TextDecoder(charset).decode(chunksAll);
code = new TextDecoder(encode).decode(chunksAll);
} catch (e: any) {
throw new Error(`Failed to decode response with charset ${charset}: ${e.message}`);
console.warn(`Failed to decode response with charset ${encode}: ${e.message}`);
// 回退到 UTF-8
code = new TextDecoder("utf-8").decode(chunksAll);
}

const metadata = parseMetadata(code);
Expand Down Expand Up @@ -324,7 +326,7 @@

useEffect(() => {
!loaded && initAsync();
}, [searchParams, loaded]);

Check warning on line 329 in src/pages/install/App.tsx

View workflow job for this annotation

GitHub Actions / Run tests

React Hook useEffect has a missing dependency: 'initAsync'. Either include it or remove the dependency array

const [watchFile, setWatchFile] = useState(false);
const metadataLive = useMemo(() => (scriptInfo?.metadata || {}) as SCMetadata, [scriptInfo]);
Expand Down Expand Up @@ -633,7 +635,7 @@
return () => {
unmountFileTrack(handle);
};
}, [memoWatchFile]);

Check warning on line 638 in src/pages/install/App.tsx

View workflow job for this annotation

GitHub Actions / Run tests

React Hook useEffect has missing dependencies: 'localFileHandle', 'scriptInfo?.uuid', 'setupWatchFile', and 'watchFile'. Either include them or remove the dependency array

// 检查是否有 uuid 或 file
const hasUUIDorFile = useMemo(() => {
Expand Down Expand Up @@ -700,7 +702,7 @@
useEffect(() => {
if (!urlHref) return;
loadURLAsync(urlHref);
}, [urlHref]);

Check warning on line 705 in src/pages/install/App.tsx

View workflow job for this annotation

GitHub Actions / Run tests

React Hook useEffect has a missing dependency: 'loadURLAsync'. Either include it or remove the dependency array

if (!hasUUIDorFile) {
return urlHref ? (
Expand Down
142 changes: 142 additions & 0 deletions src/pkg/utils/encoding.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { describe, it, expect, vi } from "vitest";
import { parseCharsetFromContentType, detectEncoding } from "./encoding";

// Unit tests for the charset helpers exported by ./encoding:
// parseCharsetFromContentType (header parsing) and detectEncoding (header first, then chardet).
describe("encoding detection", () => {
// Header parsing: the charset parameter is expected back lowercased and without quotes.
describe("parseCharsetFromContentType", () => {
it("should extract charset from valid Content-Type header", () => {
expect(parseCharsetFromContentType("text/javascript; charset=utf-8")).toBe("utf-8");
expect(parseCharsetFromContentType("text/plain; charset=GBK")).toBe("gbk");
expect(parseCharsetFromContentType("application/javascript; charset=ISO-8859-1")).toBe("iso-8859-1");
});

it("should handle charset with quotes", () => {
// Both double-quoted and single-quoted values are accepted.
expect(parseCharsetFromContentType('text/javascript; charset="utf-8"')).toBe("utf-8");
expect(parseCharsetFromContentType("text/javascript; charset='gbk'")).toBe("gbk");
});

it("should handle case-insensitive charset parameter", () => {
expect(parseCharsetFromContentType("text/javascript; CHARSET=UTF-8")).toBe("utf-8");
expect(parseCharsetFromContentType("text/javascript; Charset=GBK")).toBe("gbk");
});

it("should return null for missing charset", () => {
expect(parseCharsetFromContentType("text/javascript")).toBe(null);
expect(parseCharsetFromContentType("text/plain; boundary=something")).toBe(null);
});

it("should return null for null or empty input", () => {
expect(parseCharsetFromContentType(null)).toBe(null);
expect(parseCharsetFromContentType("")).toBe(null);
});

it("should handle charset with additional parameters", () => {
// Parameters following the charset must not leak into the returned value.
expect(parseCharsetFromContentType("text/javascript; charset=utf-8; boundary=xxx")).toBe("utf-8");
});
});

// Encoding detection: a valid header charset wins; otherwise the result comes from chardet.
describe("detectEncoding", () => {
it("should prioritize valid charset from Content-Type header", () => {
const utf8Data = new TextEncoder().encode("hello world");
expect(detectEncoding(utf8Data, "text/javascript; charset=utf-8")).toBe("utf-8");
});

it("should fallback to chardet when Content-Type header is missing", () => {
// Chinese text encoded as UTF-8
const utf8Data = new TextEncoder().encode("你好世界");
const encoding = detectEncoding(utf8Data, null);
expect(encoding).toBe("utf-8");
});

it("should fallback to chardet when Content-Type charset is invalid", () => {
const utf8Data = new TextEncoder().encode("hello world");
const encoding = detectEncoding(utf8Data, "text/javascript; charset=invalid-encoding");
// chardet may report utf-8 or ascii for this input; any of the listed values is accepted
expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
});

it("should fallback to utf-8 when chardet returns null", () => {
// Tries to provoke a null result from chardet by passing empty data
const emptyData = new Uint8Array(0);
const encoding = detectEncoding(emptyData, null);
// For empty data chardet may return ascii or some other encoding; whatever comes back must be valid
// NOTE(review): the assertions never pin "utf-8" and chardet is not mocked, so the
// fallback named in the title is not actually verified here — confirm intent.
expect(encoding).toBeTruthy();
expect(() => new TextDecoder(encoding)).not.toThrow();
});

it("should only use first 16KB for chardet detection", () => {
// Build a buffer larger than 16KB
const largeData = new Uint8Array(20 * 1024);
// Fill it with UTF-8 encoded data
const text = "a".repeat(20 * 1024);
const textBytes = new TextEncoder().encode(text);
largeData.set(textBytes.slice(0, largeData.length));

const encoding = detectEncoding(largeData, null);
// Detection should succeed, which is taken here as evidence that sampling was used
// NOTE(review): only the returned label is asserted; nothing checks how many bytes
// were handed to chardet, so the 16KB limit itself is not verified.
expect(["utf-8", "ascii", "windows-1252"]).toContain(encoding);
});

it("should handle GBK encoded data", () => {
// "你好" encoded as GBK (a simplified test; real GBK data is more complex)
// Note: in a browser environment GBK data may be identified as another compatible encoding
const gbkLikeData = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]); // "你好" in GBK
const encoding = detectEncoding(gbkLikeData, null);
// chardet may identify this as GBK, Shift_JIS, or a related East Asian encoding
expect(encoding).toBeTruthy();
expect(() => new TextDecoder(encoding)).not.toThrow();
});

it("should handle ISO-8859-1 encoded data", () => {
// Characters specific to ISO-8859-1 (extended ASCII)
const iso88591Data = new Uint8Array([0xe9, 0xe8, 0xe0, 0xe7]); // é è à ç
const encoding = detectEncoding(iso88591Data, null);
// Only checks that some label is returned, not which one.
expect(encoding).toBeTruthy();
});

it("should validate detected encoding is supported by TextDecoder", () => {
const utf8Data = new TextEncoder().encode("test");
const encoding = detectEncoding(utf8Data, null);

// Make sure the returned encoding can be used by TextDecoder
expect(() => new TextDecoder(encoding)).not.toThrow();
});

it("should prefer Content-Type charset over chardet detection", () => {
// Even if the data looks like GBK, UTF-8 should be used when Content-Type specifies UTF-8
const data = new Uint8Array([0xc4, 0xe3, 0xba, 0xc3]);
const encoding = detectEncoding(data, "text/javascript; charset=utf-8");
expect(encoding).toBe("utf-8");
});

it("should handle charset with different cases from Content-Type", () => {
const data = new TextEncoder().encode("test");
expect(detectEncoding(data, "text/javascript; charset=UTF-8")).toBe("utf-8");
expect(detectEncoding(data, "text/javascript; charset=Utf-8")).toBe("utf-8");
expect(detectEncoding(data, "text/javascript; charset=GBK")).toBe("gbk");
});

it("should handle Windows-1252 encoded data", () => {
// Characters specific to Windows-1252
const win1252Data = new Uint8Array([0x80, 0x82, 0x83, 0x84]); // € ‚ ƒ „
const encoding = detectEncoding(win1252Data, null);
expect(encoding).toBeTruthy();
// chardet should either detect an encoding or fall back to a valid one
// Shift_JIS is also a valid encoding, and chardet may identify the data as it
expect(["utf-8", "windows-1252", "iso-8859-1", "shift_jis", "ascii"]).toContain(encoding);
});

it("should fallback to utf-8 when chardet detects invalid encoding", () => {
// Use vi.spyOn to mock console.warn
const consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});

// NOTE(review): chardet is not mocked and the input is plain "test", so nothing here
// forces chardet to report an invalid encoding — the titled path may never run.
const data = new TextEncoder().encode("test");
const encoding = detectEncoding(data, null);

// A valid encoding should be returned
expect(encoding).toBeTruthy();
expect(() => new TextDecoder(encoding)).not.toThrow();

consoleWarnSpy.mockRestore();
});
});
});
51 changes: 51 additions & 0 deletions src/pkg/utils/encoding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import chardet from "chardet";

/**
 * Extracts the `charset` parameter from a Content-Type header value.
 *
 * The value is normalised before it is returned: single and double quote
 * characters are removed, surrounding whitespace is trimmed, and the result
 * is lowercased.
 *
 * @param contentType - Raw Content-Type header value, or `null` when the header is absent.
 * @returns The normalised charset label, or `null` when the header is missing,
 * carries no `charset` parameter, or the parameter value is empty.
 */
export const parseCharsetFromContentType = (contentType: string | null): string | null => {
  if (!contentType) return null;

  // Capture everything after "charset=" up to the next ";" (or the end of the header).
  // Whitespace before "=" is tolerated so that sloppy headers are still understood.
  const rawValue = /charset\s*=([^;]*)/i.exec(contentType)?.[1];
  if (rawValue === undefined) return null;

  // Strip quotes before trimming so that padding inside the quotes is removed as well.
  const charset = rawValue.replace(/['"]/g, "").trim().toLowerCase();

  // An empty value (e.g. `charset=""`) carries no information; report it as absent.
  return charset === "" ? null : charset;
};

/** Upper bound on the number of leading bytes handed to chardet, to keep detection cheap. */
const CHARDET_SAMPLE_BYTES = 16 * 1024;

/**
 * Checks whether `label` is accepted by the `TextDecoder` constructor.
 * On rejection a warning of the form `"<warningPrefix>: <label>, error: <reason>"` is logged.
 */
const isSupportedByTextDecoder = (label: string, warningPrefix: string): boolean => {
  try {
    // Constructing a decoder is the validation step: the constructor throws for unknown labels.
    new TextDecoder(label);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(`${warningPrefix}: ${label}, error: ${reason}`);
    return false;
  }
};

/**
 * Determines which encoding label should be used to decode `data`.
 *
 * Resolution order:
 * 1. the `charset` parameter of the Content-Type header, when `TextDecoder` accepts it;
 * 2. the encoding chardet detects from at most the first 16KB of `data`,
 *    when `TextDecoder` accepts it;
 * 3. `"utf-8"`.
 *
 * @param data - Raw bytes to inspect.
 * @param contentType - Content-Type header value, or `null` when absent.
 * @returns A lowercase encoding label that the `TextDecoder` constructor accepts.
 */
export const detectEncoding = (data: Uint8Array, contentType: string | null): string => {
  // A charset declared by the server takes precedence over content sniffing.
  const headerCharset = parseCharsetFromContentType(contentType);
  if (headerCharset && isSupportedByTextDecoder(headerCharset, "Invalid charset from Content-Type header")) {
    return headerCharset;
  }

  // Nothing to sniff: skip the detector instead of letting it guess from zero bytes.
  if (data.length === 0) return "utf-8";

  // Only a leading sample is inspected; subarray creates a view, not a copy.
  const sample = data.subarray(0, Math.min(data.length, CHARDET_SAMPLE_BYTES));
  const detected = chardet.detect(sample);

  if (detected) {
    const encoding = detected.toLowerCase();
    if (isSupportedByTextDecoder(encoding, "Invalid charset detected by chardet")) {
      return encoding;
    }
  }

  // Neither source produced a usable label; fall back to UTF-8.
  return "utf-8";
};
Loading