Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/Parser.events.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ describe("Events", () => {

it("Scripts ending with <", () => runTest("<script><</script>"));

// Each case feeds `<tag>safe</tag/><img>` to runTest: an end tag for one of
// script/style/title/textarea written with a trailing `/>`, followed by <img>.
it.each(["script", "style", "title", "textarea"])(
    "Special end tags ending with /> in %s",
    (tag) => runTest(`<${tag}>safe</${tag}/><img>`),
);

it("CDATA more edge-cases", () =>
runTest("<![CDATA[foo]bar]>baz]]>", { recognizeCDATA: true }));

Expand Down
66 changes: 63 additions & 3 deletions src/Tokenizer.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ import { describe, expect, it } from "vitest";
import { Tokenizer } from "./index.js";
import type { Callbacks } from "./Tokenizer.js";

function tokenize(data: string, options = {}) {
function tokenize(
data: string | ((tokenizer: Tokenizer, log: unknown[][]) => void),
options = {},
) {
const log: unknown[][] = [];
const tokenizer = new Tokenizer(
options,
Expand All @@ -17,8 +20,12 @@ function tokenize(data: string, options = {}) {
) as Callbacks,
);

tokenizer.write(data);
tokenizer.end();
if (typeof data === "function") {
data(tokenizer, log);
} else {
tokenizer.write(data);
tokenizer.end();
}

return log;
}
Expand Down Expand Up @@ -82,6 +89,23 @@ describe("Tokenizer", () => {
});
});

describe("should close special tags on end tags ending with />", () => {
    // One snapshot per tag name: the input is `<tag>safe</tag/><img>`, i.e.
    // the end tag carries a trailing `/` before `>` and is followed by <img>.
    it.each(["script", "style", "title", "textarea"])("for %s tag", (tag) => {
        const events = tokenize(`<${tag}>safe</${tag}/><img>`);

        expect(events).toMatchSnapshot();
    });
});

describe("should correctly mark attributes", () => {
it("for no value attribute", () => {
expect(tokenize("<div aaaaaaa >")).toMatchSnapshot();
Expand Down Expand Up @@ -128,6 +152,42 @@ describe("Tokenizer", () => {
expect(tokenize("&NotGreaterFullEqual;")).toMatchSnapshot());
});

it("should close comments on --!>", () => {
    // The input has `--!>` right after the comment opener, then an <img ...>
    // tag, and only afterwards a regular `-->`.
    const events = tokenize("<!-- --!><img src=x onerror=alert(1)>-->");

    expect(events).toMatchSnapshot();
});

it.each(["script", "style", "title", "textarea"])(
    "should reset after an unclosed %s tag",
    (tag) => {
        const events = tokenize((tokenizer, log) => {
            // First run: the input ends while the tag is still open.
            tokenizer.write(`<${tag}>body{color:red}`);
            tokenizer.end();

            // Drop everything logged so far and reset the tokenizer.
            log.length = 0;
            tokenizer.reset();

            // Second run on the same instance with ordinary markup.
            tokenizer.write("<div>hello</div>");
            tokenizer.end();
        });

        // Only the event names of the second run are compared.
        const eventNames = events.map(([name]) => name);

        expect(eventNames).toEqual([
            "onopentagname",
            "onopentagend",
            "ontext",
            "onclosetag",
            "onend",
        ]);
    },
);

it("should terminate XML processing instructions on ?>", () => {
    // The input contains a bare `>` before the closing `?>`; tokenized with
    // xmlMode enabled and compared against the stored snapshot.
    const options = { xmlMode: true };
    const events = tokenize("<?target data > injected ?>", options);

    expect(events).toMatchSnapshot();
});

it("should not lose data when pausing", () => {
const log: unknown[][] = [];
const tokenizer = new Tokenizer(
Expand Down
50 changes: 44 additions & 6 deletions src/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ const Sequences = {
Empty: new Uint8Array(0),
Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[
CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]>
CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->`
CommentEnd: new Uint8Array([0x2d, 0x2d, 0x21, 0x3e]), // `--!>`
ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script`
StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
Expand Down Expand Up @@ -196,7 +196,9 @@ export default class Tokenizer {
this.sectionStart = 0;
this.index = 0;
this.baseState = State.Text;
this.isSpecial = false;
this.currentSequence = Sequences.Empty;
this.sequenceIndex = 0;
this.running = true;
this.offset = 0;
}
Expand Down Expand Up @@ -265,7 +267,7 @@ export default class Tokenizer {
*/
private stateInSpecialTag(c: number): void {
if (this.sequenceIndex === this.currentSequence.length) {
if (c === CharCodes.Gt || isWhitespace(c)) {
if (isEndOfTagSection(c)) {
const endOfText = this.index - this.currentSequence.length;

if (this.sectionStart < endOfText) {
Expand Down Expand Up @@ -352,12 +354,29 @@ export default class Tokenizer {
* @param c Current character code point.
*/
private stateInCommentLike(c: number): void {
if (c === this.currentSequence[this.sequenceIndex]) {
if (
this.currentSequence === Sequences.CommentEnd &&
this.sequenceIndex === 2 &&
c === CharCodes.Gt
) {
// `!` is optional here, so the same sequence also accepts `-->`.
this.cbs.oncomment(this.sectionStart, this.index, 2);

this.sequenceIndex = 0;
this.sectionStart = this.index + 1;
this.state = State.Text;
} else if (
this.currentSequence === Sequences.CommentEnd &&
this.sequenceIndex === this.currentSequence.length - 1 &&
c !== CharCodes.Gt
) {
this.sequenceIndex = Number(c === CharCodes.Dash);
} else if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(this.sectionStart, this.index, 2);
} else {
this.cbs.oncomment(this.sectionStart, this.index, 2);
this.cbs.oncomment(this.sectionStart, this.index, 3);
}

this.sequenceIndex = 0;
Expand Down Expand Up @@ -399,6 +418,7 @@ export default class Tokenizer {
this.sectionStart = this.index + 1;
} else if (c === CharCodes.Questionmark) {
this.state = State.InProcessingInstruction;
this.sequenceIndex = 0;
this.sectionStart = this.index + 1;
} else if (this.isTagStartChar(c)) {
const lower = c | 0x20;
Expand Down Expand Up @@ -443,7 +463,7 @@ export default class Tokenizer {
}
}
private stateInClosingTagName(c: number): void {
if (c === CharCodes.Gt || isWhitespace(c)) {
if (isEndOfTagSection(c)) {
this.cbs.onclosetag(this.sectionStart, this.index);
this.sectionStart = -1;
this.state = State.AfterClosingTagName;
Expand Down Expand Up @@ -574,7 +594,25 @@ export default class Tokenizer {
}
}
private stateInProcessingInstruction(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
if (this.xmlMode) {
if (c === CharCodes.Questionmark) {
// Remember that we just consumed `?`, so the next `>` closes the PI.
this.sequenceIndex = 1;
} else if (c === CharCodes.Gt && this.sequenceIndex === 1) {
this.cbs.onprocessinginstruction(
this.sectionStart,
this.index - 1,
);
this.sequenceIndex = 0;
this.state = State.Text;
this.sectionStart = this.index + 1;
} else {
// Keep scanning for the next `?`, which can start a closing `?>`.
this.sequenceIndex = Number(
this.fastForwardTo(CharCodes.Questionmark),
);
}
} else if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.onprocessinginstruction(this.sectionStart, this.index);
this.state = State.Text;
this.sectionStart = this.index + 1;
Expand Down
Loading
Loading