From 30fb3c843365c8dfd923ab7a0c173ccdbd439e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 10 Nov 2021 20:20:18 +0000 Subject: [PATCH 01/12] refactor(tokenizer): Introduce sequences --- src/Tokenizer.ts | 104 ++++++++++++++++++++--------------------------- 1 file changed, 45 insertions(+), 59 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 1651b2e3..abace737 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -76,12 +76,6 @@ const enum State { AfterComment2, // Cdata - BeforeCdata1, // [ - BeforeCdata2, // C - BeforeCdata3, // D - BeforeCdata4, // A - BeforeCdata5, // T - BeforeCdata6, // A InCdata, // [ AfterCdata1, // ] AfterCdata2, // ] @@ -126,6 +120,9 @@ const enum State { InNamedEntity, InNumericEntity, InHexEntity, // X + + // Sequences + CDATASequence, } const enum Special { @@ -183,31 +180,17 @@ function ifElseState(upper: string, SUCCESS: State, FAILURE: State) { }; } -const stateBeforeCdata1 = ifElseState( - "C", - State.BeforeCdata2, - State.InDeclaration -); -const stateBeforeCdata2 = ifElseState( - "D", - State.BeforeCdata3, - State.InDeclaration -); -const stateBeforeCdata3 = ifElseState( - "A", - State.BeforeCdata4, - State.InDeclaration -); -const stateBeforeCdata4 = ifElseState( - "T", - State.BeforeCdata5, - State.InDeclaration -); -const stateBeforeCdata5 = ifElseState( - "A", - State.BeforeCdata6, - State.InDeclaration -); +const SEQUENCES = { + CDATA: new Uint16Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[ + SCRIPT: new Uint16Array([0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `script` + SCRIPT_END: new Uint16Array([ + 0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, + ]), // ` +}; const stateBeforeScript1 = ifElseState( "R", @@ -376,6 +359,21 @@ export default class Tokenizer { this.sectionStart = this._index; } } + private sequenceIndex = 0; + private stateCDATASequence(c: number) { + if (c === SEQUENCES.CDATA[this.sequenceIndex]) { + if (++this.sequenceIndex === SEQUENCES.CDATA.length) { + this._state = State.InCdata; + this.sectionStart = this._index + 1; + return; + } + } else { + this.sequenceIndex = 0; + this._state = State.InDeclaration; + this.stateInDeclaration(c); // Reconsume the character + } + } + /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * @@ -570,12 +568,15 @@ export default class Tokenizer { } } private stateBeforeDeclaration(c: number) { - this._state = - c === CharCodes.OpeningSquareBracket - ? State.BeforeCdata1 - : c === CharCodes.Dash - ? State.BeforeComment - : State.InDeclaration; + if (c === CharCodes.OpeningSquareBracket) { + this._state = State.CDATASequence; + this.sequenceIndex = 0; + } else { + this._state = + c === CharCodes.Dash + ? State.BeforeComment + : State.InDeclaration; + } } private stateInDeclaration(c: number) { if (c === CharCodes.Gt) { @@ -631,15 +632,6 @@ export default class Tokenizer { } // Else: stay in AFTER_COMMENT_2 (`--->`) } - private stateBeforeCdata6(c: number) { - if (c === CharCodes.OpeningSquareBracket) { - this._state = State.InCdata; - this.sectionStart = this._index + 1; - } else { - this._state = State.InDeclaration; - this.stateInDeclaration(c); - } - } private stateInCdata(c: number) { if (c === CharCodes.ClosingSquareBracket) this._state = State.AfterCdata1; @@ -843,16 +835,22 @@ export default class Tokenizer { } } + private shouldContinue() { + return this._index < this.buffer.length && this.running; + } + /** * Iterates through the buffer, calling the function corresponding to the current state. * * States that are more likely to be hit are higher up, as a performance improvement. */ private parse() { - while (this._index < this.buffer.length && this.running) { + while (this.shouldContinue()) { const c = this.buffer.charCodeAt(this._index); if (this._state === State.Text) { this.stateText(c); + } else if (this._state === State.CDATASequence) { + this.stateCDATASequence(c); } else if (this._state === State.InAttributeValueDq) { this.stateInAttributeValueDoubleQuotes(c); } else if (this._state === State.InAttributeName) { @@ -959,24 +957,12 @@ export default class Tokenizer { this.stateInProcessingInstruction(c); } else if (this._state === State.InNamedEntity) { this.stateInNamedEntity(c); - } else if (this._state === State.BeforeCdata1) { - stateBeforeCdata1(this, c); } else if (this._state === State.BeforeEntity) { this.stateBeforeEntity(c); - } else if (this._state === State.BeforeCdata2) { - stateBeforeCdata2(this, c); - } else if (this._state === State.BeforeCdata3) { - stateBeforeCdata3(this, c); } else if (this._state === State.AfterCdata1) { this.stateAfterCdata1(c); } else if (this._state === State.AfterCdata2) { this.stateAfterCdata2(c); - } else if (this._state === State.BeforeCdata4) { - stateBeforeCdata4(this, c); - } else if (this._state === State.BeforeCdata5) { - stateBeforeCdata5(this, c); - } else if (this._state === State.BeforeCdata6) { - this.stateBeforeCdata6(c); } else if (this._state === State.InHexEntity) { this.stateInHexEntity(c); } else if (this._state === State.InNumericEntity) { From 3a55fa0983c69f31fd984b96ddd82c27bc1621c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 10 Nov 2021 20:48:09 +0000 Subject: [PATCH 02/12] Add `specialStartSequence` --- src/Tokenizer.ts | 147 +++++++++++++++++------------------------------ 1 file changed, 52 insertions(+), 95 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index abace737..a4a7b92b 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -84,32 +84,18 @@ const enum State { BeforeSpecialS, // S BeforeSpecialSEnd, // S - BeforeScript1, // C - BeforeScript2, // R - BeforeScript3, // I - BeforeScript4, // P - BeforeScript5, // T AfterScript1, // C AfterScript2, // R AfterScript3, // I AfterScript4, // P AfterScript5, // T - BeforeStyle1, // T - BeforeStyle2, // Y - BeforeStyle3, // L - BeforeStyle4, // E AfterStyle1, // T AfterStyle2, // Y AfterStyle3, // L AfterStyle4, // E - BeforeSpecialT, // T BeforeSpecialTEnd, // T - BeforeTitle1, // I - BeforeTitle2, // T - BeforeTitle3, // L - BeforeTitle4, // E AfterTitle1, // I AfterTitle2, // T AfterTitle3, // L @@ -123,6 +109,8 @@ const enum State { // Sequences CDATASequence, + SpecialStartSequence, + SpecialEndSequence, } const enum Special { @@ -192,49 +180,15 @@ const SEQUENCES = { CDATA_END: new Uint16Array([0x5d, 0x5d, 0x3e]), // ]]> }; -const stateBeforeScript1 = ifElseState( - "R", - State.BeforeScript2, - State.InTagName -); -const stateBeforeScript2 = ifElseState( - "I", - State.BeforeScript3, - State.InTagName -); -const stateBeforeScript3 = ifElseState( - "P", - State.BeforeScript4, - State.InTagName -); -const stateBeforeScript4 = ifElseState( - "T", - State.BeforeScript5, - State.InTagName -); - const stateAfterScript1 = ifElseState("R", State.AfterScript2, State.Text); const stateAfterScript2 = ifElseState("I", State.AfterScript3, State.Text); const stateAfterScript3 = ifElseState("P", State.AfterScript4, State.Text); const stateAfterScript4 = ifElseState("T", State.AfterScript5, State.Text); -const stateBeforeStyle1 = ifElseState("Y", State.BeforeStyle2, State.InTagName); -const stateBeforeStyle2 = ifElseState("L", State.BeforeStyle3, State.InTagName); -const stateBeforeStyle3 = ifElseState("E", State.BeforeStyle4, State.InTagName); - const stateAfterStyle1 = ifElseState("Y", State.AfterStyle2, State.Text); const stateAfterStyle2 = ifElseState("L", State.AfterStyle3, State.Text); const stateAfterStyle3 = ifElseState("E", State.AfterStyle4, State.Text); -const stateBeforeSpecialT = ifElseState( - "I", - State.BeforeTitle1, - State.InTagName -); -const stateBeforeTitle1 = ifElseState("T", State.BeforeTitle2, State.InTagName); -const stateBeforeTitle2 = ifElseState("L", State.BeforeTitle3, State.InTagName); -const stateBeforeTitle3 = ifElseState("E", State.BeforeTitle4, State.InTagName); - const stateBeforeSpecialTEnd = ifElseState("I", State.AfterTitle1, State.Text); const stateAfterTitle1 = ifElseState("T", State.AfterTitle2, State.Text); const stateAfterTitle2 = ifElseState("L", State.AfterTitle3, State.Text); @@ -359,16 +313,37 @@ export default class Tokenizer { this.sectionStart = this._index; } } + + private currentSequence!: Uint16Array; private sequenceIndex = 0; + private stateSpecialStartSequence(c: number) { + const isMatch = + this.sequenceIndex === this.currentSequence.length + ? // If we are at the end of the sequence, make sure the tag name has ended + c === CharCodes.Slash || c === CharCodes.Gt || whitespace(c) + : // Otherwise, do a case-insensitive comparison + (c | 0x20) === this.currentSequence[this.sequenceIndex]; + + if (!isMatch) { + this.special = Special.None; + } else if (this.sequenceIndex < this.currentSequence.length) { + this.sequenceIndex++; + return; + } + + this._state = State.InTagName; + this.stateInTagName(c); // Reconsume the character + } + + private stateSpecialEndSequence(_c: number) {} + private stateCDATASequence(c: number) { if (c === SEQUENCES.CDATA[this.sequenceIndex]) { if (++this.sequenceIndex === SEQUENCES.CDATA.length) { this._state = State.InCdata; this.sectionStart = this._index + 1; - return; } } else { - this.sequenceIndex = 0; this._state = State.InDeclaration; this.stateInDeclaration(c); // Reconsume the character } @@ -410,15 +385,22 @@ export default class Tokenizer { } else if (!this.isTagStartChar(c)) { this._state = State.Text; } else { - this._state = - !this.xmlMode && - (c === CharCodes.LowerS || c === CharCodes.UpperS) - ? State.BeforeSpecialS - : !this.xmlMode && - (c === CharCodes.LowerT || c === CharCodes.UpperT) - ? State.BeforeSpecialT - : State.InTagName; this.sectionStart = this._index; + if ( + !this.xmlMode && + (c === CharCodes.LowerT || c === CharCodes.UpperT) + ) { + this.special = Special.Title; + this.currentSequence = SEQUENCES.TITLE; + this.sequenceIndex = 1; + this._state = State.SpecialStartSequence; + } else { + this._state = + !this.xmlMode && + (c === CharCodes.LowerS || c === CharCodes.UpperS) + ? State.BeforeSpecialS + : State.InTagName; + } } } private stateInTagName(c: number) { @@ -656,9 +638,15 @@ export default class Tokenizer { } private stateBeforeSpecialS(c: number) { if (c === CharCodes.LowerC || c === CharCodes.UpperC) { - this._state = State.BeforeScript1; + this.special = Special.Script; + this.currentSequence = SEQUENCES.SCRIPT; + this.sequenceIndex = 2; + this._state = State.SpecialStartSequence; } else if (c === CharCodes.LowerT || c === CharCodes.UpperT) { - this._state = State.BeforeStyle1; + this.special = Special.Style; + this.currentSequence = SEQUENCES.STYLE; + this.sequenceIndex = 2; + this._state = State.SpecialStartSequence; } else { this._state = State.InTagName; this.stateInTagName(c); // Consume the token again @@ -677,13 +665,6 @@ export default class Tokenizer { this._state = State.AfterStyle1; } else this._state = State.Text; } - private stateBeforeSpecialLast(c: number, special: Special) { - if (c === CharCodes.Slash || c === CharCodes.Gt || whitespace(c)) { - this.special = special; - } - this._state = State.InTagName; - this.stateInTagName(c); // Consume the token again - } private stateAfterSpecialLast(c: number, sectionStartOffset: number) { if (c === CharCodes.Gt || whitespace(c)) { this.sectionStart = this._index - sectionStartOffset; @@ -849,6 +830,10 @@ export default class Tokenizer { const c = this.buffer.charCodeAt(this._index); if (this._state === State.Text) { this.stateText(c); + } else if (this._state === State.SpecialStartSequence) { + this.stateSpecialStartSequence(c); + } else if (this._state === State.SpecialEndSequence) { + this.stateSpecialEndSequence(c); } else if (this._state === State.CDATASequence) { this.stateCDATASequence(c); } else if (this._state === State.InAttributeValueDq) { @@ -903,30 +888,12 @@ export default class Tokenizer { stateAfterScript2(this, c); } else if (this._state === State.AfterScript3) { stateAfterScript3(this, c); - } else if (this._state === State.BeforeScript1) { - stateBeforeScript1(this, c); - } else if (this._state === State.BeforeScript2) { - stateBeforeScript2(this, c); - } else if (this._state === State.BeforeScript3) { - stateBeforeScript3(this, c); - } else if (this._state === State.BeforeScript4) { - stateBeforeScript4(this, c); - } else if (this._state === State.BeforeScript5) { - this.stateBeforeSpecialLast(c, Special.Script); } else if (this._state === State.AfterScript4) { stateAfterScript4(this, c); } else if (this._state === State.AfterScript5) { this.stateAfterSpecialLast(c, 6); - } else if (this._state === State.BeforeStyle1) { - stateBeforeStyle1(this, c); } else if (this._state === State.InCdata) { this.stateInCdata(c); - } else if (this._state === State.BeforeStyle2) { - stateBeforeStyle2(this, c); - } else if (this._state === State.BeforeStyle3) { - stateBeforeStyle3(this, c); - } else if (this._state === State.BeforeStyle4) { - this.stateBeforeSpecialLast(c, Special.Style); } else if (this._state === State.AfterStyle1) { stateAfterStyle1(this, c); } else if (this._state === State.AfterStyle2) { @@ -935,16 +902,6 @@ export default class Tokenizer { stateAfterStyle3(this, c); } else if (this._state === State.AfterStyle4) { this.stateAfterSpecialLast(c, 5); - } else if (this._state === State.BeforeSpecialT) { - stateBeforeSpecialT(this, c); - } else if (this._state === State.BeforeTitle1) { - stateBeforeTitle1(this, c); - } else if (this._state === State.BeforeTitle2) { - stateBeforeTitle2(this, c); - } else if (this._state === State.BeforeTitle3) { - stateBeforeTitle3(this, c); - } else if (this._state === State.BeforeTitle4) { - this.stateBeforeSpecialLast(c, Special.Title); } else if (this._state === State.AfterTitle1) { stateAfterTitle1(this, c); } else if (this._state === State.AfterTitle2) { From 422a0b743a5e9c353e343f2febd9d660b810dc40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 10 Nov 2021 21:54:29 +0000 Subject: [PATCH 03/12] Add `stateInCommentLike` --- src/Tokenizer.ts | 150 ++++++++++++++++++++++------------------------- 1 file changed, 71 insertions(+), 79 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index a4a7b92b..9e71740a 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -70,15 +70,8 @@ const enum State { // Comments BeforeComment, - InComment, InSpecialComment, - AfterComment1, - AfterComment2, - - // Cdata - InCdata, // [ - AfterCdata1, // ] - AfterCdata2, // ] + InCommentLike, // Special tags BeforeSpecialS, // S @@ -177,6 +170,7 @@ const SEQUENCES = { STYLE: new Uint16Array([0x73, 0x74, 0x79, 0x6c, 0x65]), // `style` TITLE: new Uint16Array([0x74, 0x69, 0x74, 0x6c, 0x65]), // `title` + COMMENT_END: new Uint16Array([0x2d, 0x2d, 0x3e]), // `-->` CDATA_END: new Uint16Array([0x5d, 0x5d, 0x3e]), // ]]> }; @@ -340,7 +334,9 @@ export default class Tokenizer { private stateCDATASequence(c: number) { if (c === SEQUENCES.CDATA[this.sequenceIndex]) { if (++this.sequenceIndex === SEQUENCES.CDATA.length) { - this._state = State.InCdata; + this._state = State.InCommentLike; + this.currentSequence = SEQUENCES.CDATA_END; + this.sequenceIndex = 0; this.sectionStart = this._index + 1; } } else { @@ -349,6 +345,60 @@ export default class Tokenizer { } } + /** + * When we wait for one specific character, we can speed things up + * by skipping through the buffer until we find it. + * + * @returns Whether the character was found. + */ + private fastForwardTo(_c: number): boolean { + // TODO: Refactor `parse` to increment index before calling states. + while (this._index < this.buffer.length - 1) { + if (this.buffer.charCodeAt(this._index) === _c) { + return true; + } + this._index++; + } + return false; + } + + /** + * Comments and CDATA end with `-->` and `]]>`. + * + * Their common qualities are: + * - Their end sequences have a distinct character they start with. + * - That character is then repeated, so we have to check multiple repeats. + * - All characters but the start character of the sequence can be skipped. + */ + private stateInCommentLike(c: number) { + if (c === this.currentSequence[this.sequenceIndex]) { + if (++this.sequenceIndex === this.currentSequence.length) { + // Remove 2 trailing chars + const section = this.buffer.slice( + this.sectionStart, + this._index - 2 + ); + + if (this.currentSequence === SEQUENCES.CDATA_END) { + this.cbs.oncdata(section); + } else { + this.cbs.oncomment(section); + } + + this.sectionStart = this._index + 1; + this._state = State.Text; + } + } else if (this.sequenceIndex === 0) { + // Fast-forward to the first character of the sequence + if (this.fastForwardTo(this.currentSequence[0])) { + this.sequenceIndex = 1; + } + } else if (c !== this.currentSequence[this.sequenceIndex - 1]) { + // Allow long sequences, eg. --->, ]]]> + this.sequenceIndex = 0; + } + } + /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * @@ -576,15 +626,15 @@ export default class Tokenizer { } private stateBeforeComment(c: number) { if (c === CharCodes.Dash) { - this._state = State.InComment; + this._state = State.InCommentLike; + this.currentSequence = SEQUENCES.COMMENT_END; + // Allow short comments (eg. ) + this.sequenceIndex = 2; this.sectionStart = this._index + 1; } else { this._state = State.InDeclaration; } } - private stateInComment(c: number) { - if (c === CharCodes.Dash) this._state = State.AfterComment1; - } private stateInSpecialComment(c: number) { if (c === CharCodes.Gt) { this.cbs.oncomment( @@ -594,48 +644,6 @@ export default class Tokenizer { this.sectionStart = this._index + 1; } } - private stateAfterComment1(c: number) { - if (c === CharCodes.Dash) { - this._state = State.AfterComment2; - } else { - this._state = State.InComment; - } - } - private stateAfterComment2(c: number) { - if (c === CharCodes.Gt) { - // Remove 2 trailing chars - this.cbs.oncomment( - this.buffer.substring(this.sectionStart, this._index - 2) - ); - this._state = State.Text; - this.sectionStart = this._index + 1; - } else if (c !== CharCodes.Dash) { - this._state = State.InComment; - } - // Else: stay in AFTER_COMMENT_2 (`--->`) - } - private stateInCdata(c: number) { - if (c === CharCodes.ClosingSquareBracket) - this._state = State.AfterCdata1; - } - private stateAfterCdata1(c: number) { - if (c === CharCodes.ClosingSquareBracket) - this._state = State.AfterCdata2; - else this._state = State.InCdata; - } - private stateAfterCdata2(c: number) { - if (c === CharCodes.Gt) { - // Remove 2 trailing chars - this.cbs.oncdata( - this.buffer.substring(this.sectionStart, this._index - 2) - ); - this._state = State.Text; - this.sectionStart = this._index + 1; - } else if (c !== CharCodes.ClosingSquareBracket) { - this._state = State.InCdata; - } - // Else: stay in AFTER_CDATA_2 (`]]]>`) - } private stateBeforeSpecialS(c: number) { if (c === CharCodes.LowerC || c === CharCodes.UpperC) { this.special = Special.Script; @@ -840,8 +848,8 @@ export default class Tokenizer { this.stateInAttributeValueDoubleQuotes(c); } else if (this._state === State.InAttributeName) { this.stateInAttributeName(c); - } else if (this._state === State.InComment) { - this.stateInComment(c); + } else if (this._state === State.InCommentLike) { + this.stateInCommentLike(c); } else if (this._state === State.InSpecialComment) { this.stateInSpecialComment(c); } else if (this._state === State.BeforeAttributeName) { @@ -864,8 +872,6 @@ export default class Tokenizer { this.stateAfterClosingTagName(c); } else if (this._state === State.BeforeSpecialS) { this.stateBeforeSpecialS(c); - } else if (this._state === State.AfterComment1) { - this.stateAfterComment1(c); } else if (this._state === State.InAttributeValueNq) { this.stateInAttributeValueNoQuotes(c); } else if (this._state === State.InSelfClosingTag) { @@ -874,8 +880,6 @@ export default class Tokenizer { this.stateInDeclaration(c); } else if (this._state === State.BeforeDeclaration) { this.stateBeforeDeclaration(c); - } else if (this._state === State.AfterComment2) { - this.stateAfterComment2(c); } else if (this._state === State.BeforeComment) { this.stateBeforeComment(c); } else if (this._state === State.BeforeSpecialSEnd) { @@ -892,8 +896,6 @@ export default class Tokenizer { stateAfterScript4(this, c); } else if (this._state === State.AfterScript5) { this.stateAfterSpecialLast(c, 6); - } else if (this._state === State.InCdata) { - this.stateInCdata(c); } else if (this._state === State.AfterStyle1) { stateAfterStyle1(this, c); } else if (this._state === State.AfterStyle2) { @@ -916,10 +918,6 @@ export default class Tokenizer { this.stateInNamedEntity(c); } else if (this._state === State.BeforeEntity) { this.stateBeforeEntity(c); - } else if (this._state === State.AfterCdata1) { - this.stateAfterCdata1(c); - } else if (this._state === State.AfterCdata2) { - this.stateAfterCdata2(c); } else if (this._state === State.InHexEntity) { this.stateInHexEntity(c); } else if (this._state === State.InNumericEntity) { @@ -944,18 +942,12 @@ export default class Tokenizer { /** Handle any trailing data. */ private handleTrailingData() { const data = this.buffer.substr(this.sectionStart); - if ( - this._state === State.InCdata || - this._state === State.AfterCdata1 || - this._state === State.AfterCdata2 - ) { - this.cbs.oncdata(data); - } else if ( - this._state === State.InComment || - this._state === State.AfterComment1 || - this._state === State.AfterComment2 - ) { - this.cbs.oncomment(data); + if (this._state === State.InCommentLike) { + if (this.currentSequence === SEQUENCES.CDATA_END) { + this.cbs.oncdata(data); + } else { + this.cbs.oncomment(data); + } } else if (this._state === State.InNamedEntity && !this.xmlMode) { // Increase excess for EOF this.trieExcess++; From 198b21c5c011a13bd92776a754af143e4211e3c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 10 Nov 2021 22:34:01 +0000 Subject: [PATCH 04/12] Fast forward in more places --- src/Tokenizer.ts | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 3b362f2b..29df46e0 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -320,7 +320,7 @@ export default class Tokenizer { const isMatch = this.sequenceIndex === this.currentSequence.length ? // If we are at the end of the sequence, make sure the tag name has ended - c === CharCodes.Slash || c === CharCodes.Gt || whitespace(c) + endOfTagSectionChars.has(c) : // Otherwise, do a case-insensitive comparison (c | 0x20) === this.currentSequence[this.sequenceIndex]; @@ -357,14 +357,21 @@ export default class Tokenizer { * * @returns Whether the character was found. */ - private fastForwardTo(_c: number): boolean { - // TODO: Refactor `parse` to increment index before calling states. - while (this._index < this.buffer.length - 1) { - if (this.buffer.charCodeAt(this._index) === _c) { + private fastForwardTo(c: number): boolean { + while (++this._index < this.buffer.length) { + if (this.buffer.charCodeAt(this._index) === c) { return true; } - this._index++; } + + /* + * We increment the index at the end of the `parse` loop, + * so set it to `buffer.length - 1` here. + * + * TODO: Refactor `parse` to increment index before calling states. + */ + this._index = this.buffer.length - 1; + return false; } @@ -501,7 +508,7 @@ export default class Tokenizer { } private stateAfterClosingTagName(c: number) { // Skip everything until ">" - if (c === CharCodes.Gt) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this._state = State.Text; this.sectionStart = this._index + 1; } @@ -608,14 +615,14 @@ export default class Tokenizer { } } private stateInDeclaration(c: number) { - if (c === CharCodes.Gt) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.ondeclaration(this.getSection()); this._state = State.Text; this.sectionStart = this._index + 1; } } private stateInProcessingInstruction(c: number) { - if (c === CharCodes.Gt) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.onprocessinginstruction(this.getSection()); this._state = State.Text; this.sectionStart = this._index + 1; @@ -633,7 +640,7 @@ export default class Tokenizer { } } private stateInSpecialComment(c: number) { - if (c === CharCodes.Gt) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.oncomment(this.getSection()); this._state = State.Text; this.sectionStart = this._index + 1; From fccfb031dbd1ce5d6ce0e80871b73161a9bba1f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 10 Nov 2021 23:25:04 +0000 Subject: [PATCH 05/12] And some more places, when entities don't matter --- src/Tokenizer.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 29df46e0..97ec283f 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -294,7 +294,10 @@ export default class Tokenizer { } private stateText(c: number) { - if (c === CharCodes.Lt) { + if ( + c === CharCodes.Lt || + (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) + ) { if (this._index > this.sectionStart) { this.cbs.ontext(this.getSection()); } @@ -571,7 +574,10 @@ export default class Tokenizer { } } private handleInAttributeValue(c: number, quote: number) { - if (c === quote) { + if ( + c === quote || + (!this.decodeEntities && this.fastForwardTo(quote)) + ) { this.cbs.onattribdata(this.getSection()); this.sectionStart = -1; this.cbs.onattribend(String.fromCharCode(quote)); From 33ae3f3922e542192e75c7aef1bc19d80658de98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 11 Nov 2021 00:00:49 +0000 Subject: [PATCH 06/12] Introduce `InSpecialTag` --- src/Tokenizer.ts | 227 ++++++++++++++++++++--------------------------- 1 file changed, 96 insertions(+), 131 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 97ec283f..a0421944 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -26,11 +26,8 @@ const enum CharCodes { Eq = 0x3d, // "=" Gt = 0x3e, // ">" Questionmark = 0x3f, // "?" - UpperC = 0x43, // "C" LowerC = 0x63, // "c" - UpperS = 0x53, // "S" LowerS = 0x73, // "s" - UpperT = 0x54, // "T" LowerT = 0x74, // "t" UpperA = 0x41, // "A" LowerA = 0x61, // "a" @@ -75,24 +72,7 @@ const enum State { // Special tags BeforeSpecialS, // S - BeforeSpecialSEnd, // S - - AfterScript1, // C - AfterScript2, // R - AfterScript3, // I - AfterScript4, // P - AfterScript5, // T - - AfterStyle1, // T - AfterStyle2, // Y - AfterStyle3, // L - AfterStyle4, // E - - BeforeSpecialTEnd, // T - AfterTitle1, // I - AfterTitle2, // T - AfterTitle3, // L - AfterTitle4, // E + InSpecialTag, BeforeEntity, // & BeforeNumericEntity, // # @@ -103,7 +83,6 @@ const enum State { // Sequences CDATASequence, SpecialStartSequence, - SpecialEndSequence, } const enum Special { @@ -174,26 +153,14 @@ const SEQUENCES = { 0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, ]), // `` CDATA_END: new Uint16Array([0x5d, 0x5d, 0x3e]), // ]]> }; -const stateAfterScript1 = ifElseState("R", State.AfterScript2, State.Text); -const stateAfterScript2 = ifElseState("I", State.AfterScript3, State.Text); -const stateAfterScript3 = ifElseState("P", State.AfterScript4, State.Text); -const stateAfterScript4 = ifElseState("T", State.AfterScript5, State.Text); - -const stateAfterStyle1 = ifElseState("Y", State.AfterStyle2, State.Text); -const stateAfterStyle2 = ifElseState("L", State.AfterStyle3, State.Text); -const stateAfterStyle3 = ifElseState("E", State.AfterStyle4, State.Text); - -const stateBeforeSpecialTEnd = ifElseState("I", State.AfterTitle1, State.Text); -const stateAfterTitle1 = ifElseState("T", State.AfterTitle2, State.Text); -const stateAfterTitle2 = ifElseState("L", State.AfterTitle3, State.Text); -const stateAfterTitle3 = ifElseState("E", State.AfterTitle4, State.Text); - const stateBeforeNumericEntity = ifElseState( "X", State.InHexEntity, @@ -303,11 +270,7 @@ export default class Tokenizer { } this._state = State.BeforeTagName; this.sectionStart = this._index; - } else if ( - this.decodeEntities && - c === CharCodes.Amp && - (this.special === Special.None || this.special === Special.Title) - ) { + } else if (this.decodeEntities && c === CharCodes.Amp) { if (this._index > this.sectionStart) { this.cbs.ontext(this.getSection()); } @@ -320,25 +283,69 @@ export default class Tokenizer { private currentSequence!: Uint16Array; private sequenceIndex = 0; private stateSpecialStartSequence(c: number) { - const isMatch = - this.sequenceIndex === this.currentSequence.length - ? // If we are at the end of the sequence, make sure the tag name has ended - endOfTagSectionChars.has(c) - : // Otherwise, do a case-insensitive comparison - (c | 0x20) === this.currentSequence[this.sequenceIndex]; + const isEnd = this.sequenceIndex === this.currentSequence.length; + const isMatch = isEnd + ? // If we are at the end of the sequence, make sure the tag name has ended + endOfTagSectionChars.has(c) + : // Otherwise, do a case-insensitive comparison + (c | 0x20) === this.currentSequence[this.sequenceIndex]; if (!isMatch) { this.special = Special.None; - } else if (this.sequenceIndex < this.currentSequence.length) { + } else if (!isEnd) { this.sequenceIndex++; return; } this._state = State.InTagName; - this.stateInTagName(c); // Reconsume the character + this.stateInTagName(c); } - private stateSpecialEndSequence(_c: number) {} + /** Look for an end tag. For