diff --git a/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.test.js b/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.test.js index 91edee6a18..0e73018ea2 100644 --- a/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.test.js +++ b/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.test.js @@ -64,6 +64,29 @@ const buildInlineNodeToken = (attrs = {}, type = { name: 'link' }, pos = 0) => { }; }; +/** + * Builds a mock image inline-node token for diff tests; one shallow copy of `attrs` is shared by `node.attrs`, `node.toJSON()` and `nodeJSON`. + * + * @param {Record} attrs Image node attributes (shallow-copied, never mutated). + * @param {number} pos Position offset for the image node. + * @returns {import('./inline-diffing.ts').InlineNodeToken} + */ +const buildImageNodeToken = (attrs = {}, pos = 0) => { + const nodeAttrs = { ...attrs }; + const type = { name: 'image' }; + return { + kind: 'inlineNode', + nodeType: 'image', + node: { + type, + attrs: nodeAttrs, + toJSON: () => ({ type: 'image', attrs: nodeAttrs }), + }, + nodeJSON: { type: 'image', attrs: nodeAttrs }, + pos, + }; +}; + /** * Builds text tokens without offsets for tokenizer assertions. 
* @@ -426,3 +449,131 @@ describe('tokenizeInlineContent', () => { expect(tokens[5]?.offset).toBe(16); }); }); + +describe('image semantic normalization in inline diff', () => { + it('produces no diff when images differ only in volatile originalAttributes', () => { + const baseAttrs = { + src: 'image1.png', + size: { width: 100, height: 50 }, + originalAttributes: { + 'wp14:anchorId': 'AAAA1111', + 'wp14:editId': 'BBBB2222', + cx: '914400', + }, + }; + const changedAttrs = { + src: 'image1.png', + size: { width: 100, height: 50 }, + originalAttributes: { + 'wp14:anchorId': 'CCCC3333', + 'wp14:editId': 'DDDD4444', + cx: '914400', + }, + }; + + const oldToken = buildImageNodeToken(baseAttrs, 5); + const newToken = buildImageNodeToken(changedAttrs, 5); + + const diffs = getInlineDiff([oldToken], [newToken], 6); + expect(diffs).toEqual([]); + }); + + it('detects a real image change even when volatile attrs also differ', () => { + const oldAttrs = { + src: 'old-image.png', + originalAttributes: { 'wp14:anchorId': 'A1', cx: '100' }, + }; + const newAttrs = { + src: 'new-image.png', + originalAttributes: { 'wp14:anchorId': 'A2', cx: '100' }, + }; + + const oldToken = buildImageNodeToken(oldAttrs, 3); + const newToken = buildImageNodeToken(newAttrs, 3); + + const diffs = getInlineDiff([oldToken], [newToken], 4); + + expect(diffs).toHaveLength(1); + expect(diffs[0].action).toBe('modified'); + expect(diffs[0].kind).toBe('inlineNode'); + expect(diffs[0].attrsDiff?.modified).toHaveProperty('src'); + }); + + it('handles multiple images in one paragraph using type-based pairing', () => { + const mkImage = (src, anchorId, pos) => + buildImageNodeToken({ src, originalAttributes: { 'wp14:anchorId': anchorId, cx: '100' } }, pos); + + const oldTokens = [mkImage('a.png', 'ID1', 1), mkImage('b.png', 'ID2', 3)]; + const newTokens = [mkImage('a.png', 'ID3', 1), mkImage('b.png', 'ID4', 3)]; + + const diffs = getInlineDiff(oldTokens, newTokens, 5); + expect(diffs).toEqual([]); + }); + + 
it('emits a diff when one of multiple images genuinely changes', () => { + const mkImage = (src, anchorId, pos) => + buildImageNodeToken({ src, originalAttributes: { 'wp14:anchorId': anchorId } }, pos); + + const oldTokens = [mkImage('a.png', 'ID1', 1), mkImage('b.png', 'ID2', 3)]; + const newTokens = [mkImage('a.png', 'ID3', 1), mkImage('c.png', 'ID4', 3)]; + + const diffs = getInlineDiff(oldTokens, newTokens, 5); + + expect(diffs).toHaveLength(1); + expect(diffs[0].action).toBe('modified'); + expect(diffs[0].attrsDiff?.modified).toHaveProperty('src'); + }); + + it('correctly detects an image insertion when a new image is prepended', () => { + const mkImage = (src, pos) => buildImageNodeToken({ src }, pos); + + const oldTokens = [mkImage('a.png', 1), mkImage('b.png', 3)]; + const newTokens = [mkImage('x.png', 1), mkImage('a.png', 3), mkImage('b.png', 5)]; + + const diffs = getInlineDiff(oldTokens, newTokens, 5); + + // Should be a single insertion of x.png, not two modifications + addition + expect(diffs).toHaveLength(1); + expect(diffs[0].action).toBe('added'); + expect(diffs[0].kind).toBe('inlineNode'); + expect(diffs[0].nodeJSON.attrs.src).toBe('x.png'); + }); + + it('correctly detects image reordering as delete + add', () => { + const mkImage = (src, pos) => buildImageNodeToken({ src }, pos); + + const oldTokens = [mkImage('a.png', 1), mkImage('b.png', 3)]; + const newTokens = [mkImage('b.png', 1), mkImage('a.png', 3)]; + + const diffs = getInlineDiff(oldTokens, newTokens, 5); + + // Reorder produces diffs — at minimum some combination of added/deleted + expect(diffs.length).toBeGreaterThan(0); + }); + + it('excludes volatile attrs from attrsDiff when a real image change occurs', () => { + const oldAttrs = { + src: 'v1.png', + size: { width: 100 }, + originalAttributes: { 'wp14:anchorId': 'OLD', 'wp14:editId': 'OLD', cx: '100' }, + }; + const newAttrs = { + src: 'v2.png', + size: { width: 200 }, + originalAttributes: { 'wp14:anchorId': 'NEW', 'wp14:editId': 
'NEW', cx: '100' }, + }; + + const diffs = getInlineDiff([buildImageNodeToken(oldAttrs, 1)], [buildImageNodeToken(newAttrs, 1)], 2); + + expect(diffs).toHaveLength(1); + const attrsDiff = diffs[0].attrsDiff; + + // Semantic changes are reported + expect(attrsDiff?.modified).toHaveProperty('src'); + expect(attrsDiff?.modified).toHaveProperty('size.width'); + + // Volatile changes are NOT reported + expect(attrsDiff?.modified).not.toHaveProperty('originalAttributes.wp14:anchorId'); + expect(attrsDiff?.modified).not.toHaveProperty('originalAttributes.wp14:editId'); + }); +}); diff --git a/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.ts b/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.ts index 53adf8ba04..41d57ae175 100644 --- a/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.ts +++ b/packages/super-editor/src/extensions/diffing/algorithm/inline-diffing.ts @@ -1,5 +1,6 @@ import type { Node as PMNode } from 'prosemirror-model'; import { getAttributesDiff, getMarksDiff, type AttributesDiff, type MarksDiff } from './attributes-diffing'; +import { normalizeInlineNodeJSON, normalizeInlineNodeAttrs, semanticInlineNodeKey } from './semantic-normalization'; import { diffSequences } from './sequence-diffing'; type NodeJSON = ReturnType; @@ -237,7 +238,9 @@ export function getInlineDiff( buildDeleted: (token, oldIdx) => buildInlineDiff('deleted', token, oldIdx), buildModified: (oldToken, newToken, oldIdx) => { if (oldToken.kind !== 'text' && newToken.kind !== 'text') { - const attrsDiff = getAttributesDiff(oldToken.node.attrs, newToken.node.attrs); + const oldNormalized = normalizeInlineNodeAttrs(oldToken.node.type.name, oldToken.node.attrs); + const newNormalized = normalizeInlineNodeAttrs(newToken.node.type.name, newToken.node.attrs); + const attrsDiff = getAttributesDiff(oldNormalized, newNormalized); return { action: 'modified', idx: oldIdx, @@ -270,7 +273,8 @@ export function getInlineDiff( /** * 
Compares two inline tokens to decide if they can be considered equal for the Myers diff. - * Text tokens compare character equality while inline nodes compare their type. + * Text tokens compare character equality. Inline nodes compare by semantic identity + * (normalized JSON), not just type name, so that distinct images are not falsely paired. */ function inlineComparator(a: InlineDiffToken, b: InlineDiffToken): boolean { if (a.kind !== b.kind) { @@ -281,7 +285,7 @@ function inlineComparator(a: InlineDiffToken, b: InlineDiffToken): boolean { return a.char === b.char; } if (a.kind === 'inlineNode' && b.kind === 'inlineNode') { - return a.node.type.name === b.node.type.name; + return semanticInlineNodeKey(a.node) === semanticInlineNodeKey(b.node); } return false; } @@ -299,8 +303,8 @@ function shouldProcessEqualAsModification(oldToken: InlineDiffToken, newToken: I } if (oldToken.kind === 'inlineNode' && newToken.kind === 'inlineNode') { - const oldJSON = oldToken.node.toJSON(); - const newJSON = newToken.node.toJSON(); + const oldJSON = normalizeInlineNodeJSON(oldToken.node.toJSON()); + const newJSON = normalizeInlineNodeJSON(newToken.node.toJSON()); return JSON.stringify(oldJSON) !== JSON.stringify(newJSON); } diff --git a/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.test.js b/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.test.js index b6f5749fe3..38dd460976 100644 --- a/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.test.js +++ b/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.test.js @@ -7,6 +7,7 @@ import { buildModifiedParagraphDiff, canTreatAsModification, } from './paragraph-diffing.ts'; +import { semanticInlineNodeKey } from './semantic-normalization.ts'; /** * Builds text tokens without offsets for paragraph diff tests. 
@@ -35,6 +36,28 @@ const buildMarkedRuns = (text, marks, attrs = {}, offsetStart = 0) => offset: offsetStart + index, })); +/** + * Builds a mock inline image node token for paragraph diff tests. + * + * @param {Record} attrs Image node attributes. + * @param {number} pos Position offset for the image node. + * @returns {Record} + */ +const buildImageToken = (attrs = {}, pos = 0) => { + const nodeAttrs = { ...attrs }; + return { + kind: 'inlineNode', + nodeType: 'image', + node: { + type: { name: 'image' }, + attrs: nodeAttrs, + toJSON: () => ({ type: 'image', attrs: nodeAttrs }), + }, + nodeJSON: { type: 'image', attrs: nodeAttrs }, + pos, + }; +}; + /** * Creates a mock paragraph node with default attributes. * @@ -60,6 +83,21 @@ const createParagraphNode = (overrides = {}) => { * @param {Record} overrides Overrides for the snapshot. * @returns {Record} */ +/** + * Derives a content signature from tokens, matching the real buildContentSignature logic. + * Text tokens contribute their char; inline node tokens contribute a normalized JSON key. + */ +const deriveContentSignature = (tokens) => + tokens + .map((token) => { + if (token.kind === 'text') return token.char; + if (token.kind === 'inlineNode' && token.node) { + return `\0${semanticInlineNodeKey(token.node)}\0`; + } + return ''; + }) + .join(''); + const createParagraphInfo = (overrides = {}) => { const fullText = overrides.fullText ?? 'text'; const paragraphPos = overrides.pos ?? 0; @@ -79,6 +117,8 @@ const createParagraphInfo = (overrides = {}) => { return token; }); + const contentSignature = overrides.contentSignature ?? deriveContentSignature(textTokens); + return { node: createParagraphNode(overrides.node), pos: paragraphPos, @@ -86,6 +126,7 @@ const createParagraphInfo = (overrides = {}) => { fullText, text: textTokens, endPos: overrides.endPos ?? 
paragraphPos + 1 + fullText.length, + contentSignature, ...overrides, }; }; @@ -244,6 +285,183 @@ describe('paragraph diff builders', () => { }); }); +describe('image paragraph semantic normalization', () => { + it('does not emit a modification when only volatile attrs differ on an image paragraph', () => { + const makeImageParagraphNode = (paraId, rsidR, anchorId, editId) => + createParagraphNode({ + attrs: { paraId, rsidR, align: 'left' }, + toJSON: () => ({ + type: 'paragraph', + attrs: { paraId, rsidR, align: 'left' }, + content: [ + { + type: 'run', + attrs: {}, + content: [ + { + type: 'image', + attrs: { + src: 'photo.png', + originalAttributes: { + 'wp14:anchorId': anchorId, + 'wp14:editId': editId, + cx: '914400', + }, + }, + }, + ], + }, + ], + }), + }); + + const oldInfo = createParagraphInfo({ + node: makeImageParagraphNode('P1', 'R1', 'ANC1', 'EDT1'), + fullText: '', + text: [], + }); + const newInfo = createParagraphInfo({ + node: makeImageParagraphNode('P1', 'R2', 'ANC2', 'EDT2'), + fullText: '', + text: [], + }); + + expect(shouldProcessEqualAsModification(oldInfo, newInfo)).toBe(false); + }); + + it('emits a modification when semantic image attrs change alongside volatile attrs', () => { + const makeNode = (paraId, rsidR, src, anchorId) => + createParagraphNode({ + attrs: { paraId, rsidR }, + toJSON: () => ({ + type: 'paragraph', + attrs: { paraId, rsidR }, + content: [ + { + type: 'run', + attrs: {}, + content: [ + { + type: 'image', + attrs: { + src, + originalAttributes: { 'wp14:anchorId': anchorId }, + }, + }, + ], + }, + ], + }), + }); + + const oldInfo = createParagraphInfo({ + node: makeNode('P1', 'R1', 'old.png', 'ANC1'), + fullText: '', + text: [], + }); + const newInfo = createParagraphInfo({ + node: makeNode('P1', 'R2', 'new.png', 'ANC2'), + fullText: '', + text: [], + }); + + expect(shouldProcessEqualAsModification(oldInfo, newInfo)).toBe(true); + }); + + it('does not report volatile paragraph attrs as a modification diff', () => { + 
const oldParagraph = createParagraphInfo({ + node: createParagraphNode({ attrs: { paraId: 'A', rsidR: '001', align: 'left' } }), + fullText: 'same', + }); + const newParagraph = createParagraphInfo({ + node: createParagraphNode({ attrs: { paraId: 'B', rsidR: '002', align: 'left' } }), + fullText: 'same', + }); + + const diff = buildModifiedParagraphDiff(oldParagraph, newParagraph); + expect(diff).toBeNull(); + }); + + it('still detects semantic paragraph attr changes', () => { + const oldParagraph = createParagraphInfo({ + node: createParagraphNode({ attrs: { paraId: 'A', rsidR: '001', align: 'left' } }), + }); + const newParagraph = createParagraphInfo({ + node: createParagraphNode({ attrs: { paraId: 'B', rsidR: '002', align: 'center' } }), + }); + + const diff = buildModifiedParagraphDiff(oldParagraph, newParagraph); + expect(diff).not.toBeNull(); + expect(diff.attrsDiff?.modified).toHaveProperty('align'); + expect(diff.attrsDiff?.modified).not.toHaveProperty('paraId'); + expect(diff.attrsDiff?.modified).not.toHaveProperty('rsidR'); + }); + + it('text paragraphs are unaffected by normalization', () => { + const oldParagraph = createParagraphInfo({ + fullText: 'hello', + text: buildRuns('hello'), + node: createParagraphNode({ attrs: { align: 'left' } }), + }); + const newParagraph = createParagraphInfo({ + fullText: 'world', + text: buildRuns('world'), + node: createParagraphNode({ attrs: { align: 'left' } }), + }); + + const diff = buildModifiedParagraphDiff(oldParagraph, newParagraph); + expect(diff).not.toBeNull(); + expect(diff.contentDiff.length).toBeGreaterThan(0); + expect(diff.attrsDiff).toBeNull(); + }); +}); + +describe('paragraphComparator with image-only paragraphs', () => { + it('distinguishes image-only paragraphs with different images', () => { + const imgA = createParagraphInfo({ + fullText: '', + text: [buildImageToken({ src: 'a.png' })], + node: createParagraphNode({ attrs: {} }), + }); + const imgB = createParagraphInfo({ + fullText: '', + text: 
[buildImageToken({ src: 'b.png' })], + node: createParagraphNode({ attrs: {} }), + }); + + expect(paragraphComparator(imgA, imgB)).toBe(false); + }); + + it('matches image-only paragraphs with the same semantic content', () => { + const makeInfo = (anchorId) => + createParagraphInfo({ + fullText: '', + text: [ + buildImageToken({ + src: 'same.png', + originalAttributes: { 'wp14:anchorId': anchorId, cx: '100' }, + }), + ], + node: createParagraphNode({ attrs: {} }), + }); + + // volatile anchorId differs, but semantic content is the same + expect(paragraphComparator(makeInfo('ID1'), makeInfo('ID2'))).toBe(true); + }); + + it('still matches text paragraphs by fullText', () => { + const a = createParagraphInfo({ fullText: 'same text' }); + const b = createParagraphInfo({ fullText: 'same text' }); + expect(paragraphComparator(a, b)).toBe(true); + }); + + it('falls back to fullText when contentSignature is missing', () => { + const a = { node: { attrs: {} }, fullText: 'fallback' }; + const b = { node: { attrs: {} }, fullText: 'fallback' }; + expect(paragraphComparator(a, b)).toBe(true); + }); +}); + describe('canTreatAsModification', () => { it('returns true when paragraph comparator matches by paraId', () => { const buildInfo = (paraId) => ({ diff --git a/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.ts b/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.ts index bf785ab718..d24d3db2e3 100644 --- a/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.ts +++ b/packages/super-editor/src/extensions/diffing/algorithm/paragraph-diffing.ts @@ -2,6 +2,7 @@ import type { Node as PMNode } from 'prosemirror-model'; import { getInlineDiff, tokenizeInlineContent, type InlineDiffToken, type InlineDiffResult } from './inline-diffing'; import { getAttributesDiff, type AttributesDiff } from './attributes-diffing'; import { getInsertionPos, type NodePositionInfo } from './diff-utils'; +import { 
normalizeParagraphAttrs, normalizeParagraphNodeJSON, semanticInlineNodeKey } from './semantic-normalization'; import { levenshteinDistance } from './similarity'; // Heuristics that prevent unrelated paragraphs from being paired as modifications. @@ -23,6 +24,8 @@ export interface ParagraphNodeInfo { endPos: number; /** Plain-text representation of the paragraph content. */ fullText: string; + /** Semantic fingerprint of all inline content (text + nodes), used for identity matching. */ + contentSignature: string; } /** @@ -97,9 +100,30 @@ export function createParagraphSnapshot(paragraph: PMNode, paragraphPos: number, text, endPos: paragraphPos + 1 + paragraph.content.size, fullText: text.map((token) => (token.kind === 'text' ? token.char : '')).join(''), + contentSignature: buildContentSignature(text), }; } +/** + * Builds a semantic fingerprint from inline tokens that covers both + * text characters and inline nodes (images, etc.). + * + * Text-only paragraphs produce the same result as `fullText`. + * Image-only paragraphs produce a unique key per distinct image, + * so that the paragraph comparator can tell them apart. + */ +function buildContentSignature(tokens: InlineDiffToken[]): string { + return tokens + .map((token) => { + if (token.kind === 'text') { + return token.char; + } + // Null bytes delimit inline node keys so they can't collide with text + return `\0${semanticInlineNodeKey(token.node)}\0`; + }) + .join(''); +} + /** * Determines whether equal paragraph nodes should still be marked as modified because their serialized structure differs. 
* @@ -111,11 +135,16 @@ export function shouldProcessEqualAsModification( oldParagraph: ParagraphNodeInfo, newParagraph: ParagraphNodeInfo, ): boolean { - return JSON.stringify(oldParagraph.node.toJSON()) !== JSON.stringify(newParagraph.node.toJSON()); + const oldNormalized = normalizeParagraphNodeJSON(oldParagraph.node.toJSON()); + const newNormalized = normalizeParagraphNodeJSON(newParagraph.node.toJSON()); + return JSON.stringify(oldNormalized) !== JSON.stringify(newNormalized); } /** - * Compares two paragraphs for identity based on paraId or text content. + * Compares two paragraphs for identity based on paraId, then content signature. + * + * The content signature covers both text and inline nodes (images, etc.), + * so image-only paragraphs with different images are not falsely paired. */ export function paragraphComparator(oldParagraph: ParagraphNodeInfo, newParagraph: ParagraphNodeInfo): boolean { const oldId = oldParagraph?.node?.attrs?.paraId; @@ -123,7 +152,11 @@ export function paragraphComparator(oldParagraph: ParagraphNodeInfo, newParagrap if (oldId && newId && oldId === newId) { return true; } - return oldParagraph?.fullText === newParagraph?.fullText; + // Content signature includes inline node fingerprints, so it distinguishes + // image-only paragraphs that would otherwise all have empty fullText. + const oldSig = oldParagraph?.contentSignature ?? oldParagraph?.fullText; + const newSig = newParagraph?.contentSignature ?? 
newParagraph?.fullText; + return oldSig === newSig; } /** @@ -165,7 +198,10 @@ export function buildModifiedParagraphDiff( ): ModifiedParagraphDiff | null { const contentDiff = getInlineDiff(oldParagraph.text, newParagraph.text, oldParagraph.endPos); - const attrsDiff = getAttributesDiff(oldParagraph.node.attrs, newParagraph.node.attrs); + const attrsDiff = getAttributesDiff( + normalizeParagraphAttrs(oldParagraph.node.attrs), + normalizeParagraphAttrs(newParagraph.node.attrs), + ); if (contentDiff.length === 0 && !attrsDiff) { return null; } diff --git a/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.test.ts b/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.test.ts new file mode 100644 index 0000000000..f76ae06b90 --- /dev/null +++ b/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.test.ts @@ -0,0 +1,338 @@ +import { describe, it, expect } from 'vitest'; +import { + normalizeParagraphAttrs, + normalizeImageNodeJSON, + normalizeInlineNodeJSON, + normalizeInlineNodeAttrs, + normalizeParagraphNodeJSON, + normalizeDocJSON, + semanticInlineNodeKey, +} from './semantic-normalization'; + +describe('normalizeParagraphAttrs', () => { + it('strips all volatile paragraph attributes', () => { + const attrs = { + paraId: '1A2B3C4D', + textId: '77777777', + rsidR: '00A1B2C3', + rsidRDefault: '00D4E5F6', + rsidP: '00112233', + rsidRPr: '00445566', + rsidDel: '00778899', + align: 'center', + indent: { left: 720 }, + }; + + const result = normalizeParagraphAttrs(attrs); + + expect(result).toEqual({ + align: 'center', + indent: { left: 720 }, + }); + }); + + it('returns all attributes when none are volatile', () => { + const attrs = { align: 'left', spacing: { before: 100 } }; + const result = normalizeParagraphAttrs(attrs); + expect(result).toEqual(attrs); + }); + + it('returns an empty object for empty input', () => { + expect(normalizeParagraphAttrs({})).toEqual({}); + }); +}); + 
+describe('normalizeImageNodeJSON', () => { + it('strips volatile keys from originalAttributes', () => { + const nodeJSON = { + type: 'image', + attrs: { + src: 'image1.png', + size: { width: 100, height: 100 }, + originalAttributes: { + 'wp14:anchorId': '4A5B6C7D', + 'wp14:editId': '8E9F0A1B', + cx: '914400', + cy: '914400', + }, + }, + }; + + const result = normalizeImageNodeJSON(nodeJSON); + + expect(result.attrs.originalAttributes).toEqual({ + cx: '914400', + cy: '914400', + }); + expect(result.attrs.src).toBe('image1.png'); + expect(result.attrs.size).toEqual({ width: 100, height: 100 }); + }); + + it('returns the node unchanged when originalAttributes is absent', () => { + const nodeJSON = { type: 'image', attrs: { src: 'img.png' } }; + const result = normalizeImageNodeJSON(nodeJSON); + expect(result).toEqual(nodeJSON); + }); + + it('preserves non-volatile originalAttributes keys', () => { + const nodeJSON = { + type: 'image', + attrs: { + originalAttributes: { cx: '100', cy: '200' }, + }, + }; + + const result = normalizeImageNodeJSON(nodeJSON); + expect(result.attrs.originalAttributes).toEqual({ cx: '100', cy: '200' }); + }); + + it('does not mutate the input', () => { + const original = { + type: 'image', + attrs: { + originalAttributes: { 'wp14:anchorId': 'AAA', cx: '100' }, + }, + }; + const copy = JSON.parse(JSON.stringify(original)); + + normalizeImageNodeJSON(original); + + expect(original).toEqual(copy); + }); +}); + +describe('normalizeInlineNodeJSON', () => { + it('normalizes image nodes', () => { + const imageJSON = { + type: 'image', + attrs: { + originalAttributes: { 'wp14:anchorId': 'X', keep: 'yes' }, + }, + }; + + const result = normalizeInlineNodeJSON(imageJSON); + expect(result.attrs.originalAttributes).toEqual({ keep: 'yes' }); + }); + + it('passes non-image nodes through unchanged', () => { + const linkJSON = { type: 'link', attrs: { href: 'http://example.com' } }; + const result = normalizeInlineNodeJSON(linkJSON); + 
expect(result).toBe(linkJSON); + }); +}); + +describe('normalizeParagraphNodeJSON', () => { + it('strips volatile attrs and normalizes nested image nodes', () => { + const paragraphJSON = { + type: 'paragraph', + attrs: { paraId: 'AABB', rsidR: '0011', align: 'left' }, + content: [ + { + type: 'run', + attrs: {}, + content: [ + { + type: 'image', + attrs: { + src: 'photo.png', + originalAttributes: { + 'wp14:anchorId': 'DEAD', + 'wp14:editId': 'BEEF', + cx: '500', + }, + }, + }, + ], + }, + ], + }; + + const result = normalizeParagraphNodeJSON(paragraphJSON) as any; + + expect(result.attrs).toEqual({ align: 'left' }); + expect(result.content[0].content[0].attrs.originalAttributes).toEqual({ cx: '500' }); + expect(result.content[0].content[0].attrs.src).toBe('photo.png'); + }); + + it('handles paragraphs with no content', () => { + const paragraphJSON = { + type: 'paragraph', + attrs: { paraId: 'X', align: 'center' }, + }; + + const result = normalizeParagraphNodeJSON(paragraphJSON); + expect(result.attrs).toEqual({ align: 'center' }); + expect(result).not.toHaveProperty('content'); + }); + + it('handles text-only paragraphs without modifying content', () => { + const paragraphJSON = { + type: 'paragraph', + attrs: { rsidR: '00AA' }, + content: [ + { + type: 'run', + attrs: {}, + content: [{ type: 'text', text: 'hello' }], + }, + ], + }; + + const result = normalizeParagraphNodeJSON(paragraphJSON) as any; + expect(result.content[0].content[0]).toEqual({ type: 'text', text: 'hello' }); + }); +}); + +describe('normalizeDocJSON', () => { + it('normalizes paragraphs within a document tree', () => { + const docJSON = { + type: 'doc', + content: [ + { + type: 'paragraph', + attrs: { paraId: 'P1', rsidR: 'R1', align: 'left' }, + content: [ + { + type: 'run', + attrs: {}, + content: [ + { + type: 'image', + attrs: { + src: 'test.png', + originalAttributes: { 'wp14:anchorId': 'A1' }, + }, + }, + ], + }, + ], + }, + { + type: 'paragraph', + attrs: { paraId: 'P2', align: 
'right' }, + content: [ + { + type: 'run', + attrs: {}, + content: [{ type: 'text', text: 'world' }], + }, + ], + }, + ], + }; + + const result = normalizeDocJSON(docJSON) as any; + + // First paragraph: volatile attrs stripped, image normalized + expect(result.content[0].attrs).toEqual({ align: 'left' }); + expect(result.content[0].content[0].content[0].attrs.originalAttributes).toEqual({}); + + // Second paragraph: volatile attrs stripped, text untouched + expect(result.content[1].attrs).toEqual({ align: 'right' }); + expect(result.content[1].content[0].content[0]).toEqual({ type: 'text', text: 'world' }); + }); + + it('recurses into structural containers (tables, etc.)', () => { + const docJSON = { + type: 'doc', + content: [ + { + type: 'table', + attrs: {}, + content: [ + { + type: 'tableRow', + attrs: {}, + content: [ + { + type: 'tableCell', + attrs: {}, + content: [ + { + type: 'paragraph', + attrs: { paraId: 'TC1', rsidR: 'R9' }, + }, + ], + }, + ], + }, + ], + }, + ], + }; + + const result = normalizeDocJSON(docJSON) as any; + const cellParagraph = result.content[0].content[0].content[0].content[0]; + expect(cellParagraph.attrs).toEqual({}); + }); + + it('returns the doc unchanged when there is no content', () => { + const docJSON = { type: 'doc' }; + expect(normalizeDocJSON(docJSON)).toEqual(docJSON); + }); +}); + +describe('normalizeInlineNodeAttrs', () => { + it('strips volatile keys from image originalAttributes', () => { + const attrs = { + src: 'img.png', + originalAttributes: { + 'wp14:anchorId': 'A1', + 'wp14:editId': 'E1', + cx: '100', + }, + }; + + const result = normalizeInlineNodeAttrs('image', attrs); + + expect(result.originalAttributes).toEqual({ cx: '100' }); + expect(result.src).toBe('img.png'); + }); + + it('passes non-image attrs through unchanged', () => { + const attrs = { href: 'http://example.com' }; + const result = normalizeInlineNodeAttrs('link', attrs); + expect(result).toBe(attrs); + }); + + it('passes image attrs through when 
originalAttributes is absent', () => { + const attrs = { src: 'img.png' }; + const result = normalizeInlineNodeAttrs('image', attrs); + expect(result).toBe(attrs); + }); +}); + +describe('semanticInlineNodeKey', () => { + it('produces identical keys for images differing only in volatile attrs', () => { + const makeNode = (anchorId: string) => ({ + type: { name: 'image' }, + toJSON: () => ({ + type: 'image', + attrs: { src: 'same.png', originalAttributes: { 'wp14:anchorId': anchorId, cx: '100' } }, + }), + }); + + expect(semanticInlineNodeKey(makeNode('A'))).toBe(semanticInlineNodeKey(makeNode('B'))); + }); + + it('produces different keys for images with different semantic attrs', () => { + const makeNode = (src: string) => ({ + type: { name: 'image' }, + toJSON: () => ({ + type: 'image', + attrs: { src, originalAttributes: { 'wp14:anchorId': 'same' } }, + }), + }); + + expect(semanticInlineNodeKey(makeNode('a.png'))).not.toBe(semanticInlineNodeKey(makeNode('b.png'))); + }); + + it('passes non-image nodes through without normalization', () => { + const node = { + type: { name: 'link' }, + toJSON: () => ({ type: 'link', attrs: { href: 'http://example.com' } }), + }; + + expect(semanticInlineNodeKey(node)).toBe(JSON.stringify({ type: 'link', attrs: { href: 'http://example.com' } })); + }); +}); diff --git a/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.ts b/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.ts new file mode 100644 index 0000000000..b9fb447eb2 --- /dev/null +++ b/packages/super-editor/src/extensions/diffing/algorithm/semantic-normalization.ts @@ -0,0 +1,215 @@ +/** + * Semantic normalization for diff comparisons. + * + * Strips non-semantic OOXML metadata from node JSON before diffing so that + * volatile attributes (regenerated by Word on every save) do not produce + * false-positive diffs. 
This module is used exclusively by the diffing + * pipeline — it never mutates live ProseMirror nodes or touches + * importer/exporter code. + */ + +/** + * Paragraph-level attributes that Word regenerates on every save. + * These carry no semantic meaning for diff comparison; `normalizeParagraphAttrs` drops them. + */ +const VOLATILE_PARAGRAPH_ATTRS = new Set(['paraId', 'textId', 'rsidR', 'rsidRDefault', 'rsidP', 'rsidRPr', 'rsidDel']); + +/** + * Keys inside `originalAttributes` on image nodes that Word regenerates + * on every save. These are drawing-level identifiers, not content; dropped by `normalizeImageNodeJSON` and `normalizeInlineNodeAttrs`. + */ +const VOLATILE_IMAGE_ORIGINAL_ATTR_KEYS = new Set(['wp14:anchorId', 'wp14:editId']); + +/** + * Removes every key listed in `keysToOmit` from a flat attributes object (own enumerable keys, top level only). + * Returns a new object — never mutates the input; retained values are carried over by reference. + */ +function omitKeys(attrs: Record, keysToOmit: Set): Record { + const result: Record = {}; + for (const [key, value] of Object.entries(attrs)) { + if (!keysToOmit.has(key)) { + result[key] = value; + } + } + return result; +} + +/** + * Strips volatile OOXML metadata from paragraph attributes. + * + * @param attrs Raw paragraph node attributes. + * @returns A shallow copy with the `VOLATILE_PARAGRAPH_ATTRS` keys removed. + */ +export function normalizeParagraphAttrs(attrs: Record): Record { + return omitKeys(attrs, VOLATILE_PARAGRAPH_ATTRS); +} + +/** + * Strips volatile OOXML metadata from an image node's JSON representation. + * + * Only touches `attrs.originalAttributes` — all other attributes are + * preserved as-is so that genuine image changes (src, size, wrapping, etc.) + * still produce diffs. + * + * @param nodeJSON Serialized image node (from `node.toJSON()`). + * @returns A deep-enough copy with volatile keys removed from `originalAttributes`. 
+ */ +export function normalizeImageNodeJSON(nodeJSON: Record): Record { + const attrs = nodeJSON.attrs as Record | undefined; + if (!attrs?.originalAttributes) { + return nodeJSON; + } + + const originalAttributes = attrs.originalAttributes as Record; + const cleanedOriginalAttributes = omitKeys(originalAttributes, VOLATILE_IMAGE_ORIGINAL_ATTR_KEYS); + + return { + ...nodeJSON, + attrs: { + ...attrs, + originalAttributes: cleanedOriginalAttributes, + }, + }; +} + +/** + * Strips volatile metadata from an inline node's JSON based on its type. + * + * Currently normalizes image nodes. Other inline node types pass through + * unchanged — extend the type check below as new volatile-attr patterns emerge. + * + * @param nodeJSON Serialized inline node. + * @returns Normalized copy (or the original object if no normalization is needed). + */ +export function normalizeInlineNodeJSON(nodeJSON: Record): Record { + if (nodeJSON.type === 'image') { + return normalizeImageNodeJSON(nodeJSON); + } + return nodeJSON; +} + +/** + * Strips volatile metadata from an inline node's raw attributes. + * + * Used when computing attrsDiff for modified inline nodes so that + * volatile keys don't appear in the diff payload. + * + * @param typeName The node type name (e.g. 'image'); any other name returns `attrs` untouched. + * @param attrs Raw node attributes. + * @returns Shallow copy with volatile keys removed from `originalAttributes`, or `attrs` itself for non-image types and images without `originalAttributes`. + */ +export function normalizeInlineNodeAttrs(typeName: string, attrs: Record): Record { + if (typeName !== 'image') { + return attrs; + } + + const originalAttributes = attrs.originalAttributes as Record | undefined; + if (!originalAttributes) { + return attrs; + } + + return { + ...attrs, + originalAttributes: omitKeys(originalAttributes, VOLATILE_IMAGE_ORIGINAL_ATTR_KEYS), + }; +} + +/** + * Produces a stable semantic key for an inline node, suitable for + * identity comparison in Myers diff. + * + * Two nodes with the same key represent the same semantic content. 
+ * Volatile OOXML metadata is stripped so that re-imported copies of
+ * the same image are treated as identical.
+ *
+ * @param node A ProseMirror node (or mock) with `type.name` and `toJSON()`.
+ * @returns Stable JSON string usable as a comparison key.
+ */
+export function semanticInlineNodeKey(node: { type: { name: string }; toJSON: () => unknown }): string {
+  return JSON.stringify(normalizeInlineNodeJSON(node.toJSON() as Record<string, unknown>));
+}
+
+/**
+ * Strips volatile metadata from a paragraph node's full JSON representation.
+ *
+ * Normalizes the paragraph's own attrs and recursively normalizes any
+ * inline image nodes nested within its content tree. The result always
+ * carries an `attrs` object (empty when the input had none); `content`
+ * is only present when the input had it.
+ *
+ * @param nodeJSON Serialized paragraph node (from `node.toJSON()`).
+ * @returns Normalized copy suitable for semantic comparison.
+ */
+export function normalizeParagraphNodeJSON(nodeJSON: Record<string, unknown>): Record<string, unknown> {
+  const attrs = (nodeJSON.attrs as Record<string, unknown> | undefined) ?? {};
+  const content = nodeJSON.content as Record<string, unknown>[] | undefined;
+
+  return {
+    ...nodeJSON,
+    attrs: normalizeParagraphAttrs(attrs),
+    ...(content ? { content: content.map(normalizeContentNodeJSON) } : {}),
+  };
+}
+
+/**
+ * Recursively normalizes a content node within a paragraph's JSON tree.
+ *
+ * Applies inline node normalization to leaf nodes (e.g. images) and
+ * recurses into container nodes (e.g. runs) that have their own content.
+ *
+ * @param nodeJSON Serialized node found somewhere below a paragraph.
+ * @returns The inline-normalized node when it has no `content`; otherwise a shallow
+ *   copy whose `content` entries have each been normalized the same way.
+ */
+function normalizeContentNodeJSON(nodeJSON: Record<string, unknown>): Record<string, unknown> {
+  const content = nodeJSON.content as Record<string, unknown>[] | undefined;
+
+  // Leaf inline nodes (image, etc.)
+  if (!content) {
+    return normalizeInlineNodeJSON(nodeJSON);
+  }
+
+  // Container nodes (run, etc.) — recurse into children
+  return {
+    ...nodeJSON,
+    content: content.map(normalizeContentNodeJSON),
+  };
+}
+
+/**
+ * Normalizes an entire document JSON tree for diff fingerprinting.
+ *
+ * Walks the full document structure, stripping volatile paragraph and
+ * image attributes at every level.
Used by `canonicalize.ts` to ensure
+ * fingerprints and diff comparisons agree on what counts as a real change.
+ *
+ * @param docJSON Serialized document (from `doc.toJSON()`).
+ * @returns Normalized copy suitable for stable fingerprinting, or `docJSON` itself
+ *   when it has no `content`.
+ */
+export function normalizeDocJSON(docJSON: Record<string, unknown>): Record<string, unknown> {
+  const content = docJSON.content as Record<string, unknown>[] | undefined;
+  if (!content) {
+    return docJSON;
+  }
+
+  return {
+    ...docJSON,
+    content: content.map(normalizeDocNodeJSON),
+  };
+}
+
+/**
+ * Normalizes a single node within the document tree based on its type.
+ *
+ * @param nodeJSON Serialized node from the document tree.
+ * @returns For a `paragraph`, the result of `normalizeParagraphNodeJSON`; for any other
+ *   node with `content`, a shallow copy with each child normalized recursively; for
+ *   any other node without `content`, the input itself, unchanged.
+ */
+function normalizeDocNodeJSON(nodeJSON: Record<string, unknown>): Record<string, unknown> {
+  const type = nodeJSON.type as string | undefined;
+
+  if (type === 'paragraph') {
+    return normalizeParagraphNodeJSON(nodeJSON);
+  }
+
+  // Recurse into structural containers (body, section, table, row, cell, etc.)
+  const content = nodeJSON.content as Record<string, unknown>[] | undefined;
+  if (content) {
+    return {
+      ...nodeJSON,
+      content: content.map(normalizeDocNodeJSON),
+    };
+  }
+
+  return nodeJSON;
+}
diff --git a/packages/super-editor/src/extensions/diffing/computeDiff.test.js b/packages/super-editor/src/extensions/diffing/computeDiff.test.js index 35fd762082..62bd82da1c 100644 --- a/packages/super-editor/src/extensions/diffing/computeDiff.test.js +++ b/packages/super-editor/src/extensions/diffing/computeDiff.test.js @@ -63,11 +63,13 @@ describe('Diff', () => { const deletedDiffs = diffs.filter((diff) => diff.action === 'deleted'); const attrOnlyDiffs = modifiedDiffs.filter((diff) => diff.contentDiff.length === 0); - expect(diffs).toHaveLength(19); - expect(modifiedDiffs).toHaveLength(9); + // One volatile-only paragraph diff (paraId/rsidR/textId changes) is now + // correctly filtered out by semantic normalization. See semantic-normalization.ts.
+ expect(diffs).toHaveLength(18); + expect(modifiedDiffs).toHaveLength(8); expect(addedDiffs).toHaveLength(5); expect(deletedDiffs).toHaveLength(5); - expect(attrOnlyDiffs).toHaveLength(4); + expect(attrOnlyDiffs).toHaveLength(3); // Modified paragraph with multiple text diffs let diff = getDiff( @@ -166,7 +168,6 @@ describe('Diff', () => { expect(diff.contentDiff[0].newText).toBe(' '); expect(diff.contentDiff[1].text).toBe('NEW'); expect(diff.contentDiff[2].text).toBe(' '); - expect(diff.attrsDiff?.modified?.textId).toBeDefined(); diff = diffs.find((diff) => diff.action === 'deleted' && diff.oldText === 'I deleted this sentence.'); expect(diff).toBeDefined(); @@ -177,7 +178,6 @@ describe('Diff', () => { diff = diffs.find((diff) => diff.action === 'modified' && diff.oldText === 'We are not done yet.'); expect(diff.newText).toBe('We are done now.'); expect(diff.contentDiff).toHaveLength(3); - expect(diff.attrsDiff?.modified?.textId).toBeDefined(); }); it('Compare another set of two documents with only formatting changes', async () => { diff --git a/packages/super-editor/src/extensions/diffing/service/canonicalize.ts b/packages/super-editor/src/extensions/diffing/service/canonicalize.ts index 454229e415..dfba296d48 100644 --- a/packages/super-editor/src/extensions/diffing/service/canonicalize.ts +++ b/packages/super-editor/src/extensions/diffing/service/canonicalize.ts @@ -9,6 +9,7 @@ import type { Node as PMNode } from 'prosemirror-model'; import type { NumberingProperties, StylesDocumentProperties } from '@superdoc/style-engine/ooxml'; import type { CommentInput } from '../algorithm/comment-diffing'; import { COMMENT_ATTRS_DIFF_IGNORED_KEYS } from '../algorithm/comment-diffing'; +import { normalizeDocJSON } from '../algorithm/semantic-normalization'; /** The canonical diffable state of one document. 
*/ export interface CanonicalDiffableState { @@ -63,7 +64,7 @@ export function buildCanonicalDiffableState( numbering: NumberingProperties | null | undefined, ): CanonicalDiffableState { return { - body: doc.toJSON() as Record, + body: normalizeDocJSON(doc.toJSON() as Record), comments: comments.map(canonicalizeComment), styles: styles ? (styles as unknown as Record) : null, numbering: numbering ? (numbering as unknown as Record) : null,