From 160582e5e7038f558f47b4044b66112b96405b8d Mon Sep 17 00:00:00 2001 From: Martin Donadieu Date: Sat, 2 May 2026 01:07:33 +0200 Subject: [PATCH 1/5] Validate translation worker on real pages --- .github/workflows/deploy-translation.yml | 2 +- .../scripts/verify-real-ai.ts | 55 ++++++++++-- apps/translation-worker/src/index.ts | 83 ++++++++++++++++++- 3 files changed, 132 insertions(+), 8 deletions(-) diff --git a/.github/workflows/deploy-translation.yml b/.github/workflows/deploy-translation.yml index ef35c3d76..039eb18ee 100644 --- a/.github/workflows/deploy-translation.yml +++ b/.github/workflows/deploy-translation.yml @@ -53,7 +53,7 @@ jobs: if [ "$status" -eq 0 ]; then return 0 fi - printf '%s\n' "$output" | grep -Eiq "already exists|already.*${queue}" + printf '%s\n' "$output" | grep -Eiq "already exists|already taken|already.*${queue}" } ensure_queue capgo-translation-refresh diff --git a/apps/translation-worker/scripts/verify-real-ai.ts b/apps/translation-worker/scripts/verify-real-ai.ts index 929ae1515..4ab9d3cb7 100644 --- a/apps/translation-worker/scripts/verify-real-ai.ts +++ b/apps/translation-worker/scripts/verify-real-ai.ts @@ -9,18 +9,29 @@ type ProbePayload = { cache?: boolean r2?: boolean } + page?: { + path?: string + locale?: string + segmentCount?: number + batchCount?: number + translatedBatchCount?: number + translatedSegmentCount?: number + changedCount?: number + samples?: unknown + } translations?: unknown error?: string } const WORKER_DIR = resolve(dirname(fileURLToPath(import.meta.url)), '..') const MODEL = process.env.TRANSLATION_REAL_TEST_MODEL || '@cf/meta/llama-3.1-8b-instruct-fast' -const TIMEOUT_MS = Number.parseInt(process.env.TRANSLATION_REAL_TEST_TIMEOUT_MS || '180000', 10) -const REQUEST_TIMEOUT_MS = Math.min(10_000, TIMEOUT_MS) +const TIMEOUT_MS = Number.parseInt(process.env.TRANSLATION_REAL_TEST_TIMEOUT_MS || '240000', 10) +const REQUEST_TIMEOUT_MS = Math.min(60_000, TIMEOUT_MS) const LOG_LIMIT = 16_000 const WRANGLER_CONFIG = 'wrangler.real-test.jsonc' const DEVELOPMENT_R2_BUCKET = 'capgo-translation-cache-development' const SOURCE_TEXTS = ['Ship updates instantly', 'Pricing', 'Keep Capgo, Capacitor, code, API, SDK, CLI, npm, bun, GitHub, and Cloudflare unchanged.'] +const REAL_PAGE_PROBES = ['/', '/docs/'] as const let wranglerLog = '' @@ -127,7 +138,7 @@ function assertProbePayload(payload: ProbePayload): void { } } -async function fetchProbe(url: string): Promise { +async function fetchJsonProbe(url: string): Promise { const controller = new AbortController() const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS) let response: Response @@ -151,10 +162,34 @@ async function fetchProbe(url: string): Promise { } if (!response.ok) throw new Error(payload.error || `Probe returned HTTP ${response.status}`) + return payload +} + +async function fetchRuntimeProbe(url: string): Promise { + const payload = await fetchJsonProbe(url) assertProbePayload(payload) return payload } +async function fetchRealPageProbe(url: string, path: string): Promise { + const payload = await fetchJsonProbe(url) + if (!payload.ok) throw new Error(payload.error || `Real page probe failed for ${path}`) + if (payload.model !== MODEL) throw new Error(`Real page probe used ${payload.model || 'unknown model'} instead of ${MODEL}`) + + const page = payload.page + if (!page) throw new Error(`Real page probe returned no page result for ${path}`) + if (page.path !== path) throw new Error(`Real page probe returned ${page.path || 'unknown path'} instead of ${path}`) + if (page.locale !== 'es') throw new Error(`Real page probe returned ${page.locale || 'unknown locale'} instead of es`) + if (!page.segmentCount || page.segmentCount < 1) throw new Error(`Real page probe found no segments for ${path}`) + if (!page.batchCount || page.batchCount < 1) throw new Error(`Real page probe found no batches for ${path}`) + if (!page.translatedBatchCount || page.translatedBatchCount < 1) throw new Error(`Real page probe translated no batches for ${path}`) + if (!page.translatedSegmentCount || page.translatedSegmentCount < 1) throw new Error(`Real page probe translated no segments for ${path}`) + if (!page.changedCount || page.changedCount < 1) throw new Error(`Real page probe left ${path} untranslated`) + if (!Array.isArray(page.samples) || page.samples.length < 1) throw new Error(`Real page probe returned no translated samples for ${path}`) + + return payload +} + async function exitedCode(process: Bun.Subprocess<'pipe', 'pipe', 'inherit'>): Promise { return await Promise.race([process.exited, sleep(0).then(() => null)]) } @@ -162,7 +197,12 @@ async function exitedCode(process: Bun.Subprocess<'pipe', 'pipe', 'inherit'>): P await ensureDevelopmentBucket() const port = await getFreePort() -const probeUrl = `http://127.0.0.1:${port}/__translation-test__/real-runtime` +const probeBaseUrl = `http://127.0.0.1:${port}` +const runtimeProbeUrl = `${probeBaseUrl}/__translation-test__/real-runtime` +const realPageProbeUrls = REAL_PAGE_PROBES.map((path) => ({ + path, + url: `${probeBaseUrl}/__translation-test__/real-page?path=${encodeURIComponent(path)}&locale=es&batches=2`, +})) const wrangler = Bun.spawn( [ 'bunx', @@ -209,8 +249,11 @@ try { if (code !== null) throw new Error(`wrangler dev exited early with code ${code}`) try { - const payload = await fetchProbe(probeUrl) - console.log(`Real translation worker probe passed with ${payload.model}`) + const payload = await fetchRuntimeProbe(runtimeProbeUrl) + for (const probe of realPageProbeUrls) { + await fetchRealPageProbe(probe.url, probe.path) + } + console.log(`Real translation worker probe passed with ${payload.model} on ${REAL_PAGE_PROBES.join(', ')}`) passed = true break } catch (error) { diff --git a/apps/translation-worker/src/index.ts b/apps/translation-worker/src/index.ts index e61399ba6..655f5773b 100644 --- a/apps/translation-worker/src/index.ts +++ b/apps/translation-worker/src/index.ts @@ -1924,11 +1924,92 @@ async function probeRuntimeStorage(env: Env, requestUrl: URL): Promise<{ cache: return { cache: true, r2: true } } +function testProbeNumberParam(requestUrl: URL, name: string, defaultValue: number, minimum: number, maximum: number): number { + const rawValue = requestUrl.searchParams.get(name) + if (!rawValue) return defaultValue + + const value = Number.parseInt(rawValue, 10) + if (!Number.isFinite(value)) return defaultValue + return Math.min(maximum, Math.max(minimum, value)) +} + +function testProbeLocaleParam(requestUrl: URL): Locale { + const rawLocale = requestUrl.searchParams.get('locale') || 'es' + return isSupportedLocale(rawLocale) ? rawLocale : 'es' +} + +function testProbePathParam(requestUrl: URL): string { + const rawPath = requestUrl.searchParams.get('path') || '/' + const pathUrl = new URL(rawPath, 'https://capgo.app') + const pathname = normalizePathname(stripLocalePrefix(pathUrl.pathname)) + if (shouldBypassTranslation(pathname)) throw new Error(`Real page probe cannot translate bypassed path: ${pathname}`) + return `${pathname}${pathUrl.search}` +} + +async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise> { + const locale = testProbeLocaleParam(requestUrl) + const targetLanguage = LANGUAGE_NAMES[locale] + const path = testProbePathParam(requestUrl) + const maxBatches = testProbeNumberParam(requestUrl, 'batches', 2, 1, 4) + const sourceUrl = new URL(path, 'https://capgo.app') + const sourceResponse = await fetch(sourceUrl.toString(), { + headers: { + Accept: 'text/html', + 'Accept-Language': DEFAULT_LOCALE, + 'X-Capgo-Translation-Origin': 'real-page-probe', + }, + }) + + if (!sourceResponse.ok || !isHtmlResponse(sourceResponse)) { + throw new Error(`Real page probe source failed: ${sourceResponse.status} ${sourceResponse.statusText}`) + } + + const sourceHtml = await sourceResponse.text() + const { segments } = collectSegments(sourceHtml) + const batches = buildBatches(segments) + if (batches.length === 0) throw new Error(`Real page probe found no translatable segments for ${path}`) + + const translatedBatches: string[][] = [] + const batchLimit = Math.min(maxBatches, batches.length) + for (let batchIndex = 0; batchIndex < batchLimit; batchIndex += 1) { + translatedBatches.push(await translateBatchWithJsonMode(env, targetLanguage, batches[batchIndex])) + } + + const sourceTexts = batches.slice(0, batchLimit).flat() + const translatedTexts = translatedBatches.flat() + const changedCount = translatedTexts.filter((translated, index) => normalizedTranslationValue(translated) !== normalizedTranslationValue(sourceTexts[index] ?? '')).length + if (changedCount === 0) throw new Error(`Real page probe left ${path} untranslated for ${targetLanguage}`) + + return { + path, + locale, + targetLanguage, + sourceBytes: sourceHtml.length, + segmentCount: segments.length, + batchCount: batches.length, + translatedBatchCount: translatedBatches.length, + translatedSegmentCount: translatedTexts.length, + changedCount, + samples: translatedTexts.slice(0, 5), + } +} + async function handleTranslationTestRequest(request: Request, env: Env, requestUrl: URL): Promise { if (request.method !== 'GET') return jsonResponse({ ok: false, error: 'Method not allowed' }, 405) - if (requestUrl.pathname !== `${TRANSLATION_TEST_ROUTE_PREFIX}/real-runtime`) return jsonResponse({ ok: false, error: 'Not found' }, 404) try { + if (requestUrl.pathname === `${TRANSLATION_TEST_ROUTE_PREFIX}/real-page`) { + const page = await probeRealPageTranslation(env, requestUrl) + return jsonResponse({ + ok: true, + model: env.TRANSLATION_MODEL || DEFAULT_MODEL, + cacheVersion: TRANSLATION_CACHE_VERSION, + page, + }) + } + + if (requestUrl.pathname !== `${TRANSLATION_TEST_ROUTE_PREFIX}/real-runtime`) return jsonResponse({ ok: false, error: 'Not found' }, 404) + const storage = await probeRuntimeStorage(env, requestUrl) const translations = await translateBatchWithJsonMode(env, 'Spanish', [ 'Ship updates instantly', From 82885d33af03120ef5a937f2e8cb60e8644a8cd1 Mon Sep 17 00:00:00 2001 From: Martin Donadieu Date: Sat, 2 May 2026 01:17:57 +0200 Subject: [PATCH 2/5] Fix translation probe byte count --- apps/translation-worker/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/translation-worker/src/index.ts b/apps/translation-worker/src/index.ts index 655f5773b..eda2fbbc1 100644 --- a/apps/translation-worker/src/index.ts +++ b/apps/translation-worker/src/index.ts @@ -1984,7 +1984,7 @@ async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise Date: Sat, 2 May 2026 01:52:09 +0200 Subject: [PATCH 3/5] Translate body text after skipped scripts --- apps/translation-worker/package.json | 3 +- .../scripts/verify-parser.ts | 46 +++++ .../scripts/verify-real-ai.ts | 17 +- apps/translation-worker/src/index.ts | 158 +++++++++++++++--- 4 files changed, 196 insertions(+), 28 deletions(-) create mode 100644 apps/translation-worker/scripts/verify-parser.ts diff --git a/apps/translation-worker/package.json b/apps/translation-worker/package.json index a8c24121d..f258cd97b 100644 --- a/apps/translation-worker/package.json +++ b/apps/translation-worker/package.json @@ -3,7 +3,8 @@ "private": true, "type": "module", "scripts": { - "check": "tsc --noEmit", + "check": "tsc --noEmit && bun run test:parser", + "test:parser": "bun run scripts/verify-parser.ts", "test": "bun run test:real", "test:real": "bun run scripts/verify-real-ai.ts", "dev": "wrangler dev -c wrangler.jsonc -c ../web/wrangler.jsonc -c ../docs/wrangler.jsonc", diff --git a/apps/translation-worker/scripts/verify-parser.ts b/apps/translation-worker/scripts/verify-parser.ts new file mode 100644 index 000000000..30d380504 --- /dev/null +++ b/apps/translation-worker/scripts/verify-parser.ts @@ -0,0 +1,46 @@ +import { __translationWorkerTest } from '../src/index' + +function assert(condition: unknown, message: string): void { + if (!condition) throw new Error(message) +} + +const html = ` + + + Capgo - Live Updates for Capacitor Apps + + + + Skip to main content +

Ship mobile updates instantly to every user

+ +

Deploy fixes and features without waiting for app store review delays.

+ +` + +const { parts, segments } = __translationWorkerTest.collectSegments(html) +const bodySegments = segments.filter((segment) => segment.inBody).map((segment) => segment.text) + +assert( + bodySegments.some((text) => text.includes('Skip to main content')), + 'Parser did not collect body text after a script with a less-than operator', +) +assert( + bodySegments.some((text) => text.includes('Ship mobile updates instantly')), + 'Parser did not collect the body heading', +) +assert( + bodySegments.some((text) => text.includes('Deploy fixes and features')), + 'Parser did not collect the body paragraph after a skipped body script', +) + +const translations = segments.map((segment) => (segment.inBody ? `FR: ${segment.text}` : segment.text)) +const stats = __translationWorkerTest.bodyTranslationStats(segments, translations) +assert(stats.candidateCount > 0, 'Body translation validator found no body candidates') +assert(stats.changedCount > 0, 'Body translation validator did not detect changed body text') + +const rendered = __translationWorkerTest.renderTranslatedHtml(parts, segments, translations) +assert(rendered.includes('FR: Ship mobile updates instantly to every user'), 'Renderer did not write translated body text') +assert(rendered.includes('current < total'), 'Renderer changed skipped script content') diff --git a/apps/translation-worker/scripts/verify-real-ai.ts b/apps/translation-worker/scripts/verify-real-ai.ts index 4ab9d3cb7..f1fb0b6ed 100644 --- a/apps/translation-worker/scripts/verify-real-ai.ts +++ b/apps/translation-worker/scripts/verify-real-ai.ts @@ -13,10 +13,12 @@ type ProbePayload = { path?: string locale?: string segmentCount?: number + bodySegmentCount?: number batchCount?: number translatedBatchCount?: number translatedSegmentCount?: number changedCount?: number + bodyChecks?: unknown samples?: unknown } translations?: unknown @@ -31,7 +33,10 @@ const LOG_LIMIT = 16_000 const WRANGLER_CONFIG = 'wrangler.real-test.jsonc' const DEVELOPMENT_R2_BUCKET = 'capgo-translation-cache-development' const SOURCE_TEXTS = ['Ship updates instantly', 'Pricing', 'Keep Capgo, Capacitor, code, API, SDK, CLI, npm, bun, GitHub, and Cloudflare unchanged.'] -const REAL_PAGE_PROBES = ['/', '/docs/'] as const +const REAL_PAGE_PROBES = [ + { path: '/', checks: ['Skip to main content', 'Products', 'By Team'] }, + { path: '/docs/', checks: ['Skip to content', 'Select theme', 'Deploy a Live Update'] }, +] as const let wranglerLog = '' @@ -181,10 +186,12 @@ async function fetchRealPageProbe(url: string, path: string): Promise ({ - path, - url: `${probeBaseUrl}/__translation-test__/real-page?path=${encodeURIComponent(path)}&locale=es&batches=2`, +const realPageProbeUrls = REAL_PAGE_PROBES.map((probe) => ({ + path: probe.path, + url: `${probeBaseUrl}/__translation-test__/real-page?path=${encodeURIComponent(probe.path)}&locale=es&batches=2${probe.checks.map((check) => `&check=${encodeURIComponent(check)}`).join('')}`, })) const wrangler = Bun.spawn( [ @@ -253,7 +260,7 @@ try { for (const probe of realPageProbeUrls) { await fetchRealPageProbe(probe.url, probe.path) } - console.log(`Real translation worker probe passed with ${payload.model} on ${REAL_PAGE_PROBES.join(', ')}`) + console.log(`Real translation worker probe passed with ${payload.model} on ${REAL_PAGE_PROBES.map((probe) => probe.path).join(', ')}`) passed = true break } catch (error) { diff --git a/apps/translation-worker/src/index.ts b/apps/translation-worker/src/index.ts index eda2fbbc1..3e30b9f9a 100644 --- a/apps/translation-worker/src/index.ts +++ b/apps/translation-worker/src/index.ts @@ -77,6 +77,7 @@ type Segment = { leading: string trailing: string mode: 'text' | 'attribute' + inBody: boolean quote?: string } @@ -115,11 +116,11 @@ const DEFAULT_MODEL = '@cf/meta/llama-3.1-8b-instruct-fast' const FRESH_MS = 24 * 60 * 60 * 1000 const CACHE_KEEP_SECONDS = 7 * 24 * 60 * 60 const TRANSLATION_PENDING_SECONDS = 10 * 60 -const TRANSLATION_CACHE_VERSION = '2026-05-01-llama-3.1-8b-json-v1' +const TRANSLATION_CACHE_VERSION = '2026-05-02-llama-3.1-8b-json-body-v2' const CLIENT_NO_STORE = 'no-store, max-age=0, must-revalidate' const MAX_HTML_BYTES = 1_500_000 const MAX_BATCH_CHARS = 1_500 -const MAX_BATCH_ITEMS = 32 +const MAX_BATCH_ITEMS = 12 const TRANSLATION_BATCHES_PER_QUEUE_JOB = 1 const TRANSLATION_MODEL_ATTEMPTS = 3 const TRANSLATION_SINGLE_TEXT_ATTEMPTS = 2 @@ -484,7 +485,7 @@ function splitLongCoreText(value: string): string[] { return chunks } -function addSegment(parts: HtmlPart[], segments: Segment[], text: string, mode: Segment['mode'], quote?: string): void { +function addSegment(parts: HtmlPart[], segments: Segment[], text: string, mode: Segment['mode'], inBody: boolean, quote?: string): void { if (!hasAsciiLetter(text)) { parts.push(text) return @@ -507,6 +508,7 @@ function addSegment(parts: HtmlPart[], segments: Segment[], text: string, mode: leading: index === 0 ? leading : '', trailing: index === chunks.length - 1 ? trailing : '', mode, + inBody, quote, }) - 1 parts.push({ segmentIndex }) @@ -625,7 +627,7 @@ function shouldTranslateAttribute(tag: string, tagName: string, attrName: string return TRANSLATABLE_ATTRIBUTES.has(normalizedAttr) } -function appendTag(parts: HtmlPart[], segments: Segment[], tag: string, skipText: boolean): void { +function appendTag(parts: HtmlPart[], segments: Segment[], tag: string, skipText: boolean, inBody: boolean): void { const tagName = tagNameOf(tag) if (!tagName || skipText || isClosingTag(tag)) { parts.push(tag) @@ -639,7 +641,7 @@ function appendTag(parts: HtmlPart[], segments: Segment[], tag: string, skipText if (!shouldTranslateAttribute(tag, tagName, attribute.name, attribute.value)) continue parts.push(tag.slice(lastIndex, attribute.start), tag.slice(attribute.start, attribute.valueStart)) - addSegment(parts, segments, attribute.value, 'attribute', attribute.quote) + addSegment(parts, segments, attribute.value, 'attribute', inBody, attribute.quote) parts.push(attribute.quote) lastIndex = attribute.end matched = true @@ -697,13 +699,54 @@ function findNextHtmlTag(html: string, startIndex: number): { index: number; end return { index, end, tag: html.slice(index, end) } } +function findClosingTag(html: string, startIndex: number, tagName: string): { index: number; end: number; tag: string } | null { + const lowerHtml = html.toLowerCase() + const needle = ` 0) parts.push(text) - else addSegment(parts, segments, text, 'text') + addSegment(parts, segments, text, 'text', insideBody) } const tagName = tagNameOf(tag) - const insideSkippedElement = skipStack.length > 0 - appendTag(parts, segments, tag, insideSkippedElement) + appendTag(parts, segments, tag, false, insideBody) + + if (tagName === 'body' && isClosingTag(tag)) { + insideBody = false + } - if (tagName && !isClosingTag(tag) && !isSelfClosingTag(tag, tagName) && (insideSkippedElement || shouldSkipElementText(tag, tagName))) { + if (tagName && !isClosingTag(tag) && !isSelfClosingTag(tag, tagName) && shouldSkipElementText(tag, tagName)) { skipStack.push(tagName) } - if (tagName && isClosingTag(tag) && insideSkippedElement) { - const stackIndex = skipStack.lastIndexOf(tagName) - if (stackIndex !== -1) skipStack.splice(stackIndex) + if (tagName === 'body' && !isClosingTag(tag) && !isSelfClosingTag(tag, tagName)) { + insideBody = true } lastIndex = nextTag.end @@ -735,7 +779,7 @@ function collectSegments(html: string): { parts: HtmlPart[]; segments: Segment[] const tail = html.slice(lastIndex) if (tail) { if (skipStack.length > 0) parts.push(tail) - else addSegment(parts, segments, tail, 'text') + else addSegment(parts, segments, tail, 'text', insideBody) } return { parts, segments } @@ -974,6 +1018,28 @@ function assertTranslatedBatch(targetLanguage: string, batch: string[], translat } } +function bodyTranslationStats(segments: Segment[], translations: string[]): { candidateCount: number; changedCount: number } { + const candidates = segments + .map((segment, index) => ({ + source: normalizedTranslationValue(segment.text), + translated: normalizedTranslationValue(translations[index] ?? ''), + inBody: segment.inBody, + })) + .filter(({ source, inBody }) => inBody && shouldCheckUnchangedTranslation(source)) + + return { + candidateCount: candidates.length, + changedCount: candidates.filter(({ source, translated }) => source !== translated).length, + } +} + +function assertTranslatedBody(targetLanguage: string, segments: Segment[], translations: string[]): void { + const { candidateCount, changedCount } = bodyTranslationStats(segments, translations) + if (candidateCount > 0 && changedCount === 0) { + throw new Error(`Translation produced no changed body strings for ${targetLanguage}`) + } +} + function isProtectedTokenBoundary(value: string, index: number): boolean { if (index < 0 || index >= value.length) return true const code = value.charCodeAt(index) @@ -981,10 +1047,12 @@ function isProtectedTokenBoundary(value: string, index: number): boolean { } function protectedTokenAt(value: string, index: number): string | null { + const lowerValue = value.toLowerCase() for (const token of PROTECTED_TRANSLATION_TOKENS) { - if (!value.startsWith(token, index)) continue - if (!isProtectedTokenBoundary(value, index - 1) || !isProtectedTokenBoundary(value, index + token.length)) continue - return token + if (!lowerValue.startsWith(token.toLowerCase(), index)) continue + const matched = value.slice(index, index + token.length) + if (!isProtectedTokenBoundary(value, index - 1) || !isProtectedTokenBoundary(value, index + matched.length)) continue + return matched } return null } @@ -1742,6 +1810,7 @@ async function refreshCacheIncrementally(request: Request, env: Env, requestUrl: if (translations.length !== segments.length) { throw new Error(`Partial translation produced ${translations.length} strings for ${segments.length} HTML segments`) } + assertTranslatedBody(LANGUAGE_NAMES[locale], segments, translations) const translatedHtml = renderTranslatedHtml(parts, segments, translations) const response = createTranslatedHtmlResponse(source.originResponse, translatedHtml, requestUrl, locale) @@ -1946,11 +2015,30 @@ function testProbePathParam(requestUrl: URL): string { return `${pathname}${pathUrl.search}` } +function testProbeCheckParams(requestUrl: URL): string[] { + return requestUrl.searchParams + .getAll('check') + .map((value) => value.trim()) + .filter(Boolean) +} + +function findBatchText(batches: string[][], expectedText: string): { batchIndex: number; textIndex: number; source: string } | null { + for (let batchIndex = 0; batchIndex < batches.length; batchIndex += 1) { + const batch = batches[batchIndex] + for (let textIndex = 0; textIndex < batch.length; textIndex += 1) { + const source = batch[textIndex] + if (source.includes(expectedText)) return { batchIndex, textIndex, source } + } + } + return null +} + async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise> { const locale = testProbeLocaleParam(requestUrl) const targetLanguage = LANGUAGE_NAMES[locale] const path = testProbePathParam(requestUrl) const maxBatches = testProbeNumberParam(requestUrl, 'batches', 2, 1, 4) + const requiredChecks = testProbeCheckParams(requestUrl) const sourceUrl = new URL(path, 'https://capgo.app') const sourceResponse = await fetch(sourceUrl.toString(), { headers: { @@ -1969,27 +2057,49 @@ async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise() const batchLimit = Math.min(maxBatches, batches.length) for (let batchIndex = 0; batchIndex < batchLimit; batchIndex += 1) { - translatedBatches.push(await translateBatchWithJsonMode(env, targetLanguage, batches[batchIndex])) + selectedBatchIndexes.add(batchIndex) } - const sourceTexts = batches.slice(0, batchLimit).flat() - const translatedTexts = translatedBatches.flat() + const checkSources = requiredChecks.map((check) => { + const found = findBatchText(batches, check) + if (!found) throw new Error(`Real page probe did not collect required body text for ${path}: ${check}`) + selectedBatchIndexes.add(found.batchIndex) + return { check, ...found } + }) + + const translatedBatchMap = new Map() + for (const batchIndex of [...selectedBatchIndexes].sort((left, right) => left - right)) { + translatedBatchMap.set(batchIndex, await translateBatchWithJsonMode(env, targetLanguage, batches[batchIndex])) + } + + const sourceTexts = [...translatedBatchMap.keys()].flatMap((batchIndex) => batches[batchIndex]) + const translatedTexts = [...translatedBatchMap.values()].flat() const changedCount = translatedTexts.filter((translated, index) => normalizedTranslationValue(translated) !== normalizedTranslationValue(sourceTexts[index] ?? '')).length if (changedCount === 0) throw new Error(`Real page probe left ${path} untranslated for ${targetLanguage}`) + const bodyChecks = checkSources.map(({ check, batchIndex, textIndex, source }) => { + const translated = translatedBatchMap.get(batchIndex)?.[textIndex] ?? '' + if (normalizedTranslationValue(translated) === normalizedTranslationValue(source)) { + throw new Error(`Real page probe left required body text untranslated for ${path}: ${check}`) + } + return { check, batchIndex, source, translated } + }) + return { path, locale, targetLanguage, sourceBytes: new TextEncoder().encode(sourceHtml).length, segmentCount: segments.length, + bodySegmentCount: segments.filter((segment) => segment.inBody).length, batchCount: batches.length, - translatedBatchCount: translatedBatches.length, + translatedBatchCount: translatedBatchMap.size, translatedSegmentCount: translatedTexts.length, changedCount, + bodyChecks, samples: translatedTexts.slice(0, 5), } } @@ -2031,6 +2141,10 @@ async function handleTranslationTestRequest(request: Request, env: Env, requestU export const __translationWorkerTest = { TRANSLATION_CACHE_VERSION, + bodyTranslationStats, + buildBatches, + collectSegments, + renderTranslatedHtml, } export default { From 5a26254d885e25c9939bfb52e83ee1c2014773c8 Mon Sep 17 00:00:00 2001 From: Martin Donadieu Date: Sat, 2 May 2026 01:55:20 +0200 Subject: [PATCH 4/5] Deduplicate translation tag scanning --- apps/translation-worker/src/index.ts | 36 ++++++++-------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/apps/translation-worker/src/index.ts b/apps/translation-worker/src/index.ts index 3e30b9f9a..487ae2be2 100644 --- a/apps/translation-worker/src/index.ts +++ b/apps/translation-worker/src/index.ts @@ -699,18 +699,18 @@ function findNextHtmlTag(html: string, startIndex: number): { index: number; end return { index, end, tag: html.slice(index, end) } } -function findClosingTag(html: string, startIndex: number, tagName: string): { index: number; end: number; tag: string } | null { +function findNamedTag(html: string, startIndex: number, needle: string): { index: number; end: number; tag: string } | null { const lowerHtml = html.toLowerCase() - const needle = ` Date: Sat, 2 May 2026 02:05:06 +0200 Subject: [PATCH 5/5] Address translation probe review feedback --- .../scripts/verify-parser.ts | 10 +++ .../scripts/verify-real-ai.ts | 13 ++- apps/translation-worker/src/index.ts | 83 ++++++++++++++++--- 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/apps/translation-worker/scripts/verify-parser.ts b/apps/translation-worker/scripts/verify-parser.ts index 30d380504..111f814cf 100644 --- a/apps/translation-worker/scripts/verify-parser.ts +++ b/apps/translation-worker/scripts/verify-parser.ts @@ -15,6 +15,8 @@ const html = ` Skip to main content

Ship mobile updates instantly to every user

+ Do not collect nested SVG textDo not collect outer SVG text +

Translate the paragraph after a nested skipped SVG.

Deploy fixes and features without waiting for app store review delays.

@@ -31,6 +33,14 @@ assert( bodySegments.some((text) => text.includes('Ship mobile updates instantly')), 'Parser did not collect the body heading', ) +assert( + bodySegments.every((text) => !text.includes('Do not collect')), + 'Parser collected text from a nested skipped SVG', +) +assert( + bodySegments.some((text) => text.includes('paragraph after a nested skipped SVG')), + 'Parser did not resume body text after a nested skipped SVG', +) assert( bodySegments.some((text) => text.includes('Deploy fixes and features')), 'Parser did not collect the body paragraph after a skipped body script', diff --git a/apps/translation-worker/scripts/verify-real-ai.ts b/apps/translation-worker/scripts/verify-real-ai.ts index f1fb0b6ed..b8eb83f19 100644 --- a/apps/translation-worker/scripts/verify-real-ai.ts +++ b/apps/translation-worker/scripts/verify-real-ai.ts @@ -33,10 +33,7 @@ const LOG_LIMIT = 16_000 const WRANGLER_CONFIG = 'wrangler.real-test.jsonc' const DEVELOPMENT_R2_BUCKET = 'capgo-translation-cache-development' const SOURCE_TEXTS = ['Ship updates instantly', 'Pricing', 'Keep Capgo, Capacitor, code, API, SDK, CLI, npm, bun, GitHub, and Cloudflare unchanged.'] -const REAL_PAGE_PROBES = [ - { path: '/', checks: ['Skip to main content', 'Products', 'By Team'] }, - { path: '/docs/', checks: ['Skip to content', 'Select theme', 'Deploy a Live Update'] }, -] as const +const REAL_PAGE_PROBES = ['/', '/docs/'] as const let wranglerLog = '' @@ -206,9 +203,9 @@ await ensureDevelopmentBucket() const port = await getFreePort() const probeBaseUrl = `http://127.0.0.1:${port}` const runtimeProbeUrl = `${probeBaseUrl}/__translation-test__/real-runtime` -const realPageProbeUrls = REAL_PAGE_PROBES.map((probe) => ({ - path: probe.path, - url: `${probeBaseUrl}/__translation-test__/real-page?path=${encodeURIComponent(probe.path)}&locale=es&batches=2${probe.checks.map((check) => `&check=${encodeURIComponent(check)}`).join('')}`, +const realPageProbeUrls = REAL_PAGE_PROBES.map((path) => ({ + path, + url: `${probeBaseUrl}/__translation-test__/real-page?path=${encodeURIComponent(path)}&locale=es&batches=2`, })) const wrangler = Bun.spawn( [ @@ -260,7 +257,7 @@ try { for (const probe of realPageProbeUrls) { await fetchRealPageProbe(probe.url, probe.path) } - console.log(`Real translation worker probe passed with ${payload.model} on ${REAL_PAGE_PROBES.map((probe) => probe.path).join(', ')}`) + console.log(`Real translation worker probe passed with ${payload.model} on ${REAL_PAGE_PROBES.join(', ')}`) passed = true break } catch (error) { diff --git a/apps/translation-worker/src/index.ts b/apps/translation-worker/src/index.ts index 487ae2be2..720f1581c 100644 --- a/apps/translation-worker/src/index.ts +++ b/apps/translation-worker/src/index.ts @@ -153,6 +153,7 @@ const LANGUAGE_FLAG_ENTITIES: Record = { } const SKIP_TEXT_TAGS = new Set(['script', 'style', 'svg', 'pre', 'code', 'kbd', 'samp', 'textarea']) +const RAW_TEXT_SKIP_TAGS = new Set(['script', 'style', 'textarea']) const LANGUAGE_SELECTOR_SKIP_IDS = new Set(['language-dropdown-button', 'language-dropdown', 'language-menu']) const VOID_TAGS = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']) const TRANSLATABLE_META = new Set(['description', 'keywords', 'title', 'og:title', 'og:description', 'og:image:alt', 'twitter:title', 'twitter:description', 'twitter:image:alt']) @@ -725,7 +726,29 @@ function findNamedTag(html: string, startIndex: number, needle: string): { index } function findClosingTag(html: string, startIndex: number, tagName: string): { index: number; end: number; tag: string } | null { - return findNamedTag(html, startIndex, `= 4) fallback.push(item) + if (selected.length >= maximum) return selected + } + + return selected.length > 0 ? selected : fallback.slice(0, maximum) +} + async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise> { const locale = testProbeLocaleParam(requestUrl) const targetLanguage = LANGUAGE_NAMES[locale] @@ -2047,12 +2103,19 @@ async function probeRealPageTranslation(env: Env, requestUrl: URL): Promise { - const found = findBatchText(batches, check) - if (!found) throw new Error(`Real page probe did not collect required body text for ${path}: ${check}`) - selectedBatchIndexes.add(found.batchIndex) - return { check, ...found } - }) + const checkSources = + requiredChecks.length > 0 + ? requiredChecks.map((check) => { + const found = findBatchText(segments, batches, check) + if (!found) throw new Error(`Real page probe did not collect required body text for ${path}: ${check}`) + selectedBatchIndexes.add(found.batchIndex) + return { check, ...found } + }) + : selectBodyProbeChecks(segments, batches) + if (checkSources.length === 0) throw new Error(`Real page probe found no body text checks for ${path}`) + for (const checkSource of checkSources) { + selectedBatchIndexes.add(checkSource.batchIndex) + } const translatedBatchMap = new Map() for (const batchIndex of [...selectedBatchIndexes].sort((left, right) => left - right)) {