diff --git a/src/lib/command-framework/apify-command.ts b/src/lib/command-framework/apify-command.ts index 3a0094455..8c64af0d0 100644 --- a/src/lib/command-framework/apify-command.ts +++ b/src/lib/command-framework/apify-command.ts @@ -1,5 +1,6 @@ /* eslint-disable max-classes-per-file */ +import process from 'node:process'; import type { parseArgs, ParseArgsConfig, ParseArgsOptionDescriptor } from 'node:util'; import type { Awaitable } from '@crawlee/types'; @@ -144,6 +145,39 @@ const jsonFlagDefinition = { multiple: false, } as const satisfies ParseArgsOptionDescriptor; +const userAgentFlagDefinition = { + type: 'string', + multiple: false, +} as const satisfies ParseArgsOptionDescriptor; + +export const USER_AGENT_FLAG_NAME = 'user-agent'; +export const USER_AGENT_ENV_VAR = 'APIFY_CLI_USER_AGENT'; +export const USER_AGENT_MAX_LENGTH = 256; +// Scope the caller-id flag to the public `apify` entrypoint. The `actor` entrypoint +// runs inside Actor Docker images where caller-identification is not meaningful. +const USER_AGENT_SUPPORTED_ENTRYPOINTS = new Set(['apify']); + +function sanitizeUserAgentValue(value: string | undefined): string | undefined { + if (typeof value !== 'string') { + return undefined; + } + // Strip ASCII control chars (0x00-0x1F and 0x7F) to keep telemetry payloads clean. + // eslint-disable-next-line no-control-regex + const stripped = value.replace(/[\u0000-\u001f\u007f]/g, ''); + const trimmed = stripped.trim(); + if (!trimmed) { + return undefined; + } + return trimmed.length > USER_AGENT_MAX_LENGTH ? trimmed.slice(0, USER_AGENT_MAX_LENGTH) : trimmed; +} + +export function resolveUserAgentForTelemetry( + flagValue: string | undefined, + envValue: string | undefined, +): string | undefined { + return sanitizeUserAgentValue(flagValue) ?? sanitizeUserAgentValue(envValue); +} + export const commandRegistry = new Map(); type ParseResult = ReturnType>>; @@ -281,6 +315,17 @@ export abstract class ApifyCommand = { + help: helpFlagDefinition, + }; + + if (USER_AGENT_SUPPORTED_ENTRYPOINTS.has(this.entrypoint)) { + baseOptions[USER_AGENT_FLAG_NAME] = userAgentFlagDefinition; + } + const object = { allowNegative: true, allowPositionals: true, strict: true, tokens: true, - options: { - help: helpFlagDefinition, - } as { + options: baseOptions as { help: typeof helpFlagDefinition; json: typeof jsonFlagDefinition; [k: string]: ParseArgsOptionDescriptor; diff --git a/src/lib/hooks/telemetry/trackEvent.ts b/src/lib/hooks/telemetry/trackEvent.ts index 441f5b30c..aa4f12ca4 100644 --- a/src/lib/hooks/telemetry/trackEvent.ts +++ b/src/lib/hooks/telemetry/trackEvent.ts @@ -42,6 +42,7 @@ export interface TrackEventMap { exitCode?: number; durationMs?: number; aiAgent?: string; + userAgent?: string; isCi?: boolean; ciProvider?: string; isInteractive?: boolean; diff --git a/test/e2e/commands/user-agent.test.ts b/test/e2e/commands/user-agent.test.ts new file mode 100644 index 000000000..45c9b4355 --- /dev/null +++ b/test/e2e/commands/user-agent.test.ts @@ -0,0 +1,29 @@ +import { runCli } from '../__helpers__/run-cli.js'; + +describe('[e2e] --user-agent flag', () => { + it('accepts --user-agent on any command without erroring', async () => { + const result = await runCli('apify', ['help', '--user-agent', 'test-caller/1.0.0']); + expect(result.exitCode, `stderr: ${result.stderr}`).toBe(0); + }); + + it('accepts APIFY_CLI_USER_AGENT env var on any command without erroring', async () => { + const result = await runCli('apify', ['help'], { + env: { APIFY_CLI_USER_AGENT: 'test-caller/env-1.0.0' }, + }); + expect(result.exitCode, `stderr: ${result.stderr}`).toBe(0); + }); + + it('rejects --user-agent without a value (string flag)', async () => { + // Passing --user-agent with no value should surface a parseArgs error, + // since the flag is declared as a string type. + const result = await runCli('apify', ['help', '--user-agent']); + expect(result.exitCode).not.toBe(0); + }); + + it('rejects --user-agent under the actor entrypoint', async () => { + // The flag is scoped to the public apify entrypoint only — the actor + // entrypoint runs inside Actor Docker images where caller-id is meaningless. + const result = await runCli('actor', ['help', '--user-agent', 'foo']); + expect(result.exitCode).not.toBe(0); + }); +}); diff --git a/test/local/lib/command-framework/user-agent.test.ts b/test/local/lib/command-framework/user-agent.test.ts new file mode 100644 index 000000000..d27a89b8d --- /dev/null +++ b/test/local/lib/command-framework/user-agent.test.ts @@ -0,0 +1,224 @@ +/* eslint-disable max-classes-per-file */ +import { parseArgs } from 'node:util'; + +import { + ApifyCommand, + type BuiltApifyCommand as _BuiltApifyCommand, + resolveUserAgentForTelemetry, + USER_AGENT_ENV_VAR, + USER_AGENT_FLAG_NAME, + USER_AGENT_MAX_LENGTH, +} from '../../../../src/lib/command-framework/apify-command.js'; +import { Flags } from '../../../../src/lib/command-framework/flags.js'; + +const BuiltApifyCommand = ApifyCommand as typeof _BuiltApifyCommand; + +class NoOpCommand extends BuiltApifyCommand { + static override name = 'noop' as const; + static override description = 'Does nothing.'; + async run() { + // no-op + } +} + +class CommandWithFlags extends BuiltApifyCommand { + static override name = 'with-flags' as const; + static override description = 'Has flags.'; + static override flags = { + foo: Flags.string({ description: 'foo flag' }), + }; + + async run() { + // no-op + } +} + +describe('resolveUserAgentForTelemetry()', () => { + test('flag value wins over env value', () => { + expect(resolveUserAgentForTelemetry('flag', 'env')).toBe('flag'); + }); + + test('env value used as fallback', () => { + expect(resolveUserAgentForTelemetry(undefined, 'env')).toBe('env'); + }); + + test('undefined when neither set', () => { + expect(resolveUserAgentForTelemetry(undefined, undefined)).toBeUndefined(); + }); + + test('empty/whitespace flag falls through to env', () => { + expect(resolveUserAgentForTelemetry(' ', 'env')).toBe('env'); + expect(resolveUserAgentForTelemetry('', 'env')).toBe('env'); + }); + + test('empty/whitespace env returns undefined when flag also empty', () => { + expect(resolveUserAgentForTelemetry('', '')).toBeUndefined(); + expect(resolveUserAgentForTelemetry(undefined, ' ')).toBeUndefined(); + }); + + test('trims surrounding whitespace on returned value', () => { + expect(resolveUserAgentForTelemetry(' foo ', undefined)).toBe('foo'); + expect(resolveUserAgentForTelemetry(undefined, ' bar ')).toBe('bar'); + }); + + test('strips ASCII control characters silently', () => { + expect(resolveUserAgentForTelemetry('plugin\u0000/1.0\u0007', undefined)).toBe('plugin/1.0'); + expect(resolveUserAgentForTelemetry('line\nbreak\rthing\ttab', undefined)).toBe('linebreakthingtab'); + expect(resolveUserAgentForTelemetry('\u007f\u0001only-control\u001f', undefined)).toBe('only-control'); + }); + + test('returns undefined when input is only control characters', () => { + expect(resolveUserAgentForTelemetry('\u0000\u0001\u001f', undefined)).toBeUndefined(); + }); + + test('caps length at USER_AGENT_MAX_LENGTH', () => { + const long = 'x'.repeat(USER_AGENT_MAX_LENGTH + 100); + const resolved = resolveUserAgentForTelemetry(long, undefined); + expect(resolved).toBeDefined(); + expect(resolved!.length).toBe(USER_AGENT_MAX_LENGTH); + }); + + test('does not touch values at or below cap', () => { + const exact = 'y'.repeat(USER_AGENT_MAX_LENGTH); + expect(resolveUserAgentForTelemetry(exact, undefined)).toBe(exact); + }); +}); + +describe('--user-agent flag registration', () => { + test('is parseable on commands with no declared flags', () => { + const instance = new NoOpCommand('apify', NoOpCommand.name, NoOpCommand.name); + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + + expect(parserOptions.options).toHaveProperty(USER_AGENT_FLAG_NAME); + + const parsed = parseArgs({ + ...parserOptions, + args: ['--user-agent', 'apify-agent-skills/ultimate-scraper-1.3.0'], + }); + + expect(parsed.values[USER_AGENT_FLAG_NAME]).toBe('apify-agent-skills/ultimate-scraper-1.3.0'); + }); + + test('is parseable alongside command-specific flags', () => { + const instance = new CommandWithFlags('apify', CommandWithFlags.name, CommandWithFlags.name); + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + + const parsed = parseArgs({ + ...parserOptions, + args: ['--foo', 'hello', '--user-agent', 'my-plugin/1.0.0'], + }); + + // command-declared string flags use multiple:true, so node returns an array. + expect(parsed.values.foo).toEqual(['hello']); + expect(parsed.values[USER_AGENT_FLAG_NAME]).toBe('my-plugin/1.0.0'); + }); + + test('is optional — parser does not fail when omitted', () => { + const instance = new NoOpCommand('apify', NoOpCommand.name, NoOpCommand.name); + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + + const parsed = parseArgs({ + ...parserOptions, + args: [], + }); + + expect(parsed.values[USER_AGENT_FLAG_NAME]).toBeUndefined(); + }); + + test('treats --user-agent after `--` separator as a positional, not a flag', () => { + const instance = new NoOpCommand('apify', NoOpCommand.name, NoOpCommand.name); + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + + const parsed = parseArgs({ + ...parserOptions, + args: ['--', '--user-agent', 'forwarded-value'], + }); + + expect(parsed.values[USER_AGENT_FLAG_NAME]).toBeUndefined(); + expect(parsed.positionals).toEqual(['--user-agent', 'forwarded-value']); + }); + + test('is NOT registered under non-apify entrypoints (e.g. actor)', () => { + const instance = new NoOpCommand('actor', NoOpCommand.name, NoOpCommand.name); + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + + expect(parserOptions.options).not.toHaveProperty(USER_AGENT_FLAG_NAME); + + // parseArgs with strict:true should now reject --user-agent on this entrypoint. + expect(() => + parseArgs({ + ...parserOptions, + args: ['--user-agent', 'foo'], + }), + ).toThrow(); + }); +}); + +describe('--user-agent end-to-end telemetry wiring', () => { + const originalEnv = process.env[USER_AGENT_ENV_VAR]; + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env[USER_AGENT_ENV_VAR]; + } else { + process.env[USER_AGENT_ENV_VAR] = originalEnv; + } + }); + + async function runWith( + args: string[], + entrypoint = 'apify', + ): Promise<_BuiltApifyCommand & { telemetryData: Record }> { + const instance = new NoOpCommand(entrypoint, NoOpCommand.name, NoOpCommand.name); + // eslint-disable-next-line dot-notation + instance['skipTelemetry'] = true; + // eslint-disable-next-line dot-notation + const parserOptions = instance['_buildParseArgsOption'](); + const parsed = parseArgs({ ...parserOptions, args }); + // eslint-disable-next-line dot-notation + await instance['_run'](parsed); + return instance as unknown as _BuiltApifyCommand & { telemetryData: Record }; + } + + test('populates telemetryData.userAgent when flag passed', async () => { + delete process.env[USER_AGENT_ENV_VAR]; + const instance = await runWith(['--user-agent', 'skills/scraper-1.0']); + expect(instance.telemetryData.userAgent).toBe('skills/scraper-1.0'); + }); + + test('falls back to APIFY_CLI_USER_AGENT env var', async () => { + process.env[USER_AGENT_ENV_VAR] = 'env-caller/2.0'; + const instance = await runWith([]); + expect(instance.telemetryData.userAgent).toBe('env-caller/2.0'); + }); + + test('flag overrides env var', async () => { + process.env[USER_AGENT_ENV_VAR] = 'env-caller/2.0'; + const instance = await runWith(['--user-agent', 'flag-caller/3.0']); + expect(instance.telemetryData.userAgent).toBe('flag-caller/3.0'); + }); + + test('leaves userAgent unset when neither flag nor env var provided', async () => { + delete process.env[USER_AGENT_ENV_VAR]; + const instance = await runWith([]); + expect(instance.telemetryData.userAgent).toBeUndefined(); + }); + + test('sanitizes oversized flag input before emitting', async () => { + delete process.env[USER_AGENT_ENV_VAR]; + const bigValue = 'a'.repeat(USER_AGENT_MAX_LENGTH + 50); + const instance = await runWith(['--user-agent', bigValue]); + expect((instance.telemetryData.userAgent as string).length).toBe(USER_AGENT_MAX_LENGTH); + }); + + test('ignores env var entirely when running under actor entrypoint', async () => { + process.env[USER_AGENT_ENV_VAR] = 'env-caller/2.0'; + const instance = await runWith([], 'actor'); + expect(instance.telemetryData.userAgent).toBeUndefined(); + }); +});