Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,24 +319,58 @@ brightdata scraper create <url> <description> [options]
| `--name <name>` | Scraper template name (default: `cli-scraper-<timestamp>`) |
| `--deliver-webhook <url>` | Webhook URL for the deliver stub (default: `https://example.com/webhook`) |
| `--timeout <seconds>` | Polling timeout in seconds (default: `600`) |
| `-o, --output <path>` | Write output to file |
| `-o, --output <path>` | Write the JSON envelope to a file (see below) |
| `--json` / `--pretty` | JSON output (raw / indented) |
| `--legacy-output` | Write the pre-v0.3 bare AI-progress payload to `-o` instead of the envelope. Migration only. |
| `--timing` | Show request timing |
| `-k, --api-key <key>` | Override API key |

> **Note:** The scraper is created with a placeholder webhook delivery target (`https://example.com/webhook`). You can reconfigure the actual delivery endpoint in the [Bright Data web UI](https://brightdata.com/cp/scrapers) after creation.

#### Output envelope (`-o create.json`)

Every termination path — success or failure — writes the same JSON envelope shape:

```json
{
"collector_id": "c_mp7x8a9b2c0d1e2f",
"name": "my-product-scraper",
"status": "done",
"completed_steps": ["prepare_intent_analyzer", "planner", "..."],
"view_url": "https://brightdata.com/cp/scrapers/c_mp7x8a9b2c0d1e2f",
"created_at": "2026-05-18T07:28:30Z"
}
```

On failure paths, the envelope adds an `error` field, and `status` reflects the failure category (`ai_trigger_failed`, `failed`, or `poll_failed`). `collector_id` and `view_url` are still present, so you can recover or inspect the half-built scraper.

This makes the documented chain in [recipes.md](https://github.com/brightdata/skills/blob/main/skills/scraper-studio/references/recipes.md) work as written:

```bash
brightdata scraper create https://example.com/product/1 "..." \
-o create.json
COLLECTOR_ID=$(jq -r '.collector_id' create.json)
brightdata scraper run "$COLLECTOR_ID" https://example.com/product/2
```

> The file format follows the `-o` extension, so `.json` is written compact (ideal for `jq`). Use `--pretty` for indented JSON on stdout when you omit `-o`.

Use `--legacy-output` if you have an existing script that depends on the pre-v0.3 bare-progress shape; the flag is supported for one minor version while you migrate.

**Examples**

```bash
# Build a scraper for a product page
brightdata scraper create https://example.com/product/1 \
"Extract title, price, and image URL from this product page"

# Name the scraper and save the full AI output to a file
# Name the scraper and save the envelope to a file
brightdata scraper create https://example.com/product/1 \
"Extract title, price, and image URL from this product page" \
--name my-product-scraper --pretty -o scraper-output.json
--name my-product-scraper -o create.json

# Capture the collector_id for chaining
COLLECTOR_ID=$(jq -r '.collector_id' create.json)

# Use a custom webhook delivery URL
brightdata scraper create https://example.com/product/1 \
Expand Down
232 changes: 229 additions & 3 deletions src/__tests__/commands/scraper.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import {
extract_progress_status,
format_create_summary,
handle_create_scraper,
build_create_envelope,
handle_run_scraper,
build_run_request,
build_run_query,
Expand Down Expand Up @@ -115,7 +116,7 @@ describe('commands/scraper', ()=>{
expect(extract_progress_status({status: 'done'})).toBe('done');
});

it('returns sentinel running token for any non-done status', ()=>{
it('returns sentinel running token for in-progress statuses', ()=>{
expect(extract_progress_status({status: 'running'}))
.toBe('__running__');
expect(extract_progress_status({status: 'queued'}))
Expand All @@ -124,6 +125,16 @@ describe('commands/scraper', ()=>{
.toBe('__running__');
});

it('returns terminal failure statuses verbatim so polling stops',
    ()=>{
        // Unlike in-progress statuses (mapped to the '__running__'
        // sentinel), each failure status must come back unchanged.
        const terminal_failures = ['failed', 'error', 'cancelled'] as const;
        for (const status of terminal_failures)
            expect(extract_progress_status({status})).toBe(status);
    });

it('returns undefined for missing/invalid input', ()=>{
expect(extract_progress_status(null as never)).toBeUndefined();
expect(extract_progress_status({} as never)).toBeUndefined();
Expand All @@ -149,6 +160,212 @@ describe('commands/scraper', ()=>{
});
});

// Unit tests for build_create_envelope: the JSON envelope produced for
// `scraper create`, covering the success shape and the failure statuses.
describe('build_create_envelope', ()=>{
    // Success: every documented field is present and view_url is derived
    // from the collector id.
    it('returns the documented success shape', ()=>{
        const env = build_create_envelope({
            collector_id: 'c_xyz',
            name: 'product-v1',
            status: 'done',
            progress: {status: 'done',
                completed_steps: ['a', 'b', 'c']},
            created_at: '2026-05-18T07:28:30Z',
        });
        expect(env).toEqual({
            collector_id: 'c_xyz',
            name: 'product-v1',
            status: 'done',
            completed_steps: ['a', 'b', 'c'],
            view_url: 'https://brightdata.com/cp/scrapers/c_xyz',
            created_at: '2026-05-18T07:28:30Z',
        });
    });

    // created_at is optional: the key must be absent, not undefined/null.
    it('omits created_at when not known', ()=>{
        const env = build_create_envelope({
            collector_id: 'c_xyz',
            name: 'n',
            status: 'done',
            progress: {status: 'done', completed_steps: []},
        });
        expect(env).not.toHaveProperty('created_at');
    });

    // No progress payload is supplied here, so this pins the default:
    // completed_steps falls back to an empty array rather than being
    // omitted, and the error text is carried through.
    it('records the error message and defaults completed_steps to [] '
        +'on failure', ()=>{
        const env = build_create_envelope({
            collector_id: 'c_xyz',
            name: 'n',
            status: 'ai_trigger_failed',
            error: 'Cannot run more than 3 jobs in parallel',
        });
        expect(env.collector_id).toBe('c_xyz');
        expect(env.status).toBe('ai_trigger_failed');
        expect(env.error).toMatch(/parallel/);
        expect(env.completed_steps).toEqual([]);
        expect(env.view_url)
            .toBe('https://brightdata.com/cp/scrapers/c_xyz');
    });

    // view_url must not depend on the outcome: check it for the success
    // status and for each failure status.
    it('still includes view_url on every termination path', ()=>{
        for (const status of ['done', 'failed', 'ai_trigger_failed',
            'poll_failed'])
        {
            const env = build_create_envelope({
                collector_id: 'c_xyz', name: 'n', status,
            });
            expect(env.view_url)
                .toBe('https://brightdata.com/cp/scrapers/c_xyz');
        }
    });
});

// Tests that handle_create_scraper hands the JSON envelope (or, with
// legacyOutput, the bare progress payload) to the print mock on the
// success path and on each failure path.
describe('handle_create_scraper envelope output', ()=>{
    // Queue the happy path: the first post call resolves with the created
    // collector, the second with the AI trigger response, and polling
    // resolves with a finished progress payload.
    const setup_success = ()=>{
        mocks.post
            .mockResolvedValueOnce({
                id: 'c_xyz', name: 'product-v1',
                created: '2026-05-18T07:28:30Z',
            })
            .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
        mocks.poll_until.mockResolvedValue({
            result: {status: 'done',
                completed_steps: ['a', 'b', 'c']},
            attempts: 4,
        });
    };

    /**
     * Runs `fn` with process.exit and console.error stubbed out.
     *
     * The stubs are restored in `finally`, so a failing assertion inside
     * `fn` cannot leave process.exit or console.error mocked for the
     * tests that run afterwards.
     */
    const with_silenced_exit = async(fn: ()=>Promise<void>)=>{
        const exit = vi.spyOn(process, 'exit')
            .mockImplementation(()=>undefined as never);
        const error = vi.spyOn(console, 'error')
            .mockImplementation(()=>{});
        try {
            await fn();
        } finally {
            exit.mockRestore();
            error.mockRestore();
        }
    };

    it('writes the new envelope to -o on success', async()=>{
        setup_success();
        await handle_create_scraper(
            'https://x.com/p', 'd',
            {output: 'create.json', pretty: true}
        );
        expect(mocks.print).toHaveBeenCalledWith(
            expect.objectContaining({
                collector_id: 'c_xyz',
                name: 'product-v1',
                status: 'done',
                completed_steps: ['a', 'b', 'c'],
                view_url: 'https://brightdata.com/cp/scrapers/c_xyz',
                created_at: '2026-05-18T07:28:30Z',
            }),
            expect.objectContaining({output: 'create.json'})
        );
    });

    it('the documented `jq -r .collector_id` recipe works on the '
        +'envelope', async()=>{
        setup_success();
        await handle_create_scraper('https://x.com/p', 'd',
            {output: 'create.json'});
        // First argument of the first print call is the payload written.
        const written = mocks.print.mock.calls[0][0] as {
            collector_id?: string};
        expect(written.collector_id).toBe('c_xyz');
    });

    it('--legacy-output preserves the bare progress payload',
        async()=>{
            setup_success();
            await handle_create_scraper(
                'https://x.com/p', 'd',
                {output: 'create.json', legacyOutput: true}
            );
            const written = mocks.print.mock.calls[0][0] as {
                collector_id?: unknown; status?: string};
            // Envelope-only keys must be absent; progress status remains.
            expect(written.collector_id).toBeUndefined();
            expect(written).not.toHaveProperty('view_url');
            expect(written.status).toBe('done');
        });

    it('writes the envelope when AI trigger fails (stub-collector '
        +'recovery path), with a single-line error', async()=>{
        // multi-line client error -> envelope keeps the first line.
        mocks.post
            .mockResolvedValueOnce({id: 'c_stub', name: 'n'})
            .mockRejectedValueOnce(new Error(
                'Error: Cannot run more than 3 jobs in parallel\n'
                +'  Status: 429\n  Hint: serialise your launches.'));
        await with_silenced_exit(async()=>{
            await handle_create_scraper(
                'https://x.com/p', 'd',
                {output: 'create.json'}
            );
            expect(mocks.print).toHaveBeenCalledWith(
                expect.objectContaining({
                    collector_id: 'c_stub',
                    status: 'ai_trigger_failed',
                    error: 'Cannot run more than 3 jobs in parallel',
                    view_url: 'https://brightdata.com/cp/scrapers/c_stub',
                }),
                expect.objectContaining({output: 'create.json'})
            );
        });
    });

    it('writes the envelope when poll returns status != done',
        async()=>{
            mocks.post
                .mockResolvedValueOnce({id: 'c_abc', name: 'n'})
                .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
            mocks.poll_until.mockResolvedValue({
                result: {status: 'failed',
                    completed_steps: ['planner']},
                attempts: 2,
            });
            await with_silenced_exit(async()=>{
                await handle_create_scraper(
                    'https://x.com/p', 'd',
                    {output: 'create.json'}
                );
                expect(mocks.print).toHaveBeenCalledWith(
                    expect.objectContaining({
                        collector_id: 'c_abc',
                        status: 'failed',
                        completed_steps: ['planner'],
                        error: expect.stringMatching(
                            /finished with status/),
                    }),
                    expect.objectContaining({output: 'create.json'})
                );
            });
        });

    it('writes the envelope when polling itself throws (timeout '
        +'or network)', async()=>{
        mocks.post
            .mockResolvedValueOnce({id: 'c_abc', name: 'n'})
            .mockResolvedValueOnce({id: 'ia_xyz', queued: false});
        mocks.poll_until.mockRejectedValue(
            new Error(
                'Timeout after 600 seconds waiting for AI generation'));
        await with_silenced_exit(async()=>{
            await handle_create_scraper(
                'https://x.com/p', 'd',
                {output: 'create.json'}
            );
            expect(mocks.print).toHaveBeenCalledWith(
                expect.objectContaining({
                    collector_id: 'c_abc',
                    status: 'poll_failed',
                    error: expect.stringMatching(/Timeout/),
                }),
                expect.objectContaining({output: 'create.json'})
            );
        });
    });
});

describe('handle_create_scraper', ()=>{
it('chains create → trigger → poll and prints JSON in non-TTY',
async()=>{
Expand Down Expand Up @@ -195,7 +412,13 @@ describe('commands/scraper', ()=>{
})
);
expect(mocks.print).toHaveBeenCalledWith(
progress,
expect.objectContaining({
collector_id: 'c_abc',
name: 'cli-scraper-1',
status: 'done',
completed_steps: ['a', 'b'],
view_url: 'https://brightdata.com/cp/scrapers/c_abc',
}),
{json: undefined, pretty: undefined, output: undefined}
);
});
Expand All @@ -209,7 +432,10 @@ describe('commands/scraper', ()=>{
result: progress, attempts: 1});
await handle_create_scraper('https://x.com', 'd', {json: true});
expect(mocks.print).toHaveBeenCalledWith(
progress,
expect.objectContaining({
collector_id: 'c_abc',
status: 'done',
}),
{json: true, pretty: undefined, output: undefined}
);
});
Expand Down
Loading