diff --git a/.babelrc b/.babelrc deleted file mode 100644 index 19e03bb3bf..0000000000 --- a/.babelrc +++ /dev/null @@ -1,5 +0,0 @@ -{ - "parserOpts": { - "allowAwaitOutsideFunction": true - } -} \ No newline at end of file diff --git a/.lintignore b/.eslintignore similarity index 52% rename from .lintignore rename to .eslintignore index 1513435321..dd87e2d73f 100644 --- a/.lintignore +++ b/.eslintignore @@ -1,3 +1,2 @@ node_modules build -docs/.eslintrc.js \ No newline at end of file diff --git a/.eslintrc.json b/.eslintrc.json index 9fa448aab7..3d8de7b084 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -1,24 +1,28 @@ { "extends": [ - "plugin:markdown/recommended", - "@apify" - ], - "parser": "@babel/eslint-parser", - "plugins": [ - "@babel" + "@apify/eslint-config-ts", + "plugin:react/recommended", + "plugin:react-hooks/recommended" ], "parserOptions": { - "configFile": ".babelrc" + "files": ["*.js", "*.jsx", "*.ts", "*.tsx"], + "project": "./tsconfig.eslint.json", + "ecmaFeatures": { + "jsx": true + }, + "ecmaVersion": 2020 }, "env": { - "es6": true, - "node": true + "browser": true + }, + "settings": { + "react": { + "version": "detect" + } }, + "root": true, "rules": { - "import/no-extraneous-dependencies": "off", - "no-unused-vars": "off", - "no-unused-expressions": "off", - "no-undef": "off", - "no-console": "off" + "quote-props": ["error", "consistent"], + "react/prop-types": ["off"] } } diff --git a/.github/workflows/apiary.yml b/.github/workflows/apiary.yml new file mode 100644 index 0000000000..8362bbf6b6 --- /dev/null +++ b/.github/workflows/apiary.yml @@ -0,0 +1,25 @@ +name: apiary.io + +on: + push: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + + - name: Install apiaryio + run: sudo gem install apiaryio + + - name: Upload API docs to Apiary.io + run: ./tools/upload_to_apiary.sh + env: + APIARY_API_KEY: ${{ 
secrets.APIARY_API_KEY }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 48492c2ab6..0000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Build and deploy - -on: [push] - -jobs: - build-and-deploy: - runs-on: ubuntu-20.04 - - strategy: - matrix: - node-version: [14.x] - - steps: - - uses: actions/checkout@master - - name: install apiary - run: sudo gem install apiaryio - - name: Install dependencies - run: npm install - - name: Lint Markdown - run: npm run lint:md:fix - - name: Lint code - run: npm run lint:code:fix - - name: Build documentation - run: npm run build - - name: Upload docs to AWS S3 - run: ./src/scripts/upload_to_s3.sh - env: - AWS_DEFAULT_REGION: us-east-1 - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - name: Upload API docs to Apiary.io - run: ./src/scripts/upload_to_apiary.sh - env: - APIARY_API_KEY: ${{ secrets.APIARY_API_KEY }} diff --git a/.github/workflows/check-pr-title.yml b/.github/workflows/check-pr-title.yml new file mode 100644 index 0000000000..9e5d9243c2 --- /dev/null +++ b/.github/workflows/check-pr-title.yml @@ -0,0 +1,14 @@ +name: Check PR title + +on: + pull_request_target: + types: [ opened, edited, synchronize ] + +jobs: + check_pr_title: + name: 'Check PR title' + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5.0.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/close.yml b/.github/workflows/close.yml deleted file mode 100644 index a1b756a75e..0000000000 --- a/.github/workflows/close.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Close pull request -on: - pull_request: - types: [closed] - -jobs: - close: - name: Close - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@master - - name: remove branch specific artifacts from AWS S3 - run: ./src/scripts/remove_branch_artifacts_from_s3.sh ${{ github.head_ref 
}} - env: - AWS_DEFAULT_REGION: us-east-1 - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000..486bf59325 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,39 @@ +name: docs + +on: + push: + branches: + - master + +jobs: + build: + environment: + name: github-pages + permissions: + contents: write + pages: write + id-token: write + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + + - name: Build docs + run: | + npm ci --force + npm run build + + - name: Set up GitHub Pages + uses: actions/configure-pages@v3 + + - name: Upload GitHub Pages artifact + uses: actions/upload-pages-artifact@v1 + with: + path: ./build + + - name: Deploy artifact to GitHub Pages + uses: actions/deploy-pages@v1 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 08f35dcaf3..0000000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Lint - -on: [push] - -jobs: - lint: - runs-on: ubuntu-latest - - strategy: - matrix: - node-version: [14.x] - - steps: - - uses: actions/checkout@master - - name: Install dependencies - run: npm install - - name: Lint Markdown - run: npm run lint:md:fix - - name: Lint code - run: npm run lint:code:fix diff --git a/.github/workflows/publish-theme.yml b/.github/workflows/publish-theme.yml new file mode 100644 index 0000000000..30eeba96da --- /dev/null +++ b/.github/workflows/publish-theme.yml @@ -0,0 +1,118 @@ +name: publish-theme + +on: + push: + branches: + - master + +jobs: + look_for_change: + if: ${{ !contains(github.event.head_commit.message, '[skip ci]') }} + runs-on: ubuntu-latest + outputs: + theme_changed: ${{ steps.changed-theme-files.outputs.any_changed }} + steps: + - uses: actions/checkout@v3 + with: + 
fetch-depth: 0 + + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + + - name: Check changes in theme + id: changed-theme-files + uses: tj-actions/changed-files@v35 + with: + since_last_remote_commit: "true" + files: | + apify-docs-theme/** + + publish: + needs: look_for_change + if: needs.look_for_change.outputs.theme_changed == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.GH_TOKEN }} + + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + + - name: Setup git user and npm + run: | + git config --global user.name "Apify Release Bot" + git config --global user.email "noreply@apify.com" + + cd $GITHUB_WORKSPACE/apify-docs-theme + echo "access=public" > .npmrc + echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" >> .npmrc + + - name: Bump the theme version + run: | + cd $GITHUB_WORKSPACE/apify-docs-theme + npm version patch + + - name: Deploy theme to npm + run: | + cd $GITHUB_WORKSPACE/apify-docs-theme + npx -y publish-if-not-exists + env: + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + GIT_USER: "barjin:${{ secrets.GH_TOKEN }}" + GH_TOKEN: ${{ secrets.GH_TOKEN }} + + - name: Wait until the new theme version is available on npm + run: | + cd $GITHUB_WORKSPACE/apify-docs-theme + PACKAGE_JSON=$(cat package.json); + PACKAGE_NAME=$(jq -r .name <(echo $PACKAGE_JSON)); + PACKAGE_VER=$(jq -r .version <(echo $PACKAGE_JSON)); + for i in $(seq 1 10); do + EXIT_CODE=0; + npm show $PACKAGE_NAME@$PACKAGE_VER || EXIT_CODE=1; + if [[ $EXIT_CODE -eq 1 ]]; then + echo "The new package version ($PACKAGE_VER) is not yet available, waiting 30 seconds..."; + sleep 30; + continue; + fi; + echo "The new package version ($PACKAGE_VER) is live, proceeding!"; + break; + done; + npm show $PACKAGE_NAME@$PACKAGE_VER # fails if the package is not available, succeeds if it is + + - name: Commit the new theme version + uses: stefanzweifel/git-auto-commit-action@v4 + with: + 
commit_message: 'chore: publish new version of @apify/docs-theme [skip ci]' + file_pattern: 'apify-docs-theme/package*.json' + commit_user_name: Apify Bot + commit_user_email: my-github-actions-bot@example.org + commit_author: Apify Bot + + rebuild-docs: + needs: publish + strategy: + matrix: + include: + - repo: 'apify/apify-sdk-js' + branch: 'master' + - repo: 'apify/apify-sdk-python' + branch: 'docs-v2' + - repo: 'apify/apify-cli' + branch: 'master' + - repo: 'apify/apify-client-js' + branch: 'master' + - repo: 'apify/apify-client-python' + branch: 'docs-v2' + + runs-on: ubuntu-latest + steps: + - env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + gh workflow run docs.yml --repo ${{ matrix.repo }} --ref ${{ matrix.branch }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000..516f85f732 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,65 @@ +name: Test + +on: + push: + branches: [ master, renovate/** ] + pull_request: + branches: [ master ] + +jobs: + build: + name: Docs build + runs-on: ubuntu-latest + steps: + - name: Checkout Source code + uses: actions/checkout@v3 + + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + cache: 'npm' + cache-dependency-path: 'package-lock.json' + + - name: Install Dependencies + run: npm ci --force + + - run: npm run build + + lint_content: + name: Lint markdown content + runs-on: ubuntu-latest + steps: + - name: Checkout Source code + uses: actions/checkout@v3 + + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + cache: 'npm' + cache-dependency-path: 'package-lock.json' + + - name: Install Dependencies + run: npm ci --force + + - run: npm run lint:md + + lint_code: + name: Lint app code + runs-on: ubuntu-latest + steps: + - name: Checkout Source code + uses: actions/checkout@v3 + + - name: Use Node.js 16 + uses: actions/setup-node@v3 + with: + node-version: 16 + cache: 'npm' + 
cache-dependency-path: 'package-lock.json' + + - name: Install Dependencies + run: npm ci --force + + - run: npm run lint:code diff --git a/.gitignore b/.gitignore index a82a429b0b..2b4440aa44 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build +dist build-docs node_modules *.log @@ -11,6 +12,16 @@ coverage logs pids .idea +.vscode +.npmrc yarn.lock tmp -.vscode +jsconfig.json +types +sources/api +sources/typedefs +.history +.docusaurus +tsconfig.tsbuildinfo +.turbo +apify-docs-theme/package-lock.json diff --git a/.nvmrc b/.nvmrc deleted file mode 100644 index e24183e587..0000000000 --- a/.nvmrc +++ /dev/null @@ -1 +0,0 @@ -v14.18.1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..a230d71afd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,116 @@ +# Contributing to Apify Documentation + +## Architecture + +Currently, there are 6 separate projects, each in its own repository: + +- apify-client-js +- apify-client-python +- apify-sdk-js +- apify-sdk-python +- apify-cli +- apify-docs (this repository) + +The main documentation content for Platform docs and Academy is inside the `./sources` directory. Every project repository then has its own docusaurus instance and is available on a URL prefix (used as the `baseUrl` in docusaurus) that is routed via nginx reverse proxy to the main domain. All those docusaurus instances are deployed to GH pages on push. + +We use a shared docusaurus theme published to NPM as `@apify/docs-theme`, which is automatically synced in all the repositories via CI. + +## Local setup + +If you want to work only on the main documentation content, cloning this repository is enough. Once you install and run `npm start`, the main portal will open on . All the links in navbar and footer need to be absolute, and they will use a different hostname, configured to `docs.apify.loc` - to use that, follow the steps below and set up the nginx server. 
+ +Alternatively, you can skip the nginx part and navigate to or manually instead of using the links in the navbar. All relative links should work fine there; the problem with absolute links is only with shared components. The nginx server is needed only for testing the whole setup and mapping all the different ports to a single one. + +Clone all the repositories and check out the `docs-v2` branch (if still not merged to `master`). Then you can start the docusaurus instances in them. + +| repo | branch | port | +|---------------------|---------|------| +| apify-docs | master | 3000 | +| apify-client-js | master | 3001 | +| apify-client-python | docs-v2 | 3002 | +| apify-sdk-js | master | 3003 | +| apify-sdk-python | docs-v2 | 3004 | +| apify-cli | master | 3005 | + +> To run docusaurus on a specific port, use `npm start -- --port XXXX`. + +To route them, you will need an nginx server with the following config: + +```nginx +server { + listen 80; + server_name docs.apify.loc; + location / { + proxy_pass http://localhost:3000; + } + location /api/client/js { + proxy_pass http://localhost:3001; + } + location /api/client/python { + proxy_pass http://localhost:3002; + } + location /sdk/js { + proxy_pass http://localhost:3003; + } + location /sdk/python { + proxy_pass http://localhost:3004; + } + location /cli { + proxy_pass http://localhost:3005; + } +} +``` + +And add a record to `/etc/hosts` to map the docs.apify.loc hostname to localhost: + +```text +127.0.0.1 docs.apify.loc +``` + +## Deployment + +Current nginx deployment config: + +```nginx +server { + listen 80; + server_name docs.apify.com; + location / { + proxy_pass https://apify.github.io/apify-docs/; + } + location /api/client/js { + proxy_pass https://apify.github.io/apify-client-js/; + } + location /api/client/python { + proxy_pass https://apify.github.io/apify-client-python/; + } + location /sdk/js { + proxy_pass https://apify.github.io/apify-sdk-js/; + } + location /sdk/python { + proxy_pass 
https://apify.github.io/apify-sdk-python/; + } + location /cli { + proxy_pass https://apify.github.io/apify-cli/; + } +} +``` + +## @apify/docs-theme + +The `@apify/docs-theme` is a Docusaurus theme package with custom components and styles to be used in all the Apify Docusaurus instances. +Aside from the regular Docusaurus theme interface, it also exports the common parts of the Docusaurus config, such as the navbar contents, url, `og:image`, etc. + +The theme is available on npm as `@apify/docs-theme` and can be installed in any Docusaurus instance by running `npm install @apify/docs-theme`. + +### Publishing the theme + +There is a GitHub Action that automatically publishes the theme to npm whenever any changes are pushed to the `master` branch. However, this only happens if you update the version in the `package.json` file manually - if the current version already exists on npm, the publish will be skipped. + +Additionally, if any changes to the `apify-docs-theme` folder are detected, the GitHub action will invoke docs builds in all the subprojects to make sure that all the pages are using the latest theme version. This is done in the `rebuild-docs` job. This job utilizes a matrix strategy to run the builds in parallel. The actual rebuild is initiated by the `workflow_dispatch` event in the respective repositories. Because of this, the `GITHUB_TOKEN` environment variable has to be replaced by the PAT stored in the `GH_TOKEN` secret - the original token does not have the necessary permissions to trigger the workflows in other repositories. 
+ +## Interesting links + +- +- +- diff --git a/README.md b/README.md index 5e916c0cb3..29f7d63b53 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# Apify documentation +# Apify Documentation -[![Build Status](https://github.com/apify/apify-docs/workflows/Build%20and%20deploy/badge.svg?branch=master)](https://github.com/apify/apify-docs/actions) +[![Check & Release](https://github.com/apify/apify-docs/actions/workflows/test.yml/badge.svg)](https://github.com/apify/apify-docs/actions/workflows/test.yml) ## Intro -This repo is the home of Apify's documentation, which you can find at [docs.apify.com](https://docs.apify.com/). The documentation is written using [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) and all of its source files are located in the [/docs](https://github.com/apifytech/apify-docs/tree/master/docs) directory. +This repository is the home of Apify's documentation, which you can find at [docs.apify.com](https://docs.apify.com/). The documentation is written using [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) and all of its source files are located in the [/sources](https://github.com/apify/apify-docs/tree/master/sources) directory. -Before **push**-ing to GitHub, always execute `npm run build` to make sure everything works. +When you create a PR, the CI will try to build the project, searching for broken links and linting both the application code and markdown files. ## Implementation and style @@ -14,89 +14,78 @@ In addition to the tips below: when in doubt, check existing docs for formatting ### Highlighting -For consistency, use **bold** for highlighting non-code words/phrases. +* For consistency, use **bold** for highlighting non-code words/phrases. +* For inline `code` examples, use **back-ticks** (\` \`). +* For multi-line code examples, use code fences and specify the language. Preferably, specify the title as well. -For inline `code` examples, use **back-ticks** (\` \`). 
+ ```markdown + \`\`\`js title='foo.js' -For multi-line code examples, use code fences and specify the language: + const docsAreCool = require('coolDocs');
+ ...
+ return docsAreCool;
-\`\`\`js + \`\`\` + ``` -const docsAreCool = require('coolDocs');
-...
-return docsAreCool;
+ See [Markdown features](https://docusaurus.io/docs/markdown-features) in the docusaurus docs for more information. -\`\`\` +### Code tabs -### Code example tabs +See [docusaurus documentation for tabs](https://docusaurus.io/docs/markdown-features/tabs) for examples. -When providing code examples in multiple languages, use the below format. +```markdown +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -Next to the code fence, specify the language as **marked-tabs**. - -Place each language's code in a **\\** element. - -> Only use double quotation marks in the code tab headers
-> **Good**: \
-> **Avoid**: \ - -\`\`\`marked-tabs - -\ - -console.log('Some JS code'); - -\ - - -\ - -print('Some python code'); -count = 1 -if count >= 1: - print('Some intended python code'); -print('Some python code on next line'); - -\ - - -\ - -echo "Some bash code" - -\ - -\`\`\` - -PHP examples using guzzle live in separate repository: . + + + This is an apple 🍎 + + + This is an orange 🍊 + + + This is a banana 🍌 + + +``` ### Metadata -Each Markdown file here starts with metadata that define the document's menu title, placement, page description, and paths. For example: +The page metadata can be provided as part of so-called [front-matter](https://docusaurus.io/docs/api/plugins/@docusaurus/plugin-content-docs#markdown-front-matter). -```text +```markdown --- -title: Getting started with Apify Scrapers -menuTitle: Getting started -description: Step-by-step tutorial that will help you get started with all Apify Scrapers. -externalSourceUrl: https://raw.githubusercontent.com/apifytech/actor-scraper/master/docs/build/introduction-tutorial.md -menuWeight: 2.1 -paths: - - scraping/getting-started +id: doc-markdown +title: Docs Markdown Features +hide_title: false +hide_table_of_contents: false +sidebar_label: Markdown +sidebar_position: 3 +pagination_label: Markdown features +custom_edit_url: https://github.com/facebook/docusaurus/edit/main/docs/api-doc-markdown.md +description: How do I find you when I cannot solve this problem +keywords: + - docs + - docusaurus +image: https://i.imgur.com/mErPwqL.png +slug: /myDoc +last_update: + date: 1/1/2000 + author: custom author name --- -``` -The document's `category` and `menuWeight` determine its placement in the docs menu. When inserting a new document, make sure to adjust the `menuWeight` properties of existing documents. +# Markdown Features -The `paths` metadata ensures successful redirects in case articles are renamed. When renaming or moving an article, keep the existing paths and add the new path at the bottom. 
+My Document Markdown content +``` ### Descriptions -Metadata descriptions are super important in making our documentation easy to find using search engines. To maximize our SEO, - -#### Keep the descriptions between [140 and 160 characters in length](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwigg6Og56brAhUNi1wKHULsAHEQFjAGegQIDBAG&url=https%3A%2F%2Fmoz.com%2Flearn%2Fseo%2Fmeta-description&usg=AOvVaw3L26bXhHZTd0wYDM_5xtJ9) whenever possible +Metadata descriptions are super important in making our documentation easy to find using search engines. To maximize our SEO, **keep the descriptions between [140 and 160 characters in length](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwigg6Og56brAhUNi1wKHULsAHEQFjAGegQIDBAG&url=https%3A%2F%2Fmoz.com%2Flearn%2Fseo%2Fmeta-description&usg=AOvVaw3L26bXhHZTd0wYDM_5xtJ9) whenever possible**. -(Of course, when there just isn't enough to say, don't waffle - it's not a university essay.) +> Of course, when there just isn't enough to say, don't waffle - it's not a university essay. GOOD: "Store anything from images and key-value pairs to structured output data. Learn how to access and manage your stored data from the Apify platform or via API." @@ -130,34 +119,6 @@ GOOD: "Learn how to make your actor available to the public or keep it private. AVOID: "Description of the processes regarding the optimizing and preparing for publishing of one's actor in Apify Store." -### Assets - -When adding new images to articles, first compress them using [tinypng.com](https://tinypng.com). This will help our docs pages load faster. - -Avoid HTML in assets or links. - -You can place assets (images for example) in any directory. 
If you want to obtain a URL, use the following tag: - -```text -{{@asset actor/images/run-log-2.webp}} -``` - -So to include this image in Markdown use: - -```text -![Apify actor run log]({{@asset actor/images/run-log-2.webp}}) -``` - -### Linking - -For links, we use a similar syntax as for assets: - -```text -{{@link actor/source_code.md#source-git-repo}} -``` - -Avoid using HTML. - ## Docs homepage The homepage menu card items are in the `docs/homepage_content.json` file. @@ -184,8 +145,7 @@ For example: }, ``` -Note: -In JSON, all entries except booleans (`true/false`) and numbers need to be in double quote marks (""). +> In JSON, all entries except booleans (`true/false`) and numbers need to be in double quote marks (""). Over time, we should track which items are useful and which don't get any traffic. Also, as Apify Docs expand, we may need to add more cards and update which articles we link to. @@ -193,46 +153,27 @@ Over time, we should track which items are useful and which don't get any traffi On each commit to the `master` branch of this repository, a new version of the Apify documentation gets built and deployed to the appropriate subdomain. -Every other branch can be viewed on its respective subdomain using the `?version=BRANCH_NAME` parameter, e.g. [https://docs.apify.com?version=feature/new-section]. - -Keep in mind that there might be about 2 minute delay before updated documentation gets online (1 minute Github actions build + 1 minute update interval of the website). - -> Please don't use a `+` sign in your branch name, as the deployment will not work. Stick to dashes and slashes. - ## Linting -The **apify-docs** repo contains both Markdown and JavaScript files. Several Markdown files, such as [dataset docs]({{@link docs/storage/dataset.md}}) contain code examples. Because of this, we have two commands for linting. +The **apify-docs** repo contains both Markdown and JavaScript/TypeScript files. 
We have two commands for linting them: -* **npm run lint:md** / **npm run lint:md:fix** checks the **.md** files. -* **npm run lint:code** / **npm run lint:code:fix** checks both the code examples within Markdown files and the build scripts. +* `npm run lint:md` and `npm run lint:md:fix` checks the `*.md` files. +* `npm run lint:code` and `npm run lint:code:fix` checks the `*.{js,ts}` files. For Markdown, we use the [markdownlint](https://github.com/DavidAnson/markdownlint) package, which also has a handy VSCode [extension](https://marketplace.visualstudio.com/items?itemName=DavidAnson.vscode-markdownlint). For JavaScript, we use the [ESLint Markdown plugin](https://github.com/eslint/eslint-plugin-markdown). - ## API docs -The `docs/api_v2` directory contains the source file for the -API reference () hosted on Apiary. -The build script contained in the **apify-docs/src** folder automatically uploads the API docs to Apiary during the web deployment process. +The `./sources/platform/api_v2` directory contains the source file for the API reference () hosted on Apiary. The build script contained in the `./tools` folder automatically uploads the API docs to Apiary during the web deployment process. ### Local testing 1. Install Apiary gem `gem install apiaryio` -2. After that, you can open the generated doc with the command: `apiary preview --path="./content/docs/api_v2/api_v2_reference.apib"` +2. After that, you can open the generated doc with the + command: `apiary preview --path="./content/docs/api_v2/api_v2_reference.apib"` ### Test After updating the API docs, you should ALWAYS log in to Apiary, analyze the document and make sure there are **no warnings**! - -## External docs - -You will find most of the documentation in this repository. - -There are, however, a few exceptions, shown below. To make changes to them, you'll need to clone those repos and make your pull requests to them. 
When updating the tutorials in the **apify/actor-scraper** repo, don't forget to execute `npm run build` before pushing your code to GitHub. - -* Tutorials for Apify's scrapers (**docs/scraping** directory) are in the [**apify/actor-scraper**](https://github.com/apify/actor-scraper) repository. -* Apify's API client for JavaScript documentation is in the [**apify-docs/apify-client-js**](https://github.com/apify/apify-client-js) repository. -* Apify's API client for Python documentation is in the [**apify-docs/apify-client-python**](https://github.com/apify/apify-client-python) repository. -* Docs for the command-line interface are in the [**apify/apify-cli**](https://github.com/apify/apify-cli) repo. diff --git a/apify-docs-theme/package.json b/apify-docs-theme/package.json new file mode 100644 index 0000000000..98f7ce2dbd --- /dev/null +++ b/apify-docs-theme/package.json @@ -0,0 +1,27 @@ +{ + "name": "@apify/docs-theme", + "version": "1.0.58", + "description": "", + "main": "./src/index.js", + "files": [ + "src", + "types", + "static" + ], + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "build": "echo 'Building @apify/docs-theme!'" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "@docusaurus/theme-common": "^2.3.1", + "axios": "^1.3.1", + "babel-loader": "^9.1.0", + "prism-react-renderer": "^1.3.5" + }, + "peerDependencies": { + "react": "*" + } +} diff --git a/apify-docs-theme/src/config.js b/apify-docs-theme/src/config.js new file mode 100644 index 0000000000..248d73ac17 --- /dev/null +++ b/apify-docs-theme/src/config.js @@ -0,0 +1,249 @@ +/* eslint-disable global-require */ +const absoluteUrl = process.env.LOCAL ? 
'http://docs.apify.loc' : 'https://docs.apify.com'; + +const themeConfig = ({ + docs: { + versionPersistence: 'localStorage', + sidebar: { + hideable: true, + }, + }, + navbar: { + title: 'Apify Docs', + logo: { + src: 'img/apify_sdk.svg', + srcDark: 'img/apify_sdk_white.svg', + href: absoluteUrl, + target: '_self', + }, + items: [ + { + label: 'Academy', + href: `${absoluteUrl}/academy`, + activeBasePath: 'academy', + position: 'left', + target: '_self', + rel: 'dofollow', + }, + { + label: 'Platform', + href: `${absoluteUrl}/platform`, + className: 'navbar__active', + activeBasePath: 'platform', + position: 'left', + target: '_self', + rel: 'dofollow', + }, + { + label: 'API', + type: 'dropdown', + activeBasePath: 'api', + position: 'left', + items: [ + { + label: 'Reference', + href: `${absoluteUrl}/api/v2/`, + target: '_self', + rel: 'dofollow', + }, + { + label: 'Client for JavaScript', + href: `${absoluteUrl}/api/client/js/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + { + label: 'Client for Python', + href: `${absoluteUrl}/api/client/python/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + label: 'SDK', + type: 'dropdown', + activeBasePath: 'sdk', + position: 'left', + items: [ + { + label: 'SDK for JavaScript', + href: `${absoluteUrl}/sdk/js/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + { + html: 'SDK for Python beta', + href: `${absoluteUrl}/sdk/python/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + label: 'CLI', + href: `${absoluteUrl}/cli/`, // we need a trailing slash here, we'd get redirected there anyway + position: 'left', + target: '_self', + rel: 'dofollow', + }, + { + label: 'Open source', + type: 'dropdown', + position: 'left', + className: 
'navbar__item', + items: [ + { + label: 'Crawlee', + href: 'https://crawlee.dev', + rel: 'dofollow', + }, + { + label: 'Got Scraping', + href: 'https://github.com/apify/got-scraping', + }, + { + label: 'Fingerprint Suite', + href: 'https://github.com/apify/fingerprint-suite', + }, + { + label: 'See Apify on GitHub', + href: 'https://github.com/apify', + }, + ], + }, + { + href: 'https://github.com/apify', + label: 'GitHub', + title: 'See Apify on GitHub', + position: 'right', + className: 'icon', + }, + { + href: 'https://discord.com/invite/jyEM2PRvMU', + label: 'Discord', + title: 'Chat on Discord', + position: 'right', + className: 'icon', + }, + ], + }, + colorMode: { + defaultMode: 'light', + disableSwitch: false, + respectPrefersColorScheme: true, + }, + prism: { + defaultLanguage: 'typescript', + theme: require('prism-react-renderer/themes/github'), + darkTheme: require('prism-react-renderer/themes/dracula'), + additionalLanguages: ['docker', 'log'], + }, + image: 'img/docs-og.png', + footer: { + links: [ + { + title: 'Learn', + items: [ + { + label: 'Academy', + href: `${absoluteUrl}/academy`, + target: '_self', + rel: 'dofollow', + }, + { + label: 'Platform', + href: `${absoluteUrl}/platform`, + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + title: 'API', + items: [ + { + label: 'Reference', + href: `${absoluteUrl}/api/v2/`, + target: '_self', + rel: 'dofollow', + }, + { + label: 'Client for JavaScript', + href: `${absoluteUrl}/api/client/js/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + { + label: 'Client for Python', + href: `${absoluteUrl}/api/client/python/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + title: 'SDK', + items: [ + { + label: 'SDK for JavaScript', + href: `${absoluteUrl}/sdk/js/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 
'dofollow', + }, + { + label: 'SDK for Python (beta)', + href: `${absoluteUrl}/sdk/python/`, // we need a trailing slash here, we'd get redirected there anyway + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + title: 'Other', + items: [ + { + label: 'CLI', + href: `${absoluteUrl}/cli/`, // we need a trailing slash here, we'd get redirected there anyway + position: 'left', + target: '_self', + rel: 'dofollow', + }, + ], + }, + { + title: 'More', + items: [ + { + label: 'Crawlee', + to: 'https://crawlee.dev', + rel: 'dofollow', + }, + { + label: 'GitHub', + href: 'https://github.com/apify', + }, + ], + }, + ], + logo: { + src: 'img/apify_logo.svg', + href: '/', + width: '60px', + height: '60px', + }, + }, + algolia: { + appId: 'N8EOCSBQGH', + apiKey: 'b43e67a96ed18c7f63f5fd965906a96d', // search only (public) API key + indexName: 'apify_sdk', + algoliaOptions: { + facetFilters: ['version:VERSION'], + }, + }, +}); + +module.exports = { + themeConfig, + absoluteUrl, +}; diff --git a/apify-docs-theme/src/index.js b/apify-docs-theme/src/index.js new file mode 100644 index 0000000000..de371e1ccb --- /dev/null +++ b/apify-docs-theme/src/index.js @@ -0,0 +1,7 @@ +const config = require('./config.js'); +const { theme } = require('./theme.js'); + +module.exports = { + default: theme, + config, +}; diff --git a/apify-docs-theme/src/markdown.js b/apify-docs-theme/src/markdown.js new file mode 100644 index 0000000000..087e2b3fa8 --- /dev/null +++ b/apify-docs-theme/src/markdown.js @@ -0,0 +1,32 @@ +function updateChangelog(changelog) { + changelog = addHeader(changelog); + changelog = pushHeadings(changelog); + changelog = linkUsers(changelog); + changelog = linkPRs(changelog); + return changelog; +} + +function addHeader(changelog, header = 'Changelog') { + return `--- +title: ${header} +sidebar_label: ${header} +toc_max_heading_level: 2 +--- +${changelog}`; +} + +function pushHeadings(changelog) { + return changelog.replaceAll(/\n#[^#]/g, '\n## '); +} + +function 
linkUsers(changelog) { + return changelog.replaceAll(/^\s*[^#].*@([a-zA-Z0-9-]+)/g, '[@$1](https://github.com/$1)'); +} + +function linkPRs(changelog) { + return changelog.replaceAll(/(((https?:\/\/)?(www.)?)?github.com\/[^\s]*?\/pull\/([0-9]+))/g, '[#$5]($1)'); +} + +module.exports = { + updateChangelog, +}; diff --git a/apify-docs-theme/src/theme.js b/apify-docs-theme/src/theme.js new file mode 100644 index 0000000000..163165eff4 --- /dev/null +++ b/apify-docs-theme/src/theme.js @@ -0,0 +1,133 @@ +const path = require('path'); +const fs = require('fs'); +const axios = require('axios'); +const { updateChangelog } = require('./markdown'); + +function findPathInParent(endPath) { + let parentPath = __dirname; + while (parentPath !== path.join(parentPath, '..')) { + const filePath = path.join(parentPath, endPath); + if (fs.existsSync(filePath)) return filePath; + parentPath = path.join(parentPath, '..'); + } + const filePath = path.join(parentPath, endPath); + if (fs.existsSync(filePath)) return filePath; + + return false; +} + +function findPathInParentOrThrow(endPath) { + const filePath = findPathInParent(endPath); + if (!filePath) throw new Error(`Could not find ${endPath} in any parent directory`); + return filePath; +} + +async function copyChangelogFromReleases(paths, repo) { + const response = await axios.get(`https://api.github.com/repos/${repo}/releases`); + const releases = response.data; + + let markdown = ''; + if (!Array.isArray(releases) || releases.length === 0) return; + + releases.forEach((release) => { + markdown += release.tag_name + ? 
`## [${release.name}](https://github.com/${repo}/releases/tag/${release.tag_name})\n` + : `## ${release.name}\n`; + markdown += `${release.body.replaceAll(/(^#|\n#)/g, '###')}\n`; + }); + + paths.forEach((p) => { + fs.writeFileSync(`${p}/changelog.md`, updateChangelog(markdown)); + }); +} + +function copyChangelogFromRoot(paths) { + const changelogPath = findPathInParentOrThrow('CHANGELOG.md'); + + for (const docsPath of paths) { + if (fs.existsSync(path.join(docsPath, 'changelog.md')) && fs.statSync( + path.join(docsPath, 'changelog.md')).mtime >= fs.statSync(changelogPath).mtime) continue; + const changelog = fs.readFileSync(changelogPath, 'utf-8'); + fs.writeFileSync(`${docsPath}/changelog.md`, updateChangelog(changelog)); + } +} + +function theme( + context, + options, +) { + return { + name: '@apify/docs-theme', + getPathsToWatch() { + return ['./pages']; + }, + getThemePath() { + return '../src/theme'; + }, + getTypeScriptThemePath() { + return '../src/theme'; + }, + async loadContent() { + try { + const versioned = findPathInParent('website/versioned_docs'); + const pathsToCopyChangelog = [ + findPathInParentOrThrow('docs'), + ...(versioned + ? fs.readdirSync(versioned).map((version) => path.join(versioned, version)) + : [] + ), + ]; + + for (const p of pathsToCopyChangelog) { + // the changelog page has to exist for the sidebar to work - async loadContent() is (apparently) not awaited for by sidebar + if (fs.existsSync(path.join(p, 'changelog.md'))) continue; + fs.writeFileSync(`${p}/changelog.md`, `--- +title: Changelog +sidebar_label: Changelog +--- +It seems that the changelog is not available. +This either means that your Docusaurus setup is misconfigured, or that your GitHub repository contains no releases yet. 
+`); + } + + if (options.changelogFromRoot) { + copyChangelogFromRoot(pathsToCopyChangelog); + } else { + await copyChangelogFromReleases(pathsToCopyChangelog, `${context.siteConfig.organizationName}/${context.siteConfig.projectName}`); + } + } catch (e) { + // eslint-disable-next-line no-console + console.warn(`Changelog page could not be initialized: ${e.message}`); + } + }, + async contentLoaded({ actions }) { + const { setGlobalData } = actions; + setGlobalData({ + options, + }); + }, + getClientModules() { + return [ + require.resolve('./theme/custom.css'), + ]; + }, + configureWebpack() { + return { + module: { + rules: [ + { + test: /(@apify\/|apify-)docs-theme\/src\/(theme|pages)\/.*?\.jsx?$/, + use: { + loader: 'babel-loader', + }, + }, + ], + }, + }; + }, + }; +} + +module.exports = { + theme, +}; diff --git a/apify-docs-theme/src/theme/ColorModeToggle/index.jsx b/apify-docs-theme/src/theme/ColorModeToggle/index.jsx new file mode 100644 index 0000000000..e2436c0989 --- /dev/null +++ b/apify-docs-theme/src/theme/ColorModeToggle/index.jsx @@ -0,0 +1,64 @@ +import React from 'react'; +import clsx from 'clsx'; +import useIsBrowser from '@docusaurus/useIsBrowser'; +import { translate } from '@docusaurus/Translate'; +import IconLightMode from '../Icon/LightMode'; +import IconDarkMode from '../Icon/DarkMode'; +import styles from './styles.module.css'; + +function ColorModeToggle({ + className, + value, + onChange, +}) { + const isBrowser = useIsBrowser(); + const title = translate( + { + message: 'Switch between dark and light mode (currently {mode})', + id: 'theme.colorToggle.ariaLabel', + description: 'The ARIA label for the navbar color mode toggle', + }, + { + mode: + value === 'dark' + ? 
translate({ + message: 'dark mode', + id: 'theme.colorToggle.ariaLabel.mode.dark', + description: 'The name for the dark color mode', + }) + : translate({ + message: 'light mode', + id: 'theme.colorToggle.ariaLabel.mode.light', + description: 'The name for the light color mode', + }), + }, + ); + return ( +
+ +
+ ); +} + +export default React.memo(ColorModeToggle); diff --git a/apify-docs-theme/src/theme/ColorModeToggle/styles.module.css b/apify-docs-theme/src/theme/ColorModeToggle/styles.module.css new file mode 100644 index 0000000000..3a5fd98a6a --- /dev/null +++ b/apify-docs-theme/src/theme/ColorModeToggle/styles.module.css @@ -0,0 +1,53 @@ +.toggle { + padding: 3px; +} + +.toggleButton { + width: 52px; + height: 26px; + background: #cfd4eb; + border-radius: 160px; + display: flex; + align-items: center; + transition: all var(--ifm-transition-fast); +} + +[data-theme='dark'] .toggleButton { + background: #585e76; +} + +.toggleButton span { + -webkit-tap-highlight-color: transparent; + align-items: center; + display: flex; + justify-content: center; + width: 20px; + height: 20px; + border-radius: 50%; + background: #fff; + vertical-align: middle; + margin: 3px; + position: relative; + transition: all var(--ifm-transition-fast); + left: 0; + color: #585e76; +} + +[data-theme='dark'] .toggleButton span { + background: #1a1b23; + color: #b3b8d2; + left: 25px; +} + +.toggleButton:hover span { + background: var(--ifm-color-emphasis-200); +} + +[data-theme='light'] .darkToggleIcon, +[data-theme='dark'] .lightToggleIcon { + display: none; +} + +.toggleButtonDisabled { + cursor: not-allowed; +} diff --git a/apify-docs-theme/src/theme/DocSidebarItem/Link/index.jsx b/apify-docs-theme/src/theme/DocSidebarItem/Link/index.jsx new file mode 100644 index 0000000000..48ea277f42 --- /dev/null +++ b/apify-docs-theme/src/theme/DocSidebarItem/Link/index.jsx @@ -0,0 +1,62 @@ +import React from 'react'; +import clsx from 'clsx'; +import { ThemeClassNames } from '@docusaurus/theme-common'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import { isActiveSidebarItem } from '@docusaurus/theme-common/internal'; +import Link from '@docusaurus/Link'; +import isInternalUrl from '@docusaurus/isInternalUrl'; +import IconExternalLink from '@theme/Icon/ExternalLink'; +import 
styles from './styles.module.css'; + +export default function DocSidebarItemLink({ + item, + onItemClick, + activePath, + level, + index, + ...props +}) { + const { + href, + label, + className, + autoAddBaseUrl, + } = item; + const isActive = isActiveSidebarItem(item, activePath); + const isInternalLink = isInternalUrl(href); + const baseUrl = useDocusaurusContext().siteConfig.url; + + if (href.startsWith(baseUrl)) { + props.target = '_self'; + } + + return ( +
  • + onItemClick(item) : undefined, + })} + {...props}> + {label} + {!isInternalLink && } + +
  • + ); +} diff --git a/apify-docs-theme/src/theme/DocSidebarItem/Link/styles.module.css b/apify-docs-theme/src/theme/DocSidebarItem/Link/styles.module.css new file mode 100644 index 0000000000..acba7f3f2f --- /dev/null +++ b/apify-docs-theme/src/theme/DocSidebarItem/Link/styles.module.css @@ -0,0 +1,3 @@ +.menuExternalLink { + align-items: center; +} diff --git a/apify-docs-theme/src/theme/Footer/LinkItem/index.js b/apify-docs-theme/src/theme/Footer/LinkItem/index.js new file mode 100644 index 0000000000..c4c37ea92f --- /dev/null +++ b/apify-docs-theme/src/theme/Footer/LinkItem/index.js @@ -0,0 +1,40 @@ +import React from 'react'; +import Link from '@docusaurus/Link'; +import useBaseUrl from '@docusaurus/useBaseUrl'; +import isInternalUrl_ from '@docusaurus/isInternalUrl'; +import IconExternalLink from '@theme/Icon/ExternalLink'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; + +export default function FooterLinkItem({ item }) { + const { + to, + href, + label, + prependBaseUrlToHref, + ...props + } = item; + const toUrl = useBaseUrl(to); + const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true }); + const { siteConfig } = useDocusaurusContext(); + const isInternalUrl = (url) => { + if (url.startsWith(siteConfig.url)) { + return true; + } + return isInternalUrl_(url); + }; + return ( + + {label} + {href && !isInternalUrl(href) && } + + ); +} diff --git a/apify-docs-theme/src/theme/Footer/index.jsx b/apify-docs-theme/src/theme/Footer/index.jsx new file mode 100644 index 0000000000..c3a9c3ec08 --- /dev/null +++ b/apify-docs-theme/src/theme/Footer/index.jsx @@ -0,0 +1,52 @@ +import React from 'react'; +import clsx from 'clsx'; +// eslint-disable-next-line import/no-extraneous-dependencies +import { useThemeConfig } from '@docusaurus/theme-common'; +import LinkItem from '@theme/Footer/LinkItem'; +import styles from './index.module.css'; + +function FooterLinksColumn({ column }) { + return ( + <> +
    {column.title}
    +
      + {column.items.map((item, i) => ( +
    • + +
    • + ))} +
    + + ); +} + +function Footer() { + const { footer } = useThemeConfig(); + if (!footer) { + return null; + } + const { links, style } = footer; + return ( +
    +
    +
    + { links.map((column, i) => ( +
    + +
    + )) + } +
    +
    +
    + + + +
    +
    +
    +
    + ); +} + +export default React.memo(Footer); diff --git a/apify-docs-theme/src/theme/Footer/index.module.css b/apify-docs-theme/src/theme/Footer/index.module.css new file mode 100644 index 0000000000..d24f34cb12 --- /dev/null +++ b/apify-docs-theme/src/theme/Footer/index.module.css @@ -0,0 +1,57 @@ +.footer { + padding-top: 64px; +} + +.builtBy { + color: #b3b8d2; +} + +.builtBy svg { + margin-left: 10px; + width: 90px; + height: 24px; +} + +.freeAndOpenSource { + color: #b3b8d2; +} + +.alignMiddle { + vertical-align: middle; + display: inline-block; +} + +.freeAndOpenSource svg { + margin-right: 10px; +} + +.freeAndOpenSource svg path { + fill: #b3b8d2 !important; +} + +.footer .footer__item svg path { + fill: #6f7490; +} + +.footerTitle { + font-family: 'Lota Grotesque', sans-serif; + font-weight: 600; + font-size: 16px; + line-height: 20px; + letter-spacing: 0.1em; + text-transform: uppercase; + color: #8d92af; + margin-bottom: 20px; +} + +.footerLogo { + display: inline-block; + width: 90px; + height: 24px; + background-image: url('/img/footer-apify-logo-black.svg'); + background-repeat: no-repeat; +} + +html[data-theme='dark'] .footerLogo { + background-image: url('/img/footer-apify-logo-white.svg'); +} diff --git a/apify-docs-theme/src/theme/Icon/DarkMode/index.jsx b/apify-docs-theme/src/theme/Icon/DarkMode/index.jsx new file mode 100644 index 0000000000..b3ddaca146 --- /dev/null +++ b/apify-docs-theme/src/theme/Icon/DarkMode/index.jsx @@ -0,0 +1,12 @@ +/* eslint-disable max-len */ +import React from 'react'; + +function IconDarkMode(props) { + return ( + + + + ); +} + +export default React.memo(IconDarkMode); diff --git a/apify-docs-theme/src/theme/Icon/LightMode/index.jsx b/apify-docs-theme/src/theme/Icon/LightMode/index.jsx new file mode 100644 index 0000000000..c91e6a1095 --- /dev/null +++ b/apify-docs-theme/src/theme/Icon/LightMode/index.jsx @@ -0,0 +1,29 @@ +/* eslint-disable max-len */ +import React from 'react'; + +function IconLightMode(props) { 
+ return ( + + + + + + + + + + + + + + + + + + + ); +} + +export default React.memo(IconLightMode); diff --git a/apify-docs-theme/src/theme/Layout/index.jsx b/apify-docs-theme/src/theme/Layout/index.jsx new file mode 100644 index 0000000000..5d04d76d56 --- /dev/null +++ b/apify-docs-theme/src/theme/Layout/index.jsx @@ -0,0 +1,18 @@ +import React from 'react'; +// cannot use any of the theme aliases here as it causes a circular dependency :( ideas welcome +import Layout from '@docusaurus/theme-classic/lib/theme/Layout/index'; +import { usePluginData } from '@docusaurus/useGlobalData'; + +export default function LayoutWrapper(props) { + const { options: { subNavbar } } = usePluginData('@apify/docs-theme'); + return ( +
    + +
    + ); +} diff --git a/apify-docs-theme/src/theme/Logo/index.js b/apify-docs-theme/src/theme/Logo/index.js new file mode 100644 index 0000000000..dab8ab6b37 --- /dev/null +++ b/apify-docs-theme/src/theme/Logo/index.js @@ -0,0 +1,61 @@ +import React from 'react'; +import Link from '@docusaurus/Link'; +import useBaseUrl from '@docusaurus/useBaseUrl'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import { useThemeConfig } from '@docusaurus/theme-common'; +import ThemedImage from '@theme/ThemedImage'; + +function LogoThemedImage({ logo, alt, imageClassName }) { + const sources = { + light: useBaseUrl(logo.src), + dark: useBaseUrl(logo.srcDark || logo.src), + }; + const themedImage = ( + + ); + // Is this extra div really necessary? + // introduced in https://github.com/facebook/docusaurus/pull/5666 + return imageClassName ? ( +
    {themedImage}
    + ) : ( + themedImage + ); +} +export default function Logo(props) { + const { + siteConfig: { title }, + } = useDocusaurusContext(); + const { + navbar: { title: navbarTitle, logo }, + } = useThemeConfig(); + const { imageClassName, titleClassName, ...propsRest } = props; + const logoLink = useBaseUrl(logo?.href || '/'); + // If visible title is shown, fallback alt text should be + // an empty string to mark the logo as decorative. + const fallbackAlt = navbarTitle ? '' : title; + // Use logo alt text if provided (including empty string), + // and provide a sensible fallback otherwise. + const alt = logo?.alt ?? fallbackAlt; + return ( + + {logo && ( + + )} + {!logo ? {navbarTitle} : null} + + ); +} diff --git a/apify-docs-theme/src/theme/Navbar/Content/index.jsx b/apify-docs-theme/src/theme/Navbar/Content/index.jsx new file mode 100644 index 0000000000..f0df46a67e --- /dev/null +++ b/apify-docs-theme/src/theme/Navbar/Content/index.jsx @@ -0,0 +1,92 @@ +import React from 'react'; +import { useThemeConfig } from '@docusaurus/theme-common'; +import { usePluginData } from '@docusaurus/useGlobalData'; +import { + splitNavbarItems, +} from '@docusaurus/theme-common/internal'; +import NavbarLogo from '@theme/Navbar/Logo'; +import NavbarItem from '@theme/NavbarItem'; +import NavbarColorModeToggle from '@theme/Navbar/ColorModeToggle'; +import NavbarSearch from '@theme/Navbar/Search'; +import SearchBar from '@theme/SearchBar'; +import NavbarMobileSidebarToggle from '@theme/Navbar/MobileSidebar/Toggle'; +import styles from './styles.module.css'; + +function NavbarItems({ items }) { + return ( + <> + {items.map((item, i) => )} + + ); +} + +function NavbarContentLayout({ + left, + right, +}) { + return ( +
    +
    +
    {left}
    +
    {right}
    +
    +
    + ); +} + +function SubNavbar() { + const { options: { subNavbar } } = usePluginData('@apify/docs-theme'); + return ( + subNavbar ? ( +
    +
    +
    +
    + +
    + +
    +
    +
    + ) : null + ); +} + +export default function NavbarContent() { + const { navbar: { items } } = useThemeConfig(); + const [leftItems, rightItems] = splitNavbarItems(items); + const searchBarItem = items.find((item) => item.type === 'search'); + return ( +
    + + + + + + } + right={ + <> + + + {!searchBarItem && ( + + + + )} + + } + /> + +
    + ); +} diff --git a/apify-docs-theme/src/theme/Navbar/Content/styles.module.css b/apify-docs-theme/src/theme/Navbar/Content/styles.module.css new file mode 100644 index 0000000000..4c9471e109 --- /dev/null +++ b/apify-docs-theme/src/theme/Navbar/Content/styles.module.css @@ -0,0 +1,8 @@ +/* +Hide color mode toggle in small viewports + */ +@media (max-width: 996px) { + .colorModeToggle { + display: none; + } +} diff --git a/apify-docs-theme/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.jsx b/apify-docs-theme/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.jsx new file mode 100644 index 0000000000..940db6c51a --- /dev/null +++ b/apify-docs-theme/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.jsx @@ -0,0 +1,58 @@ +import React from 'react'; +import { useThemeConfig } from '@docusaurus/theme-common'; +import useBaseUrl from '@docusaurus/useBaseUrl'; +import { usePluginData } from '@docusaurus/useGlobalData'; +import NavbarItem from '@theme/NavbarItem'; + +function useNavbarItems() { + // TODO temporary casting until ThemeConfig type is improved + return useThemeConfig().navbar.items; +} +// The primary menu displays the navbar items +export default function NavbarMobilePrimaryMenu() { + // const mobileSidebar = useNavbarMobileSidebar(); + // TODO how can the order be defined for mobile? + // Should we allow providing a different list of items? + const items = useNavbarItems(); + const baseUrl = useBaseUrl('/'); + const { options: { subNavbar } } = usePluginData('@apify/docs-theme'); + return ( + <> + { + subNavbar ? <> +
      + + {subNavbar.items.map((item, i) => ( + + ))} +
    + : null + } +
      + + {items.map((item, i) => ( + + ))} +
    + + ); +} diff --git a/apify-docs-theme/src/theme/NavbarItem/ComponentTypes.jsx b/apify-docs-theme/src/theme/NavbarItem/ComponentTypes.jsx new file mode 100644 index 0000000000..2fbde15152 --- /dev/null +++ b/apify-docs-theme/src/theme/NavbarItem/ComponentTypes.jsx @@ -0,0 +1,101 @@ +import DefaultNavbarItem from '@theme/NavbarItem/DefaultNavbarItem'; +import DropdownNavbarItem from '@theme/NavbarItem/DropdownNavbarItem'; +import LocaleDropdownNavbarItem from '@theme/NavbarItem/LocaleDropdownNavbarItem'; +import SearchNavbarItem from '@theme/NavbarItem/SearchNavbarItem'; +import HtmlNavbarItem from '@theme/NavbarItem/HtmlNavbarItem'; +import DocSidebarNavbarItem from '@theme/NavbarItem/DocSidebarNavbarItem'; +import DocsVersionNavbarItem from '@theme/NavbarItem/DocsVersionNavbarItem'; +import DocsVersionDropdownNavbarItem from '@theme/NavbarItem/DocsVersionDropdownNavbarItem'; +import { useDocsVersion, useLayoutDoc } from '@docusaurus/theme-common/internal'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import React from 'react'; + +// const versions = require('../../../versions.json'); +// +// const stable = versions[0]; +const stable = '1'; + +function DocNavbarItem({ + docId, + label: staticLabel, + docsPluginId, + ...props +}) { + const doc = useLayoutDoc(docId, docsPluginId); + // Draft items are not displayed in the navbar. 
+ if (doc === null) { + return null; + } + return ( + + ); +} + +function ApiNavbarItem(ctx) { + let version = {}; + + try { + // eslint-disable-next-line react-hooks/rules-of-hooks + version = useDocsVersion(); + } catch { + version.version = stable; + } + + const { siteConfig } = useDocusaurusContext(); + + if (siteConfig.presets[0][1].docs.disableVersioning || version.version === stable) { + return ( + + ); + } + + // skip changelog button for older versions + if (+version.version < 3 && ctx.className === 'changelog') { + return null; + } + + // link directly to the old API docs under /docs/x.x/api + if (+version.version < 3) { + return ( + + ); + } + + return ( + + ); +} + +const ComponentTypes = { + 'default': DefaultNavbarItem, + 'localeDropdown': LocaleDropdownNavbarItem, + 'search': SearchNavbarItem, + 'dropdown': DropdownNavbarItem, + 'html': HtmlNavbarItem, + 'custom-api': ApiNavbarItem, + 'doc': DocNavbarItem, + 'docSidebar': DocSidebarNavbarItem, + 'docsVersion': DocsVersionNavbarItem, + 'docsVersionDropdown': DocsVersionDropdownNavbarItem, +}; +export default ComponentTypes; diff --git a/apify-docs-theme/src/theme/NavbarItem/NavbarNavLink.jsx b/apify-docs-theme/src/theme/NavbarItem/NavbarNavLink.jsx new file mode 100644 index 0000000000..d759280b1d --- /dev/null +++ b/apify-docs-theme/src/theme/NavbarItem/NavbarNavLink.jsx @@ -0,0 +1,92 @@ +import React from 'react'; +import Link from '@docusaurus/Link'; +import useBaseUrl from '@docusaurus/useBaseUrl'; +import isInternalUrl_ from '@docusaurus/isInternalUrl'; +import IconExternalLink from '@theme/Icon/ExternalLink'; +import { useLocation } from '@docusaurus/router'; +import { isRegexpStringMatch, useThemeConfig } from '@docusaurus/theme-common'; +import { usePluginData } from '@docusaurus/useGlobalData'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; + +export default function NavbarNavLink({ + activeBasePath, + activeBaseRegex, + to, + href, + label, + html, + isDropdownLink, + 
prependBaseUrlToHref, + ...props +}) { + const { navbar: { items = [] } } = useThemeConfig(); + const { options: { subNavbar } } = usePluginData('@apify/docs-theme'); + const allItems = [...items, ...(subNavbar?.items || [])]; + const location = useLocation(); + // TODO all this seems hacky + // {to: 'version'} should probably be forbidden, in favor of {to: '/version'} + const toUrl = useBaseUrl(to); + const activeBaseUrl = useBaseUrl(activeBasePath); + const normalizedHref = useBaseUrl(href, { forcePrependBaseUrl: true }); + const { siteConfig } = useDocusaurusContext(); + const isInternalUrl = (url) => { + if (url.startsWith(siteConfig.url)) { + return true; + } + return isInternalUrl_(url); + }; + + const isExternalLink = label && href && !isInternalUrl(href); + // Link content is set through html XOR label + const linkContentProps = html + ? { dangerouslySetInnerHTML: { __html: html } } + : { + children: ( + <> + {label} + {isExternalLink && ( + + )} + + ), + }; + + // If the item is a dropdown, look for any of its children that match the current path + const dropDownHasActiveItem = location.pathname !== '/' && allItems + .filter((item) => item.type === 'dropdown') + .filter((item) => item.label === label) + .reduce((nestedItems, item) => [...nestedItems, ...item.items], []) + .some((item) => (item.to || item.href).endsWith(location.pathname)); + + if (href) { + return ( + + ); + } + + return ( + (activeBaseRegex + ? 
isRegexpStringMatch(activeBaseRegex, location.pathname) || dropDownHasActiveItem + : location.pathname.startsWith(activeBaseUrl)), + })} + {...props} + {...linkContentProps} + /> + ); +} diff --git a/apify-docs-theme/src/theme/NotFound.jsx b/apify-docs-theme/src/theme/NotFound.jsx new file mode 100644 index 0000000000..7139852582 --- /dev/null +++ b/apify-docs-theme/src/theme/NotFound.jsx @@ -0,0 +1,25 @@ +import React from 'react'; +import { PageMetadata } from '@docusaurus/theme-common'; +import Layout from '@theme/Layout'; + +export default function NotFound() { + return ( + <> + + +
    +
    +
    +

    + Page Not Found +

    +

    + We could not find what you were looking for 😢 +

    +
    +
    +
    +
    + + ); +} diff --git a/apify-docs-theme/src/theme/custom.css b/apify-docs-theme/src/theme/custom.css new file mode 100644 index 0000000000..451cb35e06 --- /dev/null +++ b/apify-docs-theme/src/theme/custom.css @@ -0,0 +1,703 @@ +@import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;600;700&display=swap'); + +html[data-theme='dark'] { + --ifm-navbar-background-color: #1a1b23; + --ifm-background-color: #1a1b23; + --ifm-background-surface-color: #242736; + + --ifm-font-color-base: #f2f3fb; + + --ifm-pre-background: #242736; + + --ifm-color-primary: #5d9df1; + --ifm-link-color: #5d9df1; + --ifm-heading-color: #f2f3fb; + --ifm-navbar-link-color: #f2f3fb; + + /* TODO set this conditionally to 123px when there is second level nav */ + --ifm-navbar-height: 68px; + + --docusaurus-highlighted-code-line-bg: rgba(255, 255, 255, 0.1); + --docsearch-text-color: #8d92af; + + /* TRON colors */ + --color-Neutral_Text: #F3F4FA; + --color-Neutral_TextMuted: #b0b8d1; + --color-Neutral_Border: #d1d5e4; + --color-Neutral_Hover: #2a2d39; + --color-Neutral_Background: #1A1B21; + --color-Neutral_BackgroundMuted: #252832; + --color-Neutral_ChipBackground: #555d76; + --color-Neutral_ChipBackgroundActive: #8C93A8; + --color-Neutral_SeparatorSubtle: #31384d; + --color-Primary_ChipText: #8ebcff; + --color-Primary_ChipBackground: #1a3a78; + --color-Primary_TextInteractive: #6f9dff; +} + +:root { + /* use default system font based on https://devhints.io/css-system-font-stack */ + --ifm-font-family-base: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif; + --ifm-heading-font-family: 'Lota Grotesque', sans-serif; + --ifm-font-weight-semibold: 600; + --ifm-font-color-base: #242736; + + --ifm-navbar-item-padding-horizontal: 1.75rem; + --ifm-navbar-link-color: #41465d; + --ifm-navbar-shadow: none; + + --ifm-heading-margin-top: var(--ifm-heading-margin-bottom); + 
--ifm-hero-background-color: transparent; + + --ifm-code-background: var(--ifm-pre-background) !important; + --ifm-code-padding-horizontal: 0.4rem; + --ifm-code-padding-vertical: 0.2rem; + + --ifm-color-primary-lightest: #5d9df1; + --ifm-color-primary-lighter: #3a87ee; + --ifm-color-primary-light: #2e80ed; + --ifm-color-primary: #1672eb; + --ifm-color-primary-dark: #1266d5; + --ifm-color-primary-darker: #1161c9; + --ifm-color-primary-darkest: #0e50a6; + + --ifm-link-color: hsl(214, 84%, 50%); + --ifm-link-hover-color: hsl(214, 84%, 65%); + --ifm-link-hover-decoration: none; + + --ifm-footer-background-color: #272c3d; + --ifm-footer-title-color: #f2f3fb; + --ifm-footer-link-color: #8d92af; + --max-layout-width: 1680px; + + --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); + + --ifm-heading-color: #242736; + + /* TRON colors */ + --color-Neutral_Text: #242836; + --color-Neutral_TextMuted: #3f475d; + --color-Neutral_Border: #d0d5e9; + --color-Neutral_Hover: #f3f4fa; + --color-Neutral_Background: #ffffff; + --color-Neutral_BackgroundMuted: #f8f9fc; + --color-Neutral_ChipBackground: #e0e3f2; + --color-Neutral_ChipBackgroundActive: #C1C6DD; + --color-Neutral_SeparatorSubtle: #e0e3f2; + --color-Primary_ChipText: #1A57DA; + --color-Primary_ChipBackground: #E1EAFF; + --color-Primary_TextInteractive: #3970d7; +} + +@font-face { + font-family: 'Lota Grotesque'; + src: url('/font/lota.woff2') format('woff2'), + url('/font/lota.woff') format('woff'); + font-weight: 600; +} + +.footer__title { + font-size: 1.25rem; + font-weight: 600; +} + +.footer__bottom a { + opacity: 0.75; +} + +.footer__copyright { + color: var(--ifm-footer-title-color); +} + +footer .col { + margin-bottom: 2rem; +} + +.navbar__title { + /* Replaced by SVG */ + display: none; +} + +.navbar__inner { + /* .container */ + padding: 10px var(--ifm-spacing-horizontal); + width: 100%; + background: var(--color-Neutral_Background); +} + +.navbar__container { + max-width: calc(var(--max-layout-width) - 
32px); + display: flex; + margin: 0 auto; + width: 100%; +} + +.navbar__item.dropdown { + padding: 0; + display: none; +} + +.DocSearch-Button-Placeholder { + font-size: 14px !important; +} + +html .DocSearch-Button { + border-radius: 6px !important; + font-weight: 400 !important; + background: #f9fafd; + border: 1px solid #c1c6dd; + + /* Annoying, but needed */ + /* https://stackoverflow.com/questions/26140050/why-is-font-family-not-inherited-in-button-tags-automatically/26140154 */ + font-family: inherit; +} + +html .DocSearch-Button .DocSearch-Search-Icon { + color: var(--docsearch-muted-color); +} + +html[data-theme="dark"] .DocSearch-Button { + background: none; + border: 1px solid var(--docsearch-muted-color); +} + +html[data-theme="dark"] .DocSearch-Button .DocSearch-Search-Icon { + color: var(--docsearch-muted-color); +} + +.DocSearch-Button:hover { + box-shadow: none !important; +} + +.navbar { + padding: 0; + /* height: fit-content; */ + height: auto; +} + +.navbar, .main-wrapper { + justify-content: center; +} + +.main-wrapper > div { + max-width: var(--max-layout-width); + margin: auto; +} + +aside > div > a { + padding-left: 0px; +} + +aside > div > a > b { + display: none; +} + +.dropdown > .navbar__link::after { + border-color: currentColor; + border-style: solid; + border-width: 0.1em 0.1em 0 0; + content: ''; + display: inline-block; + height: 0.4em; + left: 0.3em; + position: relative; + vertical-align: top; + width: 0.4em; + top: 7px; + transform: rotate(135deg); + transition: all ease-in 0.2s; + margin-right: 6px; +} + +.dropdown:hover .navbar__link::after { + transform: rotate(-45deg); + top: 10px; +} + +.navbar .icon { + font-size: 0; + padding: 4px; + line-height: 0; +} + +.navbar .icon::before { + content: ''; + display: block; + width: 24px; + height: 24px; + background-size: cover; +} + +.navbar .icon[href*=github]::before { + background-image: url('/img/github-brand.svg'); +} + +html[data-theme="dark"] .navbar 
.navbar__link[href*=github]:before { + background-image: url('/img/github-brand-dark.svg'); +} + +.navbar .icon[href*=discord]::before { + background-image: url('/img/discord-brand.svg'); +} + +html[data-theme="dark"] .navbar .navbar__link[href*=discord]:before { + background-image: url('/img/discord-brand-dark.svg'); +} + +.navbar .icon svg[class*=iconExternalLink], +aside .icon svg[class*=iconExternalLink] { + display: none; +} + +.navbar__items { + gap: 6px; +} + +.navbar__item, .menu__link, .navbar__link { + border-radius: 8px; + color: var(--color-Neutral_TextMuted); + padding: 4px 8px; + font-size: 14px; + line-height: 24.4px; + transition: all ease-in 0.2s; +} + +.navbar__link:hover, .navbar__link--active:hover { + color: unset; +} + +.navbar__sub { + display: none; + background-color: var(--color-Neutral_BackgroundMuted); + border: 1px solid var(--color-Neutral_SeparatorSubtle); +} + +.navbar__sub--title { + display: flex; + align-items: center; + width: 160px; + justify-content: flex-end; + margin-right: 40px; + position: relative; +} + +.navbar__sub--title::after { + content: ""; + height: 20px; + border-right: 1px solid var(--color-Neutral_SeparatorSubtle); + position: absolute; + right: -32px; +} + +header.hero div[class^=heroButtons] { + justify-content: inherit; +} + +.markdown blockquote { + --ifm-alert-background-color: var(--ifm-color-info-contrast-background); + --ifm-alert-background-color-highlight: rgba(84,199,236,.15); + --ifm-alert-foreground-color: var(--ifm-color-info-contrast-foreground); + --ifm-alert-border-color: var(--ifm-color-info-dark); + --ifm-code-background: var(--ifm-alert-background-color-highlight); + --ifm-link-color: var(--ifm-alert-foreground-color); + --ifm-link-hover-color: var(--ifm-alert-foreground-color); + --ifm-link-decoration: underline; + --ifm-tabs-color: var(--ifm-alert-foreground-color); + --ifm-tabs-color-active: var(--ifm-alert-foreground-color); + --ifm-tabs-color-active-border: var(--ifm-alert-border-color); 
+ background-color: var(--ifm-alert-background-color); + border: var(--ifm-alert-border-width) solid var(--ifm-alert-border-color); + border-left-width: var(--ifm-alert-border-left-width); + border-radius: var(--ifm-alert-border-radius); + box-shadow: var(--ifm-alert-shadow); + padding: var(--ifm-alert-padding-vertical) var(--ifm-alert-padding-horizontal); +} + +article .card h2 { + margin-top: 0; +} + +.tsd-kind-icon, +.menu__link, +.table-of-contents__link { + text-overflow: ellipsis; + display: inline-block !important; + width: 100%; + overflow: hidden; + white-space: nowrap; +} + +.menu__caret:before, +.menu__link--sublist:after { + float: right; +} + +.menu__caret { + transform: scale(.7); +} + +aside button[class*="collapseSidebarButton"] svg { + transform: scale(.7) rotate(180deg); +} + +.table-of-contents__link { + height: 20px; +} + +nav.navbar .dropdown__menu { + min-width: 6rem; +} + +.navbar__logo { + display: none; + width: 11rem; + height: 3rem; + /* padding: 10px 0px; */ +} + +.navbar-sidebar .navbar__logo { + display: initial; +} + +.navbar-sidebar .toggle_theme-src-theme-ColorModeToggle-styles-module { + display: none; +} + +.navbar-sidebar div[class*="toggle_apify-docs"] { + display: none; +} + +.navbar__link.subnav { + font-size: 0.8em; + padding: 5px; +} + +.main-wrapper a[class*='sidebarLogo'] img { + height: 3rem; + width: 11rem; + padding: 10px 0px; +} + +html.plugin-pages h2 { + font-size: 36px; + line-height: 48px; +} + +html.plugin-docs .theme-doc-markdown { + font-size: 18px; + line-height: 32px; +} + +.markdown .tsd-panel li { + margin-top: var(--ifm-list-item-margin); +} + +html.plugin-docs .theme-doc-markdown img { + display: block; + margin: 1em auto 2em; +} + +html.plugin-docs .theme-doc-markdown h1 { + font-weight: 600; + font-size: 48px; + line-height: 64px; + color: #000; +} + +html[data-theme='dark'].plugin-docs .theme-doc-markdown h1 { + color: #fff; +} + +html.plugin-typedoc-api .theme-doc-markdown h1 { + color: #000; +} + 
+html[data-theme='dark'].plugin-typedoc-api .theme-doc-markdown h1 { + color: #fff; +} + +html.plugin-docs .theme-doc-markdown h2 { + font-size: 36px; + line-height: 48px; +} + +html.plugin-docs .theme-doc-markdown h3 { + font-size: 28px; + line-height: 36px; + /*color: #242736;*/ +} + +.theme-doc-toc-desktop .table-of-contents { + font-size: 14px; + line-height: 20px; +} + +.navbar-sidebar .menu__link.icon { + display: none; +} + +.theme-doc-sidebar-menu .menu__link, +.theme-doc-toc-desktop .table-of-contents .toc-highlight { + height: auto; + background: none; +} + +.menu__list-item:not(:first-child) { + margin-top: 0; +} + +.theme-doc-sidebar-menu .menu__link:hover { + background: inherit; +} + +.theme-doc-sidebar-menu .menu__link { + font-weight: 400; +} + +.theme-doc-sidebar-menu .menu__link--active { + font-weight: 700; +} + +.theme-doc-sidebar-menu .menu__list-item-collapsible { + background: none; +} + +.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { + font-weight: 700; +} + +.navbar__link--active, +.theme-doc-sidebar-menu.menu__list .menu__link--active, +.theme-doc-sidebar-menu.menu__list .menu__list-item-collapsible--active + { + color: var(--color-Neutral_Text); + background: var(--color-Neutral_ChipBackgroundActive); +} + +.navbar__link:not(.navbar__link--active):hover { + background: var(--color-Neutral_Hover); +} + +html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link, +html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .toc-highlight { + color: #b3b8d2; +} + +html[data-theme='dark'] .theme-doc-sidebar-menu .menu__link--active, +html[data-theme='dark'] .theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { + color: #f2f3fb; +} + +.theme-doc-sidebar-menu .menu__link:hover, +.theme-doc-sidebar-menu .menu__link--active, +.theme-doc-toc-desktop .table-of-contents .table-of-contents__link:hover, +.theme-doc-toc-desktop .table-of-contents .table-of-contents__link--active { + color: 
#242736; +} + +.hero { + position: relative; +} + +.apiItemContainer .tsd-readme h1:first-child { + display: none; +} + +nav.navbar { + transition: transform var(--ifm-transition-slow) ease; +} + +nav.navbar[class*="navbarHidden"]{ + transform: translate3d(0, calc(-210%), 0); +} + +.navbar__items--right a.icon, +div[class*="colorModeToggle"] +{ + display: initial; +} + +div[class*="searchBox"] { + padding-left: 0; + position: unset; +} + +.menu__link.navbar__item { + padding: 4px 8px; +} + +.menu__link, .menu__list-item > .navbar__item { + display: flex; +} + +.theme-doc-sidebar-item-category .menu__list-item-collapsible, +.theme-doc-sidebar-item-link { + display: flex; + align-items: center; +} + +@media (min-width: 480px) { + .navbar__logo { + display: initial; + } +} + +@media (min-width: 997px) { + .navbar__sub { + display: block; + } + + .navbar__item.dropdown { + display: flex; + } +} + +@media (min-width: 1130px) { + .navbar__items { + gap: 20px; + } +} + +/* @media (min-width: 997px) and (max-width: 1250px) { + .navbar__items--right a.icon { + display: none; + } +} */ + +@media (min-width: 997px) and (max-width: 1130px) { + .navbar__link.changelog { + display: none; + } +} + +@media (min-width: 997px) and (max-width: 1439px) { + footer .col--offset-9 { + --ifm-col-width: calc(4 / 12 * 100%); + margin-left: calc(8 / 12 * 100%); + } +} + +html .theme-doc-sidebar-container { + border: 0; +} + +html .theme-doc-sidebar-container button { + border: 0; + border-radius: 10px; +} + +html .table-of-contents { + border-left: 0; +} + +html .table-of-contents ul { + border-left: 2px solid #dfe2f5; +} + +.actionLink { + font-weight: 700; + font-size: 20px; + line-height: 32px; + color: var(--color-Neutral_TextMuted); + border-bottom: 2px solid var(--color-Neutral_TextMuted); + display: inline-flex; + -webkit-box-align: center; + align-items: center; +} + +.actionLink:hover { + color: var(--color-Primary_TextInteractive); + border-bottom: 2px solid 
var(--color-Primary_TextInteractive); +} + +html[data-theme='dark'] .actionLink::after { + background-image: url('/img/arrow-right-light.svg'); +} + +html[data-theme='dark'] .actionLink:hover::after { + background-image: url('/img/arrow-right-primary-light.svg'); +} + +.actionLink::after { + content: " "; + display: block; + background-image: url('/img/arrow-right.svg'); + background-size: 15px 15px; + height: 15px; + width: 15px; + margin-left: 4px; + transition: margin 200ms ease-in-out; +} + +.actionLink:hover::after { + background-image: url('/img/arrow-right-primary.svg'); + margin-left: 8px; +} + +@media (min-width: 997px) and (max-width: 1660px) { + :root { + --ifm-toc-padding-vertical: 0.25rem; + --ifm-toc-padding-horizontal: 0.25rem; + } + + .navbar__item, .menu__link, .navbar__link { + padding: 2px 8px; + } + + .theme-doc-sidebar-menu .menu__link, + .theme-doc-toc-desktop .table-of-contents .toc-highlight { + font-size: .8rem; + } + + .theme-doc-toc-desktop { + margin-left: -15px; + } + + html.plugin-docs .theme-doc-markdown { + font-size: 16px; + line-height: 28px; + } +} + +aside li.section-header > div > .menu__link { + text-transform: uppercase; + opacity: 0.8; + font-size: 1em; + font-weight: 700; + margin: 0; +} + +aside li.section-header.menu__list-item { + margin-top: 15px; + margin-bottom: 5px; +} + +aside li.section-header.menu__list-item:nth-child(2) { + margin-top: 5px; +} + +aside li.section-header > .menu__list { + padding-left: 0; +} + +.beta-chip { + display: inline-block; + border: 1px solid #ccc; + border-radius: 20px; + content: 'beta'; + background: #ddd; + font-size: 80%; + line-height: 10px; + padding: 3px; + position: relative; + top: -1px; + margin-left: 5px; +} + +html[data-theme='dark'] .beta-chip { + background: #333; +} + +a.tsd-anchor[href^="https://undefined"] { + display: none; +} diff --git a/apify-docs-theme/static/font/lota.woff b/apify-docs-theme/static/font/lota.woff new file mode 100644 index 0000000000..dc3ee1d37c 
Binary files /dev/null and b/apify-docs-theme/static/font/lota.woff differ diff --git a/apify-docs-theme/static/font/lota.woff2 b/apify-docs-theme/static/font/lota.woff2 new file mode 100644 index 0000000000..7d026a4894 Binary files /dev/null and b/apify-docs-theme/static/font/lota.woff2 differ diff --git a/apify-docs-theme/static/img/apify_logo.svg b/apify-docs-theme/static/img/apify_logo.svg new file mode 100644 index 0000000000..759c49cd84 --- /dev/null +++ b/apify-docs-theme/static/img/apify_logo.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/apify-docs-theme/static/img/apify_sdk.svg b/apify-docs-theme/static/img/apify_sdk.svg new file mode 100644 index 0000000000..d5ff8486f1 --- /dev/null +++ b/apify-docs-theme/static/img/apify_sdk.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/apify-docs-theme/static/img/apify_sdk_white.svg b/apify-docs-theme/static/img/apify_sdk_white.svg new file mode 100644 index 0000000000..11165e664b --- /dev/null +++ b/apify-docs-theme/static/img/apify_sdk_white.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/apify-docs-theme/static/img/arrow-right-light.svg b/apify-docs-theme/static/img/arrow-right-light.svg new file mode 100644 index 0000000000..cd35dac87a --- /dev/null +++ b/apify-docs-theme/static/img/arrow-right-light.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/arrow-right-primary-light.svg b/apify-docs-theme/static/img/arrow-right-primary-light.svg new file mode 100644 index 0000000000..b5c9926004 --- /dev/null +++ b/apify-docs-theme/static/img/arrow-right-primary-light.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/arrow-right-primary.svg b/apify-docs-theme/static/img/arrow-right-primary.svg new file mode 100644 index 0000000000..43762de8a9 --- /dev/null +++ b/apify-docs-theme/static/img/arrow-right-primary.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/arrow-right.svg b/apify-docs-theme/static/img/arrow-right.svg new file mode 100644 index 0000000000..efd904d867 --- /dev/null 
+++ b/apify-docs-theme/static/img/arrow-right.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/discord-brand-dark.svg b/apify-docs-theme/static/img/discord-brand-dark.svg new file mode 100644 index 0000000000..8b97b4c5d8 --- /dev/null +++ b/apify-docs-theme/static/img/discord-brand-dark.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/discord-brand.svg b/apify-docs-theme/static/img/discord-brand.svg new file mode 100644 index 0000000000..58254473b9 --- /dev/null +++ b/apify-docs-theme/static/img/discord-brand.svg @@ -0,0 +1,3 @@ + + + diff --git a/apify-docs-theme/static/img/external-link.svg b/apify-docs-theme/static/img/external-link.svg new file mode 100644 index 0000000000..961990cdcd --- /dev/null +++ b/apify-docs-theme/static/img/external-link.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/apify-docs-theme/static/img/favicon.ico b/apify-docs-theme/static/img/favicon.ico new file mode 100644 index 0000000000..1062669a8f Binary files /dev/null and b/apify-docs-theme/static/img/favicon.ico differ diff --git a/apify-docs-theme/static/img/footer-apify-logo-black.svg b/apify-docs-theme/static/img/footer-apify-logo-black.svg new file mode 100644 index 0000000000..33320d1480 --- /dev/null +++ b/apify-docs-theme/static/img/footer-apify-logo-black.svg @@ -0,0 +1,7 @@ + + + + + + + diff --git a/apify-docs-theme/static/img/footer-apify-logo-white.svg b/apify-docs-theme/static/img/footer-apify-logo-white.svg new file mode 100644 index 0000000000..ec384d432c --- /dev/null +++ b/apify-docs-theme/static/img/footer-apify-logo-white.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/apify-docs-theme/static/img/footer-apify-logo.svg b/apify-docs-theme/static/img/footer-apify-logo.svg new file mode 100644 index 0000000000..ec384d432c --- /dev/null +++ b/apify-docs-theme/static/img/footer-apify-logo.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/apify-docs-theme/static/img/footer-open-source.svg 
b/apify-docs-theme/static/img/footer-open-source.svg new file mode 100644 index 0000000000..1cf9d6d8a4 --- /dev/null +++ b/apify-docs-theme/static/img/footer-open-source.svg @@ -0,0 +1,4 @@ + + + + diff --git a/apify-docs-theme/static/img/github-brand-dark.svg b/apify-docs-theme/static/img/github-brand-dark.svg new file mode 100644 index 0000000000..47d24ba97f --- /dev/null +++ b/apify-docs-theme/static/img/github-brand-dark.svg @@ -0,0 +1,3 @@ + diff --git a/apify-docs-theme/static/img/github-brand.svg b/apify-docs-theme/static/img/github-brand.svg new file mode 100644 index 0000000000..9c7d363492 --- /dev/null +++ b/apify-docs-theme/static/img/github-brand.svg @@ -0,0 +1,3 @@ + diff --git a/apify-docs-theme/static/img/logo-blur.png b/apify-docs-theme/static/img/logo-blur.png new file mode 100644 index 0000000000..6f80c49b8c Binary files /dev/null and b/apify-docs-theme/static/img/logo-blur.png differ diff --git a/apify-docs-theme/static/js/custom.js b/apify-docs-theme/static/js/custom.js new file mode 100644 index 0000000000..e9a7ece756 --- /dev/null +++ b/apify-docs-theme/static/js/custom.js @@ -0,0 +1,34 @@ +// function load() { +// const versions = document.querySelectorAll('.navbar .dropdown ul a'); +// const basePath = ''; +// const types = [`${basePath}/docs/next`, `${basePath}/docs`]; +// let i = 0; +// +// for (const el of versions) { +// const match = el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)$/) || el.href.match(/\/docs\/(\d+\.\d+(\.\d+)?)/); +// +// if (!types[i++] && !match) { +// continue; +// } +// +// const version = (types[i++] || match[0]).replace('/docs', '/api'); +// +// if (el.classList.contains('api-version-bound')) { +// continue; +// } +// +// el.addEventListener('click', (e) => { +// if (version && window.location.pathname.startsWith(`${basePath}/api`)) { +// window.location.href = version; +// e.preventDefault(); +// } +// }); +// el.classList.add('api-version-bound'); +// } +// } +// +// setInterval(() => { +// if 
(document.querySelectorAll('.navbar .dropdown ul a').length > 0) { +// load(); +// } +// }, 500); diff --git a/babel.config.js b/babel.config.js new file mode 100644 index 0000000000..0adade1fb9 --- /dev/null +++ b/babel.config.js @@ -0,0 +1,3 @@ +module.exports = { + presets: [require.resolve('@docusaurus/core/lib/babel/preset')], +}; diff --git a/content/academy/advanced_web_scraping/images/pagination-filters.webp b/content/academy/advanced_web_scraping/images/pagination-filters.webp deleted file mode 100644 index 97b3c54daf..0000000000 Binary files a/content/academy/advanced_web_scraping/images/pagination-filters.webp and /dev/null differ diff --git a/content/academy/advanced_web_scraping/images/pagination.webp b/content/academy/advanced_web_scraping/images/pagination.webp deleted file mode 100644 index 03bedcea12..0000000000 Binary files a/content/academy/advanced_web_scraping/images/pagination.webp and /dev/null differ diff --git a/content/academy/anti_scraping/mitigation.md b/content/academy/anti_scraping/mitigation.md deleted file mode 100644 index 3c9f3c8212..0000000000 --- a/content/academy/anti_scraping/mitigation.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Mitigation -description: After learning about the various different anti-scraping techniques websites use, learn how to mitigate them with a few different techniques. -menuWeight: 3.2 -paths: -- anti-scraping/mitigation ---- - -# [](#anti-scraping-mitigation) Anti-scraping mitigation - -In the [techniques]({{@link anti_scraping/techniques.md}}) section of this course, you learned about multiple methods websites use to prevent bots from accessing their content. This **Mitigation** section will be all about how to circumvent these protections using various different techniques. - - - -## [](#next) Next up - -In the [first lesson]({{@link anti_scraping/mitigation/proxies.md}}) of this section, you'll be learning about what proxies are and how to use them in your own crawler. 
diff --git a/content/academy/anti_scraping/mitigation/images/proxy-info-logs.webp b/content/academy/anti_scraping/mitigation/images/proxy-info-logs.webp deleted file mode 100644 index c67cbbd265..0000000000 Binary files a/content/academy/anti_scraping/mitigation/images/proxy-info-logs.webp and /dev/null differ diff --git a/content/academy/anti_scraping/techniques/images/canvas-differences.webp b/content/academy/anti_scraping/techniques/images/canvas-differences.webp deleted file mode 100644 index 543a73baf4..0000000000 Binary files a/content/academy/anti_scraping/techniques/images/canvas-differences.webp and /dev/null differ diff --git a/content/academy/anti_scraping/techniques/images/cloudflare-graphic.webp b/content/academy/anti_scraping/techniques/images/cloudflare-graphic.webp deleted file mode 100644 index 3f067a1ed9..0000000000 Binary files a/content/academy/anti_scraping/techniques/images/cloudflare-graphic.webp and /dev/null differ diff --git a/content/academy/anti_scraping/techniques/images/cloudflare.webp b/content/academy/anti_scraping/techniques/images/cloudflare.webp deleted file mode 100644 index ecd31bbae6..0000000000 Binary files a/content/academy/anti_scraping/techniques/images/cloudflare.webp and /dev/null differ diff --git a/content/academy/anti_scraping/techniques/images/dont-fingerprint-me.webp b/content/academy/anti_scraping/techniques/images/dont-fingerprint-me.webp deleted file mode 100644 index f558a61634..0000000000 Binary files a/content/academy/anti_scraping/techniques/images/dont-fingerprint-me.webp and /dev/null differ diff --git a/content/academy/anti_scraping/techniques/images/fingerprinted-headers.webp b/content/academy/anti_scraping/techniques/images/fingerprinted-headers.webp deleted file mode 100644 index df82325fa5..0000000000 Binary files a/content/academy/anti_scraping/techniques/images/fingerprinted-headers.webp and /dev/null differ diff --git a/content/academy/api_scraping/general_api_scraping/images/analyzing-the-url.webp 
b/content/academy/api_scraping/general_api_scraping/images/analyzing-the-url.webp deleted file mode 100644 index 5e7b22cb47..0000000000 Binary files a/content/academy/api_scraping/general_api_scraping/images/analyzing-the-url.webp and /dev/null differ diff --git a/content/academy/api_scraping/general_api_scraping/images/endpoint-found.webp b/content/academy/api_scraping/general_api_scraping/images/endpoint-found.webp deleted file mode 100644 index f1f2db6bd2..0000000000 Binary files a/content/academy/api_scraping/general_api_scraping/images/endpoint-found.webp and /dev/null differ diff --git a/content/academy/api_scraping/general_api_scraping/images/results-in-network-tab.webp b/content/academy/api_scraping/general_api_scraping/images/results-in-network-tab.webp deleted file mode 100644 index 7cebd42986..0000000000 Binary files a/content/academy/api_scraping/general_api_scraping/images/results-in-network-tab.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/cheddar-headers.webp b/content/academy/api_scraping/graphql_scraping/images/cheddar-headers.webp deleted file mode 100644 index 83c33c292b..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/cheddar-headers.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/edges-suggested.webp b/content/academy/api_scraping/graphql_scraping/images/edges-suggested.webp deleted file mode 100644 index d3ebf87bc9..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/edges-suggested.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/introspection-disabled.webp b/content/academy/api_scraping/graphql_scraping/images/introspection-disabled.webp deleted file mode 100644 index 727a4cff36..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/introspection-disabled.webp and /dev/null differ diff --git 
a/content/academy/api_scraping/graphql_scraping/images/introspection.webp b/content/academy/api_scraping/graphql_scraping/images/introspection.webp deleted file mode 100644 index 2923a4c882..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/introspection.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/media-field.webp b/content/academy/api_scraping/graphql_scraping/images/media-field.webp deleted file mode 100644 index 263f2aecea..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/media-field.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/pasting-introspection.webp b/content/academy/api_scraping/graphql_scraping/images/pasting-introspection.webp deleted file mode 100644 index de6265500f..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/pasting-introspection.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/successful-request.webp b/content/academy/api_scraping/graphql_scraping/images/successful-request.webp deleted file mode 100644 index 545e9718f7..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/successful-request.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/unauthorized.webp b/content/academy/api_scraping/graphql_scraping/images/unauthorized.webp deleted file mode 100644 index d81fa87838..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/unauthorized.webp and /dev/null differ diff --git a/content/academy/api_scraping/graphql_scraping/images/voyager-interface.webp b/content/academy/api_scraping/graphql_scraping/images/voyager-interface.webp deleted file mode 100644 index 1f5178006a..0000000000 Binary files a/content/academy/api_scraping/graphql_scraping/images/voyager-interface.webp and /dev/null differ diff --git 
a/content/academy/api_scraping/images/graphql-endpoint.webp b/content/academy/api_scraping/images/graphql-endpoint.webp deleted file mode 100644 index 719e60979e..0000000000 Binary files a/content/academy/api_scraping/images/graphql-endpoint.webp and /dev/null differ diff --git a/content/academy/api_scraping/images/graphql-payload.webp b/content/academy/api_scraping/images/graphql-payload.webp deleted file mode 100644 index 31ebc5f89a..0000000000 Binary files a/content/academy/api_scraping/images/graphql-payload.webp and /dev/null differ diff --git a/content/academy/api_scraping/images/stringified-syntax.webp b/content/academy/api_scraping/images/stringified-syntax.webp deleted file mode 100644 index 833dccd900..0000000000 Binary files a/content/academy/api_scraping/images/stringified-syntax.webp and /dev/null differ diff --git a/content/academy/deploying_your_code/images/actor-page.webp b/content/academy/deploying_your_code/images/actor-page.webp deleted file mode 100644 index e452643816..0000000000 Binary files a/content/academy/deploying_your_code/images/actor-page.webp and /dev/null differ diff --git a/content/academy/deploying_your_code/images/filepath.webp b/content/academy/deploying_your_code/images/filepath.webp deleted file mode 100644 index b18e6e1621..0000000000 Binary files a/content/academy/deploying_your_code/images/filepath.webp and /dev/null differ diff --git a/content/academy/deploying_your_code/images/filestructure.webp b/content/academy/deploying_your_code/images/filestructure.webp deleted file mode 100644 index 788f58d815..0000000000 Binary files a/content/academy/deploying_your_code/images/filestructure.webp and /dev/null differ diff --git a/content/academy/deploying_your_code/images/output-schema-example.webp b/content/academy/deploying_your_code/images/output-schema-example.webp deleted file mode 100644 index 016f4f06b8..0000000000 Binary files a/content/academy/deploying_your_code/images/output-schema-example.webp and /dev/null differ diff 
--git a/content/academy/deploying_your_code/images/rendered-ui.webp b/content/academy/deploying_your_code/images/rendered-ui.webp deleted file mode 100644 index f040a884e1..0000000000 Binary files a/content/academy/deploying_your_code/images/rendered-ui.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/images/api-button.webp b/content/academy/expert_scraping_with_apify/images/api-button.webp deleted file mode 100644 index 1393ed1162..0000000000 Binary files a/content/academy/expert_scraping_with_apify/images/api-button.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/images/github-integration.webp b/content/academy/expert_scraping_with_apify/images/github-integration.webp deleted file mode 100644 index b2d4d3077f..0000000000 Binary files a/content/academy/expert_scraping_with_apify/images/github-integration.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/images/github-new-repo.webp b/content/academy/expert_scraping_with_apify/images/github-new-repo.webp deleted file mode 100644 index 790d144a8c..0000000000 Binary files a/content/academy/expert_scraping_with_apify/images/github-new-repo.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/images/product-offers.webp b/content/academy/expert_scraping_with_apify/images/product-offers.webp deleted file mode 100644 index ffe4f7d782..0000000000 Binary files a/content/academy/expert_scraping_with_apify/images/product-offers.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/images/select-source-location.webp b/content/academy/expert_scraping_with_apify/images/select-source-location.webp deleted file mode 100644 index 4646a53221..0000000000 Binary files a/content/academy/expert_scraping_with_apify/images/select-source-location.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/actor-settings.webp 
b/content/academy/expert_scraping_with_apify/solutions/images/actor-settings.webp deleted file mode 100644 index a5394d4c53..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/actor-settings.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/adding-webhook.webp b/content/academy/expert_scraping_with_apify/solutions/images/adding-webhook.webp deleted file mode 100644 index 511f33beb3..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/adding-webhook.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/bad-schema.webp b/content/academy/expert_scraping_with_apify/solutions/images/bad-schema.webp deleted file mode 100644 index 105e871327..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/bad-schema.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/create-new-task.webp b/content/academy/expert_scraping_with_apify/solutions/images/create-new-task.webp deleted file mode 100644 index c167c47a16..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/create-new-task.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/creating-task.webp b/content/academy/expert_scraping_with_apify/solutions/images/creating-task.webp deleted file mode 100644 index bfe97f9124..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/creating-task.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/dataset-preview.webp b/content/academy/expert_scraping_with_apify/solutions/images/dataset-preview.webp deleted file mode 100644 index 4250c94465..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/dataset-preview.webp and /dev/null differ diff --git 
a/content/academy/expert_scraping_with_apify/solutions/images/result-items.webp b/content/academy/expert_scraping_with_apify/solutions/images/result-items.webp deleted file mode 100644 index d23134320e..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/result-items.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/select-empty.webp b/content/academy/expert_scraping_with_apify/solutions/images/select-empty.webp deleted file mode 100644 index 14842ae651..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/select-empty.webp and /dev/null differ diff --git a/content/academy/expert_scraping_with_apify/solutions/images/webhook-succeeded.webp b/content/academy/expert_scraping_with_apify/solutions/images/webhook-succeeded.webp deleted file mode 100644 index 0d04af537b..0000000000 Binary files a/content/academy/expert_scraping_with_apify/solutions/images/webhook-succeeded.webp and /dev/null differ diff --git a/content/academy/get_most_of_actors/images/Apify-Billing.webp b/content/academy/get_most_of_actors/images/Apify-Billing.webp deleted file mode 100644 index be84b6083f..0000000000 Binary files a/content/academy/get_most_of_actors/images/Apify-Billing.webp and /dev/null differ diff --git a/content/academy/get_most_of_actors/images/actors-publishing-google.webp b/content/academy/get_most_of_actors/images/actors-publishing-google.webp deleted file mode 100644 index 9bfda31ab2..0000000000 Binary files a/content/academy/get_most_of_actors/images/actors-publishing-google.webp and /dev/null differ diff --git a/content/academy/get_most_of_actors/images/billing-details.webp b/content/academy/get_most_of_actors/images/billing-details.webp deleted file mode 100644 index 8e318c9920..0000000000 Binary files a/content/academy/get_most_of_actors/images/billing-details.webp and /dev/null differ diff --git a/content/academy/getting_started/images/actor-logs.webp 
b/content/academy/getting_started/images/actor-logs.webp deleted file mode 100644 index b7d40c0bed..0000000000 Binary files a/content/academy/getting_started/images/actor-logs.webp and /dev/null differ diff --git a/content/academy/getting_started/images/actor-settings-id.webp b/content/academy/getting_started/images/actor-settings-id.webp deleted file mode 100644 index 708cec7c09..0000000000 Binary files a/content/academy/getting_started/images/actor-settings-id.webp and /dev/null differ diff --git a/content/academy/getting_started/images/actors-tab.webp b/content/academy/getting_started/images/actors-tab.webp deleted file mode 100644 index d10589a961..0000000000 Binary files a/content/academy/getting_started/images/actors-tab.webp and /dev/null differ diff --git a/content/academy/getting_started/images/api-csv-response.webp b/content/academy/getting_started/images/api-csv-response.webp deleted file mode 100644 index 25e61f6d65..0000000000 Binary files a/content/academy/getting_started/images/api-csv-response.webp and /dev/null differ diff --git a/content/academy/getting_started/images/api-error.webp b/content/academy/getting_started/images/api-error.webp deleted file mode 100644 index e3e7696849..0000000000 Binary files a/content/academy/getting_started/images/api-error.webp and /dev/null differ diff --git a/content/academy/getting_started/images/api-tab.webp b/content/academy/getting_started/images/api-tab.webp deleted file mode 100644 index 569afe442e..0000000000 Binary files a/content/academy/getting_started/images/api-tab.webp and /dev/null differ diff --git a/content/academy/getting_started/images/build-actor.webp b/content/academy/getting_started/images/build-actor.webp deleted file mode 100644 index 88371731f7..0000000000 Binary files a/content/academy/getting_started/images/build-actor.webp and /dev/null differ diff --git a/content/academy/getting_started/images/choose-template.webp b/content/academy/getting_started/images/choose-template.webp deleted file 
mode 100644 index dc7fe1fe60..0000000000 Binary files a/content/academy/getting_started/images/choose-template.webp and /dev/null differ diff --git a/content/academy/getting_started/images/configure-inputs.webp b/content/academy/getting_started/images/configure-inputs.webp deleted file mode 100644 index 0d50e2f438..0000000000 Binary files a/content/academy/getting_started/images/configure-inputs.webp and /dev/null differ diff --git a/content/academy/getting_started/images/create-new-actor.webp b/content/academy/getting_started/images/create-new-actor.webp deleted file mode 100644 index 3bb4466acb..0000000000 Binary files a/content/academy/getting_started/images/create-new-actor.webp and /dev/null differ diff --git a/content/academy/getting_started/images/dataset-preview.webp b/content/academy/getting_started/images/dataset-preview.webp deleted file mode 100644 index 9bdab723b5..0000000000 Binary files a/content/academy/getting_started/images/dataset-preview.webp and /dev/null differ diff --git a/content/academy/getting_started/images/hello-world-run.webp b/content/academy/getting_started/images/hello-world-run.webp deleted file mode 100644 index 64289de9da..0000000000 Binary files a/content/academy/getting_started/images/hello-world-run.webp and /dev/null differ diff --git a/content/academy/getting_started/images/multifile-editor.webp b/content/academy/getting_started/images/multifile-editor.webp deleted file mode 100644 index bca06e8877..0000000000 Binary files a/content/academy/getting_started/images/multifile-editor.webp and /dev/null differ diff --git a/content/academy/getting_started/images/new-defaults.webp b/content/academy/getting_started/images/new-defaults.webp deleted file mode 100644 index 3bb413a4c3..0000000000 Binary files a/content/academy/getting_started/images/new-defaults.webp and /dev/null differ diff --git a/content/academy/getting_started/images/provide-input.webp b/content/academy/getting_started/images/provide-input.webp deleted file mode 
100644 index 6d899cc3b0..0000000000 Binary files a/content/academy/getting_started/images/provide-input.webp and /dev/null differ diff --git a/content/academy/getting_started/images/result-box.webp b/content/academy/getting_started/images/result-box.webp deleted file mode 100644 index f3f9d05cd1..0000000000 Binary files a/content/academy/getting_started/images/result-box.webp and /dev/null differ diff --git a/content/academy/getting_started/images/run-info.webp b/content/academy/getting_started/images/run-info.webp deleted file mode 100644 index 342a6293cf..0000000000 Binary files a/content/academy/getting_started/images/run-info.webp and /dev/null differ diff --git a/content/academy/getting_started/images/seo-actor-config.webp b/content/academy/getting_started/images/seo-actor-config.webp deleted file mode 100644 index 1c4e7669d0..0000000000 Binary files a/content/academy/getting_started/images/seo-actor-config.webp and /dev/null differ diff --git a/content/academy/getting_started/images/start.webp b/content/academy/getting_started/images/start.webp deleted file mode 100644 index 31a6c049f7..0000000000 Binary files a/content/academy/getting_started/images/start.webp and /dev/null differ diff --git a/content/academy/images/deployment-workflow.webp b/content/academy/images/deployment-workflow.webp deleted file mode 100644 index 65253fa02b..0000000000 Binary files a/content/academy/images/deployment-workflow.webp and /dev/null differ diff --git a/content/academy/images/supported-languages.webp b/content/academy/images/supported-languages.webp deleted file mode 100644 index 599d342702..0000000000 Binary files a/content/academy/images/supported-languages.webp and /dev/null differ diff --git a/content/academy/node_js/images/actor-load.webp b/content/academy/node_js/images/actor-load.webp deleted file mode 100644 index 3408a8ee42..0000000000 Binary files a/content/academy/node_js/images/actor-load.webp and /dev/null differ diff --git 
a/content/academy/node_js/images/almost-there.webp b/content/academy/node_js/images/almost-there.webp deleted file mode 100644 index 9bc68f53fe..0000000000 Binary files a/content/academy/node_js/images/almost-there.webp and /dev/null differ diff --git a/content/academy/node_js/images/anyone-with-link.webp b/content/academy/node_js/images/anyone-with-link.webp deleted file mode 100644 index 47c5052086..0000000000 Binary files a/content/academy/node_js/images/anyone-with-link.webp and /dev/null differ diff --git a/content/academy/node_js/images/bad-results.webp b/content/academy/node_js/images/bad-results.webp deleted file mode 100644 index 31e97ec5b4..0000000000 Binary files a/content/academy/node_js/images/bad-results.webp and /dev/null differ diff --git a/content/academy/node_js/images/bad-scraper-stats.webp b/content/academy/node_js/images/bad-scraper-stats.webp deleted file mode 100644 index 46a77aeb9e..0000000000 Binary files a/content/academy/node_js/images/bad-scraper-stats.webp and /dev/null differ diff --git a/content/academy/node_js/images/cnn-network-tab.webp b/content/academy/node_js/images/cnn-network-tab.webp deleted file mode 100644 index 757bcc0e71..0000000000 Binary files a/content/academy/node_js/images/cnn-network-tab.webp and /dev/null differ diff --git a/content/academy/node_js/images/fast-with-cache.webp b/content/academy/node_js/images/fast-with-cache.webp deleted file mode 100644 index c877e34e0d..0000000000 Binary files a/content/academy/node_js/images/fast-with-cache.webp and /dev/null differ diff --git a/content/academy/node_js/images/good-run-results.webp b/content/academy/node_js/images/good-run-results.webp deleted file mode 100644 index f561f27c14..0000000000 Binary files a/content/academy/node_js/images/good-run-results.webp and /dev/null differ diff --git a/content/academy/node_js/images/gserp-api.webp b/content/academy/node_js/images/gserp-api.webp deleted file mode 100644 index 4fd001ff19..0000000000 Binary files 
a/content/academy/node_js/images/gserp-api.webp and /dev/null differ diff --git a/content/academy/node_js/images/gsheets-url.webp b/content/academy/node_js/images/gsheets-url.webp deleted file mode 100644 index efeec1ea7f..0000000000 Binary files a/content/academy/node_js/images/gsheets-url.webp and /dev/null differ diff --git a/content/academy/node_js/images/improved-actor-loading.webp b/content/academy/node_js/images/improved-actor-loading.webp deleted file mode 100644 index fdbea41a63..0000000000 Binary files a/content/academy/node_js/images/improved-actor-loading.webp and /dev/null differ diff --git a/content/academy/node_js/images/kv-store-puppeteer.webp b/content/academy/node_js/images/kv-store-puppeteer.webp deleted file mode 100644 index 0349faafaa..0000000000 Binary files a/content/academy/node_js/images/kv-store-puppeteer.webp and /dev/null differ diff --git a/content/academy/node_js/images/new-arrivals.webp b/content/academy/node_js/images/new-arrivals.webp deleted file mode 100644 index 5e9e65ba3c..0000000000 Binary files a/content/academy/node_js/images/new-arrivals.webp and /dev/null differ diff --git a/content/academy/node_js/images/shadow.webp b/content/academy/node_js/images/shadow.webp deleted file mode 100644 index 09254e0d3e..0000000000 Binary files a/content/academy/node_js/images/shadow.webp and /dev/null differ diff --git a/content/academy/node_js/images/slow-no-cache.webp b/content/academy/node_js/images/slow-no-cache.webp deleted file mode 100644 index 6cb2bf3441..0000000000 Binary files a/content/academy/node_js/images/slow-no-cache.webp and /dev/null differ diff --git a/content/academy/node_js/images/view-845.webp b/content/academy/node_js/images/view-845.webp deleted file mode 100644 index 22a04fbc24..0000000000 Binary files a/content/academy/node_js/images/view-845.webp and /dev/null differ diff --git a/content/academy/node_js/images/view-object-in-window.webp b/content/academy/node_js/images/view-object-in-window.webp deleted file mode 
100644 index 4373c0d936..0000000000 Binary files a/content/academy/node_js/images/view-object-in-window.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/common_use_cases/images/github-last-page.webp b/content/academy/puppeteer_playwright/common_use_cases/images/github-last-page.webp deleted file mode 100644 index 6b8771cc51..0000000000 Binary files a/content/academy/puppeteer_playwright/common_use_cases/images/github-last-page.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/common_use_cases/images/logged-in.webp b/content/academy/puppeteer_playwright/common_use_cases/images/logged-in.webp deleted file mode 100644 index ec8795c6f8..0000000000 Binary files a/content/academy/puppeteer_playwright/common_use_cases/images/logged-in.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/executing_scripts/images/log-products.webp b/content/academy/puppeteer_playwright/executing_scripts/images/log-products.webp deleted file mode 100644 index 9b151141e5..0000000000 Binary files a/content/academy/puppeteer_playwright/executing_scripts/images/log-products.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/api-response-tiesto.webp b/content/academy/puppeteer_playwright/images/api-response-tiesto.webp deleted file mode 100644 index 0b312f4635..0000000000 Binary files a/content/academy/puppeteer_playwright/images/api-response-tiesto.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/chromium.webp b/content/academy/puppeteer_playwright/images/chromium.webp deleted file mode 100644 index 84a28eddd9..0000000000 Binary files a/content/academy/puppeteer_playwright/images/chromium.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/context-diagram.webp b/content/academy/puppeteer_playwright/images/context-diagram.webp deleted file mode 100644 index ee1088b34d..0000000000 Binary files 
a/content/academy/puppeteer_playwright/images/context-diagram.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/dual-contexts.webp b/content/academy/puppeteer_playwright/images/dual-contexts.webp deleted file mode 100644 index 36fbe46785..0000000000 Binary files a/content/academy/puppeteer_playwright/images/dual-contexts.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/green-google.webp b/content/academy/puppeteer_playwright/images/green-google.webp deleted file mode 100644 index 52a12b44e0..0000000000 Binary files a/content/academy/puppeteer_playwright/images/green-google.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/mesto-following.webp b/content/academy/puppeteer_playwright/images/mesto-following.webp deleted file mode 100644 index dac744e6fd..0000000000 Binary files a/content/academy/puppeteer_playwright/images/mesto-following.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/tiesto-request.webp b/content/academy/puppeteer_playwright/images/tiesto-request.webp deleted file mode 100644 index 3407dcd31f..0000000000 Binary files a/content/academy/puppeteer_playwright/images/tiesto-request.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/title-changed.webp b/content/academy/puppeteer_playwright/images/title-changed.webp deleted file mode 100644 index 8a45826f4e..0000000000 Binary files a/content/academy/puppeteer_playwright/images/title-changed.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/ugly-soundcloud.webp b/content/academy/puppeteer_playwright/images/ugly-soundcloud.webp deleted file mode 100644 index 285bc0e9fe..0000000000 Binary files a/content/academy/puppeteer_playwright/images/ugly-soundcloud.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/images/vietnamese-google.webp 
b/content/academy/puppeteer_playwright/images/vietnamese-google.webp deleted file mode 100644 index be91415854..0000000000 Binary files a/content/academy/puppeteer_playwright/images/vietnamese-google.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/page/images/google-results.webp b/content/academy/puppeteer_playwright/page/images/google-results.webp deleted file mode 100644 index 29bfc122c8..0000000000 Binary files a/content/academy/puppeteer_playwright/page/images/google-results.webp and /dev/null differ diff --git a/content/academy/puppeteer_playwright/page/images/wikipedia-screenshot.webp b/content/academy/puppeteer_playwright/page/images/wikipedia-screenshot.webp deleted file mode 100644 index bbfa00bd75..0000000000 Binary files a/content/academy/puppeteer_playwright/page/images/wikipedia-screenshot.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-time-offset.webp b/content/academy/python/images/bbc-time-offset.webp deleted file mode 100644 index fccd4ec35c..0000000000 Binary files a/content/academy/python/images/bbc-time-offset.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-weather-after-midnight.webp b/content/academy/python/images/bbc-weather-after-midnight.webp deleted file mode 100644 index 37a51b2d48..0000000000 Binary files a/content/academy/python/images/bbc-weather-after-midnight.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-weather-devtools.webp b/content/academy/python/images/bbc-weather-devtools.webp deleted file mode 100644 index 30553aa05c..0000000000 Binary files a/content/academy/python/images/bbc-weather-devtools.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-weather-parser-source.webp b/content/academy/python/images/bbc-weather-parser-source.webp deleted file mode 100644 index a67956fb7f..0000000000 Binary files a/content/academy/python/images/bbc-weather-parser-source.webp and /dev/null differ diff --git 
a/content/academy/python/images/bbc-weather-prediction.webp b/content/academy/python/images/bbc-weather-prediction.webp deleted file mode 100644 index c72f55f1e3..0000000000 Binary files a/content/academy/python/images/bbc-weather-prediction.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-weather-scraper-source.webp b/content/academy/python/images/bbc-weather-scraper-source.webp deleted file mode 100644 index 862567a9b7..0000000000 Binary files a/content/academy/python/images/bbc-weather-scraper-source.webp and /dev/null differ diff --git a/content/academy/python/images/bbc-weather-url-format.webp b/content/academy/python/images/bbc-weather-url-format.webp deleted file mode 100644 index e66e51f1a0..0000000000 Binary files a/content/academy/python/images/bbc-weather-url-format.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/another-error.webp b/content/academy/switching_to_typescript/images/another-error.webp deleted file mode 100644 index e276fd3113..0000000000 Binary files a/content/academy/switching_to_typescript/images/another-error.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/constant-autofill.webp b/content/academy/switching_to_typescript/images/constant-autofill.webp deleted file mode 100644 index 039fbb689a..0000000000 Binary files a/content/academy/switching_to_typescript/images/constant-autofill.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/epic-autocomplete.webp b/content/academy/switching_to_typescript/images/epic-autocomplete.webp deleted file mode 100644 index e296af9c1f..0000000000 Binary files a/content/academy/switching_to_typescript/images/epic-autocomplete.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/is-any.webp b/content/academy/switching_to_typescript/images/is-any.webp deleted file mode 100644 index 0eed901ca9..0000000000 Binary files 
a/content/academy/switching_to_typescript/images/is-any.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/no-more-error.webp b/content/academy/switching_to_typescript/images/no-more-error.webp deleted file mode 100644 index 3a6e03657a..0000000000 Binary files a/content/academy/switching_to_typescript/images/no-more-error.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/number-inference.webp b/content/academy/switching_to_typescript/images/number-inference.webp deleted file mode 100644 index 1f128bc410..0000000000 Binary files a/content/academy/switching_to_typescript/images/number-inference.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/object-inference.webp b/content/academy/switching_to_typescript/images/object-inference.webp deleted file mode 100644 index 2e3ef2d562..0000000000 Binary files a/content/academy/switching_to_typescript/images/object-inference.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/object-type-error.webp b/content/academy/switching_to_typescript/images/object-type-error.webp deleted file mode 100644 index f7615d038f..0000000000 Binary files a/content/academy/switching_to_typescript/images/object-type-error.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/parameter-type.webp b/content/academy/switching_to_typescript/images/parameter-type.webp deleted file mode 100644 index bb4c2fde57..0000000000 Binary files a/content/academy/switching_to_typescript/images/parameter-type.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/pasted-example.webp b/content/academy/switching_to_typescript/images/pasted-example.webp deleted file mode 100644 index 9910387f89..0000000000 Binary files a/content/academy/switching_to_typescript/images/pasted-example.webp and /dev/null differ diff --git 
a/content/academy/switching_to_typescript/images/promise-any.webp b/content/academy/switching_to_typescript/images/promise-any.webp deleted file mode 100644 index bcf276f7a7..0000000000 Binary files a/content/academy/switching_to_typescript/images/promise-any.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/replace-with-unknown.webp b/content/academy/switching_to_typescript/images/replace-with-unknown.webp deleted file mode 100644 index bcb0486fb3..0000000000 Binary files a/content/academy/switching_to_typescript/images/replace-with-unknown.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/return-inferred.webp b/content/academy/switching_to_typescript/images/return-inferred.webp deleted file mode 100644 index f36ee43ef9..0000000000 Binary files a/content/academy/switching_to_typescript/images/return-inferred.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/string-not-number.webp b/content/academy/switching_to_typescript/images/string-not-number.webp deleted file mode 100644 index dd1c139bfc..0000000000 Binary files a/content/academy/switching_to_typescript/images/string-not-number.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/typescript-error.webp b/content/academy/switching_to_typescript/images/typescript-error.webp deleted file mode 100644 index badee1c272..0000000000 Binary files a/content/academy/switching_to_typescript/images/typescript-error.webp and /dev/null differ diff --git a/content/academy/switching_to_typescript/images/we-need-overloads.webp b/content/academy/switching_to_typescript/images/we-need-overloads.webp deleted file mode 100644 index b0ac03a196..0000000000 Binary files a/content/academy/switching_to_typescript/images/we-need-overloads.webp and /dev/null differ diff --git a/content/academy/tools/images/edit-this-cookie-options.webp b/content/academy/tools/images/edit-this-cookie-options.webp 
deleted file mode 100644 index 4e757d93ff..0000000000 Binary files a/content/academy/tools/images/edit-this-cookie-options.webp and /dev/null differ diff --git a/content/academy/tools/images/edit-this-cookie-popup.webp b/content/academy/tools/images/edit-this-cookie-popup.webp deleted file mode 100644 index 113995f3db..0000000000 Binary files a/content/academy/tools/images/edit-this-cookie-popup.webp and /dev/null differ diff --git a/content/academy/tools/images/insomnia-cookies.webp b/content/academy/tools/images/insomnia-cookies.webp deleted file mode 100644 index b47a187fee..0000000000 Binary files a/content/academy/tools/images/insomnia-cookies.webp and /dev/null differ diff --git a/content/academy/tools/images/insomnia-interface.webp b/content/academy/tools/images/insomnia-interface.webp deleted file mode 100644 index 6b74b57f89..0000000000 Binary files a/content/academy/tools/images/insomnia-interface.webp and /dev/null differ diff --git a/content/academy/tools/images/insomnia-manage-cookies.webp b/content/academy/tools/images/insomnia-manage-cookies.webp deleted file mode 100644 index 70eeff254d..0000000000 Binary files a/content/academy/tools/images/insomnia-manage-cookies.webp and /dev/null differ diff --git a/content/academy/tools/images/insomnia-proxy.webp b/content/academy/tools/images/insomnia-proxy.webp deleted file mode 100644 index 60b65c7a35..0000000000 Binary files a/content/academy/tools/images/insomnia-proxy.webp and /dev/null differ diff --git a/content/academy/tools/images/insomnia-timeline.webp b/content/academy/tools/images/insomnia-timeline.webp deleted file mode 100644 index 4a61570437..0000000000 Binary files a/content/academy/tools/images/insomnia-timeline.webp and /dev/null differ diff --git a/content/academy/tools/images/js-off.webp b/content/academy/tools/images/js-off.webp deleted file mode 100644 index 4ef22b83bf..0000000000 Binary files a/content/academy/tools/images/js-off.webp and /dev/null differ diff --git 
a/content/academy/tools/images/js-on.webp b/content/academy/tools/images/js-on.webp deleted file mode 100644 index f7eb57dfe0..0000000000 Binary files a/content/academy/tools/images/js-on.webp and /dev/null differ diff --git a/content/academy/tools/images/modheader.webp b/content/academy/tools/images/modheader.webp deleted file mode 100644 index 0189e9c073..0000000000 Binary files a/content/academy/tools/images/modheader.webp and /dev/null differ diff --git a/content/academy/tools/images/postman-cookies-button.webp b/content/academy/tools/images/postman-cookies-button.webp deleted file mode 100644 index 879af238ad..0000000000 Binary files a/content/academy/tools/images/postman-cookies-button.webp and /dev/null differ diff --git a/content/academy/tools/images/postman-interface.webp b/content/academy/tools/images/postman-interface.webp deleted file mode 100644 index 8f2347d305..0000000000 Binary files a/content/academy/tools/images/postman-interface.webp and /dev/null differ diff --git a/content/academy/tools/images/postman-manage-cookies.webp b/content/academy/tools/images/postman-manage-cookies.webp deleted file mode 100644 index 4bd95fdc6f..0000000000 Binary files a/content/academy/tools/images/postman-manage-cookies.webp and /dev/null differ diff --git a/content/academy/tools/images/postman-proxy.webp b/content/academy/tools/images/postman-proxy.webp deleted file mode 100644 index d19645cbb8..0000000000 Binary files a/content/academy/tools/images/postman-proxy.webp and /dev/null differ diff --git a/content/academy/tools/images/proxyman-apps-tab.webp b/content/academy/tools/images/proxyman-apps-tab.webp deleted file mode 100644 index 9d1d87e2dd..0000000000 Binary files a/content/academy/tools/images/proxyman-apps-tab.webp and /dev/null differ diff --git a/content/academy/tools/images/proxyman-filter.webp b/content/academy/tools/images/proxyman-filter.webp deleted file mode 100644 index b599a65ac9..0000000000 Binary files 
a/content/academy/tools/images/proxyman-filter.webp and /dev/null differ diff --git a/content/academy/tools/images/proxyman-results.webp b/content/academy/tools/images/proxyman-results.webp deleted file mode 100644 index bbe7a8a011..0000000000 Binary files a/content/academy/tools/images/proxyman-results.webp and /dev/null differ diff --git a/content/academy/tools/images/proxyman-view-request.webp b/content/academy/tools/images/proxyman-view-request.webp deleted file mode 100644 index d2c437612a..0000000000 Binary files a/content/academy/tools/images/proxyman-view-request.webp and /dev/null differ diff --git a/content/academy/tools/images/settings-integrations.webp b/content/academy/tools/images/settings-integrations.webp deleted file mode 100644 index 05b478fe7a..0000000000 Binary files a/content/academy/tools/images/settings-integrations.webp and /dev/null differ diff --git a/content/academy/tools/images/switchyomega-auth.webp b/content/academy/tools/images/switchyomega-auth.webp deleted file mode 100644 index d9655416f0..0000000000 Binary files a/content/academy/tools/images/switchyomega-auth.webp and /dev/null differ diff --git a/content/academy/tools/images/switchyomega-menu.webp b/content/academy/tools/images/switchyomega-menu.webp deleted file mode 100644 index 230679091a..0000000000 Binary files a/content/academy/tools/images/switchyomega-menu.webp and /dev/null differ diff --git a/content/academy/tools/images/switchyomega-options.webp b/content/academy/tools/images/switchyomega-options.webp deleted file mode 100644 index 3f64aa646b..0000000000 Binary files a/content/academy/tools/images/switchyomega-options.webp and /dev/null differ diff --git a/content/academy/tools/images/switchyomega-proxy-profile.webp b/content/academy/tools/images/switchyomega-proxy-profile.webp deleted file mode 100644 index 15e5a86aba..0000000000 Binary files a/content/academy/tools/images/switchyomega-proxy-profile.webp and /dev/null differ diff --git 
a/content/academy/tools/images/switchyomega-proxy-settings.webp b/content/academy/tools/images/switchyomega-proxy-settings.webp deleted file mode 100644 index 6380fe54de..0000000000 Binary files a/content/academy/tools/images/switchyomega-proxy-settings.webp and /dev/null differ diff --git a/content/academy/tools/images/switchyomega.webp b/content/academy/tools/images/switchyomega.webp deleted file mode 100644 index 441fe0339a..0000000000 Binary files a/content/academy/tools/images/switchyomega.webp and /dev/null differ diff --git a/content/academy/tools/images/user-agent-switcher-agents.webp b/content/academy/tools/images/user-agent-switcher-agents.webp deleted file mode 100644 index 98cfa7219b..0000000000 Binary files a/content/academy/tools/images/user-agent-switcher-agents.webp and /dev/null differ diff --git a/content/academy/tools/images/user-agent-switcher-config.webp b/content/academy/tools/images/user-agent-switcher-config.webp deleted file mode 100644 index 7004cd99eb..0000000000 Binary files a/content/academy/tools/images/user-agent-switcher-config.webp and /dev/null differ diff --git a/content/academy/tools/images/user-agent-switcher-groups.webp b/content/academy/tools/images/user-agent-switcher-groups.webp deleted file mode 100644 index 95f76db6eb..0000000000 Binary files a/content/academy/tools/images/user-agent-switcher-groups.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/challenge/images/crawlee-create.webp b/content/academy/web_scraping_for_beginners/challenge/images/crawlee-create.webp deleted file mode 100644 index c43f73b5d4..0000000000 Binary files a/content/academy/web_scraping_for_beginners/challenge/images/crawlee-create.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/challenge/images/offers-page.webp b/content/academy/web_scraping_for_beginners/challenge/images/offers-page.webp deleted file mode 100644 index 7e4d49ba45..0000000000 Binary files 
a/content/academy/web_scraping_for_beginners/challenge/images/offers-page.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/challenge/images/view-offers-button.webp b/content/academy/web_scraping_for_beginners/challenge/images/view-offers-button.webp deleted file mode 100644 index 1883b21b44..0000000000 Binary files a/content/academy/web_scraping_for_beginners/challenge/images/view-offers-button.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/crawling/images/api-token.webp b/content/academy/web_scraping_for_beginners/crawling/images/api-token.webp deleted file mode 100644 index 1bd0e3eb6b..0000000000 Binary files a/content/academy/web_scraping_for_beginners/crawling/images/api-token.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/crawling/images/final-results.webp b/content/academy/web_scraping_for_beginners/crawling/images/final-results.webp deleted file mode 100644 index ba96c08f29..0000000000 Binary files a/content/academy/web_scraping_for_beginners/crawling/images/final-results.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/crawling/images/nested-tag.webp b/content/academy/web_scraping_for_beginners/crawling/images/nested-tag.webp deleted file mode 100644 index 2d7c4becc4..0000000000 Binary files a/content/academy/web_scraping_for_beginners/crawling/images/nested-tag.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/crawling/images/product-urls.webp b/content/academy/web_scraping_for_beginners/crawling/images/product-urls.webp deleted file mode 100644 index 5e6e3a5a1f..0000000000 Binary files a/content/academy/web_scraping_for_beginners/crawling/images/product-urls.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console-commands.webp 
b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console-commands.webp deleted file mode 100644 index 1ac3e697fd..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console-commands.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console.webp deleted file mode 100644 index c547f6df72..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-console.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-element-selection.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-element-selection.webp deleted file mode 100644 index 0d450a6382..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-element-selection.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-elements-tab.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-elements-tab.webp deleted file mode 100644 index 13f4cb8bd5..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-elements-tab.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-hover.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-hover.webp deleted file mode 100644 index 38fef75c9c..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-hover.webp and /dev/null differ diff --git 
a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-wikipedia.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-wikipedia.webp deleted file mode 100644 index f7894c72c1..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/browser-devtools-wikipedia.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/csv-data-in-sheets.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/csv-data-in-sheets.webp deleted file mode 100644 index a6e8cf831e..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/csv-data-in-sheets.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-clean-price.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-clean-price.webp deleted file mode 100644 index 3c18952110..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-clean-price.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-cleaning-noise.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-cleaning-noise.webp deleted file mode 100644 index a652455998..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-cleaning-noise.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-class.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-class.webp deleted file mode 100644 index 5b5e8624ad..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-class.webp and /dev/null differ diff --git 
a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-hover.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-hover.webp deleted file mode 100644 index 6c80f63adf..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-hover.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-name.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-name.webp deleted file mode 100644 index 22f4d2eaae..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-product-name.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-all.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-all.webp deleted file mode 100644 index e0dcf0000c..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-all.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-hover.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-hover.webp deleted file mode 100644 index 061e666883..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query-hover.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query.webp deleted file mode 100644 index a8441758f4..0000000000 Binary files 
a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-query.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-warehouse.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-warehouse.webp deleted file mode 100644 index 1a5e1cfea0..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-collection-warehouse.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-count-products.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-count-products.webp deleted file mode 100644 index c743239895..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-count-products.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-price.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-price.webp deleted file mode 100644 index b6d27f38b6..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-price.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-title.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-title.webp deleted file mode 100644 index b65f2fcff6..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-extract-product-title.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-find-child-elements.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-find-child-elements.webp deleted file mode 100644 index 
24980e2c8f..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-find-child-elements.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-all-products.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-all-products.webp deleted file mode 100644 index f8d85aec08..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-all-products.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-parent-text.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-parent-text.webp deleted file mode 100644 index bcdb113975..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-print-parent-text.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-product-titles.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-product-titles.webp deleted file mode 100644 index fa5052fb8b..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-product-titles.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-split-price.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-split-price.webp deleted file mode 100644 index 40cd837d38..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/devtools-split-price.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/node-scraper-title.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/node-scraper-title.webp deleted file mode 100644 index a8daf11ca0..0000000000 Binary files 
a/content/academy/web_scraping_for_beginners/data_extraction/images/node-scraper-title.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-create-file.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-create-file.webp deleted file mode 100644 index fb1e470762..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-create-file.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-hello-world.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-hello-world.webp deleted file mode 100644 index ffb07a0334..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-hello-world.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-npm-init.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-npm-init.webp deleted file mode 100644 index b962935a2f..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-npm-init.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-folder.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-folder.webp deleted file mode 100644 index 9d447494d3..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-folder.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-terminal.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-terminal.webp deleted file mode 100644 index 7de6a6c6f1..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-open-terminal.webp and /dev/null differ diff --git 
a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-test-setup.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-test-setup.webp deleted file mode 100644 index 78b7f99280..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-test-setup.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-type-module.webp b/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-type-module.webp deleted file mode 100644 index 76c22e5e78..0000000000 Binary files a/content/academy/web_scraping_for_beginners/data_extraction/images/vscode-type-module.webp and /dev/null differ diff --git a/content/academy/web_scraping_for_beginners/images/beginners-data-extraction.webp b/content/academy/web_scraping_for_beginners/images/beginners-data-extraction.webp deleted file mode 100644 index 05746d3203..0000000000 Binary files a/content/academy/web_scraping_for_beginners/images/beginners-data-extraction.webp and /dev/null differ diff --git a/content/docs/access_rights/images/configure-permissions.webp b/content/docs/access_rights/images/configure-permissions.webp deleted file mode 100644 index 568bdfeece..0000000000 Binary files a/content/docs/access_rights/images/configure-permissions.webp and /dev/null differ diff --git a/content/docs/access_rights/images/convert-to-organization.webp b/content/docs/access_rights/images/convert-to-organization.webp deleted file mode 100644 index 92ca730014..0000000000 Binary files a/content/docs/access_rights/images/convert-to-organization.webp and /dev/null differ diff --git a/content/docs/access_rights/images/create-new-org.webp b/content/docs/access_rights/images/create-new-org.webp deleted file mode 100644 index 3cc2f05ba1..0000000000 Binary files a/content/docs/access_rights/images/create-new-org.webp and /dev/null differ diff --git a/content/docs/access_rights/images/integrations.webp 
b/content/docs/access_rights/images/integrations.webp deleted file mode 100644 index 7bdb27e5d1..0000000000 Binary files a/content/docs/access_rights/images/integrations.webp and /dev/null differ diff --git a/content/docs/access_rights/images/members.webp b/content/docs/access_rights/images/members.webp deleted file mode 100644 index 9390b5e335..0000000000 Binary files a/content/docs/access_rights/images/members.webp and /dev/null differ diff --git a/content/docs/access_rights/images/my-organizations.webp b/content/docs/access_rights/images/my-organizations.webp deleted file mode 100644 index 7a1b3f9199..0000000000 Binary files a/content/docs/access_rights/images/my-organizations.webp and /dev/null differ diff --git a/content/docs/access_rights/images/roles.webp b/content/docs/access_rights/images/roles.webp deleted file mode 100644 index 004822f83c..0000000000 Binary files a/content/docs/access_rights/images/roles.webp and /dev/null differ diff --git a/content/docs/access_rights/images/switch-to-organization.webp b/content/docs/access_rights/images/switch-to-organization.webp deleted file mode 100644 index 12144a1aa5..0000000000 Binary files a/content/docs/access_rights/images/switch-to-organization.webp and /dev/null differ diff --git a/content/docs/access_rights/images/upgrade.webp b/content/docs/access_rights/images/upgrade.webp deleted file mode 100644 index 32432c6717..0000000000 Binary files a/content/docs/access_rights/images/upgrade.webp and /dev/null differ diff --git a/content/docs/actors.md b/content/docs/actors.md deleted file mode 100644 index 134b0d8242..0000000000 --- a/content/docs/actors.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Actors -description: Learn how to develop, run and share serverless cloud programs. Create your own web scraping and automation tools and publish them on the Apify platform. 
-menuWeight: 7 -category: platform -paths: -# NOTE: IF ADDING A NEW PATH, LEAVE THE OLD ONES FOR REDIRECTS - - actor - - actors ---- - -# Actors - -Actors are serverless cloud programs that can do almost anything a human can do in a web browser. They can do anything from small tasks like filling in forms or unsubscribing from online services, all the way up to scraping and processing vast numbers of web pages. - -You can use actors [manually in the Apify Console](https://console.apify.com/actors), using [API](/api/v2) or [scheduler]({{@link schedules.md}}). You can easily [integrate them with other apps]({{@link integrations.md}}) and share your actors with other Apify users via our [access rights]({{@link access_rights.md}}) system. - -> New to Apify? [Try actors with our **quick start** tutorial]({{@link tutorials/quick_start.md}}). - -A single isolated actor consists of source code and various settings. You can think of an actor as a cloud app or service that runs on the Apify platform. The run of an actor is not limited to the lifetime of a single HTTP transaction. It can run for as long as necessary, even forever. 
- -## Section overview - -* [Running]({{@link actors/running.md}}) - * [Input]({{@link actors/running/input.md}}) - * [Memory and CPU]({{@link actors/running/memory_and_cpu.md}}) - * [Compute units and consumption]({{@link actors/running/compute_units.md}}) -* [Tasks]({{@link actors/tasks.md}}) -* [Development]({{@link actors/development.md}}) - * [Base Docker images]({{@link actors/development/base_docker_images.md}}) - * [Builds]({{@link actors/development/builds.md}}) - * [Continuous integration]({{@link actors/development/continuous_integration.md}}) - * [Environment variables]({{@link actors/development/environment_variables.md}}) - * [Input schema]({{@link actors/development/input_schema.md}}) - * [Secret input]({{@link actors/development/secret_input.md}}) - * [Source code]({{@link actors/development/source_code.md}}) - * [State persistence]({{@link actors/development/state_persistence.md}}) - * [Testing and maintenance]({{@link actors/development/testing_and_maintenance.md}}) -* [Paid actors]({{@link actors/paid_actors.md}}) -* [Publishing]({{@link actors/publishing.md}}) -* [Naming your actor](https://developers.apify.com/academy/apify-platform/get-most-of-actors/naming-your-actor) -* [SEO and promotion](https://developers.apify.com/academy/apify-platform/get-most-of-actors/seo-and-promotion) -* [Security]({{@link actors/security.md}}) -* [Limits]({{@link actors/limits.md}}) -* [Examples]({{@link actors/examples.md}}) - -## Public, private, and paid actors - -Actors can be public (free or [paid]({{@link actors/paid_actors.md}})) or private. Private actors are yours to use and keep, and no one will see them if you don't want them to. Public actors are [available to everyone]({{@link actors/publishing.md}}) in [Apify Store](https://apify.com/store). You can make them free to use, or you can [charge for them](https://blog.apify.com/make-regular-passive-income-developing-web-automation-actors-b0392278d085/). 
- diff --git a/content/docs/actors/development/images/ci-add-build-url.webp b/content/docs/actors/development/images/ci-add-build-url.webp deleted file mode 100644 index 8b9c295ff5..0000000000 Binary files a/content/docs/actors/development/images/ci-add-build-url.webp and /dev/null differ diff --git a/content/docs/actors/development/images/ci-builds.webp b/content/docs/actors/development/images/ci-builds.webp deleted file mode 100644 index 5ed109b6c2..0000000000 Binary files a/content/docs/actors/development/images/ci-builds.webp and /dev/null differ diff --git a/content/docs/actors/development/images/ci-token.webp b/content/docs/actors/development/images/ci-token.webp deleted file mode 100644 index 9621801342..0000000000 Binary files a/content/docs/actors/development/images/ci-token.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-colors.webp b/content/docs/actors/development/images/input-schema-colors.webp deleted file mode 100644 index 9b7326a0b7..0000000000 Binary files a/content/docs/actors/development/images/input-schema-colors.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-country.webp b/content/docs/actors/development/images/input-schema-country.webp deleted file mode 100644 index fef6b94eef..0000000000 Binary files a/content/docs/actors/development/images/input-schema-country.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-example.webp b/content/docs/actors/development/images/input-schema-example.webp deleted file mode 100644 index 6656d002c3..0000000000 Binary files a/content/docs/actors/development/images/input-schema-example.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-memory.webp b/content/docs/actors/development/images/input-schema-memory.webp deleted file mode 100644 index 20da020273..0000000000 Binary files a/content/docs/actors/development/images/input-schema-memory.webp and 
/dev/null differ diff --git a/content/docs/actors/development/images/input-schema-options.webp b/content/docs/actors/development/images/input-schema-options.webp deleted file mode 100644 index 854b05a3e3..0000000000 Binary files a/content/docs/actors/development/images/input-schema-options.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-page-function.webp b/content/docs/actors/development/images/input-schema-page-function.webp deleted file mode 100644 index ecf0f2584e..0000000000 Binary files a/content/docs/actors/development/images/input-schema-page-function.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-proxy.webp b/content/docs/actors/development/images/input-schema-proxy.webp deleted file mode 100644 index 9bd4b1c931..0000000000 Binary files a/content/docs/actors/development/images/input-schema-proxy.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-start-urls.webp b/content/docs/actors/development/images/input-schema-start-urls.webp deleted file mode 100644 index c3c043edeb..0000000000 Binary files a/content/docs/actors/development/images/input-schema-start-urls.webp and /dev/null differ diff --git a/content/docs/actors/development/images/input-schema-user.webp b/content/docs/actors/development/images/input-schema-user.webp deleted file mode 100644 index 0fc04fcf21..0000000000 Binary files a/content/docs/actors/development/images/input-schema-user.webp and /dev/null differ diff --git a/content/docs/actors/development/images/output-schema-example.webp b/content/docs/actors/development/images/output-schema-example.webp deleted file mode 100644 index b3649ee06b..0000000000 Binary files a/content/docs/actors/development/images/output-schema-example.webp and /dev/null differ diff --git a/content/docs/actors/development/images/secret-input-editor.webp b/content/docs/actors/development/images/secret-input-editor.webp deleted file mode 100644 
index b88f10dbb8..0000000000 Binary files a/content/docs/actors/development/images/secret-input-editor.webp and /dev/null differ diff --git a/content/docs/actors/development/images/testing-tasks.webp b/content/docs/actors/development/images/testing-tasks.webp deleted file mode 100644 index d6e748002b..0000000000 Binary files a/content/docs/actors/development/images/testing-tasks.webp and /dev/null differ diff --git a/content/docs/actors/images/actor-console.webp b/content/docs/actors/images/actor-console.webp deleted file mode 100644 index 93a65865b7..0000000000 Binary files a/content/docs/actors/images/actor-console.webp and /dev/null differ diff --git a/content/docs/actors/images/actor-usage.webp b/content/docs/actors/images/actor-usage.webp deleted file mode 100644 index c7cb93086b..0000000000 Binary files a/content/docs/actors/images/actor-usage.webp and /dev/null differ diff --git a/content/docs/actors/images/create-task-configure.webp b/content/docs/actors/images/create-task-configure.webp deleted file mode 100644 index 636382814a..0000000000 Binary files a/content/docs/actors/images/create-task-configure.webp and /dev/null differ diff --git a/content/docs/actors/images/create-task-run.webp b/content/docs/actors/images/create-task-run.webp deleted file mode 100644 index d4191aa70f..0000000000 Binary files a/content/docs/actors/images/create-task-run.webp and /dev/null differ diff --git a/content/docs/actors/images/create-task-settings.webp b/content/docs/actors/images/create-task-settings.webp deleted file mode 100644 index fd8850b756..0000000000 Binary files a/content/docs/actors/images/create-task-settings.webp and /dev/null differ diff --git a/content/docs/actors/images/create-task.webp b/content/docs/actors/images/create-task.webp deleted file mode 100644 index 625d96859f..0000000000 Binary files a/content/docs/actors/images/create-task.webp and /dev/null differ diff --git a/content/docs/actors/images/gist-settings.webp 
b/content/docs/actors/images/gist-settings.webp deleted file mode 100644 index 02e54032d3..0000000000 Binary files a/content/docs/actors/images/gist-settings.webp and /dev/null differ diff --git a/content/docs/actors/images/github-integration.webp b/content/docs/actors/images/github-integration.webp deleted file mode 100644 index b2d4d3077f..0000000000 Binary files a/content/docs/actors/images/github-integration.webp and /dev/null differ diff --git a/content/docs/actors/images/memory-cpu-usage-spike.webp b/content/docs/actors/images/memory-cpu-usage-spike.webp deleted file mode 100644 index 91a7d76214..0000000000 Binary files a/content/docs/actors/images/memory-cpu-usage-spike.webp and /dev/null differ diff --git a/content/docs/actors/images/memory-settings.webp b/content/docs/actors/images/memory-settings.webp deleted file mode 100644 index 601a17420b..0000000000 Binary files a/content/docs/actors/images/memory-settings.webp and /dev/null differ diff --git a/content/docs/actors/images/paid-actors-billing.webp b/content/docs/actors/images/paid-actors-billing.webp deleted file mode 100644 index a026e90a20..0000000000 Binary files a/content/docs/actors/images/paid-actors-billing.webp and /dev/null differ diff --git a/content/docs/actors/images/paid-actors-issues-tab.webp b/content/docs/actors/images/paid-actors-issues-tab.webp deleted file mode 100644 index 8e3471220e..0000000000 Binary files a/content/docs/actors/images/paid-actors-issues-tab.webp and /dev/null differ diff --git a/content/docs/actors/images/paid-actors-store.webp b/content/docs/actors/images/paid-actors-store.webp deleted file mode 100644 index 7cc09a548d..0000000000 Binary files a/content/docs/actors/images/paid-actors-store.webp and /dev/null differ diff --git a/content/docs/actors/images/paid-actors-trial.webp b/content/docs/actors/images/paid-actors-trial.webp deleted file mode 100644 index 798e2d636e..0000000000 Binary files a/content/docs/actors/images/paid-actors-trial.webp and /dev/null 
differ diff --git a/content/docs/actors/images/run-console.webp b/content/docs/actors/images/run-console.webp deleted file mode 100644 index cab1ac16d1..0000000000 Binary files a/content/docs/actors/images/run-console.webp and /dev/null differ diff --git a/content/docs/actors/images/source-env-vars.webp b/content/docs/actors/images/source-env-vars.webp deleted file mode 100644 index 0ef8e41104..0000000000 Binary files a/content/docs/actors/images/source-env-vars.webp and /dev/null differ diff --git a/content/docs/api.md b/content/docs/api.md deleted file mode 100644 index 489831b8ba..0000000000 --- a/content/docs/api.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: API reference -description: Manage, build and run actors, access and manage your storages with the Apify REST API (v2). Test API endpoints and create URL requests using the Blueprint GUI. -menuWeight: 13 -category: platform -paths: - - api - - api-reference ---- - - diff --git a/content/docs/apify_client_js.md b/content/docs/apify_client_js.md deleted file mode 100644 index 168c8b7b81..0000000000 --- a/content/docs/apify_client_js.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: JavaScript API client -description: Simplified access to the Apify API from any JavaScript or Node.js application. Manage your actors, tasks and storage via API with the apify-client NPM package. -category: developer tools -menuWeight: 14 -paths: - - api/apify-client-js - - api/apify-client-js/latest - - apify-client-js/latest - - apify-client-js ---- diff --git a/content/docs/apify_client_python.md b/content/docs/apify_client_python.md deleted file mode 100644 index 147551eca8..0000000000 --- a/content/docs/apify_client_python.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Python API client -description: Simplified access to the Apify API from any Python application. Manage your actors, tasks and storage via API with the apify_client PyPI package. 
-category: developer tools -menuWeight: 15 -paths: - - api/apify-client-python - - api/apify-client-python/latest - - apify-client-python/latest - - apify-client-python ---- diff --git a/content/docs/cli.md b/content/docs/cli.md deleted file mode 100644 index 5f6559d82f..0000000000 --- a/content/docs/cli.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Command-line interface -description: Create new actors from your computer's command line. Run actors locally or deploy them to the Apify platform. View the Apify CLI's command reference. -externalSourceUrl: https://raw.githubusercontent.com/apifytech/apify-cli/master/README.md -menuWeight: 16 -category: developer tools -paths: - - cli ---- - diff --git a/content/docs/images/access-rights.webp b/content/docs/images/access-rights.webp deleted file mode 100644 index e7e68e4eb9..0000000000 Binary files a/content/docs/images/access-rights.webp and /dev/null differ diff --git a/content/docs/images/datasets-overview.webp b/content/docs/images/datasets-overview.webp deleted file mode 100644 index 2de4811c3a..0000000000 Binary files a/content/docs/images/datasets-overview.webp and /dev/null differ diff --git a/content/docs/images/proxy-custom.webp b/content/docs/images/proxy-custom.webp deleted file mode 100644 index c398972016..0000000000 Binary files a/content/docs/images/proxy-custom.webp and /dev/null differ diff --git a/content/docs/images/publication.webp b/content/docs/images/publication.webp deleted file mode 100644 index a0a46453f3..0000000000 Binary files a/content/docs/images/publication.webp and /dev/null differ diff --git a/content/docs/images/schedule-actor-run.webp b/content/docs/images/schedule-actor-run.webp deleted file mode 100644 index c2118438ea..0000000000 Binary files a/content/docs/images/schedule-actor-run.webp and /dev/null differ diff --git a/content/docs/images/schedule-add-tasks.webp b/content/docs/images/schedule-add-tasks.webp deleted file mode 100644 index 75f93cccea..0000000000 Binary files 
a/content/docs/images/schedule-add-tasks.webp and /dev/null differ diff --git a/content/docs/images/schedule-settings.webp b/content/docs/images/schedule-settings.webp deleted file mode 100644 index 41d1598284..0000000000 Binary files a/content/docs/images/schedule-settings.webp and /dev/null differ diff --git a/content/docs/images/schedule-setup.webp b/content/docs/images/schedule-setup.webp deleted file mode 100644 index 07c9e9ab5e..0000000000 Binary files a/content/docs/images/schedule-setup.webp and /dev/null differ diff --git a/content/docs/integrations/images/api-token.webp b/content/docs/integrations/images/api-token.webp deleted file mode 100644 index d77e4ae1e0..0000000000 Binary files a/content/docs/integrations/images/api-token.webp and /dev/null differ diff --git a/content/docs/integrations/images/integrations-tab.webp b/content/docs/integrations/images/integrations-tab.webp deleted file mode 100644 index 7e9bbf3d55..0000000000 Binary files a/content/docs/integrations/images/integrations-tab.webp and /dev/null differ diff --git a/content/docs/integrations/images/slack-apify-message.webp b/content/docs/integrations/images/slack-apify-message.webp deleted file mode 100644 index f12dec63f8..0000000000 Binary files a/content/docs/integrations/images/slack-apify-message.webp and /dev/null differ diff --git a/content/docs/integrations/images/slack-integration-setup.webp b/content/docs/integrations/images/slack-integration-setup.webp deleted file mode 100644 index 541679449c..0000000000 Binary files a/content/docs/integrations/images/slack-integration-setup.webp and /dev/null differ diff --git a/content/docs/monitoring/images/covid-config.webp b/content/docs/monitoring/images/covid-config.webp deleted file mode 100644 index 369c5c2bee..0000000000 Binary files a/content/docs/monitoring/images/covid-config.webp and /dev/null differ diff --git a/content/docs/monitoring/images/covid-multiple-tasks.webp b/content/docs/monitoring/images/covid-multiple-tasks.webp 
deleted file mode 100644 index a3e30be2d1..0000000000 Binary files a/content/docs/monitoring/images/covid-multiple-tasks.webp and /dev/null differ diff --git a/content/docs/monitoring/images/covid-validate-schema.webp b/content/docs/monitoring/images/covid-validate-schema.webp deleted file mode 100644 index 28cb36e535..0000000000 Binary files a/content/docs/monitoring/images/covid-validate-schema.webp and /dev/null differ diff --git a/content/docs/monitoring/images/customize-notifications.webp b/content/docs/monitoring/images/customize-notifications.webp deleted file mode 100644 index 7f70f8e971..0000000000 Binary files a/content/docs/monitoring/images/customize-notifications.webp and /dev/null differ diff --git a/content/docs/monitoring/images/enable-dashboard.webp b/content/docs/monitoring/images/enable-dashboard.webp deleted file mode 100644 index 0649c44c62..0000000000 Binary files a/content/docs/monitoring/images/enable-dashboard.webp and /dev/null differ diff --git a/content/docs/monitoring/images/iphone-check-duplicates.webp b/content/docs/monitoring/images/iphone-check-duplicates.webp deleted file mode 100644 index 6e3ab313a0..0000000000 Binary files a/content/docs/monitoring/images/iphone-check-duplicates.webp and /dev/null differ diff --git a/content/docs/monitoring/images/iphone-task.webp b/content/docs/monitoring/images/iphone-task.webp deleted file mode 100644 index e7779e0a27..0000000000 Binary files a/content/docs/monitoring/images/iphone-task.webp and /dev/null differ diff --git a/content/docs/monitoring/images/iphone-validate-data.webp b/content/docs/monitoring/images/iphone-validate-data.webp deleted file mode 100644 index b87eaa357d..0000000000 Binary files a/content/docs/monitoring/images/iphone-validate-data.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-duplicates.webp b/content/docs/monitoring/images/joke-duplicates.webp deleted file mode 100644 index 316825225e..0000000000 Binary files 
a/content/docs/monitoring/images/joke-duplicates.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-monitoring-config.webp b/content/docs/monitoring/images/joke-monitoring-config.webp deleted file mode 100644 index 99346b93fb..0000000000 Binary files a/content/docs/monitoring/images/joke-monitoring-config.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-schedule.webp b/content/docs/monitoring/images/joke-schedule.webp deleted file mode 100644 index b57c49b0e3..0000000000 Binary files a/content/docs/monitoring/images/joke-schedule.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-scraper-tasks.webp b/content/docs/monitoring/images/joke-scraper-tasks.webp deleted file mode 100644 index 17c35f5025..0000000000 Binary files a/content/docs/monitoring/images/joke-scraper-tasks.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-storage.webp b/content/docs/monitoring/images/joke-storage.webp deleted file mode 100644 index 567836ec08..0000000000 Binary files a/content/docs/monitoring/images/joke-storage.webp and /dev/null differ diff --git a/content/docs/monitoring/images/joke-validate-schema.webp b/content/docs/monitoring/images/joke-validate-schema.webp deleted file mode 100644 index 10d5282e24..0000000000 Binary files a/content/docs/monitoring/images/joke-validate-schema.webp and /dev/null differ diff --git a/content/docs/monitoring/images/monitoring-dashboard.webp b/content/docs/monitoring/images/monitoring-dashboard.webp deleted file mode 100644 index 05e852a719..0000000000 Binary files a/content/docs/monitoring/images/monitoring-dashboard.webp and /dev/null differ diff --git a/content/docs/monitoring/images/monitoring-in-store.webp b/content/docs/monitoring/images/monitoring-in-store.webp deleted file mode 100644 index b4d9c53b93..0000000000 Binary files a/content/docs/monitoring/images/monitoring-in-store.webp and /dev/null differ diff --git 
a/content/docs/monitoring/images/puppies-config.webp b/content/docs/monitoring/images/puppies-config.webp deleted file mode 100644 index 9c054080e7..0000000000 Binary files a/content/docs/monitoring/images/puppies-config.webp and /dev/null differ diff --git a/content/docs/monitoring/images/puppies-failed-run.webp b/content/docs/monitoring/images/puppies-failed-run.webp deleted file mode 100644 index 468b2c89d6..0000000000 Binary files a/content/docs/monitoring/images/puppies-failed-run.webp and /dev/null differ diff --git a/content/docs/monitoring/images/puppies-schema.webp b/content/docs/monitoring/images/puppies-schema.webp deleted file mode 100644 index fae5b8e135..0000000000 Binary files a/content/docs/monitoring/images/puppies-schema.webp and /dev/null differ diff --git a/content/docs/monitoring/images/puppies-task.webp b/content/docs/monitoring/images/puppies-task.webp deleted file mode 100644 index b6bc2bc5d6..0000000000 Binary files a/content/docs/monitoring/images/puppies-task.webp and /dev/null differ diff --git a/content/docs/proxy/images/proxy-status.webp b/content/docs/proxy/images/proxy-status.webp deleted file mode 100644 index 3752856a2f..0000000000 Binary files a/content/docs/proxy/images/proxy-status.webp and /dev/null differ diff --git a/content/docs/proxy/residential_proxy/tips_and_tricks.md b/content/docs/proxy/residential_proxy/tips_and_tricks.md deleted file mode 100644 index a8d8d597a8..0000000000 --- a/content/docs/proxy/residential_proxy/tips_and_tricks.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: Tips and tricks -description: Helpful tips for using your application with Apify's residential proxies. Control traffic, deal with interrupted connections and manage expenses. 
-paths: - - proxy/residential-proxy/tips-and-tricks ---- - -# [](#tips-and-tricks) Tips and tricks - -[Residential]({{@link proxy/residential_proxy.md}}) proxies are less predictable than [datacenter]({{@link proxy/datacenter_proxy.md}}) proxies and are priced differently (by number of IPs vs traffic used). Because of this, there are some important things to consider before using residential proxy in your solutions. - -## [](#control-traffic-used-by-automated-browsers) Control traffic used by automated browsers - -Residential proxy is priced by data traffic used. Thus, it's easy to quickly use up all your prepaid traffic. In particular, when accessing websites with large files loaded on every page. - -To reduce your traffic use, we recommend using the `blockRequests()` function of [`playwrightUtils`](https://crawlee.dev/api/playwright-crawler/namespace/playwrightUtils#blockRequests)/[`puppeteerUtils`](https://crawlee.dev/api/puppeteer-crawler/namespace/puppeteerUtils#blockRequests) (depending on the library used). - -## [](#connected-proxy-speed-variation) Connected proxy speed variation - -Each host on the residential proxy network uses a different device. They have different network speeds and different latencies. This means that requests made with one [session]({{@link proxy.md#sessions}}) can be extremely fast, while another request with a different session can be extremely slow. The difference can range from a few milliseconds to a few seconds. - -If your solution requires quickly loaded content, the best option is to set a [session]({{@link proxy.md#sessions}}), try a small request and see if the response time is acceptable. If it is, you can use this session for other requests. Otherwise, repeat the attempt with a different session. - -## [](#connection-interruptions) Connection interruptions - -While sessions are persistent, they can be destroyed at any time if the host devices are turned off or disconnected. - -For this problem there is no easy solution. 
One option is to not use residential proxy for larger requests (and use [datacenter]({{@link proxy/datacenter_proxy.md}}) proxy instead). If you have no other choice, expect that interruptions might happen and write your solution with this in mind. diff --git a/content/docs/sdk.md b/content/docs/sdk.md deleted file mode 100644 index 5c10fda909..0000000000 --- a/content/docs/sdk.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Apify SDK -description: Develop web scraping and automation tools using JavaScript/Node.js and headless Chrome and Puppeteer. Build Apify actors locally or upload to the Apify cloud. -menuWeight: 17 -category: developer tools -paths: - - sdk - - apify-sdk ---- - - diff --git a/content/docs/storage/images/datasets-app.webp b/content/docs/storage/images/datasets-app.webp deleted file mode 100644 index ab94cd2f70..0000000000 Binary files a/content/docs/storage/images/datasets-app.webp and /dev/null differ diff --git a/content/docs/storage/images/datasets-detail.webp b/content/docs/storage/images/datasets-detail.webp deleted file mode 100644 index 6f5225dcf1..0000000000 Binary files a/content/docs/storage/images/datasets-detail.webp and /dev/null differ diff --git a/content/docs/storage/images/find-store-id.webp b/content/docs/storage/images/find-store-id.webp deleted file mode 100644 index 699dec3a9c..0000000000 Binary files a/content/docs/storage/images/find-store-id.webp and /dev/null differ diff --git a/content/docs/storage/images/key-value-stores-app.webp b/content/docs/storage/images/key-value-stores-app.webp deleted file mode 100644 index e9ed7bdc31..0000000000 Binary files a/content/docs/storage/images/key-value-stores-app.webp and /dev/null differ diff --git a/content/docs/storage/images/key-value-stores-detail.webp b/content/docs/storage/images/key-value-stores-detail.webp deleted file mode 100644 index 312bcde57e..0000000000 Binary files a/content/docs/storage/images/key-value-stores-detail.webp and /dev/null differ diff --git 
a/content/docs/storage/images/overview-api.webp b/content/docs/storage/images/overview-api.webp deleted file mode 100644 index d0f0ec9ae2..0000000000 Binary files a/content/docs/storage/images/overview-api.webp and /dev/null differ diff --git a/content/docs/storage/images/request-queue-app.webp b/content/docs/storage/images/request-queue-app.webp deleted file mode 100644 index da113f2c64..0000000000 Binary files a/content/docs/storage/images/request-queue-app.webp and /dev/null differ diff --git a/content/docs/storage/images/request-queue-detail.webp b/content/docs/storage/images/request-queue-detail.webp deleted file mode 100644 index 8bff230cae..0000000000 Binary files a/content/docs/storage/images/request-queue-detail.webp and /dev/null differ diff --git a/content/docs/storage/key_value_store.md b/content/docs/storage/key_value_store.md deleted file mode 100644 index 85535b140e..0000000000 --- a/content/docs/storage/key_value_store.md +++ /dev/null @@ -1,238 +0,0 @@ ---- -title: Key-value store -description: Store anything from actor or task run results JSON documents or images. Learn how to access and manage key-value stores from Apify Console or via API. -menuWeight: 9.2 -paths: - - storage/key-value-store ---- - -# Key-value store - -The key-value store is simple storage that can be used for storing any kind of data. It can be JSON or HTML documents, zip files, images, or simply strings. The data are stored along with their [MIME content type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types). - -Each actor run is assigned its own key-value store when it is created. The store contains the actor's input, and, if necessary, other data such as its output. - -Key-value stores are mutable–you can both add entries and delete them. - -> Named key-value stores are retained indefinitely.
    -> Unnamed key-value stores expire after 7 days unless otherwise specified.
    -> [Learn about named and unnamed key-value stores.]({{@link storage.md#named-and-unnamed-storages}}) - -## Basic usage - -There are five ways to access your key-value stores: - -* [Apify Console](https://console.apify.com/storage?tab=keyValueStores) - provides an easy-to-understand interface [[details](#apify-console)]. -* [Apify SDK](https://sdk.apify.com/docs/guides/result-storage#key-value-store) - when building your own Apify actor [[details](#apify-sdk)]. -* [JavaScript API client](/apify-client-js#keyvaluestoreclient) - to access your key-value stores from any Node.js application [[details](#javascript-api-client)]. -* [Python API client](/apify-client-python#keyvaluestoreclient) - to access your key-value stores from any Python application [[details](#python-api-client)]. -* [Apify API](https://docs.apify.com/api/v2#/reference/key-value-stores/get-items) - for accessing your key-value stores programmatically [[details](#apify-api)]. - -### Apify Console - -In [Apify Console](https://console.apify.com), you can view your key-value stores in the [Storage](https://console.apify.com/storage) section under the [Key-value stores](https://console.apify.com/storage?tab=keyValueStores) tab. - -Only named key-value stores are displayed by default. Select the **Include unnamed key-value stores** checkbox to display all of your stores. - -![Key-value stores in app]({{@asset storage/images/key-value-stores-app.webp}}) - -To view a key-value store's content, click on its **Store ID**. -Under the **Settings** tab, you can update the store's name (and, in turn, its [retention period]({{@link storage.md#data-retention}})) and [access rights]({{@link access_rights.md}}). -Click on the `API` button to view and test a store's [API endpoints](https://docs.apify.com/api/v2#/reference/key-value-stores). 
- -![Key-value stores detail]({{@asset storage/images/key-value-stores-detail.webp}}) - -### Apify SDK - -If you are building an [Apify actor]({{@link actors.md}}), you will be using the [Apify SDK](https://sdk.apify.com). -In the [Apify SDK](https://sdk.apify.com/docs/guides/result-storage#key-value-store), the key-value store is represented by the [`KeyValueStore`](https://sdk.apify.com/api/apify/class/KeyValueStore) class. - -You can use the `KeyValueStore` class to specify whether your data is stored locally or in the Apify cloud, -get and set values using the [`getValue()`](https://sdk.apify.com/api/apify/class/KeyValueStore#getValue) and [`setValue()`](https://sdk.apify.com/api/apify/class/KeyValueStore#setValue) methods respectively, or iterate over your key-value store keys using the [`forEachKey()`](https://sdk.apify.com/api/apify/class/KeyValueStore#forEachKey) method. - -Each actor run is associated with the default key-value store, which is created for the actor run. When running your actors and storing data locally, you can pass its [input]({{@link actors/running/input.md}}) using the **INPUT.json** file in the default key-value store directory. - -You can find `INPUT.json` and other key-value store files in the location below. - -```text -{APIFY_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} -``` - -The default key-value store's ID is **default**. The {KEY} is the record's **key** and {EXT} corresponds to the record value's MIME content type. - -To manage your key-value stores, you can use the following methods. See the `KeyValueStore` class's [API reference](https://sdk.apify.com/api/apify/class/KeyValueStore) for the full list. - -```js -import { Actor } from 'apify'; - -await Actor.init(); -// ... 
- -// Get the default input -const input = await Actor.getInput(); - -// Open a named key-value store -const exampleStore = await Actor.openKeyValueStore('my-store'); - -// Read a record in the exampleStore storage -const value = await exampleStore.getValue('some-key'); - -// Write a record to exampleStore -await exampleStore.setValue('some-key', { foo: 'bar' }); - -// Delete a record from exampleStore -await exampleStore.setValue('some-key', null); - -// ... -await Actor.exit(); -``` - -> Note that JSON is automatically parsed to a JavaScript object, text data returned as a string and other data is returned as binary buffer. - -```js -import { Actor } from 'apify'; - -await Actor.init(); -// ... - -// Get input of your actor -const input = await Actor.getInput(); -const value = await Actor.getValue('my-key'); - -// ... -await Actor.setValue( - 'OUTPUT', - imageBuffer, - { contentType: 'image/jpeg' }, -); - -// ... -await Actor.exit(); -``` - -The `Actor.getInput()` method is not only a shortcut to `Actor.getValue('INPUT')` - it is also compatible with `Actor.metamorph()` [[docs](https://docs.apify.com/actors/source-code#metamorph)]. This is because a metamorphed actor run's input is stored in the **INPUT-METAMORPH-1** key instead of **INPUT**, which hosts the original input. - -See the [SDK documentation](https://sdk.apify.com/docs/guides/result-storage#key-value-store) and the `KeyValueStore` class's [API reference](https://sdk.apify.com/api/apify/class/KeyValueStore) for details on managing your key-value stores with the Apify SDK. - -### JavaScript API client - -Apify's [JavaScript API client](/apify-client-js#keyvaluestoreclient) (`apify-client`) allows you to access your key-value stores from any Node.js application, whether it is running on the Apify platform or elsewhere. - -After importing and initiating the client, you can save each key-value store to a variable for easier access. 
- -```js -const myKeyValStoreClient = apifyClient.keyValueStore('jane-doe/my-key-val-store'); -``` - -You can then use that variable to [access the key-value store's items and manage it](/apify-client-js#keyvaluestoreclient). - -See the [JavaScript API client documentation](/apify-client-js#keyvaluestoreclient) for [help with setup](/apify-client-js#quick-start) and more details. - -### Python API client - -Apify's [Python API client](/apify-client-python#keyvaluestoreclient) (`apify-client`) allows you to access your key-value stores from any Python application, whether it is running on the Apify platform or elsewhere. - -After importing and initiating the client, you can save each key-value store to a variable for easier access. - -```python -my_key_val_store_client = apify_client.key_value_store('jane-doe/my-key-val-store') -``` - -You can then use that variable to [access the key-value store's items and manage it](/apify-client-python#keyvaluestoreclient). - -See the [Python API client documentation](/apify-client-python#keyvaluestoreclient) for [help with setup](/apify-client-python#quick-start) and more details. - -### Apify API - -The [Apify API](https://docs.apify.com/api/v2#/reference/key-value-stores) allows you to access your key-value stores programmatically using [HTTP requests](https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods) and easily share your crawling results. - -If you are accessing your key-value stores using the **username~store-name** [store ID format]({{@link storage.md#apify-api}}), you will need to use your [secret API token]({{@link integrations.md#api-token}}). You can find the token (and your user ID) on the [Integrations](https://console.apify.com/account#/integrations) page of your Apify account. - -> When providing your API authentication token, we recommend using the request's `Authorization` header, rather than the URL. ([More info](#introduction/authentication)). 
- -To **get a list of your key-value stores**, send a GET request to the [Get list of key-value stores](https://docs.apify.com/api/v2#/reference/key-value-stores/store-collection/get-list-of-key-value-stores) endpoint. - -```text -https://api.apify.com/v2/key-value-stores -``` - -To **get information about a key-value store** such as its creation time and item count, send a GET request to the [Get store](https://docs.apify.com/api/v2#/reference/key-value-stores/store-object/get-store) endpoint. - -```text -https://api.apify.com/v2/key-value-stores/{STORE_ID} -``` - -To **get a record** (its value) from a key-value store, send a GET request to the [Get record](https://docs.apify.com/api/v2#/reference/key-value-stores/key-collection/get-record) endpoint. - -```text -https://api.apify.com/v2/key-value-stores/{STORE_ID}/records/{KEY_ID} -``` - -To **add a record** with a specific key in a key-value store, send a PUT request to the [Put record](https://docs.apify.com/api/v2#/reference/key-value-stores/record/put-record) endpoint. - -```text -https://api.apify.com/v2/key-value-stores/{STORE_ID}/records/{KEY_ID} -``` - -Example payload: - -```json -{ - "foo": "bar", - "fos": "baz" -} -``` - -To **delete a record**, send a DELETE request specifying the key from a key-value store to the [Delete record](https://docs.apify.com/api/v2#/reference/key-value-stores/record/delete-record) endpoint. - -```text -https://api.apify.com/v2/key-value-stores/{STORE_ID}/records/{KEY_ID} -``` - -See the [API documentation](https://docs.apify.com/api/v2#/reference/key-value-stores) for a detailed breakdown of each API endpoint. - -## Compression - -In the past, every record uploaded using the [Put record](https://docs.apify.com/api/v2#/reference/key-value-stores/record/put-record) endpoint was compressed using Gzip before uploading. This has changed. **Now, records are stored in the state you upload them. 
This means it is up to you if the record is stored compressed or uncompressed.** - -You can compress a record and use the [Content-Encoding request header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding) to let our platform know which compression it uses. We recommend compressing large key-value records to save storage space and network traffic. - -**If you use the [Apify SDK](https://sdk.apify.com/api/apify/class/KeyValueStore#setValue) or our [JavaScript API client](https://docs.apify.com/apify-client-js#keyvaluestoreclient-setrecord), your files are compressed automatically by default.** We recommend using the JavaScript API client, which compresses your data before they are sent to our servers and decompresses them when you retrieve them. This makes your storage costs as low as possible. - -## Sharing - -You can invite other Apify users to view or modify your key-value stores using the [access rights]({{@link access_rights.md}}) system. See the [full list of permissions]({{@link access_rights/list_of_permissions.md#key-value-store}}). - -### Sharing key-value stores between runs - -You can access a key-value store from any [actor]({{@link actors.md}}) or [task]({{@link actors/tasks.md}}) run as long as you know its **name** or **ID**. - -To access a key-value store from another run using the Apify SDK, open it using the [`Actor.openKeyValueStore(storeIdOrName)`](https://sdk.apify.com/api/apify/class/Actor#openKeyValueStore) method like you would do with any other store. - -```js -const otherStore = await Actor.openKeyValueStore('old-store'); -``` - -In the [JavaScript API client](/apify-client-js), you can access a store using [its client](/apify-client-js#keyvaluestoreclient). Once you've opened a store, read and manage its contents like you would do with a key-value store from your current run. 
- -```js -const otherStoreClient = apifyClient.keyValueStore('jane-doe/old-store'); -``` - -Likewise, in the [Python API client](/apify-client-python), you can access a store using [its client](/apify-client-python#keyvaluestoreclient). - -```python -other_store_client = apify_client.key_value_store('jane-doe/old-store') -``` - -The same applies for the [Apify API](#apify-api) - you can use [the same endpoints](#apify-api) as you would normally do. - -See the [Storage overview](https://docs.apify.com/storage#sharing-storages-between-runs) for details on sharing storages between runs. - -## Data consistency - -Key-value storage uses the [AWS S3](https://aws.amazon.com/s3/) service. According to the [S3 documentation](https://aws.amazon.com/s3/consistency/), it provides **strong read-after-write** consistency. - -## Limits - -* Key-value store names can be up to 63 characters long. diff --git a/content/docs/tutorials.md b/content/docs/tutorials.md deleted file mode 100644 index febd7852de..0000000000 --- a/content/docs/tutorials.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: Tutorials -description: Learn how to scrape the web and automate processes with Apify. From beginner guides for using actors to advanced topics like migrations and performance. -menuWeight: 3 -category: guides -paths: - - scraping - - tutorials ---- - -# Tutorials - -This section contains everything you need to get you up and running with Apify, as well as advanced techniques for experienced users. Before you start, it will help to have an [Apify account](https://console.apify.com) set up and ready to go. - -## The basics - -[Quick start]({{@link tutorials/quick_start.md}}) will show you how to [run existing actors]({{@link tutorials/quick_start.md#run-an-actor}}) and [create your first Apify actor]({{@link tutorials/quick_start.md#create-an-actor}}). - -[Set up your first web scraping job]({{@link tutorials/apify_scrapers/getting_started.md}}) with a step-by-step guide. 
- -## Next steps - -If you liked what you learned, get familiar with our main tools by following -the [tutorial]({{@link tutorials/apify_scrapers/web_scraper.md}}) for **Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)), -the [tutorial]({{@link tutorials/apify_scrapers/cheerio_scraper.md}}) for **Cheerio Scraper** ([apify/cheerio-scraper](https://apify.com/apify/cheerio-scraper)), -or the [tutorial]({{@link tutorials/apify_scrapers/puppeteer_scraper.md}}) for **Puppeteer Scraper** ([apify/puppeteer-scraper](https://apify.com/apify/puppeteer-scraper)) - -Or, if the solution you need is beyond your skillset or schedule, you can always [order a custom solution](https://apify.com/custom-solutions). - -## Building actors - -Building your own actors requires some experience with coding. If you're ready, get started with the [Apify SDK](https://sdk.apify.com/) and [Crawlee](https://crawlee.dev/docs/quick-start). - -"But wait, I don't speak JavaScript", we hear you say. Worry not! While we currently only have an SDK for JavaScript, your actor can be in any programming language. If you use Python, you can now [scrape data with Beautiful Soup]({{@link tutorials/scrape_data_using_python.md}}) -and [parse the results with Pandas]({{@link tutorials/process_data_using_python.md}}) on the Apify platform. Prefer PHP? Cool! Here's how you can [use Apify in your PHP projects]({{@link tutorials/use_apify_from_php.md}}). - -If you're interested in publishing your actors for others to use, check our [building public actors]({{@link tutorials/building_public_actors.md}}) guide. It contains useful information you'll need to make your actor as good as it can be. 
- -## Advanced use cases - -Check out our other tutorials that cover more specific and advanced scenarios such as [scraping dynamic content]({{@link tutorials/scraping_dynamic_content.md}}), [integrating]({{@link integrations.md}}) your projects with Apify, [using your actors via API]({{@link tutorials/run_actor_and_retrieve_data_via_api.md}}), or setting up [monitoring]({{@link monitoring.md}}) for your projects. - -## Finally - -We're constantly working on extending and improving this section. If you have suggestions or requests for specific tutorials, let us know using the form below. diff --git a/content/docs/tutorials/analyze_pages_and_fix_errors.md b/content/docs/tutorials/analyze_pages_and_fix_errors.md deleted file mode 100644 index 1988b814b8..0000000000 --- a/content/docs/tutorials/analyze_pages_and_fix_errors.md +++ /dev/null @@ -1,222 +0,0 @@ ---- -title: How to analyze pages and fix errors -description: Learn to deal with random crashes in your web scraping and automation jobs. Find out the essentials of debugging and fixing problems in your actors. -menuWeight: 3.91 -paths: - - tutorials/analyze-pages-and-fix-errors ---- - -# How to analyze pages and fix errors - -Debugging is essential in programming. Even if you would not call yourself a programmer, having basic debugging skills will make building and maintaining [scrapers]({{@link tutorials/apify_scrapers.md}}) and [integration actors]({{@link tutorials/run_actor_and_retrieve_data_via_api.md}}) on Apify easier. It will help you avoid hiring an expensive developer and solve your issues faster. - -This article covers the absolute basics. It discusses the most common problems and the simplest tools for analyzing the issue. - -## [](#possible-problems) Possible problems - -It is often tricky to see the full scope of what can go wrong. We assume once the code is set up correctly, it will keep working. Unfortunately, that is rarely true in the realm of web scraping and automation. 
- -Websites change, they introduce new [anti-scraping technologies]({{@link web_scraping_101/anti_scraping_techniques.md}}), programming tools change and, in addition, people make mistakes. - -Here are the most common reasons your working solution may break. - -- The website changes its layout or [data feed](https://www.datafeedwatch.com/academy/data-feed). - -- A site's layout changes depending on location or uses [A/B testing](https://www.youtube.com/watch?v=XDoKXaGrUxE&feature=youtu.be). - -- A page starts to block you (recognizes you as a bot). - -- The website [loads its data later dynamically]({{@link tutorials/scraping_dynamic_content.md}}), so the code works only sometimes, if you are slow or lucky enough. - -- You made a mistake when updating your code. - -- The code worked locally but not on the [Apify platform](https://console.apify.com). - -- You have lost access to [Apify proxy]({{@link proxy.md}}) (your proxy trial is over). - -- You have upgraded your [dependencies](https://www.quora.com/What-is-a-dependency-in-coding) (other software that you rely upon) and the new versions no longer work (this is harder to debug). - -This is a long list, and it is by no means complete. However, if you use the right tools and remember the most common causes, you can find the problem quickly. - -## [](#analysis) Analysis - -[Web scraping]({{@link web_scraping_101.md}}) and [automation]({{@link robotic_process_automation.md}}) are very specific types of programming. It is not possible to rely on specialized debugging tools, since the code does not output the same results every time. - -Many issues are edge cases, which occur in just one of a thousand pages or are time-dependent. Because of this, you cannot rely only on [determinism](https://en.wikipedia.org/wiki/Deterministic_algorithm). - -### [](#logging) Logging - -Logging is an essential tool for any programmer. When used correctly, logs help you capture a surprising amount of information. 
- -Note that Apify logs are [not infinite]({{@link actors/limits.md}}). If you see messages with skipped lines, consider toning down your logging. - -General rules for logging: - -- Usually, **many logs** is better than **no logs**. - -- Putting more information into one line, rather than logging multiple short lines, helps reduce the overall log size. - -- Focus on numbers. Log how many items you extract from a page, etc. - -- Structure your logs and use the same structure in all your logs. - -- Append the current page's URL to each log. This lets you immediately open that page and review it. - -#### [](#example-of-a-structured-log) Example of a structured log - -```log -[CATEGORY]: Products: 20, Unique products: 4, Next page: true --- https://apify.com/store -``` - -The log begins with the **page type**. Usually, we use labels such as **[CATEGORY]** and **[DETAIL]**. Then, we log important numbers and other information. Finally, we add the page's URL, so we can check if the log is correct. - -#### [](#errors) Errors - -Errors require a different approach because, if your code crashes, your usual logs will not be called. Instead, exception handlers will print the error, but these are usually ugly messages with a [stack trace](https://en.wikipedia.org/wiki/Stack_trace) that only Apify experts will understand. - -You can overcome this by adding [try/catch blocks](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/try...catch) into your code. In the catch block, explain what happened and re-throw the error (so the request is automatically retried). - -```javascript -try { - // Sensitive code block - // ... 
-} catch (error) { - // You know where the code crashed so you can explain here - console.error('Request failed during login with an error:'); - throw error; -} -``` - -Read more information about logging and error handling in our public wiki about [developer best practices](https://gitlab.com/apify-public/wiki/-/wikis/writing-actors/how-to-write-and-not-write-an-actor). - -### [](#saving-snapshots) Saving snapshots - -By snapshots, we mean **screenshots** if you use a [browser + Puppeteer/Playwright](https://sdk.apify.com/docs/examples/capture-screenshot) and **HTML** saved into a [key-value store]({{@link storage/key_value_store.md}}) that you can easily display in your browser. Snapshots are useful throughout your code but especially important in error handling. - -Note that an error can happen only in a few pages out of a thousand and look completely random. There is not much you can do other than save and analyze a snapshot. - -Snapshots can tell you if: - -- A website has changed its layout. This can also mean A/B testing or different content for different locations. - -- You have been blocked – you open a [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) or **Access Denied** page. - -- Data loads later dynamically – the page is empty. - -- The page was redirected – the content is different. - -#### [](#how-to-save-a-snapshot) How to save a snapshot - -In Apify scrapers (**Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)), **Cheerio Scraper** ([apify/cheerio-scraper](https://apify.com/apify/cheerio-scraper)), **Playwright Scraper** ([apify/playwright-scraper](https://apify.com/apify/playwright-scraper)), and **Puppeteer Scraper** ([apify/puppeteer-scraper](https://apify.com/apify/puppeteer-scraper))), you can use their built-in `context.saveSnapshot()` function. Once called, it saves a screenshot and HTML into the run's **key-value store**. 
- -When **building your own actors** with [Puppeteer](https://pptr.dev) or the [Apify SDK](https://sdk.apify.com) and [Crawlee](https://crawlee.dev/) packages, you can use the powerful [`puppeteerUtils.saveSnapshot()`](https://crawlee.dev/api/puppeteer-crawler/namespace/puppeteerUtils#saveSnapshot) function. It allows you to name the screenshot, so you can identify it later. - -[Cheerio](https://cheerio.js.org)-based actors do not have a helper function because they allow taking snapshots with a single line of code. Just save the HTML with the correct content type. - -```javascript -import { Actor } from 'apify'; - -await Actor.init(); -// ... -const html = $('html').html(); -await Actor.setValue('SNAPSHOT', html, { contentType: 'text/html' }); -// ... -await Actor.exit(); -``` - -#### [](#when-to-save-snapshots) When to save snapshots - -The most common approach is to save on error. We can enhance our previous try/catch block like this: - -```javascript -import { Actor } from 'apify'; -import { puppeteerUtils } from 'crawlee'; - -await Actor.init(); -// ... -// storeId is ID of current key value store, where we save snapshots -const storeId = Actor.getEnv().defaultKeyValueStoreId; -try { - // Sensitive code block - // ... -} catch (error) { - // Change the way you save it depending on what tool you use - const randomNumber = Math.random(); - const key = `ERROR-LOGIN-${randomNumber}`; - await puppeteerUtils.saveSnapshot(page, { key }); - const screenshotLink = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.jpg` - - // You know where the code crashed so you can explain here - console.error(`Request failed during login with an error. Screenshot: ${screenshotLink}`); - throw error; -} -// ... -await Actor.exit(); -``` - -To make the error snapshot descriptive, we name it `ERROR-LOGIN`. We add a random number so the next `ERROR-LOGIN`s would not overwrite this one and we can see all the snapshots. 
If you can use an ID of some sort, it is even better. - -**Beware**: - -- The snapshot's **name** (key) can only contain letter, number, dot and dash characters. Other characters will cause an error, which makes the random number a safe pick. - -- Do not overdo the snapshots. Once you get out of the testing phase, limit them to critical places. Saving snapshots uses resources. - -### [](#error-reporting) Error reporting - -Logging and snapshotting are great tools but once you reach a certain run size, it may be hard to read through them all. For a large project, it is handy to create a more sophisticated reporting system. For example, let's just look at simple **dataset** reporting. - -This example extends our [previous snapshot solution](#when-to-save-snapshots) by creating a [named dataset]({{@link storage.md#named-and-unnamed-storages}}) (named datasets have infinite retention), where we will accumulate error reports. Those reports will explain what happened and will link to a saved snapshot, so we can do a quick visual check. - -```javascript -import { Actor } from 'apify'; -import { puppeteerUtils } from 'crawlee'; - -await Actor.init(); -// ... -// Let's create reporting dataset -// If you already have one, this will continue adding to it -const reportingDataset = await Actor.openDataset('REPORTING'); - -// storeId is ID of current key-value store, where we save snapshots -const storeId = Actor.getEnv().defaultKeyValueStoreId; - -// We can also capture actor and run IDs -// to have easy access in the reporting dataset -const { actorId, actorRunId } = Actor.getEnv(); -const linkToRun = `https://console.apify.com/actors/${actorId}#/runs/${actorRunId}`; - -try { - // Sensitive code block - // ... 
-} catch (error) { - // Change the way you save it depending on what tool you use - const randomNumber = Math.random(); - const key = `ERROR-LOGIN-${randomNumber}`; - await puppeteerUtils.saveSnapshot(page, { key }); - - const screenshotLink = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.jpg?disableRedirect=true`; - - // We create a report object - const report = { - errorType: 'login', - errorMessage: error.toString(), - - // You will have to adjust the keys if you save them in a non-standard way - htmlSnapshot: `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.html?disableRedirect=true`, - screenshot: screenshotLink, - run: linkToRun, - }; - - // And we push the report - await reportingDataset.pushData(report); - - // You know where the code crashed so you can explain here - console.error( - `Request failed during login with an error. Screenshot: ${screenshotLink}` - ); - throw error; -} -// ... -await Actor.exit(); -``` diff --git a/content/docs/tutorials/apify_scrapers/cheerio_scraper.md b/content/docs/tutorials/apify_scrapers/cheerio_scraper.md deleted file mode 100644 index de6ef15cf9..0000000000 --- a/content/docs/tutorials/apify_scrapers/cheerio_scraper.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Cheerio Scraper -menuTitle: Cheerio Scraper -description: Learn how to scrape a website using Apify's Cheerio Scraper. Build an actor's page function, extract information from a web page and download your data. 
-externalSourceUrl: https://raw.githubusercontent.com/apifytech/actor-scraper/master/docs/build/cheerio-scraper-tutorial.md -menuWeight: 3 -paths: - - scraping/cheerio-scraper - - tutorials/apify-scrapers/cheerio-scraper ---- - -[//]: # (TODO: Should be updated) diff --git a/content/docs/tutorials/apify_scrapers/getting_started.md b/content/docs/tutorials/apify_scrapers/getting_started.md deleted file mode 100644 index 9b2180174f..0000000000 --- a/content/docs/tutorials/apify_scrapers/getting_started.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Getting started with Apify Scrapers -menuTitle: Getting started -description: Step-by-step tutorial that will help you get started with all Apify Scrapers. Learn the foundations of scraping the web with Apify and creating your own actors. -externalSourceUrl: https://raw.githubusercontent.com/apifytech/actor-scraper/master/docs/build/introduction-tutorial.md -menuWeight: 1 -paths: - - scraping/getting-started - - tutorials/apify-scrapers/getting-started ---- - -[//]: # (TODO: Should be updated) diff --git a/content/docs/tutorials/apify_scrapers/images/actor-selection.webp b/content/docs/tutorials/apify_scrapers/images/actor-selection.webp deleted file mode 100644 index 6f1cd64156..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/actor-selection.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/description.webp b/content/docs/tutorials/apify_scrapers/images/description.webp deleted file mode 100644 index f90ae022e2..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/description.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/find-data.webp b/content/docs/tutorials/apify_scrapers/images/find-data.webp deleted file mode 100644 index 493b35f270..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/find-data.webp and /dev/null differ diff --git 
a/content/docs/tutorials/apify_scrapers/images/inspect-data.webp b/content/docs/tutorials/apify_scrapers/images/inspect-data.webp deleted file mode 100644 index 5a678ee7da..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/inspect-data.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/inspect-network.webp b/content/docs/tutorials/apify_scrapers/images/inspect-network.webp deleted file mode 100644 index cbf5460a68..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/inspect-network.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/last-run-date.webp b/content/docs/tutorials/apify_scrapers/images/last-run-date.webp deleted file mode 100644 index d1c9822d17..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/last-run-date.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/making-a-pseudo-url.webp b/content/docs/tutorials/apify_scrapers/images/making-a-pseudo-url.webp deleted file mode 100644 index 2f8a88e379..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/making-a-pseudo-url.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/plugging-it-into-the-pagefunction.webp b/content/docs/tutorials/apify_scrapers/images/plugging-it-into-the-pagefunction.webp deleted file mode 100644 index 1864d5eac6..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/plugging-it-into-the-pagefunction.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/scraping-practice.webp b/content/docs/tutorials/apify_scrapers/images/scraping-practice.webp deleted file mode 100644 index 9f978e1371..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/scraping-practice.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/the-run-detail.webp 
b/content/docs/tutorials/apify_scrapers/images/the-run-detail.webp deleted file mode 100644 index eefd3b15b6..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/the-run-detail.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/the-start-url.webp b/content/docs/tutorials/apify_scrapers/images/the-start-url.webp deleted file mode 100644 index cda12aee1a..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/the-start-url.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/title.webp b/content/docs/tutorials/apify_scrapers/images/title.webp deleted file mode 100644 index 2b9ec5f284..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/title.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/using-devtools.webp b/content/docs/tutorials/apify_scrapers/images/using-devtools.webp deleted file mode 100644 index c4fc39a7ad..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/using-devtools.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/images/waiting-for-the-button.webp b/content/docs/tutorials/apify_scrapers/images/waiting-for-the-button.webp deleted file mode 100644 index 988dc7a1ae..0000000000 Binary files a/content/docs/tutorials/apify_scrapers/images/waiting-for-the-button.webp and /dev/null differ diff --git a/content/docs/tutorials/apify_scrapers/puppeteer_scraper.md b/content/docs/tutorials/apify_scrapers/puppeteer_scraper.md deleted file mode 100644 index 0cab846dad..0000000000 --- a/content/docs/tutorials/apify_scrapers/puppeteer_scraper.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Puppeteer Scraper -menuTitle: Puppeteer Scraper -description: Learn how to scrape a website using Apify's Puppeteer Scraper. Build an actor's page function, extract information from a web page and download your data. 
-externalSourceUrl: https://raw.githubusercontent.com/apifytech/actor-scraper/master/docs/build/puppeteer-scraper-tutorial.md -menuWeight: 4 -paths: - - scraping/puppeteer-scraper - - tutorials/apify-scrapers/puppeteer-scraper ---- - -[//]: # (TODO: Should be updated) diff --git a/content/docs/tutorials/apify_scrapers/web_scraper.md b/content/docs/tutorials/apify_scrapers/web_scraper.md deleted file mode 100644 index 1ca01ac34b..0000000000 --- a/content/docs/tutorials/apify_scrapers/web_scraper.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Web Scraper -menuTitle: Web Scraper -description: Learn how to scrape a website using Apify's Web Scraper. Build an actor's page function, extract information from a web page and download your data. -externalSourceUrl: https://raw.githubusercontent.com/apifytech/actor-scraper/master/docs/build/web-scraper-tutorial.md -menuWeight: 2 -paths: - - scraping/web-scraper - - tutorials/apify-scrapers/web-scraper ---- - - -[//]: # (TODO: Should be updated) diff --git a/content/docs/tutorials/cache_data_to_improve_performance.md b/content/docs/tutorials/cache_data_to_improve_performance.md deleted file mode 100644 index 19095ad908..0000000000 --- a/content/docs/tutorials/cache_data_to_improve_performance.md +++ /dev/null @@ -1,190 +0,0 @@ ---- -title: Cache data to improve performance -description: Learn how to make your scrapers more efficient by storing repeated page data. Avoid re-scraping pages and reduce your data extraction costs. -menuWeight: 3.8 -paths: - - tutorials/improve-performance-by-caching-repeated-page-data - - tutorials/cache-data-to-improve-performance ---- - -# Improve performance by caching repeated page data - -Opening a page is by far the most expensive operation a scraper does. 
Each request has to use a precious IP address to route the traffic, then download a large HTML document (and a lot of other resources, if you use a browser) over the network (and pay for data transfer), and finally spend CPU time on parsing that HTML. Compared to that, the code you write inside the scraper itself is essentially free. - -If you want to reduce your scraping costs, not re-scraping certain pages is one of the best ways to do that. The number of use cases where this is possible might be quite low, but you should always look for and take advantage of such situations. In this article, we will go through one typical scraping scenario and apply caching in a simple and effective way. - -> In a rush? Skip the tutorial and [see the full code example](https://github.com/metalwarrior665/apify-utils/blob/master/examples/caching-page-data.js). - -## [](#how-to-cache-data-inside-an-actor) How to cache data inside an actor - -Thanks to JavaScript's dynamic nature, we can store arbitrary data in a single object and easily manipulate it in place. - -```javascript -const cache = { - data1: 'my-data', - data2: { - myKey: 'my=data', - }, -}; - -// We can easily add things to an object -cache.data3 = 'my-new-data' -// We can remove things from an object -delete cache.data1 -// And we can update the object -cache.data2.myNewKey = 'my-new-data' -``` - -Because [all objects in JavaScript are just references](https://www.freecodecamp.org/news/how-to-get-a-grip-on-reference-vs-value-in-javascript-cba3f86da223/), we can cheaply pass them to other functions and read or modify them there. - -### [](#persisting-cache-to-the-key-value-store) Persisting cache to the key-value store - -The cache lives only in memory. This is the easiest and fastest way to use a cache. One disadvantage is that if the actor run [migrates to a new server]({{@link actors/development/state_persistence.md}}), is aborted or crashes, we lose the cached data. 
That is not a tragedy but repopulating the cache will waste some resources. Fortunately, this has a simple solution in actors: we can persist arbitrary data into the [key-value store]({{@link storage/key_value_store.md}}). - -```javascript -import { Actor } from 'apify'; - -await Actor.init(); - -// This is a common idiom: we first check if we already have cached data in the store -// If we do, it means the run was already restarted and we restore the cache -// If we don't, we just initialize the cache to an empty object -const cache = (await Actor.getValue('CACHE')) || {}; - -// Now, we set up the persistence. You can choose between 'migrating' and 'persistState' events -// 'migrating' only saves on migration, so it is a little "cheaper" -// 'persistState' is usually preferred, it will also help if you abort the actor -Actor.on('persistState', async () => { - await Actor.setValue('CACHE', cache); -}); -// We have secured the persistence and can now pass on the cache and use it like we want - -await Actor.exit(); -``` - -Another advantage of persisting data is that you can open the key-value store and check what they look like at any time. - -## [](#how-to-use-caching-in-an-e-commerce-project) How to use caching in an e-commerce project - -Now we have covered the base theory, so we can look into applying caching to help us avoid re-scraping pages. This approach is very helpful with e-commerce marketplaces. Let's define our imaginary example project: - -- We need to scrape all products from an imaginary `https://marketplace.com` website. -- Each product is offered by one seller and the product page links to the seller page. -- Each product row we scrape should contain all info about the product and its seller. -- A single seller usually sells about 100 products. - -Let's also define the URLs: - -- Products are available on `https://marketplace.com/product/productId`. -- Sellers are available on `https://marketplace.com/seller/sellerId`. 
- -### [](#cache-structure) Cache structure - -You might have already realized how we can utilize the cache. Because a seller can sell more than one product, with a naive approach, we would just re-scrape the seller page for each of their products. This is wasteful. Instead, we can store all the data we scrape from the seller page to our cache. If we encounter the seller's product again, we can get the seller data straight from the cache. - -Our cache will be an object where the **keys** will be the seller IDs (imagine a numerical ID) and the **values** will be seller data. - -```json -{ - "545345": { - "sellerId": "545345", - "sellerName": "Jane Doe", - "sellerRating": 3.5, - "sellerNumberOfReviews": 345, - "sellerNumberOfFollowers": 32, - "sellerProductsSold": 1560 - }, - "423423": { - "sellerId": "423423", - "sellerName": "Martin Smith", - "sellerRating": 4.2, - "sellerNumberOfReviews": 23, - "sellerNumberOfFollowers": 2, - "sellerProductsSold": 132 - } -} -``` - -### [](#crawler-example) Crawler example - -```javascript -import { Actor } from 'apify'; -import { CheerioCrawler } from 'crawlee'; - -// Let's imagine we defined the extractor functions in the extractors.js file -import { extractProductData, extractSellerData } from './extractors.js'; - -await Actor.init(); - -const cache = (await Actor.getValue('CACHE')) || {}; - -Actor.on('persistState', async () => { - await Actor.setValue('CACHE', cache); -}); - -// Other crawler setup -// ... - -// It doesn't matter what crawler class we choose -const crawler = new CheerioCrawler({ - // Other crawler options - // ... - async requestHandler({ request, $ }) { - const { label } = request; - if (label === 'START') { - // Enqueue categories etc... - } else if (label === 'CATEGORY') { - // Enqueue products and paginate... 
- } else if (label === 'PRODUCT') { - // Here is where our example begins - const productData = extractProductData($); - const sellerId = $('#seller-id').text().trim(); - - // We have all we need from the product page - // Now we check the cache if we already scraped this seller - if (cache[sellerId]) { - // If yes, we just merge the data and we are done - const result = { - ...productData, - ...cache[sellerId], - }; - await Actor.pushData(result); - } else { - // If the cache doesn't have this seller, we have to go to their page - await crawler.addRequests([{ - url: `https://marketplace.com/seller/${sellerId}`, - label: 'SELLER', - userData: { - // We also have to pass the product data along - // so we can merge and push them from the seller page - productData, - }, - }]); - } - } else if (label === 'SELLER') { - // And finally we handle the seller page - // We scrape the seller data - const sellerData = extractSellerData($); - - // We populate the cache so we can access all of this seller's other products from there - cache[sellerData.sellerId] = sellerData; - - // We merge seller and product data and push - const result = { - ...request.userData.productData, - ...sellerData, - }; - await Actor.pushData(result); - } - }, -}); - -await crawler.run([{ - url: 'https://marketplace.com', - userData: { label: 'START' }, -}]); - -await Actor.exit(); -``` - -[See the full code example](https://github.com/metalwarrior665/apify-utils/blob/master/examples/caching-page-data.js). diff --git a/content/docs/tutorials/crawl_urls_from_a_google_sheet.md b/content/docs/tutorials/crawl_urls_from_a_google_sheet.md deleted file mode 100644 index 6db971cc6d..0000000000 --- a/content/docs/tutorials/crawl_urls_from_a_google_sheet.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: Crawl URLs from a Google Sheet -description: Learn to crawl and scrape data from URLs specified in a spreadsheet with Apify scrapers. Scrape a pre-determined list of web pages with Apify actors. 
-menuWeight: 3.8 -paths: - - tutorials/crawl-a-list-of-urls-from-a-google-sheets-document - - tutorials/crawl-urls-from-google-sheets-document - - tutorials/crawl-urls-from-a-google-sheet ---- - -# Crawl a list of URLs from a Google Sheets document - -[Actors]({{@link actors.md}}) such as **Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)) **Cheerio Scraper** ([apify/cheerio-scraper](https://apify.com/apify/web-scraper)) and **Puppeteer Scraper** ([apify/puppeteer-scraper](https://apify.com/apify/web-scraper)) make it simple to crawl web pages and extract data from them. - -These actors start with a pre-defined list of URLs ([start URLs]({{@link tutorials/apify_scrapers/getting_started.md#the-start-url}})), then recursively follow links to find new pages (optional). - -![Add Start URLs in Apify Console]({{@asset tutorials/images/start-url.webp}}) - -Let's say you have the start URLs you want to crawl stored in a [Google Sheets](https://www.google.com/sheets/about/) spreadsheet, such as [this one]( -https://docs.google.com/spreadsheets/d/1GA5sSQhQjB_REes8I5IKg31S-TuRcznWOPjcpNqtxmU). - -![Start URLs in a spreadsheet]({{@asset tutorials/images/start-urls-in-spreadsheet.webp}}) - -You don't have to add them to the actor manually or export them as a file, only to upload to the scraper. Just click the **Text file** -> **Link remote text file** button in the actor's input and paste the URL. - -![Link a remote text file]({{@asset tutorials/images/link-remote-file.webp}}) - -**IMPORTANT: Make sure the document can be viewed by anyone with the link, otherwise the actor will not be able to access it.** - -![Make the link viewable to anyone]({{@asset tutorials/images/make-link-viewable.webp}}) - -And that's it, now the actor will download the content of the spreadsheet with up-to-date URLs whenever it starts. - -> Beware that the spreadsheet should have a simple structure, so the actor can easily find the URLs in it. Also, it should only have one sheet. 
diff --git a/content/docs/tutorials/images/actor-input-view.webp b/content/docs/tutorials/images/actor-input-view.webp deleted file mode 100644 index fedb6cb935..0000000000 Binary files a/content/docs/tutorials/images/actor-input-view.webp and /dev/null differ diff --git a/content/docs/tutorials/images/actor-run-dataset.webp b/content/docs/tutorials/images/actor-run-dataset.webp deleted file mode 100644 index dad9d21115..0000000000 Binary files a/content/docs/tutorials/images/actor-run-dataset.webp and /dev/null differ diff --git a/content/docs/tutorials/images/actor-run-results.webp b/content/docs/tutorials/images/actor-run-results.webp deleted file mode 100644 index 94f19aff54..0000000000 Binary files a/content/docs/tutorials/images/actor-run-results.webp and /dev/null differ diff --git a/content/docs/tutorials/images/actor-run-view.webp b/content/docs/tutorials/images/actor-run-view.webp deleted file mode 100644 index 4f7f88da61..0000000000 Binary files a/content/docs/tutorials/images/actor-run-view.webp and /dev/null differ diff --git a/content/docs/tutorials/images/apify-store.webp b/content/docs/tutorials/images/apify-store.webp deleted file mode 100644 index 2ca6aca6b9..0000000000 Binary files a/content/docs/tutorials/images/apify-store.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-time-offset.webp b/content/docs/tutorials/images/bbc-time-offset.webp deleted file mode 100644 index fccd4ec35c..0000000000 Binary files a/content/docs/tutorials/images/bbc-time-offset.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-after-midnight.webp b/content/docs/tutorials/images/bbc-weather-after-midnight.webp deleted file mode 100644 index 37a51b2d48..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-after-midnight.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-devtools.webp b/content/docs/tutorials/images/bbc-weather-devtools.webp deleted file mode 100644 index 
30553aa05c..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-devtools.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-parser-source.webp b/content/docs/tutorials/images/bbc-weather-parser-source.webp deleted file mode 100644 index a67956fb7f..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-parser-source.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-prediction.webp b/content/docs/tutorials/images/bbc-weather-prediction.webp deleted file mode 100644 index c72f55f1e3..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-prediction.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-scraper-source.webp b/content/docs/tutorials/images/bbc-weather-scraper-source.webp deleted file mode 100644 index 862567a9b7..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-scraper-source.webp and /dev/null differ diff --git a/content/docs/tutorials/images/bbc-weather-url-format.webp b/content/docs/tutorials/images/bbc-weather-url-format.webp deleted file mode 100644 index e66e51f1a0..0000000000 Binary files a/content/docs/tutorials/images/bbc-weather-url-format.webp and /dev/null differ diff --git a/content/docs/tutorials/images/create-actor-set-input.webp b/content/docs/tutorials/images/create-actor-set-input.webp deleted file mode 100644 index 5037da434e..0000000000 Binary files a/content/docs/tutorials/images/create-actor-set-input.webp and /dev/null differ diff --git a/content/docs/tutorials/images/create-actor.webp b/content/docs/tutorials/images/create-actor.webp deleted file mode 100644 index 2523e79d5c..0000000000 Binary files a/content/docs/tutorials/images/create-actor.webp and /dev/null differ diff --git a/content/docs/tutorials/images/edit-this-cookie.webp b/content/docs/tutorials/images/edit-this-cookie.webp deleted file mode 100644 index b3185b35d9..0000000000 Binary files 
a/content/docs/tutorials/images/edit-this-cookie.webp and /dev/null differ diff --git a/content/docs/tutorials/images/facebook-login.webp b/content/docs/tutorials/images/facebook-login.webp deleted file mode 100644 index 699724196e..0000000000 Binary files a/content/docs/tutorials/images/facebook-login.webp and /dev/null differ diff --git a/content/docs/tutorials/images/link-remote-file.webp b/content/docs/tutorials/images/link-remote-file.webp deleted file mode 100644 index c459e1ec2f..0000000000 Binary files a/content/docs/tutorials/images/link-remote-file.webp and /dev/null differ diff --git a/content/docs/tutorials/images/make-link-viewable.webp b/content/docs/tutorials/images/make-link-viewable.webp deleted file mode 100644 index b2017f0710..0000000000 Binary files a/content/docs/tutorials/images/make-link-viewable.webp and /dev/null differ diff --git a/content/docs/tutorials/images/network-tab.webp b/content/docs/tutorials/images/network-tab.webp deleted file mode 100644 index e0b0848a58..0000000000 Binary files a/content/docs/tutorials/images/network-tab.webp and /dev/null differ diff --git a/content/docs/tutorials/images/open-edit-this-cookie.webp b/content/docs/tutorials/images/open-edit-this-cookie.webp deleted file mode 100644 index 24ae9fb22a..0000000000 Binary files a/content/docs/tutorials/images/open-edit-this-cookie.webp and /dev/null differ diff --git a/content/docs/tutorials/images/pagination-filters.webp b/content/docs/tutorials/images/pagination-filters.webp deleted file mode 100644 index 97b3c54daf..0000000000 Binary files a/content/docs/tutorials/images/pagination-filters.webp and /dev/null differ diff --git a/content/docs/tutorials/images/pagination.webp b/content/docs/tutorials/images/pagination.webp deleted file mode 100644 index 03bedcea12..0000000000 Binary files a/content/docs/tutorials/images/pagination.webp and /dev/null differ diff --git a/content/docs/tutorials/images/run-actor-postman.webp 
b/content/docs/tutorials/images/run-actor-postman.webp deleted file mode 100644 index 238ec94db8..0000000000 Binary files a/content/docs/tutorials/images/run-actor-postman.webp and /dev/null differ diff --git a/content/docs/tutorials/images/run-info-postman.webp b/content/docs/tutorials/images/run-info-postman.webp deleted file mode 100644 index ec004acd4f..0000000000 Binary files a/content/docs/tutorials/images/run-info-postman.webp and /dev/null differ diff --git a/content/docs/tutorials/images/run-log-2.webp b/content/docs/tutorials/images/run-log-2.webp deleted file mode 100644 index ee9fad9c18..0000000000 Binary files a/content/docs/tutorials/images/run-log-2.webp and /dev/null differ diff --git a/content/docs/tutorials/images/run-log.webp b/content/docs/tutorials/images/run-log.webp deleted file mode 100644 index 82c304d3d6..0000000000 Binary files a/content/docs/tutorials/images/run-log.webp and /dev/null differ diff --git a/content/docs/tutorials/images/start-url.webp b/content/docs/tutorials/images/start-url.webp deleted file mode 100644 index f059f5bf09..0000000000 Binary files a/content/docs/tutorials/images/start-url.webp and /dev/null differ diff --git a/content/docs/tutorials/images/start-urls-in-spreadsheet.webp b/content/docs/tutorials/images/start-urls-in-spreadsheet.webp deleted file mode 100644 index 618bf29fce..0000000000 Binary files a/content/docs/tutorials/images/start-urls-in-spreadsheet.webp and /dev/null differ diff --git a/content/docs/tutorials/images/web-scraper-input.webp b/content/docs/tutorials/images/web-scraper-input.webp deleted file mode 100644 index cf1851282f..0000000000 Binary files a/content/docs/tutorials/images/web-scraper-input.webp and /dev/null differ diff --git a/content/docs/tutorials/images/webhook.webp b/content/docs/tutorials/images/webhook.webp deleted file mode 100644 index db10d72fd4..0000000000 Binary files a/content/docs/tutorials/images/webhook.webp and /dev/null differ diff --git 
a/content/docs/tutorials/log_in_by_transferring_cookies.md b/content/docs/tutorials/log_in_by_transferring_cookies.md deleted file mode 100644 index 0643b07f4d..0000000000 --- a/content/docs/tutorials/log_in_by_transferring_cookies.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: Log in by transferring cookies -description: Learn how to transfer cookies from your web browser to your crawlers. Log into websites when web scraping or automating tasks using your existing logins. -menuWeight: 3.7 -paths: - - tutorials/log-in-by-transferring-cookies ---- - -# Log in by transferring cookies - -To crawl websites that require a login, you can transfer cookies from your web browser directly into [Apify actors]({{@link actors.md}}) such as **Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)), **Puppeteer Scraper** ([apify/puppeteer-scraper](https://apify.com/apify/puppeteer-scraper)) and **Instagram Scraper** ([jaroslavhejlek/instagram-scraper](https://apify.com/jaroslavhejlek/instagram-scraper)). - -This is the quickest and simplest solution, however there are others that may be more reliable. For example, you can also [fill in the login form directly in the code]({{@link tutorials/log_into_a_website_using_puppeteer.md}}). - -## [](#install-a-cookie-editor) Install a cookie editor - -First, install a browser extension like [EditThisCookie](https://chrome.google.com/webstore/detail/editthiscookie/fngmhnnpilhplaeedifhccceomclgfbg). After installation, go to the website you'd like to crawl and log in using your credentials. - -![Inspect Facebook login with DevTools]({{@asset tutorials/images/edit-this-cookie.webp}}) - -## [](#export-your-cookies) Export your cookies - -Click the **EditThisCookie** button next to your URL and click **Export**. 
Cookies will be copied to your clipboard as a **JSON array**, which is compatible with the cookie format used by [Puppeteer](https://pptr.dev)/[Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome) (the headless browser we use for crawling). - -![Export your cookies]({{@asset tutorials/images/open-edit-this-cookie.webp}}) - -## [](#pass-cookies-to-web-scraper) Pass cookies to Web Scraper - -The **Initial cookies** field is in the **Proxy and browser configuration** tab in Web Scraper's **Input** section. Paste the cookies into the field. - -![Web scraper input tab]({{@asset tutorials/images/web-scraper-input.webp}}) - -And that's it! When you run the scraper, it will start already logged-in. Note that if the cookies are short-lived, this might not work, and you will need to [implement login in your code]({{@link tutorials/log_into_a_website_using_puppeteer.md}}). diff --git a/content/docs/tutorials/log_into_a_website_using_puppeteer.md b/content/docs/tutorials/log_into_a_website_using_puppeteer.md deleted file mode 100644 index de072cd4ce..0000000000 --- a/content/docs/tutorials/log_into_a_website_using_puppeteer.md +++ /dev/null @@ -1,138 +0,0 @@ ---- -title: Log into a website using Puppeteer -description: Learn how to complete a website's authentication process using headless Chrome and Puppeteer. Automate the filling in of log in details and passwords. -menuWeight: 3.6 -paths: - - tutorials/log-into-a-website-using-puppeteer ---- - -# Log into a website using Puppeteer - -In this article, we demonstrate how you can easily scrape data from a page behind a login using an [Apify actor]({{@link actors.md}}) with [Puppeteer](https://pptr.dev/). For this example, we will use [https://facebook.com](https://www.facebook.com/). - -## [](#find-the-login-form) Find the login form - -First, let's find the **login form** and the **submit** button on the Facebook login page using Chrome's DevTools. 
Right-click on any of the elements in the form and choose **Inspect**. - -![Inspect Facebook login with DevTools]({{@asset tutorials/images/facebook-login.webp}}) - -We can see an HTML **input** element with the IDs `email` for email and `pass` for the password. The form submission button's ID is not very helpful, however we can see it is a **button** element with the name `login` and type `submit`. We will use its ID, which is `u_0_b`. - -## [](#code-the-actor-to-fill-in-details) Code the actor to fill in details - -Our actor will use the Puppeteer API to fill in the **username** and **password** and click the **submit** button. - -```javascript -import { Actor } from 'apify'; -import { launchPuppeteer, log } from 'crawlee'; - -await Actor.init(); - -// Get the username and password inputs -const input = await Actor.getInput(); - -const browser = await launchPuppeteer(); -const page = await browser.newPage(); -await page.goto('https://facebook.com'); - -// Login -await page.type('#email', input.username); -await page.type('#pass', input.password); -await page.click('#u_0_b'); -await page.waitForNavigation(); - -// Get cookies -const cookies = await page.cookies(); - -// Use cookies in another tab or browser -const page2 = await browser.newPage(); -await page2.setCookie(...cookies); -// Open the page as a logged-in user -await page2.goto('https://facebook.com'); - -await browser.close(); - -log.info('Done.'); - -await Actor.exit(); -``` - -Now, you can run the actor and pass the login credentials as an [input JSON object](https://sdk.apify.com/docs/examples/accept-user-input#docsNav). - -```json -{ - "username": "marge@example.com", - "password": "my secret password" -} -``` - -## [](#save-and-reuse-cookies) Save and reuse cookies - -For most pages, you need to save cookies and reuse then in following runs. You can avoid logging in for each run with the code below. 
- -The example below uses a [named key-value store]({{@link storage.md#named-and-unnamed-storages}}) to save cookies for upcoming runs. - -```javascript -import { Actor } from 'apify'; -import { launchPuppeteer, log } from 'crawlee'; - -await Actor.init(); - -const loggedCheck = async (page) => { - try { - await page.waitForSelector('#bluebarRoot', { timeout: 10000 }); - return true; - } catch(err) { - return false; - } -}; - -// Get the username and password inputs -const input = await Actor.getInput(); - -const fcbCacheStore = await Actor.openKeyValueStore('fcb-cache'); -const cookiesStoreKey = input.username.replace('@', '(at)'); - -const browser = await launchPuppeteer(); -const page = await browser.newPage(); - -let isLogged = false; -let userCookies = await fcbCacheStore.getValue(cookiesStoreKey); -if (userCookies) { - log.info('Trying to use cached cookies...') - await page.setCookie(...userCookies); - await page.goto('https://facebook.com'); - isLogged = await loggedCheck(page); -} - -if (!isLogged) { - log.info(`Cookies from the cache didn't work. 
Try to log in.`); - await page.goto('https://facebook.com'); - await page.type('#email', input.username); - await page.type('#pass', input.password); - await page.click('#u_0_b'); - await page.waitForNavigation(); - isLogged = await loggedCheck(page); -} - -if (!isLogged) { - throw new Error('Incorrect username or password.') -} - -// Get cookies and refresh them in store cache -log.info(`Saving new cookies to cache...`); -const cookies = await page.cookies(); -await fcbCacheStore.setValue(cookiesStoreKey, cookies); - -// Use cookies in another tab or browser -const page2 = await browser.newPage(); -await page2.setCookie(...cookies); -// Opens thepage as a logged-in user -await page2.goto('https://facebook.com'); - -await browser.close(); - -log.info('Done.'); - -await Actor.exit(); -``` diff --git a/content/docs/tutorials/scrape_data_using_python.md b/content/docs/tutorials/scrape_data_using_python.md deleted file mode 100644 index 19eb7687de..0000000000 --- a/content/docs/tutorials/scrape_data_using_python.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -title: Scrape data using Python -description: Learn how to scrape websites using Python and its Beautiful Soup library. Follow the tutorial to analyze the target page and create a Python actor. -menuWeight: 3.93 -paths: - - tutorials/scrape-data-using-python ---- - - -# How to scrape data in Python using Beautiful Soup - -Web scraping is not limited to the JavaScript world. The Python ecosystem contains some pretty powerful scraping tools as well. One of those is [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/), a library for parsing HTML and easy navigation or modification of a DOM tree. - -This tutorial shows you how to write a Python [actor]({{@link actors.md}}) for scraping the weather forecast from [BBC Weather](https://www.bbc.com/weather). 
We also have an accompanying tutorial for [how to process the scraped data]({{@link tutorials/process_data_using_python.md}}) using [Pandas](https://pandas.pydata.org/). - -> In a rush? Skip this tutorial and get the [full code example](https://github.com/apify/apify-docs/tree/master/examples/python-data-scraper/). - -## Exploring the BBC Weather page - -BBC Weather offers you the weather forecast for the upcoming 14 days for a large selection of places around the world. Let's say we want to decide on our next holiday destination. We're choosing between Prague, New York, and Honolulu, and we will pick the destination based on which one has the best weather. To do that, we will scrape the weather forecast for each of our options, and then compare the results. - -### Understanding the URL format - -First, we need to look around the BBC Weather page and understand how the weather data is being retrieved and presented. If we open the [BBC Weather](https://www.bbc.com/weather) page and search for Prague, we can see that it opened a page with a URL ending in a seven-digit number, which we can assume is the ID of the displayed location BBC Weather uses internally. Opening a different location changes only that number in the URL, confirming our assumptions. - -The page shows the weather forecast for the upcoming 14 days. If we hover over the days in the displayed carousel, we can see that the link for each day leads to a URL ending with `/day{X}`, with `{X}` representing how many days in the future the specific day is. - -Combining this information gives us the full format for the URL of a page for a given location and day: `https://www.bbc.com/weather/{LOCATION_ID}/day{DAY_OFFSET}`. - -![BBC Weather URL format]({{@asset tutorials/images/bbc-weather-url-format.webp}}) - -### Determining the forecast's starting date - -Looking more closely at the BBC Weather page, we can see that it shows the forecast for each day from 6:00 AM to 5:00 AM the next day. 
But what happens when we view a location where the current time is between midnight and 5 AM? Trying that, we can see that, in the day represented by **Tonight**, there are only a few slots for the hours between midnight and 5 AM displayed. This means that the first displayed day can either represent the current date at the location, or the day before the current date. To find out which of these two it is, we will first have to determine the current date and time at the location, and then possibly adjust it by one day based on whether the date matches the first displayed day. - -![BBC Weather displaying a location with current time between midnight and 5 AM]({{@asset tutorials/images/bbc-weather-after-midnight.webp}}) - -To determine the current date and time at the displayed location, we will need to know the location's timezone. Fortunately, the timezone and its offset to GMT are displayed near the bottom of the page. - -![The timezone offset on the BBC Weather page]({{@asset tutorials/images/bbc-time-offset.webp}}) - -### Understanding the element structure - -To extract data from the page, we need to figure out where exactly in the internal page structure it is stored. - -If we right-click on the day title in the top carousel (**Today** or **Tonight**) and select **Inspect** in the popup menu, we can open the Chrome DevTools Inspector with the clicked element highlighted. We can see that the element with the currently displayed day in the top carousel has the class `wr-day--active`, and that the element with the day's title has the class `wr-day__title` and the accessibility label attribute `aria-label` contains the actual date of that day, not just **Today** or **Tonight**. Additionally, the timezone information is in an element with the class `wr-c-footer-timezone__item`. There are two elements with the same class, so we will need to pick the second one when parsing the page. 
- -Exploring the document tree further, we can see that the element containing all the displayed hours has the class `wr-time-slot-container__slots`. The elements with the forecast for a given hour have the class `wr-time-slot`. In each time slot, the element containing the slot's hour has the class `wr-time-slot-primary__hours` and the element containing the slot's predicted temperature in degrees Celsius has the class `wr-value--temperature--c`. - -![BBC Weather with the DevTools Inspector open]({{@asset tutorials/images/bbc-weather-devtools.webp}}) - -## Scraping the data from the page - -Now that we understand the element structure of the page and know where to find all the data we need, we can start writing the scraper. - -### Setting up the actor - -First, we need to create a new actor. To do this, go to [Apify Console](https://console.apify.com/), open the [Actors section](https://console.apify.com/actors), click on the **Create new** button in the top right, and select the **Example: Hello world in Python** actor template. - -In the page that opens, you can see your newly created actor. In the **Settings** tab, you can give it a name (e.g. `bbc-weather-scraper`) and further customize its settings. We'll skip customizing the settings for now, the defaults should be fine. In the **Source** tab, you can see the files that are at the heart of the actor. There are several of them, but only two are important for us now, `main.py` and `requirements.txt`. - -First we'll start with the `requirements.txt` file. Its purpose is to list all the third-party packages that your actor will use. We will be using the `requests` package for downloading the BBC Weather pages, and the `beautifulsoup4` package for parsing and processing the downloaded pages. We don't particularly care about the specific versions of these packages, so we just list them in the file: - -```python -# Add your dependencies here. 
-# See https://pip.pypa.io/en/latest/cli/pip_install/#requirements-file-format -# for how to format them - -beautifulsoup4 -requests -``` - -### Writing the code - -Finally, we can get to writing the main logic for the actor, which will live in the `main.py` file. Let's delete everything currently in it and start from an empty file. - -First, we need to import all the packages we will use in the code: - -```python -from datetime import datetime, time, timedelta, timezone -import os -import re - -from apify_client import ApifyClient -from bs4 import BeautifulSoup -import requests -``` - -Next, let's set up the locations we want to scrape in a constant for easier reference and, optionally, modification. - -```python -# Locations which to scrape and their BBC Weather IDs -LOCATIONS = [ - ('Prague', '3067696'), - ('Honolulu', '5856195'), - ('New York', '5128581'), -] -``` - -#### Extracting the data - -We'll be scraping each location separately. For each location, we need to know in which timezone it resides and what is the first displayed date in the weather forecast for that location. We will scrape each of the 14 forecast days one by one. For each day, we will first download its forecast page using the `requests` library, and then parse the downloaded HTML using the `BeautifulSoup` parser: - -```python -# List with scraped results -weather_data = [] - -# Scrape each location separately -for (location_name, location_id) in LOCATIONS: - print(f'Scraping weather from {location_name}...') - location_timezone = None - first_displayed_date = None - for day_offset in range(14): - # Get the BBC Weather page for the given location and day and parse it with BeautifulSoup - response = requests.get(f'https://www.bbc.com/weather/{location_id}/day{day_offset}') - soup = BeautifulSoup(response.content, 'html.parser') -``` - -When scraping a location, we need to know in which timezone it lies, and what date the first displayed day of the forecast represents. 
We can find that out at the beginning, when scraping the first day of the forecast for that location. - -To get the necessary data, we will need to find the elements in which it is contained. Let's use the `soup.find(...)` and `soup.find_all(...)` methods, which find elements matching some specified conditions in the parsed HTML. - -First, we extract the timezone from the second element with class `wr-c-footer-timezone__item`. The timezone information is described there with a full sentence, but we're only interested in the numerical representation of the timezone offset, so we parse it out using a regular expression. With the timezone offset parsed, we can construct a `timezone` object and from that get the current datetime at the location. - -Afterwards, we can figure out which date is represented by the first displayed day. We find the element with the class `wr-day--active` containing the header for the currently displayed day. Inside it, we find the element with the title of that day, which has the class `wr-day__title`. This element has the accessibility label containing the actual date of the day in its `aria-label` attribute, but it contains only the day and month and not the year, so we can't use it directly. Instead, to get the full date of the first displayed day, we compare the day from the accessibility label and the day from the current datetime at the location. If they match, we know the first displayed date is the current date at the location. If they don't, we know the first displayed date is the day before the current date at the location. 
- -```python - # When parsing the first day, find out what day it represents, - # to know when do the results start - if day_offset == 0: - # Get the timezone offset written in the page footer and parse it - tz_description = soup.find_all(class_='wr-c-footer-timezone__item')[1].text - tz_offset_match = re.search(r'([+-]\d\d)(\d\d)', tz_description) - tz_offset_hours = int(tz_offset_match.group(1)) - tz_offset_minutes = int(tz_offset_match.group(2)) - - # Get the current date and time at the scraped location - timezone_offset = timedelta(hours=tz_offset_hours, minutes=tz_offset_minutes) - location_timezone = timezone(timezone_offset) - - location_current_datetime = datetime.now(tz=location_timezone) - - # The times displayed for each day are from 6:00 AM that day to 5:00 AM the next day, - # so "today" on BBC Weather might actually mean "yesterday" in actual datetime. - # We have to parse the accessibility label containing the actual date on the header for the first day - # and compare it with the current date at the location, then adjust the date accordingly - day_carousel_item = soup.find(class_='wr-day--active') - day_carousel_title = day_carousel_item.find(class_='wr-day__title')['aria-label'] - website_first_displayed_item_day = int(re.search(r'\d{1,2}', day_carousel_title).group(0)) - - if location_current_datetime.day == website_first_displayed_item_day: - first_displayed_date = location_current_datetime.date() - else: - first_displayed_date = location_current_datetime.date() - timedelta(days=1) -``` - -Now that we've figured out the date of the first displayed day, we can extract the predicted weather from each hour of each forecast day. The forecast for the displayed day is in the element with class `wr-time-slot-container__slots`, and that element contains time slots for each predicted hour represented by elements with the class `wr-time-slot`. In each time slot, the element with the class `wr-time-slot-primary__hours` contains the hour of the time slot. 
The element with the class `wr-value--temperature--c` contains the temperature in degrees Celsius. - -To get the datetime of each slot, we need to combine the date of the first displayed day, the hour displayed in the slot, and the timezone of the currently processed location. Since the page shows the forecast for each day from 6 AM to 5 AM the next day, we need to add one day to the slots from midnight to 5 AM to get the correct datetime. - -Finally, we can put all the extracted information together and push them to the array holding the resulting data. - -```python - # Go through the elements for each displayed time slot of the displayed day - slot_container = soup.find(class_='wr-time-slot-container__slots') - for slot in slot_container.find_all(class_='wr-time-slot'): - # Find out the date and time of the displayed element from the day offset and the displayed hour. - # The times displayed for each day are from 6:00 AM that day to 5:00 AM the next day, - # so anything between midnight and 6 AM actually represents the next day - slot_hour = int(slot.find(class_='wr-time-slot-primary__hours').text) - slot_datetime = datetime.combine(first_displayed_date, time(hour=slot_hour), tzinfo=location_timezone) - slot_datetime += timedelta(days=day_offset) - if slot_hour < 6: - slot_datetime += timedelta(days=1) - - # Parse the temperature from the right element - slot_temperature = int(slot.find(class_='wr-value--temperature--c').text[:-1]) - - # Add the parsed data to the result list - weather_data.append({ - 'datetime': slot_datetime, - 'location': location_name, - 'temperature': slot_temperature, - }) -``` - -#### Storing the data - -As the last step, we need to store the scraped data in a dataset on the Apify platform, so that we can access it later. 
We do that through the [Apify API Client for Python]({{@link apify_client_python.md}}), which greatly simplifies working with the Apify platform and allows you to use its functions without having to call the Apify API directly. - -First, we initialize an `ApifyClient` instance. All the necessary arguments are automatically provided to the actor process as environment variables accessible in Python through the `os.environ` mapping. We will save the data into the default dataset belonging to the actor run, so we create a sub-client for working with that dataset, and push the data into it using its `.push_items(...)` method. - -```python -# Initialize the main ApifyClient instance -client = ApifyClient(os.environ['APIFY_TOKEN'], api_url=os.environ['APIFY_API_BASE_URL']) - -# Get the resource subclient for working with the default dataset of the actor run -default_dataset_client = client.dataset(os.environ['APIFY_DEFAULT_DATASET_ID']) - -# Finally, push all the results into the dataset -default_dataset_client.push_items(weather_data) - -print(f'Results have been saved to the dataset with ID {os.environ["APIFY_DEFAULT_DATASET_ID"]}') -``` - -### Running the actor - -And that's it! Now you can save the changes in the editor, and then click **Build and run** at the bottom of the page. The actor will get built, the built actor image will get saved for future reuse, and then it will be executed. You can follow the progress of the actor build and the actor run in the **Last build** and **Last run** tabs, respectively, in the developer console in the actor source view. Once the actor finishes running, you can view the scraped data in the **Dataset** tab in the actor run view. - -![Building and running the BBC Weather Scraper actor]({{@asset tutorials/images/bbc-weather-scraper-source.webp}}) - -## Processing the results - -In this tutorial, you have learned the basics of scraping data in Python using Requests and Beautiful Soup with Apify actors. 
But scraping data is only one part of the process. Head on to our [next tutorial]({{@link tutorials/process_data_using_python.md}}) to learn how to process the data in a dataset using [Pandas](https://pandas.pydata.org/) and visualize it using [Matplotlib](https://matplotlib.org/). diff --git a/content/docs/tutorials/scrape_paginated_sites.md b/content/docs/tutorials/scrape_paginated_sites.md deleted file mode 100644 index 2e12e48870..0000000000 --- a/content/docs/tutorials/scrape_paginated_sites.md +++ /dev/null @@ -1,286 +0,0 @@ ---- -title: Scrape paginated sites -description: Learn how to extract all of a website's listings even if they limit the number of results pages. See code examples for setting up your scraper. -menuWeight: 3.91 -paths: - - tutorials/scrape-websites-with-limited-pagination - - tutorials/scrape-paginated-sites ---- - -# How to scrape websites with limited pagination - -Limited pagination is a common practice on e-commerce sites and is becoming more popular over time. It makes sense: a real user will never want to look through more than 200 pages of results – only bots love unlimited pagination. Fortunately, there are ways to overcome this limit while keeping our code clean and generic. - -![Pagination in on Google search results page]({{@asset tutorials/images/pagination.webp}}) - -> In a rush? Skip the tutorial and get the [full code example](https://github.com/metalwarrior665/apify-utils/tree/master/examples/crawler-with-filters). - -## [](#how-to-overcome-the-limit) How to overcome the limit - -Websites usually limit the pagination of a single (sub)category to somewhere between 1,000 to 20,000 listings. The site might have over a million listings in total. Without a proven algorithm, it will be very manual and almost impossible to scrape all listings. - -We will first look at a couple ideas that don't work so well and then present the [final robust solution](#using-filter-ranges). 
- -### [](#going-deeper-into-subcategories) Going deeper into subcategories - -This is usually the first solution that comes to mind. You traverse the smallest subcategories and hope that those are below the pagination limits. Unfortunately, there are two big problems with this approach: - -1. Any subcategory might be bigger than the pagination limit. -2. Some listings from the parent category might not be present in any subcategory. - -While you can often manually test if the second problem is true on the site, the first problem is a hard blocker. You might just get lucky, and it may work on this site but usually, traversing subcategories is just not enough. It can be used as a first step of the solution but not as the solution itself. - -### [](#using-filters) Using filters - -Most websites also provide a way for the user to select search filters. These allow a more granular level of search than categories and can be combined with them. Common filters allow you to select a **color**, **size**, **location** and similar attributes. - -At first, it might seem like an easy solution. Enqueue all possible filter combinations and that should be so granular that it will never hit a pagination limit. Unfortunately, this solution is still far from good. - -1. There is no guarantee that some products don't slip through the chosen filter combinations. -2. The resulting split might be too granular and end up having too many tiny paginations with many duplicate products. This leads to scraping a lot more pages than necessary and makes analytics much harder. - -### [](#using-filter-ranges) Using filter ranges - -The best option is to use only a specific type of filter that can be used as a range. The most common one is **price range** but there may be others like the apartment size, etc. You can split the pagination pages to only contain listings within that range, e.g. products costing between $10 and $20. - -This has several benefits: - -1. 
All listings can eventually be found in a range. -2. The ranges do not overlap, so we scrape the smallest possible number of pages and avoid duplicate listings. -3. Ranges can be controlled by a generic algorithm that is simple to re-use for different sites. - -## [](#splitting-pages-with-range-filters) Splitting pages with range filters - -In the previous section, we analyzed different options to split the pages to overcome the pagination limit. We have chosen range filters as the most reliable way to do that. In this section, we will discuss a generic algorithm to work with ranges, look at a few special cases and then write an example crawler. - -![An example of range filters on a website]({{@asset tutorials/images/pagination-filters.webp}}) - -### [](#the-algorithm) The algorithm - -The core algorithm is simple and can be used on any (even overlapping) range. This is a simplified presentation, we will discuss the details later. - -1. We choose a few pivot ranges with a similar number of products and enqueue them. For example, **$0-$10**, **$100-$1000**, **$1000-$10000**, **$10000-**. -2. For each range, we open the page and check if the listings are below the limit. If yes, we continue to step 3. If not, we split the filter in half, e.g. **$0-$10** to **$0-$5** and **$5-$10** and enqueue those again. We recursively repeat step **2** for each range as long as needed. -3. We now have a pagination URL that is below the limit, we enqueue it under a pagination label and start enqueuing products. - -Because the algorithm is recursive, we don't need to think about how big the final ranges should be, the algorithm will find them over time. - -### [](#special-cases-to-look-for) Special cases to look for - -We have the base algorithm, but before we start coding, let's answer a few questions to get more insight. - -#### [](#can-the-ranges-overlap) Can the ranges overlap? - -Some sites will allow you to construct non-overlapping ranges. 
For example, you can set the ranges with cents, e.g. **$0-$4.99**, **$5-$9.99**, etc. If that is possible, create the pivot ranges this way, too. - -Non-overlapping ranges should remove the possibility of duplicate products (unless a [listing has multiple values](#can-a-listing-have-more-values)) and result in the lowest number of pages. - -If the website supports only overlapping ranges (e.g. **$0-$5**, **$5-$10**), it is not a big problem. Only a small portion of the listings will be duplicates, and they can be removed using a [request queue]({{@link storage/request_queue.md}}). - -#### [](#can-a-listing-have-more-values) Can a listing have more values? - -In rare cases, a listing can have more than one value that you are filtering in a range. A typical example is [amazon.com](https://amazon.com), where each product has several offers and those offers have different prices. If any of those offers is within the range, the product is shown. - -There is no easy way to get around this but the price range split works even with duplicate listings, just use a [JS set](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Set) or request queue to deduplicate them. - -#### [](#how-is-the-range-passed-to-the-url) How is the range passed to the URL? - -In the easiest case, you can pass the range directly in the page's URL. For example, `https://www.mysite.com/products?min_price=10&max_price=20`. Sometimes, you will need to do some query composition because the price range might be encoded together with more information into a single parameter. - -Some sites don't have page URLs with filters and instead load the filtered products via [XHRs](https://docs.apify.com/web-scraping-101/web-scraping-techniques#xhrs). Those can be GET or POST requests with various **URL** and **payload** syntax. - -The nice thing here is that if you get to understand how their internal API works, you can have it return more products per page or extract full product details just from this single request. 
- -In addition, XHRs are smaller and faster than loading an HTML page. On the other hand, you should not overly abuse them (by setting overly large limits), as this can expose you. - -#### [](#does-the-website-show-the-number-of-products-for-each-filtered-page) Does the website show the number of products for each filtered page? - -If it does, it is a nice bonus. It gives us an easy way to check if we are over or below the pagination limit and helps with analytics. - -If it doesn't, we have to find a different way to check if the number of listings is within a limit. One option is to go to the last allowed page of the pagination. If that page is still full of products, we can assume the filter is over the limit. - -#### [](#how-to-handle-open-ends-of-the-range) How to handle (open) ends of the range - -Logically, every full (price) range starts at 0 and ends at infinity. But the way this is encoded will differ on each site. The end of the price range can be either closed (0) or open (infinity). Open ranges require special handling when you split them (we will get to that). - -Most sites will let you start with 0 (there might be exceptions, where you will have to make the start open), so we can use just that. The high end is more complicated. Because you don't know the biggest price, it is best to leave it open and handle it specially. Internally you can just assign `null` to the value. - -Here are a few examples of a query parameter with an open and closed high-end range: - -- Open: `p:100-` (higher than 100), Closed: `p:100-200` (between 100 and 200) -- Open: `min_price=100`, Closed: `min_price=100&max_price=200` - -#### [](#can-the-range-exceed-the-limit-on-a-single-value) Can the range exceed the limit on a single value? - -In very rare cases, a site will have so many listings that a single value (e.g. **$100** or **$4.99**) will include a number of listings over the limit. 
[The basic algorithm](#the-algorithm) will recurse until the **min** value equals the **max** value and then stop because it cannot split that single value anymore. - -In this rare case, you will need to add another range or other filters to combine it to get an even deeper split. - -### [](#implementing-a-range-filter) Implementing a range filter - -This section shows a simple code example implementing our solution for an imaginary website. Writing a real solution will bring up more complex problems but the previous section should prepare you for some of them. - -First, let's define our imaginary site: - -- It has a single `/products` path that contains all the products that we want to scrape. -- **Max** pagination limit is **1000**. -- The site contains over a million products. -- It allows for filtering over a price range with query parameters `min_price` and `max_price`. -- If `min_price` or `max_price` are not defined, it opens that end of the range (all products up to or all products over that). -- The site allows to specify the price in cents. -- Pagination is done via `page` query parameter. - -#### [](#define-and-enqueue-pivot-ranges) Define and enqueue pivot ranges - -This step is not necessary but it is useful. The algorithm doesn't start with splitting over too large or too small values. 
- -```javascript -import { Actor } from 'apify'; -import { CheerioCrawler } from 'crawlee'; - -await Actor.init(); - -const MAX_PRODUCTS_PAGINATION = 1000; - -// These is just an example, choose what makes sense for your site -const PIVOT_PRICE_RANGES = [ - { min: 0, max: 9.99 }, - { min: 10, max: 99.99 }, - { min: 100, max: 999.99 }, - { min: 1000, max: 9999.99 }, - { min: 10000, max: null }, // open-ended -]; - -// Let's create a helper function for creating the filter URLs, you can move those to a utils.js file -const createFilterUrl = ({ min, max }) => { - const minString = `min_price=${min}`; - // We don't want to pass the parameter at all if it is null (open-ended) - const maxString = max ? `&max_price=${max}` : ''; - return `https://www.mysite.com/products?${minString}${maxString}`; -}; - -// And another helper for getting filters back from the URL, we could also pass them in userData -const getFiltersFromUrl = (url) => { - const min = Number(url.match(/min_price=([0-9.]+)/)[1]); - // Max price might be empty - const maxMatch = url.match(/max_price=([0-9.]+)/); - const max = maxMatch ? Number(maxMatch[1]) : null; - return { min, max }; -} - -// Actor setup things here -const crawler = new CheerioCrawler({ - async requestHandler(context) { - // ... - }, -}); - -// Let's create the pivot requests -const initialRequests = []; -for (const { min, max } of PIVOT_PRICE_RANGES) { - initialRequests.push({ - url: createFilterUrl({ min, max }), - label: 'FILTER', - }); -} -// Let's start the crawl -await crawler.run(initialRequests); - -await Actor.exit(); -``` - -#### [](#define-the-logic-for-the-filter-page) Define the logic for the `FILTER` page - -```javascript -import { CheerioCrawler } from 'crawlee'; - -// Doesn't matter what Crawler class we choose -const crawler = new CheerioCrawler({ - // Crawler options here - // ... 
- async requestHandler({ request, $ }) { - const { label } = request; - if (label === 'FILTER') { - // Of course, change the selectors and make it more robust - const numberOfProducts = Number($('.product-count').text()); - - // The filter is either good enough of we have to split it - if (numberOfProducts <= MAX_PRODUCTS_PAGINATION) { - // We just pass the URL for scraping, we could optimize it so the page is not opened again - await crawler.addRequests([{ - url: `${request.url}&page=1`, - userData: { label: 'PAGINATION' }, - }]); - } else { - // Here we have to split the filter - // To be continued... - } - } - if (label === 'PAGINATION') { - // We know we are under the limit here - // Enqueue next page as long as possible - // Enqueue or scrape products normally - } - } -}); -``` - -#### [](#split-price-filters) Split price filters - -We have the base of the crawler set up. The last part we are missing is the price filter splitting. Let's use a generic function for this. We can place it into the `utils.js` file. - -```javascript -// utils.js -export function splitFilter(filter) { - const { min, max } = filter; - // Don't forget that max can be null and we have to handle that situation - if (max && min > max) { - throw new Error(`WRONG FILTER - min(${min}) is greater than max(${max})`); - } - - // We crate a middle value for the split. If max in null, we will use double min as the middle value - const middle = max - ? min + Math.floor((max - min) / 2) - : min * 2; - - // We have to do the Math.max and Math.min to prevent having min > max - const filterMin = { - min, - max: Math.max(middle, min), - }; - const filterMax = { - min: max ? Math.min(middle + 1, max) : middle + 1, - max, - }; - // We return 2 new filters - return [filterMin, filterMax]; -} -``` - -#### [](#enqueue-the-filters) Enqueue the filters - -Let's finish the crawler now. This code example will go inside the `else` block of the previous crawler example. 
- -```javascript -const { min, max } = getFiltersFromUrl(request.url); -// Our generic splitFilter function doesn't account for decimal values so we will have to convert to cents and back to dollars -const newFilters = splitFilter({ min: min * 100, max: max * 100 }); - -// And we just enqueue those 2 new filters so the process will recursively repeat until all pages get to the PAGINATION phase -const requestsToEnqueue = []; -for (const filter of newFilters) { - requestsToEnqueue.push({ - // Remember that we have to convert back from cents to dollars - url: createFilterUrl({ min: filter.min / 100, max: filter.max / 100 }), - label: 'FILTER', - }); -} - -await crawler.addRequests(requestsToEnqueue); -``` - -## [](#summary) Summary - -And that's it. We have an elegant and simple solution for a complicated problem. In a real project, you would want to make this a bit more robust, [use logs]({{@link tutorials/analyze_pages_and_fix_errors.md#logging}}), and save analytics data. This will let you know what filters you went through and how many products each of them had. - -Check out the [full code example](https://github.com/metalwarrior665/apify-utils/tree/master/examples/crawler-with-filters). diff --git a/content/docs/tutorials/scrape_websites_using_the_sitemap.md b/content/docs/tutorials/scrape_websites_using_the_sitemap.md deleted file mode 100644 index 0f0121dc4e..0000000000 --- a/content/docs/tutorials/scrape_websites_using_the_sitemap.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -title: Scrape websites using the sitemap -description: The sitemap.xml file is a jackpot for every web scraper. Take advantage of this and learn a much easier way to extract data from websites using the Apify SDK. -menuWeight: 3.9 -paths: - - tutorials/scrape-websites-using-the-sitemap ---- - -# Scrape websites using the sitemap - -Let's say we want to scrape a database of craft beers ([brewbound.com](https://www.brewbound.com)) before summer starts. 
If we are lucky, the website will contain a sitemap at [https://www.brewbound.com/sitemap.xml](https://www.brewbound.com/sitemap.xml). - -> Check out the [Sitemap Sniffer](https://apify.com/vaclavrut/sitemap-sniffer) tool, which can discover sitemaps in hidden locations. - -## [](#the-sitemap) The sitemap - -The sitemap is usually located at the path `/sitemap.xml`. It is always worth trying that URL, as it is rarely linked anywhere on the site. It usually contains a list of all pages in [XML format](https://www.w3.org/standards/xml/core). - -```xml -<?xml version="1.0" encoding="UTF-8"?> -<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> -<url> - <loc>http://www.brewbound.com/advertise</loc> - <lastmod>2015-03-19</lastmod> - <changefreq>daily</changefreq> -</url> -<url> - ... -``` - -The URLs of breweries are in the form - -```cURL -http://www.brewbound.com/breweries/[BREWERY_NAME] -``` - -and the URLs of craft beers are in the form - -```cURL -http://www.brewbound.com/breweries/[BREWERY_NAME]/[BEER_NAME] -``` - -They can be matched with the following regular expression (regex). - -```cURL -/http(s)?:\/\/www\.brewbound\.com\/breweries\/[^\/<]+\/[^\/<]+/gm -``` - -Note the two parts of the regular expression `[^\/<]` containing `<`. This is because we want to exclude the `</loc>` tag, which closes each URL. - -## [](#using-the-sitemap-in-apify-sdk) Using the sitemap in Apify SDK and Crawlee - -Our [web scraping and automation library](https://crawlee.dev/) is well-suited for scraping with sitemaps. - -First, let's import the beer URLs from the sitemap to [RequestList](https://crawlee.dev/api/core/class/RequestList) using our regular expression to match only the (craft!) beer URLs and not pages of breweries, contact page, etc. 
- -```javascript -const requestList = await RequestList.open(null, [{ - requestsFromUrl: 'https://www.brewbound.com/sitemap.xml', - regex: /http(s)?:\/\/www\.brewbound\.com\/breweries\/[^\/<]+\/[^\/<]+/gm, -}]); -``` - -Now, let's use [PuppeteerCrawler](https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler) to scrape the created [RequestList](https://crawlee.dev/api/core/class/RequestList) with [Puppeteer](https://pptr.dev) and push it to the final dataset. - -```javascript -const crawler = new PuppeteerCrawler({ - requestList, - async requestHandler({ page }) { - const beerPage = await page.evaluate(() => { - return document.getElementsByClassName('productreviews').length; - }); - if (!beerPage) return; - - const data = await page.evaluate(() => { - const title = document.getElementsByTagName('h1')[0].innerText; - const [brewery, beer] = title.split(':'); - const description = document.getElementsByClassName('productreviews')[0].innerText; - - return { brewery, beer, description }; - }); - - await Actor.pushData(data); - }, -}); -``` - -## [](#full-code-example) Full code example - -If we create a new actor using the code below on the [Apify platform](https://console.apify.com/actors), it returns a nicely formatted spreadsheet containing a list of breweries with their beers and descriptions. - -Make sure to use the [`apify/actor-node-puppeteer-chrome`](https://hub.docker.com/r/apify/actor-node-puppeteer-chrome) [image]({{@link actors/development/base_docker_images.md}}) for your Dockerfile, otherwise the run will fail. 
- -```javascript -import { Actor } from 'apify'; -import { RequestList, PuppeteerCrawler } from 'crawlee'; - -await Actor.init(); - -const requestList = await RequestList.open(null, [{ - requestsFromUrl: 'https://www.brewbound.com/sitemap.xml', - regex: /http(s)?:\/\/www\.brewbound\.com\/breweries\/[^\/<]+\/[^\/<]+/gm, -}]); - -const crawler = new PuppeteerCrawler({ - requestList, - async requestHandler({ page }) { - const beerPage = await page.evaluate(() => { - return document.getElementsByClassName('productreviews').length; - }); - if (!beerPage) return; - - const data = await page.evaluate(() => { - const title = document.getElementsByTagName('h1')[0].innerText; - const [brewery, beer] = title.split(':'); - const description = document.getElementsByClassName('productreviews')[0].innerText; - - return { brewery, beer, description }; - }); - - await Actor.pushData(data); - }, -}); - -await crawler.run(); - -await Actor.exit(); -``` diff --git a/content/docs/tutorials/scraping_dynamic_content.md b/content/docs/tutorials/scraping_dynamic_content.md deleted file mode 100644 index ed717ea376..0000000000 --- a/content/docs/tutorials/scraping_dynamic_content.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -title: Scraping dynamic content -description: Wait for dynamically loaded content when web scraping. See code examples and a detailed breakdown for setting timeouts and custom wait functions. -menuWeight: 3.5 -paths: - - tutorials/scraping-dynamic-content ---- - -# Scraping dynamic content - -Many websites load data in the background via [XHR requests]({{@link web_scraping_101/web_scraping_techniques.md#xhrs}}). These are usually tracking data, ads and other content that may not be essential for the website to load or is useful to collect periodically. Sometimes though, it may contain actual core page data that you need. - -## [](#quick-summary) Quick summary - -Use these helper functions to wait for the data. 
Pass in time in milliseconds or the [CSS selector](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors) to wait for. - -* `page.waitForTimeout` or `page.waitForSelector` in [Puppeteer](https://pptr.dev) (or **Puppeteer Scraper** ([apify/puppeteer-scraper](https://apify.com/apify/puppeteer-scraper))). -E.g. `await page.waitForTimeout(10000)` - waits for 10 seconds. - -* `context.waitFor` in **Web Scraper** ([apify/web-scraper](https://apify.com/apify/web-scraper)). -E.g. `await context.waitFor('my-selector')` - waits for `my-selector` to appear on the page. - -## [](#how-page-loading-works) How page loading works - -Before looking at code examples that solve this problem, let's review what the page loading process looks like. - -1. **HTML document is loaded** (`domcontentloaded` event). This document contains the HTML as it was rendered on the website server. It also includes all the JavaScript that is executed and rendered in the next step. This HTML is what you get when you use [got-scraping](https://www.npmjs.com/package/got-scraping) or **Cheerio Scraper** ([apify/cheerio-scraper](https://apify.com/apify/cheerio-scraper)) ([CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) class). - -2. **JavaScript is executed and rendered** (`load` event). The page is fully rendered, but may still lack dynamically loaded data. - -3. **Network XHR requests are loaded and rendered** (`networkidle0` or `networkidle2` events). Some websites load essential data this way. The execution of these requests may depend on user behavior like in [infinite scroll](https://www.smashingmagazine.com/2013/05/infinite-scrolling-lets-get-to-the-bottom-of-this/). -This is when you use Web Scraper or Puppeteer Scraper ([PuppeteerCrawler](https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler) class) to get the page. Be careful that pages often track you with additional requests and the load may never end. 
- -## [](#how-to-wait-for-dynamic-content) How to wait for dynamic content - -The section below describes how you can wait for dynamic content. - -### [](#http-request-cheerio-scraper) http-request / Cheerio Scraper - -Often, all the essential data are presented in the initial HTML. And scraping it without a browser (without Puppeteer) is much more efficient. That is why we created [Cheerio Scraper](https://apify.com/apify/cheerio-scraper). - -But even if data are rendered via JavaScript or loaded dynamically, there are advanced techniques that allow you to reverse-engineer this data and still retain Cheerio's efficiency. For example, you can emulate the requests for dynamic data directly in your code. - -### [](#web-scraper-puppeteer-scraper-puppeteer) Web Scraper / Puppeteer Scraper / Puppeteer - -In 95% of cases, the JavaScript-rendered page that you get with Puppeteer is enough. If you actually need to wait for the dynamic content, Puppeteer has several helper functions, of which the most important are: [`page.waitForTimeout`](https://pptr.dev/api/puppeteer.page.waitfortimeout), [`page.waitForSelector`](https://pptr.dev/api/puppeteer.page.waitforselector), [`page.waitForResponse`](https://pptr.dev/api/puppeteer.page.waitforresponse), [`page.waitForNavigation`](https://pptr.dev/api/puppeteer.page.waitfornavigation) and [`page.waitForFunction`](https://pptr.dev/api/puppeteer.page.waitforfunction). - -### [](#waitfor-function) waitFor function - -This function can be found as `context.waitFor` in [Web Scraper](https://apify.com/apify/web-scraper#page-function) where it is a generic function that has three possible arguments. - -* **Number of milliseconds** - `await context.waitFor(10000)`. The same as `page.waitForTimeout` (will wait for 10 seconds). - -* **Selector string** - `await context.waitFor('my-selector')`. The same as `page.waitForSelector` (will wait until that selector appears on the page but timeouts after 30 seconds with an error). 
- -* **Predicate function** - `await context.waitFor(functionThatReturnsTrueOrFalse)`. The same as `page.waitForFunction` (you can pass an arbitrary function that is executed periodically and the code waits until it returns `true`). - -With Puppeteer, you have to use dedicated functions like `page.waitForTimeout`, `page.waitForSelector` or `page.waitForFunction`. - -### [](#testing-it) Testing it - -If you need to update your code with waiting logic, simply start by waiting 10 seconds. If that doesn't help, try 30 seconds. If it still doesn't work, the problem is elsewhere. - -Try to debug it using logs and screenshots. If your code is working, you know that it was indeed dynamically loaded data that caused your problem. Now you can change the 10 seconds waiting time for `waitForSelector` to be more efficient. - -### [](#timeout-and-errors) Timeout and errors - -By default, `waitFor` times out after 30 seconds with an error. Usually, this means another error is preventing the selector from loading. The selector itself may be wrong, or your browser got blocked or redirected to another version of the website. - -Most of the time, if the selector doesn't load in the first 5 seconds, it won't load at all. You can prevent wasteful waiting by changing the timeout to `await context.waitFor('my-selector', { timeout: 10000 })`. - -The `waitFor` (the selector version) will throw an error once it reaches the timeout. That is usually a good thing because you don't want this to go unnoticed. But if the data are not too important, or you want to fall back to some other solution, you can easily catch the waiting error: - -```javascript -await page.waitForSelector('my-selector', { timeout: 10000 }) - .catch(() => console.log('Wait for my-selector timed out') -); -``` - -The code will then continue. - -## [](#advanced-use-cases) Advanced use cases - -So far, we have only scratched the surface of this topic. Let's have a quick look at some more advanced cases. 
We have not yet covered the third usage of `waitFor` – `waitForFunction`. - -### [](#waitforfunction) waitForFunction - -If a simple selector is not enough, we can implement a function to be evaluated in the browser context to tell us if the page is ready. Let's imagine that we know the page needs to load 24 products, but for some reason, they load over time. We can define a simple function to check it. - -```javascript -// Let's assume JQuery is injected -const has24Products = () => { - const numberOfProducts = $('.my-products').length; - return numberOfProducts === 24; -}; -``` - -Now we simply pass it to `waitForFunction`: - -```javascript -// In Puppeteer you need to inject JQuery with -// await puppeteerUtils.injectJQuery(page); -// imported from 'crawlee' package -await page.waitForFunction(has24Products); -``` - -### [](#waitforresponse) waitForResponse - -Sometimes, it may be handy to work directly with the XHR request's response. - -* It is faster. You don't need to wait for the element to render. - -* It may contain nicely structured [JSON data]({{@link web_scraping_101/web_scraping_techniques.md#xhrs}}). - -Keep in mind that `waitForResponse` is not included in `waitFor` cases, so it does not work in Web Scraper. If you are interested in exploring the responses, you can look through them in your browser's developer console. In Firefox and Chrome, it is the **Network** tab with the **XHR** filter selected. - -![The Network tab in a browser]({{@asset tutorials/images/network-tab.webp}}) - -We can catch this response by checking for its URL and method (we have to do it since the same URL is included in the OPTIONS method). We return `true` or `false` depending on whether it is the response we want. `waitForResponse` will even give us the response back. 
- -```javascript -const responseChooser = async (response) => { - const url = response.url(); - const method = response.request().method(); - if (url.includes('/prod_PUBLIC_STORE') && method === 'POST') { - return true; - }; - return false; -}; -const correctResponse = await page.waitForResponse(responseChooser); -``` - -Now, we simply extract the JSON. - -```javascript -const data = await correctResponse.json(); -const userAgent = data.user_agent; -``` - -## [](#custom-waiting-functions) Custom waiting functions - -You don't need to rely on Puppeteer's smart functions to implement something. You can implement "waiters" using a simple loop. Then, you can add your own functionality to it. For example, a `waitForSelector` that logs its waiting. - -```javascript -const waitAndLog = async (page, selector, timeout = 30000) => { - const start = Date.now(); - let myElement = await page.$(selector); - while (!myElement) { - await page.waitFor(500); // wait 0.5s each time - const alreadyWaitingFor = Date.now() - start; - if (alreadyWaitingFor > timeout) { - throw `Wait for ${selector} timed out after ${timeout} ms`; - } - console.log(`Waiting for ${selector} for ${alreadyWaitingFor}`); - myElement = await page.$(selector); - } - console.log(`Selector ${selector} appeared on the page!`) - return myElement; -}; - -// You can use the element handle it returns -await waitAndLog(page, 'my-selector'); -``` diff --git a/content/docs/tutorials/use_apify_from_php.md b/content/docs/tutorials/use_apify_from_php.md deleted file mode 100644 index 6924fe4417..0000000000 --- a/content/docs/tutorials/use_apify_from_php.md +++ /dev/null @@ -1,271 +0,0 @@ ---- -title: Use Apify from PHP -description: Learn how to access Apify's REST API endpoints from your PHP projects using the guzzle package. Follow a tutorial to run an actor and download its data. 
-menuWeight: 3.95 -paths: - - tutorials/use-apify-from-php ---- - -# How to use Apify from PHP - -Apify's [RESTful API](https://docs.apify.com/api/v2#) allows you to use the platform from basically anywhere. Many projects are and will continue to be built using [PHP](https://www.php.net/). This tutorial enables you to use Apify in these projects in PHP and frameworks built on it. - -Apify does not have an official PHP client (yet), so we are going to use [guzzle](https://github.com/guzzle/guzzle), a great library for HTTP requests. By covering a few fundamental endpoints, this tutorial will show you the principles you can use for all Apify API endpoints. - -## Before you start - -Make sure you have an Apify account and API token. You will find the token in the [Integrations](https://console.apify.com/account?tab=integrations) section in Apify Console. - -If you don't already have guzzle installed in your project (or just want to try out the code examples), run `composer require guzzlehttp/guzzle` to install it in the current directory. - -## Preparing the client - -To get a guzzle instance ready to be used with the Apify API, we first need to set up the base endpoint and authentication. - -```php -require 'vendor/autoload.php'; - -$client = new \GuzzleHttp\Client([ - 'base_uri' => 'https://api.apify.com/v2/', - 'headers' => [ - // Replace with your actual token - 'Authorization' => 'Bearer ', - ] -]); -``` - -Note that we pass the API token in the header. It can also be passed as a query string `token` parameter, but passing it in the header is preferred and more secure. - -To check whether everything works well, we'll try to get information about the [current user](/api/v2#/reference/users/private-data/get-private-user-data). 
- -```php -// Call the endpoint using our client -// Note that the path does not have a leading slash -$response = $client->get('users/me'); -// Parse the response (most Apify API endpoints return JSON) -$parsedResponse = \json_decode($response->getBody(), true); -// The actual data are usually present under the `data` key -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -If, instead of data, you see an error saying `Authentication token is not valid`, check if the API token you used to instantiate the client is valid. - -## Running an actor - -Now that we have our guzzle client ready to go, we can run some actors. Let's try the **Contact Details Scraper** ([vdrmota/contact-info-scraper](https://apify.com/vdrmota/contact-info-scraper)). - -The [API reference](/api/v2#/reference/actors/run-collection/run-actor) states that an actor's input should be passed as JSON in the request body. Other options are passed as query parameters. - -```php -// To run the actor, we make a POST request to its run's endpoint -// To identify the actor, you can use its ID, but you can also pass -// the full actor name [username]~[actorName] or just ~[actorName] for -// your own actors -$response = $client->post('acts/vdrmota~contact-info-scraper/runs', [ - // Actors usually accept JSON as input. 
When using the `json` key in - // a POST request's options, guzzle sets proper request headers - // and serializes the array we pass in - 'json' => [ - 'startUrls' => [ - ['url' => 'https://www.apify.com/contact'] - ], - 'maxDepth' => 0, - ], - // Other run options are passed in as query parameters - // This is optional since actors usually have reasonable defaults - 'query' => [ 'timeout' => 30 ], -]); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -You should see information about the run, including its ID and the ID of its default [dataset]({{@link storage/dataset.md}}). Take note of these, we will need them later. - -## [](#getting-dataset) Getting the results from dataset - -Actors usually store their output in a default dataset. The [actor runs endpoint](/api/v2#/reference/actor-runs) lets you get overall info about an actor run's default dataset. - -```php -// Replace with the run ID you from earlier -$response = $client->get('actor-runs//dataset'); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -As you can see, the response contains overall stats about the dataset, like its number of items, but not the actual data. To get those, we have to call the **items** endpoint. - -```php -// Replace with the run ID from earlier -$response = $client->get('actor-runs//dataset/items'); -// The dataset items endpoint returns an array of dataset items -// they are not under the `data` key like in other endpoints -$data = \json_decode($response->getBody(), true); - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -Some actors write to datasets other than the default. In these cases, you need to have the dataset ID and call the `datasets/` and `datasets//items` endpoints instead. - -For larger datasets, you can paginate through the results by passing query parameters. 
- -```php -$response = $client->get('datasets//items', [ - 'query' => [ - 'offset' => 20, - 'limit' => 10, - ] -]); -$parsedResponse = \json_decode($response->getBody(), true); -echo \json_encode($parsedResponse, JSON_PRETTY_PRINT); -``` - -All the available parameters are described in our [API reference](/api/v2#/reference/datasets/item-collection/get-items) and work for all datasets. - -## [](#getting-key-value-store) Getting the results from key-value stores - -Datasets are great for structured data, but are not suited for binary files like images or PDFs. In these cases, actors store their output in [key-value stores]({{@link storage/key_value_store.md}}). One such actor is the **HTML String To PDF** ([mhamas/html-string-to-pdf](https://apify.com/mhamas/html-string-to-pdf)) converter. Let's run it. - -```php -$response = $client->post('acts/mhamas~html-string-to-pdf/runs', [ - 'json' => [ - 'htmlString' => '

    Hello World

    ' - ], -]); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -Keep track of the returned run ID. - -Similar to datasets, we can get overall info about the default key-value store. - -```php -// Replace with the ID returned by the code above -$response = $client->get('actor-runs//key-value-store'); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -The items in key-value stores are not structured, so we cannot use the same approach as we did with dataset items. We can obtain some information about a store's content using its **keys** endpoint. - -```php -// Don't forget to replace with the ID you got earlier -$response = $client->get('actor-runs//key-value-store/keys'); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); -``` - -We can see that there are two record keys: `INPUT` and `OUTPUT`. The HTML String to PDF actor's README states that the PDF is stored under the `OUTPUT` key. Downloading it is simple: - -```php -// Don't forget to replace the -$response = $client->get('actor-runs//key-value-store/records/OUTPUT'); -// Make sure that the destination (filename) is writable -file_put_contents(__DIR__ . '/hello-world.pdf', $response->getBody()); -``` - -If you open the generated `hello-world.pdf` file, you should see... well, "Hello World". - -If the actor stored the data in a key-value store other than the default, we can use the standalone endpoints, `key-value-stores/`, `key-value-stores//keys`, and `key-value-stores//records/`. They behave the same way as the default endpoints. See the [full docs](https://docs.apify.com/api/v2#/reference/key-value-stores/store-object). - -## When are the data ready? - -It takes some time for an actor to generate its output. 
There are even actors that run for days. In the previous examples, we chose actors whose runs only take a few seconds. This meant the runs had enough time to finish before we ran the code to retrieve their dataset or key-value store (so the actor had time to produce some output). If we ran the code immediately after starting a longer-running actor, the dataset would probably still be empty. - -For actors that are expected to be quick, we can use the `waitForFinish` parameter. Then, the running actor's endpoint does not respond immediately but waits until the run finishes (up to the given limit). Let's try this with the HTML String to PDF actor. - -```php -$response = $client->post('acts/mhamas~html-string-to-pdf/runs', [ - 'json' => [ - 'htmlString' => '

    Hi World

    ' - ], - // Pass in how long we want to wait, in seconds - 'query' => [ 'waitForFinish' => 60 ] -]); -$parsedResponse = \json_decode($response->getBody(), true); -$data = $parsedResponse['data']; - -echo \json_encode($data, JSON_PRETTY_PRINT); - -$runId = $data['id']; -$response = $client->get(sprintf('actor-runs/%s/key-value-store/records/OUTPUT', $runId)); -file_put_contents(__DIR__ . '/hi-world.pdf', $response->getBody()); -``` - -## Webhooks - -For actors that take longer to run, we can use [webhooks]({{@link integrations/webhooks.md}}). A webhook is an HTTP POST request that is sent to a specified URL when an actor's status changes. We can use them as a kind of notification that is sent when your run finishes. You can set them up using query parameters. If we used webhooks in the example above, it would look like this: - -```php -// Webhooks need to be passed as a base64-encoded JSON string -$webhooks = \base64_encode(\json_encode([ - [ - // The webhook can be sent on multiple events - // this one fires when the run succeeds - 'eventTypes' => ['ACTOR.RUN.SUCCEEDED'], - // Set this to some url that you can react to - // To see what is sent to the URL, - // you can set up a temporary request bin at https://requestbin.com/r - 'requestUrl' => '', - ], -])); -$response = $client->post('acts/mhamas~html-string-to-pdf/runs', [ - 'json' => [ - 'htmlString' => '

    Hello World

    ' - ], - 'query' => [ 'webhooks' => $webhooks ] -]); -``` - -## How to use Apify Proxy - -There is another important Apify feature you will need: [proxy]({{@link proxy.md}}). Guzzle makes it really easy to use. - -If you just want to make sure that your server's IP address won't get blocked somewhere when making requests, you can use the automatic proxy selection mode. - -```php -$client = new \GuzzleHttp\Client([ - // Replace below with your password - // found at https://console.apify.com/proxy - 'proxy' => 'http://auto:@proxy.apify.com:8000' -]); - -// This request will be made through an automatically chosen proxy -$response = $client->get("http://proxy.apify.com/?format=json"); -echo $response->getBody(); -``` - -If you want to maintain the same IP between requests, you can use the session mode. - -```php -$client = new \GuzzleHttp\Client([ - // Replace below with your password - // found at https://console.apify.com/proxy - 'proxy' => 'http://session-my_session:@proxy.apify.com:8000' -]); - -// Both responses should contain the same clientIp -$response = $client->get("https://api.apify.com/v2/browser-info"); -echo $response->getBody(); - -$response = $client->get("https://api.apify.com/v2/browser-info"); -echo $response->getBody(); -``` - -See the [proxy docs]({{@link proxy/connection_settings.md}}) for more details on using specific proxies. - -## Feedback - -Are you interested in an Apify PHP client or other PHP-related content? Do you have some feedback on this tutorial? [Let us know](https://apify.typeform.com/to/KqhmiJge#source=tutorial_use_apify_from_php)! diff --git a/content/docs/web_scraping_101.md b/content/docs/web_scraping_101.md deleted file mode 100644 index 2465662bf6..0000000000 --- a/content/docs/web_scraping_101.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Web scraping 101 -description: All you need to know about extracting structured data from web pages, the protections websites employ to prevent it, and how to bypass them. 
-menuWeight: 4 -category: guides -paths: - - web-scraping-101 ---- - -# [](#web-scraping-101) Web scraping 101 - -Web scraping is the process of extracting structured information from a web page. In essence, web scraping automates the process of manually finding and copy/pasting the information on a website you find useful. - -In other words, instead of manually visiting each of the 1000 listings for white T-shirts on an e-commerce site and copy-pasting each listing's price, description and seller information, you can create a bot that does it for you. You can then make your bot return the data in a handy format like JSON, HTML or Excel, so you could process and use it. - -## [](#what-can-i-use-web-scraping-for) What can I use web scraping for? - -The primary function of web scraping is the extraction of data. - -It is about gathering information, which you can then use to make informed decisions in how to [price](https://apify.com/use-cases/price-comparison) or [market](https://apify.com/use-cases/market-research) your product, [find new customers](https://apify.com/use-cases/lead-generation) and make decisions that enable you to [grow your business](https://apify.com/use-cases). - -To see examples of organizations that have already benefited from web scraping, check out our [success stories](https://apify.com/success-stories). - -## [](#how-does-web-scraping-work) How does web scraping work? - -1. The scraper [requests](https://www.codecademy.com/articles/http-requests) the contents of a particular page from a website (e.g. this week's Top 10 singles on Spotify). The site returns it in [HTML](https://en.wikipedia.org/wiki/HTML) format. -2. It [parses](https://en.wikipedia.org/wiki/Parsing) (splits up the data and converts it to the required format) the HTML and extracts the data it's been programmed to extract (e.g. the song title and artist name). -3. The scraper stores the data in the specified format, so you can use it manually or in a program. 
- -## [](#is-web-scraping-the-same-as-robotic-process-automation-rpa) Is web scraping the same as robotic process automation (RPA)? - -While web scraping is a kind of [RPA]({{@link robotic_process_automation.md}}), it focuses on extracting data. RPA focuses on the other tasks in browsers - everything except for extracting information. - -RPA allows you to handle [use cases](https://apify.com/use-cases/rpa) like filling forms or uploading files while you get on with more important tasks. And it's not just simple tasks you can automate. How about [processing your invoices](https://apify.com/katerinahronik/toggl-invoice-download) or automating your sales processes? - -## Is web scraping legal? - -In a word, yes. Of course, it has a code of ethics and regulations, which you should always adhere to. [Find out more](https://blog.apify.com/is-web-scraping-legal/) over at Apify Blog. - -## [](#additional-resources) Additional resources - -If you would like to learn more, check out [The Beginner's Guide to Web Scraping](https://blog.apify.com/web-scraping-guide), which explains the basics of web scraping, why you should do it, and helpful tips on how to get started. diff --git a/content/docs/web_scraping_101/anti_scraping_techniques.md b/content/docs/web_scraping_101/anti_scraping_techniques.md deleted file mode 100644 index 6ab5c92196..0000000000 --- a/content/docs/web_scraping_101/anti_scraping_techniques.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: Anti-scraping techniques and how to bypass them -menuTitle: Anti-scraping techniques -description: Explore anti web scraping, and methods used to bypass blocking, such as IP address rotation and proxies, emulated browser signatures, shared sessions, etc. -menuWeight: 4.2 -paths: - - web-scraping-101/anti-scraping-techniques ---- - -# [](#anti-scraping-techniques-and-how-to-bypass-them) Anti-scraping techniques - -Many websites use anti-scraping techniques to block web scraping bots. 
Our research shows that there are a number of methods deployed in the field to bypass these defenses. - -In many cases, we found that very simple changes in approach are commonly used. For example, if a site is blocking based on IP address, switching between different addresses is effective. If a website is analyzing behavior, making that behavior as human-like as possible will confuse the anti-scraping system. If these simpler options fail, there are more complex methods available, such as [shared IP address emulation](https://dev.to/apify/bypassing-web-scraping-protection-get-the-most-out-of-your-proxies-with-shared-ip-address-emulation-291c) (also known as [session multiplexing](https://en.wikipedia.org/wiki/Session_multiplexing)). - -## [](#ip-address-based-blocking) IP address-based blocking - -A popular option some websites use is blocking access based on the IP range your address belongs to. This kind of protection aims to reduce the amount of non-human traffic. For instance, websites will deny access to [ranges of Amazon Web Services's IP addresses](https://docs.aws.amazon.com/general/latest/gr/aws-ip-ranges.html) and other commonly known ranges. - -### [](#bypassing-ip-address-based-blocking) Bypassing IP address-based blocking - -We found that web scraping can work around IP address-based blocking by rotating the IP addresses from which they send the requests to target websites. This can be done by using a pool of [proxy servers](https://en.wikipedia.org/wiki/Proxy_server) by assigning each request another proxy server from the pool and thus making it look like a request coming from another user. The proxies can be selected either randomly or in round-robin fashion. - -This method's effectiveness depends on various factors, such as the number of web pages that are being scraped, the sophistication of the scraping protection and the number and type of proxies. 
If too many requests are sent from a single proxy in too short a period of time, the proxy might get “burned”, which means all further requests from it are blocked. - -Our research determined that, for successful large-scale scraping activities, it is essential to have a sufficient pool of proxies and to time the workload to maximize the scraping throughput without burning proxies. - -[Apify Proxy]({{@link proxy.md}}) enables you to enhance your data throughput and access websites from any geographical location by using an extensive pool of datacenter and residential proxies. - -## [](#ip-rate-limiting) IP rate limiting - -When crawling a website, a web scraping bot will typically send many more requests from a single IP address than a human user could generate over the same period. Websites can easily monitor how many requests they receive from a single IP address. If the number of requests exceeds a certain limit, websites can block that IP address or require a [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) test. - -### [](#bypassing-ip-rate-limiting) Bypassing IP rate limiting - -We found that two ways are used to work around rate limiting. One method is to limit how many pages on a single site are scraped concurrently, with delays possibly even being intentionally introduced (after reaching the original limit). The other method is to use proxy servers and rotate IP addresses after a certain number of requests. - -Apify actors are designed to reduce the workload on websites being scraped. To lower the concurrency when using the [Apify SDK](https://sdk.apify.com/) and [Crawlee](https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions#maxConcurrency), just pass the `maxConcurrency` option to your crawler's setup. If you use actors from [Apify Store](https://apify.com/store), you can usually set the maximum concurrency in the actor's input. 
- -## [](#http-request-analysis) HTTP request analysis - -Each HTTP request sent from a client to a web server contains a lot of hidden information such as -[HTTP headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers), client IP address, -[SSL/TLS](https://www.websecurity.digicert.com/security-topics/what-is-ssl-tls-https) version or a list of supported -[TLS ciphers](https://en.wikipedia.org/wiki/Cipher_suite). Even the structure of the HTTP request itself, e.g. the order of the HTTP headers, can tell whether the request comes from a real web browser or a script. - -Websites can check for these signals and block requests that do not have the signature of a known web browser or show a CAPTCHA. Our research shows that this kind of protection is commonly bypassed by the use of only plain HTTP requests, because the protection does not collect any window attributes or evaluate any JavaScript code. - -### [](#bypassing-http-request-analysis) Bypassing HTTP request analysis - -A straightforward method frequently used to circumvent HTTP request analysis is to use a real web browser, such as [headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome), to emulate browser HTTP signatures. However, this is inefficient, as web browsers consume a lot of system resources and are generally slow. - -A more efficient method used is to emulate browser HTTP request signatures even when using a low-level HTTP request library. This makes the scripted HTTP request look like a real web browser, but is much faster and more efficient. Note that we found that this method only works in situations where the page content is served directly in the first HTML response and not loaded later using [AJAX](https://en.wikipedia.org/wiki/Ajax_(programming)). - -To test this, we used [got-scraping](https://github.com/apify/got-scraping), which is sending browser-like requests out of the box. 
- -## [](#user-behavior-analysis) User behavior analysis - -Rather than analyzing and reacting to client requests in real time, websites can collect user behavior data over longer periods and then react to it only when sufficient information is available. - -Such data can contain the order in which pages are visited, how long the user stays on each page, mouse movements or even how fast forms are filled in. If enough evidence indicates that the user’s behavior is not human, websites can block the client IP address or serve a CAPTCHA. - -## [](#browser-fingerprinting) Browser fingerprinting - -Websites can also use various techniques to test whether a client's web browser is being used by a human user or a robot, and even identify repeated visits of the same web browser. This is known as [browser fingerprinting](https://pixelprivacy.com/resources/browser-fingerprinting/), and it can range from very primitive JavaScript challenges to state-of-the-art browser integrity tests and behavioral analyses. - -The tests look for things like information about your browser type and version, operating system, installed browser extensions, available fonts, timezone, among others. Combined, all this information forms a browser's “fingerprint”. - -While this information may seem quite generic, [Panopticlick](https://panopticlick.eff.org/) found that on average only [1 in 286,777](https://panopticlick.eff.org/static/browser-uniqueness.pdf) browsers will have the same fingerprint as you. - -## [](#combinations-of-the-above-techniques) Combinations of the above techniques - -To make things complicated, websites often employ various scraping protection combinations such as IP address-based blocking and HTTP request analysis. - -## [](#reducing-blocking-with-shared-ip-address-emulation) Reducing blocking with shared IP address emulation - -IP address rotation and emulation of browser HTTP signatures can be effective for many web scraping tasks, but large-scale crawls will get blocked. 
Using more proxies is a solution to this, but that can be expensive. - -Shared IP address emulation can dramatically extend the effectiveness of scraping and multiply the number of pages that can be fetched. The technique relies on websites knowing that many different users can be behind a single IP address. - -For example, requests from mobile devices are usually only routed through a handful of IP addresses, while users behind a single corporate firewall might all have a single IP address. By emulating and managing these user sessions per IP address, we found that it is possible to prevent websites from blocking aggressively. - -To make it work, a single user session has to always be routed via the same IP address. A website can identify such user sessions based on cookies, authentication tokens or a browser HTTP signature/fingerprint. - -Our research into this was assisted by [Crawlee](https://sdk.apify.com)'s [`SessionPool`](https://crawlee.dev/api/core/class/SessionPool) class. This can be added to other Apify tools such as [actors]({{@link actors.md}}) or [proxy]({{@link proxy.md}}), but it also works outside the Apify ecosystem. - -## [](#comparing-ways-of-bypassing-scraping-protection) Comparing ways of bypassing scraping protection - -In a [recent experiment](https://dev.to/apify/bypassing-web-scraping-protection-get-the-most-out-of-your-proxies-with-shared-ip-address-emulation-291c), we found session emulation to be at least twice as effective as plain [IP address rotation](#bypassing-ip-address-based-blocking). 
- diff --git a/content/docs/web_scraping_101/images/css-selectors.webp b/content/docs/web_scraping_101/images/css-selectors.webp deleted file mode 100644 index e17db17bbe..0000000000 Binary files a/content/docs/web_scraping_101/images/css-selectors.webp and /dev/null differ diff --git a/content/docs/web_scraping_101/images/json-ld.webp b/content/docs/web_scraping_101/images/json-ld.webp deleted file mode 100644 index c5da7d6a58..0000000000 Binary files a/content/docs/web_scraping_101/images/json-ld.webp and /dev/null differ diff --git a/content/docs/web_scraping_101/images/xhrs.webp b/content/docs/web_scraping_101/images/xhrs.webp deleted file mode 100644 index 2b71f59abc..0000000000 Binary files a/content/docs/web_scraping_101/images/xhrs.webp and /dev/null differ diff --git a/content/docs/web_scraping_101/web_scraping_techniques.md b/content/docs/web_scraping_101/web_scraping_techniques.md deleted file mode 100644 index f4f12d40d6..0000000000 --- a/content/docs/web_scraping_101/web_scraping_techniques.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: Web scraping techniques -description: An introduction to the methods you can use to extract data from websites. Analyze web pages for hidden elements to find the most effective approach. -menuWeight: 4.1 -paths: - - web-scraping-101/web-scraping-techniques ---- - -# Web scraping techniques - -This article provides a quick summary of ways websites structure and send their information. Knowing these techniques will help you extract data quicker and more efficiently. - -You can find more in-depth discussion and code examples for all the techniques in [this](https://blog.apify.com/web-scraping-in-2018-forget-html-use-xhrs-metadata-or-javascript-variables-8167f252439c) article. - -## [](#css-selectors) CSS selectors - -The first method you will use when trying web scraping is most likely -[CSS selectors](https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors). 
They allow you to select your desired elements by type, class, ID or attributes. - -To see which element contains the information you need (and its details), open your browser's Developer Tools. Right-click the content, then select **Inspect** in [Chrome](https://developers.google.com/web/tools/chrome-devtools) or **Inspect element** in [Firefox](https://developer.mozilla.org/en-US/docs/Tools). - -![CSS selectors]({{@asset web_scraping_101/images/css-selectors.webp}}) - -If you are only looking to scrape a couple of elements from a page, this method is sufficient. For more elaborate extraction use cases, however, there are other, more effective, methods. - -## [](#schema-org-microdata) Schema.org microdata - -Schemas provide a way to mark up web pages so major search engines like Google, Bing and Yahoo can understand them. - -Pages with [schema.org](https://schema.org) markup still use HTML. The only difference is that they add machine-readable code markers into the HTML documents. This helps cut down on [ambiguity](https://schema.org/docs/gs.html) and allows search engines to [return more accurate results](https://moz.com/learn/seo/what-is-seo). - -If a site uses microdata, you will find it in the page's HTML using your browser's developer tools. It will be in a hidden element similar to the one below. - -```html -
    -

    Atlantics

    - Director: Mati Diop - drama - Trailer -
    -``` - -Check out this [tutorial](https://help.apify.com/en/articles/1444245-scraping-data-from-websites-using-schema-org-microdata) to learn how to scrape pages using Schema.org microdata. - -## [](#json-ld) JSON-LD - -Similar to [Schema.org microdata](#schema-org-microdata), some sites use [JSON for Linking Data](https://json-ld.org/) (JSON-LD). Based on the [JSON](https://www.json.org/json-en.html) format, JSON-LD helps structure a web page's content in a way that's easy for humans and computers to read. This improves a site's [SEO](https://moz.com/learn/seo/what-is-seo). - -To see if a website uses JSON-LD, check its `` element using your browser's developer tools. You will find the JSON-LD data in a `