Skip to content
Open
41 changes: 41 additions & 0 deletions docs/guides/code_examples/request_throttling/throttling_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.request_loaders import ThrottlingRequestManager
from crawlee.storages import RequestQueue


async def main() -> None:
    """Demonstrate per-domain request throttling with ThrottlingRequestManager."""
    # Open the default request queue.
    request_queue = await RequestQueue.open()

    # Wrap the queue in a ThrottlingRequestManager so that the listed
    # domains are rate-limited. The throttler shares the same storage
    # backend as the underlying queue.
    request_manager = ThrottlingRequestManager(
        request_queue,
        domains=['api.example.com', 'slow-site.org'],
    )

    # Hand the throttler to the crawler as its request manager.
    crawler = BasicCrawler(request_manager=request_manager)

    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # Enqueue requests. URLs on the configured domains are routed straight
    # into their throttled sub-queues; everything else lands in the main queue.
    await request_manager.add_requests(
        [
            'https://api.example.com/data',
            'https://api.example.com/users',
            'https://slow-site.org/page1',
            'https://fast-site.com/page1',  # Not throttled
        ]
    )

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
47 changes: 47 additions & 0 deletions docs/guides/request_throttling.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
---
id: request-throttling
title: Request throttling
description: How to throttle requests per domain using the ThrottlingRequestManager.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import ThrottlingExample from '!!raw-loader!roa-loader!./code_examples/request_throttling/throttling_example.py';

When crawling websites that enforce rate limits (HTTP 429) or specify `crawl-delay` in their `robots.txt`, you need a way to throttle requests per domain without blocking unrelated domains. The <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> provides exactly this.

## Overview

The <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> wraps a <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> and manages per-domain throttling. You specify which domains to throttle at initialization, and the manager automatically:

- **Routes requests** for listed domains into dedicated sub-queues at insertion time.
- **Enforces delays** from HTTP 429 responses (exponential backoff) and `robots.txt` crawl-delay directives.
- **Schedules fairly** by fetching from the domain that has been waiting the longest.
- **Sleeps intelligently** when all configured domains are throttled, instead of busy-waiting.

Requests for domains **not** in the configured list pass through to the main queue without any throttling.

## Basic usage

To use request throttling, create a <ApiLink to="class/ThrottlingRequestManager">`ThrottlingRequestManager`</ApiLink> with the domains you want to throttle and pass it as the `request_manager` to your crawler:

<RunnableCodeBlock className="language-python" language="python">
{ThrottlingExample}
</RunnableCodeBlock>

## How it works

1. **Insertion-time routing**: When you add requests via `add_request` or `add_requests`, each request is checked against the configured domain list. Matching requests go directly into a per-domain sub-queue; all others go to the main queue. Because every request is routed to exactly one queue at insertion time, no request is ever duplicated across queues.

2. **429 backoff**: When the crawler detects an HTTP 429 response, the `ThrottlingRequestManager` records an exponential backoff delay for that domain (starting at 2s, doubling up to 60s). If the response includes a `Retry-After` header, that value takes priority.

3. **Crawl-delay**: If `robots.txt` specifies a `crawl-delay`, the manager enforces a minimum interval between requests to that domain.

4. **Fair scheduling**: `fetch_next_request` sorts available sub-queues by how long each domain has been waiting, ensuring no domain is starved.

:::tip

The `ThrottlingRequestManager` is an opt-in feature. If you don't pass it to your crawler, requests are processed normally without any per-domain throttling.

:::
41 changes: 41 additions & 0 deletions src/crawlee/_utils/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""HTTP utility functions for Crawlee."""

from __future__ import annotations

from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime


def parse_retry_after_header(value: str | None) -> timedelta | None:
    """Parse the Retry-After HTTP header value.

    The header can contain either a non-negative number of seconds or an HTTP-date.
    See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After

    Args:
        value: The raw Retry-After header value.

    Returns:
        A timedelta representing the delay, or None if the header is missing,
        unparsable, negative, or refers to a moment in the past.
    """
    if not value:
        return None

    # Try parsing as integer seconds first. Per RFC 9110, delay-seconds is a
    # non-negative integer, so reject negative values instead of returning a
    # negative delay (mirrors the past-date handling below).
    try:
        seconds = int(value)
    except ValueError:
        pass
    else:
        return timedelta(seconds=seconds) if seconds >= 0 else None

    # Try parsing as HTTP-date (e.g., "Wed, 21 Oct 2015 07:28:00 GMT").
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        return None

    # On some Python versions parsedate_to_datetime returns None for an
    # unparsable date instead of raising; guard against that explicitly.
    if retry_date is None:
        return None

    # Naive HTTP-dates are interpreted as UTC, matching the header convention.
    delay = retry_date - datetime.now(retry_date.tzinfo or timezone.utc)
    return delay if delay.total_seconds() > 0 else None
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,12 @@ async def _handle_status_code_response(
"""
status_code = context.http_response.status_code
if self._retry_on_blocked:
self._raise_for_session_blocked_status_code(context.session, status_code)
self._raise_for_session_blocked_status_code(
context.session,
status_code,
request_url=context.request.url,
retry_after_header=context.http_response.headers.get('retry-after'),
)
self._raise_for_error_status_code(status_code)
yield context

Expand Down
62 changes: 54 additions & 8 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
)
from crawlee._utils.docs import docs_group
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
from crawlee._utils.http import parse_retry_after_header
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
Expand All @@ -63,6 +64,7 @@
)
from crawlee.events._types import Event, EventCrawlerStatusData
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import ThrottlingRequestManager
from crawlee.router import Router
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics, StatisticsState
Expand Down Expand Up @@ -700,19 +702,30 @@ async def run(

self._running = True

if self._respect_robots_txt_file and not isinstance(self._request_manager, ThrottlingRequestManager):
self._logger.warning(
'The `respect_robots_txt_file` option is enabled, but the crawler is not using '
'`ThrottlingRequestManager`. Crawl-delay directives from robots.txt will not be '
'enforced. To enable crawl-delay support, configure the crawler to use '
'`ThrottlingRequestManager` as the request manager.'
)

if self._has_finished_before:
await self._statistics.reset()

if self._use_session_pool:
await self._session_pool.reset_store()

request_manager = await self.get_request_manager()
if purge_request_queue and isinstance(request_manager, RequestQueue):
await request_manager.drop()
self._request_manager = await RequestQueue.open(
storage_client=self._service_locator.get_storage_client(),
configuration=self._service_locator.get_configuration(),
)
if purge_request_queue:
if isinstance(request_manager, RequestQueue):
await request_manager.drop()
self._request_manager = await RequestQueue.open(
storage_client=self._service_locator.get_storage_client(),
configuration=self._service_locator.get_configuration(),
)
elif isinstance(request_manager, ThrottlingRequestManager):
self._request_manager = await request_manager.recreate_purged()

if requests is not None:
await self.add_requests(requests)
Expand Down Expand Up @@ -1442,6 +1455,10 @@ async def __run_task_function(self) -> None:

await self._mark_request_as_handled(request)

# Record successful request to reset rate limit backoff for this domain.
if isinstance(request_manager, ThrottlingRequestManager):
request_manager.record_success(request.url)

if session and session.is_usable:
session.mark_good()

Expand Down Expand Up @@ -1542,16 +1559,36 @@ def _raise_for_error_status_code(self, status_code: int) -> None:
if is_status_code_server_error(status_code) and not is_ignored_status:
raise HttpStatusCodeError('Error status code returned', status_code)

def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None:
def _raise_for_session_blocked_status_code(
self,
session: Session | None,
status_code: int,
*,
request_url: str = '',
retry_after_header: str | None = None,
) -> None:
"""Raise an exception if the given status code indicates the session is blocked.

If the status code is 429 (Too Many Requests), the domain is recorded as
rate-limited in the `ThrottlingRequestManager` for per-domain backoff.

Args:
session: The session used for the request. If None, no check is performed.
status_code: The HTTP status code to check.
request_url: The request URL, used for per-domain rate limit tracking.
retry_after_header: The value of the Retry-After response header, if present.

Raises:
SessionError: If the status code indicates the session is blocked.
"""
if status_code == 429 and request_url: # noqa: PLR2004
retry_after = parse_retry_after_header(retry_after_header)

# _request_manager might not be initialized yet if called directly or early,
# but usually it's set in get_request_manager().
if isinstance(self._request_manager, ThrottlingRequestManager):
self._request_manager.record_domain_delay(request_url, retry_after=retry_after)

if session is not None and session.is_blocked_status_code(
status_code=status_code,
ignore_http_error_status_codes=self._ignore_http_error_status_codes,
Expand Down Expand Up @@ -1582,7 +1619,16 @@ async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
if not self._respect_robots_txt_file:
return True
robots_txt_file = await self._get_robots_txt_file_for_url(url)
return not robots_txt_file or robots_txt_file.is_allowed(url)
if not robots_txt_file:
return True

# Wire robots.txt crawl-delay into ThrottlingRequestManager
if isinstance(self._request_manager, ThrottlingRequestManager):
crawl_delay = robots_txt_file.get_crawl_delay()
if crawl_delay is not None:
self._request_manager.set_crawl_delay(url, crawl_delay)

return robots_txt_file.is_allowed(url)

async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
"""Get the RobotsTxtFile for a given URL.
Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,12 @@ async def _handle_status_code_response(
"""
status_code = context.response.status
if self._retry_on_blocked:
self._raise_for_session_blocked_status_code(context.session, status_code)
self._raise_for_session_blocked_status_code(
context.session,
status_code,
request_url=context.request.url,
retry_after_header=context.response.headers.get('retry-after'),
)
self._raise_for_error_status_code(status_code)
yield context

Expand Down
10 changes: 9 additions & 1 deletion src/crawlee/request_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,13 @@
from ._request_manager import RequestManager
from ._request_manager_tandem import RequestManagerTandem
from ._sitemap_request_loader import SitemapRequestLoader
from ._throttling_request_manager import ThrottlingRequestManager

__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader']
# Public API of the request_loaders package, kept in alphabetical order.
__all__ = [
    'RequestList',
    'RequestLoader',
    'RequestManager',
    'RequestManagerTandem',
    'SitemapRequestLoader',
    'ThrottlingRequestManager',
]
Loading
Loading