import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.request_loaders import ThrottlingRequestManager
from crawlee.storages import RequestQueue


async def main() -> None:
    """Demonstrate per-domain request throttling with ThrottlingRequestManager."""
    # The default request queue backs both the throttler and the crawler.
    request_queue = await RequestQueue.open()

    # Only the listed domains are throttled; the wrapper shares the same
    # storage backend as the queue it wraps.
    request_manager = ThrottlingRequestManager(
        request_queue,
        domains=['api.example.com', 'slow-site.org'],
    )

    # The throttler stands in for a plain request queue.
    crawler = BasicCrawler(request_manager=request_manager)

    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    # Requests for configured domains land in throttled sub-queues;
    # everything else goes to the main queue.
    start_urls = [
        'https://api.example.com/data',
        'https://api.example.com/users',
        'https://slow-site.org/page1',
        'https://fast-site.com/page1',  # Not throttled
    ]
    await request_manager.add_requests(start_urls)

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ThrottlingExample from '!!raw-loader!roa-loader!./code_examples/request_throttling/throttling_example.py'; + +When crawling websites that enforce rate limits (HTTP 429) or specify `crawl-delay` in their `robots.txt`, you need a way to throttle requests per domain without blocking unrelated domains. The `ThrottlingRequestManager` provides exactly this. + +## Overview + +The `ThrottlingRequestManager` wraps a `RequestQueue` and manages per-domain throttling. You specify which domains to throttle at initialization, and the manager automatically: + +- **Routes requests** for listed domains into dedicated sub-queues at insertion time. +- **Enforces delays** from HTTP 429 responses (exponential backoff) and `robots.txt` crawl-delay directives. +- **Schedules fairly** by fetching from the domain that has been waiting the longest. +- **Sleeps intelligently** when all configured domains are throttled, instead of busy-waiting. + +Requests for domains **not** in the configured list pass through to the main queue without any throttling. + +## Basic usage + +To use request throttling, create a `ThrottlingRequestManager` with the domains you want to throttle and pass it as the `request_manager` to your crawler: + + + {ThrottlingExample} + + +## How it works + +1. **Insertion-time routing**: When you add requests via `add_request` or `add_requests`, each request is checked against the configured domain list. Matching requests go directly into a per-domain sub-queue; all others go to the main queue. This eliminates request duplication entirely. + +2. **429 backoff**: When the crawler detects an HTTP 429 response, the `ThrottlingRequestManager` records an exponential backoff delay for that domain (starting at 2s, doubling up to 60s). If the response includes a `Retry-After` header, that value takes priority. + +3. 
def parse_retry_after_header(value: str | None) -> timedelta | None:
    """Parse the Retry-After HTTP header value.

    The header can contain either a non-negative number of seconds or an HTTP-date.
    See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After

    Args:
        value: The raw Retry-After header value.

    Returns:
        A positive timedelta representing the delay, or None if the header is
        missing, unparsable, negative, or refers to a date in the past.
    """
    if not value:
        return None

    # Try parsing as integer seconds first. RFC 9110 only permits non-negative
    # delay-seconds, so reject negative values instead of returning a
    # nonsensical negative delay.
    try:
        seconds = int(value)
    except ValueError:
        pass
    else:
        return timedelta(seconds=seconds) if seconds >= 0 else None

    # Try parsing as HTTP-date (e.g., "Wed, 21 Oct 2015 07:28:00 GMT").
    try:
        retry_date = parsedate_to_datetime(value)
        # Naive dates are interpreted as UTC for the delay computation.
        delay = retry_date - datetime.now(retry_date.tzinfo or timezone.utc)
        if delay.total_seconds() > 0:
            return delay
    except (ValueError, TypeError):
        pass

    return None
def run( self._running = True + if self._respect_robots_txt_file and not isinstance(self._request_manager, ThrottlingRequestManager): + self._logger.warning( + 'The `respect_robots_txt_file` option is enabled, but the crawler is not using ' + '`ThrottlingRequestManager`. Crawl-delay directives from robots.txt will not be ' + 'enforced. To enable crawl-delay support, configure the crawler to use ' + '`ThrottlingRequestManager` as the request manager.' + ) + if self._has_finished_before: await self._statistics.reset() @@ -707,12 +717,15 @@ async def run( await self._session_pool.reset_store() request_manager = await self.get_request_manager() - if purge_request_queue and isinstance(request_manager, RequestQueue): - await request_manager.drop() - self._request_manager = await RequestQueue.open( - storage_client=self._service_locator.get_storage_client(), - configuration=self._service_locator.get_configuration(), - ) + if purge_request_queue: + if isinstance(request_manager, RequestQueue): + await request_manager.drop() + self._request_manager = await RequestQueue.open( + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + elif isinstance(request_manager, ThrottlingRequestManager): + self._request_manager = await request_manager.recreate_purged() if requests is not None: await self.add_requests(requests) @@ -1442,6 +1455,10 @@ async def __run_task_function(self) -> None: await self._mark_request_as_handled(request) + # Record successful request to reset rate limit backoff for this domain. 
+ if isinstance(request_manager, ThrottlingRequestManager): + request_manager.record_success(request.url) + if session and session.is_usable: session.mark_good() @@ -1542,16 +1559,36 @@ def _raise_for_error_status_code(self, status_code: int) -> None: if is_status_code_server_error(status_code) and not is_ignored_status: raise HttpStatusCodeError('Error status code returned', status_code) - def _raise_for_session_blocked_status_code(self, session: Session | None, status_code: int) -> None: + def _raise_for_session_blocked_status_code( + self, + session: Session | None, + status_code: int, + *, + request_url: str = '', + retry_after_header: str | None = None, + ) -> None: """Raise an exception if the given status code indicates the session is blocked. + If the status code is 429 (Too Many Requests), the domain is recorded as + rate-limited in the `ThrottlingRequestManager` for per-domain backoff. + Args: session: The session used for the request. If None, no check is performed. status_code: The HTTP status code to check. + request_url: The request URL, used for per-domain rate limit tracking. + retry_after_header: The value of the Retry-After response header, if present. Raises: SessionError: If the status code indicates the session is blocked. """ + if status_code == 429 and request_url: # noqa: PLR2004 + retry_after = parse_retry_after_header(retry_after_header) + + # _request_manager might not be initialized yet if called directly or early, + # but usually it's set in get_request_manager(). 
+ if isinstance(self._request_manager, ThrottlingRequestManager): + self._request_manager.record_domain_delay(request_url, retry_after=retry_after) + if session is not None and session.is_blocked_status_code( status_code=status_code, ignore_http_error_status_codes=self._ignore_http_error_status_codes, @@ -1582,7 +1619,16 @@ async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: if not self._respect_robots_txt_file: return True robots_txt_file = await self._get_robots_txt_file_for_url(url) - return not robots_txt_file or robots_txt_file.is_allowed(url) + if not robots_txt_file: + return True + + # Wire robots.txt crawl-delay into ThrottlingRequestManager + if isinstance(self._request_manager, ThrottlingRequestManager): + crawl_delay = robots_txt_file.get_crawl_delay() + if crawl_delay is not None: + self._request_manager.set_crawl_delay(url, crawl_delay) + + return robots_txt_file.is_allowed(url) async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: """Get the RobotsTxtFile for a given URL. 
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 6f4b2b0e9d..7025ee3b41 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -459,7 +459,12 @@ async def _handle_status_code_response( """ status_code = context.response.status if self._retry_on_blocked: - self._raise_for_session_blocked_status_code(context.session, status_code) + self._raise_for_session_blocked_status_code( + context.session, + status_code, + request_url=context.request.url, + retry_after_header=context.response.headers.get('retry-after'), + ) self._raise_for_error_status_code(status_code) yield context diff --git a/src/crawlee/request_loaders/__init__.py b/src/crawlee/request_loaders/__init__.py index c04d9aa810..6dd8cccfab 100644 --- a/src/crawlee/request_loaders/__init__.py +++ b/src/crawlee/request_loaders/__init__.py @@ -3,5 +3,13 @@ from ._request_manager import RequestManager from ._request_manager_tandem import RequestManagerTandem from ._sitemap_request_loader import SitemapRequestLoader +from ._throttling_request_manager import ThrottlingRequestManager -__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader'] +__all__ = [ + 'RequestList', + 'RequestLoader', + 'RequestManager', + 'RequestManagerTandem', + 'SitemapRequestLoader', + 'ThrottlingRequestManager', +] diff --git a/src/crawlee/request_loaders/_throttling_request_manager.py b/src/crawlee/request_loaders/_throttling_request_manager.py new file mode 100644 index 0000000000..9dcc1221d2 --- /dev/null +++ b/src/crawlee/request_loaders/_throttling_request_manager.py @@ -0,0 +1,434 @@ +"""A request manager wrapper that enforces per-domain delays. 
+ +Handles both HTTP 429 backoff and robots.txt crawl-delay at the scheduling layer, +routing requests for explicitly configured domains into dedicated sub-queues and +applying intelligent delay-aware scheduling. +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING +from urllib.parse import urlparse + +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.request_loaders._request_manager import RequestManager +from crawlee.storages import RequestQueue + +if TYPE_CHECKING: + from collections.abc import Sequence + + from crawlee._request import Request + from crawlee.storage_clients.models import ProcessedRequest + +from crawlee._service_locator import ServiceLocator +from crawlee._service_locator import service_locator as global_service_locator + +logger = getLogger(__name__) + + +@dataclass +class _DomainState: + """Tracks delay state for a single domain.""" + + domain: str + """The domain being tracked.""" + + throttled_until: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + """Earliest time the next request to this domain is allowed.""" + + consecutive_429_count: int = 0 + """Number of consecutive 429 responses (for exponential backoff).""" + + crawl_delay: timedelta | None = None + """Minimum interval between requests, used to push `throttled_until` on dispatch.""" + + +@docs_group('Request loaders') +class ThrottlingRequestManager(RequestManager): + """A request manager that wraps another and enforces per-domain delays. + + Requests for explicitly configured domains are routed into dedicated sub-queues + at insertion time — each request lives in exactly one queue, eliminating + duplication and simplifying deduplication. 
+ + When `fetch_next_request()` is called, it returns requests from the sub-queue + whose domain has been waiting the longest. If all configured domains are + throttled, it falls back to the inner queue for non-throttled domains. If the + inner queue is also empty and all sub-queues are throttled, it sleeps until the + earliest cooldown expires. + + Delay sources: + - HTTP 429 responses (via `record_domain_delay`) + - robots.txt crawl-delay directives (via `set_crawl_delay`) + + Example: + ```python + from crawlee.storages import RequestQueue + from crawlee.request_loaders import ThrottlingRequestManager + + queue = await RequestQueue.open() + throttler = ThrottlingRequestManager( + queue, + domains=['api.example.com', 'slow-site.org'], + ) + crawler = BasicCrawler(request_manager=throttler) + ``` + """ + + _BASE_DELAY = timedelta(seconds=2) + """Initial delay after the first 429 response from a domain.""" + + _MAX_DELAY = timedelta(seconds=60) + """Maximum delay between requests to a rate-limited domain.""" + + def __init__( + self, + inner: RequestManager, + *, + domains: Sequence[str], + service_locator: ServiceLocator | None = None, + ) -> None: + """Initialize the throttling manager. + + Args: + inner: The underlying request manager to wrap (typically a RequestQueue). + Requests for non-throttled domains are stored here. + domains: Explicit list of domain hostnames to throttle. Only requests + matching these domains will be routed to per-domain sub-queues. + service_locator: Service locator for creating sub-queues. If not + provided, defaults to the global service locator, ensuring + consistency with the crawler's storage backend. 
+ """ + self._inner = inner + self._service_locator = service_locator if service_locator is not None else global_service_locator + self._domain_states: dict[str, _DomainState] = {d: _DomainState(domain=d) for d in domains} + self._sub_queues: dict[str, RequestQueue] = {} + + @staticmethod + def _extract_domain(url: str) -> str: + """Extract the domain (hostname) from a URL.""" + parsed = urlparse(url) + return parsed.hostname or '' + + def _get_url_from_request(self, request: str | Request) -> str: + """Extract URL string from a request that may be a string or Request object.""" + if isinstance(request, str): + return request + return request.url + + async def _get_or_create_sub_queue(self, domain: str) -> RequestQueue: + """Get or create a per-domain sub-queue.""" + if domain not in self._sub_queues: + self._sub_queues[domain] = await RequestQueue.open( + alias=f'throttled-{domain}', + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + return self._sub_queues[domain] + + def _is_domain_throttled(self, domain: str) -> bool: + """Check if a domain is currently throttled.""" + state = self._domain_states.get(domain) + if state is None: + return False + return datetime.now(timezone.utc) < state.throttled_until + + def _get_earliest_available_time(self) -> datetime: + """Get the earliest time any throttled domain becomes available.""" + now = datetime.now(timezone.utc) + earliest = now + self._MAX_DELAY # Fallback upper bound. + + for state in self._domain_states.values(): + if state.throttled_until > now and state.throttled_until < earliest: + earliest = state.throttled_until + + return earliest + + def record_domain_delay(self, url: str, *, retry_after: timedelta | None = None) -> None: + """Record a 429 Too Many Requests response for the domain of the given URL. 
+ + Increments the consecutive 429 count and calculates the next allowed + request time using exponential backoff or the Retry-After value. + + Args: + url: The URL that received a 429 response. + retry_after: Optional delay from the Retry-After header. If provided, + it takes priority over the calculated exponential backoff. + """ + domain = self._extract_domain(url) + if not domain: + return + + state = self._domain_states.get(domain) + if state is None: + return + + now = datetime.now(timezone.utc) + state.consecutive_429_count += 1 + + # Calculate delay: use Retry-After if provided, otherwise exponential backoff. + delay = retry_after if retry_after is not None else self._BASE_DELAY * (2 ** (state.consecutive_429_count - 1)) + + # Cap the delay. + delay = min(delay, self._MAX_DELAY) + + state.throttled_until = now + delay + + logger.info( + f'Rate limit (429) detected for domain "{domain}" ' + f'(consecutive: {state.consecutive_429_count}, delay: {delay.total_seconds():.1f}s)' + ) + + def record_success(self, url: str) -> None: + """Record a successful request, resetting the backoff state for that domain. + + Args: + url: The URL that received a successful response. + """ + domain = self._extract_domain(url) + if not domain: + return + + state = self._domain_states.get(domain) + + if state is not None and state.consecutive_429_count > 0: + logger.debug(f'Resetting rate limit state for domain "{domain}" after successful request') + state.consecutive_429_count = 0 + + def set_crawl_delay(self, url: str, delay_seconds: int) -> None: + """Set the robots.txt crawl-delay for a domain. + + Args: + url: A URL from the domain to throttle. + delay_seconds: The crawl-delay value in seconds. 
+ """ + domain = self._extract_domain(url) + if not domain: + return + + state = self._domain_states.get(domain) + if state is None: + return + + state.crawl_delay = timedelta(seconds=delay_seconds) + + logger.debug(f'Set crawl-delay for domain "{domain}" to {delay_seconds}s') + + def _mark_domain_dispatched(self, url: str) -> None: + """Record that a request to this domain was just dispatched. + + If a crawl-delay is configured, push throttled_until forward by that amount. + """ + domain = self._extract_domain(url) + if not domain: + return + + state = self._domain_states.get(domain) + if state is None: + return + + # If crawl-delay is set, enforce minimum interval by pushing throttled_until. + if state.crawl_delay is not None: + state.throttled_until = datetime.now(timezone.utc) + state.crawl_delay + + async def recreate_purged(self) -> ThrottlingRequestManager: + """Drop all queues and return a fresh ThrottlingRequestManager with the same configuration. + + This is used during crawler purge to reconstruct the throttler with empty + queues while preserving domain configuration and service locator. + """ + await self.drop() + + inner = await RequestQueue.open( + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + + return ThrottlingRequestManager( + inner, + domains=list(self._domain_states.keys()), + service_locator=self._service_locator, + ) + + @override + async def drop(self) -> None: + await self._inner.drop() + for sq in self._sub_queues.values(): + await sq.drop() + self._sub_queues.clear() + + @override + async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest: + """Add a request, routing it to the appropriate queue. + + Requests for explicitly configured domains are routed directly to their + per-domain sub-queue. All other requests go to the inner queue. 
+ """ + url = self._get_url_from_request(request) + domain = self._extract_domain(url) + + if domain in self._domain_states: + sq = await self._get_or_create_sub_queue(domain) + result = await sq.add_request(request, forefront=forefront) + else: + result = await self._inner.add_request(request, forefront=forefront) + + if result is None: + msg = 'add_request unexpectedly returned None' + raise RuntimeError(msg) + + return result + + @override + async def add_requests( + self, + requests: Sequence[str | Request], + *, + forefront: bool = False, + batch_size: int = 1000, + wait_time_between_batches: timedelta = timedelta(seconds=1), + wait_for_all_requests_to_be_added: bool = False, + wait_for_all_requests_to_be_added_timeout: timedelta | None = None, + ) -> None: + """Add multiple requests, routing each to the appropriate queue.""" + inner_requests: list[str | Request] = [] + domain_requests: dict[str, list[str | Request]] = {} + + for request in requests: + url = self._get_url_from_request(request) + domain = self._extract_domain(url) + + if domain in self._domain_states: + domain_requests.setdefault(domain, []).append(request) + else: + inner_requests.append(request) + + # Add non-throttled requests to inner queue. + if inner_requests: + await self._inner.add_requests( + inner_requests, + forefront=forefront, + batch_size=batch_size, + wait_time_between_batches=wait_time_between_batches, + wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, + wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, + ) + + # Add throttled requests to their respective sub-queues. 
+ for domain, reqs in domain_requests.items(): + sq = await self._get_or_create_sub_queue(domain) + await sq.add_requests( + reqs, + forefront=forefront, + batch_size=batch_size, + wait_time_between_batches=wait_time_between_batches, + wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, + wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, + ) + + @override + async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None: + domain = self._extract_domain(request.url) + if domain in self._domain_states and domain in self._sub_queues: + return await self._sub_queues[domain].reclaim_request(request, forefront=forefront) + return await self._inner.reclaim_request(request, forefront=forefront) + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + domain = self._extract_domain(request.url) + if domain in self._domain_states and domain in self._sub_queues: + return await self._sub_queues[domain].mark_request_as_handled(request) + return await self._inner.mark_request_as_handled(request) + + @override + async def get_handled_count(self) -> int: + count = await self._inner.get_handled_count() + for sq in self._sub_queues.values(): + count += await sq.get_handled_count() + return count + + @override + async def get_total_count(self) -> int: + count = await self._inner.get_total_count() + for sq in self._sub_queues.values(): + count += await sq.get_total_count() + return count + + @override + async def is_empty(self) -> bool: + if not await self._inner.is_empty(): + return False + for sq in self._sub_queues.values(): + if not await sq.is_empty(): + return False + return True + + @override + async def is_finished(self) -> bool: + if not await self._inner.is_finished(): + return False + for sq in self._sub_queues.values(): + if not await sq.is_finished(): + return False + return True + + @override + async def fetch_next_request(self) -> 
Request | None: + """Fetch the next request, respecting per-domain delays. + + Sub-queues are checked in order of longest-overdue domain first + (sorted by `throttled_until` ascending). If all configured domains are + throttled, falls back to the inner queue for non-throttled domains. + If the inner queue is also empty and all sub-queues are throttled, + sleeps until the earliest domain becomes available. + """ + # Collect unthrottled domains and sort by throttled_until (longest-overdue first). + available_domains = [ + domain + for domain in self._domain_states + if domain in self._sub_queues and not self._is_domain_throttled(domain) + ] + available_domains.sort( + key=lambda d: self._domain_states[d].throttled_until, + ) + + for domain in available_domains: + sq = self._sub_queues[domain] + req = await sq.fetch_next_request() + if req: + self._mark_domain_dispatched(req.url) + return req + + # Try fetching from the inner queue (non-throttled domains). + request = await self._inner.fetch_next_request() + if request is not None: + return request + + # No requests in inner queue. Check if any sub-queues still have requests. + have_sq_requests = False + for sq in self._sub_queues.values(): + if not await sq.is_empty(): + have_sq_requests = True + break + + if have_sq_requests: + # Requests exist but all domains are throttled and inner is empty. Sleep and retry. + earliest = self._get_earliest_available_time() + sleep_duration = max( + (earliest - datetime.now(timezone.utc)).total_seconds(), + 0.1, # Minimum sleep to avoid tight loops. + ) + logger.debug( + f'All configured domains are throttled and inner queue is empty. ' + f'Sleeping {sleep_duration:.1f}s until earliest domain is available.' 
+ ) + await asyncio.sleep(sleep_duration) + return await self.fetch_next_request() + + return None diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 23ca3c1eca..2b3ed72ccd 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1238,7 +1238,8 @@ async def test_crawler_uses_default_storages(tmp_path: Path) -> None: assert dataset is await crawler.get_dataset() assert kvs is await crawler.get_key_value_store() - assert rq is await crawler.get_request_manager() + manager = await crawler.get_request_manager() + assert manager is rq async def test_crawler_can_use_other_storages(tmp_path: Path) -> None: @@ -1256,7 +1257,8 @@ async def test_crawler_can_use_other_storages(tmp_path: Path) -> None: assert dataset is not await crawler.get_dataset() assert kvs is not await crawler.get_key_value_store() - assert rq is not await crawler.get_request_manager() + manager = await crawler.get_request_manager() + assert manager is not rq async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> None: @@ -1293,7 +1295,8 @@ async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> No # Assert that the storages are different assert dataset is not await crawler.get_dataset() assert kvs is not await crawler.get_key_value_store() - assert rq is not await crawler.get_request_manager() + manager = await crawler.get_request_manager() + assert manager is not rq # Assert that all storages exists on the filesystem for path in expected_paths: diff --git a/tests/unit/test_throttling_request_manager.py b/tests/unit/test_throttling_request_manager.py new file mode 100644 index 0000000000..30d17cd782 --- /dev/null +++ b/tests/unit/test_throttling_request_manager.py @@ -0,0 +1,467 @@ +"""Tests for ThrottlingRequestManager - per-domain delay scheduling.""" + +from __future__ import annotations + +from datetime import datetime, 
from datetime import datetime, timedelta, timezone
from typing import Any
from unittest.mock import AsyncMock, patch

import pytest

from crawlee._request import Request
from crawlee._service_locator import ServiceLocator
from crawlee._utils.http import parse_retry_after_header
from crawlee.request_loaders._throttling_request_manager import ThrottlingRequestManager
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import RequestQueue

THROTTLED_DOMAIN = 'throttled.com'
NON_THROTTLED_DOMAIN = 'free.com'
TEST_DOMAINS = [THROTTLED_DOMAIN]


@pytest.fixture
def memory_storage_client() -> MemoryStorageClient:
    """Provide an in-memory storage client so tests never touch persistent storage."""
    return MemoryStorageClient()


@pytest.fixture
def service_locator(memory_storage_client: MemoryStorageClient) -> ServiceLocator:
    """Provide a ServiceLocator wired to the in-memory storage client."""
    return ServiceLocator(storage_client=memory_storage_client)


@pytest.fixture
async def inner_queue(memory_storage_client: MemoryStorageClient) -> RequestQueue:
    """Open a real RequestQueue backed by the in-memory storage client."""
    return await RequestQueue.open(name='test-inner', storage_client=memory_storage_client)


@pytest.fixture
async def manager(inner_queue: RequestQueue, service_locator: ServiceLocator) -> ThrottlingRequestManager:
    """Wrap the inner queue in a ThrottlingRequestManager configured with the test domains."""
    return ThrottlingRequestManager(inner_queue, domains=TEST_DOMAINS, service_locator=service_locator)


def _make_request(url: str) -> Request:
    """Build a Request object for the given URL."""
    return Request.from_url(url)


# --- Request routing ---------------------------------------------------------


@pytest.mark.asyncio
async def test_add_request_routes_listed_domain_to_sub_queue(
    manager: ThrottlingRequestManager,
    inner_queue: RequestQueue,
) -> None:
    """A request for a configured domain must land in its sub-queue, not the inner queue."""
    req = _make_request(f'https://{THROTTLED_DOMAIN}/page1')
    await manager.add_request(req)

    # The inner queue stays empty because the request was routed away at insertion time.
    assert await inner_queue.is_empty()
    assert THROTTLED_DOMAIN in manager._sub_queues

    # The domain's sub-queue holds exactly that one request.
    assert await manager._sub_queues[THROTTLED_DOMAIN].get_total_count() == 1


@pytest.mark.asyncio
async def test_add_request_routes_non_listed_domain_to_inner(
    manager: ThrottlingRequestManager,
    inner_queue: RequestQueue,
) -> None:
    """A request for an unlisted domain must pass straight through to the inner queue."""
    req = _make_request(f'https://{NON_THROTTLED_DOMAIN}/page1')
    await manager.add_request(req)

    assert not await inner_queue.is_empty()
    # No sub-queue is created for domains that are not configured.
    assert NON_THROTTLED_DOMAIN not in manager._sub_queues


@pytest.mark.asyncio
async def test_add_request_with_string_url(
    manager: ThrottlingRequestManager,
) -> None:
    """add_request accepts a plain URL string and routes it like a Request object."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    assert THROTTLED_DOMAIN in manager._sub_queues
    assert await manager._sub_queues[THROTTLED_DOMAIN].get_total_count() == 1


@pytest.mark.asyncio
async def test_add_requests_routes_mixed_domains(
    manager: ThrottlingRequestManager,
    inner_queue: RequestQueue,
) -> None:
    """add_requests splits a mixed batch between sub-queues and the inner queue."""
    listed = _make_request(f'https://{THROTTLED_DOMAIN}/page1')
    unlisted = _make_request(f'https://{NON_THROTTLED_DOMAIN}/page1')

    await manager.add_requests([listed, unlisted])

    # The unlisted-domain request ends up in the inner queue.
    assert not await inner_queue.is_empty()

    # The listed-domain request ends up in its own sub-queue.
    assert THROTTLED_DOMAIN in manager._sub_queues
    assert await manager._sub_queues[THROTTLED_DOMAIN].get_total_count() == 1


# --- Core throttling ---------------------------------------------------------


@pytest.mark.asyncio
async def test_429_triggers_domain_delay(manager: ThrottlingRequestManager) -> None:
    """Recording a delay (as done on a 429) marks the domain as throttled."""
    manager.record_domain_delay(f'https://{THROTTLED_DOMAIN}/page1')
    assert manager._is_domain_throttled(THROTTLED_DOMAIN)


@pytest.mark.asyncio
async def test_different_domains_independent(manager: ThrottlingRequestManager) -> None:
    """Throttling one domain leaves every other domain untouched."""
    manager.record_domain_delay(f'https://{THROTTLED_DOMAIN}/page1')
    assert manager._is_domain_throttled(THROTTLED_DOMAIN)
    assert not manager._is_domain_throttled(NON_THROTTLED_DOMAIN)


@pytest.mark.asyncio
async def test_exponential_backoff(manager: ThrottlingRequestManager) -> None:
    """Back-to-back 429 recordings push the throttle deadline further out each time."""
    page = f'https://{THROTTLED_DOMAIN}/page1'

    manager.record_domain_delay(page)
    state = manager._domain_states[THROTTLED_DOMAIN]
    deadline_after_first = state.throttled_until

    manager.record_domain_delay(page)
    deadline_after_second = state.throttled_until

    assert deadline_after_second > deadline_after_first
    assert state.consecutive_429_count == 2


@pytest.mark.asyncio
async def test_max_delay_cap(manager: ThrottlingRequestManager) -> None:
    """Repeated 429s never push the delay beyond _MAX_DELAY."""
    page = f'https://{THROTTLED_DOMAIN}/page1'

    for _ in range(20):
        manager.record_domain_delay(page)

    state = manager._domain_states[THROTTLED_DOMAIN]
    remaining = state.throttled_until - datetime.now(timezone.utc)

    # One second of slack accounts for wall-clock time spent in the loop above.
    assert remaining <= manager._MAX_DELAY + timedelta(seconds=1)


@pytest.mark.asyncio
async def test_retry_after_header_priority(manager: ThrottlingRequestManager) -> None:
    """An explicit Retry-After value wins over the exponential backoff schedule."""
    page = f'https://{THROTTLED_DOMAIN}/page1'

    manager.record_domain_delay(page, retry_after=timedelta(seconds=30))

    state = manager._domain_states[THROTTLED_DOMAIN]
    remaining = state.throttled_until - datetime.now(timezone.utc)

    # The deadline should sit close to the requested 30 seconds (with slack for test runtime).
    assert remaining > timedelta(seconds=28)
    assert remaining <= timedelta(seconds=31)


@pytest.mark.asyncio
async def test_success_resets_backoff(manager: ThrottlingRequestManager) -> None:
    """A recorded success zeroes the consecutive-429 counter for the domain."""
    page = f'https://{THROTTLED_DOMAIN}/page1'

    manager.record_domain_delay(page)
    manager.record_domain_delay(page)
    assert manager._domain_states[THROTTLED_DOMAIN].consecutive_429_count == 2

    manager.record_success(page)
    assert manager._domain_states[THROTTLED_DOMAIN].consecutive_429_count == 0


# --- Crawl-delay integration -------------------------------------------------


@pytest.mark.asyncio
async def test_crawl_delay_integration(manager: ThrottlingRequestManager) -> None:
    """set_crawl_delay() stores the delay on the domain's state."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    manager.set_crawl_delay(page, 5)

    state = manager._domain_states[THROTTLED_DOMAIN]
    assert state.crawl_delay == timedelta(seconds=5)


@pytest.mark.asyncio
async def test_crawl_delay_throttles_after_dispatch(manager: ThrottlingRequestManager) -> None:
    """Once a request has been dispatched, the crawl-delay throttles the next one."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    manager.set_crawl_delay(page, 5)

    manager._mark_domain_dispatched(page)

    assert manager._is_domain_throttled(THROTTLED_DOMAIN)


# --- Fetch scheduling --------------------------------------------------------


@pytest.mark.asyncio
async def test_fetch_from_unthrottled_sub_queue(
    manager: ThrottlingRequestManager,
) -> None:
    """fetch_next_request serves a request from a sub-queue that is not throttled."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    fetched = await manager.fetch_next_request()

    assert fetched is not None
    assert fetched.url == page


@pytest.mark.asyncio
async def test_fetch_falls_back_to_inner(
    manager: ThrottlingRequestManager,
) -> None:
    """With no sub-queue work available, fetch_next_request serves the inner queue."""
    page = f'https://{NON_THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    fetched = await manager.fetch_next_request()

    assert fetched is not None
    assert fetched.url == page


@pytest.mark.asyncio
async def test_fetch_skips_throttled_sub_queue(
    manager: ThrottlingRequestManager,
) -> None:
    """A throttled sub-queue is skipped and the fetch falls through to the inner queue."""
    # Queue work for both domains, then throttle the listed one.
    listed_page = f'https://{THROTTLED_DOMAIN}/page1'
    unlisted_page = f'https://{NON_THROTTLED_DOMAIN}/page1'

    await manager.add_request(listed_page)
    await manager.add_request(unlisted_page)

    manager.record_domain_delay(listed_page)

    fetched = await manager.fetch_next_request()

    assert fetched is not None
    assert fetched.url == unlisted_page


@pytest.mark.asyncio
async def test_sleep_when_all_throttled(manager: ThrottlingRequestManager) -> None:
    """With every domain throttled and the inner queue empty, fetch sleeps once and retries."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    manager.record_domain_delay(page, retry_after=timedelta(seconds=10))

    target = 'crawlee.request_loaders._throttling_request_manager.asyncio.sleep'
    with patch(target, new_callable=AsyncMock) as mock_sleep:

        async def unblock_domain(*_args: Any, **_kwargs: Any) -> None:
            # Simulate the throttle window expiring while "asleep".
            manager._domain_states[THROTTLED_DOMAIN].throttled_until = datetime.now(timezone.utc)

        mock_sleep.side_effect = unblock_domain

        fetched = await manager.fetch_next_request()

        mock_sleep.assert_called_once()
        assert fetched is not None
        assert fetched.url == page


# --- Delegation --------------------------------------------------------------


@pytest.mark.asyncio
async def test_reclaim_request_routes_to_sub_queue(
    manager: ThrottlingRequestManager,
) -> None:
    """A reclaimed request for a listed domain goes back into its sub-queue."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    # Take the request out before handing it back.
    in_flight = await manager.fetch_next_request()
    assert in_flight is not None

    await manager.reclaim_request(in_flight)

    assert not await manager._sub_queues[THROTTLED_DOMAIN].is_empty()


@pytest.mark.asyncio
async def test_reclaim_request_routes_to_inner(
    manager: ThrottlingRequestManager,
    inner_queue: RequestQueue,
) -> None:
    """A reclaimed request for an unlisted domain goes back into the inner queue."""
    page = f'https://{NON_THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    in_flight = await manager.fetch_next_request()
    assert in_flight is not None

    await manager.reclaim_request(in_flight)

    assert not await inner_queue.is_empty()


@pytest.mark.asyncio
async def test_mark_request_as_handled_routes_to_sub_queue(
    manager: ThrottlingRequestManager,
) -> None:
    """Handling a listed-domain request is recorded on its sub-queue."""
    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    in_flight = await manager.fetch_next_request()
    assert in_flight is not None

    await manager.mark_request_as_handled(in_flight)

    assert await manager._sub_queues[THROTTLED_DOMAIN].get_handled_count() == 1


@pytest.mark.asyncio
async def test_mark_request_as_handled_routes_to_inner(
    manager: ThrottlingRequestManager,
    inner_queue: RequestQueue,
) -> None:
    """Handling an unlisted-domain request is recorded on the inner queue."""
    page = f'https://{NON_THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)

    in_flight = await manager.fetch_next_request()
    assert in_flight is not None

    await manager.mark_request_as_handled(in_flight)

    assert await inner_queue.get_handled_count() == 1


@pytest.mark.asyncio
async def test_get_handled_count_aggregates(manager: ThrottlingRequestManager) -> None:
    """get_handled_count sums the inner queue and every sub-queue."""
    listed_page = f'https://{THROTTLED_DOMAIN}/page1'
    unlisted_page = f'https://{NON_THROTTLED_DOMAIN}/page1'

    await manager.add_request(listed_page)
    await manager.add_request(unlisted_page)

    # Drain and handle both requests.
    first = await manager.fetch_next_request()
    assert first is not None
    await manager.mark_request_as_handled(first)

    second = await manager.fetch_next_request()
    assert second is not None
    await manager.mark_request_as_handled(second)

    assert await manager.get_handled_count() == 2


@pytest.mark.asyncio
async def test_get_total_count_aggregates(manager: ThrottlingRequestManager) -> None:
    """get_total_count sums the inner queue and every sub-queue."""
    listed_page = f'https://{THROTTLED_DOMAIN}/page1'
    unlisted_page = f'https://{NON_THROTTLED_DOMAIN}/page1'

    await manager.add_request(listed_page)
    await manager.add_request(unlisted_page)

    assert await manager.get_total_count() == 2


@pytest.mark.asyncio
async def test_is_empty_aggregates(manager: ThrottlingRequestManager) -> None:
    """is_empty is False as soon as any underlying queue has a request."""
    assert await manager.is_empty() is True

    await manager.add_request(f'https://{THROTTLED_DOMAIN}/page1')
    assert await manager.is_empty() is False


@pytest.mark.asyncio
async def test_is_finished_aggregates(manager: ThrottlingRequestManager) -> None:
    """is_finished is True only once every underlying queue is finished."""
    assert await manager.is_finished() is True

    page = f'https://{THROTTLED_DOMAIN}/page1'
    await manager.add_request(page)
    assert await manager.is_finished() is False

    in_flight = await manager.fetch_next_request()
    assert in_flight is not None
    await manager.mark_request_as_handled(in_flight)

    assert await manager.is_finished() is True

+@pytest.mark.asyncio +async def test_drop_clears_all( + manager: ThrottlingRequestManager, +) -> None: + """drop should clear inner, all sub-queues.""" + await manager.add_request(f'https://{THROTTLED_DOMAIN}/page1') + + assert THROTTLED_DOMAIN in manager._sub_queues + + await manager.drop() + + assert len(manager._sub_queues) == 0 + + +@pytest.mark.asyncio +async def test_recreate_purged( + manager: ThrottlingRequestManager, +) -> None: + """recreate_purged should return a fresh manager with the same domain configuration.""" + await manager.add_request(f'https://{THROTTLED_DOMAIN}/page1') + + new_manager = await manager.recreate_purged() + + # Same domains should be configured. + assert THROTTLED_DOMAIN in new_manager._domain_states + # But queues should be empty. + assert await new_manager.is_empty() + + +# ── Utility Tests ────────────────────────────────────── + + +def test_parse_retry_after_none_value() -> None: + assert parse_retry_after_header(None) is None + + +def test_parse_retry_after_empty_string() -> None: + assert parse_retry_after_header('') is None + + +def test_parse_retry_after_integer_seconds() -> None: + result = parse_retry_after_header('120') + assert result == timedelta(seconds=120) + + +def test_parse_retry_after_invalid_value() -> None: + assert parse_retry_after_header('not-a-date-or-number') is None diff --git a/uv.lock b/uv.lock index 58630331b1..7c5ce04eca 100644 --- a/uv.lock +++ b/uv.lock @@ -346,6 +346,11 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/84/85/57c314a6b35336efbbdc13e5fc9ae13f6b60a0647cfa7c1221178ac6d8ae/brotlicffi-1.2.0.0.tar.gz", hash = "sha256:34345d8d1f9d534fcac2249e57a4c3c8801a33c9942ff9f8574f67a175e17adb", size = 476682, upload-time = "2025-11-21T18:17:57.334Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/87/ba6298c3d7f8d66ce80d7a487f2a487ebae74a79c6049c7c2990178ce529/brotlicffi-1.2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:b13fb476a96f02e477a506423cb5e7bc21e0e3ac4c060c20ba31c44056e38c68", size = 433038, upload-time = "2026-03-05T17:57:37.96Z" }, + { url = "https://files.pythonhosted.org/packages/00/49/16c7a77d1cae0519953ef0389a11a9c2e2e62e87d04f8e7afbae40124255/brotlicffi-1.2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17db36fb581f7b951635cd6849553a95c6f2f53c1a707817d06eae5aeff5f6af", size = 1541124, upload-time = "2026-03-05T17:57:39.488Z" }, + { url = "https://files.pythonhosted.org/packages/e8/17/fab2c36ea820e2288f8c1bf562de1b6cd9f30e28d66f1ce2929a4baff6de/brotlicffi-1.2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:40190192790489a7b054312163d0ce82b07d1b6e706251036898ce1684ef12e9", size = 1541983, upload-time = "2026-03-05T17:57:41.061Z" }, + { url = "https://files.pythonhosted.org/packages/78/c9/849a669b3b3bb8ac96005cdef04df4db658c33443a7fc704a6d4a2f07a56/brotlicffi-1.2.0.0-cp314-cp314t-win32.whl", hash = "sha256:a8079e8ecc32ecef728036a1d9b7105991ce6a5385cf51ee8c02297c90fb08c2", size = 349046, upload-time = "2026-03-05T17:57:42.76Z" }, + { url = "https://files.pythonhosted.org/packages/a4/25/09c0fd21cfc451fa38ad538f4d18d8be566746531f7f27143f63f8c45a9f/brotlicffi-1.2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:ca90c4266704ca0a94de8f101b4ec029624273380574e4cf19301acfa46c61a0", size = 385653, upload-time = "2026-03-05T17:57:44.224Z" }, { url = "https://files.pythonhosted.org/packages/e4/df/a72b284d8c7bef0ed5756b41c2eb7d0219a1dd6ac6762f1c7bdbc31ef3af/brotlicffi-1.2.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:9458d08a7ccde8e3c0afedbf2c70a8263227a68dea5ab13590593f4c0a4fd5f4", size = 432340, upload-time = "2025-11-21T18:17:42.277Z" }, { url = "https://files.pythonhosted.org/packages/74/2b/cc55a2d1d6fb4f5d458fba44a3d3f91fb4320aa14145799fd3a996af0686/brotlicffi-1.2.0.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", 
hash = "sha256:84e3d0020cf1bd8b8131f4a07819edee9f283721566fe044a20ec792ca8fd8b7", size = 1534002, upload-time = "2025-11-21T18:17:43.746Z" }, { url = "https://files.pythonhosted.org/packages/e4/9c/d51486bf366fc7d6735f0e46b5b96ca58dc005b250263525a1eea3cd5d21/brotlicffi-1.2.0.0-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:33cfb408d0cff64cd50bef268c0fed397c46fbb53944aa37264148614a62e990", size = 1536547, upload-time = "2025-11-21T18:17:45.729Z" }, @@ -3868,26 +3873,26 @@ wheels = [ [[package]] name = "ty" -version = "0.0.17" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/c3/41ae6346443eedb65b96761abfab890a48ce2aa5a8a27af69c5c5d99064d/ty-0.0.17.tar.gz", hash = "sha256:847ed6c120913e280bf9b54d8eaa7a1049708acb8824ad234e71498e8ad09f97", size = 5167209, upload-time = "2026-02-13T13:26:36.835Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/01/0ef15c22a1c54b0f728ceff3f62d478dbf8b0dcf8ff7b80b954f79584f3e/ty-0.0.17-py3-none-linux_armv6l.whl", hash = "sha256:64a9a16555cc8867d35c2647c2f1afbd3cae55f68fd95283a574d1bb04fe93e0", size = 10192793, upload-time = "2026-02-13T13:27:13.943Z" }, - { url = "https://files.pythonhosted.org/packages/0f/2c/f4c322d9cded56edc016b1092c14b95cf58c8a33b4787316ea752bb9418e/ty-0.0.17-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:eb2dbd8acd5c5a55f4af0d479523e7c7265a88542efe73ed3d696eb1ba7b6454", size = 10051977, upload-time = "2026-02-13T13:26:57.741Z" }, - { url = "https://files.pythonhosted.org/packages/4c/a5/43746c1ff81e784f5fc303afc61fe5bcd85d0fcf3ef65cb2cef78c7486c7/ty-0.0.17-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f18f5fd927bc628deb9ea2df40f06b5f79c5ccf355db732025a3e8e7152801f6", size = 9564639, upload-time = "2026-02-13T13:26:42.781Z" }, - { url = 
"https://files.pythonhosted.org/packages/d6/b8/280b04e14a9c0474af574f929fba2398b5e1c123c1e7735893b4cd73d13c/ty-0.0.17-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5383814d1d7a5cc53b3b07661856bab04bb2aac7a677c8d33c55169acdaa83df", size = 10061204, upload-time = "2026-02-13T13:27:00.152Z" }, - { url = "https://files.pythonhosted.org/packages/2a/d7/493e1607d8dfe48288d8a768a2adc38ee27ef50e57f0af41ff273987cda0/ty-0.0.17-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9c20423b8744b484f93e7bf2ef8a9724bca2657873593f9f41d08bd9f83444c9", size = 10013116, upload-time = "2026-02-13T13:26:34.543Z" }, - { url = "https://files.pythonhosted.org/packages/80/ef/22f3ed401520afac90dbdf1f9b8b7755d85b0d5c35c1cb35cf5bd11b59c2/ty-0.0.17-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6f5b1aba97db9af86517b911674b02f5bc310750485dc47603a105bd0e83ddd", size = 10533623, upload-time = "2026-02-13T13:26:31.449Z" }, - { url = "https://files.pythonhosted.org/packages/75/ce/744b15279a11ac7138832e3a55595706b4a8a209c9f878e3ab8e571d9032/ty-0.0.17-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:488bce1a9bea80b851a97cd34c4d2ffcd69593d6c3f54a72ae02e5c6e47f3d0c", size = 11069750, upload-time = "2026-02-13T13:26:48.638Z" }, - { url = "https://files.pythonhosted.org/packages/f2/be/1133c91f15a0e00d466c24f80df486d630d95d1b2af63296941f7473812f/ty-0.0.17-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8df66b91ec84239420985ec215e7f7549bfda2ac036a3b3c065f119d1c06825a", size = 10870862, upload-time = "2026-02-13T13:26:54.715Z" }, - { url = "https://files.pythonhosted.org/packages/3e/4a/a2ed209ef215b62b2d3246e07e833081e07d913adf7e0448fc204be443d6/ty-0.0.17-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:002139e807c53002790dfefe6e2f45ab0e04012e76db3d7c8286f96ec121af8f", size = 10628118, upload-time = "2026-02-13T13:26:45.439Z" }, - { url = 
"https://files.pythonhosted.org/packages/b3/0c/87476004cb5228e9719b98afffad82c3ef1f84334bde8527bcacba7b18cb/ty-0.0.17-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6c4e01f05ce82e5d489ab3900ca0899a56c4ccb52659453780c83e5b19e2b64c", size = 10038185, upload-time = "2026-02-13T13:27:02.693Z" }, - { url = "https://files.pythonhosted.org/packages/46/4b/98f0b3ba9aef53c1f0305519536967a4aa793a69ed72677b0a625c5313ac/ty-0.0.17-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:2b226dd1e99c0d2152d218c7e440150d1a47ce3c431871f0efa073bbf899e881", size = 10047644, upload-time = "2026-02-13T13:27:05.474Z" }, - { url = "https://files.pythonhosted.org/packages/93/e0/06737bb80aa1a9103b8651d2eb691a7e53f1ed54111152be25f4a02745db/ty-0.0.17-py3-none-musllinux_1_2_i686.whl", hash = "sha256:8b11f1da7859e0ad69e84b3c5ef9a7b055ceed376a432fad44231bdfc48061c2", size = 10231140, upload-time = "2026-02-13T13:27:10.844Z" }, - { url = "https://files.pythonhosted.org/packages/7c/79/e2a606bd8852383ba9abfdd578f4a227bd18504145381a10a5f886b4e751/ty-0.0.17-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c04e196809ff570559054d3e011425fd7c04161529eb551b3625654e5f2434cb", size = 10718344, upload-time = "2026-02-13T13:26:51.66Z" }, - { url = "https://files.pythonhosted.org/packages/c5/2d/2663984ac11de6d78f74432b8b14ba64d170b45194312852b7543cf7fd56/ty-0.0.17-py3-none-win32.whl", hash = "sha256:305b6ed150b2740d00a817b193373d21f0767e10f94ac47abfc3b2e5a5aec809", size = 9672932, upload-time = "2026-02-13T13:27:08.522Z" }, - { url = "https://files.pythonhosted.org/packages/de/b5/39be78f30b31ee9f5a585969930c7248354db90494ff5e3d0756560fb731/ty-0.0.17-py3-none-win_amd64.whl", hash = "sha256:531828267527aee7a63e972f54e5eee21d9281b72baf18e5c2850c6b862add83", size = 10542138, upload-time = "2026-02-13T13:27:17.084Z" }, - { url = "https://files.pythonhosted.org/packages/40/b7/f875c729c5d0079640c75bad2c7e5d43edc90f16ba242f28a11966df8f65/ty-0.0.17-py3-none-win_arm64.whl", hash = 
"sha256:de9810234c0c8d75073457e10a84825b9cd72e6629826b7f01c7a0b266ae25b1", size = 10023068, upload-time = "2026-02-13T13:26:39.637Z" }, +version = "0.0.18" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/74/15/9682700d8d60fdca7afa4febc83a2354b29cdcd56e66e19c92b521db3b39/ty-0.0.18.tar.gz", hash = "sha256:04ab7c3db5dcbcdac6ce62e48940d3a0124f377c05499d3f3e004e264ae94b83", size = 5214774, upload-time = "2026-02-20T21:51:31.173Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/d8/920460d4c22ea68fcdeb0b2fb53ea2aeb9c6d7875bde9278d84f2ac767b6/ty-0.0.18-py3-none-linux_armv6l.whl", hash = "sha256:4e5e91b0a79857316ef893c5068afc4b9872f9d257627d9bc8ac4d2715750d88", size = 10280825, upload-time = "2026-02-20T21:51:25.03Z" }, + { url = "https://files.pythonhosted.org/packages/83/56/62587de582d3d20d78fcdddd0594a73822ac5a399a12ef512085eb7a4de6/ty-0.0.18-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ee0e578b3f8416e2d5416da9553b78fd33857868aa1384cb7fefeceee5ff102d", size = 10118324, upload-time = "2026-02-20T21:51:22.27Z" }, + { url = "https://files.pythonhosted.org/packages/2f/2d/dbdace8d432a0755a7417f659bfd5b8a4261938ecbdfd7b42f4c454f5aa9/ty-0.0.18-py3-none-macosx_11_0_arm64.whl", hash = "sha256:3f7a0487d36b939546a91d141f7fc3dbea32fab4982f618d5b04dc9d5b6da21e", size = 9605861, upload-time = "2026-02-20T21:51:16.066Z" }, + { url = "https://files.pythonhosted.org/packages/6b/d9/de11c0280f778d5fc571393aada7fe9b8bc1dd6a738f2e2c45702b8b3150/ty-0.0.18-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5e2fa8d45f57ca487a470e4bf66319c09b561150e98ae2a6b1a97ef04c1a4eb", size = 10092701, upload-time = "2026-02-20T21:51:26.862Z" }, + { url = "https://files.pythonhosted.org/packages/0f/94/068d4d591d791041732171e7b63c37a54494b2e7d28e88d2167eaa9ad875/ty-0.0.18-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:d75652e9e937f7044b1aca16091193e7ef11dac1c7ec952b7fb8292b7ba1f5f2", size = 10109203, upload-time = "2026-02-20T21:51:11.59Z" }, + { url = "https://files.pythonhosted.org/packages/34/e4/526a4aa56dc0ca2569aaa16880a1ab105c3b416dd70e87e25a05688999f3/ty-0.0.18-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:563c868edceb8f6ddd5e91113c17d3676b028f0ed380bdb3829b06d9beb90e58", size = 10614200, upload-time = "2026-02-20T21:51:20.298Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3d/b68ab20a34122a395880922587fbfc3adf090d22e0fb546d4d20fe8c2621/ty-0.0.18-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:502e2a1f948bec563a0454fc25b074bf5cf041744adba8794d024277e151d3b0", size = 11153232, upload-time = "2026-02-20T21:51:14.121Z" }, + { url = "https://files.pythonhosted.org/packages/68/ea/678243c042343fcda7e6af36036c18676c355878dcdcd517639586d2cf9e/ty-0.0.18-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc881dea97021a3aa29134a476937fd8054775c4177d01b94db27fcfb7aab65b", size = 10832934, upload-time = "2026-02-20T21:51:32.92Z" }, + { url = "https://files.pythonhosted.org/packages/d8/bd/7f8d647cef8b7b346c0163230a37e903c7461c7248574840b977045c77df/ty-0.0.18-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:421fcc3bc64cab56f48edb863c7c1c43649ec4d78ff71a1acb5366ad723b6021", size = 10700888, upload-time = "2026-02-20T21:51:09.673Z" }, + { url = "https://files.pythonhosted.org/packages/6e/06/cb3620dc48c5d335ba7876edfef636b2f4498eff4a262ff90033b9e88408/ty-0.0.18-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0fe5038a7136a0e638a2fb1ad06e3d3c4045314c6ba165c9c303b9aeb4623d6c", size = 10078965, upload-time = "2026-02-20T21:51:07.678Z" }, + { url = "https://files.pythonhosted.org/packages/60/27/c77a5a84533fa3b685d592de7b4b108eb1f38851c40fac4e79cc56ec7350/ty-0.0.18-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d123600a52372677613a719bbb780adeb9b68f47fb5f25acb09171de390e0035", 
size = 10134659, upload-time = "2026-02-20T21:51:18.311Z" }, + { url = "https://files.pythonhosted.org/packages/43/6e/60af6b88c73469e628ba5253a296da6984e0aa746206f3034c31f1a04ed1/ty-0.0.18-py3-none-musllinux_1_2_i686.whl", hash = "sha256:bb4bc11d32a1bf96a829bf6b9696545a30a196ac77bbc07cc8d3dfee35e03723", size = 10297494, upload-time = "2026-02-20T21:51:39.631Z" }, + { url = "https://files.pythonhosted.org/packages/33/90/612dc0b68224c723faed6adac2bd3f930a750685db76dfe17e6b9e534a83/ty-0.0.18-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:dda2efbf374ba4cd704053d04e32f2f784e85c2ddc2400006b0f96f5f7e4b667", size = 10791944, upload-time = "2026-02-20T21:51:37.13Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/f4ada0fd08a9e4138fe3fd2bcd3797753593f423f19b1634a814b9b2a401/ty-0.0.18-py3-none-win32.whl", hash = "sha256:c5768607c94977dacddc2f459ace6a11a408a0f57888dd59abb62d28d4fee4f7", size = 9677964, upload-time = "2026-02-20T21:51:42.039Z" }, + { url = "https://files.pythonhosted.org/packages/5e/fa/090ed9746e5c59fc26d8f5f96dc8441825171f1f47752f1778dad690b08b/ty-0.0.18-py3-none-win_amd64.whl", hash = "sha256:b78d0fa1103d36fc2fce92f2092adace52a74654ab7884d54cdaec8eb5016a4d", size = 10636576, upload-time = "2026-02-20T21:51:29.159Z" }, + { url = "https://files.pythonhosted.org/packages/92/4f/5dd60904c8105cda4d0be34d3a446c180933c76b84ae0742e58f02133713/ty-0.0.18-py3-none-win_arm64.whl", hash = "sha256:01770c3c82137c6b216aa3251478f0b197e181054ee92243772de553d3586398", size = 10095449, upload-time = "2026-02-20T21:51:34.914Z" }, ] [[package]]