diff --git a/bot/bot.py b/bot/bot.py index 35dbd1ba4e..c271f57e0e 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -1,19 +1,23 @@ import asyncio import contextlib +import types from sys import exception import aiohttp from discord.errors import Forbidden +from discord.ext import commands from pydis_core import BotBase +from pydis_core.utils import scheduling +from pydis_core.utils._extensions import walk_extensions from pydis_core.utils.error_handling import handle_forbidden_from_block from sentry_sdk import new_scope, start_transaction from bot import constants, exts from bot.log import get_logger +from bot.utils.startup_reporting import StartupFailureReporter log = get_logger("bot") - class StartupError(Exception): """Exception class for startup errors.""" @@ -26,9 +30,13 @@ class Bot(BotBase): """A subclass of `pydis_core.BotBase` that implements bot-specific functions.""" def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + # Track extension load failures and tasks so we can report them after all attempts have completed + self.extension_load_failures: dict[str, BaseException] = {} + self._extension_load_tasks: dict[str, asyncio.Task] = {} + self._startup_failure_reporter = StartupFailureReporter() + async def load_extension(self, name: str, *args, **kwargs) -> None: """Extend D.py's load_extension function to also record sentry performance stats.""" with start_transaction(op="cog-load", name=name): @@ -77,3 +85,53 @@ async def on_error(self, event: str, *args, **kwargs) -> None: scope.set_extra("kwargs", kwargs) log.exception(f"Unhandled exception in {event}.") + + async def add_cog(self, cog: commands.Cog) -> None: + """ + Add a cog to the bot with exception handling. + + Override of `BotBase.add_cog` to capture and log any exceptions raised during cog loading, + including the extension name if available. + """ + extension = cog.__module__ + + try: + await super().add_cog(cog) + log.info(f"Cog successfully loaded: {cog.qualified_name}") + + except BaseException as e: + key = extension or f"(unknown)::{cog.qualified_name}" + self.extension_load_failures[key] = e + + log.exception( + f"Failed during add_cog (extension={extension}, cog={cog.qualified_name})" + ) + # Propagate error + raise + + async def _load_extensions(self, module: types.ModuleType) -> None: + """Load extensions for the bot.""" + await self.wait_until_guild_available() + + self.all_extensions = walk_extensions(module) + + async def _load_one(extension: str) -> None: + try: + await self.load_extension(extension) + log.info(f"Extension successfully loaded: {extension}") + + except BaseException as e: + self.extension_load_failures[extension] = e + log.exception(f"Failed to load extension: {extension}") + raise + + for extension in self.all_extensions: + task = scheduling.create_task(_load_one(extension)) + self._extension_load_tasks[extension] = task + + # Wait for all load tasks to complete so we can report any failures together + await asyncio.gather(*self._extension_load_tasks.values(), return_exceptions=True) + + # Send a Discord message to moderators if any extensions failed to load + if self.extension_load_failures : + await self._startup_failure_reporter.notify(self, self.extension_load_failures) diff --git a/bot/constants.py b/bot/constants.py index 674ae12fcc..c51d0d1cb3 100644 --- a/bot/constants.py +++ b/bot/constants.py @@ -462,6 +462,9 @@ class _URLs(_BaseURLs): connect_max_retries: int = 3 connect_cooldown: int = 5 + # Back-off in cog_load + connect_initial_backoff: int = 1 + site_logs_view: str = "https://pythondiscord.com/staff/bot/logs" diff --git a/bot/exts/filtering/filtering.py b/bot/exts/filtering/filtering.py index 210ae3fb05..719e0ad796 100644 --- a/bot/exts/filtering/filtering.py +++ b/bot/exts/filtering/filtering.py @@ -1,3 +1,4 @@ +import asyncio import datetime import io import json @@ -24,7 +25,7 @@ import bot.exts.filtering._ui.filter as filters_ui from bot import constants from bot.bot import Bot -from bot.constants import BaseURLs, Channels, Guild, MODERATION_ROLES, Roles +from bot.constants import BaseURLs, Channels, Guild, MODERATION_ROLES, Roles, URLs from bot.exts.backend.branding._repository import HEADERS, PARAMS from bot.exts.filtering._filter_context import Event, FilterContext from bot.exts.filtering._filter_lists import FilterList, ListType, ListTypeConverter, filter_list_types @@ -55,6 +56,7 @@ from bot.utils.channel import is_mod_channel from bot.utils.lock import lock_arg from bot.utils.message_cache import MessageCache +from bot.utils.retry import is_retryable_api_error log = get_logger(__name__) @@ -108,7 +110,31 @@ async def cog_load(self) -> None: await self.bot.wait_until_guild_available() log.trace("Loading filtering information from the database.") - raw_filter_lists = await self.bot.api_client.get("bot/filter/filter_lists") + for attempt in range(1, URLs.connect_max_retries + 1): + try: + raw_filter_lists = await self.bot.api_client.get("bot/filter/filter_lists") + break + except Exception as error: + is_retryable = is_retryable_api_error(error) + is_last_attempt = attempt == URLs.connect_max_retries + + if not is_retryable: + raise + + if is_last_attempt: + log.exception("Failed to load filtering data after %d attempts.", URLs.connect_max_retries) + raise + + backoff_seconds = URLs.connect_initial_backoff * (2 ** (attempt - 1)) + log.warning( + "Failed to load filtering data (attempt %d/%d). Retrying in %d second(s): %s", + attempt, + URLs.connect_max_retries, + backoff_seconds, + error + ) + await asyncio.sleep(backoff_seconds) + example_list = None for raw_filter_list in raw_filter_lists: loaded_list = self._load_raw_filter_list(raw_filter_list) diff --git a/bot/exts/info/python_news.py b/bot/exts/info/python_news.py index c786a9d192..437e44cd38 100644 --- a/bot/exts/info/python_news.py +++ b/bot/exts/info/python_news.py @@ -1,3 +1,4 @@ +import asyncio import re import typing as t from datetime import UTC, datetime, timedelta @@ -12,7 +13,9 @@ from bot import constants from bot.bot import Bot +from bot.constants import URLs from bot.log import get_logger +from bot.utils.retry import is_retryable_api_error from bot.utils.webhooks import send_webhook PEPS_RSS_URL = "https://peps.python.org/peps.rss" @@ -46,19 +49,45 @@ def __init__(self, bot: Bot): async def cog_load(self) -> None: """Load all existing seen items from db and create any missing mailing lists.""" - with sentry_sdk.start_span(description="Fetch mailing lists from site"): - response = await self.bot.api_client.get("bot/mailing-lists") - - for mailing_list in response: - self.seen_items[mailing_list["name"]] = set(mailing_list["seen_items"]) - - with sentry_sdk.start_span(description="Update site with new mailing lists"): - for mailing_list in ("pep", *constants.PythonNews.mail_lists): - if mailing_list not in self.seen_items: - await self.bot.api_client.post("bot/mailing-lists", json={"name": mailing_list}) - self.seen_items[mailing_list] = set() - - self.fetch_new_media.start() + for attempt in range(1, URLs.connect_max_retries + 1): + try: + with sentry_sdk.start_span(description="Fetch mailing lists from site"): + response = await self.bot.api_client.get("bot/mailing-lists") + + # Rebuild state on each successful fetch (avoid partial state across retries) + self.seen_items = {} + for mailing_list in response: + self.seen_items[mailing_list["name"]] = set(mailing_list["seen_items"]) + + with sentry_sdk.start_span(description="Update site with new mailing lists"): + for mailing_list in ("pep", *constants.PythonNews.mail_lists): + if mailing_list not in self.seen_items: + await self.bot.api_client.post("bot/mailing-lists", json={"name": mailing_list}) + self.seen_items[mailing_list] = set() + + self.fetch_new_media.start() + return + + except Exception as error: + if not is_retryable_api_error(error): + raise + + if attempt == URLs.connect_max_retries: + log.exception( + "Failed to load PythonNews mailing lists after %d attempt(s).", + URLs.connect_max_retries, + ) + raise + + backoff_seconds = URLs.connect_initial_backoff * (2 ** (attempt - 1)) + log.warning( + "Failed to load PythonNews mailing lists (attempt %d/%d). Retrying in %d second(s). Error: %s", + attempt, + URLs.connect_max_retries, + backoff_seconds, + error, + ) + await asyncio.sleep(backoff_seconds) async def cog_unload(self) -> None: """Stop news posting tasks on cog unload.""" diff --git a/bot/exts/moderation/infraction/superstarify.py b/bot/exts/moderation/infraction/superstarify.py index 006334755d..01481d1f68 100644 --- a/bot/exts/moderation/infraction/superstarify.py +++ b/bot/exts/moderation/infraction/superstarify.py @@ -1,3 +1,4 @@ +import asyncio import json import random import textwrap @@ -10,6 +11,7 @@ from bot import constants from bot.bot import Bot +from bot.constants import URLs from bot.converters import Duration, DurationOrExpiry from bot.decorators import ensure_future_timestamp from bot.exts.moderation.infraction import _utils @@ -17,6 +19,7 @@ from bot.log import get_logger from bot.utils import time from bot.utils.messages import format_user +from bot.utils.retry import is_retryable_api_error log = get_logger(__name__) NICKNAME_POLICY_URL = "https://pythondiscord.com/pages/rules/#nickname-policy" @@ -43,9 +46,7 @@ async def on_member_update(self, before: Member, after: Member) -> None: f"{after.display_name}. Checking if the user is in superstar-prison..." ) - active_superstarifies = await self.bot.api_client.get( - "bot/infractions", - params={ + active_superstarifies = await self._fetch_with_retries(params={ "active": "true", "type": "superstar", "user__id": str(before.id) @@ -84,9 +85,7 @@ async def on_member_update(self, before: Member, after: Member) -> None: @Cog.listener() async def on_member_join(self, member: Member) -> None: """Reapply active superstar infractions for returning members.""" - active_superstarifies = await self.bot.api_client.get( - "bot/infractions", - params={ + active_superstarifies = await self._fetch_with_retries(params={ "active": "true", "type": "superstar", "user__id": member.id @@ -238,6 +237,22 @@ async def cog_check(self, ctx: Context) -> bool: """Only allow moderators to invoke the commands in this cog.""" return await has_any_role(*constants.MODERATION_ROLES).predicate(ctx) + async def _fetch_with_retries(self, + retries: int = URLs.connect_max_retries, + params: dict[str, str] | None = None) -> list[dict]: + """Fetch infractions from the API with retries and exponential backoff.""" + if retries < 1: + raise ValueError("retries must be at least 1") + + for attempt in range(1, retries + 1): + try: + return await self.bot.api_client.get("bot/infractions", params=params) + except Exception as e: + if attempt == retries or not is_retryable_api_error(e): + raise + await asyncio.sleep(URLs.connect_initial_backoff * (2 ** (attempt - 1))) + return None + async def setup(bot: Bot) -> None: """Load the Superstarify cog.""" diff --git a/bot/exts/utils/reminders.py b/bot/exts/utils/reminders.py index 1b386ec000..e116dcf2ae 100644 --- a/bot/exts/utils/reminders.py +++ b/bot/exts/utils/reminders.py @@ -1,3 +1,4 @@ +import asyncio import random import textwrap import typing as t @@ -23,6 +24,7 @@ POSITIVE_REPLIES, Roles, STAFF_AND_COMMUNITY_ROLES, + URLs, ) from bot.converters import Duration, UnambiguousUser from bot.errors import LockedResourceError @@ -224,13 +226,25 @@ async def cog_unload(self) -> None: async def cog_load(self) -> None: """Get all current reminders from the API and reschedule them.""" await self.bot.wait_until_guild_available() - response = await self.bot.api_client.get( - "bot/reminders", - params={"active": "true"} - ) - + # retry fetching reminders with exponential backoff + for attempt in range(1, URLs.connect_max_retries + 1): + try: + # response either throws, or is a list of reminders (possibly empty) + response = await self.bot.api_client.get( + "bot/reminders", + params={"active": "true"} + ) + break + except Exception as e: + if not self._check_error_is_retriable(e): + log.error(f"Failed to load reminders due to non-retryable error: {e}") + raise + log.warning(f"Attempt {attempt} - Failed to fetch reminders from the API: {e}") + if attempt == URLs.connect_max_retries: + log.error("Max retry attempts reached. Failed to load reminders.") + raise + await asyncio.sleep(URLs.connect_initial_backoff * (2 ** (attempt - 1))) # Exponential backoff now = datetime.now(UTC) - for reminder in response: is_valid, *_ = self.ensure_valid_reminder(reminder) if not is_valid: @@ -244,6 +258,13 @@ async def cog_load(self) -> None: else: self.schedule_reminder(reminder) + def _check_error_is_retriable(self, error: Exception) -> bool: + """Return whether loading filter lists failed due to some temporary error, thus retrying could help.""" + if isinstance(error, ResponseCodeError): + return error.status in (408, 429) or error.status >= 500 + + return isinstance(error, (TimeoutError, OSError)) + def ensure_valid_reminder(self, reminder: dict) -> tuple[bool, discord.TextChannel]: """Ensure reminder channel can be fetched otherwise delete the reminder.""" channel = self.bot.get_channel(reminder["channel_id"]) diff --git a/bot/utils/retry.py b/bot/utils/retry.py new file mode 100644 index 0000000000..342897f381 --- /dev/null +++ b/bot/utils/retry.py @@ -0,0 +1,9 @@ +from pydis_core.site_api import ResponseCodeError + + +def is_retryable_api_error(error: Exception) -> bool: + """Return whether an API error is temporary and worth retrying.""" + if isinstance(error, ResponseCodeError): + return error.status in (408, 429) or error.status >= 500 + + return isinstance(error, (TimeoutError, OSError)) diff --git a/bot/utils/startup_reporting.py b/bot/utils/startup_reporting.py new file mode 100644 index 0000000000..f7713d1ca4 --- /dev/null +++ b/bot/utils/startup_reporting.py @@ -0,0 +1,55 @@ +from collections.abc import Mapping +from typing import TYPE_CHECKING + +import discord + +from bot.constants import Channels, Icons +from bot.log import get_logger + +log = get_logger(__name__) + +if TYPE_CHECKING: + from bot.bot import Bot + +class StartupFailureReporter: + """Formats and sends one aggregated startup failure alert to moderators.""" + + async def notify(self, bot: Bot, failures: Mapping[str, BaseException], channel_id: int = Channels.mod_log) -> None: + """Notify moderators of startup failures.""" + if not failures: + return + + if bot.get_channel(channel_id) is None: + # Can't send a message if the channel doesn't exist, so log instead + log.warning("Failed to send startup failure report: mod_log channel not found.") + return + + try: + # Local import avoids circular dependency + from bot.utils.modlog import send_log_message + + text = self.render(failures) + + await send_log_message( + bot, + icon_url=Icons.token_removed, + colour=discord.Colour.red(), + title="Startup: Some extensions failed to load", + text=text, + ping_everyone=True, + channel_id=channel_id + ) + except Exception as exception: + log.exception(f"Failed to send startup failure report: {exception}") + + def render(self, failures: Mapping[str, BaseException]) -> str: + """Render a human-readable message from the given failures.""" + failure_keys = sorted(failures.keys()) + + lines = [] + lines.append("The following extension(s) failed to load:") + for failure_key in failure_keys: + exception = failures[failure_key] + lines.append(f"- **{failure_key}** - `{type(exception).__name__}: {exception}`") + + return "\n".join(lines) diff --git a/figures/Pybot_UML.svg b/figures/Pybot_UML.svg new file mode 100644 index 0000000000..bbe38a1f88 --- /dev/null +++ b/figures/Pybot_UML.svg @@ -0,0 +1,3 @@ + + +
Bot
Bot
-super()+load_extensions()+setup_hook()+add_cog()-_load_extensions()
BotBase
BotBase
-api_client: APIClient+api_client()
Filtering
Filtering
-bot : Bot       -filter_lists : dict[str, FilterList]-webhook: discord.Webhook+cog_load()-_fetch_or_generate_filtering_webhook()+on_message()-_retryable_filter_load_error()
APIClient
APIClient
-session: ClientSession-loop: AbstractEventLoop+get()
Edit cog_load() to retry fetching filter list if failed
Edit cog_load() to retry fetching filter list if failed
Added _retryable_filter_load_error()
Added _retryable_filter_load_error()
PythonNews
PythonNews
-bot: Bot+cog_load()-_retryable_site_loaded_error()
Edit cog_load() to retry fetching mailing list if failed
Edit cog_load() to retry fetching mailing list if failed
Added _retryable_site_loaded_error()
Added _retryable_site_loaded_error()
Reminders
Reminders
-bot: Bot-scheduler: Scheduler+cog_load()-_check_error_is_retriable()
Edit cog_load() to retry fetching reminders if failed
Edit cog_load() to retry fetching reminders if failed
Requires
Requires
Bot
Bot
Added _check_error_is_retriable()
Added _check_error_is_retriable()
Superstarify
Superstarify
-bot: Bot+on_member_update()+on_member_join()-_fetch_with_retries()-_check_error_is_retriable()
Edit on_member_update() and on_member_join() to call on _fetch_with_retries to retry fetching infractions if failed
Edit on_member_update() and on_member_join() to call on _fetch_with_retries to retry fetching infractions if failed
Added _fetch_with_retries() and _check_error_is_retriable()
Added _fetch_with_retries() and _check_error_is_retriable()
Added add_cog: Override of `BotBase.add_cog` to capture and log any exceptions raised during cog loading, including the extension name if available.centralize
Added add_cog: Override of `BotBase.add_cog` to capture and log any exceptions raised during cog loading, including the extension name if available.centralize
StartupFailureReporter
StartupFailureReporter
+notify()+render()
  • Formats a standardized startup failure message

  • Sends a single aggregated alert to mod-log

  • Does not crash if the log channel is unavailable
  • Formats a standardized startup failure messageSends a single aggregated alert to mod-logDoes not crash if the log channel is unavailable
    Bot
    Bot
    Requires
    Requires
    Requires
    Requires
    APIClient
    APIClient
    Requires
    Requires
    Bot
    Bot
    diff --git a/figures/flowchart.svg b/figures/flowchart.svg new file mode 100644 index 0000000000..1ba93cad81 --- /dev/null +++ b/figures/flowchart.svg @@ -0,0 +1,3 @@ + + +
    Notify moderators of any errors
    Bot object created
    Event loop starts
    setup_hook() runs
    load_extension() is called - extensions are run in parallell
    setup() for filters extension runs
    cog_add()
    setup() for reminders extension runs
    setup() for superstarify 
    extension runs
    setup() for pythonNews extension runs
    cog_setup()
    cog_add()
    cog_setup()
    cog_add()
    cog_setup()
    cog_add()
    cog_setup()
    All exceptions are gathered and propagated
    After all exceptions have been gathered
    retry setup if failure up to 3 times
    retry setup if failure up to 3 times
    retry setup if failure up to 3 times
    retry setup if failure up to 3 times
    diff --git a/report.md b/report.md new file mode 100644 index 0000000000..b002154a04 --- /dev/null +++ b/report.md @@ -0,0 +1,308 @@ +# Report for assignment 4 + +## Table of Contents +- [Report for assignment 4](#report-for-assignment-4) + - [Table of Contents](#table-of-contents) + - [Project](#project) + - [Architecture and Purpose](#architecture-and-purpose) + - [Onboarding experience](#onboarding-experience) + - [Did you choose a new project or continue on the previous one?](#did-you-choose-a-new-project-or-continue-on-the-previous-one) + - [If you changed the project, how did your experience differ from before?](#if-you-changed-the-project-how-did-your-experience-differ-from-before) + - [Setting up the project](#setting-up-the-project) + - [Team Members](#team-members) + - [Effort spent](#effort-spent) + - [Dependencies and setup tasks:](#dependencies-and-setup-tasks) + - [Overview of issue(s) and work done.](#overview-of-issues-and-work-done) + - [Requirements for the new feature or requirements affected by functionality being refactored](#requirements-for-the-new-feature-or-requirements-affected-by-functionality-being-refactored) + - [FR-1) Resilient Cog Initialization](#fr-1-resilient-cog-initialization) + - [FR-2) Retry Mechanism for External HTTP calls](#fr-2-retry-mechanism-for-external-http-calls) + - [FR-3) Error logging and monitoring](#fr-3-error-logging-and-monitoring) + - [FR-4) Moderator alert upon failure](#fr-4-moderator-alert-upon-failure) + - [Code changes](#code-changes) + - [Patch](#patch) + - [Test results](#test-results) + - [Output Logs:](#output-logs) + - [UML class diagram and its description](#uml-class-diagram-and-its-description) + - [Key changes/classes affected](#key-changesclasses-affected) + - [Design patterns](#design-patterns) + - [Benefits, drawbacks, and limitations (SEMAT kernel)](#benefits-drawbacks-and-limitations-semat-kernel) + - [Overall experience](#overall-experience) + - [What are your main take-aways from this project? What did you learn?](#what-are-your-main-take-aways-from-this-project-what-did-you-learn) + - [How did you grow as a team, using the Essence standard to evaluate yourself?](#how-did-you-grow-as-a-team-using-the-essence-standard-to-evaluate-yourself) + - [How would you put your work in context with best software engineering practice?](#how-would-you-put-your-work-in-context-with-best-software-engineering-practice) + - [Is there something special you want to mention here?](#is-there-something-special-you-want-to-mention-here) + +## Project + +Name: Python Utility Bot + +URL: [https://github.com/python-discord/bot](https://github.com/python-discord/bot) + +A discord bot designed specifically for use with the [Python discord](https://www.pythondiscord.com/) server. +It is built with an extensible cog-based architecture, integrating numerous functionalities, such as moderation, community management, reminders, and many more. + +## Architecture and Purpose +The project consists of a Discord bot that provides a wide range of features to support the Python Discord community, such as moderation, filtering, community management, reminders, and much more. The bot is designed to be modular and extensible, with a focus on maintainability. + +At its core, the bot operates on an asynchronous event-driven architecture, utilizing Python's `asyncio` library to handle concurrent operations efficiently. The bot's functionality is organized into "cogs", which are modular components that can be loaded, unloaded, and reloaded independently. This design allows for easy maintenance and scalability, as new features can be added without affecting existing functionality. The bot interacts with the Discord API to respond to user commands, manage server events, and integrate with external services. It also incorporates error handling and logging mechanisms using Sentry to ensure reliability and facilitate debugging. + +However, there is no simple way for Discord moderators to be notified if a cog fails to load during startup, which can lead to functionality being unavailable without any indication of the underlying issue. This is particularly problematic for cogs that depend on external services, as they may silently fail to initialize if those services are unavailable, and moderators would not be aware of the failure unless they have direct access to the Sentry logs. + +This issue is mainly related to a few functions in the main `bot.py` file, which is responsible for loading extensions and cogs during startup, as well as the setup functions in each extension. Figure 1 illustrates the flow of the bot's startup process, highlighting where cog initialization occurs and which exceptions are captured and reported. The diagram also indicates the points at which moderator alerting is integrated in our implementation. + +![flowchart.svg](figures/flowchart.svg) + + +## Onboarding experience + +### Did you choose a new project or continue on the previous one? + +We chose a new project, mainly as it was difficult to find an existing issue which would meet all the requirements set by the assignment. + +### If you changed the project, how did your experience differ from before? + +The project is much more complex than the one we chose for assignment 3, which became evident in the amount of time needed with setting up the project environnment, and understanding the codebase. + +### Setting up the project + +The project setup was documented extremely well in the project's [Contributing guide](https://www.pythondiscord.com/pages/guides/pydis-guides/contributing/bot/).\ +Installing the dependencies was straightforward using the `uv` package manager. +Most of the time was likely spent downloading and setting up Docker, particularly for those with no prior experience using it. + +In addition to installing dependencies, the project required setting up both the test server and the actual bot and interconnecting them. +This process was also very well documented, and the project even provided a base template for the Discord server, resulting in a very quick setup. + +The project documentation also explained how to run the tests, providing a README file containing all the necessary commands. +It included an introduction to writing new tests, along with a brief overview of how mocking is used, which provided some initial insight. + +However, the documentation did not include an introduction to the actual codebase. +We spent a significant amount of time trying to understand how the project is structured and how different classes interact with one another. +Because the project is tightly integrated with Discord servers, this made it even more challenging, as some functions are triggered exclusively by the Discord API. For example: It was not possible for us to pass `Status Embed` workflow during continuous integration, as the discord channel id was hard-coded directly in the workflow. +We believe that adding concrete examples or a high-level architectural diagram would be highly beneficial for newcomers. + +## Team Members + +| Name | GitHub | +| --- | --- | +| Apeel | [rippyboii](https://github.com/rippyboii) | +| Josef | [kahoujo1](https://github.com/kahoujo1) | +| Alexander | [a-runebou](https://github.com/a-runebou) | +| Carl | [carlisaksson](https://github.com/carlisaksson) | +| Fabian | [crancker96](https://github.com/crancker96) | + +## Effort spent + +Estimated effort per team member, in hours: + +| Team member | Plenary discussions / Group meetings | Reading documentation | Configuration and setup | Analyzing code / output | Writing documentation | Writing code | Running code / tests | Total | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| Apeel | 8 | 3 | 3 | 6 | 3 | 4 | 1 | ~28 | +| Josef | 8 | 2 | 1 | 6 | 4 | 3| 2 | ~26 | +| Alexander | 8 | 2 | 1 | 5 | 2 | 5 | 2 | ~25 | +| Carl | 8 | 2 | 3 | 5 | 2 | 4 | 1 | ~25 | +| Fabian | 8 | 3 | 4 | 5 | 3 | 3 | 1 | ~27 | +| Total | 8 | 12 | 12 | 27 | 14 | 19 | 7 | 99 | + +### Dependencies and setup tasks: + +| Dependency / tool / setup task | Team member(s) | Time spent | Notes | +| --- | --- | --- | --- | +| Docker | All | 1 | Everyone had locally setup the docker to run the project | +| `uv` and Python environment setup | All | 1 | The given documentation had easy guidelines to setup | +| Discord test server / bot configuration | All | 1 | We all have our own test server and individual bots | + + +## Overview of issue(s) and work done. + +Title: Handling of site connection issues during outage. (#2918) + +URL: [https://github.com/python-discord/bot/issues/2918](https://github.com/python-discord/bot/issues/2918) + +Since some cogs depend on external services (external sites), their initialization fails if those services are unavailable during startup, rendering their functionality inaccessible. +This failure occurs silently, without any indication to moderators. + +Scope (functionality and code affected). + +**Functionality affected** +- Startup behavior of cogs depending on external HTTP services. +- Error handling, error propagation. +- Retry logic with back-off. +- Logging and allerting of moderators. + +**Code affected** +- `cog_load()` implementations in affected cogs. +- Sentry reporting during individual retries and final error. +- Discord message API interaction to alert moderators. +- Associated unit tests covering cog initialization. +- Extension loading failure handling in `bot.py` +## Requirements for the new feature or requirements affected by functionality being refactored +### FR-1) Resilient Cog Initialization +Cogs that depend on external HTTP services shall handle connection errors and HTTP failures during `cog_load()` without failing silently. +If the external service is unavailable, the cog must not terminate initialization without reporting the failure. +Identified cogs pertaining to this problem are: +- `bot/exts/filtering/filtering.py` +- `bot/exts/utils/reminders.py` +- `bot/exts/info/python_news.py` +- `bot/exts/moderation/infraction/superstarify.py` + +### FR-2) Retry Mechanism for External HTTP calls +If a cog fails to initialize due to a retriable HTTP error or network-related exception, the system shall automatically retry the initialization a finite number of times before giving up. +The retry attempts shall use exponential backoff to avoid rapid repeated failures. + +Should a cog failed to load at first due to external site error, the cog shall firstly wait for a predefined time, and then try to load again. +If the loading is finished successfully, the setup is completed. +If the error remains, the cog shall wait for a time INITIAL_TIME^(2*(i)), where i is the number of retries, and then try again. +This is repeated a total number of MAX_RETRIES, after which the setup has to finish, either successfully if the last try did not result in an error, or by throwing an Exception. + +**Tested by:** +- `tests/bot/exts/filtering/test_filtering_cog.py::` + - `test_cog_load_retries_then_succeeds` + - `test_retries_three_times_fails_and_reraises` +- `tests/bot/exts/utils/test_reminders.py::` + - `test_reminders_cog_load_retries_after_initial_exception` + - `test_reminders_cog_load_fails_after_max_retries` +- `tests/bot/exts/info/test_python_news.py::` + - `test_cog_load_retries_then_succeeds` + - `test_retries_max_times_fails_and_reraises` +- `tests/bot/exts/moderation/infraction/test_superstarify_cog.py::` + - `test_fetch_retries_then_succeeds` + - `test_fetch_fails_after_max_retries` + +The tests are structured in the following way: +- The API calls are mocked, allowing for successful and unsuccessful HTTP calls. +- each tests calls `await cog_load()` to start the setup +- If the first call is successful, the cog shall load without throwing an Exception +- If the first call fails, but the second one is successful, the cog shall load without throwing an Exception. Moreover, the `sleep` function shall be called once. +- If all the calls fail, the function shall return an Exception. The `sleep` function shall be called a total of `MAX_TRIES-1` times. +### FR-3) Error logging and monitoring +All initialization failures shall be logged through the existing logging infrastructure and reported to Sentry. + +If a cog fails to load during setup (f.e. it throws an Exception), the error shall be observed and notified in the Sentry logging output. +The logging shall be comprised of warnings after each retry, specyfing the affected cog and the number of retris. +Should all retry attempts fail, a description of the error nature and a name of the failed cog shall be logged in the form of an Error. + +**Tested by:** +- simulating Exception in the affected cogs (adding temporary `raise Exception` in the `cog_load()` function) +- calling `await cog_load()` +- observing the Sentry output + - After each failure, a warning is logged informing the maintainers about a temporary failure and the number of retries + - After the final retry, the full error description is logged warning the maintainers about the cog load failure. + +### FR-4) Moderator alert upon failure +The system shall keep a list of all extensions and cogs that fail to load. +A cog is added to this list if and only if it exhausts all retry attempts. +Once all the cogs and extensions are loaded, the system shall send a singular message on the `mod-log` Discord channel. +The message shall contain the information about all cogs which failed on startup, as well as a short description of the error nature. +All moderators shall be pinged by the message in order to make sure they are alerted. + +**Tested by:** +- `tests/bot/exts/test_extensions.py` +- simulating Exception in the affected cogs (adding temporary `raise Exception` in the `cog_load()` function) +- calling `await cog_load()` +- observing the `mod-log` channel: + - Once the setup finishes, a single message is send to the channel + - The message pings all the moderators, and specifies which cogs failed to load and why. + +## Code changes + +### Patch + +The patch can be compared with the `documentation` branch. `documentation` branch only contains report and no changes in the code-base. + +```bash +git diff main documentation +``` + +The patch is clean, as it only contains changes related to handling site connection issues during cog initialization, moderator alerting, and the associated tests/documentation. + +The changes were considered for acceptance. All relevant automated checks passed except `Status Embed`. This workflow could not be passed in our local setup because it depends on a hard-coded Discord server channel ID from the official environment, and our test bots do not have permission to send messages to that channel. + + +## Test results + +Before refactoring, cogs depending on unavailable external services could fail during initialization without notifying moderators. Relevant tests for the new retry/alert behavior were not present. + +After refactoring, the implemented tests passed when run locally with: + +```bash +uv run task test +``` + +### Output Logs: +- Before Implementation: [Before Output Log](https://github.com/rippyboii/python-bot/issues/21#issuecomment-3977441009) +- After Impementation: [After Output Log](https://github.com/rippyboii/python-bot/issues/21#issuecomment-3977445773) + +## UML class diagram and its description + +The UML class diagram shows the classes modified/added to resolve the given issue as well as how the classes are related to each other. In the UML there are notes adjacent to classes describing how they were augmented to resolve the issue. + +![Pybot_UML.svg](figures/Pybot_UML.svg) + +### Key changes/classes affected + +Optional (point 1): Architectural overview. + +Optional (point 2): relation to design pattern(s). + + +## Design patterns + +The bot follows an *asynchronous event-driven* architecture built on top of asyncio, where the event loop acts as a *Reactor* and the cogs function as *Observers* reacting to events dispatched by Discord. Our update extends this architectural model into the startup phase by making the main bot explicitly await the loading of extensions and cogs. Instead of treating initialization as loosely coordinated asynchronous tasks, startup is now handled as a controlled and deterministic async workflow. + +By centrally awaiting extension loading and registering failures, we effectively introduced a structured lifecycle orchestration mechanism. This resembles the *Template Method pattern*; the bot defines the high-level startup algorithm, while individual extensions provide their specific setup behavior. The core now governs execution order, error propagation, and completion, strengthening architectural cohesion and making startup behavior explicit rather than implicit. + +The introduction of a centralized failure registry and moderator notification system addresses a reliability concern. Previously, most extension failures were handled locally and silently. Now, error notification is abstracted into one central mechanism which ensures consistent handling and visibility. This can be seen as a *refactoring towards separation of concerns* and consolidation of duplicated logic, improving observability and maintainability without altering the *modular extension design*. + +Adding retry logic for critical extensions introduces a resilience pattern into the architecture. Instead of failing permanently, important components now implement retry behavior, aligning with *fault-tolerant design principles*. This change elevates the system from a *fail-fast* startup model to a selectively resilient one, while preserving the modular cog-based structure. + +Overall, the update strengthens the architectural maturity of the system. It centralizes lifecycle control, improves separation of responsibilities between the core and extensions, and introduces structured error handling and resilience patterns, all while remaining consistent with the existing asynchronous event-driven and modular design principles. + +## Benefits, drawbacks, and limitations (SEMAT kernel) +The primary opportunity addressed by this issue concerns operational reliability. +Previously, cogs that depended on external services would fail silently if those services were unavailable during startup. +Since moderators were not alerted to such failures, functionality could become inaccessible to users without explanation. +The identified opportunity was therefore to improve the robustness of cog initialization and introduce explicit alerting mechanisms so that moderators could take corrective action. +With the implemented changes, this opportunity has moved from identified to addressed, as the Software System has been updated to handle such failures explicitly. + +We identified unhandled exceptions in the affected cogs, introduced structured error handling, and implemented a retry-with-exponential-backoff pattern, significantly improving fault tolerance and resilience during startup. +Moderator alerting is handled centrally: a single consolidated message is sent containing all cogs or extensions that failed to load. +This ensures consistent error reporting while avoiding excessive notification noise. +However, the retry mechanism increases the complexity of the startup logic and prolongs initialization time, as the bot completes startup only after all retry attempts have concluded. +This introduces a clear tradeoff between startup latency and system reliability. + +From a Requirements perspective, we transformed the previously implicit requirement *cogs should load* into a set of explicit non-functional requirements. +In particular: *cogs should tolerate temporary external service outages*, *failures during cog loading must be observable*, and *startup must not fail silently*. +Additionally, *all startup failures must be reported to moderators*. +These refinements align strongly with reliability and observability as non-functional requirements and make system expectations clearer and verifiable. + +The primary stakeholders include moderators/maintainers and server users. With the implemented changes, moderators receive explicit failure notifications, enabling further action to resolve the issues. +Observability is further enhanced through Sentry logging, which records retry attempts and associated error details. +Consequently, users experience fewer unexplained missing features, improving transparency and overall service reliability. + +## Overall experience + +### What are your main take-aways from this project? What did you learn? + +Working on a mature open-source system differs greatly from working on a smaller course project, which is the primary lesson we learned from this project. Even though the setup and contribution guides are well written and provided, understanding the architecture and the interactions between indivdual components and classes takes a significant amount of time. + +Technically, we gained more knowledge about cog-based bot architectures, asynchronous Python systems, and how dependencies on outside services can impact startup behavior. One important lesson was that initialization failures shouldn't go unnoticed. Making the system more reliable and user-friendly requires retrying with exponential backoff, clearly documenting errors, and alerting moderators. + + +### How did you grow as a team, using the Essence standard to evaluate yourself? +We believe we are still in the Collaborating state, however much closer to the Performing state than before. +Throughout the course, we have gradually improved our communication and responsiveness, which has resulted in more active code reviews, frequent discussions about current obstacles and problems, and more structured meeting plans. +The coordination also improved, as we experienced smoother task distribution and better fulfillment of commitments. +However, other courses and external responsibilities still affect our consistency, and the work is therefore fulfilled with varying levels of consistency. + + +### How would you put your work in context with best software engineering practice? + +Our work can be placed in the context of best software engineering practice. The issue we had taken was not only about adding functionality, but about making failure handling more compact in a production-like system that depends on external services. By improving retry behavior, logging, and moderator alerting, we worked toward better observability and fault tolerance, which are important principles in modern software engineering. + +Another important aspect of our team was the communication. Throughout the work, we kept the orignal maintainers informed in the issue channel about our understanding of the problem and the our plan of action. This helped to ensure that our approach matched the project expectations and reduced the risk of introducing unintended behavior or potential vulnerabilities. We also communicated directly with the original developers and maintainers of the bot in their official Discord server, which gave us useful clarification and feedback. + +Lastly, we provided testing to support the work. We documented known limitations in the test environment, tried to keep the change set restricted to the issue being addressed, and, when feasible, used automated checks to confirm the outcome. + +### Is there something special you want to mention here? + +It's important to note that while the project's setup documentation was excellent, the codebase's architectural overview was lacking. Understanding how the various components, services, and Discord-specific flows work together was more challenging for new contributors than installing or managing the project. We think that future contributors' onboarding experience would be greatly enhanced by including a higher-level architecture overview. diff --git a/tests/bot/exts/filtering/test_filtering_cog.py b/tests/bot/exts/filtering/test_filtering_cog.py new file mode 100644 index 0000000000..736c4bf1aa --- /dev/null +++ b/tests/bot/exts/filtering/test_filtering_cog.py @@ -0,0 +1,77 @@ +import unittest +from unittest.mock import AsyncMock, MagicMock, patch + +from bot.exts.filtering.filtering import Filtering + + +class FilteringCogLoadTests(unittest.IsolatedAsyncioTestCase): + """Test startup behavior of the Filtering cog (`cog_load`).""" + + def setUp(self) -> None: + """Set up a Filtering cog with a mocked bot and stubbed startup dependencies.""" + self.bot = MagicMock() + self.bot.wait_until_guild_available = AsyncMock() + + self.bot.api_client = MagicMock() + self.bot.api_client.get = AsyncMock() + + self.cog = Filtering(self.bot) + + # Stub internals that are not relevant to this unit test. + self.cog.collect_loaded_types = MagicMock() + self.cog.schedule_offending_messages_deletion = AsyncMock() + self.cog._fetch_or_generate_filtering_webhook = AsyncMock(return_value=MagicMock()) + + # `weekly_auto_infraction_report_task` is a discord task loop; patch its start method. + self.start_patcher = patch.object(self.cog.weekly_auto_infraction_report_task, "start") + self.mock_weekly_task_start = self.start_patcher.start() + self.addCleanup(self.start_patcher.stop) + + async def test_cog_load_retries_then_succeeds(self): + """`cog_load` should retry temporary failures and complete startup after a successful fetch.""" + self.bot.api_client.get.side_effect = [ + OSError("temporary outage"), + TimeoutError("temporary timeout"), + [], + ] + + with patch("bot.exts.filtering.filtering.asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + await self.cog.cog_load() + + self.bot.wait_until_guild_available.assert_awaited_once() + self.assertEqual(self.bot.api_client.get.await_count, 3) + self.bot.api_client.get.assert_awaited_with("bot/filter/filter_lists") + self.assertEqual(mock_sleep.await_count, 2) + self.cog._fetch_or_generate_filtering_webhook.assert_awaited_once() + self.cog.collect_loaded_types.assert_called_once_with(None) + self.cog.schedule_offending_messages_deletion.assert_awaited_once() + self.mock_weekly_task_start.assert_called_once() + + async def test_retries_three_times_fails_and_reraises(self): + """`cog_load` should retry and re-raise when all retry attempts fail.""" + self.bot.api_client.get.side_effect = OSError( + "Simulated site/API outage during cog_load" + ) + + with patch( + "bot.exts.filtering.filtering.asyncio.sleep", + new_callable=AsyncMock, + ) as mock_sleep, self.assertRaises(OSError) as ctx: + await self.cog.cog_load() + + self.assertIs(ctx.exception, self.bot.api_client.get.side_effect) + + # Waited for guild availability + self.bot.wait_until_guild_available.assert_awaited_once() + + # 3 attempts + self.assertEqual(self.bot.api_client.get.await_count, 3) + self.bot.api_client.get.assert_awaited_with("bot/filter/filter_lists") + + # Backoff between attempts (attempts - 1) + self.assertEqual(mock_sleep.await_count, 2) + + # Startup should stop before later steps. + self.cog._fetch_or_generate_filtering_webhook.assert_not_awaited() + self.cog.schedule_offending_messages_deletion.assert_not_awaited() + self.mock_weekly_task_start.assert_not_called() diff --git a/tests/bot/exts/info/test_python_news.py b/tests/bot/exts/info/test_python_news.py new file mode 100644 index 0000000000..75c9e386aa --- /dev/null +++ b/tests/bot/exts/info/test_python_news.py @@ -0,0 +1,100 @@ +import unittest +from unittest.mock import AsyncMock, MagicMock, patch + +from pydis_core.site_api import ResponseCodeError + +from bot.constants import URLs +from bot.exts.info.python_news import PythonNews + + +class PythonNewsCogLoadTests(unittest.IsolatedAsyncioTestCase): + """Test startup behavior of the PythonNews cog (`cog_load`).""" + + def setUp(self) -> None: + """Set up a PythonNews cog with a mocked bot and stubbed startup dependencies.""" + self.bot = MagicMock() + self.bot.wait_until_guild_available = AsyncMock() + + self.bot.api_client = MagicMock() + self.bot.api_client.get = AsyncMock() + self.bot.api_client.post = AsyncMock() + + # Required by `fetch_new_media` later, but not used in these tests. + self.bot.http_session = MagicMock() + + self.cog = PythonNews(self.bot) + + # Stub out task-loop start, so it doesn't actually schedule anything. + self.start_patcher = patch.object(self.cog.fetch_new_media, "start") + self.mock_fetch_new_media_start = self.start_patcher.start() + self.addCleanup(self.start_patcher.stop) + + async def test_cog_load_retries_then_succeeds(self): + """`cog_load` should retry temporary failures and complete startup after a successful fetch.""" + # First two attempts fail with retryable errors, third succeeds. + self.bot.api_client.get.side_effect = [ + OSError("temporary outage"), + TimeoutError("temporary timeout"), + [ + {"name": "pep", "seen_items": ["1", "2"]}, + ], + ] + + # Ensure no missing mailing lists need creating in this test. + with ( + patch("bot.exts.info.python_news.constants.PythonNews.mail_lists", new=()), + patch("bot.exts.info.python_news.asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + ): + await self.cog.cog_load() + + self.assertEqual(self.bot.api_client.get.await_count, 3) + self.bot.api_client.get.assert_awaited_with("bot/mailing-lists") + + # Sleep should have been awaited for the two failed attempts. + self.assertEqual(mock_sleep.await_count, 2) + + # Task should start after successful load. + self.mock_fetch_new_media_start.assert_called_once() + + # State should be populated. + self.assertIn("pep", self.cog.seen_items) + self.assertEqual(self.cog.seen_items["pep"], {"1", "2"}) + + # No posts should happen because no missing lists. + self.bot.api_client.post.assert_not_awaited() + + async def test_retries_max_times_fails_and_reraises(self): + """`cog_load` should re-raise when all retry attempts fail.""" + self.bot.api_client.get.side_effect = OSError("Simulated site/API outage during cog_load") + + with ( + patch("bot.exts.info.python_news.asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + self.assertRaises(OSError), + ): + await self.cog.cog_load() + + # Should try exactly MAX_ATTEMPTS times. + + self.assertEqual(self.bot.api_client.get.await_count, URLs.connect_max_retries) + self.bot.api_client.get.assert_awaited_with("bot/mailing-lists") + + # Sleeps happen between attempts, so MAX_ATTEMPTS - 1 times. + self.assertEqual(mock_sleep.await_count, URLs.connect_max_retries - 1) + + # Task should never start if load fails. + self.mock_fetch_new_media_start.assert_not_called() + + async def test_cog_load_does_not_retry_non_retryable_error(self): + """`cog_load` should not retry when the error is non-retryable.""" + # 404 should be considered non-retryable by your predicate. + self.bot.api_client.get.side_effect = ResponseCodeError(MagicMock(status=404)) + + with ( + patch("bot.exts.info.python_news.asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + self.assertRaises(ResponseCodeError), + ): + await self.cog.cog_load() + + self.assertEqual(self.bot.api_client.get.await_count, 1) + self.assertEqual(mock_sleep.await_count, 0) + self.mock_fetch_new_media_start.assert_not_called() diff --git a/tests/bot/exts/moderation/infraction/test_superstarify_cog.py b/tests/bot/exts/moderation/infraction/test_superstarify_cog.py new file mode 100644 index 0000000000..54473c7064 --- /dev/null +++ b/tests/bot/exts/moderation/infraction/test_superstarify_cog.py @@ -0,0 +1,105 @@ +import unittest +from unittest.mock import AsyncMock, MagicMock, patch + +from bot.exts.moderation.infraction.superstarify import Superstarify +from tests.helpers import MockBot + + +class TestSuperstarify(unittest.IsolatedAsyncioTestCase): + + async def asyncSetUp(self): + self.bot = MockBot() + + self.cog = Superstarify(self.bot) + + self.bot.api_client = MagicMock() + self.bot.api_client.get = AsyncMock() + + self.cog._check_error_is_retriable = MagicMock(return_value=True) + + async def test_fetch_from_api_success(self): + """API succeeds on first attempt.""" + expected = [{"id": 1}] + self.bot.api_client.get.return_value = expected + + result = await self.cog._fetch_with_retries( + params={"user__id": "123"} + ) + self.assertEqual(result, expected) + + self.bot.api_client.get.assert_awaited_once_with( + "bot/infractions", + params={"user__id": "123"}, + ) + + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_fetch_retries_then_succeeds(self, _): + self.bot.api_client.get.side_effect = [ + OSError("temporary failure"), + [{"id": 42}], + ] + + result = await self.cog._fetch_with_retries( + params={"user__id": "123"} + ) + + self.assertEqual(result, [{"id": 42}]) + self.assertEqual(self.bot.api_client.get.await_count, 2) + + + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_fetch_fails_after_max_retries(self, _): + error = OSError("API down") + + self.bot.api_client.get.side_effect = error + + with self.assertRaises(OSError): + await self.cog._fetch_with_retries( + retries=3, + params={"user__id": "123"}, + ) + + self.assertEqual(self.bot.api_client.get.await_count, 3) + + + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_non_retriable_error_stops_immediately(self, _): + error = ValueError("bad request") + + self.bot.api_client.get.side_effect = error + self.cog._check_error_is_retriable.return_value = False + + with self.assertRaises(ValueError): + await self.cog._fetch_with_retries() + + # only one attempt + self.bot.api_client.get.assert_awaited_once() + + + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_member_update_recovers_from_api_failure(self, _): + before = MagicMock(display_name="Old", id=123) + after = MagicMock(display_name="New", id=123) + after.edit = AsyncMock() + + self.bot.api_client.get.side_effect = [ + OSError(), + [{"id": 42}], + ] + + self.cog.get_nick = MagicMock(return_value="Taylor Swift") + + with patch( + "bot.exts.moderation.infraction._utils.notify_infraction", + new=AsyncMock(return_value=True), + ): + await self.cog.on_member_update(before, after) + + after.edit.assert_awaited_once() + + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_alert_triggered_after_total_failure(self, _): + self.bot.api_client.get.side_effect = OSError("down") + + with self.assertRaises(OSError): + await self.cog._fetch_with_retries(retries=3) diff --git a/tests/bot/exts/test_extensions.py b/tests/bot/exts/test_extensions.py new file mode 100644 index 0000000000..2546873883 --- /dev/null +++ b/tests/bot/exts/test_extensions.py @@ -0,0 +1,157 @@ +import asyncio +import contextlib +import importlib +import sys +import unittest +import unittest.mock +from pathlib import Path +from tempfile import TemporaryDirectory + +import discord + +from bot.bot import Bot + + +class ExtensionLoadingTests(unittest.IsolatedAsyncioTestCase): + async def asyncSetUp(self) -> None: + self.http_session = unittest.mock.MagicMock(name="http_session") + + # Set up a Bot instance with minimal configuration for testing extension loading. + self.bot = Bot( + command_prefix="!", + guild_id=123456789012345678, + allowed_roles=[], + http_session=self.http_session, + intents=discord.Intents.none() + ) + + # Avoid blocking in _load_extensions() + async def _instant() -> None: + return None + self.bot.wait_until_guild_available = _instant + + # Ensure clean state + self.bot.extension_load_failures = {} + self.bot._extension_load_tasks = {} + + # Temporary importable package: tmp_root/testexts/__init__.py + modules + self._temp_dir = TemporaryDirectory() + self.addCleanup(self._temp_dir.cleanup) + self.tmp_root = Path(self._temp_dir.name) + + self.pkg_name = "testexts" + self.pkg_dir = self.tmp_root / self.pkg_name + self.pkg_dir.mkdir(parents=True, exist_ok=True) + (self.pkg_dir / "__init__.py").write_text("", encoding="utf-8") + + sys.path.insert(0, str(self.tmp_root)) + self.addCleanup(self._remove_tmp_from_syspath) + self._purge_modules(self.pkg_name) + + # Ensure scheduled tasks execute during tests + self._create_task_patcher = unittest.mock.patch( + "pydis_core.utils.scheduling.create_task", + side_effect=lambda coro, *a, **k: asyncio.create_task(coro), + ) + self._create_task_patcher.start() + self.addCleanup(self._create_task_patcher.stop) + + def _remove_tmp_from_syspath(self) -> None: + """Remove the temporary directory from sys.path.""" + with contextlib.suppress(ValueError): + sys.path.remove(str(self.tmp_root)) + + def _purge_modules(self, prefix: str) -> None: + """Remove modules from sys.modules that match the given prefix.""" + for name in list(sys.modules.keys()): + if name == prefix or name.startswith(prefix + "."): + del sys.modules[name] + + def _write_ext(self, module_name: str, source: str) -> str: + """Write an extension module with the given source code and + return its full import path.""" + (self.pkg_dir / f"{module_name}.py").write_text(source, encoding="utf-8") + full = f"{self.pkg_name}.{module_name}" + self._purge_modules(full) + return full + + async def _run_loader(self) -> None: + """Run the extension loader on the package containing our test extensions.""" + module = importlib.import_module(self.pkg_name) + + await self.bot._load_extensions(module) + + # Wait for all extension load tasks to complete so that exceptions are recorded in the bot's state + tasks = getattr(self.bot, "_extension_load_tasks", {}) or {} + if tasks: + await asyncio.gather(*tasks.values(), return_exceptions=True) + + def _assert_failure_recorded_for_extension(self, ext: str) -> None: + """Assert that a failure is recorded for the given extension.""" + if ext in self.bot.extension_load_failures: + return + for key in self.bot.extension_load_failures: + if key.startswith(ext): + return + self.fail( + f"Expected a failure recorded for {ext!r}. " + f"Recorded keys: {sorted(self.bot.extension_load_failures.keys())}" + ) + + async def test_setup_failure_is_captured(self) -> None: + ext = self._write_ext( + "ext_setup_fail", + """ +async def setup(bot): + raise RuntimeError("setup failed") +""", + ) + await self._run_loader() + self._assert_failure_recorded_for_extension(ext) + + async def test_cog_load_failure_is_captured(self) -> None: + ext = self._write_ext( + "ext_cogload_fail", + """ +from discord.ext import commands + +class BadCog(commands.Cog): + async def cog_load(self): + raise RuntimeError("cog_load failed") + +async def setup(bot): + await bot.add_cog(BadCog()) +""", + ) + await self._run_loader() + self._assert_failure_recorded_for_extension(ext) + + async def test_add_cog_failure_is_captured(self) -> None: + ext = self._write_ext( + "ext_addcog_fail", + """ +from discord.ext import commands + +class DupCog(commands.Cog): + pass + +async def setup(bot): + await bot.add_cog(DupCog()) + await bot.add_cog(DupCog()) +""", + ) + await self._run_loader() + self._assert_failure_recorded_for_extension(ext) + + async def test_import_failure_is_captured(self) -> None: + ext = self._write_ext( + "ext_import_fail", + """ +raise RuntimeError("import failed before setup()") + +async def setup(bot): + pass +""", + ) + await self._run_loader() + self._assert_failure_recorded_for_extension(ext) diff --git a/tests/bot/exts/utils/test_reminders.py b/tests/bot/exts/utils/test_reminders.py new file mode 100644 index 0000000000..eb1d903876 --- /dev/null +++ b/tests/bot/exts/utils/test_reminders.py @@ -0,0 +1,57 @@ +import unittest +from unittest.mock import AsyncMock, MagicMock, patch + +from pydis_core.site_api import ResponseCodeError + +from bot.constants import URLs +from bot.exts.utils.reminders import Reminders +from tests.helpers import MockBot + + +class RemindersCogLoadTests(unittest.IsolatedAsyncioTestCase): + """ Tests startup behaviour of the Reminders cog. """ + + def setUp(self): + self.bot = MockBot() + self.bot.wait_until_guild_available = AsyncMock() + self.cog = Reminders(self.bot) + + self.cog._alert_mods_if_loading_failed = AsyncMock() + self.cog.ensure_valid_reminder = MagicMock(return_value=(False, None)) + self.cog.schedule_reminder = MagicMock() + self.cog._alert_mods_if_loading_failed = AsyncMock() + + self.bot.api_client = MagicMock() + self.bot.api_client.get = AsyncMock() + + async def test_reminders_cog_loads_correctly(self): + """ Tests if the Reminders cog loads without error if the GET requests works. """ + self.bot.api_client.get.return_value = [] + try: + with patch("bot.exts.utils.reminders.asyncio.sleep", new_callable=AsyncMock): + await self.cog.cog_load() + except Exception as e: + self.fail(f"Reminders cog failed to load with exception: {e}") + + async def test_reminders_cog_load_retries_after_initial_exception(self): + """ Tests if the Reminders cog loads after retrying on initial exception. """ + self.bot.api_client.get.side_effect = [OSError("fail1"), OSError("fail2"), []] + try: + with patch("bot.exts.utils.reminders.asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + await self.cog.cog_load() + except Exception as e: + self.fail(f"Reminders cog failed to load after retrying with exception: {e}") + self.assertEqual(mock_sleep.await_count, 2) + self.bot.api_client.get.assert_called() + + async def test_reminders_cog_load_fails_after_max_retries(self): + """ Tests if the Reminders cog fails to load after max retries. """ + self.bot.api_client.get.side_effect = ResponseCodeError(response=MagicMock(status=500), + response_text="Internal Server Error") + with patch("bot.exts.utils.reminders.asyncio.sleep", new_callable=AsyncMock) as mock_sleep, \ + self.assertRaises(ResponseCodeError): + await self.cog.cog_load() + + # Should have retried MAX_RETRY_ATTEMPTS - 1 times before failing + self.assertEqual(mock_sleep.await_count, URLs.connect_max_retries - 1) + self.bot.api_client.get.assert_called() diff --git a/tests/bot/utils/test_retry.py b/tests/bot/utils/test_retry.py new file mode 100644 index 0000000000..8ce6f6db0c --- /dev/null +++ b/tests/bot/utils/test_retry.py @@ -0,0 +1,29 @@ +import unittest +from unittest.mock import MagicMock + +from pydis_core.site_api import ResponseCodeError + +from bot.utils.retry import is_retryable_api_error + + +class RetryTests(unittest.TestCase): + """Tests for retry classification helpers.""" + + def test_is_retryable_api_error(self): + """`is_retryable_api_error` should classify temporary failures as retryable.""" + test_cases = ( + (ResponseCodeError(MagicMock(status=408)), True), + (ResponseCodeError(MagicMock(status=429)), True), + (ResponseCodeError(MagicMock(status=500)), True), + (ResponseCodeError(MagicMock(status=503)), True), + (ResponseCodeError(MagicMock(status=400)), False), + (ResponseCodeError(MagicMock(status=404)), False), + (TimeoutError("timeout"), True), + (OSError("os error"), True), + (AttributeError("attr"), False), + (ValueError("value"), False), + ) + + for error, expected_retryable in test_cases: + with self.subTest(error=error): + self.assertEqual(is_retryable_api_error(error), expected_retryable)