feat: add database circuit breaker and health-readiness endpoint
Introduce a process-wide DatabaseCircuitBreaker that fails fast when Postgres is unavailable (e.g. disk full) instead of letting every request burn doomed EF Core retries. CircuitAwareExecutionStrategy derives from NpgsqlRetryingExecutionStrategy and records success/failure around the public Execute/ExecuteAsync seams; background workers skip work and back off while the circuit is open; the exception middleware maps an open circuit (and other DB outages) to 503. Adds /health (liveness) and /health/ready (readiness, reporting circuit state), plus unit tests for the open/half-open transitions and non-transient SQLSTATE detection. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6da0690fd3
commit
5196ffa0f0
@ -0,0 +1,15 @@
|
||||
namespace LiveRecorder.Application.Models.Reports;
|
||||
|
||||
public sealed class HealthReadyResponse
|
||||
{
|
||||
public string Status { get; set; } = "ready";
|
||||
public DateTimeOffset Timestamp { get; set; }
|
||||
public DatabaseHealthStatus Database { get; set; } = new();
|
||||
}
|
||||
|
||||
public sealed class DatabaseHealthStatus
|
||||
{
|
||||
public bool Reachable { get; set; }
|
||||
public string? Reason { get; set; }
|
||||
public int ConsecutiveFailures { get; set; }
|
||||
}
|
||||
@ -0,0 +1,103 @@
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.EntityFrameworkCore.Storage;
|
||||
using Npgsql.EntityFrameworkCore.PostgreSQL;
|
||||
|
||||
namespace LiveRecorder.Infrastructure.Persistence;
|
||||
|
||||
/// <summary>
|
||||
/// Custom EF Core execution strategy that integrates with <see cref="DatabaseCircuitBreaker"/>.
|
||||
///
|
||||
/// When the circuit is open, operations fail immediately without retrying.
|
||||
/// When a non-transient error occurs (e.g. disk_full), the operation does not retry.
|
||||
/// After each failure, the circuit is recorded, and after each success, the circuit is reset.
|
||||
/// </summary>
|
||||
public sealed class CircuitAwareExecutionStrategy : NpgsqlRetryingExecutionStrategy
|
||||
{
|
||||
private static readonly TimeSpan DefaultMaxRetryDelay = TimeSpan.FromSeconds(15);
|
||||
|
||||
public CircuitAwareExecutionStrategy(
|
||||
ExecutionStrategyDependencies dependencies,
|
||||
int maxRetryCount,
|
||||
TimeSpan maxRetryDelay)
|
||||
: base(dependencies, maxRetryCount, maxRetryDelay, errorCodesToAdd: null)
|
||||
{
|
||||
}
|
||||
|
||||
public CircuitAwareExecutionStrategy(
|
||||
ExecutionStrategyDependencies dependencies,
|
||||
int maxRetryCount,
|
||||
TimeSpan maxRetryDelay,
|
||||
ICollection<string>? errorCodesToAdd)
|
||||
: base(dependencies, maxRetryCount, maxRetryDelay, errorCodesToAdd)
|
||||
{
|
||||
}
|
||||
|
||||
protected override bool ShouldRetryOn(Exception? exception)
|
||||
{
|
||||
// Fast-fail: circuit is open
|
||||
if (DatabaseCircuitBreaker.IsOpen)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fast-fail: non-transient errors (disk_full, out_of_memory, etc.)
|
||||
if (DatabaseCircuitBreaker.IsNonTransient(exception))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Delegate to default Npgsql retry logic for transient errors
|
||||
return base.ShouldRetryOn(exception);
|
||||
}
|
||||
|
||||
protected override void OnFirstExecution()
|
||||
{
|
||||
// If circuit is open, throw immediately before even attempting
|
||||
if (DatabaseCircuitBreaker.IsOpen)
|
||||
{
|
||||
throw new DatabaseCircuitOpenException(
|
||||
$"Database circuit breaker is open. Consecutive failures: {DatabaseCircuitBreaker.ConsecutiveFailures}. " +
|
||||
$"Circuit opened at: {DatabaseCircuitBreaker.OpenedAt:O}.");
|
||||
}
|
||||
|
||||
base.OnFirstExecution();
|
||||
}
|
||||
|
||||
public override TResult Execute<TState, TResult>(
|
||||
TState state,
|
||||
Func<DbContext, TState, TResult> operation,
|
||||
Func<DbContext, TState, ExecutionResult<TResult>>? verifySucceeded)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = base.Execute(state, operation, verifySucceeded);
|
||||
DatabaseCircuitBreaker.RecordSuccess();
|
||||
return result;
|
||||
}
|
||||
catch
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
public override async Task<TResult> ExecuteAsync<TState, TResult>(
|
||||
TState state,
|
||||
Func<DbContext, TState, CancellationToken, Task<TResult>> operation,
|
||||
Func<DbContext, TState, CancellationToken, Task<ExecutionResult<TResult>>>? verifySucceeded,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await base.ExecuteAsync(state, operation, verifySucceeded, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
DatabaseCircuitBreaker.RecordSuccess();
|
||||
return result;
|
||||
}
|
||||
catch
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,121 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace LiveRecorder.Infrastructure.Persistence;
|
||||
|
||||
/// <summary>
|
||||
/// Thread-safe static circuit breaker for database operations.
|
||||
/// When the database becomes unavailable (e.g. disk full), this prevents
|
||||
/// every API request from wasting 45 seconds on doomed retries.
|
||||
/// </summary>
|
||||
public static class DatabaseCircuitBreaker
|
||||
{
|
||||
private static readonly object Lock = new();
|
||||
|
||||
/// <summary>Consecutive failures before the circuit opens.</summary>
|
||||
private const int FailureThreshold = 5;
|
||||
|
||||
/// <summary>How long the circuit stays open before allowing a probe.</summary>
|
||||
private static readonly TimeSpan BreakDuration = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>Npgsql error codes that are NOT transient — retrying is futile.</summary>
|
||||
private static readonly HashSet<string> NonTransientCodes = new(StringComparer.Ordinal)
|
||||
{
|
||||
"53100", // disk_full
|
||||
"53200", // out_of_memory
|
||||
"53300", // too_many_connections
|
||||
"08006", // connection_failure (persistent)
|
||||
"57P03", // cannot_connect_now
|
||||
"42601", // syntax_error (bug, not transient)
|
||||
"42501", // insufficient_privilege
|
||||
"3D000", // invalid_catalog_name
|
||||
"28P01", // invalid_password
|
||||
};
|
||||
|
||||
private static int _consecutiveFailures;
|
||||
private static DateTimeOffset _openedAt = DateTimeOffset.MinValue;
|
||||
|
||||
/// <summary>Whether the circuit is currently open (failing fast).</summary>
|
||||
public static bool IsOpen
|
||||
{
|
||||
get
|
||||
{
|
||||
if (_consecutiveFailures < FailureThreshold)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (DateTimeOffset.UtcNow - _openedAt > BreakDuration)
|
||||
{
|
||||
// Transition to half-open: allow one probe
|
||||
lock (Lock)
|
||||
{
|
||||
if (_consecutiveFailures >= FailureThreshold &&
|
||||
DateTimeOffset.UtcNow - _openedAt > BreakDuration)
|
||||
{
|
||||
// Reset to just below threshold so the next call probes
|
||||
_consecutiveFailures = FailureThreshold - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public static int ConsecutiveFailures => Volatile.Read(ref _consecutiveFailures);
|
||||
|
||||
public static DateTimeOffset OpenedAt => _openedAt;
|
||||
|
||||
/// <summary>Record a successful database operation.</summary>
|
||||
public static void RecordSuccess()
|
||||
{
|
||||
lock (Lock)
|
||||
{
|
||||
_consecutiveFailures = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Record a failed database operation.</summary>
|
||||
public static void RecordFailure()
|
||||
{
|
||||
lock (Lock)
|
||||
{
|
||||
_consecutiveFailures++;
|
||||
if (_consecutiveFailures >= FailureThreshold)
|
||||
{
|
||||
_openedAt = DateTimeOffset.UtcNow;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Check whether a given exception is a non-transient database error
|
||||
/// that should NOT be retried. Returns true if retrying would be futile.
|
||||
/// </summary>
|
||||
public static bool IsNonTransient(Exception? ex)
|
||||
{
|
||||
while (ex is not null)
|
||||
{
|
||||
if (ex is Npgsql.NpgsqlException npgEx && npgEx.SqlState is { Length: 5 } state)
|
||||
{
|
||||
return NonTransientCodes.Contains(state);
|
||||
}
|
||||
|
||||
ex = ex.InnerException;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>Log the current circuit state.</summary>
|
||||
public static void LogState(ILogger logger)
|
||||
{
|
||||
logger.LogInformation(
|
||||
"DatabaseCircuitBreaker state: Open={IsOpen}, ConsecutiveFailures={Failures}, OpenedAt={OpenedAt}",
|
||||
IsOpen,
|
||||
ConsecutiveFailures,
|
||||
OpenedAt == DateTimeOffset.MinValue ? "never" : OpenedAt.ToString("O"));
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
namespace LiveRecorder.Infrastructure.Persistence;
|
||||
|
||||
/// <summary>
|
||||
/// Thrown when a database operation is rejected because the circuit breaker is open.
|
||||
/// This is a fast-fail — the request will not be retried.
|
||||
/// </summary>
|
||||
public sealed class DatabaseCircuitOpenException : InvalidOperationException
|
||||
{
|
||||
public DatabaseCircuitOpenException(string message) : base(message)
|
||||
{
|
||||
}
|
||||
|
||||
public DatabaseCircuitOpenException(string message, Exception innerException) : base(message, innerException)
|
||||
{
|
||||
}
|
||||
}
|
||||
@ -1,3 +1,4 @@
|
||||
using LiveRecorder.Infrastructure.Persistence;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
@ -7,6 +8,8 @@ namespace LiveRecorder.Infrastructure.Services;
|
||||
public sealed class CleanupOperationBackgroundService : BackgroundService
|
||||
{
|
||||
private static readonly TimeSpan IdleDelay = TimeSpan.FromSeconds(2);
|
||||
private static readonly TimeSpan ErrorBaseDelay = TimeSpan.FromSeconds(5);
|
||||
private static readonly TimeSpan ErrorMaxDelay = TimeSpan.FromMinutes(5);
|
||||
private readonly IServiceScopeFactory _serviceScopeFactory;
|
||||
private readonly ILogger<CleanupOperationBackgroundService> _logger;
|
||||
|
||||
@ -20,6 +23,7 @@ public sealed class CleanupOperationBackgroundService : BackgroundService
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
// Startup: requeue interrupted operations
|
||||
try
|
||||
{
|
||||
using var startupScope = _serviceScopeFactory.CreateScope();
|
||||
@ -31,17 +35,36 @@ public sealed class CleanupOperationBackgroundService : BackgroundService
|
||||
_logger.LogWarning(ex, "Failed to requeue interrupted cleanup operations at startup");
|
||||
}
|
||||
|
||||
var consecutiveErrors = 0;
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Skip processing if the circuit is open — don't waste resources
|
||||
if (DatabaseCircuitBreaker.IsOpen)
|
||||
{
|
||||
consecutiveErrors = await DelayWithBackoff(ErrorBaseDelay, ErrorMaxDelay, consecutiveErrors, stoppingToken);
|
||||
continue;
|
||||
}
|
||||
|
||||
using var scope = _serviceScopeFactory.CreateScope();
|
||||
var coordinator = scope.ServiceProvider.GetRequiredService<CleanupOperationCoordinator>();
|
||||
var processed = await coordinator.ProcessNextQueuedOperationAsync(stoppingToken);
|
||||
|
||||
if (processed)
|
||||
{
|
||||
consecutiveErrors = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
// No queued operations — idle delay
|
||||
consecutiveErrors = 0;
|
||||
await Task.Delay(IdleDelay, stoppingToken);
|
||||
}
|
||||
catch (DatabaseCircuitOpenException)
|
||||
{
|
||||
consecutiveErrors = await DelayWithBackoff(ErrorBaseDelay, ErrorMaxDelay, consecutiveErrors, stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
@ -49,17 +72,47 @@ public sealed class CleanupOperationBackgroundService : BackgroundService
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Cleanup operation background worker failed");
|
||||
}
|
||||
var isDatabaseError = DatabaseCircuitBreaker.IsNonTransient(ex) ||
|
||||
ex is Npgsql.NpgsqlException ||
|
||||
ex is Microsoft.EntityFrameworkCore.DbUpdateException;
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(IdleDelay, stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
if (isDatabaseError)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
_logger.LogWarning(ex,
|
||||
"Cleanup background worker: database error (#{ErrorCount}). Circuit state: Open={IsOpen}, Failures={Failures}",
|
||||
consecutiveErrors + 1,
|
||||
DatabaseCircuitBreaker.IsOpen,
|
||||
DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogError(ex, "Cleanup operation background worker failed");
|
||||
}
|
||||
|
||||
consecutiveErrors = await DelayWithBackoff(ErrorBaseDelay, ErrorMaxDelay, consecutiveErrors, stoppingToken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<int> DelayWithBackoff(
|
||||
TimeSpan baseDelay, TimeSpan maxDelay, int errorCount, CancellationToken cancellationToken)
|
||||
{
|
||||
errorCount++;
|
||||
// Exponential backoff: 5s, 10s, 20s, 40s, 80s, 160s, capping at 5min
|
||||
var factor = Math.Pow(2, Math.Min(errorCount - 1, 6));
|
||||
var delay = TimeSpan.FromMilliseconds(
|
||||
Math.Min(baseDelay.TotalMilliseconds * factor, maxDelay.TotalMilliseconds));
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Swallow — loop will exit on next iteration
|
||||
}
|
||||
|
||||
return errorCount;
|
||||
}
|
||||
}
|
||||
|
||||
@ -146,8 +146,20 @@ public sealed class LiveRoomPollingBackgroundService : BackgroundService, ILiveR
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (DatabaseCircuitOpenException)
|
||||
{
|
||||
// Circuit is open — skip this iteration and wait
|
||||
_logger.LogDebug("Polling loop skipped: database circuit breaker is open");
|
||||
delay = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Record database failures to the circuit breaker
|
||||
if (IsTransientDatabaseException(ex) || DatabaseCircuitBreaker.IsNonTransient(ex))
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
}
|
||||
|
||||
_logger.LogError(ex, "Background live room polling failed");
|
||||
|
||||
try
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
using LiveRecorder.Infrastructure.Persistence;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
@ -7,7 +8,8 @@ namespace LiveRecorder.Infrastructure.Services;
|
||||
public sealed class RetentionCleanupBackgroundService : BackgroundService
|
||||
{
|
||||
private static readonly TimeSpan CleanupInterval = TimeSpan.FromHours(24);
|
||||
|
||||
private static readonly TimeSpan ErrorBaseDelay = TimeSpan.FromSeconds(10);
|
||||
private static readonly TimeSpan ErrorMaxDelay = TimeSpan.FromMinutes(5);
|
||||
private readonly IServiceScopeFactory _serviceScopeFactory;
|
||||
private readonly ILogger<RetentionCleanupBackgroundService> _logger;
|
||||
|
||||
@ -21,13 +23,31 @@ public sealed class RetentionCleanupBackgroundService : BackgroundService
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
var consecutiveErrors = 0;
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Skip if circuit is already open
|
||||
if (DatabaseCircuitBreaker.IsOpen)
|
||||
{
|
||||
_logger.LogDebug("Retention cleanup skipped: database circuit breaker is open");
|
||||
await Task.Delay(TimeSpan.FromMinutes(1), stoppingToken);
|
||||
continue;
|
||||
}
|
||||
|
||||
using var scope = _serviceScopeFactory.CreateScope();
|
||||
var cleanupService = scope.ServiceProvider.GetRequiredService<RetentionCleanupService>();
|
||||
await cleanupService.TryEnqueueAsync(ignoreEnabledSetting: false, cancellationToken: stoppingToken);
|
||||
|
||||
consecutiveErrors = 0;
|
||||
}
|
||||
catch (DatabaseCircuitOpenException)
|
||||
{
|
||||
// Silent — circuit is already logged
|
||||
await Task.Delay(TimeSpan.FromMinutes(1), stoppingToken);
|
||||
continue;
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
@ -35,7 +55,38 @@ public sealed class RetentionCleanupBackgroundService : BackgroundService
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Retention cleanup background task failed");
|
||||
var isDatabaseError = DatabaseCircuitBreaker.IsNonTransient(ex) ||
|
||||
ex is Npgsql.NpgsqlException ||
|
||||
ex is Microsoft.EntityFrameworkCore.DbUpdateException;
|
||||
|
||||
consecutiveErrors++;
|
||||
var delay = TimeSpan.FromMilliseconds(
|
||||
Math.Min(ErrorBaseDelay.TotalMilliseconds * Math.Pow(2, Math.Min(consecutiveErrors - 1, 6)),
|
||||
ErrorMaxDelay.TotalMilliseconds));
|
||||
|
||||
if (isDatabaseError)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
_logger.LogWarning(ex,
|
||||
"Retention cleanup: database error (#{ErrorCount}). Circuit: Open={IsOpen}, Failures={Failures}",
|
||||
consecutiveErrors,
|
||||
DatabaseCircuitBreaker.IsOpen,
|
||||
DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogError(ex, "Retention cleanup background task failed");
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
using LiveRecorder.Infrastructure.Persistence;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Npgsql;
|
||||
|
||||
namespace LiveRecorder.WebApi.Middleware;
|
||||
|
||||
public sealed class ExceptionHandlingMiddleware
|
||||
@ -17,22 +21,80 @@ public sealed class ExceptionHandlingMiddleware
|
||||
{
|
||||
await _next(context);
|
||||
}
|
||||
catch (DatabaseCircuitOpenException ex)
|
||||
{
|
||||
// Circuit is open — fast-fail with 503
|
||||
_logger.LogWarning(ex, "Database circuit breaker is open, returning 503");
|
||||
context.Response.StatusCode = StatusCodes.Status503ServiceUnavailable;
|
||||
await context.Response.WriteAsJsonAsync(new
|
||||
{
|
||||
message = "Database temporarily unavailable",
|
||||
detail = "The database is currently unreachable. Requests will be accepted again once connectivity is restored.",
|
||||
retryAfterSeconds = 30
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception");
|
||||
context.Response.StatusCode = ex switch
|
||||
|
||||
// Detect database-related exceptions and return 503 instead of 500
|
||||
var statusCode = ex switch
|
||||
{
|
||||
DatabaseCircuitOpenException => StatusCodes.Status503ServiceUnavailable,
|
||||
KeyNotFoundException => StatusCodes.Status404NotFound,
|
||||
InvalidOperationException => StatusCodes.Status400BadRequest,
|
||||
NotSupportedException => StatusCodes.Status400BadRequest,
|
||||
_ when IsDatabaseException(ex) => StatusCodes.Status503ServiceUnavailable,
|
||||
_ => StatusCodes.Status500InternalServerError
|
||||
};
|
||||
|
||||
context.Response.StatusCode = statusCode;
|
||||
|
||||
await context.Response.WriteAsJsonAsync(new
|
||||
{
|
||||
message = ex.Message,
|
||||
detail = context.Response.StatusCode == StatusCodes.Status500InternalServerError ? "Internal Server Error" : null
|
||||
message = statusCode == StatusCodes.Status503ServiceUnavailable
|
||||
? "Database temporarily unavailable"
|
||||
: ex.Message,
|
||||
detail = statusCode switch
|
||||
{
|
||||
StatusCodes.Status503ServiceUnavailable => "The database is currently unreachable. Please retry later.",
|
||||
StatusCodes.Status500InternalServerError => "Internal Server Error",
|
||||
_ => null
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Check if an exception (or any of its inner exceptions) is a database-related error
|
||||
/// that should be surfaced as 503 Service Unavailable.
|
||||
/// </summary>
|
||||
private static bool IsDatabaseException(Exception ex)
|
||||
{
|
||||
var current = ex;
|
||||
while (current is not null)
|
||||
{
|
||||
if (current is NpgsqlException or DbUpdateException)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (current is InvalidOperationException ioEx &&
|
||||
(ioEx.Message.Contains("database", StringComparison.OrdinalIgnoreCase) ||
|
||||
ioEx.Message.Contains("connection", StringComparison.OrdinalIgnoreCase) ||
|
||||
ioEx.Message.Contains("Npgsql", StringComparison.OrdinalIgnoreCase)))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (current is TimeoutException && current.Message.Contains("database", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
current = current.InnerException;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -10,6 +10,7 @@ using LiveRecorder.Application.Abstractions.Recording;
|
||||
using LiveRecorder.Application.Abstractions.Scripting;
|
||||
using LiveRecorder.Application.Abstractions.Settings;
|
||||
using LiveRecorder.Application.Abstractions.Storage;
|
||||
using LiveRecorder.Application.Models.Reports;
|
||||
using LiveRecorder.Application.Services;
|
||||
using LiveRecorder.Infrastructure.Persistence;
|
||||
using LiveRecorder.Infrastructure.Persistence.Repositories;
|
||||
@ -31,6 +32,7 @@ using LiveRecorder.Infrastructure.Platforms.YouTube;
|
||||
using LiveRecorder.Infrastructure.Services;
|
||||
using LiveRecorder.WebApi.Middleware;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.EntityFrameworkCore.Storage;
|
||||
using Microsoft.OpenApi.Models;
|
||||
using Npgsql;
|
||||
|
||||
@ -149,6 +151,11 @@ builder.Services.AddDbContext<LiveRecorderDbContext>(options =>
|
||||
maxRetryCount: 3,
|
||||
maxRetryDelay: TimeSpan.FromSeconds(15),
|
||||
errorCodesToAdd: null);
|
||||
npgsql.ExecutionStrategy(dependencies =>
|
||||
new CircuitAwareExecutionStrategy(
|
||||
dependencies,
|
||||
maxRetryCount: 3,
|
||||
maxRetryDelay: TimeSpan.FromSeconds(15)));
|
||||
}));
|
||||
|
||||
builder.Services.AddScoped<IUnitOfWork>(provider => provider.GetRequiredService<LiveRecorderDbContext>());
|
||||
@ -231,6 +238,61 @@ app.UseMiddleware<ApiTokenAuthenticationMiddleware>();
|
||||
app.MapGet("/", () => Results.Redirect("/swagger"));
|
||||
app.MapControllers();
|
||||
|
||||
// ── Health check endpoints ────────────────────────────────────────────
|
||||
// /health — liveness: is the process alive and responding?
|
||||
app.MapGet("/health", () => Results.Ok(new { status = "healthy", timestamp = DateTimeOffset.UtcNow }));
|
||||
|
||||
// /health/ready — readiness: can we reach the database?
|
||||
app.MapGet("/health/ready", async (CancellationToken cancellationToken) =>
|
||||
{
|
||||
var readyResult = new HealthReadyResponse
|
||||
{
|
||||
Status = "ready",
|
||||
Timestamp = DateTimeOffset.UtcNow,
|
||||
Database = new DatabaseHealthStatus()
|
||||
};
|
||||
|
||||
if (DatabaseCircuitBreaker.IsOpen)
|
||||
{
|
||||
readyResult.Status = "degraded";
|
||||
readyResult.Database = new DatabaseHealthStatus
|
||||
{
|
||||
Reachable = false,
|
||||
Reason = "Circuit breaker is open",
|
||||
ConsecutiveFailures = DatabaseCircuitBreaker.ConsecutiveFailures
|
||||
};
|
||||
return Results.Json(readyResult, statusCode: StatusCodes.Status503ServiceUnavailable);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
// Lightweight probe: just test connectivity with a 2-second timeout
|
||||
await using var connection = new NpgsqlConnection(connectionStringBuilder.ConnectionString);
|
||||
var probeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
probeCts.CancelAfter(TimeSpan.FromSeconds(2));
|
||||
|
||||
await connection.OpenAsync(probeCts.Token);
|
||||
await using var cmd = connection.CreateCommand();
|
||||
cmd.CommandText = "SELECT 1";
|
||||
await cmd.ExecuteScalarAsync(probeCts.Token);
|
||||
|
||||
readyResult.Database = new DatabaseHealthStatus { Reachable = true };
|
||||
return Results.Ok(readyResult);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
readyResult.Status = "unhealthy";
|
||||
readyResult.Database = new DatabaseHealthStatus
|
||||
{
|
||||
Reachable = false,
|
||||
Reason = ex.Message,
|
||||
ConsecutiveFailures = DatabaseCircuitBreaker.ConsecutiveFailures
|
||||
};
|
||||
return Results.Json(readyResult, statusCode: StatusCodes.Status503ServiceUnavailable);
|
||||
}
|
||||
});
|
||||
|
||||
using (var scope = app.Services.CreateScope())
|
||||
{
|
||||
var initializer = scope.ServiceProvider.GetRequiredService<DatabaseInitializer>();
|
||||
|
||||
122
tests/LiveRecorder.Tests/DatabaseCircuitBreakerTests.cs
Normal file
122
tests/LiveRecorder.Tests/DatabaseCircuitBreakerTests.cs
Normal file
@ -0,0 +1,122 @@
|
||||
using System.Reflection;
|
||||
using LiveRecorder.Infrastructure.Persistence;
|
||||
using Npgsql;
|
||||
|
||||
namespace LiveRecorder.Tests;
|
||||
|
||||
public sealed class DatabaseCircuitBreakerTests
|
||||
{
|
||||
// Mirrors the private FailureThreshold in DatabaseCircuitBreaker so the assertions
|
||||
// read intentionally. Keep in sync if the production threshold changes.
|
||||
private const int FailureThreshold = 5;
|
||||
|
||||
public DatabaseCircuitBreakerTests()
|
||||
{
|
||||
// The breaker is a process-wide static, so reset it to a known closed state
|
||||
// before each test to keep cases independent.
|
||||
DatabaseCircuitBreaker.RecordSuccess();
|
||||
SetOpenedAt(DateTimeOffset.MinValue);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RecordFailure_below_threshold_keeps_circuit_closed()
|
||||
{
|
||||
for (var i = 0; i < FailureThreshold - 1; i++)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
}
|
||||
|
||||
Assert.False(DatabaseCircuitBreaker.IsOpen);
|
||||
Assert.Equal(FailureThreshold - 1, DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RecordFailure_at_threshold_opens_circuit()
|
||||
{
|
||||
for (var i = 0; i < FailureThreshold; i++)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
}
|
||||
|
||||
Assert.True(DatabaseCircuitBreaker.IsOpen);
|
||||
Assert.Equal(FailureThreshold, DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
Assert.NotEqual(DateTimeOffset.MinValue, DatabaseCircuitBreaker.OpenedAt);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RecordSuccess_closes_circuit_and_resets_failures()
|
||||
{
|
||||
for (var i = 0; i < FailureThreshold; i++)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
}
|
||||
|
||||
Assert.True(DatabaseCircuitBreaker.IsOpen);
|
||||
|
||||
DatabaseCircuitBreaker.RecordSuccess();
|
||||
|
||||
Assert.False(DatabaseCircuitBreaker.IsOpen);
|
||||
Assert.Equal(0, DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void IsOpen_after_break_duration_transitions_to_half_open()
|
||||
{
|
||||
for (var i = 0; i < FailureThreshold; i++)
|
||||
{
|
||||
DatabaseCircuitBreaker.RecordFailure();
|
||||
}
|
||||
|
||||
Assert.True(DatabaseCircuitBreaker.IsOpen);
|
||||
|
||||
// The breaker reads DateTimeOffset.UtcNow directly (no injectable clock), so move
|
||||
// the recorded open time past the 30s break window to simulate it elapsing.
|
||||
SetOpenedAt(DateTimeOffset.UtcNow - TimeSpan.FromSeconds(31));
|
||||
|
||||
// The first read past the window half-opens: it permits one probe and drops the
|
||||
// failure count to one below the threshold.
|
||||
Assert.False(DatabaseCircuitBreaker.IsOpen);
|
||||
Assert.Equal(FailureThreshold - 1, DatabaseCircuitBreaker.ConsecutiveFailures);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("53100")] // disk_full
|
||||
[InlineData("53300")] // too_many_connections
|
||||
[InlineData("28P01")] // invalid_password
|
||||
public void IsNonTransient_returns_true_for_known_fatal_sql_states(string sqlState)
|
||||
{
|
||||
var exception = new PostgresException("fatal", "FATAL", "FATAL", sqlState);
|
||||
|
||||
Assert.True(DatabaseCircuitBreaker.IsNonTransient(exception));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void IsNonTransient_returns_false_for_transient_sql_state()
|
||||
{
|
||||
// 40001 = serialization_failure, which is retryable and not in the fatal set.
|
||||
var exception = new PostgresException("retry me", "ERROR", "ERROR", "40001");
|
||||
|
||||
Assert.False(DatabaseCircuitBreaker.IsNonTransient(exception));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void IsNonTransient_unwraps_inner_exceptions()
|
||||
{
|
||||
var inner = new PostgresException("disk full", "FATAL", "FATAL", "53100");
|
||||
var wrapper = new InvalidOperationException("save failed", inner);
|
||||
|
||||
Assert.True(DatabaseCircuitBreaker.IsNonTransient(wrapper));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void IsNonTransient_returns_false_for_null_and_non_postgres_exceptions()
|
||||
{
|
||||
Assert.False(DatabaseCircuitBreaker.IsNonTransient(null));
|
||||
Assert.False(DatabaseCircuitBreaker.IsNonTransient(new InvalidOperationException("boom")));
|
||||
}
|
||||
|
||||
private static void SetOpenedAt(DateTimeOffset value) =>
|
||||
typeof(DatabaseCircuitBreaker)
|
||||
.GetField("_openedAt", BindingFlags.NonPublic | BindingFlags.Static)!
|
||||
.SetValue(null, value);
|
||||
}
|
||||
@ -20,6 +20,7 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\LiveRecorder.Application\LiveRecorder.Application.csproj" />
|
||||
<ProjectReference Include="..\..\src\LiveRecorder.Domain\LiveRecorder.Domain.csproj" />
|
||||
<ProjectReference Include="..\..\src\LiveRecorder.Infrastructure\LiveRecorder.Infrastructure.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user