From fa929e949cb30a5881759bce79460fca31c4105d Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 14 Nov 2024 15:59:47 -0800 Subject: [PATCH 1/2] initial work --- .../FaultInjection/README.md | 49 +++++++++++++++++++ .../FaultInjection/changelog.md | 14 ++++++ 2 files changed, 63 insertions(+) create mode 100644 Microsoft.Azure.Cosmos/FaultInjection/README.md create mode 100644 Microsoft.Azure.Cosmos/FaultInjection/changelog.md diff --git a/Microsoft.Azure.Cosmos/FaultInjection/README.md b/Microsoft.Azure.Cosmos/FaultInjection/README.md new file mode 100644 index 0000000000..5b963ee431 --- /dev/null +++ b/Microsoft.Azure.Cosmos/FaultInjection/README.md @@ -0,0 +1,49 @@ +# Azure Cosmos DB .NET SDK Fault Injection Library + +The Azure Cosmos DB .NET SDK Fault Injection Library allows you to simulate network issues in the Azure Cosmos DB .NET SDK. This library is useful for testing the SDK's behavior when there are network issues. Additionally, this library can help you test your own retry policies. + +## Key Concepts + +### `FaultInjectionRule` + +To induce faults, we will introduce a new type `FaultInjectionRule`. This type will allow you to configure how the SDK fails requests. Once created, `FaultInjectionRule`s can be added to specific containers. + +The `FaultInjectionRule` has two major components: a `FaultInjectionCondition` and a `FaultInjectionResult`, in addition to an `id` for each rule. + +#### `FaultInjectionResult` + +The `FaultInjectionResult` component of the `FaultInjectionRule` specifies what the result of the fault that is to be injected will be. The `FaultInjectionResult` component can be one of two types: `FaultInjectionServerErrorResult` or `FaultInjectionConnectionErrorResult`. + +##### `FaultInjectionServerErrorResult` + +This result will return a server error to the customer. `FaultInjectionServerErrorResult`. 
Currently, the following server error types are supported: + +| Error Type | Status Code | Description | +| ---------- | ----------- | ----------- | +| `Gone` | 410:21005 | The requested resource is no longer available at the server and no forwarding address is known. This condition should be considered permanent. | +| `RetryWith` | 449:0 | The client should retry the request using the specified URI. | +| `InternalServerError` | 500:0 | The server encountered an unexpected condition that prevented it from fulfilling the request. | +| `TooManyRequests` | 429:3200 | The client has sent too many requests in a given amount of time. | +| `ReadSessionNotAvailable` | 404:1002 | The read session is not available. | +| `Timeout` | 408:0 | The operation did not complete within the allocated time. | +| `PartitionIsSplitting` | 410:1007 | The partition is currently splitting. | +| `PartitionIsMigrating` | 410:1008 | The partition is currently migrating. | +| `SendDelay` | n/a | Used to simulate transient timeout/broken connections. | +| `ResponseDelay` | n/a | Used to simulate transient timeout/broken connections. | +| `ConnectionDelay` | n/a | Used to simulate high channel acquisition. | +| `ServiceUnavailable` | 503:0 | The service is currently unavailable. | + +##### `FaultInjectionConnectionErrorResult` + +This result will return a connection error to the customer. `FaultInjectionConnectionErrorResult`. Currently, the following connection error types are supported: + +| Error Type | Description | +| ---------- | ----------- | +| `ReceiveStreamClosed` | The connection was closed. | +| `ReceiveFailed` | The connection was reset. | + +#### `FaultInjectionCondition` + +The `FaultInjectionCondition` component of the `FaultInjectionRule` specifies when the fault should be injected. 
`FaultInjectionCondition`s can be used to limit the faults in the following ways: + + diff --git a/Microsoft.Azure.Cosmos/FaultInjection/changelog.md b/Microsoft.Azure.Cosmos/FaultInjection/changelog.md new file mode 100644 index 0000000000..301ed44c62 --- /dev/null +++ b/Microsoft.Azure.Cosmos/FaultInjection/changelog.md @@ -0,0 +1,14 @@ +## Release notes + +This project is in beta. The API and functionality may change when the project is updated. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +### [1.0.0-beta.0](https://www.nuget.org/packages/Microsoft.Azure.Cosmos.FaultInjection/1.0.0-beta.0) - 2024-11-15 + +#### Added + +- Support for fault injection in the Cosmos SDK. +- Support for fault injection in Direct Mode. +- Support for fault injection in Gateway Mode. From 978eb1979c055eb29daff587dfb6c207d590a4f5 Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:18:14 -0800 Subject: [PATCH 2/2] added readme examples --- .../FaultInjection/README.md | 234 ++++++++++++++++-- 1 file changed, 211 insertions(+), 23 deletions(-) diff --git a/Microsoft.Azure.Cosmos/FaultInjection/README.md b/Microsoft.Azure.Cosmos/FaultInjection/README.md index 5b963ee431..7f43976791 100644 --- a/Microsoft.Azure.Cosmos/FaultInjection/README.md +++ b/Microsoft.Azure.Cosmos/FaultInjection/README.md @@ -1,12 +1,12 @@ # Azure Cosmos DB .NET SDK Fault Injection Library -The Azure Cosmos DB .NET SDK Fault Injection Library allows you to simulate network issues in the Azure Cosmos DB .NET SDK. This library is useful for testing the SDK's behavior when there are network issues. Additionally, this library can help you test your own retry policies. +The Azure Cosmos DB .NET SDK Fault Injection Library allows you to simulate network issues in the Azure Cosmos DB .NET SDK. 
This library is useful for testing the SDK's behavior when there are network issues. Additionally, this library can help you test your own retry policies. Note that this library is not intended for use in production environments and should only be used for testing purposes. This **library** is currently in preview, and breaking changes may occur. ## Key Concepts ### `FaultInjectionRule` -To induce faults, we will introduce a new type `FaultInjectionRule`. This type will allow you to configure how the SDK fails requests. Once created, `FaultInjectionRule`s can be added to specific containers. +To induce faults, we will introduce a new type: `FaultInjectionRule`. This type will allow you to configure how the SDK fails requests. Once created, `FaultInjectionRule`s can be added to specific containers. The `FaultInjectionRule` has two major components: a `FaultInjectionCondition` and a `FaultInjectionResult`, in addition to an `id` for each rule. @@ -16,34 +16,222 @@ The `FaultInjectionResult` component of the `FaultInjectionRule` specifies what ##### `FaultInjectionServerErrorResult` -This result will return a server error to the customer. `FaultInjectionServerErrorResult`. Currently, the following server error types are supported: - -| Error Type | Status Code | Description | -| ---------- | ----------- | ----------- | -| `Gone` | 410:21005 | The requested resource is no longer available at the server and no forwarding address is known. This condition should be considered permanent. | -| `RetryWith` | 449:0 | The client should retry the request using the specified URI. | -| `InternalServerError` | 500:0 | The server encountered an unexpected condition that prevented it from fulfilling the request. | -| `TooManyRequests` | 429:3200 | The client has sent too many requests in a given amount of time. | -| `ReadSessionNotAvailable` | 404:1002 | The read session is not available. | -| `Timeout` | 408:0 | The operation did not complete within the allocated time. 
| -| `PartitionIsSplitting` | 410:1007 | The partition is currently splitting. | -| `PartitionIsMigrating` | 410:1008 | The partition is currently migrating. | -| `SendDelay` | n/a | Used to simulate transient timeout/broken connections. | -| `ResponseDelay` | n/a | Used to simulate transient timeout/broken connections. | -| `ConnectionDelay` | n/a | Used to simulate high channel acquisition. | -| `ServiceUnavailable` | 503:0 | The service is currently unavailable. | +This result will return a server error to the customer: `FaultInjectionServerErrorResult`. Currently, the following server error types are supported: + +| Error Type | Status Code | Description | +| ----------------------- | ------------ | --------------------------------------------------------------------------- | +| `Gone` | 410:21005 | The requested resource is no longer available at the server and no forwarding address is known. This condition should be considered permanent. | +| `RetryWith` | 449:0 | The client should retry the request using the specified URI. | +| `InternalServerError` | 500:0 | The server encountered an unexpected condition that prevented it from fulfilling the request. | +| `TooManyRequests` | 429:3200 | The client has sent too many requests in a given amount of time. | +| `ReadSessionNotAvailable`| 404:1002 | The read session is not available. | +| `Timeout` | 408:0 | The operation did not complete within the allocated time. | +| `PartitionIsSplitting` | 410:1007 | The partition is currently splitting. | +| `PartitionIsMigrating` | 410:1008 | The partition is currently migrating. | +| `SendDelay` | n/a | Will inject a delay to the request before it is sent to the backend. | +| `ResponseDelay` | n/a | Will inject a delay to the request after a response is received from the backend before returning the result. | +| `ConnectionDelay` | n/a | Used to simulate high channel acquisition. | +| `ServiceUnavailable` | 503:0 | The service is currently unavailable. 
| ##### `FaultInjectionConnectionErrorResult` -This result will return a connection error to the customer. `FaultInjectionConnectionErrorResult`. Currently, the following connection error types are supported: +This result will return a connection error to the customer: `FaultInjectionConnectionErrorResult`. Currently, the following connection error types are supported: -| Error Type | Description | -| ---------- | ----------- | -| `ReceiveStreamClosed` | The connection was closed. | -| `ReceiveFailed` | The connection was reset. | +| Error Type | Description | +| ----------------------- | ---------------------------------------------------------------------- | +| `ReceiveStreamClosed` | The connection was closed. | +| `ReceiveFailed` | The connection was reset. | + +##### Other `FaultInjectionResult` Properties + +When creating a `FaultInjectionResult`, you can also specify the following properties: + +| Property | Description | +| -------------- | ----------- | +| `Times` | This allows you to specify how many times to inject the fault for a single operation. By default, there is no limit. | +| `Delay` | This allows you to specify how long to delay the fault injection. Only applicable for `SendDelay`, `ResponseDelay`, and `ConnectionDelay` error types. | +| `InjectionRate`| This allows you to specify how often the rule is applied when applicable to an operation. By default, the rate is 100%. | #### `FaultInjectionCondition` The `FaultInjectionCondition` component of the `FaultInjectionRule` specifies when the fault should be injected. `FaultInjectionCondition`s can be used to limit the faults in the following ways: +| Condition | Description | +| ------------- | ----------- | +| OperationType | The fault will only be injected if the operation type matches the specified `FaultInjectionOperationType`. If not set, it will inject on all requests. 
| +| ConnectionType | The fault will only be injected if the connection type matches the specified `FaultInjectionConnectionType`. If not set, it will inject on all connection types. | +| Region | The fault will only be injected if the region matches the specified region. If not set, it will inject on all regions. | +| Endpoint | The fault will only be injected if the endpoint matches the specified endpoint. This can be used to target specific replicas and partitions. If not set, it will inject on all endpoints. | + +##### `FaultInjectionOperationType` + +The `FaultInjectionOperationType` specifies the type of operation that the fault should be injected on. The following operation types are supported: + +| Operation Type | +| -------------- | +| `ReadItem` | +| `QueryItem` | +| `CreateItem` | +| `UpsertItem` | +| `ReplaceItem` | +| `DeleteItem` | +| `PatchItem` | +| `Batch` | +| `ReadFeed` | +| `All` | + +##### `FaultInjectionConnectionType` + +The `FaultInjectionConnectionType` specifies the type of connection that the fault should be injected on. The following connection types are supported: + +| Connection Type | +| --------------- | +| `Direct` | +| `Gateway` | +| `All` | + +#### Other `FaultInjectionRule` Properties + +When creating a `FaultInjectionRule`, you can also specify the following properties: + +| Property | Description | +| -------------- | ----------- | +| `Duration` | This allows you to specify how long a rule is valid for. | +| `StartDelay` | This allows you to specify how long to wait before starting to inject faults. | +| `HitLimit` | This allows you to specify how many times to inject faults. | + + +### `FaultInjector` + +The `FaultInjector` is a class that allows you to inject faults into the Azure Cosmos DB .NET SDK. The `FaultInjector` is created with a list of `FaultInjectionRule`s. Once created, the `FaultInjector` can be passed to the `CosmosClient` constructor to enable fault injection. 
+ +After conducting the tests, you can use the `FaultInjector` to get the `FaultInjectionApplicationContext`, which allows you to get the following: + +- Given a rule id, get the time and activity id of all requests that were affected by the rule. +- Given an activity id, get the rule id that affected the request. + +This can be useful for debugging and understanding which rules are affecting which requests. + +## Examples + +The following examples demonstrate creating `FaultInjectionRule`s for some of the most common scenarios for using the Azure Cosmos DB .NET SDK Fault Injection Library. There is also an example of how to create a `CosmosClient` that has fault injection enabled. + +### High Latency in a Single Region + +This rule will inject a 4-second delay in the response for read item operations 5 seconds after client creation for 30 seconds. + +```c# +FaultInjectionRule rule = new FaultInjectionRuleBuilder( + id: "HighLatencyRule", + condition: new FaultInjectionConditionBuilder() + .WithOperationType(FaultInjectionOperationType.ReadItem) + .Build(), + result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ResponseDelay) + .WithDelay(TimeSpan.FromSeconds(4)) + .Build()) + .WithDuration(TimeSpan.FromSeconds(30)) + .WithStartDelay(TimeSpan.FromSeconds(5)) + .Build(); +``` + +### High Channel Acquisition + +```c# +FaultInjectionRule rule = new FaultInjectionRuleBuilder( + id: "HighChannelAcquisitionRule", + condition: new FaultInjectionConditionBuilder() + .WithConnectionType(FaultInjectionConnectionType.Direct) // Only inject on direct mode connections + .Build(), + result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ConnectionDelay) + .WithDelay(TimeSpan.FromSeconds(6)) // Default connection timeout is 5 seconds + .Build()) + .Build(); +``` + +### Server Return Gone + +This rule will return a 410 Gone error for all operations.
Note that because a 410 Gone error returned by the server applies to all operations, the `FaultInjectionCondition` will be ignored. + +```c# +FaultInjectionRule rule = new FaultInjectionRuleBuilder( + id: "GoneRule", + condition: new FaultInjectionConditionBuilder() + .WithOperationType(FaultInjectionOperationType.ReadItem) + .Build(), + result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.Gone) + .Build()) + .Build(); +``` + +### Server Unavailable + +This rule will return a 503 Service Unavailable error for 10% of all operations in the East US region. + +```c# +FaultInjectionRule rule = new FaultInjectionRuleBuilder( + id: "ServiceUnavailableRule", + condition: new FaultInjectionConditionBuilder() + .WithRegion("East US") + .Build(), + result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ServiceUnavailable) + .WithInjectionRate(0.1) + .Build()) + .Build(); +``` + +### Random Connection Closed + +This rule will randomly close 30% of connections for all operations every 5 seconds for 30 seconds.
+ +```c# +FaultInjectionRule rule = new FaultInjectionRuleBuilder( + id: "RandomConnectionClosedRule", + condition: new FaultInjectionConditionBuilder() + .WithEndpoint(new FaultInjectionEndpointBuilder("dbName", "containerName", feedRange).Build()) + .Build(), + result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionConnectionErrorType.ReceiveStreamClosed) + .WithInterval(TimeSpan.FromSeconds(5)) // Inject every 5 seconds + .WithThreshold(0.3) + .Build()) + .WithDuration(TimeSpan.FromSeconds(30)) + .Build(); +``` + +### Create a `CosmosClient` with Fault Injection Enabled + +```c# + +List<FaultInjectionRule> rules = new List<FaultInjectionRule> +{ + // Add rules here +}; + +FaultInjector faultInjector = new FaultInjector(rules); + +``` + +Once you have created the `FaultInjector`, you can pass it to the `CosmosClient` constructor: + +```c# + +CosmosClient client = new CosmosClientBuilder("connectionString") + .WithFaultInjector(faultInjector) + .Build(); + +``` + +or + +```c# + +CosmosClientOptions options = new CosmosClientOptions +{ + FaultInjector = faultInjector +}; + +CosmosClient client = new CosmosClient("connectionString", options); + +``` + +