Headline
CVE-2022-23606: CVE-2022-23606 · envoyproxy/envoy@4b6dd3b
Envoy is an open source edge and service proxy, designed for cloud-native applications. When a cluster is deleted via Cluster Discovery Service (CDS) all idle connections established to endpoints in that cluster are disconnected. A recursion was introduced in the procedure of disconnecting idle connections that can lead to stack exhaustion and abnormal process termination when a cluster has a large number of idle connections. This infinite recursion causes Envoy to crash. Users are advised to upgrade.
@@ -37,7 +37,8 @@ class CdsIntegrationTest : public Grpc::DeltaSotwIntegrationParamTest, public Ht sotwOrDelta() == Grpc::SotwOrDelta::Sotw || sotwOrDelta() == Grpc::SotwOrDelta::UnifiedSotw ? “GRPC” : “DELTA_GRPC”)) { : “DELTA_GRPC”)), cluster_creator_(&ConfigHelper::buildStaticCluster) { if (sotwOrDelta() == Grpc::SotwOrDelta::UnifiedSotw || sotwOrDelta() == Grpc::SotwOrDelta::UnifiedDelta) { config_helper_.addRuntimeOverride("envoy.reloadable_features.unified_mux", “true”); @@ -79,14 +80,14 @@ class CdsIntegrationTest : public Grpc::DeltaSotwIntegrationParamTest, public Ht // Create the regular (i.e. not an xDS server) upstreams. We create them manually here after // initialize() because finalize() expects all fake_upstreams_ to correspond to a static // cluster in the bootstrap config - which we don’t want since we’re testing dynamic CDS! addFakeUpstream(Http::CodecType::HTTP2); addFakeUpstream(Http::CodecType::HTTP2); cluster1_ = ConfigHelper::buildStaticCluster( addFakeUpstream(upstream_codec_type_); addFakeUpstream(upstream_codec_type_); cluster1_ = cluster_creator_( ClusterName1, fake_upstreams_[UpstreamIndex1]->localAddress()->ip()->port(), Network::Test::getLoopbackAddressString(ipVersion())); cluster2_ = ConfigHelper::buildStaticCluster( Network::Test::getLoopbackAddressString(ipVersion()), “ROUND_ROBIN”); cluster2_ = cluster_creator_( ClusterName2, fake_upstreams_[UpstreamIndex2]->localAddress()->ip()->port(), Network::Test::getLoopbackAddressString(ipVersion())); Network::Test::getLoopbackAddressString(ipVersion()), “ROUND_ROBIN”);
// Let Envoy establish its connection to the CDS server. acceptXdsConnection(); @@ -133,6 +134,10 @@ class CdsIntegrationTest : public Grpc::DeltaSotwIntegrationParamTest, public Ht envoy::config::cluster::v3::Cluster cluster2_; // True if we decided not to run the test after all. bool test_skipped_{true}; Http::CodecType upstream_codec_type_{Http::CodecType::HTTP2}; std::function<envoy::config::cluster::v3::Cluster(const std::string&, int, const std::string&, const std::string&)> cluster_creator_; };
INSTANTIATE_TEST_SUITE_P(IpVersionsClientTypeDelta, CdsIntegrationTest, @@ -343,5 +348,160 @@ TEST_P(CdsIntegrationTest, VersionsRememberedAfterReconnect) { ASSERT_TRUE(codec_client_->waitForDisconnect()); }
// This test verifies that Envoy can delete a cluster with a lot of idle connections. // The original problem was recursive closure of idle connections that can run out // of stack when there are a lot of idle connections. TEST_P(CdsIntegrationTest, CdsClusterDownWithLotsOfIdleConnections) { constexpr int num_requests = 2000; // Make upstream H/1 so it creates connection for each request upstream_codec_type_ = Http::CodecType::HTTP1; // Relax default circuit breaker limits and timeouts so Envoy can accumulate a lot of idle // connections cluster_creator_ = &ConfigHelper::buildH1ClusterWithHighCircuitBreakersLimits; config_helper_.addConfigModifier( [&](envoy::extensions::filters::network::http_connection_manager::v3::HttpConnectionManager& hcm) -> void { hcm.mutable_route_config() ->mutable_virtual_hosts(0) ->mutable_routes(0) ->mutable_route() ->mutable_timeout() ->set_seconds(600); hcm.mutable_route_config() ->mutable_virtual_hosts(0) ->mutable_routes(0) ->mutable_route() ->mutable_idle_timeout() ->set_seconds(600); }); initialize(); std::vector<IntegrationStreamDecoderPtr> responses; std::vector<FakeHttpConnectionPtr> upstream_connections; std::vector<FakeStreamPtr> upstream_requests; codec_client_ = makeHttpConnection(makeClientConnection((lookupPort(“http”)))); // The first loop establishes a lot of open connections with active requests to upstream for (int i = 0; i < num_requests; ++i) { Http::TestRequestHeaderMapImpl request_headers{{":method", "GET"}, {":path", "/cluster1"}, {":scheme", "http"}, {":authority", "host"}, {"x-lyft-user-id", absl::StrCat(i)}};
auto response = codec_client_->makeHeaderOnlyRequest(request_headers); responses.push_back(std::move(response));
FakeHttpConnectionPtr fake_upstream_connection; waitForNextUpstreamConnection({UpstreamIndex1}, TestUtility::DefaultTimeout, fake_upstream_connection); // Wait for the next stream on the upstream connection. FakeStreamPtr upstream_request; AssertionResult result = fake_upstream_connection->waitForNewStream(*dispatcher_, upstream_request); RELEASE_ASSERT(result, result.message()); // Wait for the stream to be completely received. result = upstream_request->waitForEndStream(*dispatcher_); RELEASE_ASSERT(result, result.message()); upstream_connections.push_back(std::move(fake_upstream_connection)); upstream_requests.push_back(std::move(upstream_request)); }
// This loop completes all requests making the all upstream connections idle for (int i = 0; i < num_requests; ++i) { // Send response headers, and end_stream if there is no response body. upstream_requests[i]->encodeHeaders(default_response_headers_, true); // Wait for the response to be read by the codec client. RELEASE_ASSERT(responses[i]->waitForEndStream(), “unexpected timeout”); ASSERT_TRUE(responses[i]->complete()); EXPECT_EQ("200", responses[i]->headers().getStatusValue()); }
test_server_->waitForCounterGe("cluster_manager.cluster_added", 1);
// Tell Envoy that cluster_1 is gone. Envoy will try to close all idle connections EXPECT_TRUE(compareDiscoveryRequest(Config::TypeUrl::get().Cluster, "55", {}, {}, {})); sendDiscoveryResponse<envoy::config::cluster::v3::Cluster>(Config::TypeUrl::get().Cluster, {}, {}, {ClusterName1}, “42”); // We can continue the test once we’re sure that Envoy’s ClusterManager has made use of // the DiscoveryResponse that says cluster_1 is gone. test_server_->waitForCounterGe("cluster_manager.cluster_removed", 1);
// If we made it this far then everything is ok. for (int i = 0; i < num_requests; ++i) { AssertionResult result = upstream_connections[i]->close(); RELEASE_ASSERT(result, result.message()); result = upstream_connections[i]->waitForDisconnect(); RELEASE_ASSERT(result, result.message()); } upstream_connections.clear(); cleanupUpstreamAndDownstream(); ASSERT_TRUE(codec_client_->waitForDisconnect()); }
// This test verifies that Envoy can delete a cluster with a lot of connections in the connecting // state and associated pending requests. The recursion guard in the // ConnPoolImplBase::closeIdleConnectionsForDrainingPool() would fire if it was called recursively. // // Test is currently disabled as there is presently no reliable way of making upstream connections // hang in connecting state. TEST_P(CdsIntegrationTest, DISABLED_CdsClusterDownWithLotsOfConnectingConnections) { // Use low number of pending connections to prevent bumping into the default // limit of 128, since the upstream will be prevented below from // accepting connections. constexpr int num_requests = 64; // Make upstream H/1 so it creates connection for each request upstream_codec_type_ = Http::CodecType::HTTP1; cluster_creator_ = &ConfigHelper::buildH1ClusterWithHighCircuitBreakersLimits; config_helper_.addConfigModifier( [&](envoy::extensions::filters::network::http_connection_manager::v3::HttpConnectionManager& hcm) -> void { hcm.mutable_route_config() ->mutable_virtual_hosts(0) ->mutable_routes(0) ->mutable_route() ->mutable_timeout() ->set_seconds(600); hcm.mutable_route_config() ->mutable_virtual_hosts(0) ->mutable_routes(0) ->mutable_route() ->mutable_idle_timeout() ->set_seconds(600); }); initialize(); test_server_->waitForCounterGe("cluster_manager.cluster_added", 1); std::vector<IntegrationStreamDecoderPtr> responses; codec_client_ = makeHttpConnection(makeClientConnection((lookupPort(“http”)))); // Stop upstream at UpstreamIndex1 dispatcher, to prevent it from accepting TCP connections. // This will cause Envoy’s connections to that upstream hang in the connecting state. fake_upstreams_[UpstreamIndex1]->dispatcher()->exit(); for (int i = 0; i < num_requests; ++i) { Http::TestRequestHeaderMapImpl request_headers{{":method", "GET"}, {":path", "/cluster1"}, {":scheme", "http"}, {":authority", "host"}, {"x-lyft-user-id", absl::StrCat(i)}};
auto response = codec_client_->makeHeaderOnlyRequest(request_headers); responses.push_back(std::move(response)); }
// Wait for Envoy to try to establish all expected connections test_server_->waitForCounterEq("cluster.cluster_1.upstream_cx_total", num_requests);
// Tell Envoy that cluster_1 is gone. Envoy will try to close all pending connections EXPECT_TRUE(compareDiscoveryRequest(Config::TypeUrl::get().Cluster, "55", {}, {}, {})); sendDiscoveryResponse<envoy::config::cluster::v3::Cluster>(Config::TypeUrl::get().Cluster, {}, {}, {ClusterName1}, “42”); // We can continue the test once we’re sure that Envoy’s ClusterManager has made use of // the DiscoveryResponse that says cluster_1 is gone. test_server_->waitForCounterGe("cluster_manager.cluster_removed", 1);
cleanupUpstreamAndDownstream(); ASSERT_TRUE(codec_client_->waitForDisconnect()); // If we got here it means that the recursion guard in the // ConnPoolImplBase::closeIdleConnectionsForDrainingPool() did not fire, which is what this test // validates. }
} // namespace } // namespace Envoy