diff --git a/internal/xds/bootstrap/bootstrap.go b/internal/xds/bootstrap/bootstrap.go index 142e803930e1..f409e4bd77b2 100644 --- a/internal/xds/bootstrap/bootstrap.go +++ b/internal/xds/bootstrap/bootstrap.go @@ -217,10 +217,16 @@ func (sc *ServerConfig) ServerFeaturesIgnoreResourceDeletion() bool { return false } +// SelectedCreds returns the selected credentials configuration for +// communicating with this server. +func (sc *ServerConfig) SelectedCreds() ChannelCreds { + return sc.selectedCreds +} + // DialOptions returns a slice of all the configured dial options for this -// server. +// server except grpc.WithCredentialsBundle(). func (sc *ServerConfig) DialOptions() []grpc.DialOption { - dopts := []grpc.DialOption{sc.credsDialOption} + var dopts []grpc.DialOption if sc.extraDialOptions != nil { dopts = append(dopts, sc.extraDialOptions...) } diff --git a/xds/internal/balancer/clusterimpl/balancer_test.go b/xds/internal/balancer/clusterimpl/balancer_test.go index 70c01d7b0b30..cca103afee98 100644 --- a/xds/internal/balancer/clusterimpl/balancer_test.go +++ b/xds/internal/balancer/clusterimpl/balancer_test.go @@ -27,8 +27,6 @@ import ( "testing" "time" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "google.golang.org/grpc/balancer" "google.golang.org/grpc/balancer/base" "google.golang.org/grpc/balancer/roundrobin" @@ -45,7 +43,6 @@ import ( xdsinternal "google.golang.org/grpc/xds/internal" "google.golang.org/grpc/xds/internal/testutils/fakeclient" "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/load" v3orcapb "github.com/cncf/xds/go/xds/data/orca/v3" ) @@ -63,11 +60,6 @@ const ( var ( testBackendEndpoints = []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "1.1.1.1:1"}}}} - cmpOpts = cmp.Options{ - cmpopts.EquateEmpty(), - cmpopts.IgnoreFields(load.Data{}, "ReportInterval"), - } - toleranceCmpOpt = cmpopts.EquateApprox(0, 1e-5) ) type s struct { @@ -178,25 +170,6 @@ func (s) TestDropByCategory(t *testing.T) { if loadStore == nil { t.Fatal("loadStore is nil in xdsClient") } - const dropCount = rpcCount * dropNumerator / dropDenominator - wantStatsData0 := []*load.Data{{ - Cluster: testClusterName, - Service: testServiceName, - TotalDrops: dropCount, - Drops: map[string]uint64{dropReason: dropCount}, - LocalityStats: map[string]load.LocalityData{ - xdsinternal.LocalityID{}.ToString(): {RequestStats: load.RequestData{ - Succeeded: (rpcCount - dropCount) * 3 / 4, - Errored: (rpcCount - dropCount) / 4, - Issued: rpcCount - dropCount, - }}, - }, - }} - - gotStatsData0 := loadStore.Stats([]string{testClusterName}) - if diff := cmp.Diff(gotStatsData0, wantStatsData0, cmpOpts); diff != "" { - t.Fatalf("got unexpected reports, diff (-got, +want): %v", diff) - } // Send an update with new drop configs. 
const ( @@ -243,25 +216,6 @@ func (s) TestDropByCategory(t *testing.T) { }); err != nil { t.Fatal(err.Error()) } - - const dropCount2 = rpcCount * dropNumerator2 / dropDenominator2 - wantStatsData1 := []*load.Data{{ - Cluster: testClusterName, - Service: testServiceName, - TotalDrops: dropCount2, - Drops: map[string]uint64{dropReason2: dropCount2}, - LocalityStats: map[string]load.LocalityData{ - xdsinternal.LocalityID{}.ToString(): {RequestStats: load.RequestData{ - Succeeded: rpcCount - dropCount2, - Issued: rpcCount - dropCount2, - }}, - }, - }} - - gotStatsData1 := loadStore.Stats([]string{testClusterName}) - if diff := cmp.Diff(gotStatsData1, wantStatsData1, cmpOpts); diff != "" { - t.Fatalf("got unexpected reports, diff (-got, +want): %v", diff) - } } // TestDropCircuitBreaking verifies that the balancer correctly drops the picks @@ -367,24 +321,6 @@ func (s) TestDropCircuitBreaking(t *testing.T) { if loadStore == nil { t.Fatal("loadStore is nil in xdsClient") } - - wantStatsData0 := []*load.Data{{ - Cluster: testClusterName, - Service: testServiceName, - TotalDrops: uint64(maxRequest), - LocalityStats: map[string]load.LocalityData{ - xdsinternal.LocalityID{}.ToString(): {RequestStats: load.RequestData{ - Succeeded: uint64(rpcCount - maxRequest), - Errored: 50, - Issued: uint64(rpcCount - maxRequest + 50), - }}, - }, - }} - - gotStatsData0 := loadStore.Stats([]string{testClusterName}) - if diff := cmp.Diff(gotStatsData0, wantStatsData0, cmpOpts); diff != "" { - t.Fatalf("got unexpected drop reports, diff (-got, +want): %v", diff) - } } // TestPickerUpdateAfterClose covers the case where a child policy sends a @@ -700,36 +636,6 @@ func (s) TestLoadReporting(t *testing.T) { if loadStore == nil { t.Fatal("loadStore is nil in xdsClient") } - sds := loadStore.Stats([]string{testClusterName}) - if len(sds) == 0 { - t.Fatalf("loads for cluster %v not found in store", testClusterName) - } - sd := sds[0] - if sd.Cluster != testClusterName || sd.Service != testServiceName { - t.Fatalf("got unexpected load for %q, %q, want %q, %q", sd.Cluster, sd.Service, testClusterName, testServiceName) - } - testLocalityStr := testLocality.ToString() - localityData, ok := sd.LocalityStats[testLocalityStr] - if !ok { - t.Fatalf("loads for %v not found in store", testLocality) - } - reqStats := localityData.RequestStats - if reqStats.Succeeded != successCount { - t.Errorf("got succeeded %v, want %v", reqStats.Succeeded, successCount) - } - if reqStats.Errored != errorCount { - t.Errorf("got errord %v, want %v", reqStats.Errored, errorCount) - } - if reqStats.InProgress != 0 { - t.Errorf("got inProgress %v, want %v", reqStats.InProgress, 0) - } - wantLoadStats := map[string]load.ServerLoadData{ - testNamedMetricsKey1: {Count: 5, Sum: 15.7}, // aggregation of 5 * 3.14 = 15.7 - testNamedMetricsKey2: {Count: 5, Sum: 13.59}, // aggregation of 5 * 2.718 = 13.59 - } - if diff := cmp.Diff(wantLoadStats, localityData.LoadStats, toleranceCmpOpt); diff != "" { - t.Errorf("localityData.LoadStats returned unexpected diff (-want +got):\n%s", diff) - } b.Close() if err := xdsC.WaitForCancelReportLoad(ctx); err != nil { t.Fatalf("unexpected error waiting form load report to be canceled: %v", err) diff --git a/xds/internal/balancer/clusterimpl/clusterimpl.go b/xds/internal/balancer/clusterimpl/clusterimpl.go index 71a4c9c9da73..9196f627a052 100644 --- a/xds/internal/balancer/clusterimpl/clusterimpl.go +++ b/xds/internal/balancer/clusterimpl/clusterimpl.go @@ -24,10 +24,12 @@ package clusterimpl import ( + "context" 
"encoding/json" "fmt" "sync" "sync/atomic" + "time" "google.golang.org/grpc/balancer" "google.golang.org/grpc/connectivity" @@ -41,14 +43,15 @@ import ( "google.golang.org/grpc/serviceconfig" xdsinternal "google.golang.org/grpc/xds/internal" "google.golang.org/grpc/xds/internal/balancer/loadstore" + "google.golang.org/grpc/xds/internal/clients/lrsclient" "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/load" ) const ( // Name is the name of the cluster_impl balancer. Name = "xds_cluster_impl_experimental" defaultRequestCountMax = 1024 + loadStoreStopTimeout = 1 * time.Second ) var ( @@ -96,7 +99,7 @@ type clusterImplBalancer struct { // The following fields are only accessed from balancer API methods, which // are guaranteed to be called serially by gRPC. xdsClient xdsclient.XDSClient // Sent down in ResolverState attributes. - cancelLoadReport func() // To stop reporting load through the above xDS client. + cancelLoadReport func(context.Context) // To stop reporting load through the above xDS client. edsServiceName string // EDS service name to report load for. lrsServer *bootstrap.ServerConfig // Load reporting server configuration. dropCategories []DropConfig // The categories for drops. @@ -218,7 +221,9 @@ func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error { if stopOldLoadReport { if b.cancelLoadReport != nil { - b.cancelLoadReport() + stopCtx, stopCancel := context.WithTimeout(context.Background(), loadStoreStopTimeout) + defer stopCancel() + b.cancelLoadReport(stopCtx) b.cancelLoadReport = nil if !startNewLoadReport { // If a new LRS stream will be started later, no need to update @@ -228,7 +233,7 @@ func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error { } } if startNewLoadReport { - var loadStore *load.Store + var loadStore *lrsclient.LoadStore if b.xdsClient != nil { loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer) } @@ -344,7 +349,9 @@ func (b *clusterImplBalancer) Close() { b.childState = balancer.State{} if b.cancelLoadReport != nil { - b.cancelLoadReport() + stopCtx, stopCancel := context.WithTimeout(context.Background(), loadStoreStopTimeout) + defer stopCancel() + b.cancelLoadReport(stopCtx) b.cancelLoadReport = nil } b.logger.Infof("Shutdown") diff --git a/xds/internal/balancer/clusterimpl/picker.go b/xds/internal/balancer/clusterimpl/picker.go index 018122f2c68c..0341f60d0b6b 100644 --- a/xds/internal/balancer/clusterimpl/picker.go +++ b/xds/internal/balancer/clusterimpl/picker.go @@ -28,6 +28,7 @@ import ( "google.golang.org/grpc/internal/stats" "google.golang.org/grpc/internal/wrr" "google.golang.org/grpc/status" + "google.golang.org/grpc/xds/internal" "google.golang.org/grpc/xds/internal/xdsclient" ) @@ -71,10 +72,10 @@ func (d *dropper) drop() (ret bool) { // loadReporter wraps the methods from the loadStore that are used here. type loadReporter interface { - CallStarted(locality string) - CallFinished(locality string, err error) - CallServerLoad(locality, name string, val float64) - CallDropped(locality string) + CallStarted(locality internal.LocalityID) + CallFinished(locality internal.LocalityID, err error) + CallServerLoad(locality internal.LocalityID, name string, val float64) + CallDropped(category string) } // Picker implements RPC drop, circuit breaking drop and load reporting. 
@@ -133,7 +134,7 @@ func (d *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { } } - var lIDStr string + var lID internal.LocalityID pr, err := d.s.Picker.Pick(info) if scw, ok := pr.SubConn.(*scWrapper); ok { // This OK check also covers the case err!=nil, because SubConn will be @@ -141,7 +142,7 @@ func (d *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { pr.SubConn = scw.SubConn // If locality ID isn't found in the wrapper, an empty locality ID will // be used. - lIDStr = scw.localityID().ToString() + lID = scw.localityID() } if err != nil { @@ -153,24 +154,24 @@ func (d *picker) Pick(info balancer.PickInfo) (balancer.PickResult, error) { } if labels := telemetryLabels(info.Ctx); labels != nil { - labels["grpc.lb.locality"] = lIDStr + labels["grpc.lb.locality"] = lID.ToString() } if d.loadStore != nil { - d.loadStore.CallStarted(lIDStr) + d.loadStore.CallStarted(lID) oldDone := pr.Done pr.Done = func(info balancer.DoneInfo) { if oldDone != nil { oldDone(info) } - d.loadStore.CallFinished(lIDStr, info.Err) + d.loadStore.CallFinished(lID, info.Err) load, ok := info.ServerLoad.(*v3orcapb.OrcaLoadReport) if !ok || load == nil { return } for n, c := range load.NamedMetrics { - d.loadStore.CallServerLoad(lIDStr, n, c) + d.loadStore.CallServerLoad(lID, n, c) } } } diff --git a/xds/internal/balancer/loadstore/load_store_wrapper.go b/xds/internal/balancer/loadstore/load_store_wrapper.go index f5605df83276..86b79bdf6273 100644 --- a/xds/internal/balancer/loadstore/load_store_wrapper.go +++ b/xds/internal/balancer/loadstore/load_store_wrapper.go @@ -22,7 +22,9 @@ package loadstore import ( "sync" - "google.golang.org/grpc/xds/internal/xdsclient/load" + "google.golang.org/grpc/xds/internal" + "google.golang.org/grpc/xds/internal/clients" + "google.golang.org/grpc/xds/internal/clients/lrsclient" ) // NewWrapper creates a Wrapper. @@ -53,8 +55,8 @@ type Wrapper struct { // store and perCluster are initialized as nil. They are only set by the // balancer when LRS is enabled. Before that, all functions to record loads // are no-op. - store *load.Store - perCluster load.PerClusterReporter + store *lrsclient.LoadStore + perCluster *lrsclient.PerClusterReporter } // UpdateClusterAndService updates the cluster name and eds service for this @@ -68,45 +70,52 @@ func (lsw *Wrapper) UpdateClusterAndService(cluster, edsService string) { } lsw.cluster = cluster lsw.edsService = edsService - lsw.perCluster = lsw.store.PerCluster(lsw.cluster, lsw.edsService) + if lsw.store == nil { + return + } + lsw.perCluster = lsw.store.ReporterForCluster(lsw.cluster, lsw.edsService) } // UpdateLoadStore updates the load store for this wrapper. If it is changed // from before, the perCluster store in this wrapper will also be updated. -func (lsw *Wrapper) UpdateLoadStore(store *load.Store) { +func (lsw *Wrapper) UpdateLoadStore(store *lrsclient.LoadStore) { lsw.mu.Lock() defer lsw.mu.Unlock() if store == lsw.store { return } lsw.store = store - lsw.perCluster = lsw.store.PerCluster(lsw.cluster, lsw.edsService) + if lsw.store == nil { + lsw.perCluster = nil + return + } + lsw.perCluster = lsw.store.ReporterForCluster(lsw.cluster, lsw.edsService) } // CallStarted records a call started in the store. 
-func (lsw *Wrapper) CallStarted(locality string) { +func (lsw *Wrapper) CallStarted(locality internal.LocalityID) { lsw.mu.RLock() defer lsw.mu.RUnlock() if lsw.perCluster != nil { - lsw.perCluster.CallStarted(locality) + lsw.perCluster.CallStarted(clients.Locality{Region: locality.Region, Zone: locality.Zone, SubZone: locality.SubZone}) } } // CallFinished records a call finished in the store. -func (lsw *Wrapper) CallFinished(locality string, err error) { +func (lsw *Wrapper) CallFinished(locality internal.LocalityID, err error) { lsw.mu.RLock() defer lsw.mu.RUnlock() if lsw.perCluster != nil { - lsw.perCluster.CallFinished(locality, err) + lsw.perCluster.CallFinished(clients.Locality{Region: locality.Region, Zone: locality.Zone, SubZone: locality.SubZone}, err) } } // CallServerLoad records the server load in the store. -func (lsw *Wrapper) CallServerLoad(locality, name string, val float64) { +func (lsw *Wrapper) CallServerLoad(locality internal.LocalityID, name string, val float64) { lsw.mu.RLock() defer lsw.mu.RUnlock() if lsw.perCluster != nil { - lsw.perCluster.CallServerLoad(locality, name, val) + lsw.perCluster.CallServerLoad(clients.Locality{Region: locality.Region, Zone: locality.Zone, SubZone: locality.SubZone}, name, val) } } diff --git a/xds/internal/clients/lrsclient/load_store.go b/xds/internal/clients/lrsclient/load_store.go index fd363ad62145..3934813862a8 100644 --- a/xds/internal/clients/lrsclient/load_store.go +++ b/xds/internal/clients/lrsclient/load_store.go @@ -23,6 +23,8 @@ import ( "sync" "sync/atomic" "time" + + "google.golang.org/grpc/xds/internal/clients" ) // A LoadStore aggregates loads for multiple clusters and services that are @@ -135,14 +137,14 @@ func (ls *LoadStore) stats(clusterNames []string) []*loadData { type PerClusterReporter struct { cluster, service string drops sync.Map // map[string]*uint64 - localityRPCCount sync.Map // map[string]*rpcCountData + localityRPCCount sync.Map // map[clients.Locality]*rpcCountData mu sync.Mutex lastLoadReportAt time.Time } // CallStarted records a call started in the LoadStore. -func (p *PerClusterReporter) CallStarted(locality string) { +func (p *PerClusterReporter) CallStarted(locality clients.Locality) { s, ok := p.localityRPCCount.Load(locality) if !ok { tp := newRPCCountData() @@ -153,7 +155,7 @@ func (p *PerClusterReporter) CallStarted(locality string) { } // CallFinished records a call finished in the LoadStore. -func (p *PerClusterReporter) CallFinished(locality string, err error) { +func (p *PerClusterReporter) CallFinished(locality clients.Locality, err error) { f, ok := p.localityRPCCount.Load(locality) if !ok { // The map is never cleared, only values in the map are reset. So the @@ -169,7 +171,7 @@ func (p *PerClusterReporter) CallFinished(locality string, err error) { } // CallServerLoad records the server load in the LoadStore. -func (p *PerClusterReporter) CallServerLoad(locality, name string, val float64) { +func (p *PerClusterReporter) CallServerLoad(locality clients.Locality, name string, val float64) { s, ok := p.localityRPCCount.Load(locality) if !ok { // The map is never cleared, only values in the map are reset. So the @@ -181,7 +183,8 @@ func (p *PerClusterReporter) CallServerLoad(locality, name string, val float64) // CallDropped records a call dropped in the LoadStore. 
func (p *PerClusterReporter) CallDropped(category string) { + // Drops are keyed by the drop category string; only per-locality RPC stats use clients.Locality keys. d, ok := p.drops.Load(category) if !ok { tp := new(uint64) d, _ = p.drops.LoadOrStore(category, tp) @@ -239,7 +242,7 @@ } return true }) - sd.localityStats[key.(string)] = ld + sd.localityStats[key.(clients.Locality)] = ld return true }) @@ -266,7 +269,7 @@ type loadData struct { // drops is the number of dropped requests per category. drops map[string]uint64 // localityStats contains load reports per locality. - localityStats map[string]localityData + localityStats map[clients.Locality]localityData // reportInterval is the duration since last time load was reported (stats() // was called). reportInterval time.Duration @@ -322,7 +325,7 @@ func newLoadData(cluster, service string) *loadData { cluster: cluster, service: service, drops: make(map[string]uint64), - localityStats: make(map[string]localityData), + localityStats: make(map[clients.Locality]localityData), } } diff --git a/xds/internal/clients/lrsclient/load_store_test.go b/xds/internal/clients/lrsclient/load_store_test.go index a21ac71defca..955025baec93 100644 --- a/xds/internal/clients/lrsclient/load_store_test.go +++ b/xds/internal/clients/lrsclient/load_store_test.go @@ -25,11 +25,12 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + "google.golang.org/grpc/xds/internal/clients" ) var ( dropCategories = []string{"drop_for_real", "drop_for_fun"} - localities = []string{"locality-A", "locality-B"} + localities = []clients.Locality{{Region: "locality-A"}, {Region: "locality-B"}} errTest = fmt.Errorf("test error") ) @@ -89,7 +90,7 @@ func TestDrops(t *testing.T) { // Store and makes sure they are as expected.
func TestLocalityStats(t *testing.T) { var ( - ld = map[string]rpcData{ + ld = map[clients.Locality]rpcData{ localities[0]: { start: 40, success: 20, @@ -104,7 +105,7 @@ func TestLocalityStats(t *testing.T) { }, } wantStoreData = &loadData{ - localityStats: map[string]localityData{ + localityStats: map[clients.Locality]localityData{ localities[0]: { requestStats: requestData{ succeeded: 20, @@ -142,7 +143,7 @@ func TestLocalityStats(t *testing.T) { for locality, data := range ld { wg.Add(data.start) for i := 0; i < data.start; i++ { - go func(l string) { + go func(l clients.Locality) { ls.CallStarted(l) wg.Done() }(locality) @@ -153,7 +154,7 @@ func TestLocalityStats(t *testing.T) { wg.Add(data.success) for i := 0; i < data.success; i++ { - go func(l string, serverData map[string]float64) { + go func(l clients.Locality, serverData map[string]float64) { ls.CallFinished(l, nil) for n, d := range serverData { ls.CallServerLoad(l, n, d) @@ -163,7 +164,7 @@ func TestLocalityStats(t *testing.T) { } wg.Add(data.failure) for i := 0; i < data.failure; i++ { - go func(l string) { + go func(l clients.Locality) { ls.CallFinished(l, errTest) wg.Done() }(locality) @@ -187,7 +188,7 @@ func TestResetAfterStats(t *testing.T) { dropCategories[0]: 30, dropCategories[1]: 40, } - ld = map[string]rpcData{ + ld = map[clients.Locality]rpcData{ localities[0]: { start: 40, success: 20, @@ -207,7 +208,7 @@ func TestResetAfterStats(t *testing.T) { dropCategories[0]: 30, dropCategories[1]: 40, }, - localityStats: map[string]localityData{ + localityStats: map[clients.Locality]localityData{ localities[0]: { requestStats: requestData{ succeeded: 20, @@ -308,7 +309,7 @@ func TestStoreStats(t *testing.T) { var ( testClusters = []string{"c0", "c1", "c2"} testServices = []string{"s0", "s1"} - testLocality = "test-locality" + testLocality = clients.Locality{Region: "test-locality"} ) store := newLoadStore() @@ -325,8 +326,8 @@ func TestStoreStats(t *testing.T) { { cluster: "c0", service: "s0", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -335,8 +336,8 @@ func TestStoreStats(t *testing.T) { { cluster: "c0", service: "s1", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -352,8 +353,8 @@ func TestStoreStats(t *testing.T) { { cluster: "c1", service: "s0", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -362,8 +363,8 @@ func TestStoreStats(t *testing.T) { { cluster: "c1", service: "s1", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -372,8 +373,8 @@ func TestStoreStats(t *testing.T) { { 
cluster: "c2", service: "s0", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -382,8 +383,8 @@ func TestStoreStats(t *testing.T) { { cluster: "c2", service: "s1", totalDrops: 1, drops: map[string]uint64{"dropped": 1}, - localityStats: map[string]localityData{ - "test-locality": { + localityStats: map[clients.Locality]localityData{ + testLocality: { requestStats: requestData{succeeded: 1, issued: 1}, loadStats: map[string]serverLoadData{"abc": {count: 1, sum: 123}}, }, @@ -403,7 +404,7 @@ func TestStoreStats(t *testing.T) { func TestStoreStatsEmptyDataNotReported(t *testing.T) { var ( testServices = []string{"s0", "s1"} - testLocality = "test-locality" + testLocality = clients.Locality{Region: "test-locality"} ) store := newLoadStore() @@ -420,26 +421,26 @@ func TestStoreStatsEmptyDataNotReported(t *testing.T) { want0 := []*loadData{ { cluster: "c0", service: "s0", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{succeeded: 1, issued: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{succeeded: 1, issued: 1}}, }, }, { cluster: "c0", service: "s1", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{succeeded: 1, issued: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{succeeded: 1, issued: 1}}, }, }, { cluster: "c1", service: "s0", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{inProgress: 1, issued: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{inProgress: 1, issued: 1}}, }, }, { cluster: "c1", service: "s1", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{inProgress: 1, issued: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{inProgress: 1, issued: 1}}, }, }, } @@ -453,14 +454,14 @@ func TestStoreStatsEmptyDataNotReported(t *testing.T) { want1 := []*loadData{ { cluster: "c1", service: "s0", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{inProgress: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{inProgress: 1}}, }, }, { cluster: "c1", service: "s1", - localityStats: map[string]localityData{ - "test-locality": {requestStats: requestData{inProgress: 1}}, + localityStats: map[clients.Locality]localityData{ + testLocality: {requestStats: requestData{inProgress: 1}}, }, }, } diff --git a/xds/internal/clients/lrsclient/loadreport_test.go b/xds/internal/clients/lrsclient/loadreport_test.go index bcdec9c63492..916e84d661e5 100644 --- a/xds/internal/clients/lrsclient/loadreport_test.go +++ b/xds/internal/clients/lrsclient/loadreport_test.go @@ -54,8 +54,6 @@ func Test(t *testing.T) { } const ( - testLocality1 = `{"region":"test-region1"}` - testLocality2 = `{"region":"test-region2"}` testKey1 = "test-key1" testKey2 = "test-key2" defaultTestWatchExpiryTimeout = 100 * time.Millisecond @@ -64,6 +62,8 @@ const ( ) var ( + testLocality1 = clients.Locality{Region: "test-region1"} + testLocality2 = clients.Locality{Region: "test-region2"} toleranceCmpOpt = cmpopts.EquateApprox(0, 1e-5) 
ignoreOrderCmpOpt = protocmp.FilterField(&v3endpointpb.ClusterStats{}, "upstream_locality_stats", cmpopts.SortSlices(func(a, b protocmp.Message) bool { diff --git a/xds/internal/clients/lrsclient/lrs_stream.go b/xds/internal/clients/lrsclient/lrs_stream.go index df761d408e76..bb275bdb468e 100644 --- a/xds/internal/clients/lrsclient/lrs_stream.go +++ b/xds/internal/clients/lrsclient/lrs_stream.go @@ -19,7 +19,6 @@ package lrsclient import ( "context" - "encoding/json" "fmt" "io" "time" @@ -243,11 +242,7 @@ func (lrs *streamImpl) sendLoadStatsRequest(stream clients.Stream, loads []*load }) } localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.localityStats)) - for l, localityData := range sd.localityStats { - lid, err := localityFromString(l) - if err != nil { - return err - } + for lid, localityData := range sd.localityStats { loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.loadStats)) for name, loadData := range localityData.loadStats { loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{ @@ -306,13 +301,3 @@ func getStreamError(stream clients.Stream) error { } } } - -// localityFromString converts a json representation of locality, into a -// clients.Locality struct. -func localityFromString(s string) (ret clients.Locality, _ error) { - err := json.Unmarshal([]byte(s), &ret) - if err != nil { - return clients.Locality{}, fmt.Errorf("%s is not a well formatted locality, error: %v", s, err) - } - return ret, nil -} diff --git a/xds/internal/clients/xdsclient/channel.go b/xds/internal/clients/xdsclient/channel.go index 6faf16881599..97438353507c 100644 --- a/xds/internal/clients/xdsclient/channel.go +++ b/xds/internal/clients/xdsclient/channel.go @@ -253,6 +253,15 @@ func decodeResponse(opts *DecodeOptions, rType *ResourceType, resp response) (ma perResourceErrors := make(map[string]error) // Tracks resource validation errors, where we have a resource name. ret := make(map[string]dataAndErrTuple) // Return result, a map from resource name to either resource data or error. for _, r := range resp.resources { + r, err := xdsresource.UnwrapResource(r) + if err != nil { + topLevelErrors = append(topLevelErrors, err) + continue + } + if _, ok := opts.Config.ResourceTypes[r.TypeUrl]; !ok || r.TypeUrl != resp.typeURL { + topLevelErrors = append(topLevelErrors, xdsresource.NewErrorf(xdsresource.ErrorTypeResourceTypeUnsupported, "unexpected resource type: %q ", r.GetTypeUrl())) + continue + } result, err := rType.Decoder.Decode(r.GetValue(), *opts) // Name field of the result is left unpopulated only when resource diff --git a/xds/internal/clients/xdsclient/internal/xdsresource/type.go b/xds/internal/clients/xdsclient/internal/xdsresource/type.go index 647c36f06abb..ea4d85447d1a 100644 --- a/xds/internal/clients/xdsclient/internal/xdsresource/type.go +++ b/xds/internal/clients/xdsclient/internal/xdsresource/type.go @@ -20,7 +20,10 @@ package xdsresource import ( "time" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" + + v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" ) // UpdateMetadata contains the metadata for each update, including timestamp, @@ -51,6 +54,21 @@ func IsHTTPConnManagerResource(url string) bool { return url == V3HTTPConnManagerURL } +// UnwrapResource unwraps and returns the inner resource if it's in a resource +// wrapper. The original resource is returned if it's not wrapped. 
+func UnwrapResource(r *anypb.Any) (*anypb.Any, error) { + url := r.GetTypeUrl() + if url != V3ResourceWrapperURL { + // Not wrapped. + return r, nil + } + inner := &v3discoverypb.Resource{} + if err := proto.Unmarshal(r.GetValue(), inner); err != nil { + return nil, err + } + return inner.Resource, nil +} + // ServiceStatus is the status of the update. type ServiceStatus int @@ -81,13 +99,3 @@ type UpdateErrorMetadata struct { // Timestamp is when the NACKed response was received. Timestamp time.Time } - -// UpdateWithMD contains the raw message of the update and the metadata, -// including version, raw message, timestamp. -// -// This is to be used for config dump and CSDS, not directly by users (like -// resolvers/balancers). -type UpdateWithMD struct { - MD UpdateMetadata - Raw *anypb.Any -} diff --git a/xds/internal/clients/xdsclient/internal/xdsresource/version.go b/xds/internal/clients/xdsclient/internal/xdsresource/version.go index 68e67d7f6dc9..60f47e69428b 100644 --- a/xds/internal/clients/xdsclient/internal/xdsresource/version.go +++ b/xds/internal/clients/xdsclient/internal/xdsresource/version.go @@ -27,4 +27,5 @@ const ( V3ListenerURL = googleapiPrefix + "envoy.config.listener.v3.Listener" V3HTTPConnManagerURL = googleapiPrefix + "envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager" + V3ResourceWrapperURL = googleapiPrefix + "envoy.service.discovery.v3.Resource" ) diff --git a/xds/internal/testutils/fakeclient/client.go b/xds/internal/testutils/fakeclient/client.go index 806a207fabe5..5c30a148cb5c 100644 --- a/xds/internal/testutils/fakeclient/client.go +++ b/xds/internal/testutils/fakeclient/client.go @@ -24,8 +24,9 @@ import ( "google.golang.org/grpc/internal/testutils" "google.golang.org/grpc/internal/xds/bootstrap" + "google.golang.org/grpc/xds/internal/clients" + "google.golang.org/grpc/xds/internal/clients/lrsclient" "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/load" ) // Client is a fake implementation of an xds client. It exposes a bunch of @@ -39,7 +40,7 @@ type Client struct { name string loadReportCh *testutils.Channel lrsCancelCh *testutils.Channel - loadStore *load.Store + loadStore *lrsclient.LoadStore bootstrapCfg *bootstrap.Config } @@ -49,10 +50,45 @@ type ReportLoadArgs struct { Server *bootstrap.ServerConfig } +type transportBuilder struct { +} + +func (*transportBuilder) Build(clients.ServerIdentifier) (clients.Transport, error) { + return &transport{}, nil +} + +type transport struct { +} + +func (*transport) NewStream(context.Context, string) (clients.Stream, error) { + return &stream{}, nil +} + +func (*transport) Close() { +} + +type stream struct { + clients.Stream +} + +func (*stream) Send([]byte) error { + return nil +} + +func (*stream) Recv() ([]byte, error) { + return nil, nil + +} + // ReportLoad starts reporting load about clusterName to server. 
-func (xdsC *Client) ReportLoad(server *bootstrap.ServerConfig) (loadStore *load.Store, cancel func()) { +func (xdsC *Client) ReportLoad(server *bootstrap.ServerConfig) (loadStore *lrsclient.LoadStore, cancel func(context.Context)) { + lrsClient, _ := lrsclient.New(lrsclient.Config{Node: clients.Node{ID: "fake-node-id"}, TransportBuilder: &transportBuilder{}}) + xdsC.loadStore, _ = lrsClient.ReportLoad(clients.ServerIdentifier{ServerURI: server.ServerURI()}) + xdsC.loadReportCh.Send(ReportLoadArgs{Server: server}) - return xdsC.loadStore, func() { + + return xdsC.loadStore, func(ctx context.Context) { + xdsC.loadStore.Stop(ctx) xdsC.lrsCancelCh.Send(nil) } } @@ -65,7 +101,7 @@ func (xdsC *Client) WaitForCancelReportLoad(ctx context.Context) error { } // LoadStore returns the underlying load data store. -func (xdsC *Client) LoadStore() *load.Store { +func (xdsC *Client) LoadStore() *lrsclient.LoadStore { return xdsC.loadStore } @@ -107,7 +143,6 @@ func NewClientWithName(name string) *Client { name: name, loadReportCh: testutils.NewChannel(), lrsCancelCh: testutils.NewChannel(), - loadStore: load.NewStore(), bootstrapCfg: &bootstrap.Config{}, } } diff --git a/xds/internal/xdsclient/authority.go b/xds/internal/xdsclient/authority.go deleted file mode 100644 index ec3a7352f9b9..000000000000 --- a/xds/internal/xdsclient/authority.go +++ /dev/null @@ -1,884 +0,0 @@ -/* - * - * Copyright 2021 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xdsclient - -import ( - "context" - "fmt" - "sync" - "sync/atomic" - - "google.golang.org/grpc/experimental/stats" - "google.golang.org/grpc/grpclog" - igrpclog "google.golang.org/grpc/internal/grpclog" - "google.golang.org/grpc/internal/grpcsync" - "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient/transport/ads" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/protobuf/types/known/anypb" - "google.golang.org/protobuf/types/known/timestamppb" - - v3adminpb "github.com/envoyproxy/go-control-plane/envoy/admin/v3" - v3statuspb "github.com/envoyproxy/go-control-plane/envoy/service/status/v3" -) - -type resourceState struct { - watchers map[xdsresource.ResourceWatcher]bool // Set of watchers for this resource. - cache xdsresource.ResourceData // Most recent ACKed update for this resource. - md xdsresource.UpdateMetadata // Metadata for the most recent update. - deletionIgnored bool // True, if resource deletion was ignored for a prior update. - xdsChannelConfigs map[*xdsChannelWithConfig]bool // Set of xdsChannels where this resource is subscribed. -} - -// xdsChannelForADS is used to acquire a reference to an xdsChannel. This -// functionality is provided by the xdsClient. 
-// -// The arguments to the function are as follows: -// - the server config for the xdsChannel -// - the calling authority on which a set of callbacks are invoked by the -// xdsChannel on ADS stream events -// -// Returns a reference to the xdsChannel and a function to release the same. A -// non-nil error is returned if the channel creation fails and the first two -// return values are meaningless in this case. -type xdsChannelForADS func(*bootstrap.ServerConfig, *authority) (*xdsChannel, func(), error) - -// xdsChannelWithConfig is a struct that holds an xdsChannel and its associated -// ServerConfig, along with a cleanup function to release the xdsChannel. -type xdsChannelWithConfig struct { - channel *xdsChannel - serverConfig *bootstrap.ServerConfig - cleanup func() -} - -// authority provides the functionality required to communicate with a -// management server corresponding to an authority name specified in the -// bootstrap configuration. -// -// It holds references to one or more xdsChannels, one for each server -// configuration in the bootstrap, to allow fallback from a primary management -// server to a secondary management server. Authorities that contain similar -// server configuration entries will end up sharing the xdsChannel for that -// server configuration. The xdsChannels are owned and managed by the xdsClient. -// -// It also contains a cache of resource state for resources requested from -// management server(s). This cache contains the list of registered watchers and -// the most recent resource configuration received from the management server. -type authority struct { - // The following fields are initialized at creation time and are read-only - // afterwards, and therefore don't need to be protected with a mutex. - name string // Name of the authority from bootstrap configuration. - watcherCallbackSerializer *grpcsync.CallbackSerializer // Serializer to run watcher callbacks, owned by the xDS client implementation. - getChannelForADS xdsChannelForADS // Function to get an xdsChannel for ADS, provided by the xDS client implementation. - xdsClientSerializer *grpcsync.CallbackSerializer // Serializer to run call ins from the xDS client, owned by this authority. - xdsClientSerializerClose func() // Function to close the above serializer. - logger *igrpclog.PrefixLogger // Logger for this authority. - target string // The gRPC Channel target. - metricsRecorder stats.MetricsRecorder // The metrics recorder used for emitting metrics. - - // The below defined fields must only be accessed in the context of the - // serializer callback, owned by this authority. - - // A two level map containing the state of all the resources being watched. - // - // The first level map key is the ResourceType (Listener, Route etc). This - // allows us to have a single map for all resources instead of having per - // resource-type maps. - // - // The second level map key is the resource name, with the value being the - // actual state of the resource. - resources map[xdsresource.Type]map[string]*resourceState - - // An ordered list of xdsChannels corresponding to the list of server - // configurations specified for this authority in the bootstrap. The - // ordering specifies the order in which these channels are preferred for - // fallback. - xdsChannelConfigs []*xdsChannelWithConfig - - // The current active xdsChannel. Here, active does not mean that the - // channel has a working connection to the server. 
It simply points to the - // channel that we are trying to work with, based on fallback logic. - activeXDSChannel *xdsChannelWithConfig -} - -// authorityBuildOptions wraps arguments required to create a new authority. -type authorityBuildOptions struct { - serverConfigs bootstrap.ServerConfigs // Server configs for the authority - name string // Name of the authority - serializer *grpcsync.CallbackSerializer // Callback serializer for invoking watch callbacks - getChannelForADS xdsChannelForADS // Function to acquire a reference to an xdsChannel - logPrefix string // Prefix for logging - target string // Target for the gRPC Channel that owns xDS Client/Authority - metricsRecorder stats.MetricsRecorder // metricsRecorder to emit metrics -} - -// newAuthority creates a new authority instance with the provided -// configuration. The authority is responsible for managing the state of -// resources requested from the management server, as well as acquiring and -// releasing references to channels used to communicate with the management -// server. -// -// Note that no channels to management servers are created at this time. Instead -// a channel to the first server configuration is created when the first watch -// is registered, and more channels are created as needed by the fallback logic. -func newAuthority(args authorityBuildOptions) *authority { - ctx, cancel := context.WithCancel(context.Background()) - l := grpclog.Component("xds") - logPrefix := args.logPrefix + fmt.Sprintf("[authority %q] ", args.name) - ret := &authority{ - name: args.name, - watcherCallbackSerializer: args.serializer, - getChannelForADS: args.getChannelForADS, - xdsClientSerializer: grpcsync.NewCallbackSerializer(ctx), - xdsClientSerializerClose: cancel, - logger: igrpclog.NewPrefixLogger(l, logPrefix), - resources: make(map[xdsresource.Type]map[string]*resourceState), - target: args.target, - metricsRecorder: args.metricsRecorder, - } - - // Create an ordered list of xdsChannels with their server configs. The - // actual channel to the first server configuration is created when the - // first watch is registered, and channels to other server configurations - // are created as needed to support fallback. - for _, sc := range args.serverConfigs { - ret.xdsChannelConfigs = append(ret.xdsChannelConfigs, &xdsChannelWithConfig{serverConfig: sc}) - } - return ret -} - -// adsStreamFailure is called to notify the authority about an ADS stream -// failure on an xdsChannel to the management server identified by the provided -// server config. The error is forwarded to all the resource watchers. -// -// This method is called by the xDS client implementation (on all interested -// authorities) when a stream error is reported by an xdsChannel. -// -// Errors of type xdsresource.ErrTypeStreamFailedAfterRecv are ignored. -func (a *authority) adsStreamFailure(serverConfig *bootstrap.ServerConfig, err error) { - a.xdsClientSerializer.TrySchedule(func(context.Context) { - a.handleADSStreamFailure(serverConfig, err) - }) -} - -// Handles ADS stream failure by invoking watch callbacks and triggering -// fallback if the associated conditions are met. -// -// Only executed in the context of a serializer callback. -func (a *authority) handleADSStreamFailure(serverConfig *bootstrap.ServerConfig, err error) { - if a.logger.V(2) { - a.logger.Infof("Connection to server %s failed with error: %v", serverConfig, err) - } - - // We do not consider it an error if the ADS stream was closed after having - // received a response on the stream. 
This is because there are legitimate - // reasons why the server may need to close the stream during normal - // operations, such as needing to rebalance load or the underlying - // connection hitting its max connection age limit. See gRFC A57 for more - // details. - if xdsresource.ErrType(err) == xdsresource.ErrTypeStreamFailedAfterRecv { - a.logger.Warningf("Watchers not notified since ADS stream failed after having received at least one response: %v", err) - return - } - - // Two conditions need to be met for fallback to be triggered: - // 1. There is a connectivity failure on the ADS stream, as described in - // gRFC A57. For us, this means that the ADS stream was closed before the - // first server response was received. We already checked that condition - // earlier in this method. - // 2. There is at least one watcher for a resource that is not cached. - // Cached resources include ones that - // - have been successfully received and can be used. - // - are considered non-existent according to xDS Protocol Specification. - if !a.watcherExistsForUncachedResource() { - if a.logger.V(2) { - a.logger.Infof("No watchers for uncached resources. Not triggering fallback") - } - // Since we are not triggering fallback, propagate the connectivity - // error to all watchers and return early. - a.propagateConnectivityErrorToAllWatchers(err) - return - } - - // Attempt to fallback to servers with lower priority than the failing one. - currentServerIdx := a.serverIndexForConfig(serverConfig) - for i := currentServerIdx + 1; i < len(a.xdsChannelConfigs); i++ { - if a.fallbackToServer(a.xdsChannelConfigs[i]) { - // Since we have successfully triggered fallback, we don't have to - // notify watchers about the connectivity error. - return - } - } - - // Having exhausted all available servers, we must notify watchers of the - // connectivity error - A71. - a.propagateConnectivityErrorToAllWatchers(err) -} - -// propagateConnectivityErrorToAllWatchers propagates the given connection error -// to all watchers of all resources. -// -// Only executed in the context of a serializer callback. -func (a *authority) propagateConnectivityErrorToAllWatchers(err error) { - for _, rType := range a.resources { - for _, state := range rType { - for watcher := range state.watchers { - if state.cache == nil { - a.watcherCallbackSerializer.TrySchedule(func(context.Context) { - watcher.ResourceError(xdsresource.NewErrorf(xdsresource.ErrorTypeConnection, "xds: error received from xDS stream: %v", err), func() {}) - }) - } else { - a.watcherCallbackSerializer.TrySchedule(func(context.Context) { - watcher.AmbientError(xdsresource.NewErrorf(xdsresource.ErrorTypeConnection, "xds: error received from xDS stream: %v", err), func() {}) - }) - } - } - } - } -} - -// serverIndexForConfig returns the index of the xdsChannelConfig matching the -// provided server config, panicking if no match is found (which indicates a -// programming error). -func (a *authority) serverIndexForConfig(sc *bootstrap.ServerConfig) int { - for i, cfg := range a.xdsChannelConfigs { - if cfg.serverConfig.Equal(sc) { - return i - } - } - panic(fmt.Sprintf("no server config matching %v found", sc)) -} - -// Determines the server to fallback to and triggers fallback to the same. If -// required, creates an xdsChannel to that server, and re-subscribes to all -// existing resources. -// -// Only executed in the context of a serializer callback. 
-func (a *authority) fallbackToServer(xc *xdsChannelWithConfig) bool { - if a.logger.V(2) { - a.logger.Infof("Attempting to initiate fallback to server %q", xc.serverConfig) - } - - if xc.channel != nil { - if a.logger.V(2) { - a.logger.Infof("Channel to the next server in the list %q already exists", xc.serverConfig) - } - return false - } - - channel, cleanup, err := a.getChannelForADS(xc.serverConfig, a) - if err != nil { - a.logger.Errorf("Failed to create xDS channel: %v", err) - return false - } - xc.channel = channel - xc.cleanup = cleanup - a.activeXDSChannel = xc - - // Subscribe to all existing resources from the new management server. - for typ, resources := range a.resources { - for name, state := range resources { - if a.logger.V(2) { - a.logger.Infof("Resubscribing to resource of type %q and name %q", typ.TypeName(), name) - } - xc.channel.subscribe(typ, name) - - // Add the new channel to the list of xdsChannels from which this - // resource has been requested from. Retain the cached resource and - // the set of existing watchers (and other metadata fields) in the - // resource state. - state.xdsChannelConfigs[xc] = true - } - } - return true -} - -// adsResourceUpdate is called to notify the authority about a resource update -// received on the ADS stream. -// -// This method is called by the xDS client implementation (on all interested -// authorities) when a stream error is reported by an xdsChannel. -func (a *authority) adsResourceUpdate(serverConfig *bootstrap.ServerConfig, rType xdsresource.Type, updates map[string]ads.DataAndErrTuple, md xdsresource.UpdateMetadata, onDone func()) { - a.xdsClientSerializer.TrySchedule(func(context.Context) { - a.handleADSResourceUpdate(serverConfig, rType, updates, md, onDone) - }) -} - -// handleADSResourceUpdate processes an update from the xDS client, updating the -// resource cache and notifying any registered watchers of the update. -// -// If the update is received from a higher priority xdsChannel that was -// previously down, we revert to it and close all lower priority xdsChannels. -// -// Once the update has been processed by all watchers, the authority is expected -// to invoke the onDone callback. -// -// Only executed in the context of a serializer callback. -func (a *authority) handleADSResourceUpdate(serverConfig *bootstrap.ServerConfig, rType xdsresource.Type, updates map[string]ads.DataAndErrTuple, md xdsresource.UpdateMetadata, onDone func()) { - a.handleRevertingToPrimaryOnUpdate(serverConfig) - - // We build a list of callback funcs to invoke, and invoke them at the end - // of this method instead of inline (when handling the update for a - // particular resource), because we want to make sure that all calls to - // increment watcherCnt happen before any callbacks are invoked. This will - // ensure that the onDone callback is never invoked before all watcher - // callbacks are invoked, and the watchers have processed the update. - watcherCnt := new(atomic.Int64) - done := func() { - if watcherCnt.Add(-1) == 0 { - onDone() - } - } - funcsToSchedule := []func(context.Context){} - defer func() { - if len(funcsToSchedule) == 0 { - // When there are no watchers for the resources received as part of - // this update, invoke onDone explicitly to unblock the next read on - // the ADS stream. 
- onDone() - return - } - for _, f := range funcsToSchedule { - a.watcherCallbackSerializer.ScheduleOr(f, onDone) - } - }() - - resourceStates := a.resources[rType] - for name, uErr := range updates { - state, ok := resourceStates[name] - if !ok { - continue - } - - // On error, keep previous version of the resource. But update status - // and error. - if uErr.Err != nil { - xdsClientResourceUpdatesInvalidMetric.Record(a.metricsRecorder, 1, a.target, serverConfig.ServerURI(), rType.TypeName()) - state.md.ErrState = md.ErrState - state.md.Status = md.Status - for watcher := range state.watchers { - watcher := watcher - err := uErr.Err - watcherCnt.Add(1) - if state.cache == nil { - funcsToSchedule = append(funcsToSchedule, func(context.Context) { watcher.ResourceError(err, done) }) - } else { - funcsToSchedule = append(funcsToSchedule, func(context.Context) { watcher.AmbientError(err, done) }) - } - } - continue - } - - xdsClientResourceUpdatesValidMetric.Record(a.metricsRecorder, 1, a.target, serverConfig.ServerURI(), rType.TypeName()) - - if state.deletionIgnored { - state.deletionIgnored = false - a.logger.Infof("A valid update was received for resource %q of type %q after previously ignoring a deletion", name, rType.TypeName()) - } - // Notify watchers if any of these conditions are met: - // - this is the first update for this resource - // - this update is different from the one currently cached - // - the previous update for this resource was NACKed, but the update - // before that was the same as this update. - if state.cache == nil || !state.cache.RawEqual(uErr.Resource) || state.md.ErrState != nil { - // Update the resource cache. - if a.logger.V(2) { - a.logger.Infof("Resource type %q with name %q added to cache", rType.TypeName(), name) - } - state.cache = uErr.Resource - - for watcher := range state.watchers { - watcher := watcher - resource := uErr.Resource - watcherCnt.Add(1) - funcsToSchedule = append(funcsToSchedule, func(context.Context) { watcher.ResourceChanged(resource, done) }) - } - } - - // Set status to ACK, and clear error state. The metadata might be a - // NACK metadata because some other resources in the same response - // are invalid. - state.md = md - state.md.ErrState = nil - state.md.Status = xdsresource.ServiceStatusACKed - if md.ErrState != nil { - state.md.Version = md.ErrState.Version - } - } - - // If this resource type requires that all resources be present in every - // SotW response from the server, a response that does not include a - // previously seen resource will be interpreted as a deletion of that - // resource unless ignore_resource_deletion option was set in the server - // config. - if !rType.AllResourcesRequiredInSotW() { - return - } - for name, state := range resourceStates { - if state.cache == nil { - // If the resource state does not contain a cached update, which can - // happen when: - // - resource was newly requested but has not yet been received, or, - // - resource was removed as part of a previous update, - // we don't want to generate an error for the watchers. - // - // For the first of the above two conditions, this ADS response may - // be in reaction to an earlier request that did not yet request the - // new resource, so its absence from the response does not - // necessarily indicate that the resource does not exist. For that - // case, we rely on the request timeout instead. 
- // - // For the second of the above two conditions, we already generated - // an error when we received the first response which removed this - // resource. So, there is no need to generate another one. - continue - } - if _, ok := updates[name]; ok { - // If the resource was present in the response, move on. - continue - } - if state.md.Status == xdsresource.ServiceStatusNotExist { - // The metadata status is set to "ServiceStatusNotExist" if a - // previous update deleted this resource, in which case we do not - // want to repeatedly call the watch callbacks with a - // "resource-not-found" error. - continue - } - if serverConfig.ServerFeaturesIgnoreResourceDeletion() { - // Per A53, resource deletions are ignored if the - // `ignore_resource_deletion` server feature is enabled through the - // bootstrap configuration. If the resource deletion is to be - // ignored, the resource is not removed from the cache and the - // corresponding ResourceError() callback is not invoked on - // the watchers. - if !state.deletionIgnored { - state.deletionIgnored = true - a.logger.Warningf("Ignoring resource deletion for resource %q of type %q", name, rType.TypeName()) - } - continue - } - - // If we get here, it means that the resource exists in cache, but not - // in the new update. Delete the resource from cache, and send a - // resource not found error to indicate that the resource has been - // removed. Metadata for the resource is still maintained, as this is - // required by CSDS. - state.cache = nil - state.md = xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusNotExist} - for watcher := range state.watchers { - watcher := watcher - watcherCnt.Add(1) - funcsToSchedule = append(funcsToSchedule, func(context.Context) { - watcher.ResourceError(xdsresource.NewErrorf(xdsresource.ErrorTypeResourceNotFound, "xds: resource %q of type %q has been removed", name, rType.TypeName()), done) - }) - } - } -} - -// adsResourceDoesNotExist is called by the xDS client implementation (on all -// interested authorities) to notify the authority that a subscribed resource -// does not exist. -func (a *authority) adsResourceDoesNotExist(rType xdsresource.Type, resourceName string) { - a.xdsClientSerializer.TrySchedule(func(context.Context) { - a.handleADSResourceDoesNotExist(rType, resourceName) - }) -} - -// handleADSResourceDoesNotExist is called when a subscribed resource does not -// exist. It removes the resource from the cache, updates the metadata status -// to ServiceStatusNotExist, and notifies all watchers that the resource does -// not exist. 
-func (a *authority) handleADSResourceDoesNotExist(rType xdsresource.Type, resourceName string) { - if a.logger.V(2) { - a.logger.Infof("Watch for resource %q of type %s timed out", resourceName, rType.TypeName()) - } - - resourceStates := a.resources[rType] - if resourceStates == nil { - if a.logger.V(2) { - a.logger.Infof("Resource %q of type %s currently not being watched", resourceName, rType.TypeName()) - } - return - } - state, ok := resourceStates[resourceName] - if !ok { - if a.logger.V(2) { - a.logger.Infof("Resource %q of type %s currently not being watched", resourceName, rType.TypeName()) - } - return - } - - state.cache = nil - state.md = xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusNotExist} - for watcher := range state.watchers { - watcher := watcher - a.watcherCallbackSerializer.TrySchedule(func(context.Context) { - watcher.ResourceError(xdsresource.NewErrorf(xdsresource.ErrorTypeResourceNotFound, "xds: resource %q of type %q does not exist", resourceName, rType.TypeName()), func() {}) - }) - } -} - -// handleRevertingToPrimaryOnUpdate is called when a resource update is received -// from the xDS client. -// -// If the update is from the currently active server, nothing is done. Else, all -// lower priority servers are closed and the active server is reverted to the -// highest priority server that sent the update. -// -// This method is only executed in the context of a serializer callback. -func (a *authority) handleRevertingToPrimaryOnUpdate(serverConfig *bootstrap.ServerConfig) { - if a.activeXDSChannel != nil && a.activeXDSChannel.serverConfig.Equal(serverConfig) { - // If the resource update is from the current active server, nothing - // needs to be done from fallback point of view. - return - } - - if a.logger.V(2) { - a.logger.Infof("Received update from non-active server %q", serverConfig) - } - - // If the resource update is not from the current active server, it means - // that we have received an update from a higher priority server and we need - // to revert back to it. This method guarantees that when an update is - // received from a server, all lower priority servers are closed. - serverIdx := a.serverIndexForConfig(serverConfig) - a.activeXDSChannel = a.xdsChannelConfigs[serverIdx] - - // Close all lower priority channels. - // - // But before closing any channel, we need to unsubscribe from any resources - // that were subscribed to on this channel. Resources could be subscribed to - // from multiple channels as we fallback to lower priority servers. But when - // a higher priority one comes back up, we need to unsubscribe from all - // lower priority ones before releasing the reference to them. - for i := serverIdx + 1; i < len(a.xdsChannelConfigs); i++ { - cfg := a.xdsChannelConfigs[i] - - for rType, rState := range a.resources { - for resourceName, state := range rState { - for xcc := range state.xdsChannelConfigs { - if xcc != cfg { - continue - } - // If the current resource is subscribed to on this channel, - // unsubscribe, and remove the channel from the list of - // channels that this resource is subscribed to. - xcc.channel.unsubscribe(rType, resourceName) - delete(state.xdsChannelConfigs, xcc) - } - } - } - - // Release the reference to the channel. - if cfg.cleanup != nil { - if a.logger.V(2) { - a.logger.Infof("Closing lower priority server %q", cfg.serverConfig) - } - cfg.cleanup() - cfg.cleanup = nil - } - cfg.channel = nil - } -} - -// watchResource registers a new watcher for the specified resource type and -// name. 
It returns a function that can be called to cancel the watch. -// -// If this is the first watch for any resource on this authority, an xdsChannel -// to the first management server (from the list of server configurations) will -// be created. -// -// If this is the first watch for the given resource name, it will subscribe to -// the resource with the xdsChannel. If a cached copy of the resource exists, it -// will immediately notify the new watcher. When the last watcher for a resource -// is removed, it will unsubscribe the resource from the xdsChannel. -func (a *authority) watchResource(rType xdsresource.Type, resourceName string, watcher xdsresource.ResourceWatcher) func() { - cleanup := func() {} - done := make(chan struct{}) - - a.xdsClientSerializer.ScheduleOr(func(context.Context) { - defer close(done) - - if a.logger.V(2) { - a.logger.Infof("New watch for type %q, resource name %q", rType.TypeName(), resourceName) - } - - xdsChannel, err := a.xdsChannelToUse() - if err != nil { - a.watcherCallbackSerializer.TrySchedule(func(context.Context) { watcher.ResourceError(err, func() {}) }) - return - } - - // Lookup the entry for the resource type in the top-level map. If there is - // no entry for this resource type, create one. - resources := a.resources[rType] - if resources == nil { - resources = make(map[string]*resourceState) - a.resources[rType] = resources - } - - // Lookup the resource state for the particular resource name that the watch - // is being registered for. If this is the first watch for this resource - // name, request it from the management server. - state := resources[resourceName] - if state == nil { - if a.logger.V(2) { - a.logger.Infof("First watch for type %q, resource name %q", rType.TypeName(), resourceName) - } - state = &resourceState{ - watchers: make(map[xdsresource.ResourceWatcher]bool), - md: xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusRequested}, - xdsChannelConfigs: map[*xdsChannelWithConfig]bool{xdsChannel: true}, - } - resources[resourceName] = state - xdsChannel.channel.subscribe(rType, resourceName) - } - // Always add the new watcher to the set of watchers. - state.watchers[watcher] = true - - // If we have a cached copy of the resource, notify the new watcher - // immediately. - if state.cache != nil { - if a.logger.V(2) { - a.logger.Infof("Resource type %q with resource name %q found in cache: %s", rType.TypeName(), resourceName, state.cache.ToJSON()) - } - // state can only be accessed in the context of an - // xdsClientSerializer callback. Hence making a copy of the cached - // resource here for watchCallbackSerializer. - resource := state.cache - a.watcherCallbackSerializer.TrySchedule(func(context.Context) { watcher.ResourceChanged(resource, func() {}) }) - } - // If last update was NACK'd, notify the new watcher of error - // immediately as well. - if state.md.Status == xdsresource.ServiceStatusNACKed { - if a.logger.V(2) { - a.logger.Infof("Resource type %q with resource name %q was NACKed", rType.TypeName(), resourceName) - } - // state can only be accessed in the context of an - // xdsClientSerializer callback. Hence making a copy of the error - // here for watchCallbackSerializer. 
-			err := state.md.ErrState.Err
-			if state.cache == nil {
-				a.watcherCallbackSerializer.TrySchedule(func(context.Context) { watcher.ResourceError(err, func() {}) })
-			} else {
-				a.watcherCallbackSerializer.TrySchedule(func(context.Context) { watcher.AmbientError(err, func() {}) })
-			}
-		}
-		// If the metadata field is updated to indicate that the management
-		// server does not have this resource, notify the new watcher.
-		if state.md.Status == xdsresource.ServiceStatusNotExist {
-			a.watcherCallbackSerializer.TrySchedule(func(context.Context) {
-				watcher.ResourceError(xdsresource.NewErrorf(xdsresource.ErrorTypeResourceNotFound, "xds: resource %q of type %q does not exist", resourceName, rType.TypeName()), func() {})
-			})
-		}
-		cleanup = a.unwatchResource(rType, resourceName, watcher)
-	}, func() {
-		if a.logger.V(2) {
-			a.logger.Infof("Failed to schedule a watch for type %q, resource name %q, because the xDS client is closed", rType.TypeName(), resourceName)
-		}
-		close(done)
-	})
-	<-done
-	return cleanup
-}
-
-func (a *authority) unwatchResource(rType xdsresource.Type, resourceName string, watcher xdsresource.ResourceWatcher) func() {
-	return sync.OnceFunc(func() {
-		done := make(chan struct{})
-		a.xdsClientSerializer.ScheduleOr(func(context.Context) {
-			defer close(done)
-
-			if a.logger.V(2) {
-				a.logger.Infof("Canceling a watch for type %q, resource name %q", rType.TypeName(), resourceName)
-			}
-
-			// Lookup the resource type from the resource cache. The entry is
-			// guaranteed to be present, since *we* were the ones who added it in
-			// there when the watch was registered.
-			resources := a.resources[rType]
-			state := resources[resourceName]
-
-			// Delete this particular watcher from the list of watchers, so that its
-			// callback will not be invoked in the future.
-			delete(state.watchers, watcher)
-			if len(state.watchers) > 0 {
-				if a.logger.V(2) {
-					a.logger.Infof("Other watchers exist for type %q, resource name %q", rType.TypeName(), resourceName)
-				}
-				return
-			}
-
-			// There are no more watchers for this resource. Unsubscribe this
-			// resource from all channels where it was subscribed to and delete
-			// the state associated with it.
-			if a.logger.V(2) {
-				a.logger.Infof("Removing last watch for resource name %q", resourceName)
-			}
-			for xcc := range state.xdsChannelConfigs {
-				xcc.channel.unsubscribe(rType, resourceName)
-			}
-			delete(resources, resourceName)
-
-			// If there are no more watchers for this resource type, delete the
-			// resource type from the top-level map.
-			if len(resources) == 0 {
-				if a.logger.V(2) {
-					a.logger.Infof("Removing last watch for resource type %q", rType.TypeName())
-				}
-				delete(a.resources, rType)
-			}
-			// If there are no more watchers for any resource type, release the
-			// reference to the xdsChannels.
-			if len(a.resources) == 0 {
-				if a.logger.V(2) {
-					a.logger.Infof("Removing last watch for any resource type, releasing reference to the xdsChannel")
-				}
-				a.closeXDSChannels()
-			}
-		}, func() { close(done) })
-		<-done
-	})
-}
-
-// xdsChannelToUse returns the xdsChannel to use for communicating with the
-// management server. If an active channel is available, it returns that.
-// Otherwise, it creates a new channel using the first server configuration in
-// the list of configurations, and returns that.
-//
-// A non-nil error is returned if the channel creation fails.
-//
-// Only executed in the context of a serializer callback.
-func (a *authority) xdsChannelToUse() (*xdsChannelWithConfig, error) { - if a.activeXDSChannel != nil { - return a.activeXDSChannel, nil - } - - sc := a.xdsChannelConfigs[0].serverConfig - xc, cleanup, err := a.getChannelForADS(sc, a) - if err != nil { - return nil, err - } - a.xdsChannelConfigs[0].channel = xc - a.xdsChannelConfigs[0].cleanup = cleanup - a.activeXDSChannel = a.xdsChannelConfigs[0] - return a.activeXDSChannel, nil -} - -// closeXDSChannels closes all the xDS channels associated with this authority, -// when there are no more watchers for any resource type. -// -// Only executed in the context of a serializer callback. -func (a *authority) closeXDSChannels() { - for _, xcc := range a.xdsChannelConfigs { - if xcc.cleanup != nil { - xcc.cleanup() - xcc.cleanup = nil - } - xcc.channel = nil - } - a.activeXDSChannel = nil -} - -// watcherExistsForUncachedResource returns true if there is at least one -// watcher for a resource that has not yet been cached. -// -// Only executed in the context of a serializer callback. -func (a *authority) watcherExistsForUncachedResource() bool { - for _, resourceStates := range a.resources { - for _, state := range resourceStates { - if state.md.Status == xdsresource.ServiceStatusRequested { - return true - } - } - } - return false -} - -// dumpResources returns a dump of the resource configuration cached by this -// authority, for CSDS purposes. -func (a *authority) dumpResources() []*v3statuspb.ClientConfig_GenericXdsConfig { - var ret []*v3statuspb.ClientConfig_GenericXdsConfig - done := make(chan struct{}) - - a.xdsClientSerializer.ScheduleOr(func(context.Context) { - defer close(done) - ret = a.resourceConfig() - }, func() { close(done) }) - <-done - return ret -} - -// resourceConfig returns a slice of GenericXdsConfig objects representing the -// current state of all resources managed by this authority. This is used for -// reporting the current state of the xDS client. -// -// Only executed in the context of a serializer callback. 
-func (a *authority) resourceConfig() []*v3statuspb.ClientConfig_GenericXdsConfig { - var ret []*v3statuspb.ClientConfig_GenericXdsConfig - for rType, resourceStates := range a.resources { - typeURL := rType.TypeURL() - for name, state := range resourceStates { - var raw *anypb.Any - if state.cache != nil { - raw = state.cache.Raw() - } - config := &v3statuspb.ClientConfig_GenericXdsConfig{ - TypeUrl: typeURL, - Name: name, - VersionInfo: state.md.Version, - XdsConfig: raw, - LastUpdated: timestamppb.New(state.md.Timestamp), - ClientStatus: serviceStatusToProto(state.md.Status), - } - if errState := state.md.ErrState; errState != nil { - config.ErrorState = &v3adminpb.UpdateFailureState{ - LastUpdateAttempt: timestamppb.New(errState.Timestamp), - Details: errState.Err.Error(), - VersionInfo: errState.Version, - } - } - ret = append(ret, config) - } - } - return ret -} - -func (a *authority) close() { - a.xdsClientSerializerClose() - <-a.xdsClientSerializer.Done() - if a.logger.V(2) { - a.logger.Infof("Closed") - } -} - -func serviceStatusToProto(serviceStatus xdsresource.ServiceStatus) v3adminpb.ClientResourceStatus { - switch serviceStatus { - case xdsresource.ServiceStatusUnknown: - return v3adminpb.ClientResourceStatus_UNKNOWN - case xdsresource.ServiceStatusRequested: - return v3adminpb.ClientResourceStatus_REQUESTED - case xdsresource.ServiceStatusNotExist: - return v3adminpb.ClientResourceStatus_DOES_NOT_EXIST - case xdsresource.ServiceStatusACKed: - return v3adminpb.ClientResourceStatus_ACKED - case xdsresource.ServiceStatusNACKed: - return v3adminpb.ClientResourceStatus_NACKED - default: - return v3adminpb.ClientResourceStatus_UNKNOWN - } -} diff --git a/xds/internal/xdsclient/channel.go b/xds/internal/xdsclient/channel.go deleted file mode 100644 index 60ab9290b553..000000000000 --- a/xds/internal/xdsclient/channel.go +++ /dev/null @@ -1,342 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package xdsclient - -import ( - "errors" - "fmt" - "strings" - "time" - - "google.golang.org/grpc/grpclog" - "google.golang.org/grpc/internal/backoff" - igrpclog "google.golang.org/grpc/internal/grpclog" - "google.golang.org/grpc/internal/grpcsync" - "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient/load" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/grpc/xds/internal/xdsclient/transport/ads" - "google.golang.org/grpc/xds/internal/xdsclient/transport/lrs" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" -) - -// xdsChannelEventHandler wraps callbacks used to notify the xDS client about -// events on the xdsChannel. Methods in this interface may be invoked -// concurrently and the xDS client implementation needs to handle them in a -// thread-safe manner. -type xdsChannelEventHandler interface { - // adsStreamFailure is called when the xdsChannel encounters an ADS stream - // failure. 
-	adsStreamFailure(error)
-
-	// adsResourceUpdate is called when the xdsChannel receives an ADS response
-	// from the xDS management server. The callback is provided with the
-	// following:
-	//   - the resource type of the resources in the response
-	//   - a map of resources in the response, keyed by resource name
-	//   - the metadata associated with the response
-	//   - a callback to be invoked when the update is processed
-	adsResourceUpdate(xdsresource.Type, map[string]ads.DataAndErrTuple, xdsresource.UpdateMetadata, func())
-
-	// adsResourceDoesNotExist is called when the xdsChannel determines that a
-	// requested ADS resource does not exist.
-	adsResourceDoesNotExist(xdsresource.Type, string)
-}
-
-// xdsChannelOpts holds the options for creating a new xdsChannel.
-type xdsChannelOpts struct {
-	transport          transport.Transport           // Takes ownership of this transport.
-	serverConfig       *bootstrap.ServerConfig       // Configuration of the server to connect to.
-	bootstrapConfig    *bootstrap.Config             // Complete bootstrap configuration, used to decode resources.
-	resourceTypeGetter func(string) xdsresource.Type // Function to retrieve resource parsing functionality, based on resource type.
-	eventHandler       xdsChannelEventHandler        // Callbacks for ADS stream events.
-	backoff            func(int) time.Duration       // Backoff function to use for stream retries. Defaults to exponential backoff, if unset.
-	watchExpiryTimeout time.Duration                 // Timeout for ADS resource watch expiry.
-	logPrefix          string                        // Prefix to use for logging.
-}
-
-// newXDSChannel creates a new xdsChannel instance with the provided options.
-// It performs basic validation on the provided options and initializes the
-// xdsChannel with the necessary components.
-func newXDSChannel(opts xdsChannelOpts) (*xdsChannel, error) {
-	switch {
-	case opts.transport == nil:
-		return nil, errors.New("xdsChannel: transport is nil")
-	case opts.serverConfig == nil:
-		return nil, errors.New("xdsChannel: serverConfig is nil")
-	case opts.bootstrapConfig == nil:
-		return nil, errors.New("xdsChannel: bootstrapConfig is nil")
-	case opts.resourceTypeGetter == nil:
-		return nil, errors.New("xdsChannel: resourceTypeGetter is nil")
-	case opts.eventHandler == nil:
-		return nil, errors.New("xdsChannel: eventHandler is nil")
-	}
-
-	xc := &xdsChannel{
-		transport:          opts.transport,
-		serverConfig:       opts.serverConfig,
-		bootstrapConfig:    opts.bootstrapConfig,
-		resourceTypeGetter: opts.resourceTypeGetter,
-		eventHandler:       opts.eventHandler,
-		closed:             grpcsync.NewEvent(),
-	}
-
-	l := grpclog.Component("xds")
-	logPrefix := opts.logPrefix + fmt.Sprintf("[xds-channel %p] ", xc)
-	xc.logger = igrpclog.NewPrefixLogger(l, logPrefix)
-
-	if opts.backoff == nil {
-		opts.backoff = backoff.DefaultExponential.Backoff
-	}
-	xc.ads = ads.NewStreamImpl(ads.StreamOpts{
-		Transport:          xc.transport,
-		EventHandler:       xc,
-		Backoff:            opts.backoff,
-		NodeProto:          xc.bootstrapConfig.Node(),
-		WatchExpiryTimeout: opts.watchExpiryTimeout,
-		LogPrefix:          logPrefix,
-	})
-	xc.lrs = lrs.NewStreamImpl(lrs.StreamOpts{
-		Transport: xc.transport,
-		Backoff:   opts.backoff,
-		NodeProto: xc.bootstrapConfig.Node(),
-		LogPrefix: logPrefix,
-	})
-	return xc, nil
-}
-
-// xdsChannel represents a client channel to a management server, and is
-// responsible for managing the lifecycle of the ADS and LRS streams. It invokes
-// callbacks on the registered event handler for various ADS stream events.
-type xdsChannel struct { - // The following fields are initialized at creation time and are read-only - // after that, and hence need not be guarded by a mutex. - transport transport.Transport // Takes ownership of this transport (used to make streaming calls). - ads *ads.StreamImpl // An ADS stream to the management server. - lrs *lrs.StreamImpl // An LRS stream to the management server. - serverConfig *bootstrap.ServerConfig // Configuration of the server to connect to. - bootstrapConfig *bootstrap.Config // Complete bootstrap configuration, used to decode resources. - resourceTypeGetter func(string) xdsresource.Type // Function to retrieve resource parsing functionality, based on resource type. - eventHandler xdsChannelEventHandler // Callbacks for ADS stream events. - logger *igrpclog.PrefixLogger // Logger to use for logging. - closed *grpcsync.Event // Fired when the channel is closed. -} - -func (xc *xdsChannel) close() { - xc.closed.Fire() - xc.ads.Stop() - xc.lrs.Stop() - xc.transport.Close() - xc.logger.Infof("Shutdown") -} - -// reportLoad returns a load.Store that can be used to report load to the LRS, and a -// function that can be called to stop reporting load. -func (xc *xdsChannel) reportLoad() (*load.Store, func()) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Attempt to start load reporting on closed channel") - } - return nil, func() {} - } - return xc.lrs.ReportLoad() -} - -// subscribe adds a subscription for the given resource name of the given -// resource type on the ADS stream. -func (xc *xdsChannel) subscribe(typ xdsresource.Type, name string) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Attempt to subscribe to an xDS resource of type %s and name %q on a closed channel", typ.TypeName(), name) - } - return - } - xc.ads.Subscribe(typ, name) -} - -// unsubscribe removes the subscription for the given resource name of the given -// resource type from the ADS stream. -func (xc *xdsChannel) unsubscribe(typ xdsresource.Type, name string) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Attempt to unsubscribe to an xDS resource of type %s and name %q on a closed channel", typ.TypeName(), name) - } - return - } - xc.ads.Unsubscribe(typ, name) -} - -// The following OnADSXxx() methods implement the ads.StreamEventHandler interface -// and are invoked by the ADS stream implementation. - -// OnADSStreamError is invoked when an error occurs on the ADS stream. It -// propagates the update to the xDS client. -func (xc *xdsChannel) OnADSStreamError(err error) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Received ADS stream error on a closed xdsChannel: %v", err) - } - return - } - xc.eventHandler.adsStreamFailure(err) -} - -// OnADSWatchExpiry is invoked when a watch for a resource expires. It -// propagates the update to the xDS client. -func (xc *xdsChannel) OnADSWatchExpiry(typ xdsresource.Type, name string) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Received ADS resource watch expiry for resource %q on a closed xdsChannel", name) - } - return - } - xc.eventHandler.adsResourceDoesNotExist(typ, name) -} - -// OnADSResponse is invoked when a response is received on the ADS stream. It -// decodes the resources in the response, and propagates the updates to the xDS -// client. -// -// It returns the list of resource names in the response and any errors -// encountered during decoding. 
-func (xc *xdsChannel) OnADSResponse(resp ads.Response, onDone func()) ([]string, error) { - if xc.closed.HasFired() { - if xc.logger.V(2) { - xc.logger.Infof("Received an update from the ADS stream on closed ADS stream") - } - return nil, errors.New("xdsChannel is closed") - } - - // Lookup the resource parser based on the resource type. - rType := xc.resourceTypeGetter(resp.TypeURL) - if rType == nil { - return nil, xdsresource.NewErrorf(xdsresource.ErrorTypeResourceTypeUnsupported, "Resource type URL %q unknown in response from server", resp.TypeURL) - } - - // Decode the resources and build the list of resource names to return. - opts := &xdsresource.DecodeOptions{ - BootstrapConfig: xc.bootstrapConfig, - ServerConfig: xc.serverConfig, - } - updates, md, err := decodeResponse(opts, rType, resp) - var names []string - for name := range updates { - names = append(names, name) - } - - xc.eventHandler.adsResourceUpdate(rType, updates, md, onDone) - return names, err -} - -// decodeResponse decodes the resources in the given ADS response. -// -// The opts parameter provides configuration options for decoding the resources. -// The rType parameter specifies the resource type parser to use for decoding -// the resources. -// -// The returned map contains a key for each resource in the response, with the -// value being either the decoded resource data or an error if decoding failed. -// The returned metadata includes the version of the response, the timestamp of -// the update, and the status of the update (ACKed or NACKed). -// -// If there are any errors decoding the resources, the metadata will indicate -// that the update was NACKed, and the returned error will contain information -// about all errors encountered by this function. -func decodeResponse(opts *xdsresource.DecodeOptions, rType xdsresource.Type, resp ads.Response) (map[string]ads.DataAndErrTuple, xdsresource.UpdateMetadata, error) { - timestamp := time.Now() - md := xdsresource.UpdateMetadata{ - Version: resp.Version, - Timestamp: timestamp, - } - - topLevelErrors := make([]error, 0) // Tracks deserialization errors, where we don't have a resource name. - perResourceErrors := make(map[string]error) // Tracks resource validation errors, where we have a resource name. - ret := make(map[string]ads.DataAndErrTuple) // Return result, a map from resource name to either resource data or error. - for _, r := range resp.Resources { - result, err := rType.Decode(opts, r) - - // Name field of the result is left unpopulated only when resource - // deserialization fails. - name := "" - if result != nil { - name = xdsresource.ParseName(result.Name).String() - } - if err == nil { - ret[name] = ads.DataAndErrTuple{Resource: result.Resource} - continue - } - if name == "" { - topLevelErrors = append(topLevelErrors, err) - continue - } - perResourceErrors[name] = err - // Add place holder in the map so we know this resource name was in - // the response. 
- ret[name] = ads.DataAndErrTuple{Err: xdsresource.NewError(xdsresource.ErrorTypeNACKed, err.Error())} - } - - if len(topLevelErrors) == 0 && len(perResourceErrors) == 0 { - md.Status = xdsresource.ServiceStatusACKed - return ret, md, nil - } - - md.Status = xdsresource.ServiceStatusNACKed - errRet := combineErrors(rType.TypeName(), topLevelErrors, perResourceErrors) - md.ErrState = &xdsresource.UpdateErrorMetadata{ - Version: resp.Version, - Err: xdsresource.NewError(xdsresource.ErrorTypeNACKed, errRet.Error()), - Timestamp: timestamp, - } - return ret, md, errRet -} - -func combineErrors(rType string, topLevelErrors []error, perResourceErrors map[string]error) error { - var errStrB strings.Builder - errStrB.WriteString(fmt.Sprintf("error parsing %q response: ", rType)) - if len(topLevelErrors) > 0 { - errStrB.WriteString("top level errors: ") - for i, err := range topLevelErrors { - if i != 0 { - errStrB.WriteString(";\n") - } - errStrB.WriteString(err.Error()) - } - } - if len(perResourceErrors) > 0 { - var i int - for name, err := range perResourceErrors { - if i != 0 { - errStrB.WriteString(";\n") - } - i++ - errStrB.WriteString(fmt.Sprintf("resource %q: %v", name, err.Error())) - } - } - return errors.New(errStrB.String()) -} - -func (xc *xdsChannel) triggerResourceNotFoundForTesting(rType xdsresource.Type, resourceName string) error { - if xc.closed.HasFired() { - return fmt.Errorf("triggerResourceNotFoundForTesting() called on a closed channel") - } - if xc.logger.V(2) { - xc.logger.Infof("Triggering resource not found for type: %s, resource name: %s", rType.TypeName(), resourceName) - } - xc.ads.TriggerResourceNotFoundForTesting(rType, resourceName) - return nil -} diff --git a/xds/internal/xdsclient/channel_test.go b/xds/internal/xdsclient/channel_test.go deleted file mode 100644 index 976240ebc22a..000000000000 --- a/xds/internal/xdsclient/channel_test.go +++ /dev/null @@ -1,972 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package xdsclient - -import ( - "context" - "fmt" - "strings" - "testing" - "time" - - "github.com/envoyproxy/go-control-plane/pkg/wellknown" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/google/uuid" - "google.golang.org/grpc/internal/testutils" - "google.golang.org/grpc/internal/testutils/xds/e2e" - "google.golang.org/grpc/internal/testutils/xds/fakeserver" - "google.golang.org/grpc/internal/xds/bootstrap" - xdsinternal "google.golang.org/grpc/xds/internal" - "google.golang.org/grpc/xds/internal/httpfilter" - "google.golang.org/grpc/xds/internal/httpfilter/router" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/grpc/xds/internal/xdsclient/transport/ads" - "google.golang.org/grpc/xds/internal/xdsclient/transport/grpctransport" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" - "google.golang.org/protobuf/testing/protocmp" - "google.golang.org/protobuf/types/known/anypb" - "google.golang.org/protobuf/types/known/durationpb" - - v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" - v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" - v3routerpb "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/http/router/v3" - v3httppb "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/network/http_connection_manager/v3" - v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" -) - -// Lookup the listener resource type from the resource type map. This is used to -// parse listener resources used in this test. -var listenerType = xdsinternal.ResourceTypeMapForTesting[version.V3ListenerURL].(xdsresource.Type) - -// xdsChannelForTest creates an xdsChannel to the specified serverURI for -// testing purposes. -func xdsChannelForTest(t *testing.T, serverURI, nodeID string, watchExpiryTimeout time.Duration) *xdsChannel { - t.Helper() - - // Create server configuration for the above management server. - serverCfg, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: serverURI}) - if err != nil { - t.Fatalf("Failed to create server config for testing: %v", err) - } - - // Create a grpc transport to the above management server. - tr, err := (&grpctransport.Builder{}).Build(transport.BuildOptions{ServerConfig: serverCfg}) - if err != nil { - t.Fatalf("Failed to create a transport for server config %s: %v", serverCfg, err) - } - - // Create bootstrap configuration with the top-level xds servers - // field containing the server configuration for the above - // management server. - contents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ - Servers: []byte(fmt.Sprintf(`[{ - "server_uri": %q, - "channel_creds": [{"type": "insecure"}] - }]`, serverURI)), - Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), - }) - if err != nil { - t.Fatalf("Failed to create bootstrap contents: %v", err) - } - bootstrapCfg, err := bootstrap.NewConfigFromContents(contents) - if err != nil { - t.Fatalf("Failed to create bootstrap configuration: %v", err) - } - - // Create an xdsChannel that uses everything set up above. 
- xc, err := newXDSChannel(xdsChannelOpts{ - transport: tr, - serverConfig: serverCfg, - bootstrapConfig: bootstrapCfg, - resourceTypeGetter: func(typeURL string) xdsresource.Type { - if typeURL != "type.googleapis.com/envoy.config.listener.v3.Listener" { - return nil - } - return listenerType - }, - eventHandler: newTestEventHandler(), - watchExpiryTimeout: watchExpiryTimeout, - }) - if err != nil { - t.Fatalf("Failed to create xdsChannel: %v", err) - } - t.Cleanup(func() { xc.close() }) - return xc -} - -// verifyUpdateAndMetadata verifies that the event handler received the expected -// updates and metadata. It checks that the received resource type matches the -// expected type, and that the received updates and metadata match the expected -// values. The function ignores the timestamp fields in the metadata, as those -// are expected to be different. -func verifyUpdateAndMetadata(ctx context.Context, t *testing.T, eh *testEventHandler, wantUpdates map[string]ads.DataAndErrTuple, wantMD xdsresource.UpdateMetadata) { - t.Helper() - - gotTyp, gotUpdates, gotMD, err := eh.waitForUpdate(ctx) - if err != nil { - t.Fatalf("Timeout when waiting for update callback to be invoked on the event handler") - } - - if gotTyp != listenerType { - t.Fatalf("Got resource type %v, want %v", gotTyp, listenerType) - } - opts := cmp.Options{ - protocmp.Transform(), - cmpopts.EquateEmpty(), - cmpopts.EquateErrors(), - cmpopts.IgnoreFields(xdsresource.UpdateMetadata{}, "Timestamp"), - cmpopts.IgnoreFields(xdsresource.UpdateErrorMetadata{}, "Timestamp"), - } - if diff := cmp.Diff(wantUpdates, gotUpdates, opts); diff != "" { - t.Fatalf("Got unexpected diff in update (-want +got):\n%s\n want: %+v\n got: %+v", diff, wantUpdates, gotUpdates) - } - if diff := cmp.Diff(wantMD, gotMD, opts); diff != "" { - t.Fatalf("Got unexpected diff in update (-want +got):\n%s\n want: %v\n got: %v", diff, wantMD, gotMD) - } -} - -// Tests different failure cases when creating a new xdsChannel. It checks that -// the xdsChannel creation fails when any of the required options (transport, -// serverConfig, bootstrapConfig, or resourceTypeGetter) are missing or nil. 
-func (s) TestChannel_New_FailureCases(t *testing.T) { - type fakeTransport struct { - transport.Transport - } - - tests := []struct { - name string - opts xdsChannelOpts - wantErrStr string - }{ - { - name: "emptyTransport", - opts: xdsChannelOpts{}, - wantErrStr: "transport is nil", - }, - { - name: "emptyServerConfig", - opts: xdsChannelOpts{transport: &fakeTransport{}}, - wantErrStr: "serverConfig is nil", - }, - { - name: "emptyBootstrapConfig", - opts: xdsChannelOpts{ - transport: &fakeTransport{}, - serverConfig: &bootstrap.ServerConfig{}, - }, - wantErrStr: "bootstrapConfig is nil", - }, - { - name: "emptyResourceTypeGetter", - opts: xdsChannelOpts{ - transport: &fakeTransport{}, - serverConfig: &bootstrap.ServerConfig{}, - bootstrapConfig: &bootstrap.Config{}, - }, - wantErrStr: "resourceTypeGetter is nil", - }, - { - name: "emptyEventHandler", - opts: xdsChannelOpts{ - transport: &fakeTransport{}, - serverConfig: &bootstrap.ServerConfig{}, - bootstrapConfig: &bootstrap.Config{}, - resourceTypeGetter: func(string) xdsresource.Type { return nil }, - }, - wantErrStr: "eventHandler is nil", - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - if _, err := newXDSChannel(test.opts); err == nil || !strings.Contains(err.Error(), test.wantErrStr) { - t.Fatalf("newXDSChannel() = %v, want %q", err, test.wantErrStr) - } - }) - } -} - -// Tests different scenarios of the xdsChannel receiving a response from the -// management server. In all scenarios, the xdsChannel is expected to pass the -// received responses as-is to the resource parsing functionality specified by -// the resourceTypeGetter. -func (s) TestChannel_ADS_HandleResponseFromManagementServer(t *testing.T) { - const ( - listenerName1 = "listener-name-1" - listenerName2 = "listener-name-2" - routeName = "route-name" - clusterName = "cluster-name" - ) - var ( - badlyMarshaledResource = &anypb.Any{ - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Value: []byte{1, 2, 3, 4}, - } - apiListener = &v3listenerpb.ApiListener{ - ApiListener: testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_RouteConfig{ - RouteConfig: &v3routepb.RouteConfiguration{ - Name: routeName, - VirtualHosts: []*v3routepb.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*v3routepb.Route{{ - Match: &v3routepb.RouteMatch{ - PathSpecifier: &v3routepb.RouteMatch_Prefix{Prefix: "/"}, - }, - Action: &v3routepb.Route_Route{ - Route: &v3routepb.RouteAction{ - ClusterSpecifier: &v3routepb.RouteAction_Cluster{Cluster: clusterName}, - }}}}}}}, - }, - HttpFilters: []*v3httppb.HttpFilter{e2e.RouterHTTPFilter}, - CommonHttpProtocolOptions: &v3corepb.HttpProtocolOptions{ - MaxStreamDuration: durationpb.New(time.Second), - }, - }), - } - listener1 = testutils.MarshalAny(t, &v3listenerpb.Listener{ - Name: listenerName1, - ApiListener: apiListener, - }) - listener2 = testutils.MarshalAny(t, &v3listenerpb.Listener{ - Name: listenerName2, - ApiListener: apiListener, - }) - ) - - tests := []struct { - desc string - resourceNamesToRequest []string - managementServerResponse *v3discoverypb.DiscoveryResponse - wantUpdates map[string]ads.DataAndErrTuple - wantMD xdsresource.UpdateMetadata - wantErr error - }{ - { - desc: "one bad resource - deserialization failure", - resourceNamesToRequest: []string{listenerName1}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: 
[]*anypb.Any{badlyMarshaledResource}, - }, - wantUpdates: nil, // No updates expected as the response runs into unmarshaling errors. - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusNACKed, - Version: "0", - ErrState: &xdsresource.UpdateErrorMetadata{ - Version: "0", - Err: cmpopts.AnyError, - }, - }, - wantErr: cmpopts.AnyError, - }, - { - desc: "one bad resource - validation failure", - resourceNamesToRequest: []string{listenerName1}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{testutils.MarshalAny(t, &v3listenerpb.Listener{ - Name: listenerName1, - ApiListener: &v3listenerpb.ApiListener{ - ApiListener: testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_ScopedRoutes{}, - }), - }, - })}, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName1: { - Err: cmpopts.AnyError, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusNACKed, - Version: "0", - ErrState: &xdsresource.UpdateErrorMetadata{ - Version: "0", - Err: cmpopts.AnyError, - }, - }, - }, - { - desc: "two bad resources", - resourceNamesToRequest: []string{listenerName1, listenerName2}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{ - badlyMarshaledResource, - testutils.MarshalAny(t, &v3listenerpb.Listener{ - Name: listenerName2, - ApiListener: &v3listenerpb.ApiListener{ - ApiListener: testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_ScopedRoutes{}, - }), - }, - }), - }, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName2: { - Err: cmpopts.AnyError, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusNACKed, - Version: "0", - ErrState: &xdsresource.UpdateErrorMetadata{ - Version: "0", - Err: cmpopts.AnyError, - }, - }, - }, - { - desc: "one good resource", - resourceNamesToRequest: []string{listenerName1}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{listener1}, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName1: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener1, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusACKed, - Version: "0", - }, - }, - { - desc: "one good and one bad - deserialization failure", - resourceNamesToRequest: []string{listenerName1, listenerName2}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{ - badlyMarshaledResource, - listener2, - }, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName2: { - Resource: &xdsresource.ListenerResourceData{Resource: 
xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener2, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusNACKed, - Version: "0", - ErrState: &xdsresource.UpdateErrorMetadata{ - Version: "0", - Err: cmpopts.AnyError, - }, - }, - }, - { - desc: "one good and one bad - validation failure", - resourceNamesToRequest: []string{listenerName1, listenerName2}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{ - testutils.MarshalAny(t, &v3listenerpb.Listener{ - Name: listenerName1, - ApiListener: &v3listenerpb.ApiListener{ - ApiListener: testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_ScopedRoutes{}, - }), - }, - }), - listener2, - }, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName1: {Err: cmpopts.AnyError}, - listenerName2: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener2, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusNACKed, - Version: "0", - ErrState: &xdsresource.UpdateErrorMetadata{ - Version: "0", - Err: cmpopts.AnyError, - }, - }, - }, - { - desc: "two good resources", - resourceNamesToRequest: []string{listenerName1, listenerName2}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{listener1, listener2}, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName1: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener1, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - listenerName2: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener2, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: 
xdsresource.ServiceStatusACKed, - Version: "0", - }, - }, - { - desc: "two resources when we requested one", - resourceNamesToRequest: []string{listenerName1}, - managementServerResponse: &v3discoverypb.DiscoveryResponse{ - VersionInfo: "0", - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - Resources: []*anypb.Any{listener1, listener2}, - }, - wantUpdates: map[string]ads.DataAndErrTuple{ - listenerName1: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener1, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - listenerName2: { - Resource: &xdsresource.ListenerResourceData{Resource: xdsresource.ListenerUpdate{ - InlineRouteConfig: &xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{{ - Domains: []string{"*"}, - Routes: []*xdsresource.Route{{ - Prefix: newStringP("/"), - WeightedClusters: map[string]xdsresource.WeightedCluster{clusterName: {Weight: 1}}, - ActionType: xdsresource.RouteActionRoute}, - }, - }}}, - MaxStreamDuration: time.Second, - Raw: listener2, - HTTPFilters: makeRouterFilterList(t), - }}, - }, - }, - wantMD: xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusACKed, - Version: "0", - }, - }, - } - - for _, test := range tests { - t.Run(test.desc, func(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Start a fake xDS management server and configure the response it - // would send to its client. - mgmtServer, cleanup, err := fakeserver.StartServer(nil) - if err != nil { - t.Fatalf("Failed to start fake xDS server: %v", err) - } - defer cleanup() - t.Logf("Started xDS management server on %s", mgmtServer.Address) - mgmtServer.XDSResponseChan <- &fakeserver.Response{Resp: test.managementServerResponse} - - // Create an xdsChannel for the test with a long watch expiry timer - // to ensure that watches don't expire for the duration of the test. - nodeID := uuid.New().String() - xc := xdsChannelForTest(t, mgmtServer.Address, nodeID, 2*defaultTestTimeout) - defer xc.close() - - // Subscribe to the resources specified in the test table. - for _, name := range test.resourceNamesToRequest { - xc.subscribe(listenerType, name) - } - - // Wait for an update callback on the event handler and verify the - // contents of the update and the metadata. - verifyUpdateAndMetadata(ctx, t, xc.eventHandler.(*testEventHandler), test.wantUpdates, test.wantMD) - }) - } -} - -// Tests that the xdsChannel correctly handles the expiry of a watch for a -// resource by ensuring that the watch expiry callback is invoked on the event -// handler with the expected resource type and name. -func (s) TestChannel_ADS_HandleResponseWatchExpiry(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Start an xDS management server, but do not configure any resources on it. - // This will result in the watch for a resource to timeout. 
- mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create an xdsChannel for the test with a short watch expiry timer to - // ensure that the test does not run very long, as it needs to wait for the - // watch to expire. - nodeID := uuid.New().String() - xc := xdsChannelForTest(t, mgmtServer.Address, nodeID, 2*defaultTestShortTimeout) - defer xc.close() - - // Subscribe to a listener resource. - const listenerName = "listener-name" - xc.subscribe(listenerType, listenerName) - - // Wait for the watch expiry callback on the authority to be invoked and - // verify that the watch expired for the expected resource name and type. - eventHandler := xc.eventHandler.(*testEventHandler) - gotTyp, gotName, err := eventHandler.waitForResourceDoesNotExist(ctx) - if err != nil { - t.Fatal("Timeout when waiting for the watch expiry callback to be invoked on the xDS client") - } - - if gotTyp != listenerType { - t.Fatalf("Got type %v, want %v", gotTyp, listenerType) - } - if gotName != listenerName { - t.Fatalf("Got name %v, want %v", gotName, listenerName) - } -} - -// Tests that the xdsChannel correctly handles stream failures by ensuring that -// the stream failure callback is invoked on the event handler. -func (s) TestChannel_ADS_StreamFailure(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Start an xDS management server with a restartable listener to simulate - // connection failures. - l, err := testutils.LocalTCPListener() - if err != nil { - t.Fatalf("net.Listen() failed: %v", err) - } - lis := testutils.NewRestartableListener(l) - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: lis}) - - // Configure a listener resource on the management server. - const listenerResourceName = "test-listener-resource" - const routeConfigurationName = "test-route-configuration-resource" - nodeID := uuid.New().String() - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Create an xdsChannel for the test with a long watch expiry timer - // to ensure that watches don't expire for the duration of the test. - xc := xdsChannelForTest(t, mgmtServer.Address, nodeID, 2*defaultTestTimeout) - defer xc.close() - - // Subscribe to the resource created above. - xc.subscribe(listenerType, listenerResourceName) - - // Wait for an update callback on the event handler and verify the - // contents of the update and the metadata. 
- hcm := testutils.MarshalAny(t, &v3httppb.HttpConnectionManager{ - RouteSpecifier: &v3httppb.HttpConnectionManager_Rds{Rds: &v3httppb.Rds{ - ConfigSource: &v3corepb.ConfigSource{ - ConfigSourceSpecifier: &v3corepb.ConfigSource_Ads{Ads: &v3corepb.AggregatedConfigSource{}}, - }, - RouteConfigName: routeConfigurationName, - }}, - HttpFilters: []*v3httppb.HttpFilter{e2e.HTTPFilter("router", &v3routerpb.Router{})}, - }) - listenerResource, err := anypb.New(&v3listenerpb.Listener{ - Name: listenerResourceName, - ApiListener: &v3listenerpb.ApiListener{ApiListener: hcm}, - FilterChains: []*v3listenerpb.FilterChain{{ - Name: "filter-chain-name", - Filters: []*v3listenerpb.Filter{{ - Name: wellknown.HTTPConnectionManager, - ConfigType: &v3listenerpb.Filter_TypedConfig{TypedConfig: hcm}, - }}, - }}, - }) - if err != nil { - t.Fatalf("Failed to create listener resource: %v", err) - } - - wantUpdates := map[string]ads.DataAndErrTuple{ - listenerResourceName: { - Resource: &xdsresource.ListenerResourceData{ - Resource: xdsresource.ListenerUpdate{ - RouteConfigName: routeConfigurationName, - HTTPFilters: makeRouterFilterList(t), - Raw: listenerResource, - }, - }, - }, - } - wantMD := xdsresource.UpdateMetadata{ - Status: xdsresource.ServiceStatusACKed, - Version: "1", - } - - eventHandler := xc.eventHandler.(*testEventHandler) - verifyUpdateAndMetadata(ctx, t, eventHandler, wantUpdates, wantMD) - - lis.Stop() - if err := eventHandler.waitForStreamFailure(ctx); err != nil { - t.Fatalf("Timeout when waiting for the stream failure callback to be invoked on the xDS client: %v", err) - } -} - -// Tests the behavior of the xdsChannel when a resource is unsubscribed. -// Verifies that when a previously subscribed resource is unsubscribed, a -// request is sent without the previously subscribed resource name. -func (s) TestChannel_ADS_ResourceUnsubscribe(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Start an xDS management server that uses a channel to inform the test - // about the specific LDS resource names being requested. - ldsResourcesCh := make(chan []string, 1) - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ - OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error { - t.Logf("Received request for resources: %v of type %s", req.GetResourceNames(), req.GetTypeUrl()) - - if req.TypeUrl != version.V3ListenerURL { - return fmt.Errorf("unexpected resource type URL: %q", req.TypeUrl) - } - - // Make the most recently requested names available to the test. - ldsResourcesCh <- req.GetResourceNames() - return nil - }, - }) - - // Configure two listener resources on the management server. 
- const listenerResourceName1 = "test-listener-resource-1" - const routeConfigurationName1 = "test-route-configuration-resource-1" - const listenerResourceName2 = "test-listener-resource-2" - const routeConfigurationName2 = "test-route-configuration-resource-2" - nodeID := uuid.New().String() - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{ - e2e.DefaultClientListener(listenerResourceName1, routeConfigurationName1), - e2e.DefaultClientListener(listenerResourceName2, routeConfigurationName2), - }, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Create an xdsChannel for the test with a long watch expiry timer - // to ensure that watches don't expire for the duration of the test. - xc := xdsChannelForTest(t, mgmtServer.Address, nodeID, 2*defaultTestTimeout) - defer xc.close() - - // Subscribe to the resources created above and verify that a request is - // sent for the same. - xc.subscribe(listenerType, listenerResourceName1) - xc.subscribe(listenerType, listenerResourceName2) - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerResourceName1, listenerResourceName2}); err != nil { - t.Fatal(err) - } - - // Wait for the above resources to be ACKed. - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerResourceName1, listenerResourceName2}); err != nil { - t.Fatal(err) - } - - // Unsubscribe to one of the resources created above, and ensure that the - // other resource is still being requested. - xc.unsubscribe(listenerType, listenerResourceName1) - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerResourceName2}); err != nil { - t.Fatal(err) - } - - // Since the version on the management server for the above resource is not - // changed, we will not receive an update from it for the one resource that - // we are still requesting. - - // Unsubscribe to the remaining resource, and ensure that no more resources - // are being requested. - xc.unsubscribe(listenerType, listenerResourceName2) - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{}); err != nil { - t.Fatal(err) - } -} - -// Tests the load reporting functionality of the xdsChannel. It creates an -// xdsChannel, starts load reporting, and verifies that an LRS streaming RPC is -// created. It then makes another call to the load reporting API and ensures -// that a new LRS stream is not created. Finally, it cancels the load reporting -// calls and ensures that the stream is closed when the last call is canceled. -// -// Note that this test does not actually report any load. That is already tested -// by an e2e style test in the xdsclient package. -func (s) TestChannel_LRS_ReportLoad(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Create a management server that serves LRS. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{SupportLoadReportingService: true}) - - // Create an xdsChannel for the test. Node id and watch expiry timer don't - // matter for LRS. - xc := xdsChannelForTest(t, mgmtServer.Address, "", defaultTestTimeout) - defer xc.close() - - // Start load reporting and verify that an LRS streaming RPC is created. 
- _, stopLRS1 := xc.reportLoad() - lrsServer := mgmtServer.LRSServer - if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil { - t.Fatalf("Timeout when waiting for an LRS streaming RPC to be created: %v", err) - } - - // Make another call to the load reporting API, and ensure that a new LRS - // stream is not created. - _, stopLRS2 := xc.reportLoad() - sCtx, sCancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) - defer sCancel() - if _, err := lrsServer.LRSStreamOpenChan.Receive(sCtx); err != context.DeadlineExceeded { - t.Fatal("New LRS streaming RPC created when expected to use an existing one") - } - - // Cancel the first load reporting call, and ensure that the stream does not - // close (because we have another call open). - stopLRS1() - sCtx, sCancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) - defer sCancel() - if _, err := lrsServer.LRSStreamCloseChan.Receive(sCtx); err != context.DeadlineExceeded { - t.Fatal("LRS stream closed when expected to stay open") - } - - // Cancel the second load reporting call, and ensure the stream is closed. - stopLRS2() - if _, err := lrsServer.LRSStreamCloseChan.Receive(ctx); err != nil { - t.Fatal("Timeout waiting for LRS stream to close") - } -} - -// waitForResourceNames waits for the wantNames to be received on namesCh. -// Returns a non-nil error if the context expires before that. -func waitForResourceNames(ctx context.Context, t *testing.T, namesCh chan []string, wantNames []string) error { - t.Helper() - - var lastRequestedNames []string - for ; ; <-time.After(defaultTestShortTimeout) { - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for resources %v to be requested from the management server. Last requested resources: %v", wantNames, lastRequestedNames) - case gotNames := <-namesCh: - if cmp.Equal(gotNames, wantNames, cmpopts.EquateEmpty(), cmpopts.SortSlices(func(s1, s2 string) bool { return s1 < s2 })) { - return nil - } - lastRequestedNames = gotNames - } - } -} - -// newTestEventHandler creates a new testEventHandler instance with the -// necessary channels for testing the xdsChannel. -func newTestEventHandler() *testEventHandler { - return &testEventHandler{ - typeCh: make(chan xdsresource.Type, 1), - updateCh: make(chan map[string]ads.DataAndErrTuple, 1), - mdCh: make(chan xdsresource.UpdateMetadata, 1), - nameCh: make(chan string, 1), - connErrCh: make(chan error, 1), - } -} - -// testEventHandler is a struct that implements the xdsChannelEventhandler -// interface. It is used to receive events from an xdsChannel, and has multiple -// channels on which it makes these events available to the test. -type testEventHandler struct { - typeCh chan xdsresource.Type // Resource type of an update or resource-does-not-exist error. - updateCh chan map[string]ads.DataAndErrTuple // Resource updates. - mdCh chan xdsresource.UpdateMetadata // Metadata from an update. - nameCh chan string // Name of the non-existent resource. - connErrCh chan error // Connectivity error. 
- -} - -func (ta *testEventHandler) adsStreamFailure(err error) { - ta.connErrCh <- err -} - -func (ta *testEventHandler) waitForStreamFailure(ctx context.Context) error { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ta.connErrCh: - } - return nil -} - -func (ta *testEventHandler) adsResourceUpdate(typ xdsresource.Type, updates map[string]ads.DataAndErrTuple, md xdsresource.UpdateMetadata, onDone func()) { - ta.typeCh <- typ - ta.updateCh <- updates - ta.mdCh <- md - onDone() -} - -// waitForUpdate waits for the next resource update event from the xdsChannel. -// It returns the resource type, the resource updates, and the update metadata. -// If the context is canceled, it returns an error. -func (ta *testEventHandler) waitForUpdate(ctx context.Context) (xdsresource.Type, map[string]ads.DataAndErrTuple, xdsresource.UpdateMetadata, error) { - var typ xdsresource.Type - var updates map[string]ads.DataAndErrTuple - var md xdsresource.UpdateMetadata - - select { - case typ = <-ta.typeCh: - case <-ctx.Done(): - return nil, nil, xdsresource.UpdateMetadata{}, ctx.Err() - } - - select { - case updates = <-ta.updateCh: - case <-ctx.Done(): - return nil, nil, xdsresource.UpdateMetadata{}, ctx.Err() - } - - select { - case md = <-ta.mdCh: - case <-ctx.Done(): - return nil, nil, xdsresource.UpdateMetadata{}, ctx.Err() - } - return typ, updates, md, nil -} - -func (ta *testEventHandler) adsResourceDoesNotExist(typ xdsresource.Type, name string) { - ta.typeCh <- typ - ta.nameCh <- name -} - -// waitForResourceDoesNotExist waits for the next resource-does-not-exist event -// from the xdsChannel. It returns the resource type and the resource name. If -// the context is canceled, it returns an error. -func (ta *testEventHandler) waitForResourceDoesNotExist(ctx context.Context) (xdsresource.Type, string, error) { - var typ xdsresource.Type - var name string - - select { - case typ = <-ta.typeCh: - case <-ctx.Done(): - return nil, "", ctx.Err() - } - - select { - case name = <-ta.nameCh: - case <-ctx.Done(): - return nil, "", ctx.Err() - } - return typ, name, nil -} - -func newStringP(s string) *string { - return &s -} - -func makeRouterFilter(t *testing.T) xdsresource.HTTPFilter { - routerBuilder := httpfilter.Get(router.TypeURL) - routerConfig, _ := routerBuilder.ParseFilterConfig(testutils.MarshalAny(t, &v3routerpb.Router{})) - return xdsresource.HTTPFilter{Name: "router", Filter: routerBuilder, Config: routerConfig} -} - -func makeRouterFilterList(t *testing.T) []xdsresource.HTTPFilter { - return []xdsresource.HTTPFilter{makeRouterFilter(t)} -} diff --git a/xds/internal/xdsclient/client.go b/xds/internal/xdsclient/client.go index 8fa6b1a52261..42f1a28f0c08 100644 --- a/xds/internal/xdsclient/client.go +++ b/xds/internal/xdsclient/client.go @@ -21,9 +21,11 @@ package xdsclient import ( + "context" + v3statuspb "github.com/envoyproxy/go-control-plane/envoy/service/status/v3" "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient/load" + "google.golang.org/grpc/xds/internal/clients/lrsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" ) @@ -47,7 +49,7 @@ type XDSClient interface { // the watcher is canceled. Callers need to handle this case. 
WatchResource(rType xdsresource.Type, resourceName string, watcher xdsresource.ResourceWatcher) (cancel func()) - ReportLoad(*bootstrap.ServerConfig) (*load.Store, func()) + ReportLoad(*bootstrap.ServerConfig) (*lrsclient.LoadStore, func(context.Context)) BootstrapConfig() *bootstrap.Config } diff --git a/xds/internal/xdsclient/clientimpl.go b/xds/internal/xdsclient/clientimpl.go index d8e727e3180e..19f392498f86 100644 --- a/xds/internal/xdsclient/clientimpl.go +++ b/xds/internal/xdsclient/clientimpl.go @@ -19,25 +19,25 @@ package xdsclient import ( - "context" "errors" "fmt" - "sync" "sync/atomic" "time" - v3statuspb "github.com/envoyproxy/go-control-plane/envoy/service/status/v3" + "google.golang.org/grpc" estats "google.golang.org/grpc/experimental/stats" - "google.golang.org/grpc/internal" "google.golang.org/grpc/internal/backoff" "google.golang.org/grpc/internal/grpclog" - "google.golang.org/grpc/internal/grpcsync" "google.golang.org/grpc/internal/xds/bootstrap" - xdsclientinternal "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/grpc/xds/internal/xdsclient/transport/ads" - "google.golang.org/grpc/xds/internal/xdsclient/transport/grpctransport" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" + "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" + + xdsbootstrap "google.golang.org/grpc/xds/bootstrap" + gclients "google.golang.org/grpc/xds/internal/clients" + "google.golang.org/grpc/xds/internal/clients/grpctransport" + glrsclient "google.golang.org/grpc/xds/internal/clients/lrsclient" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" + gxdsmetrics "google.golang.org/grpc/xds/internal/clients/xdsclient/metrics" ) const ( @@ -50,7 +50,6 @@ const ( ) var ( - _ XDSClient = &clientImpl{} // ErrClientClosed is returned when the xDS client is closed. ErrClientClosed = errors.New("xds: the xDS client is closed") @@ -88,59 +87,46 @@ var ( // clientImpl is the real implementation of the xDS client. The exported Client // is a wrapper of this struct with a ref count. type clientImpl struct { - // The following fields are initialized at creation time and are read-only - // after that, and therefore can be accessed without a mutex. - done *grpcsync.Event // Fired when the client is closed. - topLevelAuthority *authority // The top-level authority, used only for old-style names without an authority. - authorities map[string]*authority // Map from authority names in bootstrap to authority struct. - config *bootstrap.Config // Complete bootstrap configuration. - watchExpiryTimeout time.Duration // Expiry timeout for ADS watch. - backoff func(int) time.Duration // Backoff for ADS and LRS stream failures. - transportBuilder transport.Builder // Builder to create transports to xDS server. - resourceTypes *resourceTypeRegistry // Registry of resource types, for parsing incoming ADS responses. - serializer *grpcsync.CallbackSerializer // Serializer for invoking resource watcher callbacks. - serializerClose func() // Function to close the serializer. - logger *grpclog.PrefixLogger // Logger for this client. - metricsRecorder estats.MetricsRecorder // Metrics recorder for metrics. - target string // The gRPC target for this client. + *gxdsclient.XDSClient - // The clientImpl owns a bunch of channels to individual xDS servers - // specified in the bootstrap configuration. 
Authorities acquire references - // to these channels based on server configs within the authority config. - // The clientImpl maintains a list of interested authorities for each of - // these channels, and forwards updates from the channels to each of these - // authorities. - // - // Once all references to a channel are dropped, the channel is closed. - channelsMu sync.Mutex - xdsActiveChannels map[string]*channelState // Map from server config to in-use xdsChannels. + gConfig gxdsclient.Config + config *bootstrap.Config + logger *grpclog.PrefixLogger + target string + lrsClient *glrsclient.LRSClient } -func init() { - internal.TriggerXDSResourceNotFoundForTesting = triggerXDSResourceNotFoundForTesting - xdsclientinternal.ResourceWatchStateForTesting = resourceWatchStateForTesting +// metricsReporter implements the clients.MetricsReporter interface and uses an +// underlying stats.MetricsRecorderList to record metrics. +type metricsReporter struct { + estats.MetricsRecorder - DefaultPool = &Pool{clients: make(map[string]*clientRefCounted)} + target string } -// newClientImpl returns a new xdsClient with the given config. -func newClientImpl(config *bootstrap.Config, watchExpiryTimeout time.Duration, streamBackoff func(int) time.Duration, mr estats.MetricsRecorder, target string) (*clientImpl, error) { - ctx, cancel := context.WithCancel(context.Background()) - c := &clientImpl{ - metricsRecorder: mr, - target: target, - done: grpcsync.NewEvent(), - authorities: make(map[string]*authority), - config: config, - watchExpiryTimeout: watchExpiryTimeout, - backoff: streamBackoff, - serializer: grpcsync.NewCallbackSerializer(ctx), - serializerClose: cancel, - transportBuilder: &grpctransport.Builder{}, - resourceTypes: newResourceTypeRegistry(), - xdsActiveChannels: make(map[string]*channelState), +// ReportMetric implements the clients.MetricsReporter interface. +// It receives metric data, determines the appropriate metric based on the type +// of the data, and records it using the embedded MetricsRecorderList. +func (mr *metricsReporter) ReportMetric(metric any) { + if mr.MetricsRecorder == nil { + return + } + + switch m := metric.(type) { + case *gxdsmetrics.ResourceUpdateValid: + xdsClientResourceUpdatesValidMetric.Record(mr.MetricsRecorder, 1, mr.target, m.ServerURI, m.ResourceType) + case *gxdsmetrics.ResourceUpdateInvalid: + xdsClientResourceUpdatesInvalidMetric.Record(mr.MetricsRecorder, 1, mr.target, m.ServerURI, m.ResourceType) + case *gxdsmetrics.ServerFailure: + xdsClientServerFailureMetric.Record(mr.MetricsRecorder, 1, mr.target, m.ServerURI) } +} +func newClientImplGeneric(config *bootstrap.Config, metricsRecorder estats.MetricsRecorder, resourceTypes map[string]gxdsclient.ResourceType, target string) (*clientImpl, error) { + grpcTransportConfigs := make(map[string]grpctransport.Config) + gServerCfgMap := make(map[gxdsclient.ServerConfig]*bootstrap.ServerConfig) + + gAuthorities := make(map[string]gxdsclient.Authority) for name, cfg := range config.Authorities() { // If server configs are specified in the authorities map, use that. // Else, use the top-level server configs. 
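For reviewers unfamiliar with the new generic client metrics, here is a minimal sketch of how the metricsReporter adapter above is expected to be driven from within this package. It is illustrative only and not part of the change: the recorder can be any estats.MetricsRecorder, and the target and server URI values are placeholders; only the field names already used in the type switch (ServerURI, ResourceType) are relied on.

// Illustrative sketch (assumes this package's imports of estats and gxdsmetrics).
func reportSampleMetrics(rec estats.MetricsRecorder) {
	mr := &metricsReporter{MetricsRecorder: rec, target: "xds:///example-target"}

	// Valid and invalid resource updates are recorded per (target, server, resource type).
	mr.ReportMetric(&gxdsmetrics.ResourceUpdateValid{ServerURI: "xds-server.example.com:443", ResourceType: "ListenerResource"})
	mr.ReportMetric(&gxdsmetrics.ResourceUpdateInvalid{ServerURI: "xds-server.example.com:443", ResourceType: "ListenerResource"})

	// Server failures are recorded per (target, server).
	mr.ReportMetric(&gxdsmetrics.ServerFailure{ServerURI: "xds-server.example.com:443"})

	// Anything else falls through the type switch and is dropped.
	mr.ReportMetric("unrecognized")
}

Note that ReportMetric returns early when no MetricsRecorder was supplied, so the adapter is safe to call even with a nil recorder.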
@@ -148,328 +134,101 @@ func newClientImpl(config *bootstrap.Config, watchExpiryTimeout time.Duration, s if len(cfg.XDSServers) >= 1 { serverCfg = cfg.XDSServers } - c.authorities[name] = newAuthority(authorityBuildOptions{ - serverConfigs: serverCfg, - name: name, - serializer: c.serializer, - getChannelForADS: c.getChannelForADS, - logPrefix: clientPrefix(c), - target: target, - metricsRecorder: c.metricsRecorder, - }) - } - c.topLevelAuthority = newAuthority(authorityBuildOptions{ - serverConfigs: config.XDSServers(), - name: "", - serializer: c.serializer, - getChannelForADS: c.getChannelForADS, - logPrefix: clientPrefix(c), - target: target, - metricsRecorder: c.metricsRecorder, - }) - c.logger = prefixLogger(c) - return c, nil -} - -// BootstrapConfig returns the configuration read from the bootstrap file. -// Callers must treat the return value as read-only. -func (c *clientImpl) BootstrapConfig() *bootstrap.Config { - return c.config -} - -// close closes the xDS client and releases all resources. -func (c *clientImpl) close() { - if c.done.HasFired() { - return - } - c.done.Fire() - - c.topLevelAuthority.close() - for _, a := range c.authorities { - a.close() - } - - // Channel close cannot be invoked with the lock held, because it can race - // with stream failure happening at the same time. The latter will callback - // into the clientImpl and will attempt to grab the lock. This will result - // in a deadlock. So instead, we release the lock and wait for all active - // channels to be closed. - var channelsToClose []*xdsChannel - c.channelsMu.Lock() - for _, cs := range c.xdsActiveChannels { - channelsToClose = append(channelsToClose, cs.channel) - } - c.xdsActiveChannels = nil - c.channelsMu.Unlock() - for _, c := range channelsToClose { - c.close() - } - - c.serializerClose() - <-c.serializer.Done() - - for _, s := range c.config.XDSServers() { - for _, f := range s.Cleanups() { - f() - } - } - for _, a := range c.config.Authorities() { - for _, s := range a.XDSServers { - for _, f := range s.Cleanups() { - f() + var gServerCfg []gxdsclient.ServerConfig + for _, sc := range serverCfg { + if err := populateGRPCTransportConfigsFromServerConfig(sc, grpcTransportConfigs); err != nil { + return nil, err } + gsc := gxdsclient.ServerConfig{ + ServerIdentifier: gclients.ServerIdentifier{ServerURI: sc.ServerURI(), Extensions: grpctransport.ServerIdentifierExtension{ConfigName: sc.SelectedCreds().Type}}, + IgnoreResourceDeletion: sc.ServerFeaturesIgnoreResourceDeletion()} + gServerCfg = append(gServerCfg, gsc) + gServerCfgMap[gsc] = sc } - } - c.logger.Infof("Shutdown") -} - -// getChannelForADS returns an xdsChannel for the given server configuration. -// -// If an xdsChannel exists for the given server configuration, it is returned. -// Else a new one is created. It also ensures that the calling authority is -// added to the set of interested authorities for the returned channel. -// -// It returns the xdsChannel and a function to release the calling authority's -// reference on the channel. The caller must call the cancel function when it is -// no longer interested in this channel. -// -// A non-nil error is returned if an xdsChannel was not created. 
-func (c *clientImpl) getChannelForADS(serverConfig *bootstrap.ServerConfig, callingAuthority *authority) (*xdsChannel, func(), error) { - if c.done.HasFired() { - return nil, nil, ErrClientClosed + gAuthorities[name] = gxdsclient.Authority{XDSServers: gServerCfg} } - initLocked := func(s *channelState) { - if c.logger.V(2) { - c.logger.Infof("Adding authority %q to the set of interested authorities for channel [%p]", callingAuthority.name, s.channel) + gServerCfgs := make([]gxdsclient.ServerConfig, 0, len(config.XDSServers())) + for _, sc := range config.XDSServers() { + if err := populateGRPCTransportConfigsFromServerConfig(sc, grpcTransportConfigs); err != nil { + return nil, err } - s.interestedAuthorities[callingAuthority] = true - } - deInitLocked := func(s *channelState) { - if c.logger.V(2) { - c.logger.Infof("Removing authority %q from the set of interested authorities for channel [%p]", callingAuthority.name, s.channel) - } - delete(s.interestedAuthorities, callingAuthority) - } - - return c.getOrCreateChannel(serverConfig, initLocked, deInitLocked) -} - -// getChannelForLRS returns an xdsChannel for the given server configuration. -// -// If an xdsChannel exists for the given server configuration, it is returned. -// Else a new one is created. A reference count that tracks the number of LRS -// calls on the returned channel is incremented before returning the channel. -// -// It returns the xdsChannel and a function to decrement the reference count -// that tracks the number of LRS calls on the returned channel. The caller must -// call the cancel function when it is no longer interested in this channel. -// -// A non-nil error is returned if an xdsChannel was not created. -func (c *clientImpl) getChannelForLRS(serverConfig *bootstrap.ServerConfig) (*xdsChannel, func(), error) { - if c.done.HasFired() { - return nil, nil, ErrClientClosed + gsc := gxdsclient.ServerConfig{ + ServerIdentifier: gclients.ServerIdentifier{ServerURI: sc.ServerURI(), Extensions: grpctransport.ServerIdentifierExtension{ConfigName: sc.SelectedCreds().Type}}, + IgnoreResourceDeletion: sc.ServerFeaturesIgnoreResourceDeletion()} + gServerCfgs = append(gServerCfgs, gsc) + gServerCfgMap[gsc] = sc } - initLocked := func(s *channelState) { s.lrsRefs++ } - deInitLocked := func(s *channelState) { s.lrsRefs-- } - - return c.getOrCreateChannel(serverConfig, initLocked, deInitLocked) -} - -// getOrCreateChannel returns an xdsChannel for the given server configuration. -// -// If an active xdsChannel exists for the given server configuration, it is -// returned. If an idle xdsChannel exists for the given server configuration, it -// is revived from the idle cache and returned. Else a new one is created. -// -// The initLocked function runs some initialization logic before the channel is -// returned. This includes adding the calling authority to the set of interested -// authorities for the channel or incrementing the count of the number of LRS -// calls on the channel. -// -// The deInitLocked function runs some cleanup logic when the returned cleanup -// function is called. This involves removing the calling authority from the set -// of interested authorities for the channel or decrementing the count of the -// number of LRS calls on the channel. -// -// Both initLocked and deInitLocked are called with the c.channelsMu held. -// -// Returns the xdsChannel and a cleanup function to be invoked when the channel -// is no longer required. A non-nil error is returned if an xdsChannel was not -// created. 
-func (c *clientImpl) getOrCreateChannel(serverConfig *bootstrap.ServerConfig, initLocked, deInitLocked func(*channelState)) (*xdsChannel, func(), error) { - c.channelsMu.Lock() - defer c.channelsMu.Unlock() - - if c.logger.V(2) { - c.logger.Infof("Received request for a reference to an xdsChannel for server config %q", serverConfig) + node := config.Node() + gNode := gclients.Node{ + ID: node.GetId(), + Cluster: node.GetCluster(), + Metadata: node.Metadata, + UserAgentName: node.UserAgentName, + UserAgentVersion: node.GetUserAgentVersion(), } - - // Use an existing channel, if one exists for this server config. - if state, ok := c.xdsActiveChannels[serverConfig.String()]; ok { - if c.logger.V(2) { - c.logger.Infof("Reusing an existing xdsChannel for server config %q", serverConfig) + if node.Locality != nil { + gNode.Locality = gclients.Locality{ + Region: node.Locality.Region, + Zone: node.Locality.Zone, + SubZone: node.Locality.SubZone, } - initLocked(state) - return state.channel, c.releaseChannel(serverConfig, state, deInitLocked), nil - } - - if c.logger.V(2) { - c.logger.Infof("Creating a new xdsChannel for server config %q", serverConfig) - } - - // Create a new transport and create a new xdsChannel, and add it to the - // map of xdsChannels. - tr, err := c.transportBuilder.Build(transport.BuildOptions{ServerConfig: serverConfig}) - if err != nil { - return nil, func() {}, fmt.Errorf("xds: failed to create transport for server config %s: %v", serverConfig, err) - } - state := &channelState{ - parent: c, - serverConfig: serverConfig, - interestedAuthorities: make(map[*authority]bool), - } - channel, err := newXDSChannel(xdsChannelOpts{ - transport: tr, - serverConfig: serverConfig, - bootstrapConfig: c.config, - resourceTypeGetter: c.resourceTypes.get, - eventHandler: state, - backoff: c.backoff, - watchExpiryTimeout: c.watchExpiryTimeout, - logPrefix: clientPrefix(c), - }) - if err != nil { - return nil, func() {}, fmt.Errorf("xds: failed to create xdsChannel for server config %s: %v", serverConfig, err) } - state.channel = channel - c.xdsActiveChannels[serverConfig.String()] = state - initLocked(state) - return state.channel, c.releaseChannel(serverConfig, state, deInitLocked), nil -} -// releaseChannel is a function that is called when a reference to an xdsChannel -// needs to be released. It handles closing channels with no active references. -// -// The function takes the following parameters: -// - serverConfig: the server configuration for the xdsChannel -// - state: the state of the xdsChannel -// - deInitLocked: a function that performs any necessary cleanup for the xdsChannel -// -// The function returns another function that can be called to release the -// reference to the xdsChannel. This returned function is idempotent, meaning -// it can be called multiple times without any additional effect. 
-func (c *clientImpl) releaseChannel(serverConfig *bootstrap.ServerConfig, state *channelState, deInitLocked func(*channelState)) func() { - return sync.OnceFunc(func() { - c.channelsMu.Lock() + gTransportBuilder := grpctransport.NewBuilder(grpcTransportConfigs) - if c.logger.V(2) { - c.logger.Infof("Received request to release a reference to an xdsChannel for server config %q", serverConfig) + if resourceTypes == nil { + resourceTypes = make(map[string]gxdsclient.ResourceType) + resourceTypes[version.V3ListenerURL] = gxdsclient.ResourceType{ + TypeURL: version.V3ListenerURL, + TypeName: xdsresource.ListenerResourceTypeName, + AllResourcesRequiredInSotW: true, + Decoder: xdsresource.NewGenericListenerResourceTypeDecoder(config), } - deInitLocked(state) - - // The channel has active users. Do nothing and return. - if state.lrsRefs != 0 || len(state.interestedAuthorities) != 0 { - if c.logger.V(2) { - c.logger.Infof("xdsChannel %p has other active references", state.channel) - } - c.channelsMu.Unlock() - return + resourceTypes[version.V3RouteConfigURL] = gxdsclient.ResourceType{ + TypeURL: version.V3RouteConfigURL, + TypeName: xdsresource.RouteConfigTypeName, + AllResourcesRequiredInSotW: false, + Decoder: xdsresource.NewGenericRouteConfigResourceTypeDecoder(), } - - delete(c.xdsActiveChannels, serverConfig.String()) - if c.logger.V(2) { - c.logger.Infof("Closing xdsChannel [%p] for server config %s", state.channel, serverConfig) + resourceTypes[version.V3ClusterURL] = gxdsclient.ResourceType{ + TypeURL: version.V3ClusterURL, + TypeName: xdsresource.ClusterResourceTypeName, + AllResourcesRequiredInSotW: true, + Decoder: xdsresource.NewGenericClusterResourceTypeDecoder(config, gServerCfgMap), + } + resourceTypes[version.V3EndpointsURL] = gxdsclient.ResourceType{ + TypeURL: version.V3EndpointsURL, + TypeName: xdsresource.EndpointsResourceTypeName, + AllResourcesRequiredInSotW: false, + Decoder: xdsresource.NewGenericEndpointsResourceTypeDecoder(), } - channelToClose := state.channel - c.channelsMu.Unlock() - - channelToClose.close() - }) -} - -// dumpResources returns the status and contents of all xDS resources. -func (c *clientImpl) dumpResources() *v3statuspb.ClientConfig { - retCfg := c.topLevelAuthority.dumpResources() - for _, a := range c.authorities { - retCfg = append(retCfg, a.dumpResources()...) - } - - return &v3statuspb.ClientConfig{ - Node: c.config.Node(), - GenericXdsConfigs: retCfg, - } -} - -// channelState represents the state of an xDS channel. It tracks the number of -// LRS references, the authorities interested in the channel, and the server -// configuration used for the channel. -// -// It receives callbacks for events on the underlying ADS stream and invokes -// corresponding callbacks on interested authorities. -type channelState struct { - parent *clientImpl - serverConfig *bootstrap.ServerConfig - - // Access to the following fields should be protected by the parent's - // channelsMu. 
- channel *xdsChannel - lrsRefs int - interestedAuthorities map[*authority]bool -} - -func (cs *channelState) adsStreamFailure(err error) { - if cs.parent.done.HasFired() { - return - } - - if xdsresource.ErrType(err) != xdsresource.ErrTypeStreamFailedAfterRecv { - xdsClientServerFailureMetric.Record(cs.parent.metricsRecorder, 1, cs.parent.target, cs.serverConfig.ServerURI()) - } - - cs.parent.channelsMu.Lock() - defer cs.parent.channelsMu.Unlock() - for authority := range cs.interestedAuthorities { - authority.adsStreamFailure(cs.serverConfig, err) - } -} - -func (cs *channelState) adsResourceUpdate(typ xdsresource.Type, updates map[string]ads.DataAndErrTuple, md xdsresource.UpdateMetadata, onDone func()) { - if cs.parent.done.HasFired() { - return } - cs.parent.channelsMu.Lock() - defer cs.parent.channelsMu.Unlock() - - if len(cs.interestedAuthorities) == 0 { - onDone() - return - } + mr := &metricsReporter{MetricsRecorder: metricsRecorder, target: target} - authorityCnt := new(atomic.Int64) - authorityCnt.Add(int64(len(cs.interestedAuthorities))) - done := func() { - if authorityCnt.Add(-1) == 0 { - onDone() - } + gConfig := gxdsclient.Config{ + Authorities: gAuthorities, + Servers: gServerCfgs, + Node: gNode, + TransportBuilder: gTransportBuilder, + ResourceTypes: resourceTypes, + MetricsReporter: mr, } - for authority := range cs.interestedAuthorities { - authority.adsResourceUpdate(cs.serverConfig, typ, updates, md, done) + client, err := gxdsclient.New(gConfig) + if err != nil { + return nil, err } + c := &clientImpl{XDSClient: client, gConfig: gConfig, config: config, target: target} + c.logger = prefixLogger(c) + return c, nil } -func (cs *channelState) adsResourceDoesNotExist(typ xdsresource.Type, resourceName string) { - if cs.parent.done.HasFired() { - return - } - - cs.parent.channelsMu.Lock() - defer cs.parent.channelsMu.Unlock() - for authority := range cs.interestedAuthorities { - authority.adsResourceDoesNotExist(typ, resourceName) - } +// BootstrapConfig returns the configuration read from the bootstrap file. +// Callers must treat the return value as read-only. +func (c *clientImpl) BootstrapConfig() *bootstrap.Config { + return c.config } // clientRefCounted is ref-counted, and to be shared by the xds resolver and @@ -488,18 +247,26 @@ func (c *clientRefCounted) decrRef() int32 { return atomic.AddInt32(&c.refCount, -1) } -func triggerXDSResourceNotFoundForTesting(client XDSClient, typ xdsresource.Type, name string) error { - crc, ok := client.(*clientRefCounted) - if !ok { - return fmt.Errorf("xds: xDS client is of type %T, want %T", client, &clientRefCounted{}) - } - return crc.clientImpl.triggerResourceNotFoundForTesting(typ, name) -} - -func resourceWatchStateForTesting(client XDSClient, typ xdsresource.Type, name string) (ads.ResourceWatchState, error) { - crc, ok := client.(*clientRefCounted) - if !ok { - return ads.ResourceWatchState{}, fmt.Errorf("xds: xDS client is of type %T, want %T", client, &clientRefCounted{}) +// populateGRPCTransportConfigsFromServerConfig iterates through the channel +// credentials of the provided server configuration, builds credential bundles, +// and populates the grpctransport.Config map. 
+func populateGRPCTransportConfigsFromServerConfig(sc *bootstrap.ServerConfig, grpcTransportConfigs map[string]grpctransport.Config) error {
+	for _, cc := range sc.ChannelCreds() {
+		c := xdsbootstrap.GetCredentials(cc.Type)
+		if c == nil {
+			continue
+		}
+		bundle, _, err := c.Build(cc.Config)
+		if err != nil {
+			return fmt.Errorf("xds: failed to build credentials bundle from bootstrap for %q: %v", cc.Type, err)
+		}
+		grpcTransportConfigs[cc.Type] = grpctransport.Config{
+			Credentials: bundle,
+			GRPCNewClient: func(target string, opts ...grpc.DialOption) (*grpc.ClientConn, error) {
+				opts = append(opts, sc.DialOptions()...)
+				return grpc.NewClient(target, opts...)
+			},
+		}
 	}
-	return crc.clientImpl.resourceWatchStateForTesting(typ, name)
+	return nil
 }
diff --git a/xds/internal/xdsclient/clientimpl_loadreport.go b/xds/internal/xdsclient/clientimpl_loadreport.go
index efb41b87db53..ac0f4c50e083 100644
--- a/xds/internal/xdsclient/clientimpl_loadreport.go
+++ b/xds/internal/xdsclient/clientimpl_loadreport.go
@@ -18,24 +18,39 @@ package xdsclient
 
 import (
+	"context"
+	"sync"
+
 	"google.golang.org/grpc/internal/xds/bootstrap"
-	"google.golang.org/grpc/xds/internal/xdsclient/load"
+	"google.golang.org/grpc/xds/internal/clients"
+	"google.golang.org/grpc/xds/internal/clients/grpctransport"
+	"google.golang.org/grpc/xds/internal/clients/lrsclient"
 )
 
 // ReportLoad starts a load reporting stream to the given server. All load
 // reports to the same server share the LRS stream.
 //
-// It returns a Store for the user to report loads, a function to cancel the
-// load reporting stream.
-func (c *clientImpl) ReportLoad(server *bootstrap.ServerConfig) (*load.Store, func()) {
-	xc, releaseChannelRef, err := c.getChannelForLRS(server)
+// It returns an lrsclient.LoadStore for the user to report loads, and a
+// function to stop the load reporting when called.
+func (c *clientImpl) ReportLoad(server *bootstrap.ServerConfig) (*lrsclient.LoadStore, func(context.Context)) {
+	if c.lrsClient == nil {
+		lrsConfig := lrsclient.Config{Node: c.gConfig.Node, TransportBuilder: c.gConfig.TransportBuilder}
+		lrsC, err := lrsclient.New(lrsConfig)
+		if err != nil {
+			c.logger.Warningf("Failed to create an lrs client to the management server %v to report load: %v", server, err)
+			return nil, func(context.Context) {}
+		}
+		c.lrsClient = lrsC
+	}
+
+	load, err := c.lrsClient.ReportLoad(clients.ServerIdentifier{ServerURI: server.ServerURI(), Extensions: grpctransport.ServerIdentifierExtension{ConfigName: server.SelectedCreds().Type}})
 	if err != nil {
-		c.logger.Warningf("Failed to create a channel to the management server to report load: %v", server, err)
-		return nil, func() {}
+		c.logger.Warningf("Failed to create a load store to the management server %v to report load: %v", server, err)
+		return nil, func(context.Context) {}
 	}
-	load, stopLoadReporting := xc.reportLoad()
-	return load, func() {
-		stopLoadReporting()
-		releaseChannelRef()
+	var loadStop sync.Once
+	return load, func(ctx context.Context) {
+		loadStop.Do(func() {
+			load.Stop(ctx)
+		})
 	}
 }
diff --git a/xds/internal/xdsclient/clientimpl_watchers.go b/xds/internal/xdsclient/clientimpl_watchers.go
index 2cce17b05a24..29435993f135 100644
--- a/xds/internal/xdsclient/clientimpl_watchers.go
+++ b/xds/internal/xdsclient/clientimpl_watchers.go
@@ -18,147 +18,14 @@ package xdsclient
 
 import (
-	"context"
-	"fmt"
-	"sync"
-
-	"google.golang.org/grpc/xds/internal/xdsclient/transport/ads"
 	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource"
 )
 
-// wrappingWatcher is a wrapper around an xdsresource.ResourceWatcher that adds
-// the node ID to the error messages reported to the watcher.
-type wrappingWatcher struct {
-	xdsresource.ResourceWatcher
-	nodeID string
-}
-
-func (w *wrappingWatcher) ResourceError(err error, done func()) {
-	w.ResourceWatcher.ResourceError(fmt.Errorf("[xDS node id: %v]: %w", w.nodeID, err), done)
-}
-
-func (w *wrappingWatcher) AmbientError(err error, done func()) {
-	w.ResourceWatcher.AmbientError(fmt.Errorf("[xDS node id: %v]: %w", w.nodeID, err), done)
-}
-
 // WatchResource uses xDS to discover the resource associated with the provided
 // resource name. The resource type implementation determines how xDS responses
 // are deserialized and validated, as received from the xDS management
 // server. Upon receipt of a response from the management server, an
 // appropriate callback on the watcher is invoked.
 func (c *clientImpl) WatchResource(rType xdsresource.Type, resourceName string, watcher xdsresource.ResourceWatcher) (cancel func()) {
-	// Return early if the client is already closed.
-	//
-	// The client returned from the top-level API is a ref-counted client which
-	// contains a pointer to `clientImpl`. When all references are released, the
-	// ref-counted client sets its pointer to `nil`. And if any watch APIs are
-	// made on such a closed client, we will get here with a `nil` receiver.
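Stepping back to the clientimpl_loadreport.go change above: since ReportLoad's stop function now takes a context (and is guarded by a sync.Once), callers migrate from the old plain func() cancel to something like the sketch below. The helper name, the five-second timeout, and the assumption that the context bounds the final flush performed by LoadStore.Stop are illustrative, not part of this change; the usual imports ("context", "time", the xdsclient and bootstrap packages) are assumed.

// startLoadReporting is a hypothetical caller-side helper showing the new
// ReportLoad contract. xc is the xDS client and sc the LRS server config.
func startLoadReporting(xc xdsclient.XDSClient, sc *bootstrap.ServerConfig) (stop func()) {
	loadStore, stopLoadReporting := xc.ReportLoad(sc)
	if loadStore == nil {
		// ReportLoad failed; the stop function it returned is a no-op.
		return func() {}
	}
	return func() {
		// Bound how long stopping may take; the stop function is idempotent
		// because it is wrapped in a sync.Once inside the client.
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		stopLoadReporting(ctx)
	}
}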
- if c == nil || c.done.HasFired() { - logger.Warningf("Watch registered for name %q of type %q, but client is closed", rType.TypeName(), resourceName) - return func() {} - } - - watcher = &wrappingWatcher{ - ResourceWatcher: watcher, - nodeID: c.config.Node().GetId(), - } - - if err := c.resourceTypes.maybeRegister(rType); err != nil { - logger.Warningf("Watch registered for type %q, which is already registered", rType.TypeName()) - c.serializer.TrySchedule(func(context.Context) { watcher.ResourceError(err, func() {}) }) - return func() {} - } - - n := xdsresource.ParseName(resourceName) - a := c.getAuthorityForResource(n) - if a == nil { - logger.Warningf("Watch registered for name %q of type %q, authority %q is not found", rType.TypeName(), resourceName, n.Authority) - watcher.ResourceError(fmt.Errorf("authority %q not found in bootstrap config for resource %q", n.Authority, resourceName), func() {}) - return func() {} - } - // The watchResource method on the authority is invoked with n.String() - // instead of resourceName because n.String() canonicalizes the given name. - // So, two resource names which don't differ in the query string, but only - // differ in the order of context params will result in the same resource - // being watched by the authority. - return a.watchResource(rType, n.String(), watcher) -} - -// Gets the authority for the given resource name. -// -// See examples in this section of the gRFC: -// https://github.com/grpc/proposal/blob/master/A47-xds-federation.md#bootstrap-config-changes -func (c *clientImpl) getAuthorityForResource(name *xdsresource.Name) *authority { - // For new-style resource names, always lookup the authorities map. If the - // name does not specify an authority, we will end up looking for an entry - // in the map with the empty string as the key. - if name.Scheme == xdsresource.FederationScheme { - return c.authorities[name.Authority] - } - - // For old-style resource names, we use the top-level authority if the name - // does not specify an authority. - if name.Authority == "" { - return c.topLevelAuthority - } - return c.authorities[name.Authority] -} - -// A registry of xdsresource.Type implementations indexed by their corresponding -// type URLs. Registration of an xdsresource.Type happens the first time a watch -// for a resource of that type is invoked. 
-type resourceTypeRegistry struct { - mu sync.Mutex - types map[string]xdsresource.Type -} - -func newResourceTypeRegistry() *resourceTypeRegistry { - return &resourceTypeRegistry{types: make(map[string]xdsresource.Type)} -} - -func (r *resourceTypeRegistry) get(url string) xdsresource.Type { - r.mu.Lock() - defer r.mu.Unlock() - return r.types[url] -} - -func (r *resourceTypeRegistry) maybeRegister(rType xdsresource.Type) error { - r.mu.Lock() - defer r.mu.Unlock() - - url := rType.TypeURL() - typ, ok := r.types[url] - if ok && typ != rType { - return fmt.Errorf("attempt to re-register a resource type implementation for %v", rType.TypeName()) - } - r.types[url] = rType - return nil -} - -func (c *clientImpl) triggerResourceNotFoundForTesting(rType xdsresource.Type, resourceName string) error { - c.channelsMu.Lock() - defer c.channelsMu.Unlock() - - if c.logger.V(2) { - c.logger.Infof("Triggering resource not found for type: %s, resource name: %s", rType.TypeName(), resourceName) - } - - for _, state := range c.xdsActiveChannels { - if err := state.channel.triggerResourceNotFoundForTesting(rType, resourceName); err != nil { - return err - } - } - return nil -} - -func (c *clientImpl) resourceWatchStateForTesting(rType xdsresource.Type, resourceName string) (ads.ResourceWatchState, error) { - c.channelsMu.Lock() - defer c.channelsMu.Unlock() - - for _, state := range c.xdsActiveChannels { - if st, err := state.channel.ads.ResourceWatchStateForTesting(rType, resourceName); err == nil { - return st, nil - } - } - return ads.ResourceWatchState{}, fmt.Errorf("unable to find watch state for resource type %q and name %q", rType.TypeName(), resourceName) + return c.XDSClient.WatchResource(rType.TypeURL(), resourceName, xdsresource.GenericResourceWatcher(watcher)) } diff --git a/xds/internal/xdsclient/load/reporter.go b/xds/internal/xdsclient/load/reporter.go deleted file mode 100644 index 67e29e5bae13..000000000000 --- a/xds/internal/xdsclient/load/reporter.go +++ /dev/null @@ -1,27 +0,0 @@ -/* - * - * Copyright 2020 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package load - -// PerClusterReporter wraps the methods from the loadStore that are used here. -type PerClusterReporter interface { - CallStarted(locality string) - CallFinished(locality string, err error) - CallServerLoad(locality, name string, val float64) - CallDropped(category string) -} diff --git a/xds/internal/xdsclient/load/store.go b/xds/internal/xdsclient/load/store.go deleted file mode 100644 index f1e265ee7ddf..000000000000 --- a/xds/internal/xdsclient/load/store.go +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright 2020 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package load provides functionality to record and maintain load data. -package load - -import ( - "sync" - "sync/atomic" - "time" -) - -const negativeOneUInt64 = ^uint64(0) - -// Store keeps the loads for multiple clusters and services to be reported via -// LRS. It contains loads to reported to one LRS server. Create multiple stores -// for multiple servers. -// -// It is safe for concurrent use. -type Store struct { - // mu only protects the map (2 layers). The read/write to *perClusterStore - // doesn't need to hold the mu. - mu sync.Mutex - // clusters is a map with cluster name as the key. The second layer is a map - // with service name as the key. Each value (perClusterStore) contains data - // for a (cluster, service) pair. - // - // Note that new entries are added to this map, but never removed. This is - // potentially a memory leak. But the memory is allocated for each new - // (cluster,service) pair, and the memory allocated is just pointers and - // maps. So this shouldn't get too bad. - clusters map[string]map[string]*perClusterStore -} - -// NewStore creates a Store. -func NewStore() *Store { - return &Store{ - clusters: make(map[string]map[string]*perClusterStore), - } -} - -// Stats returns the load data for the given cluster names. Data is returned in -// a slice with no specific order. -// -// If no clusterName is given (an empty slice), all data for all known clusters -// is returned. -// -// If a cluster's Data is empty (no load to report), it's not appended to the -// returned slice. -func (s *Store) Stats(clusterNames []string) []*Data { - var ret []*Data - s.mu.Lock() - defer s.mu.Unlock() - - if len(clusterNames) == 0 { - for _, c := range s.clusters { - ret = appendClusterStats(ret, c) - } - return ret - } - - for _, n := range clusterNames { - if c, ok := s.clusters[n]; ok { - ret = appendClusterStats(ret, c) - } - } - return ret -} - -// appendClusterStats gets Data for the given cluster, append to ret, and return -// the new slice. -// -// Data is only appended to ret if it's not empty. -func appendClusterStats(ret []*Data, cluster map[string]*perClusterStore) []*Data { - for _, d := range cluster { - data := d.stats() - if data == nil { - // Skip this data if it doesn't contain any information. - continue - } - ret = append(ret, data) - } - return ret -} - -// PerCluster returns the perClusterStore for the given clusterName + -// serviceName. -func (s *Store) PerCluster(clusterName, serviceName string) PerClusterReporter { - if s == nil { - return nil - } - - s.mu.Lock() - defer s.mu.Unlock() - c, ok := s.clusters[clusterName] - if !ok { - c = make(map[string]*perClusterStore) - s.clusters[clusterName] = c - } - - if p, ok := c[serviceName]; ok { - return p - } - p := &perClusterStore{ - cluster: clusterName, - service: serviceName, - } - c[serviceName] = p - return p -} - -// perClusterStore is a repository for LB policy implementations to report store -// load data. It contains load for a (cluster, edsService) pair. -// -// It is safe for concurrent use. 
-// -// TODO(easwars): Use regular maps with mutexes instead of sync.Map here. The -// latter is optimized for two common use cases: (1) when the entry for a given -// key is only ever written once but read many times, as in caches that only -// grow, or (2) when multiple goroutines read, write, and overwrite entries for -// disjoint sets of keys. In these two cases, use of a Map may significantly -// reduce lock contention compared to a Go map paired with a separate Mutex or -// RWMutex. -// Neither of these conditions are met here, and we should transition to a -// regular map with a mutex for better type safety. -type perClusterStore struct { - cluster, service string - drops sync.Map // map[string]*uint64 - localityRPCCount sync.Map // map[string]*rpcCountData - - mu sync.Mutex - lastLoadReportAt time.Time -} - -// Update functions are called by picker for each RPC. To avoid contention, all -// updates are done atomically. - -// CallDropped adds one drop record with the given category to store. -func (ls *perClusterStore) CallDropped(category string) { - if ls == nil { - return - } - - p, ok := ls.drops.Load(category) - if !ok { - tp := new(uint64) - p, _ = ls.drops.LoadOrStore(category, tp) - } - atomic.AddUint64(p.(*uint64), 1) -} - -// CallStarted adds one call started record for the given locality. -func (ls *perClusterStore) CallStarted(locality string) { - if ls == nil { - return - } - - p, ok := ls.localityRPCCount.Load(locality) - if !ok { - tp := newRPCCountData() - p, _ = ls.localityRPCCount.LoadOrStore(locality, tp) - } - p.(*rpcCountData).incrInProgress() - p.(*rpcCountData).incrIssued() -} - -// CallFinished adds one call finished record for the given locality. -// For successful calls, err needs to be nil. -func (ls *perClusterStore) CallFinished(locality string, err error) { - if ls == nil { - return - } - - p, ok := ls.localityRPCCount.Load(locality) - if !ok { - // The map is never cleared, only values in the map are reset. So the - // case where entry for call-finish is not found should never happen. - return - } - p.(*rpcCountData).decrInProgress() - if err == nil { - p.(*rpcCountData).incrSucceeded() - } else { - p.(*rpcCountData).incrErrored() - } -} - -// CallServerLoad adds one server load record for the given locality. The -// load type is specified by desc, and its value by val. -func (ls *perClusterStore) CallServerLoad(locality, name string, d float64) { - if ls == nil { - return - } - - p, ok := ls.localityRPCCount.Load(locality) - if !ok { - // The map is never cleared, only values in the map are reset. So the - // case where entry for callServerLoad is not found should never happen. - return - } - p.(*rpcCountData).addServerLoad(name, d) -} - -// Data contains all load data reported to the Store since the most recent call -// to stats(). -type Data struct { - // Cluster is the name of the cluster this data is for. - Cluster string - // Service is the name of the EDS service this data is for. - Service string - // TotalDrops is the total number of dropped requests. - TotalDrops uint64 - // Drops is the number of dropped requests per category. - Drops map[string]uint64 - // LocalityStats contains load reports per locality. - LocalityStats map[string]LocalityData - // ReportInternal is the duration since last time load was reported (stats() - // was called). - ReportInterval time.Duration -} - -// LocalityData contains load data for a single locality. -type LocalityData struct { - // RequestStats contains counts of requests made to the locality. 
- RequestStats RequestData - // LoadStats contains server load data for requests made to the locality, - // indexed by the load type. - LoadStats map[string]ServerLoadData -} - -// RequestData contains request counts. -type RequestData struct { - // Succeeded is the number of succeeded requests. - Succeeded uint64 - // Errored is the number of requests which ran into errors. - Errored uint64 - // InProgress is the number of requests in flight. - InProgress uint64 - // Issued is the total number requests that were sent. - Issued uint64 -} - -// ServerLoadData contains server load data. -type ServerLoadData struct { - // Count is the number of load reports. - Count uint64 - // Sum is the total value of all load reports. - Sum float64 -} - -func newData(cluster, service string) *Data { - return &Data{ - Cluster: cluster, - Service: service, - Drops: make(map[string]uint64), - LocalityStats: make(map[string]LocalityData), - } -} - -// stats returns and resets all loads reported to the store, except inProgress -// rpc counts. -// -// It returns nil if the store doesn't contain any (new) data. -func (ls *perClusterStore) stats() *Data { - if ls == nil { - return nil - } - - sd := newData(ls.cluster, ls.service) - ls.drops.Range(func(key, val any) bool { - d := atomic.SwapUint64(val.(*uint64), 0) - if d == 0 { - return true - } - sd.TotalDrops += d - keyStr := key.(string) - if keyStr != "" { - // Skip drops without category. They are counted in total_drops, but - // not in per category. One example is drops by circuit breaking. - sd.Drops[keyStr] = d - } - return true - }) - ls.localityRPCCount.Range(func(key, val any) bool { - countData := val.(*rpcCountData) - succeeded := countData.loadAndClearSucceeded() - inProgress := countData.loadInProgress() - errored := countData.loadAndClearErrored() - issued := countData.loadAndClearIssued() - if succeeded == 0 && inProgress == 0 && errored == 0 && issued == 0 { - return true - } - - ld := LocalityData{ - RequestStats: RequestData{ - Succeeded: succeeded, - Errored: errored, - InProgress: inProgress, - Issued: issued, - }, - LoadStats: make(map[string]ServerLoadData), - } - countData.serverLoads.Range(func(key, val any) bool { - sum, count := val.(*rpcLoadData).loadAndClear() - if count == 0 { - return true - } - ld.LoadStats[key.(string)] = ServerLoadData{ - Count: count, - Sum: sum, - } - return true - }) - sd.LocalityStats[key.(string)] = ld - return true - }) - - ls.mu.Lock() - sd.ReportInterval = time.Since(ls.lastLoadReportAt) - ls.lastLoadReportAt = time.Now() - ls.mu.Unlock() - - if sd.TotalDrops == 0 && len(sd.Drops) == 0 && len(sd.LocalityStats) == 0 { - return nil - } - return sd -} - -type rpcCountData struct { - // Only atomic accesses are allowed for the fields. - succeeded *uint64 - errored *uint64 - inProgress *uint64 - issued *uint64 - - // Map from load desc to load data (sum+count). Loading data from map is - // atomic, but updating data takes a lock, which could cause contention when - // multiple RPCs try to report loads for the same desc. - // - // To fix the contention, shard this map. 
- serverLoads sync.Map // map[string]*rpcLoadData -} - -func newRPCCountData() *rpcCountData { - return &rpcCountData{ - succeeded: new(uint64), - errored: new(uint64), - inProgress: new(uint64), - issued: new(uint64), - } -} - -func (rcd *rpcCountData) incrSucceeded() { - atomic.AddUint64(rcd.succeeded, 1) -} - -func (rcd *rpcCountData) loadAndClearSucceeded() uint64 { - return atomic.SwapUint64(rcd.succeeded, 0) -} - -func (rcd *rpcCountData) incrErrored() { - atomic.AddUint64(rcd.errored, 1) -} - -func (rcd *rpcCountData) loadAndClearErrored() uint64 { - return atomic.SwapUint64(rcd.errored, 0) -} - -func (rcd *rpcCountData) incrInProgress() { - atomic.AddUint64(rcd.inProgress, 1) -} - -func (rcd *rpcCountData) decrInProgress() { - atomic.AddUint64(rcd.inProgress, negativeOneUInt64) // atomic.Add(x, -1) -} - -func (rcd *rpcCountData) loadInProgress() uint64 { - return atomic.LoadUint64(rcd.inProgress) // InProgress count is not clear when reading. -} - -func (rcd *rpcCountData) incrIssued() { - atomic.AddUint64(rcd.issued, 1) -} - -func (rcd *rpcCountData) loadAndClearIssued() uint64 { - return atomic.SwapUint64(rcd.issued, 0) -} - -func (rcd *rpcCountData) addServerLoad(name string, d float64) { - loads, ok := rcd.serverLoads.Load(name) - if !ok { - tl := newRPCLoadData() - loads, _ = rcd.serverLoads.LoadOrStore(name, tl) - } - loads.(*rpcLoadData).add(d) -} - -// Data for server loads (from trailers or oob). Fields in this struct must be -// updated consistently. -// -// The current solution is to hold a lock, which could cause contention. To fix, -// shard serverLoads map in rpcCountData. -type rpcLoadData struct { - mu sync.Mutex - sum float64 - count uint64 -} - -func newRPCLoadData() *rpcLoadData { - return &rpcLoadData{} -} - -func (rld *rpcLoadData) add(v float64) { - rld.mu.Lock() - rld.sum += v - rld.count++ - rld.mu.Unlock() -} - -func (rld *rpcLoadData) loadAndClear() (s float64, c uint64) { - rld.mu.Lock() - s = rld.sum - rld.sum = 0 - c = rld.count - rld.count = 0 - rld.mu.Unlock() - return -} diff --git a/xds/internal/xdsclient/load/store_test.go b/xds/internal/xdsclient/load/store_test.go deleted file mode 100644 index 44618966859c..000000000000 --- a/xds/internal/xdsclient/load/store_test.go +++ /dev/null @@ -1,468 +0,0 @@ -/* - * - * Copyright 2020 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package load - -import ( - "fmt" - "sort" - "sync" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" -) - -var ( - dropCategories = []string{"drop_for_real", "drop_for_fun"} - localities = []string{"locality-A", "locality-B"} - errTest = fmt.Errorf("test error") -) - -// rpcData wraps the rpc counts and load data to be pushed to the store. -type rpcData struct { - start, success, failure int - serverData map[string]float64 // Will be reported with successful RPCs. -} - -// TestDrops spawns a bunch of goroutines which report drop data. 
After the -// goroutines have exited, the test dumps the stats from the Store and makes -// sure they are as expected. -func TestDrops(t *testing.T) { - var ( - drops = map[string]int{ - dropCategories[0]: 30, - dropCategories[1]: 40, - "": 10, - } - wantStoreData = &Data{ - TotalDrops: 80, - Drops: map[string]uint64{ - dropCategories[0]: 30, - dropCategories[1]: 40, - }, - } - ) - - ls := perClusterStore{} - var wg sync.WaitGroup - for category, count := range drops { - for i := 0; i < count; i++ { - wg.Add(1) - go func(c string) { - ls.CallDropped(c) - wg.Done() - }(category) - } - } - wg.Wait() - - gotStoreData := ls.stats() - if diff := cmp.Diff(wantStoreData, gotStoreData, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval")); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } -} - -// TestLocalityStats spawns a bunch of goroutines which report rpc and load -// data. After the goroutines have exited, the test dumps the stats from the -// Store and makes sure they are as expected. -func TestLocalityStats(t *testing.T) { - var ( - localityData = map[string]rpcData{ - localities[0]: { - start: 40, - success: 20, - failure: 10, - serverData: map[string]float64{"net": 1, "disk": 2, "cpu": 3, "mem": 4}, - }, - localities[1]: { - start: 80, - success: 40, - failure: 20, - serverData: map[string]float64{"net": 1, "disk": 2, "cpu": 3, "mem": 4}, - }, - } - wantStoreData = &Data{ - LocalityStats: map[string]LocalityData{ - localities[0]: { - RequestStats: RequestData{ - Succeeded: 20, - Errored: 10, - InProgress: 10, - Issued: 40, - }, - LoadStats: map[string]ServerLoadData{ - "net": {Count: 20, Sum: 20}, - "disk": {Count: 20, Sum: 40}, - "cpu": {Count: 20, Sum: 60}, - "mem": {Count: 20, Sum: 80}, - }, - }, - localities[1]: { - RequestStats: RequestData{ - Succeeded: 40, - Errored: 20, - InProgress: 20, - Issued: 80, - }, - LoadStats: map[string]ServerLoadData{ - "net": {Count: 40, Sum: 40}, - "disk": {Count: 40, Sum: 80}, - "cpu": {Count: 40, Sum: 120}, - "mem": {Count: 40, Sum: 160}, - }, - }, - }, - } - ) - - ls := perClusterStore{} - var wg sync.WaitGroup - for locality, data := range localityData { - wg.Add(data.start) - for i := 0; i < data.start; i++ { - go func(l string) { - ls.CallStarted(l) - wg.Done() - }(locality) - } - // The calls to callStarted() need to happen before the other calls are - // made. Hence the wait here. - wg.Wait() - - wg.Add(data.success) - for i := 0; i < data.success; i++ { - go func(l string, serverData map[string]float64) { - ls.CallFinished(l, nil) - for n, d := range serverData { - ls.CallServerLoad(l, n, d) - } - wg.Done() - }(locality, data.serverData) - } - wg.Add(data.failure) - for i := 0; i < data.failure; i++ { - go func(l string) { - ls.CallFinished(l, errTest) - wg.Done() - }(locality) - } - wg.Wait() - } - - gotStoreData := ls.stats() - if diff := cmp.Diff(wantStoreData, gotStoreData, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval")); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } -} - -func TestResetAfterStats(t *testing.T) { - // Push a bunch of drops, call stats and load stats, and leave inProgress to be non-zero. - // Dump the stats. Verify expected - // Push the same set of loads as before - // Now dump and verify the newly expected ones. 
- var ( - drops = map[string]int{ - dropCategories[0]: 30, - dropCategories[1]: 40, - } - localityData = map[string]rpcData{ - localities[0]: { - start: 40, - success: 20, - failure: 10, - serverData: map[string]float64{"net": 1, "disk": 2, "cpu": 3, "mem": 4}, - }, - localities[1]: { - start: 80, - success: 40, - failure: 20, - serverData: map[string]float64{"net": 1, "disk": 2, "cpu": 3, "mem": 4}, - }, - } - wantStoreData = &Data{ - TotalDrops: 70, - Drops: map[string]uint64{ - dropCategories[0]: 30, - dropCategories[1]: 40, - }, - LocalityStats: map[string]LocalityData{ - localities[0]: { - RequestStats: RequestData{ - Succeeded: 20, - Errored: 10, - InProgress: 10, - Issued: 40, - }, - - LoadStats: map[string]ServerLoadData{ - "net": {Count: 20, Sum: 20}, - "disk": {Count: 20, Sum: 40}, - "cpu": {Count: 20, Sum: 60}, - "mem": {Count: 20, Sum: 80}, - }, - }, - localities[1]: { - RequestStats: RequestData{ - Succeeded: 40, - Errored: 20, - InProgress: 20, - Issued: 80, - }, - - LoadStats: map[string]ServerLoadData{ - "net": {Count: 40, Sum: 40}, - "disk": {Count: 40, Sum: 80}, - "cpu": {Count: 40, Sum: 120}, - "mem": {Count: 40, Sum: 160}, - }, - }, - }, - } - ) - - reportLoad := func(ls *perClusterStore) { - for category, count := range drops { - for i := 0; i < count; i++ { - ls.CallDropped(category) - } - } - for locality, data := range localityData { - for i := 0; i < data.start; i++ { - ls.CallStarted(locality) - } - for i := 0; i < data.success; i++ { - ls.CallFinished(locality, nil) - for n, d := range data.serverData { - ls.CallServerLoad(locality, n, d) - } - } - for i := 0; i < data.failure; i++ { - ls.CallFinished(locality, errTest) - } - } - } - - ls := perClusterStore{} - reportLoad(&ls) - gotStoreData := ls.stats() - if diff := cmp.Diff(wantStoreData, gotStoreData, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval")); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } - - // The above call to stats() should have reset all load reports except the - // inProgress rpc count. We are now going to push the same load data into - // the store. So, we should expect to see twice the count for inProgress. - for _, l := range localities { - ls := wantStoreData.LocalityStats[l] - ls.RequestStats.InProgress *= 2 - wantStoreData.LocalityStats[l] = ls - } - reportLoad(&ls) - gotStoreData = ls.stats() - if diff := cmp.Diff(wantStoreData, gotStoreData, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval")); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } -} - -var sortDataSlice = cmp.Transformer("SortDataSlice", func(in []*Data) []*Data { - out := append([]*Data(nil), in...) // Copy input to avoid mutating it - sort.Slice(out, - func(i, j int) bool { - if out[i].Cluster < out[j].Cluster { - return true - } - if out[i].Cluster == out[j].Cluster { - return out[i].Service < out[j].Service - } - return false - }, - ) - return out -}) - -// Test all load are returned for the given clusters, and all clusters are -// reported if no cluster is specified. 
-func TestStoreStats(t *testing.T) { - var ( - testClusters = []string{"c0", "c1", "c2"} - testServices = []string{"s0", "s1"} - testLocality = "test-locality" - ) - - store := NewStore() - for _, c := range testClusters { - for _, s := range testServices { - store.PerCluster(c, s).CallStarted(testLocality) - store.PerCluster(c, s).CallServerLoad(testLocality, "abc", 123) - store.PerCluster(c, s).CallDropped("dropped") - store.PerCluster(c, s).CallFinished(testLocality, nil) - } - } - - wantC0 := []*Data{ - { - Cluster: "c0", Service: "s0", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - { - Cluster: "c0", Service: "s1", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - } - // Call Stats with just "c0", this should return data for "c0", and not - // touch data for other clusters. - gotC0 := store.Stats([]string{"c0"}) - if diff := cmp.Diff(wantC0, gotC0, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval"), sortDataSlice); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } - - wantOther := []*Data{ - { - Cluster: "c1", Service: "s0", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - { - Cluster: "c1", Service: "s1", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - { - Cluster: "c2", Service: "s0", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - { - Cluster: "c2", Service: "s1", - TotalDrops: 1, Drops: map[string]uint64{"dropped": 1}, - LocalityStats: map[string]LocalityData{ - "test-locality": { - RequestStats: RequestData{Succeeded: 1, Issued: 1}, - LoadStats: map[string]ServerLoadData{"abc": {Count: 1, Sum: 123}}, - }, - }, - }, - } - // Call Stats with empty slice, this should return data for all the - // remaining clusters, and not include c0 (because c0 data was cleared). - gotOther := store.Stats(nil) - if diff := cmp.Diff(wantOther, gotOther, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval"), sortDataSlice); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } -} - -// Test the cases that if a cluster doesn't have load to report, its data is not -// appended to the slice returned by Stats(). -func TestStoreStatsEmptyDataNotReported(t *testing.T) { - var ( - testServices = []string{"s0", "s1"} - testLocality = "test-locality" - ) - - store := NewStore() - // "c0"'s RPCs all finish with success. 
- for _, s := range testServices { - store.PerCluster("c0", s).CallStarted(testLocality) - store.PerCluster("c0", s).CallFinished(testLocality, nil) - } - // "c1"'s RPCs never finish (always inprocess). - for _, s := range testServices { - store.PerCluster("c1", s).CallStarted(testLocality) - } - - want0 := []*Data{ - { - Cluster: "c0", Service: "s0", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{Succeeded: 1, Issued: 1}}, - }, - }, - { - Cluster: "c0", Service: "s1", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{Succeeded: 1, Issued: 1}}, - }, - }, - { - Cluster: "c1", Service: "s0", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{InProgress: 1, Issued: 1}}, - }, - }, - { - Cluster: "c1", Service: "s1", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{InProgress: 1, Issued: 1}}, - }, - }, - } - // Call Stats with empty slice, this should return data for all the - // clusters. - got0 := store.Stats(nil) - if diff := cmp.Diff(want0, got0, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval"), sortDataSlice); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } - - want1 := []*Data{ - { - Cluster: "c1", Service: "s0", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{InProgress: 1}}, - }, - }, - { - Cluster: "c1", Service: "s1", - LocalityStats: map[string]LocalityData{ - "test-locality": {RequestStats: RequestData{InProgress: 1}}, - }, - }, - } - // Call Stats with empty slice again, this should return data only for "c1", - // because "c0" data was cleared, but "c1" has in-progress RPCs. - got1 := store.Stats(nil) - if diff := cmp.Diff(want1, got1, cmpopts.EquateEmpty(), cmpopts.IgnoreFields(Data{}, "ReportInterval"), sortDataSlice); diff != "" { - t.Errorf("store.stats() returned unexpected diff (-want +got):\n%s", diff) - } -} diff --git a/xds/internal/xdsclient/metrics_test.go b/xds/internal/xdsclient/metrics_test.go index 369f7216411e..7a4ab0bd1934 100644 --- a/xds/internal/xdsclient/metrics_test.go +++ b/xds/internal/xdsclient/metrics_test.go @@ -32,6 +32,8 @@ import ( "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" + + _ "google.golang.org/grpc/xds/internal/httpfilter/router" // Register the router filter. ) type noopListenerWatcher struct{} diff --git a/xds/internal/xdsclient/pool.go b/xds/internal/xdsclient/pool.go index 4a9c0e0922f3..6b165d15c30b 100644 --- a/xds/internal/xdsclient/pool.go +++ b/xds/internal/xdsclient/pool.go @@ -25,15 +25,16 @@ import ( v3statuspb "github.com/envoyproxy/go-control-plane/envoy/service/status/v3" estats "google.golang.org/grpc/experimental/stats" - "google.golang.org/grpc/internal/backoff" istats "google.golang.org/grpc/internal/stats" "google.golang.org/grpc/internal/xds/bootstrap" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" + "google.golang.org/protobuf/proto" ) var ( // DefaultPool is the default pool for xDS clients. It is created at init // time by reading bootstrap configuration from env vars. 
- DefaultPool *Pool + DefaultPool = &Pool{clients: make(map[string]*clientRefCounted)} ) // Pool represents a pool of xDS clients that share the same bootstrap @@ -65,6 +66,15 @@ type OptionsForTesting struct { // MetricsRecorder is the metrics recorder the xDS Client will use. If // unspecified, uses a no-op MetricsRecorder. MetricsRecorder estats.MetricsRecorder + + // ResourceTypes is a map from resource type URLs to resource type + // implementations. Each resource type URL uniquely identifies a specific + // kind of xDS resource, and the corresponding resource type implementation + // provides logic for parsing, validating, and processing resources of that + // type. + // + // For example: "type.googleapis.com/envoy.config.listener.v3.Listener" + ResourceTypes map[string]gxdsclient.ResourceType } // NewPool creates a new xDS client pool with the given bootstrap config. @@ -89,7 +99,7 @@ func NewPool(config *bootstrap.Config) *Pool { // expected to invoke once they are done using the client. It is safe for the // caller to invoke this close function multiple times. func (p *Pool) NewClient(name string, metricsRecorder estats.MetricsRecorder) (XDSClient, func(), error) { - return p.newRefCounted(name, defaultWatchExpiryTimeout, backoff.DefaultExponential.Backoff, metricsRecorder) + return p.newRefCounted(name, metricsRecorder, nil) } // NewClientForTesting returns an xDS client configured with the provided @@ -116,7 +126,12 @@ func (p *Pool) NewClientForTesting(opts OptionsForTesting) (XDSClient, func(), e if opts.MetricsRecorder == nil { opts.MetricsRecorder = istats.NewMetricsRecorderList(nil) } - return p.newRefCounted(opts.Name, opts.WatchExpiryTimeout, opts.StreamBackoffAfterFailure, opts.MetricsRecorder) + c, cancel, err := p.newRefCounted(opts.Name, opts.MetricsRecorder, opts.ResourceTypes) + if err != nil { + return nil, nil, err + } + c.clientImpl.SetWatchExpiryTimeoutForTesting(opts.WatchExpiryTimeout) + return c, cancel, nil } // GetClientForTesting returns an xDS client created earlier using the given @@ -163,7 +178,15 @@ func (p *Pool) DumpResources() *v3statuspb.ClientStatusResponse { resp := &v3statuspb.ClientStatusResponse{} for key, client := range p.clients { - cfg := client.dumpResources() + b, err := client.clientImpl.DumpResources() + if err != nil { + return nil + } + r := &v3statuspb.ClientStatusResponse{} + if err := proto.Unmarshal(b, r); err != nil { + return nil + } + cfg := r.Config[0] cfg.ClientScope = key resp.Config = append(resp.Config, cfg) } @@ -208,14 +231,14 @@ func (p *Pool) clientRefCountedClose(name string) { // This attempts to close the transport to the management server and could // theoretically call back into the xdsclient package again and deadlock. // Hence, this needs to be called without holding the lock. - client.clientImpl.close() + client.clientImpl.Close() xdsClientImplCloseHook(name) } // newRefCounted creates a new reference counted xDS client implementation for // name, if one does not exist already. If an xDS client for the given name // exists, it gets a reference to it and returns it. 
-func (p *Pool) newRefCounted(name string, watchExpiryTimeout time.Duration, streamBackoff func(int) time.Duration, metricsRecorder estats.MetricsRecorder) (XDSClient, func(), error) { +func (p *Pool) newRefCounted(name string, metricsRecorder estats.MetricsRecorder, resourceTypes map[string]gxdsclient.ResourceType) (*clientRefCounted, func(), error) { p.mu.Lock() defer p.mu.Unlock() @@ -246,7 +269,7 @@ func (p *Pool) newRefCounted(name string, watchExpiryTimeout time.Duration, stre return c, sync.OnceFunc(func() { p.clientRefCountedClose(name) }), nil } - c, err := newClientImpl(p.config, watchExpiryTimeout, streamBackoff, metricsRecorder, name) + c, err := newClientImplGeneric(p.config, metricsRecorder, resourceTypes, name) if err != nil { return nil, nil, err } diff --git a/xds/internal/xdsclient/tests/ads_stream_ack_nack_test.go b/xds/internal/xdsclient/tests/ads_stream_ack_nack_test.go index 090faaa00de0..09e9b39edc0d 100644 --- a/xds/internal/xdsclient/tests/ads_stream_ack_nack_test.go +++ b/xds/internal/xdsclient/tests/ads_stream_ack_nack_test.go @@ -41,6 +41,25 @@ import ( v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" ) +// Creates an xDS client with the given bootstrap contents. +func createXDSClient(t *testing.T, bootstrapContents []byte) xdsclient.XDSClient { + t.Helper() + + config, err := bootstrap.NewConfigFromContents(bootstrapContents) + if err != nil { + t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) + } + pool := xdsclient.NewPool(config) + client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ + Name: t.Name(), + }) + if err != nil { + t.Fatalf("Failed to create xDS client: %v", err) + } + t.Cleanup(close) + return client +} + // Tests simple ACK and NACK scenarios on the ADS stream: // 1. When a good response is received, i.e. once that is expected to be ACKed, // the test verifies that an ACK is sent matching the version and nonce from diff --git a/xds/internal/xdsclient/tests/ads_stream_backoff_test.go b/xds/internal/xdsclient/tests/ads_stream_backoff_test.go deleted file mode 100644 index d0f5b215e266..000000000000 --- a/xds/internal/xdsclient/tests/ads_stream_backoff_test.go +++ /dev/null @@ -1,453 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package xdsclient_test - -import ( - "context" - "errors" - "fmt" - "testing" - "time" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/google/uuid" - "google.golang.org/grpc" - "google.golang.org/grpc/internal/testutils" - "google.golang.org/grpc/internal/testutils/xds/e2e" - "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" - "google.golang.org/protobuf/testing/protocmp" - - v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" - v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" -) - -// Creates an xDS client with the given bootstrap contents and backoff function. -func createXDSClientWithBackoff(t *testing.T, bootstrapContents []byte, streamBackoff func(int) time.Duration) xdsclient.XDSClient { - t.Helper() - - config, err := bootstrap.NewConfigFromContents(bootstrapContents) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - StreamBackoffAfterFailure: streamBackoff, - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - t.Cleanup(close) - return client -} - -// Tests the case where the management server returns an error in the ADS -// streaming RPC. Verifies that the ADS stream is restarted after a backoff -// period, and that the previously requested resources are re-requested on the -// new stream. -func (s) TestADS_BackoffAfterStreamFailure(t *testing.T) { - // Channels used for verifying different events in the test. - streamCloseCh := make(chan struct{}, 1) // ADS stream is closed. - ldsResourcesCh := make(chan []string, 1) // Listener resource names in the discovery request. - backoffCh := make(chan struct{}, 1) // Backoff after stream failure. - - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Create an xDS management server that returns RPC errors. - streamErr := errors.New("ADS stream error") - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ - OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error { - // Push the requested resource names on to a channel. - if req.GetTypeUrl() == version.V3ListenerURL { - t.Logf("Received LDS request for resources: %v", req.GetResourceNames()) - select { - case ldsResourcesCh <- req.GetResourceNames(): - case <-ctx.Done(): - } - } - // Return an error everytime a request is sent on the stream. This - // should cause the transport to backoff before attempting to - // recreate the stream. - return streamErr - }, - // Push on a channel whenever the stream is closed. - OnStreamClosed: func(int64, *v3corepb.Node) { - select { - case streamCloseCh <- struct{}{}: - case <-ctx.Done(): - } - }, - }) - - // Override the backoff implementation to push on a channel that is read by - // the test goroutine. 
- backoffCtx, backoffCancel := context.WithCancel(ctx) - streamBackoff := func(v int) time.Duration { - select { - case backoffCh <- struct{}{}: - case <-backoffCtx.Done(): - } - return 0 - } - defer backoffCancel() - - // Create an xDS client with bootstrap pointing to the above server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - client := createXDSClientWithBackoff(t, bc, streamBackoff) - - // Register a watch for a listener resource. - const listenerName = "listener" - lw := newListenerWatcher() - ldsCancel := xdsresource.WatchListener(client, listenerName, lw) - defer ldsCancel() - - // Verify that an ADS stream is created and an LDS request with the above - // resource name is sent. - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerName}); err != nil { - t.Fatal(err) - } - - // Verify that the received stream error is reported to the watcher. - if err := verifyListenerError(ctx, lw.updateCh, streamErr.Error(), nodeID); err != nil { - t.Fatal(err) - } - - // Verify that the stream is closed. - select { - case <-streamCloseCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for stream to be closed after an error") - } - - // Verify that the ADS stream backs off before recreating the stream. - select { - case <-backoffCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for ADS stream to backoff after stream failure") - } - - // Verify that the same resource name is re-requested on the new stream. - if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerName}); err != nil { - t.Fatal(err) - } - - // To prevent indefinite blocking during xDS client close, which is caused - // by a blocking backoff channel write, cancel the backoff context early - // given that the test is complete. - backoffCancel() - -} - -// Tests the case where a stream breaks because the server goes down. Verifies -// that when the server comes back up, the same resources are re-requested, this -// time with the previously acked version and an empty nonce. -func (s) TestADS_RetriesAfterBrokenStream(t *testing.T) { - // Channels used for verifying different events in the test. - streamRequestCh := make(chan *v3discoverypb.DiscoveryRequest, 1) // Discovery request is received. - streamResponseCh := make(chan *v3discoverypb.DiscoveryResponse, 1) // Discovery response is received. - - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Create an xDS management server listening on a local port. - l, err := testutils.LocalTCPListener() - if err != nil { - t.Fatalf("Failed to create a local listener for the xDS management server: %v", err) - } - lis := testutils.NewRestartableListener(l) - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ - Listener: lis, - // Push the received request on to a channel for the test goroutine to - // verify that it matches expectations. - OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error { - select { - case streamRequestCh <- req: - case <-ctx.Done(): - } - return nil - }, - // Push the response that the management server is about to send on to a - // channel. The test goroutine to uses this to extract the version and - // nonce, expected on subsequent requests. 
- OnStreamResponse: func(_ context.Context, _ int64, _ *v3discoverypb.DiscoveryRequest, resp *v3discoverypb.DiscoveryResponse) { - select { - case streamResponseCh <- resp: - case <-ctx.Done(): - } - }, - }) - - // Create a listener resource on the management server. - const listenerName = "listener" - const routeConfigName = "route-config" - nodeID := uuid.New().String() - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerName, routeConfigName)}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatal(err) - } - - // Override the backoff implementation to always return 0, to reduce test - // run time. Instead control when the backoff returns by blocking on a - // channel, that the test closes. - backoffCh := make(chan struct{}) - streamBackoff := func(v int) time.Duration { - select { - case backoffCh <- struct{}{}: - case <-ctx.Done(): - } - return 0 - } - - // Create an xDS client with bootstrap pointing to the above server. - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - client := createXDSClientWithBackoff(t, bc, streamBackoff) - - // Register a watch for a listener resource. - lw := newListenerWatcher() - ldsCancel := xdsresource.WatchListener(client, listenerName, lw) - defer ldsCancel() - - // Verify that the initial discovery request matches expectation. - var gotReq *v3discoverypb.DiscoveryRequest - select { - case gotReq = <-streamRequestCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for discovery request on the stream") - } - wantReq := &v3discoverypb.DiscoveryRequest{ - VersionInfo: "", - Node: &v3corepb.Node{ - Id: nodeID, - UserAgentName: "gRPC Go", - UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version}, - ClientFeatures: []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw"}, - }, - ResourceNames: []string{listenerName}, - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - ResponseNonce: "", - } - if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" { - t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff) - } - - // Capture the version and nonce from the response. - var gotResp *v3discoverypb.DiscoveryResponse - select { - case gotResp = <-streamResponseCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for discovery response on the stream") - } - version := gotResp.GetVersionInfo() - nonce := gotResp.GetNonce() - - // Verify that the ACK contains the appropriate version and nonce. - wantReq.VersionInfo = version - wantReq.ResponseNonce = nonce - select { - case gotReq = <-streamRequestCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for the discovery request ACK on the stream") - } - if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" { - t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff) - } - - // Verify the update received by the watcher. - wantUpdate := listenerUpdateErrTuple{ - update: xdsresource.ListenerUpdate{ - RouteConfigName: routeConfigName, - HTTPFilters: []xdsresource.HTTPFilter{{Name: "router"}}, - }, - } - if err := verifyListenerUpdate(ctx, lw.updateCh, wantUpdate); err != nil { - t.Fatal(err) - } - - // Bring down the management server to simulate a broken stream. - lis.Stop() - - // Verify that the error callback on the watcher is not invoked. 
- verifyNoListenerUpdate(ctx, lw.updateCh)
-
- // Wait for backoff to kick in, and unblock the first backoff attempt.
- select {
- case <-backoffCh:
- case <-ctx.Done():
- t.Fatal("Timeout waiting for stream backoff")
- }
-
- // Bring up the management server. The test does not have precise control
- // over when new streams to the management server will start succeeding. The
- // ADS stream implementation will backoff as many times as required before
- // it can successfully create a new stream. Therefore, we need to receive on
- // the backoffCh as many times as required, and unblock the backoff
- // implementation.
- lis.Restart()
- go func() {
- for {
- select {
- case <-backoffCh:
- case <-ctx.Done():
- return
- }
- }
- }()
-
- // Verify that the transport creates a new stream and sends out a new
- // request which contains the previously acked version, but an empty nonce.
- wantReq.ResponseNonce = ""
- select {
- case gotReq = <-streamRequestCh:
- case <-ctx.Done():
- t.Fatalf("Timeout waiting for the discovery request ACK on the stream")
- }
- if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" {
- t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff)
- }
-}
-
-// Tests the case where a resource is requested before a valid ADS stream
-// exists. Verifies that a discovery request is sent out for the previously
-// requested resource once a valid stream is created.
-func (s) TestADS_ResourceRequestedBeforeStreamCreation(t *testing.T) {
- ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
- defer cancel()
-
- // Channels used for verifying different events in the test.
- streamRequestCh := make(chan *v3discoverypb.DiscoveryRequest, 1) // Discovery request is received.
-
- // Create an xDS management server listening on a local port.
- l, err := testutils.LocalTCPListener()
- if err != nil {
- t.Fatalf("Failed to create a local listener: %v", err)
- }
- lis := testutils.NewRestartableListener(l)
- streamErr := errors.New("ADS stream error")
-
- mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
- Listener: lis,
-
- // Return an error every time a request is sent on the stream. This
- // should cause the transport to backoff before attempting to recreate
- // the stream.
- OnStreamRequest: func(id int64, req *v3discoverypb.DiscoveryRequest) error {
- select {
- case streamRequestCh <- req:
- default:
- }
- return streamErr
- },
- })
-
- // Bring down the management server before creating the transport. This
- // allows us to test the case where SendRequest() is called when there is no
- // stream to the management server.
- lis.Stop()
-
- // Override the backoff implementation to always return 0, to reduce test
- // run time. Instead control when the backoff returns by blocking on a
- // channel, that the test closes.
- backoffCh := make(chan struct{}, 1)
- unblockBackoffCh := make(chan struct{})
- streamBackoff := func(v int) time.Duration {
- select {
- case backoffCh <- struct{}{}:
- default:
- }
- <-unblockBackoffCh
- return 0
- }
-
- // Create an xDS client with bootstrap pointing to the above server.
- nodeID := uuid.New().String()
- bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address)
- client := createXDSClientWithBackoff(t, bc, streamBackoff)
-
- // Register a watch for a listener resource. 
- const listenerName = "listener" - lw := newListenerWatcher() - ldsCancel := xdsresource.WatchListener(client, listenerName, lw) - defer ldsCancel() - - // The above watch results in an attempt to create a new stream, which will - // fail, and will result in backoff. Wait for backoff to kick in. - select { - case <-backoffCh: - case <-ctx.Done(): - t.Fatal("Timeout waiting for stream backoff") - } - - // Bring up the connection to the management server, and unblock the backoff - // implementation. - lis.Restart() - close(unblockBackoffCh) - - // Verify that the initial discovery request matches expectation. - var gotReq *v3discoverypb.DiscoveryRequest - select { - case gotReq = <-streamRequestCh: - case <-ctx.Done(): - t.Fatalf("Timeout waiting for discovery request on the stream") - } - wantReq := &v3discoverypb.DiscoveryRequest{ - VersionInfo: "", - Node: &v3corepb.Node{ - Id: nodeID, - UserAgentName: "gRPC Go", - UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version}, - ClientFeatures: []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw"}, - }, - ResourceNames: []string{listenerName}, - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - ResponseNonce: "", - } - if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" { - t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff) - } -} - -// waitForResourceNames waits for the wantNames to be received on namesCh. -// Returns a non-nil error if the context expires before that. -func waitForResourceNames(ctx context.Context, t *testing.T, namesCh chan []string, wantNames []string) error { - t.Helper() - - var lastRequestedNames []string - for ; ; <-time.After(defaultTestShortTimeout) { - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for resources %v to be requested from the management server. Last requested resources: %v", wantNames, lastRequestedNames) - case gotNames := <-namesCh: - if cmp.Equal(gotNames, wantNames, cmpopts.EquateEmpty(), cmpopts.SortSlices(func(s1, s2 string) bool { return s1 < s2 })) { - return nil - } - lastRequestedNames = gotNames - } - } -} diff --git a/xds/internal/xdsclient/tests/ads_stream_flow_control_test.go b/xds/internal/xdsclient/tests/ads_stream_flow_control_test.go deleted file mode 100644 index ec1e3cef4d71..000000000000 --- a/xds/internal/xdsclient/tests/ads_stream_flow_control_test.go +++ /dev/null @@ -1,624 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package xdsclient_test - -import ( - "context" - "errors" - "slices" - "sort" - "testing" - "time" - - "github.com/google/uuid" - "google.golang.org/grpc" - "google.golang.org/grpc/internal/testutils/xds/e2e" - "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient" - xdsclientinternal "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" - - v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" - v3adsgrpc "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" - v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" -) - -// blockingListenerWatcher implements xdsresource.ListenerWatcher. It writes to -// a channel when it receives a callback from the watch. It also makes the -// DoneNotifier passed to the callback available to the test, thereby enabling -// the test to block this watcher for as long as required. -type blockingListenerWatcher struct { - doneNotifierCh chan func() // DoneNotifier passed to the callback. - updateCh chan struct{} // Written to when an update is received. - ambientErrCh chan struct{} // Written to when an ambient error is received. - resourceErrCh chan struct{} // Written to when a resource error is received. -} - -func newBLockingListenerWatcher() *blockingListenerWatcher { - return &blockingListenerWatcher{ - doneNotifierCh: make(chan func(), 1), - updateCh: make(chan struct{}, 1), - ambientErrCh: make(chan struct{}, 1), - resourceErrCh: make(chan struct{}, 1), - } -} - -func (lw *blockingListenerWatcher) ResourceChanged(update *xdsresource.ListenerResourceData, done func()) { - // Notify receipt of the update. - select { - case lw.updateCh <- struct{}{}: - default: - } - - select { - case lw.doneNotifierCh <- done: - default: - } -} - -func (lw *blockingListenerWatcher) ResourceError(err error, done func()) { - // Notify receipt of an error. - select { - case lw.resourceErrCh <- struct{}{}: - default: - } - - select { - case lw.doneNotifierCh <- done: - default: - } -} - -func (lw *blockingListenerWatcher) AmbientError(err error, done func()) { - // Notify receipt of an error. - select { - case lw.ambientErrCh <- struct{}{}: - default: - } - - select { - case lw.doneNotifierCh <- done: - default: - } -} - -type wrappedADSStream struct { - v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient - recvCh chan struct{} - doneCh <-chan struct{} -} - -func newWrappedADSStream(stream v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient, doneCh <-chan struct{}) *wrappedADSStream { - return &wrappedADSStream{ - AggregatedDiscoveryService_StreamAggregatedResourcesClient: stream, - recvCh: make(chan struct{}, 1), - doneCh: doneCh, - } -} - -func (w *wrappedADSStream) Recv() (*v3discoverypb.DiscoveryResponse, error) { - select { - case w.recvCh <- struct{}{}: - case <-w.doneCh: - return nil, errors.New("Recv() called after the test has finished") - } - return w.AggregatedDiscoveryService_StreamAggregatedResourcesClient.Recv() -} - -// Overrides the function to create a new ADS stream (used by the xdsclient -// transport), and returns a wrapped ADS stream, where the test can monitor -// Recv() calls. 
-func overrideADSStreamCreation(t *testing.T) chan *wrappedADSStream { - t.Helper() - - adsStreamCh := make(chan *wrappedADSStream, 1) - origNewADSStream := xdsclientinternal.NewADSStream - xdsclientinternal.NewADSStream = func(ctx context.Context, cc *grpc.ClientConn) (v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient, error) { - s, err := v3adsgrpc.NewAggregatedDiscoveryServiceClient(cc).StreamAggregatedResources(ctx) - if err != nil { - return nil, err - } - ws := newWrappedADSStream(s, ctx.Done()) - select { - case adsStreamCh <- ws: - default: - } - return ws, nil - } - t.Cleanup(func() { xdsclientinternal.NewADSStream = origNewADSStream }) - return adsStreamCh -} - -// Creates an xDS client with the given bootstrap contents. -func createXDSClient(t *testing.T, bootstrapContents []byte) xdsclient.XDSClient { - t.Helper() - - config, err := bootstrap.NewConfigFromContents(bootstrapContents) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - t.Cleanup(close) - return client -} - -// Tests ADS stream level flow control with a single resource. The test does the -// following: -// - Starts a management server and configures a listener resource on it. -// - Creates an xDS client to the above management server, starts a couple of -// listener watchers for the above resource, and verifies that the update -// reaches these watchers. -// - These watchers don't invoke the onDone callback until explicitly -// triggered by the test. This allows the test to verify that the next -// Recv() call on the ADS stream does not happen until both watchers have -// completely processed the update, i.e invoke the onDone callback. -// - Resource is updated on the management server, and the test verifies that -// the update reaches the watchers. -func (s) TestADSFlowControl_ResourceUpdates_SingleResource(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Override the ADS stream creation. - adsStreamCh := overrideADSStreamCreation(t) - - // Start an xDS management server. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - client := createXDSClient(t, bc) - - // Configure two watchers for the same listener resource. - const listenerResourceName = "test-listener-resource" - const routeConfigurationName = "test-route-configuration-resource" - watcher1 := newBLockingListenerWatcher() - cancel1 := xdsresource.WatchListener(client, listenerResourceName, watcher1) - defer cancel1() - watcher2 := newBLockingListenerWatcher() - cancel2 := xdsresource.WatchListener(client, listenerResourceName, watcher2) - defer cancel2() - - // Wait for the wrapped ADS stream to be created. - var adsStream *wrappedADSStream - select { - case adsStream = <-adsStreamCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be created") - } - - // Configure the listener resource on the management server. 
- resources := e2e.UpdateOptions{
- NodeID: nodeID,
- Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
- SkipValidation: true,
- }
- if err := mgmtServer.Update(ctx, resources); err != nil {
- t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
- }
-
- // Ensure that there is a read on the stream.
- select {
- case <-adsStream.recvCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for ADS stream to be read from")
- }
-
- // Wait for the update to reach the watchers.
- select {
- case <-watcher1.updateCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for update to reach watcher 1")
- }
- select {
- case <-watcher2.updateCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for update to reach watcher 2")
- }
-
- // Update the listener resource on the management server to point to a new
- // route configuration resource.
- resources = e2e.UpdateOptions{
- NodeID: nodeID,
- Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, "new-route")},
- SkipValidation: true,
- }
- if err := mgmtServer.Update(ctx, resources); err != nil {
- t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
- }
-
- // Unblock one watcher.
- onDone := <-watcher1.doneNotifierCh
- onDone()
-
- // Wait for a short duration and ensure that there is no read on the stream.
- select {
- case <-adsStream.recvCh:
- t.Fatal("Recv() called on the ADS stream before all watchers have processed the previous update")
- case <-time.After(defaultTestShortTimeout):
- }
-
- // Unblock the second watcher.
- onDone = <-watcher2.doneNotifierCh
- onDone()
-
- // Ensure that there is a read on the stream, now that the previous update
- // has been consumed by all watchers.
- select {
- case <-adsStream.recvCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream after all watchers have processed the previous update")
- }
-
- // Wait for the new update to reach the watchers.
- select {
- case <-watcher1.updateCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for update to reach watcher 1")
- }
- select {
- case <-watcher2.updateCh:
- case <-ctx.Done():
- t.Fatalf("Timed out waiting for update to reach watcher 2")
- }
-
- // At this point, the xDS client is shut down (and the associated transport
- // is closed) without the watchers invoking their respective onDone
- // callbacks. This verifies that closing a transport that has pending
- // watchers does not block.
-}
-
-// Tests ADS stream level flow control with multiple resources. The test does
-// the following:
-// - Starts a management server and configures two listener resources on it.
-// - Creates an xDS client to the above management server, starts a couple of
-// listener watchers for the two resources, and verifies that the update
-// reaches these watchers.
-// - These watchers don't invoke the onDone callback until explicitly
-// triggered by the test. This allows the test to verify that the next
-// Recv() call on the ADS stream does not happen until both watchers have
-// completely processed the update, i.e. invoke the onDone callback.
-func (s) TestADSFlowControl_ResourceUpdates_MultipleResources(t *testing.T) {
- ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
- defer cancel()
-
- // Override the ADS stream creation.
- adsStreamCh := overrideADSStreamCreation(t)
-
- // Start an xDS management server. 
- const listenerResourceName1 = "test-listener-resource-1" - const listenerResourceName2 = "test-listener-resource-2" - wantResourceNames := []string{listenerResourceName1, listenerResourceName2} - requestCh := make(chan struct{}, 1) - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ - OnStreamRequest: func(id int64, req *v3discoverypb.DiscoveryRequest) error { - if req.GetTypeUrl() != version.V3ListenerURL { - return nil - } - gotResourceNames := req.GetResourceNames() - sort.Slice(gotResourceNames, func(i, j int) bool { return req.ResourceNames[i] < req.ResourceNames[j] }) - if slices.Equal(gotResourceNames, wantResourceNames) { - // The two resource names will be part of the initial request - // and also the ACK. Hence, we need to make this write - // non-blocking. - select { - case requestCh <- struct{}{}: - default: - } - } - return nil - }, - }) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - client := createXDSClient(t, bc) - - // Configure two watchers for two different listener resources. - const routeConfigurationName1 = "test-route-configuration-resource-1" - watcher1 := newBLockingListenerWatcher() - cancel1 := xdsresource.WatchListener(client, listenerResourceName1, watcher1) - defer cancel1() - const routeConfigurationName2 = "test-route-configuration-resource-2" - watcher2 := newBLockingListenerWatcher() - cancel2 := xdsresource.WatchListener(client, listenerResourceName2, watcher2) - defer cancel2() - - // Wait for the wrapped ADS stream to be created. - var adsStream *wrappedADSStream - select { - case adsStream = <-adsStreamCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be created") - } - - // Ensure that there is a read on the stream. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be read from") - } - - // Wait for both resource names to be requested. - select { - case <-requestCh: - case <-ctx.Done(): - t.Fatal("Timed out waiting for both resource names to be requested") - } - - // Configure the listener resources on the management server. - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{ - e2e.DefaultClientListener(listenerResourceName1, routeConfigurationName1), - e2e.DefaultClientListener(listenerResourceName2, routeConfigurationName2), - }, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // At this point, we expect the management server to send both resources in - // the same response. So, both watchers would be notified at the same time, - // and no more Recv() calls should happen until both of them have invoked - // their respective onDone() callbacks. - - // The order of callback invocations among the two watchers is not - // guaranteed. So, we select on both of them and unblock the first watcher - // whose callback is invoked. 
- var otherWatcherUpdateCh chan struct{} - var otherWatcherDoneCh chan func() - select { - case <-watcher1.updateCh: - onDone := <-watcher1.doneNotifierCh - onDone() - otherWatcherUpdateCh = watcher2.updateCh - otherWatcherDoneCh = watcher2.doneNotifierCh - case <-watcher2.updateCh: - onDone := <-watcher2.doneNotifierCh - onDone() - otherWatcherUpdateCh = watcher1.updateCh - otherWatcherDoneCh = watcher1.doneNotifierCh - case <-ctx.Done(): - t.Fatal("Timed out waiting for update to reach first watchers") - } - - // Wait for a short duration and ensure that there is no read on the stream. - select { - case <-adsStream.recvCh: - t.Fatal("Recv() called on the ADS stream before all watchers have processed the previous update") - case <-time.After(defaultTestShortTimeout): - } - - // Wait for the update on the second watcher and unblock it. - select { - case <-otherWatcherUpdateCh: - onDone := <-otherWatcherDoneCh - onDone() - case <-ctx.Done(): - t.Fatal("Timed out waiting for update to reach second watcher") - } - - // Ensure that there is a read on the stream, now that the previous update - // has been consumed by all watchers. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream after all watchers have processed the previous update") - } -} - -// Test ADS stream flow control with a single resource that is expected to be -// NACKed by the xDS client and the watcher's ResourceError() callback is -// expected to be invoked because resource is not cached. Verifies that no -// further reads are attempted until the error is completely processed by the -// watcher. -func (s) TestADSFlowControl_ResourceErrors(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Override the ADS stream creation. - adsStreamCh := overrideADSStreamCreation(t) - - // Start an xDS management server. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - client := createXDSClient(t, bc) - - // Configure a watcher for a listener resource. - const listenerResourceName = "test-listener-resource" - watcher := newBLockingListenerWatcher() - cancel = xdsresource.WatchListener(client, listenerResourceName, watcher) - defer cancel() - - // Wait for the wrapped ADS stream to be created. - var adsStream *wrappedADSStream - select { - case adsStream = <-adsStreamCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be created") - } - - // Configure the management server to return a single listener resource - // which is expected to be NACKed by the client. - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{badListenerResource(t, listenerResourceName)}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Ensure that there is a read on the stream. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be read from") - } - - // Wait for the resource error to reach the watcher. 
- select { - case <-watcher.resourceErrCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for error to reach watcher") - } - - // Wait for a short duration and ensure that there is no read on the stream. - select { - case <-adsStream.recvCh: - t.Fatal("Recv() called on the ADS stream before all watchers have processed the previous update") - case <-time.After(defaultTestShortTimeout): - } - - // Unblock one watcher. - onDone := <-watcher.doneNotifierCh - onDone() - - // Ensure that there is a read on the stream, now that the previous error - // has been consumed by the watcher. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream after all watchers have processed the previous update") - } -} - -// Test ADS stream flow control with a single resource that is deleted from the -// management server and therefore the watcher's ResourceError() -// callback is expected to be invoked. Verifies that no further reads are -// attempted until the callback is completely handled by the watcher. -func (s) TestADSFlowControl_ResourceDoesNotExist(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Override the ADS stream creation. - adsStreamCh := overrideADSStreamCreation(t) - - // Start an xDS management server. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - client := createXDSClient(t, bc) - - // Configure a watcher for a listener resource. - const listenerResourceName = "test-listener-resource" - const routeConfigurationName = "test-route-configuration-resource" - watcher := newBLockingListenerWatcher() - cancel = xdsresource.WatchListener(client, listenerResourceName, watcher) - defer cancel() - - // Wait for the wrapped ADS stream to be created. - var adsStream *wrappedADSStream - select { - case adsStream = <-adsStreamCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for ADS stream to be created") - } - - // Configure the listener resource on the management server. - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Ensure that there is a read on the stream. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream") - } - - // Wait for the update to reach the watcher and unblock it. - select { - case <-watcher.updateCh: - onDone := <-watcher.doneNotifierCh - onDone() - case <-ctx.Done(): - t.Fatalf("Timed out waiting for update to reach watcher 1") - } - - // Ensure that there is a read on the stream. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream") - } - - // Remove the listener resource on the management server. 
- resources = e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Wait for the resource not found callback to be invoked. - select { - case <-watcher.resourceErrCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for resource not found callback to be invoked on the watcher") - } - - // Wait for a short duration and ensure that there is no read on the stream. - select { - case <-adsStream.recvCh: - t.Fatal("Recv() called on the ADS stream before all watchers have processed the previous update") - case <-time.After(defaultTestShortTimeout): - } - - // Unblock the watcher. - onDone := <-watcher.doneNotifierCh - onDone() - - // Ensure that there is a read on the stream. - select { - case <-adsStream.recvCh: - case <-ctx.Done(): - t.Fatalf("Timed out waiting for Recv() to be called on the ADS stream") - } -} diff --git a/xds/internal/xdsclient/tests/ads_stream_restart_test.go b/xds/internal/xdsclient/tests/ads_stream_restart_test.go index 522ecae6bfa5..a53f96fb6623 100644 --- a/xds/internal/xdsclient/tests/ads_stream_restart_test.go +++ b/xds/internal/xdsclient/tests/ads_stream_restart_test.go @@ -20,8 +20,12 @@ package xdsclient_test import ( "context" + "fmt" "testing" + "time" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" "google.golang.org/grpc/internal/testutils" "google.golang.org/grpc/internal/testutils/xds/e2e" @@ -36,6 +40,25 @@ import ( v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" ) +// waitForResourceNames waits for the wantNames to be received on namesCh. +// Returns a non-nil error if the context expires before that. +func waitForResourceNames(ctx context.Context, t *testing.T, namesCh chan []string, wantNames []string) error { + t.Helper() + + var lastRequestedNames []string + for ; ; <-time.After(defaultTestShortTimeout) { + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for resources %v to be requested from the management server. Last requested resources: %v", wantNames, lastRequestedNames) + case gotNames := <-namesCh: + if cmp.Equal(gotNames, wantNames, cmpopts.EquateEmpty(), cmpopts.SortSlices(func(s1, s2 string) bool { return s1 < s2 })) { + return nil + } + lastRequestedNames = gotNames + } + } +} + // Tests that an ADS stream is restarted after a connection failure. Also // verifies that if there were any watches registered before the connection // failed, those resources are re-requested after the stream is restarted. diff --git a/xds/internal/xdsclient/tests/ads_stream_watch_test.go b/xds/internal/xdsclient/tests/ads_stream_watch_test.go deleted file mode 100644 index 2672b0f29820..000000000000 --- a/xds/internal/xdsclient/tests/ads_stream_watch_test.go +++ /dev/null @@ -1,209 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package xdsclient_test - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/google/uuid" - "google.golang.org/grpc/internal/testutils" - "google.golang.org/grpc/internal/testutils/xds/e2e" - "google.golang.org/grpc/internal/xds/bootstrap" - xdsinternal "google.golang.org/grpc/xds/internal" - "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/transport/ads" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" - - v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" -) - -// Tests the state transitions of the resource specific watch state within the -// ADS stream, specifically when the stream breaks (for both resources that have -// been previously received and for resources that are yet to be received). -func (s) TestADS_WatchState_StreamBreaks(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Create an xDS management server with a restartable listener. - l, err := testutils.LocalTCPListener() - if err != nil { - t.Fatalf("Failed to create a local listener for the xDS management server: %v", err) - } - lis := testutils.NewRestartableListener(l) - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: lis}) - - // Create an xDS client with bootstrap pointing to the above server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - client := createXDSClient(t, bc) - - // Create a watch for the first listener resource and verify that the timer - // is running and the watch state is `requested`. - const listenerName1 = "listener1" - ldsCancel1 := xdsresource.WatchListener(client, listenerName1, noopListenerWatcher{}) - defer ldsCancel1() - if err := waitForResourceWatchState(ctx, client, listenerName1, ads.ResourceWatchStateRequested, true); err != nil { - t.Fatal(err) - } - - // Configure the first resource on the management server. This should result - // in the resource being pushed to the xDS client and should result in the - // timer getting stopped and the watch state moving to `received`. - const routeConfigName = "route-config" - listenerResource1 := e2e.DefaultClientListener(listenerName1, routeConfigName) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listenerResource1}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatal(err) - } - if err := waitForResourceWatchState(ctx, client, listenerName1, ads.ResourceWatchStateReceived, false); err != nil { - t.Fatal(err) - } - - // Create a watch for the second listener resource and verify that the timer - // is running and the watch state is `requested`. - const listenerName2 = "listener2" - ldsCancel2 := xdsresource.WatchListener(client, listenerName2, noopListenerWatcher{}) - defer ldsCancel2() - if err := waitForResourceWatchState(ctx, client, listenerName2, ads.ResourceWatchStateRequested, true); err != nil { - t.Fatal(err) - } - - // Stop the server to break the ADS stream. Since the first resource was - // already received, this should not change anything for it. 
But for the - // second resource, it should result in the timer getting stopped and the - // watch state moving to `started`. - lis.Stop() - if err := waitForResourceWatchState(ctx, client, listenerName2, ads.ResourceWatchStateStarted, false); err != nil { - t.Fatal(err) - } - if err := verifyResourceWatchState(client, listenerName1, ads.ResourceWatchStateReceived, false); err != nil { - t.Fatal(err) - } - - // Restart the server and verify that the timer is running and the watch - // state is `requested`, for the second resource. For the first resource, - // nothing should change. - lis.Restart() - if err := waitForResourceWatchState(ctx, client, listenerName2, ads.ResourceWatchStateRequested, true); err != nil { - t.Fatal(err) - } - if err := verifyResourceWatchState(client, listenerName1, ads.ResourceWatchStateReceived, false); err != nil { - t.Fatal(err) - } - - // Configure the second resource on the management server. This should result - // in the resource being pushed to the xDS client and should result in the - // timer getting stopped and the watch state moving to `received`. - listenerResource2 := e2e.DefaultClientListener(listenerName2, routeConfigName) - resources = e2e.UpdateOptions{ - NodeID: nodeID, - Listeners: []*v3listenerpb.Listener{listenerResource1, listenerResource2}, - SkipValidation: true, - } - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatal(err) - } - if err := waitForResourceWatchState(ctx, client, listenerName2, ads.ResourceWatchStateReceived, false); err != nil { - t.Fatal(err) - } -} - -// Tests the behavior of the xDS client when a resource watch timer expires and -// verifies the resource watch state transitions as expected. -func (s) TestADS_WatchState_TimerFires(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Start an xDS management server. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create an xDS client with bootstrap pointing to the above server, and a - // short resource expiry timeout. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - config, err := bootstrap.NewConfigFromContents(bc) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bc), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - WatchExpiryTimeout: defaultTestWatchExpiryTimeout, - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - defer close() - - // Create a watch for the first listener resource and verify that the timer - // is running and the watch state is `requested`. - const listenerName = "listener" - ldsCancel1 := xdsresource.WatchListener(client, listenerName, noopListenerWatcher{}) - defer ldsCancel1() - if err := waitForResourceWatchState(ctx, client, listenerName, ads.ResourceWatchStateRequested, true); err != nil { - t.Fatal(err) - } - - // Since the resource is not configured on the management server, the watch - // expiry timer is expected to fire, and the watch state should move to - // `timeout`. 
- if err := waitForResourceWatchState(ctx, client, listenerName, ads.ResourceWatchStateTimeout, false); err != nil { - t.Fatal(err) - } -} - -func waitForResourceWatchState(ctx context.Context, client xdsclient.XDSClient, resourceName string, wantState ads.WatchState, wantTimer bool) error { - var lastErr error - for ; ctx.Err() == nil; <-time.After(defaultTestShortTimeout) { - err := verifyResourceWatchState(client, resourceName, wantState, wantTimer) - if err == nil { - break - } - lastErr = err - } - if ctx.Err() != nil { - return fmt.Errorf("timeout when waiting for expected watch state for resource %q: %v", resourceName, lastErr) - } - return nil -} - -func verifyResourceWatchState(client xdsclient.XDSClient, resourceName string, wantState ads.WatchState, wantTimer bool) error { - resourceWatchStateForTesting := internal.ResourceWatchStateForTesting.(func(xdsclient.XDSClient, xdsresource.Type, string) (ads.ResourceWatchState, error)) - listenerResourceType := xdsinternal.ResourceTypeMapForTesting[version.V3ListenerURL].(xdsresource.Type) - gotState, err := resourceWatchStateForTesting(client, listenerResourceType, resourceName) - if err != nil { - return fmt.Errorf("failed to get watch state for resource %q: %v", resourceName, err) - } - if gotState.State != wantState { - return fmt.Errorf("watch state for resource %q is %v, want %v", resourceName, gotState.State, wantState) - } - if (gotState.ExpiryTimer != nil) != wantTimer { - return fmt.Errorf("expiry timer for resource %q is %t, want %t", resourceName, gotState.ExpiryTimer != nil, wantTimer) - } - return nil -} diff --git a/xds/internal/xdsclient/tests/cds_watchers_test.go b/xds/internal/xdsclient/tests/cds_watchers_test.go index 82d7a06b91cf..948a0299148e 100644 --- a/xds/internal/xdsclient/tests/cds_watchers_test.go +++ b/xds/internal/xdsclient/tests/cds_watchers_test.go @@ -111,8 +111,8 @@ func verifyClusterUpdate(ctx context.Context, updateCh *testutils.Channel, wantU } got := u.(clusterUpdateErrTuple) if wantUpdate.err != nil { - if gotType, wantType := xdsresource.ErrType(got.err), xdsresource.ErrType(wantUpdate.err); gotType != wantType { - return fmt.Errorf("received update with error type %v, want %v", gotType, wantType) + if got.err == nil || !strings.Contains(got.err.Error(), wantUpdate.err.Error()) { + return fmt.Errorf("update received with error: %v, want %q", got.err, wantUpdate.err) } } cmpOpts := []cmp.Option{cmpopts.EquateEmpty(), cmpopts.IgnoreFields(xdsresource.ClusterUpdate{}, "Raw", "LBPolicy", "TelemetryLabels")} diff --git a/xds/internal/xdsclient/tests/eds_watchers_test.go b/xds/internal/xdsclient/tests/eds_watchers_test.go index fcb7de4c4a65..63648c56a00a 100644 --- a/xds/internal/xdsclient/tests/eds_watchers_test.go +++ b/xds/internal/xdsclient/tests/eds_watchers_test.go @@ -121,8 +121,8 @@ func verifyEndpointsUpdate(ctx context.Context, updateCh *testutils.Channel, wan } got := u.(endpointsUpdateErrTuple) if wantUpdate.err != nil { - if gotType, wantType := xdsresource.ErrType(got.err), xdsresource.ErrType(wantUpdate.err); gotType != wantType { - return fmt.Errorf("received update with error type %v, want %v", gotType, wantType) + if got.err == nil || !strings.Contains(got.err.Error(), wantUpdate.err.Error()) { + return fmt.Errorf("update received with error: %v, want %q", got.err, wantUpdate.err) } } cmpOpts := []cmp.Option{cmpopts.EquateEmpty(), cmpopts.IgnoreFields(xdsresource.EndpointsUpdate{}, "Raw")} diff --git a/xds/internal/xdsclient/tests/lds_watchers_test.go 
b/xds/internal/xdsclient/tests/lds_watchers_test.go index f75b572a4c22..b2498094b358 100644 --- a/xds/internal/xdsclient/tests/lds_watchers_test.go +++ b/xds/internal/xdsclient/tests/lds_watchers_test.go @@ -161,8 +161,8 @@ func verifyListenerUpdate(ctx context.Context, updateCh *testutils.Channel, want } got := u.(listenerUpdateErrTuple) if wantUpdate.err != nil { - if gotType, wantType := xdsresource.ErrType(got.err), xdsresource.ErrType(wantUpdate.err); gotType != wantType { - return fmt.Errorf("received update with error type %v, want %v", gotType, wantType) + if got.err == nil || !strings.Contains(got.err.Error(), wantUpdate.err.Error()) { + return fmt.Errorf("update received with error: %v, want %q", got.err, wantUpdate.err) } } cmpOpts := []cmp.Option{ @@ -176,21 +176,6 @@ func verifyListenerUpdate(ctx context.Context, updateCh *testutils.Channel, want return nil } -func verifyListenerError(ctx context.Context, updateCh *testutils.Channel, wantErr, wantNodeID string) error { - u, err := updateCh.Receive(ctx) - if err != nil { - return fmt.Errorf("timeout when waiting for a listener error from the management server: %v", err) - } - gotErr := u.(listenerUpdateErrTuple).err - if gotErr == nil || !strings.Contains(gotErr.Error(), wantErr) { - return fmt.Errorf("update received with error: %v, want %q", gotErr, wantErr) - } - if !strings.Contains(gotErr.Error(), wantNodeID) { - return fmt.Errorf("update received with error: %v, want error with node ID: %q", gotErr, wantNodeID) - } - return nil -} - func verifyErrorType(ctx context.Context, updateCh *testutils.Channel, wantErrType xdsresource.ErrorType, wantNodeID string) error { u, err := updateCh.Receive(ctx) if err != nil { @@ -1072,7 +1057,7 @@ func (s) TestLDSWatch_NACKError(t *testing.T) { } // Verify that the expected error is propagated to the existing watcher. - if err := verifyErrorType(ctx, lw.updateCh, xdsresource.ErrorTypeNACKed, nodeID); err != nil { + if err := verifyErrorType(ctx, lw.updateCh, xdsresource.ErrorTypeUnknown, nodeID); err != nil { t.Fatal(err) } @@ -1080,7 +1065,7 @@ func (s) TestLDSWatch_NACKError(t *testing.T) { lw2 := newListenerWatcher() ldsCancel2 := xdsresource.WatchListener(client, ldsName, lw2) defer ldsCancel2() - if err := verifyErrorType(ctx, lw2.updateCh, xdsresource.ErrorTypeNACKed, nodeID); err != nil { + if err := verifyErrorType(ctx, lw2.updateCh, xdsresource.ErrorTypeUnknown, nodeID); err != nil { t.Fatal(err) } } @@ -1152,7 +1137,7 @@ func (s) TestLDSWatch_ResourceCaching_NACKError(t *testing.T) { } // Verify that the expected error is propagated to the existing watcher. - if err := verifyErrorType(ctx, lw1.updateCh, xdsresource.ErrorTypeNACKed, nodeID); err != nil { + if err := verifyErrorType(ctx, lw1.updateCh, xdsresource.ErrorTypeUnknown, nodeID); err != nil { t.Fatal(err) } @@ -1165,7 +1150,7 @@ func (s) TestLDSWatch_ResourceCaching_NACKError(t *testing.T) { t.Fatal(err) } // Verify that the expected error is propagated to the existing watcher. - if err := verifyErrorType(ctx, lw2.updateCh, xdsresource.ErrorTypeNACKed, nodeID); err != nil { + if err := verifyErrorType(ctx, lw2.updateCh, xdsresource.ErrorTypeUnknown, nodeID); err != nil { t.Fatal(err) } } @@ -1243,7 +1228,7 @@ func (s) TestLDSWatch_PartialValid(t *testing.T) { // Verify that the expected error is propagated to the watcher which // requested for the bad resource. // Verify that the expected error is propagated to the existing watcher. 
- if err := verifyErrorType(ctx, lw1.updateCh, xdsresource.ErrorTypeNACKed, nodeID); err != nil { + if err := verifyErrorType(ctx, lw1.updateCh, xdsresource.ErrorTypeUnknown, nodeID); err != nil { t.Fatal(err) } diff --git a/xds/internal/xdsclient/tests/loadreport_test.go b/xds/internal/xdsclient/tests/loadreport_test.go index c249a3ace07e..a93b5bc1cbee 100644 --- a/xds/internal/xdsclient/tests/loadreport_test.go +++ b/xds/internal/xdsclient/tests/loadreport_test.go @@ -35,6 +35,7 @@ import ( "google.golang.org/grpc/internal/testutils/xds/fakeserver" "google.golang.org/grpc/internal/xds/bootstrap" "google.golang.org/grpc/status" + "google.golang.org/grpc/xds/internal/clients" "google.golang.org/protobuf/testing/protocmp" v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -44,13 +45,13 @@ import ( ) const ( - testLocality1 = `{region="test-region1", zone="", sub_zone=""}` - testLocality2 = `{region="test-region2", zone="", sub_zone=""}` - testKey1 = "test-key1" - testKey2 = "test-key2" + testKey1 = "test-key1" + testKey2 = "test-key2" ) var ( + testLocality1 = clients.Locality{Region: "test-region1"} + testLocality2 = clients.Locality{Region: "test-region2"} toleranceCmpOpt = cmpopts.EquateApprox(0, 1e-5) ignoreOrderCmpOpt = protocmp.FilterField(&v3endpointpb.ClusterStats{}, "upstream_locality_stats", cmpopts.SortSlices(func(a, b protocmp.Message) bool { @@ -143,7 +144,9 @@ func (s) TestReportLoad_ConnectionCreation(t *testing.T) { // Call the load reporting API to report load to the first management // server, and ensure that a connection to the server is created. store1, lrsCancel1 := client.ReportLoad(serverCfg1) - defer lrsCancel1() + sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel() + defer lrsCancel1(sCtx) if _, err := newConnChan1.Receive(ctx); err != nil { t.Fatal("Timeout when waiting for a connection to the first management server, after starting load reporting") } @@ -158,7 +161,9 @@ func (s) TestReportLoad_ConnectionCreation(t *testing.T) { // Call the load reporting API to report load to the second management // server, and ensure that a connection to the server is created. store2, lrsCancel2 := client.ReportLoad(serverCfg2) - defer lrsCancel2() + sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel2() + defer lrsCancel2(sCtx2) if _, err := newConnChan2.Receive(ctx); err != nil { t.Fatal("Timeout when waiting for a connection to the second management server, after starting load reporting") } @@ -171,7 +176,7 @@ func (s) TestReportLoad_ConnectionCreation(t *testing.T) { } // Push some loads on the received store. - store2.PerCluster("cluster", "eds").CallDropped("test") + store2.ReporterForCluster("cluster", "eds").CallDropped("test") // Ensure the initial load reporting request is received at the server. lrsServer := mgmtServer2.LRSServer @@ -226,7 +231,9 @@ func (s) TestReportLoad_ConnectionCreation(t *testing.T) { } // Cancel this load reporting stream, server should see error canceled. - lrsCancel2() + sCtx2, sCancel2 = context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel2() + lrsCancel2(sCtx2) // Server should receive a stream canceled error. There may be additional // load reports from the client in the channel. @@ -280,14 +287,14 @@ func (s) TestReportLoad_StreamCreation(t *testing.T) { } // Push some loads on the received store. 
- store1.PerCluster("cluster1", "eds1").CallDropped("test") - store1.PerCluster("cluster1", "eds1").CallStarted(testLocality1) - store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 3.14) - store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 2.718) - store1.PerCluster("cluster1", "eds1").CallFinished(testLocality1, nil) - store1.PerCluster("cluster1", "eds1").CallStarted(testLocality2) - store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality2, testKey2, 1.618) - store1.PerCluster("cluster1", "eds1").CallFinished(testLocality2, nil) + store1.ReporterForCluster("cluster1", "eds1").CallDropped("test") + store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality1) + store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 3.14) + store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 2.718) + store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality1, nil) + store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality2) + store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality2, testKey2, 1.618) + store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality2, nil) // Ensure the initial load reporting request is received at the server. req, err := lrsServer.LRSRequestChan.Receive(ctx) @@ -367,7 +374,7 @@ func (s) TestReportLoad_StreamCreation(t *testing.T) { } // Push more loads. - store2.PerCluster("cluster2", "eds2").CallDropped("test") + store2.ReporterForCluster("cluster2", "eds2").CallDropped("test") // Ensure that loads are seen on the server. We need a loop here because // there could have been some requests from the client in the time between @@ -402,7 +409,9 @@ func (s) TestReportLoad_StreamCreation(t *testing.T) { // Cancel the first load reporting call, and ensure that the stream does not // close (because we have another call open). - cancel1() + sCtx1, sCancel1 := context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel1() + cancel1(sCtx1) sCtx, sCancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) defer sCancel() if _, err := lrsServer.LRSStreamCloseChan.Receive(sCtx); err != context.DeadlineExceeded { @@ -410,7 +419,9 @@ func (s) TestReportLoad_StreamCreation(t *testing.T) { } // Cancel the second load reporting call, and ensure the stream is closed. - cancel2() + sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel2() + cancel2(sCtx2) if _, err := lrsServer.LRSStreamCloseChan.Receive(ctx); err != nil { t.Fatal("Timeout waiting for LRS stream to close") } @@ -422,5 +433,7 @@ func (s) TestReportLoad_StreamCreation(t *testing.T) { if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil { t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err) } - cancel3() + sCtx3, sCancel3 := context.WithTimeout(ctx, defaultTestShortTimeout) + defer sCancel3() + cancel3(sCtx3) } diff --git a/xds/internal/xdsclient/tests/misc_watchers_test.go b/xds/internal/xdsclient/tests/misc_watchers_test.go deleted file mode 100644 index f448a3430e74..000000000000 --- a/xds/internal/xdsclient/tests/misc_watchers_test.go +++ /dev/null @@ -1,508 +0,0 @@ -/* - * - * Copyright 2022 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package xdsclient_test - -import ( - "context" - "encoding/json" - "fmt" - "strings" - "testing" - - "github.com/google/uuid" - "google.golang.org/grpc" - "google.golang.org/grpc/internal/testutils" - "google.golang.org/grpc/internal/testutils/xds/e2e" - "google.golang.org/grpc/internal/testutils/xds/fakeserver" - "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal" - xdstestutils "google.golang.org/grpc/xds/internal/testutils" - "google.golang.org/grpc/xds/internal/xdsclient" - xdsclientinternal "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" - "google.golang.org/protobuf/types/known/anypb" - - v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" - v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" -) - -var ( - // Resource type implementations retrieved from the resource type map in the - // internal package, which is initialized when the individual resource types - // are created. - listenerResourceType = internal.ResourceTypeMapForTesting[version.V3ListenerURL].(xdsresource.Type) - routeConfigResourceType = internal.ResourceTypeMapForTesting[version.V3RouteConfigURL].(xdsresource.Type) -) - -// This route configuration watcher registers two watches corresponding to the -// names passed in at creation time on a valid update. -type testRouteConfigWatcher struct { - client xdsclient.XDSClient - name1, name2 string - rcw1, rcw2 *routeConfigWatcher - cancel1, cancel2 func() - updateCh *testutils.Channel -} - -func newTestRouteConfigWatcher(client xdsclient.XDSClient, name1, name2 string) *testRouteConfigWatcher { - return &testRouteConfigWatcher{ - client: client, - name1: name1, - name2: name2, - rcw1: newRouteConfigWatcher(), - rcw2: newRouteConfigWatcher(), - updateCh: testutils.NewChannel(), - } -} - -func (rw *testRouteConfigWatcher) ResourceChanged(update *xdsresource.RouteConfigResourceData, onDone func()) { - rw.updateCh.Send(routeConfigUpdateErrTuple{update: update.Resource}) - - rw.cancel1 = xdsresource.WatchRouteConfig(rw.client, rw.name1, rw.rcw1) - rw.cancel2 = xdsresource.WatchRouteConfig(rw.client, rw.name2, rw.rcw2) - onDone() -} - -func (rw *testRouteConfigWatcher) ResourceError(err error, onDone func()) { - // When used with a go-control-plane management server that continuously - // resends resources which are NACKed by the xDS client, using a `Replace()` - // here and in AmbientError() simplifies tests which will have - // access to the most recently received error. 
- rw.updateCh.Replace(routeConfigUpdateErrTuple{err: err}) - onDone() -} - -func (rw *testRouteConfigWatcher) AmbientError(err error, onDone func()) { - rw.updateCh.Replace(routeConfigUpdateErrTuple{err: err}) - onDone() -} - -func (rw *testRouteConfigWatcher) cancel() { - rw.cancel1() - rw.cancel2() -} - -// TestWatchCallAnotherWatch tests the scenario where a watch is registered for -// a resource, and more watches are registered from the first watch's callback. -// The test verifies that this scenario does not lead to a deadlock. -func (s) TestWatchCallAnotherWatch(t *testing.T) { - // Start an xDS management server and set the option to allow it to respond - // to requests which only specify a subset of the configured resources. - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{AllowResourceSubset: true}) - - nodeID := uuid.New().String() - authority := makeAuthorityName(t.Name()) - bc, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ - Servers: []byte(fmt.Sprintf(`[{ - "server_uri": %q, - "channel_creds": [{"type": "insecure"}] - }]`, mgmtServer.Address)), - Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), - Authorities: map[string]json.RawMessage{ - // Xdstp style resource names used in this test use a slash removed - // version of t.Name as their authority, and the empty config - // results in the top-level xds server configuration being used for - // this authority. - authority: []byte(`{}`), - }, - }) - if err != nil { - t.Fatalf("Failed to create bootstrap configuration: %v", err) - } - - // Create an xDS client with the above bootstrap contents. - config, err := bootstrap.NewConfigFromContents(bc) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bc), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - defer close() - - // Configure the management server to respond with route config resources. - ldsNameNewStyle := makeNewStyleLDSName(authority) - rdsNameNewStyle := makeNewStyleRDSName(authority) - resources := e2e.UpdateOptions{ - NodeID: nodeID, - Routes: []*v3routepb.RouteConfiguration{ - e2e.DefaultRouteConfig(rdsName, ldsName, cdsName), - e2e.DefaultRouteConfig(rdsNameNewStyle, ldsNameNewStyle, cdsName), - }, - SkipValidation: true, - } - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - if err := mgmtServer.Update(ctx, resources); err != nil { - t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) - } - - // Create a route configuration watcher that registers two more watches from - // the OnUpdate callback: - // - one for the same resource name as this watch, which would be - // satisfied from xdsClient cache - // - the other for a different resource name, which would be - // satisfied from the server - rw := newTestRouteConfigWatcher(client, rdsName, rdsNameNewStyle) - defer rw.cancel() - rdsCancel := xdsresource.WatchRouteConfig(client, rdsName, rw) - defer rdsCancel() - - // Verify the contents of the received update for the all watchers. 
- wantUpdate12 := routeConfigUpdateErrTuple{ - update: xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{ldsName}, - Routes: []*xdsresource.Route{ - { - Prefix: newStringP("/"), - ActionType: xdsresource.RouteActionRoute, - WeightedClusters: map[string]xdsresource.WeightedCluster{cdsName: {Weight: 100}}, - }, - }, - }, - }, - }, - } - wantUpdate3 := routeConfigUpdateErrTuple{ - update: xdsresource.RouteConfigUpdate{ - VirtualHosts: []*xdsresource.VirtualHost{ - { - Domains: []string{ldsNameNewStyle}, - Routes: []*xdsresource.Route{ - { - Prefix: newStringP("/"), - ActionType: xdsresource.RouteActionRoute, - WeightedClusters: map[string]xdsresource.WeightedCluster{cdsName: {Weight: 100}}, - }, - }, - }, - }, - }, - } - if err := verifyRouteConfigUpdate(ctx, rw.updateCh, wantUpdate12); err != nil { - t.Fatal(err) - } - if err := verifyRouteConfigUpdate(ctx, rw.rcw1.updateCh, wantUpdate12); err != nil { - t.Fatal(err) - } - if err := verifyRouteConfigUpdate(ctx, rw.rcw2.updateCh, wantUpdate3); err != nil { - t.Fatal(err) - } -} - -// TestNodeProtoSentOnlyInFirstRequest verifies that a non-empty node proto gets -// sent only on the first discovery request message on the ADS stream. -// -// It also verifies the same behavior holds after a stream restart. -func (s) TestNodeProtoSentOnlyInFirstRequest(t *testing.T) { - // Create a restartable listener which can close existing connections. - l, err := testutils.LocalTCPListener() - if err != nil { - t.Fatalf("testutils.LocalTCPListener() failed: %v", err) - } - lis := testutils.NewRestartableListener(l) - - // Start a fake xDS management server with the above restartable listener. - // - // We are unable to use the go-control-plane server here, because it caches - // the node proto received in the first request message and adds it to - // subsequent requests before invoking the OnStreamRequest() callback. - // Therefore we cannot verify what is sent by the xDS client. - mgmtServer, cleanup, err := fakeserver.StartServer(lis) - if err != nil { - t.Fatalf("Failed to start fake xDS server: %v", err) - } - defer cleanup() - - // Create a bootstrap file in a temporary directory. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - config, err := bootstrap.NewConfigFromContents(bc) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bc), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - defer close() - - const ( - serviceName = "my-service-client-side-xds" - routeConfigName = "route-" + serviceName - clusterName = "cluster-" + serviceName - ) - - // Register a watch for the Listener resource. - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - watcher := xdstestutils.NewTestResourceWatcher() - client.WatchResource(listenerResourceType, serviceName, watcher) - - // Ensure the watch results in a discovery request with an empty node proto. - if err := readDiscoveryResponseAndCheckForNonEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } - - // Configure a listener resource on the fake xDS server. 
- lisAny, err := anypb.New(e2e.DefaultClientListener(serviceName, routeConfigName)) - if err != nil { - t.Fatalf("Failed to marshal listener resource into an Any proto: %v", err) - } - mgmtServer.XDSResponseChan <- &fakeserver.Response{ - Resp: &v3discoverypb.DiscoveryResponse{ - TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", - VersionInfo: "1", - Resources: []*anypb.Any{lisAny}, - }, - } - - // The xDS client is expected to ACK the Listener resource. The discovery - // request corresponding to the ACK must contain a nil node proto. - if err := readDiscoveryResponseAndCheckForEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } - - // Register a watch for a RouteConfiguration resource. - client.WatchResource(routeConfigResourceType, routeConfigName, watcher) - - // Ensure the watch results in a discovery request with an empty node proto. - if err := readDiscoveryResponseAndCheckForEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } - - // Configure the route configuration resource on the fake xDS server. - rcAny, err := anypb.New(e2e.DefaultRouteConfig(routeConfigName, serviceName, clusterName)) - if err != nil { - t.Fatalf("Failed to marshal route configuration resource into an Any proto: %v", err) - } - mgmtServer.XDSResponseChan <- &fakeserver.Response{ - Resp: &v3discoverypb.DiscoveryResponse{ - TypeUrl: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration", - VersionInfo: "1", - Resources: []*anypb.Any{rcAny}, - }, - } - - // Ensure the discovery request for the ACK contains an empty node proto. - if err := readDiscoveryResponseAndCheckForEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } - - // Stop the management server and expect the error callback to be invoked. - lis.Stop() - select { - case <-ctx.Done(): - t.Fatal("Timeout when waiting for the connection error to be propagated to the watcher") - case <-watcher.AmbientErrorCh: - } - - // Restart the management server. - lis.Restart() - - // The xDS client is expected to re-request previously requested resources. - // Hence, we expect two DiscoveryRequest messages (one for the Listener and - // one for the RouteConfiguration resource). The first message should contain - // a non-nil node proto and the second should contain a nil-proto. - // - // And since we don't push any responses on the response channel of the fake - // server, we do not expect any ACKs here. - if err := readDiscoveryResponseAndCheckForNonEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } - if err := readDiscoveryResponseAndCheckForEmptyNodeProto(ctx, mgmtServer.XDSRequestChan); err != nil { - t.Fatal(err) - } -} - -// readDiscoveryResponseAndCheckForEmptyNodeProto reads a discovery request -// message out of the provided reqCh. It returns an error if it fails to read a -// message before the context deadline expires, or if the read message contains -// a non-empty node proto. 
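// The test above exercises the rule that the node proto is attached only to
// the first DiscoveryRequest on each ADS stream (including the first request
// after a stream restart). A minimal, self-contained sketch of that rule
// follows; it is not part of the deleted test, and buildRequest/firstRequest
// are hypothetical names used only for illustration.
package main

import (
	"fmt"

	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
)

// buildRequest populates the Node field only for the first request sent on
// the current stream; subsequent requests (such as ACKs) omit it.
func buildRequest(node *v3corepb.Node, typeURL string, names []string, firstRequest bool) *v3discoverypb.DiscoveryRequest {
	req := &v3discoverypb.DiscoveryRequest{
		TypeUrl:       typeURL,
		ResourceNames: names,
	}
	if firstRequest {
		req.Node = node
	}
	return req
}

func main() {
	node := &v3corepb.Node{Id: "example-node-id"}
	first := buildRequest(node, "type.googleapis.com/envoy.config.listener.v3.Listener", []string{"example-listener"}, true)
	ack := buildRequest(node, "type.googleapis.com/envoy.config.listener.v3.Listener", []string{"example-listener"}, false)
	fmt.Println(first.GetNode() != nil) // true: node proto sent on the first request
	fmt.Println(ack.GetNode() == nil)   // true: omitted on the ACK that follows
}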
-func readDiscoveryResponseAndCheckForEmptyNodeProto(ctx context.Context, reqCh *testutils.Channel) error { - v, err := reqCh.Receive(ctx) - if err != nil { - return fmt.Errorf("Timeout when waiting for a DiscoveryRequest message") - } - req := v.(*fakeserver.Request).Req.(*v3discoverypb.DiscoveryRequest) - if node := req.GetNode(); node != nil { - return fmt.Errorf("Node proto received in DiscoveryRequest message is %v, want empty node proto", node) - } - return nil -} - -// readDiscoveryResponseAndCheckForNonEmptyNodeProto reads a discovery request -// message out of the provided reqCh. It returns an error if it fails to read a -// message before the context deadline expires, or if the read message contains -// an empty node proto. -func readDiscoveryResponseAndCheckForNonEmptyNodeProto(ctx context.Context, reqCh *testutils.Channel) error { - v, err := reqCh.Receive(ctx) - if err != nil { - return fmt.Errorf("Timeout when waiting for a DiscoveryRequest message") - } - req := v.(*fakeserver.Request).Req.(*v3discoverypb.DiscoveryRequest) - if node := req.GetNode(); node == nil { - return fmt.Errorf("Empty node proto received in DiscoveryRequest message, want non-empty node proto") - } - return nil -} - -type testRouteConfigResourceType struct{} - -func (testRouteConfigResourceType) TypeURL() string { return version.V3RouteConfigURL } -func (testRouteConfigResourceType) TypeName() string { return "RouteConfigResource" } -func (testRouteConfigResourceType) AllResourcesRequiredInSotW() bool { return false } -func (testRouteConfigResourceType) Decode(*xdsresource.DecodeOptions, *anypb.Any) (*xdsresource.DecodeResult, error) { - return nil, nil -} - -// Tests that the errors returned by the xDS client when watching a resource -// contain the node ID that was used to create the client. This test covers two -// scenarios: -// -// 1. When a watch is registered for an already registered resource type, but -// this time with a different implementation, -// 2. When a watch is registered for a resource name whose authority is not -// found in the bootstrap configuration. -func (s) TestWatchErrorsContainNodeID(t *testing.T) { - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. 
- config, err := bootstrap.NewConfigFromContents(bc) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bc), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - defer close() - - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - t.Run("Multiple_ResourceType_Implementations", func(t *testing.T) { - const routeConfigName = "route-config-name" - watcher := xdstestutils.NewTestResourceWatcher() - client.WatchResource(routeConfigResourceType, routeConfigName, watcher) - - sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout) - defer sCancel() - select { - case <-sCtx.Done(): - case <-watcher.UpdateCh: - t.Fatal("Unexpected resource update") - case <-watcher.AmbientErrorCh: - t.Fatal("Unexpected resource error") - case <-watcher.ResourceErrorCh: - t.Fatal("Unexpected resource does not exist") - } - - client.WatchResource(testRouteConfigResourceType{}, routeConfigName, watcher) - select { - case <-ctx.Done(): - t.Fatal("Timeout when waiting for error callback to be invoked") - case err := <-watcher.AmbientErrorCh: - if err == nil || !strings.Contains(err.Error(), nodeID) { - t.Fatalf("Unexpected error: %v, want error with node ID: %q", err, nodeID) - } - } - }) - - t.Run("Missing_Authority", func(t *testing.T) { - const routeConfigName = "xdstp://nonexistant-authority/envoy.config.route.v3.RouteConfiguration/route-config-name" - watcher := xdstestutils.NewTestResourceWatcher() - client.WatchResource(routeConfigResourceType, routeConfigName, watcher) - - select { - case <-ctx.Done(): - t.Fatal("Timeout when waiting for error callback to be invoked") - case err := <-watcher.AmbientErrorCh: - if err == nil || !strings.Contains(err.Error(), nodeID) { - t.Fatalf("Unexpected error: %v, want error with node ID: %q", err, nodeID) - } - } - }) -} - -// Tests that the errors returned by the xDS client when watching a resource -// contain the node ID when channel creation to the management server fails. -func (s) TestWatchErrorsContainNodeID_ChannelCreationFailure(t *testing.T) { - mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{}) - - // Create bootstrap configuration pointing to the above management server. - nodeID := uuid.New().String() - bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) - - // Create an xDS client with the above bootstrap contents. - config, err := bootstrap.NewConfigFromContents(bc) - if err != nil { - t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bc), err) - } - pool := xdsclient.NewPool(config) - client, close, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ - Name: t.Name(), - }) - if err != nil { - t.Fatalf("Failed to create xDS client: %v", err) - } - defer close() - - ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) - defer cancel() - - // Override the xDS channel dialer with one that always fails. 
- origDialer := xdsclientinternal.GRPCNewClient - xdsclientinternal.GRPCNewClient = func(target string, opts ...grpc.DialOption) (*grpc.ClientConn, error) { - return nil, fmt.Errorf("failed to create channel") - } - defer func() { xdsclientinternal.GRPCNewClient = origDialer }() - - const routeConfigName = "route-config-name" - watcher := xdstestutils.NewTestResourceWatcher() - client.WatchResource(routeConfigResourceType, routeConfigName, watcher) - - select { - case <-ctx.Done(): - t.Fatal("Timeout when waiting for error callback to be invoked") - case err := <-watcher.AmbientErrorCh: - if err == nil || !strings.Contains(err.Error(), nodeID) { - t.Fatalf("Unexpected error: %v, want error with node ID: %q", err, nodeID) - } - } -} diff --git a/xds/internal/xdsclient/tests/rds_watchers_test.go b/xds/internal/xdsclient/tests/rds_watchers_test.go index 2086c018855d..f7d322885061 100644 --- a/xds/internal/xdsclient/tests/rds_watchers_test.go +++ b/xds/internal/xdsclient/tests/rds_watchers_test.go @@ -122,8 +122,8 @@ func verifyRouteConfigUpdate(ctx context.Context, updateCh *testutils.Channel, w } got := u.(routeConfigUpdateErrTuple) if wantUpdate.err != nil { - if gotType, wantType := xdsresource.ErrType(got.err), xdsresource.ErrType(wantUpdate.err); gotType != wantType { - return fmt.Errorf("received update with error type %v, want %v", gotType, wantType) + if got.err == nil || !strings.Contains(got.err.Error(), wantUpdate.err.Error()) { + return fmt.Errorf("update received with error: %v, want %q", got.err, wantUpdate.err) } } cmpOpts := []cmp.Option{cmpopts.EquateEmpty(), cmpopts.IgnoreFields(xdsresource.RouteConfigUpdate{}, "Raw")} diff --git a/xds/internal/xdsclient/tests/resource_update_test.go b/xds/internal/xdsclient/tests/resource_update_test.go index a9fce90c8756..876b99ff78d2 100644 --- a/xds/internal/xdsclient/tests/resource_update_test.go +++ b/xds/internal/xdsclient/tests/resource_update_test.go @@ -161,7 +161,7 @@ func (s) TestHandleListenerResponseFromManagementServer(t *testing.T) { Value: []byte{1, 2, 3, 4}, }}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ListenerResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ListenerResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", @@ -177,7 +177,7 @@ func (s) TestHandleListenerResponseFromManagementServer(t *testing.T) { TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", VersionInfo: "1", }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ListenerResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ListenerResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", @@ -194,7 +194,7 @@ func (s) TestHandleListenerResponseFromManagementServer(t *testing.T) { VersionInfo: "1", Resources: []*anypb.Any{testutils.MarshalAny(t, &v3routepb.RouteConfiguration{})}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ListenerResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ListenerResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.listener.v3.Listener", @@ -422,7 +422,7 @@ func (s) 
TestHandleRouteConfigResponseFromManagementServer(t *testing.T) { Value: []byte{1, 2, 3, 4}, }}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "RouteConfigResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "RouteConfigResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration", @@ -438,7 +438,7 @@ func (s) TestHandleRouteConfigResponseFromManagementServer(t *testing.T) { TypeUrl: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration", VersionInfo: "1", }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "RouteConfigResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "RouteConfigResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration", @@ -455,7 +455,7 @@ func (s) TestHandleRouteConfigResponseFromManagementServer(t *testing.T) { VersionInfo: "1", Resources: []*anypb.Any{testutils.MarshalAny(t, &v3clusterpb.Cluster{})}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "RouteConfigResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "RouteConfigResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration", @@ -675,7 +675,7 @@ func (s) TestHandleClusterResponseFromManagementServer(t *testing.T) { Value: []byte{1, 2, 3, 4}, }}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ClusterResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ClusterResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.cluster.v3.Cluster", @@ -691,7 +691,7 @@ func (s) TestHandleClusterResponseFromManagementServer(t *testing.T) { TypeUrl: "type.googleapis.com/envoy.config.cluster.v3.Cluster", VersionInfo: "1", }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ClusterResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ClusterResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.cluster.v3.Cluster", @@ -708,7 +708,7 @@ func (s) TestHandleClusterResponseFromManagementServer(t *testing.T) { VersionInfo: "1", Resources: []*anypb.Any{testutils.MarshalAny(t, &v3endpointpb.ClusterLoadAssignment{})}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "ClusterResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "ClusterResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.cluster.v3.Cluster", @@ -986,7 +986,7 @@ func (s) TestHandleEndpointsResponseFromManagementServer(t *testing.T) { Value: []byte{1, 2, 3, 4}, }}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "EndpointsResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "EndpointsResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: 
"type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", @@ -1002,7 +1002,7 @@ func (s) TestHandleEndpointsResponseFromManagementServer(t *testing.T) { TypeUrl: "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", VersionInfo: "1", }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "EndpointsResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "EndpointsResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", @@ -1019,7 +1019,7 @@ func (s) TestHandleEndpointsResponseFromManagementServer(t *testing.T) { VersionInfo: "1", Resources: []*anypb.Any{testutils.MarshalAny(t, &v3listenerpb.Listener{})}, }, - wantErr: fmt.Sprintf("xds: resource %q of type %q does not exist", resourceName1, "EndpointsResource"), + wantErr: fmt.Sprintf("xds: resource %q of type %q has been removed", resourceName1, "EndpointsResource"), wantGenericXDSConfig: []*v3statuspb.ClientConfig_GenericXdsConfig{ { TypeUrl: "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", diff --git a/xds/internal/xdsclient/transport/ads/ads_stream.go b/xds/internal/xdsclient/transport/ads/ads_stream.go deleted file mode 100644 index fc41b38edade..000000000000 --- a/xds/internal/xdsclient/transport/ads/ads_stream.go +++ /dev/null @@ -1,825 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package ads provides the implementation of an ADS (Aggregated Discovery -// Service) stream for the xDS client. -package ads - -import ( - "context" - "fmt" - "sync" - "sync/atomic" - "time" - - "google.golang.org/grpc/codes" - "google.golang.org/grpc/grpclog" - "google.golang.org/grpc/internal/backoff" - "google.golang.org/grpc/internal/buffer" - igrpclog "google.golang.org/grpc/internal/grpclog" - "google.golang.org/grpc/internal/pretty" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" - "google.golang.org/protobuf/types/known/anypb" - - v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" - statuspb "google.golang.org/genproto/googleapis/rpc/status" -) - -// Any per-RPC level logs which print complete request or response messages -// should be gated at this verbosity level. Other per-RPC level logs which print -// terse output should be at `INFO` and verbosity 2. -const perRPCVerbosityLevel = 9 - -// Response represents a response received on the ADS stream. It contains the -// type URL, version, and resources for the response. -type Response struct { - TypeURL string - Version string - Resources []*anypb.Any -} - -// DataAndErrTuple is a struct that holds a resource and an error. It is used to -// return a resource and any associated error from a function. 
-type DataAndErrTuple struct { - Resource xdsresource.ResourceData - Err error -} - -// StreamEventHandler is an interface that defines the callbacks for events that -// occur on the ADS stream. Methods on this interface may be invoked -// concurrently and implementations need to handle them in a thread-safe manner. -type StreamEventHandler interface { - OnADSStreamError(error) // Called when the ADS stream breaks. - OnADSWatchExpiry(xdsresource.Type, string) // Called when the watch timer expires for a resource. - OnADSResponse(Response, func()) ([]string, error) // Called when a response is received on the ADS stream. -} - -// WatchState is a enum that describes the watch state of a particular -// resource. -type WatchState int - -const ( - // ResourceWatchStateStarted is the state where a watch for a resource was - // started, but a request asking for that resource is yet to be sent to the - // management server. - ResourceWatchStateStarted WatchState = iota - // ResourceWatchStateRequested is the state when a request has been sent for - // the resource being watched. - ResourceWatchStateRequested - // ResourceWatchStateReceived is the state when a response has been received - // for the resource being watched. - ResourceWatchStateReceived - // ResourceWatchStateTimeout is the state when the watch timer associated - // with the resource expired because no response was received. - ResourceWatchStateTimeout -) - -// ResourceWatchState is the state corresponding to a resource being watched. -type ResourceWatchState struct { - State WatchState // Watch state of the resource. - ExpiryTimer *time.Timer // Timer for the expiry of the watch. -} - -// State corresponding to a resource type. -type resourceTypeState struct { - version string // Last acked version. Should not be reset when the stream breaks. - nonce string // Last received nonce. Should be reset when the stream breaks. - bufferedRequests chan struct{} // Channel to buffer requests when writing is blocked. - subscribedResources map[string]*ResourceWatchState // Map of subscribed resource names to their state. - pendingWrite bool // True if there is a pending write for this resource type. -} - -// StreamImpl provides the functionality associated with an ADS (Aggregated -// Discovery Service) stream on the client side. It manages the lifecycle of the -// ADS stream, including creating the stream, sending requests, and handling -// responses. It also handles flow control and retries for the stream. -type StreamImpl struct { - // The following fields are initialized from arguments passed to the - // constructor and are read-only afterwards, and hence can be accessed - // without a mutex. - transport transport.Transport // Transport to use for ADS stream. - eventHandler StreamEventHandler // Callbacks into the xdsChannel. - backoff func(int) time.Duration // Backoff for retries, after stream failures. - nodeProto *v3corepb.Node // Identifies the gRPC application. - watchExpiryTimeout time.Duration // Resource watch expiry timeout - logger *igrpclog.PrefixLogger - - // The following fields are initialized in the constructor and are not - // written to afterwards, and hence can be accessed without a mutex. - streamCh chan transport.StreamingCall // New ADS streams are pushed here. - requestCh *buffer.Unbounded // Subscriptions and unsubscriptions are pushed here. - runnerDoneCh chan struct{} // Notify completion of runner goroutine. - cancel context.CancelFunc // To cancel the context passed to the runner goroutine. 
- - // Guards access to the below fields (and to the contents of the map). - mu sync.Mutex - resourceTypeState map[xdsresource.Type]*resourceTypeState // Map of resource types to their state. - fc *adsFlowControl // Flow control for ADS stream. - firstRequest bool // False after the first request is sent out. -} - -// StreamOpts contains the options for creating a new ADS Stream. -type StreamOpts struct { - Transport transport.Transport // xDS transport to create the stream on. - EventHandler StreamEventHandler // Callbacks for stream events. - Backoff func(int) time.Duration // Backoff for retries, after stream failures. - NodeProto *v3corepb.Node // Node proto to identify the gRPC application. - WatchExpiryTimeout time.Duration // Resource watch expiry timeout. - LogPrefix string // Prefix to be used for log messages. -} - -// NewStreamImpl initializes a new StreamImpl instance using the given -// parameters. It also launches goroutines responsible for managing reads and -// writes for messages of the underlying stream. -func NewStreamImpl(opts StreamOpts) *StreamImpl { - s := &StreamImpl{ - transport: opts.Transport, - eventHandler: opts.EventHandler, - backoff: opts.Backoff, - nodeProto: opts.NodeProto, - watchExpiryTimeout: opts.WatchExpiryTimeout, - - streamCh: make(chan transport.StreamingCall, 1), - requestCh: buffer.NewUnbounded(), - runnerDoneCh: make(chan struct{}), - resourceTypeState: make(map[xdsresource.Type]*resourceTypeState), - } - - l := grpclog.Component("xds") - s.logger = igrpclog.NewPrefixLogger(l, opts.LogPrefix+fmt.Sprintf("[ads-stream %p] ", s)) - - ctx, cancel := context.WithCancel(context.Background()) - s.cancel = cancel - go s.runner(ctx) - return s -} - -// Stop blocks until the stream is closed and all spawned goroutines exit. -func (s *StreamImpl) Stop() { - s.cancel() - s.requestCh.Close() - <-s.runnerDoneCh - s.logger.Infof("Stopping ADS stream") -} - -// Subscribe subscribes to the given resource. It is assumed that multiple -// subscriptions for the same resource is deduped at the caller. A discovery -// request is sent out on the underlying stream for the resource type when there -// is sufficient flow control quota. -func (s *StreamImpl) Subscribe(typ xdsresource.Type, name string) { - if s.logger.V(2) { - s.logger.Infof("Subscribing to resource %q of type %q", name, typ.TypeName()) - } - - s.mu.Lock() - defer s.mu.Unlock() - - state, ok := s.resourceTypeState[typ] - if !ok { - // An entry in the type state map is created as part of the first - // subscription request for this type. - state = &resourceTypeState{ - subscribedResources: make(map[string]*ResourceWatchState), - bufferedRequests: make(chan struct{}, 1), - } - s.resourceTypeState[typ] = state - } - - // Create state for the newly subscribed resource. The watch timer will - // be started when a request for this resource is actually sent out. - state.subscribedResources[name] = &ResourceWatchState{State: ResourceWatchStateStarted} - state.pendingWrite = true - - // Send a request for the resource type with updated subscriptions. - s.requestCh.Put(typ) -} - -// Unsubscribe cancels the subscription to the given resource. It is a no-op if -// the given resource does not exist. The watch expiry timer associated with the -// resource is stopped if one is active. A discovery request is sent out on the -// stream for the resource type when there is sufficient flow control quota. 
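// The Subscribe/Unsubscribe methods described above only update per-resource
// state under the mutex and enqueue the resource type; a single sender
// goroutine drains the queue and performs the actual stream writes. A
// minimal, self-contained sketch of that shape follows; miniStream and its
// members are hypothetical names, not the actual StreamImpl types.
package main

import (
	"fmt"
	"sync"
)

type miniStream struct {
	mu         sync.Mutex
	subscribed map[string]bool // resource name -> currently subscribed
	requestCh  chan string     // resource names whose subscription state changed
	done       chan struct{}   // closed once the sender has drained the queue
}

func newMiniStream() *miniStream {
	s := &miniStream{
		subscribed: make(map[string]bool),
		requestCh:  make(chan string, 16),
		done:       make(chan struct{}),
	}
	go s.sender()
	return s
}

func (s *miniStream) Subscribe(name string) {
	s.mu.Lock()
	s.subscribed[name] = true
	s.mu.Unlock()
	s.requestCh <- name // wake the sender; the request itself is built there
}

func (s *miniStream) Unsubscribe(name string) {
	s.mu.Lock()
	delete(s.subscribed, name)
	s.mu.Unlock()
	s.requestCh <- name
}

// sender is the only goroutine that writes requests, so stream writes are
// serialized without holding the mutex across I/O.
func (s *miniStream) sender() {
	defer close(s.done)
	for name := range s.requestCh {
		s.mu.Lock()
		subscribed := s.subscribed[name]
		s.mu.Unlock()
		fmt.Printf("would send a discovery request reflecting %q subscribed=%v\n", name, subscribed)
	}
}

func main() {
	s := newMiniStream()
	s.Subscribe("example-listener")
	s.Unsubscribe("example-listener")
	close(s.requestCh)
	<-s.done
}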
-func (s *StreamImpl) Unsubscribe(typ xdsresource.Type, name string) { - if s.logger.V(2) { - s.logger.Infof("Unsubscribing from resource %q of type %q", name, typ.TypeName()) - } - - s.mu.Lock() - defer s.mu.Unlock() - - state, ok := s.resourceTypeState[typ] - if !ok { - return - } - - rs, ok := state.subscribedResources[name] - if !ok { - return - } - if rs.ExpiryTimer != nil { - rs.ExpiryTimer.Stop() - } - delete(state.subscribedResources, name) - state.pendingWrite = true - - // Send a request for the resource type with updated subscriptions. - s.requestCh.Put(typ) -} - -// runner is a long-running goroutine that handles the lifecycle of the ADS -// stream. It spawns another goroutine to handle writes of discovery request -// messages on the stream. Whenever an existing stream fails, it performs -// exponential backoff (if no messages were received on that stream) before -// creating a new stream. -func (s *StreamImpl) runner(ctx context.Context) { - defer close(s.runnerDoneCh) - - go s.send(ctx) - - runStreamWithBackoff := func() error { - stream, err := s.transport.CreateStreamingCall(ctx, "/envoy.service.discovery.v3.AggregatedDiscoveryService/StreamAggregatedResources") - if err != nil { - s.logger.Warningf("Failed to create a new ADS streaming RPC: %v", err) - s.onError(err, false) - return nil - } - if s.logger.V(2) { - s.logger.Infof("ADS stream created") - } - - s.mu.Lock() - // Flow control is a property of the underlying streaming RPC call and - // needs to be initialized every time a new one is created. - s.fc = newADSFlowControl(s.logger) - s.firstRequest = true - s.mu.Unlock() - - // Ensure that the most recently created stream is pushed on the - // channel for the `send` goroutine to consume. - select { - case <-s.streamCh: - default: - } - s.streamCh <- stream - - // Backoff state is reset upon successful receipt of at least one - // message from the server. - if s.recv(ctx, stream) { - return backoff.ErrResetBackoff - } - return nil - } - backoff.RunF(ctx, runStreamWithBackoff, s.backoff) -} - -// send is a long-running goroutine that handles sending discovery requests for -// two scenarios: -// - a new subscription or unsubscription request is received -// - a new stream is created after the previous one failed -func (s *StreamImpl) send(ctx context.Context) { - // Stores the most recent stream instance received on streamCh. - var stream transport.StreamingCall - for { - select { - case <-ctx.Done(): - return - case stream = <-s.streamCh: - if err := s.sendExisting(stream); err != nil { - // Send failed, clear the current stream. Attempt to resend will - // only be made after a new stream is created. - stream = nil - continue - } - case req, ok := <-s.requestCh.Get(): - if !ok { - return - } - s.requestCh.Load() - - typ := req.(xdsresource.Type) - if err := s.sendNew(stream, typ); err != nil { - stream = nil - continue - } - } - } -} - -// sendNew attempts to send a discovery request based on a new subscription or -// unsubscription. If there is no flow control quota, the request is buffered -// and will be sent later. This method also starts the watch expiry timer for -// resources that were sent in the request for the first time, i.e. their watch -// state is `ResourceWatchStateStarted`. -func (s *StreamImpl) sendNew(stream transport.StreamingCall, typ xdsresource.Type) error { - s.mu.Lock() - defer s.mu.Unlock() - - // If there's no stream yet, skip the request. This request will be resent - // when a new stream is created.
If no stream is created, the watcher will - // timeout (same as server not sending response back). - if stream == nil { - return nil - } - - // If local processing of the most recently received response is not yet - // complete, i.e. fc.pending == true, queue this write and return early. - // This allows us to batch writes for requests which are generated as part - // of local processing of a received response. - state := s.resourceTypeState[typ] - if s.fc.pending.Load() { - select { - case state.bufferedRequests <- struct{}{}: - default: - } - return nil - } - - return s.sendMessageIfWritePendingLocked(stream, typ, state) -} - -// sendExisting sends out discovery requests for existing resources when -// recovering from a broken stream. -// -// The stream argument is guaranteed to be non-nil. -func (s *StreamImpl) sendExisting(stream transport.StreamingCall) error { - s.mu.Lock() - defer s.mu.Unlock() - - for typ, state := range s.resourceTypeState { - // Reset only the nonces map when the stream restarts. - // - // xDS spec says the following. See section: - // https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol#ack-nack-and-resource-type-instance-version - // - // Note that the version for a resource type is not a property of an - // individual xDS stream but rather a property of the resources - // themselves. If the stream becomes broken and the client creates a new - // stream, the client’s initial request on the new stream should - // indicate the most recent version seen by the client on the previous - // stream - state.nonce = "" - - if len(state.subscribedResources) == 0 { - continue - } - - state.pendingWrite = true - if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil { - return err - } - } - return nil -} - -// sendBuffered sends out discovery requests for resources that were buffered -// when they were subscribed to, because local processing of the previously -// received response was not yet complete. -// -// The stream argument is guaranteed to be non-nil. -func (s *StreamImpl) sendBuffered(stream transport.StreamingCall) error { - s.mu.Lock() - defer s.mu.Unlock() - - for typ, state := range s.resourceTypeState { - select { - case <-state.bufferedRequests: - if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil { - return err - } - default: - // No buffered request. - continue - } - } - return nil -} - -// sendMessageIfWritePendingLocked attempts to sends a discovery request to the -// server, if there is a pending write for the given resource type. -// -// If the request is successfully sent, the pending write field is cleared and -// watch timers are started for the resources in the request. -// -// Caller needs to hold c.mu. -func (s *StreamImpl) sendMessageIfWritePendingLocked(stream transport.StreamingCall, typ xdsresource.Type, state *resourceTypeState) error { - if !state.pendingWrite { - if s.logger.V(2) { - s.logger.Infof("Skipping sending request for type %q, because all subscribed resources were already sent", typ.TypeURL()) - } - return nil - } - - names := resourceNames(state.subscribedResources) - if err := s.sendMessageLocked(stream, names, typ.TypeURL(), state.version, state.nonce, nil); err != nil { - return err - } - state.pendingWrite = false - - // Drain the buffered requests channel because we just sent a request for this - // resource type. 
- select { - case <-state.bufferedRequests: - default: - } - - s.startWatchTimersLocked(typ, names) - return nil -} - -// sendMessageLocked sends a discovery request to the server, populating the -// different fields of the message with the given parameters. Returns a non-nil -// error if the request could not be sent. -// -// Caller needs to hold c.mu. -func (s *StreamImpl) sendMessageLocked(stream transport.StreamingCall, names []string, url, version, nonce string, nackErr error) error { - req := &v3discoverypb.DiscoveryRequest{ - ResourceNames: names, - TypeUrl: url, - VersionInfo: version, - ResponseNonce: nonce, - } - - // The xDS protocol only requires that we send the node proto in the first - // discovery request on every stream. Sending the node proto in every - // request wastes CPU resources on the client and the server. - if s.firstRequest { - req.Node = s.nodeProto - } - - if nackErr != nil { - req.ErrorDetail = &statuspb.Status{ - Code: int32(codes.InvalidArgument), Message: nackErr.Error(), - } - } - - if err := stream.Send(req); err != nil { - s.logger.Warningf("Sending ADS request for type %q, resources: %v, version: %q, nonce: %q failed: %v", url, names, version, nonce, err) - return err - } - s.firstRequest = false - - if s.logger.V(perRPCVerbosityLevel) { - s.logger.Infof("ADS request sent: %v", pretty.ToJSON(req)) - } else if s.logger.V(2) { - s.logger.Warningf("ADS request sent for type %q, resources: %v, version: %q, nonce: %q", url, names, version, nonce) - } - return nil -} - -// recv is responsible for receiving messages from the ADS stream. -// -// It performs the following actions: -// - Waits for local flow control to be available before sending buffered -// requests, if any. -// - Receives a message from the ADS stream. If an error is encountered here, -// it is handled by the onError method which propagates the error to all -// watchers. -// - Invokes the event handler's OnADSResponse method to process the message. -// - Sends an ACK or NACK to the server based on the response. -// -// It returns a boolean indicating whether at least one message was received -// from the server. -func (s *StreamImpl) recv(ctx context.Context, stream transport.StreamingCall) bool { - msgReceived := false - for { - // Wait for ADS stream level flow control to be available, and send out - // a request if anything was buffered while we were waiting for local - // processing of the previous response to complete. - if !s.fc.wait(ctx) { - if s.logger.V(2) { - s.logger.Infof("ADS stream context canceled") - } - return msgReceived - } - s.sendBuffered(stream) - - resources, url, version, nonce, err := s.recvMessage(stream) - if err != nil { - s.onError(err, msgReceived) - s.logger.Warningf("ADS stream closed: %v", err) - return msgReceived - } - msgReceived = true - - // Invoke the onResponse event handler to parse the incoming message and - // decide whether to send an ACK or NACK. - resp := Response{ - Resources: resources, - TypeURL: url, - Version: version, - } - var resourceNames []string - var nackErr error - s.fc.setPending() - resourceNames, nackErr = s.eventHandler.OnADSResponse(resp, s.fc.onDone) - if xdsresource.ErrType(nackErr) == xdsresource.ErrorTypeResourceTypeUnsupported { - // Based on gRFC A27, a general guiding principle is that if the - // server sends something the client didn't actually subscribe to, - // then the client ignores it. Here, we have received a response - // with resources of a type that we don't know about. 
- // - // Sending a NACK doesn't really seem appropriate here, since we're - // not actually validating what the server sent and therefore don't - // know that it's invalid. But we shouldn't ACK either, because we - // don't know that it is valid. - s.logger.Warningf("%v", nackErr) - continue - } - - s.onRecv(stream, resourceNames, url, version, nonce, nackErr) - } -} - -func (s *StreamImpl) recvMessage(stream transport.StreamingCall) (resources []*anypb.Any, url, version, nonce string, err error) { - r, err := stream.Recv() - if err != nil { - return nil, "", "", "", err - } - resp, ok := r.(*v3discoverypb.DiscoveryResponse) - if !ok { - s.logger.Infof("Message received on ADS stream of unexpected type: %T", r) - return nil, "", "", "", fmt.Errorf("unexpected message type %T", r) - } - - if s.logger.V(perRPCVerbosityLevel) { - s.logger.Infof("ADS response received: %v", pretty.ToJSON(resp)) - } else if s.logger.V(2) { - s.logger.Infof("ADS response received for type %q, version %q, nonce %q", resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce()) - } - return resp.GetResources(), resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce(), nil -} - -// onRecv is invoked when a response is received from the server. The arguments -// passed to this method correspond to the most recently received response. -// -// It performs the following actions: -// - updates resource type specific state -// - updates resource specific state for resources in the response -// - sends an ACK or NACK to the server based on the response -func (s *StreamImpl) onRecv(stream transport.StreamingCall, names []string, url, version, nonce string, nackErr error) { - s.mu.Lock() - defer s.mu.Unlock() - - // Lookup the resource type specific state based on the type URL. - var typ xdsresource.Type - for t := range s.resourceTypeState { - if t.TypeURL() == url { - typ = t - break - } - } - typeState, ok := s.resourceTypeState[typ] - if !ok { - s.logger.Warningf("ADS stream received a response for type %q, but no state exists for it", url) - return - } - - // Update the resource type specific state. This includes: - // - updating the nonce unconditionally - // - updating the version only if the response is to be ACKed - previousVersion := typeState.version - typeState.nonce = nonce - if nackErr == nil { - typeState.version = version - } - - // Update the resource specific state. For all resources received as - // part of this response that are in state `started` or `requested`, - // this includes: - // - setting the watch state to watchstateReceived - // - stopping the expiry timer, if one exists - for _, name := range names { - rs, ok := typeState.subscribedResources[name] - if !ok { - s.logger.Warningf("ADS stream received a response for resource %q, but no state exists for it", name) - continue - } - if ws := rs.State; ws == ResourceWatchStateStarted || ws == ResourceWatchStateRequested { - rs.State = ResourceWatchStateReceived - if rs.ExpiryTimer != nil { - rs.ExpiryTimer.Stop() - rs.ExpiryTimer = nil - } - } - } - - // Send an ACK or NACK. 
- subscribedResourceNames := resourceNames(typeState.subscribedResources) - if nackErr != nil { - s.logger.Warningf("Sending NACK for resource type: %q, version: %q, nonce: %q, reason: %v", url, version, nonce, nackErr) - s.sendMessageLocked(stream, subscribedResourceNames, url, previousVersion, nonce, nackErr) - return - } - - if s.logger.V(2) { - s.logger.Infof("Sending ACK for resource type: %q, version: %q, nonce: %q", url, version, nonce) - } - s.sendMessageLocked(stream, subscribedResourceNames, url, version, nonce, nil) -} - -// onError is called when an error occurs on the ADS stream. It stops any -// outstanding resource timers and resets the watch state to started for any -// resources that were in the requested state. It also handles the case where -// the ADS stream was closed after receiving a response, which is not -// considered an error. -func (s *StreamImpl) onError(err error, msgReceived bool) { - // For resources that been requested but not yet responded to by the - // management server, stop the resource timers and reset the watch state to - // watchStateStarted. This is because we don't want the expiry timer to be - // running when we don't have a stream open to the management server. - s.mu.Lock() - for _, state := range s.resourceTypeState { - for _, rs := range state.subscribedResources { - if rs.State != ResourceWatchStateRequested { - continue - } - if rs.ExpiryTimer != nil { - rs.ExpiryTimer.Stop() - rs.ExpiryTimer = nil - } - rs.State = ResourceWatchStateStarted - } - } - s.mu.Unlock() - - // Note that we do not consider it an error if the ADS stream was closed - // after having received a response on the stream. This is because there - // are legitimate reasons why the server may need to close the stream during - // normal operations, such as needing to rebalance load or the underlying - // connection hitting its max connection age limit. - // (see [gRFC A9](https://github.com/grpc/proposal/blob/master/A9-server-side-conn-mgt.md)). - if msgReceived { - err = xdsresource.NewError(xdsresource.ErrTypeStreamFailedAfterRecv, err.Error()) - } - - s.eventHandler.OnADSStreamError(err) -} - -// startWatchTimersLocked starts the expiry timers for the given resource names -// of the specified resource type. For each resource name, if the resource -// watch state is in the "started" state, it transitions the state to -// "requested" and starts an expiry timer. When the timer expires, the resource -// watch state is set to "timeout" and the event handler callback is called. -// -// The caller must hold the s.mu lock. -func (s *StreamImpl) startWatchTimersLocked(typ xdsresource.Type, names []string) { - typeState := s.resourceTypeState[typ] - for _, name := range names { - resourceState, ok := typeState.subscribedResources[name] - if !ok { - continue - } - if resourceState.State != ResourceWatchStateStarted { - continue - } - resourceState.State = ResourceWatchStateRequested - - rs := resourceState - resourceState.ExpiryTimer = time.AfterFunc(s.watchExpiryTimeout, func() { - s.mu.Lock() - rs.State = ResourceWatchStateTimeout - rs.ExpiryTimer = nil - s.mu.Unlock() - s.eventHandler.OnADSWatchExpiry(typ, name) - }) - } -} - -func resourceNames(m map[string]*ResourceWatchState) []string { - ret := make([]string, len(m)) - idx := 0 - for name := range m { - ret[idx] = name - idx++ - } - return ret -} - -// TriggerResourceNotFoundForTesting triggers a resource not found event for the -// given resource type and name. 
This is intended for testing purposes only, to -// simulate a resource not found scenario. -func (s *StreamImpl) TriggerResourceNotFoundForTesting(typ xdsresource.Type, resourceName string) { - s.mu.Lock() - - state, ok := s.resourceTypeState[typ] - if !ok { - s.mu.Unlock() - return - } - resourceState, ok := state.subscribedResources[resourceName] - if !ok { - s.mu.Unlock() - return - } - - if s.logger.V(2) { - s.logger.Infof("Triggering resource not found for type: %s, resource name: %s", typ.TypeName(), resourceName) - } - resourceState.State = ResourceWatchStateTimeout - if resourceState.ExpiryTimer != nil { - resourceState.ExpiryTimer.Stop() - resourceState.ExpiryTimer = nil - } - s.mu.Unlock() - go s.eventHandler.OnADSWatchExpiry(typ, resourceName) -} - -// ResourceWatchStateForTesting returns the ResourceWatchState for the given -// resource type and name. This is intended for testing purposes only, to -// inspect the internal state of the ADS stream. -func (s *StreamImpl) ResourceWatchStateForTesting(typ xdsresource.Type, resourceName string) (ResourceWatchState, error) { - s.mu.Lock() - defer s.mu.Unlock() - - state, ok := s.resourceTypeState[typ] - if !ok { - return ResourceWatchState{}, fmt.Errorf("unknown resource type: %v", typ) - } - resourceState, ok := state.subscribedResources[resourceName] - if !ok { - return ResourceWatchState{}, fmt.Errorf("unknown resource name: %v", resourceName) - } - return *resourceState, nil -} - -// adsFlowControl implements ADS stream level flow control that enables the -// transport to block the reading of the next message off of the stream until -// the previous update is consumed by all watchers. -// -// The lifetime of the flow control is tied to the lifetime of the stream. -type adsFlowControl struct { - logger *igrpclog.PrefixLogger - - // Whether the most recent update is pending consumption by all watchers. - pending atomic.Bool - // Channel used to notify when all the watchers have consumed the most - // recent update. Wait() blocks on reading a value from this channel. - readyCh chan struct{} -} - -// newADSFlowControl returns a new adsFlowControl. -func newADSFlowControl(logger *igrpclog.PrefixLogger) *adsFlowControl { - return &adsFlowControl{ - logger: logger, - readyCh: make(chan struct{}, 1), - } -} - -// setPending changes the internal state to indicate that there is an update -// pending consumption by all watchers. -func (fc *adsFlowControl) setPending() { - fc.pending.Store(true) -} - -// wait blocks until all the watchers have consumed the most recent update and -// returns true. If the context expires before that, it returns false. -func (fc *adsFlowControl) wait(ctx context.Context) bool { - // If there is no pending update, there is no need to block. - if !fc.pending.Load() { - // If all watchers finished processing the most recent update before the - // `recv` goroutine made the next call to `Wait()`, there would be an - // entry in the readyCh channel that needs to be drained to ensure that - // the next call to `Wait()` doesn't unblock before it actually should. - select { - case <-fc.readyCh: - default: - } - return true - } - - select { - case <-ctx.Done(): - return false - case <-fc.readyCh: - return true - } -} - -// onDone indicates that all watchers have consumed the most recent update. -func (fc *adsFlowControl) onDone() { - select { - // Writes to the readyCh channel should not block ideally. The default - // branch here is to appease the paranoid mind. 
- case fc.readyCh <- struct{}{}: - default: - if fc.logger.V(2) { - fc.logger.Infof("ADS stream flow control readyCh is full") - } - } - fc.pending.Store(false) -} diff --git a/xds/internal/xdsclient/transport/grpctransport/grpctransport.go b/xds/internal/xdsclient/transport/grpctransport/grpctransport.go deleted file mode 100644 index fb740ade1395..000000000000 --- a/xds/internal/xdsclient/transport/grpctransport/grpctransport.go +++ /dev/null @@ -1,138 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package grpctransport provides an implementation of the transport interface -// using gRPC. -package grpctransport - -import ( - "context" - "fmt" - "time" - - "google.golang.org/grpc" - "google.golang.org/grpc/keepalive" - "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - - v3adsgrpc "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" - v3adspb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" - v3lrsgrpc "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" - v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" -) - -func init() { - internal.GRPCNewClient = grpc.NewClient - internal.NewADSStream = func(ctx context.Context, cc *grpc.ClientConn) (v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient, error) { - return v3adsgrpc.NewAggregatedDiscoveryServiceClient(cc).StreamAggregatedResources(ctx) - } -} - -// Builder provides a way to build a gRPC-based transport to an xDS server. -type Builder struct{} - -// Build creates a new gRPC-based transport to an xDS server using the provided -// options. This involves creating a grpc.ClientConn to the server identified by -// the server URI in the provided options. -func (b *Builder) Build(opts transport.BuildOptions) (transport.Transport, error) { - if opts.ServerConfig == nil { - return nil, fmt.Errorf("ServerConfig field in opts cannot be nil") - } - - // NOTE: The bootstrap package ensures that the server_uri and credentials - // inside the server config are always populated. If we end up using a - // different type in BuildOptions to specify the server configuration, we - // must ensure that those fields are not empty before proceeding. - - // Dial the xDS management server with dial options specified by the server - // configuration and a static keepalive configuration that is common across - // gRPC language implementations. - kpCfg := grpc.WithKeepaliveParams(keepalive.ClientParameters{ - Time: 5 * time.Minute, - Timeout: 20 * time.Second, - }) - dopts := append(opts.ServerConfig.DialOptions(), kpCfg) - dialer := internal.GRPCNewClient.(func(string, ...grpc.DialOption) (*grpc.ClientConn, error)) - cc, err := dialer(opts.ServerConfig.ServerURI(), dopts...) - if err != nil { - // An error from a non-blocking dial indicates something serious. 
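The deleted adsFlowControl type above gates reading the next ADS message until every watcher has consumed the previous update. A self-contained sketch of the same pattern, an atomic pending flag paired with a 1-buffered ready channel, with all names invented for the example:

```go
package main

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"
)

// flowControl blocks a reader goroutine until the previous update has been
// consumed, or until the supplied context expires.
type flowControl struct {
	pending atomic.Bool
	readyCh chan struct{} // buffered with capacity 1
}

func newFlowControl() *flowControl {
	return &flowControl{readyCh: make(chan struct{}, 1)}
}

func (fc *flowControl) setPending() { fc.pending.Store(true) }

// wait blocks until onDone is called or the context expires. If nothing is
// pending, any stale entry in readyCh is drained so a later wait does not
// return early.
func (fc *flowControl) wait(ctx context.Context) bool {
	if !fc.pending.Load() {
		select {
		case <-fc.readyCh:
		default:
		}
		return true
	}
	select {
	case <-ctx.Done():
		return false
	case <-fc.readyCh:
		return true
	}
}

// onDone signals that the pending update has been fully consumed.
func (fc *flowControl) onDone() {
	select {
	case fc.readyCh <- struct{}{}:
	default:
	}
	fc.pending.Store(false)
}

func main() {
	fc := newFlowControl()
	fc.setPending()
	go func() {
		time.Sleep(10 * time.Millisecond) // simulate watchers consuming the update
		fc.onDone()
	}()
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(fc.wait(ctx)) // true: unblocked by onDone
}
```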
- return nil, fmt.Errorf("failed to create a grpc transport to the management server %q: %v", opts.ServerConfig.ServerURI(), err) - } - cc.Connect() - - return &grpcTransport{cc: cc}, nil -} - -type grpcTransport struct { - cc *grpc.ClientConn -} - -func (g *grpcTransport) CreateStreamingCall(ctx context.Context, method string) (transport.StreamingCall, error) { - switch method { - case v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResources_FullMethodName: - return g.newADSStreamingCall(ctx) - case v3lrsgrpc.LoadReportingService_StreamLoadStats_FullMethodName: - return g.newLRSStreamingCall(ctx) - default: - return nil, fmt.Errorf("unsupported method: %v", method) - } -} - -func (g *grpcTransport) newADSStreamingCall(ctx context.Context) (transport.StreamingCall, error) { - newStream := internal.NewADSStream.(func(context.Context, *grpc.ClientConn) (v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient, error)) - stream, err := newStream(ctx, g.cc) - if err != nil { - return nil, fmt.Errorf("failed to create an ADS stream: %v", err) - } - return &adsStream{stream: stream}, nil -} - -func (g *grpcTransport) newLRSStreamingCall(ctx context.Context) (transport.StreamingCall, error) { - stream, err := v3lrsgrpc.NewLoadReportingServiceClient(g.cc).StreamLoadStats(ctx) - if err != nil { - return nil, fmt.Errorf("failed to create an LRS stream: %v", err) - } - return &lrsStream{stream: stream}, nil -} - -func (g *grpcTransport) Close() error { - return g.cc.Close() -} - -type adsStream struct { - stream v3adsgrpc.AggregatedDiscoveryService_StreamAggregatedResourcesClient -} - -func (a *adsStream) Send(msg any) error { - return a.stream.Send(msg.(*v3adspb.DiscoveryRequest)) -} - -func (a *adsStream) Recv() (any, error) { - return a.stream.Recv() -} - -type lrsStream struct { - stream v3lrsgrpc.LoadReportingService_StreamLoadStatsClient -} - -func (l *lrsStream) Send(msg any) error { - return l.stream.Send(msg.(*v3lrspb.LoadStatsRequest)) -} - -func (l *lrsStream) Recv() (any, error) { - return l.stream.Recv() -} diff --git a/xds/internal/xdsclient/transport/grpctransport/grpctransport_ext_test.go b/xds/internal/xdsclient/transport/grpctransport/grpctransport_ext_test.go deleted file mode 100644 index 2e375f0b5ac1..000000000000 --- a/xds/internal/xdsclient/transport/grpctransport/grpctransport_ext_test.go +++ /dev/null @@ -1,91 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package grpctransport_test - -import ( - "testing" - - "google.golang.org/grpc" - "google.golang.org/grpc/internal/grpctest" - internalbootstrap "google.golang.org/grpc/internal/xds/bootstrap" - "google.golang.org/grpc/xds/internal/xdsclient/internal" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/grpc/xds/internal/xdsclient/transport/grpctransport" -) - -type s struct { - grpctest.Tester -} - -func Test(t *testing.T) { - grpctest.RunSubTests(t, s{}) -} - -// Tests that the grpctransport.Builder creates a new grpc.ClientConn every time -// Build() is called. -func (s) TestBuild_CustomDialer(t *testing.T) { - // Override the dialer with a custom one. - customDialerCalled := false - origDialer := internal.GRPCNewClient - internal.GRPCNewClient = func(target string, opts ...grpc.DialOption) (*grpc.ClientConn, error) { - customDialerCalled = true - return grpc.NewClient(target, opts...) - } - defer func() { internal.GRPCNewClient = origDialer }() - - serverCfg, err := internalbootstrap.ServerConfigForTesting(internalbootstrap.ServerConfigTestingOptions{URI: "server-address"}) - if err != nil { - t.Fatalf("Failed to create server config for testing: %v", err) - } - - // Create a new transport and ensure that the custom dialer was called. - opts := transport.BuildOptions{ServerConfig: serverCfg} - builder := &grpctransport.Builder{} - tr, err := builder.Build(opts) - if err != nil { - t.Fatalf("Builder.Build(%+v) failed: %v", opts, err) - } - defer tr.Close() - - if !customDialerCalled { - t.Fatalf("Builder.Build(%+v): custom dialer called = false, want true", opts) - } - customDialerCalled = false - - // Create another transport and ensure that the custom dialer was called. - tr, err = builder.Build(opts) - if err != nil { - t.Fatalf("Builder.Build(%+v) failed: %v", opts, err) - } - defer tr.Close() - - if !customDialerCalled { - t.Fatalf("Builder.Build(%+v): custom dialer called = false, want true", opts) - } -} - -// Tests that the grpctransport.Builder fails to build a transport when the -// provided BuildOptions do not contain a ServerConfig. -func (s) TestBuild_EmptyServerConfig(t *testing.T) { - builder := &grpctransport.Builder{} - opts := transport.BuildOptions{} - if tr, err := builder.Build(opts); err == nil { - tr.Close() - t.Fatalf("Builder.Build(%+v) succeeded when expected to fail", opts) - } -} diff --git a/xds/internal/xdsclient/transport/lrs/lrs_stream.go b/xds/internal/xdsclient/transport/lrs/lrs_stream.go deleted file mode 100644 index 7260816b671d..000000000000 --- a/xds/internal/xdsclient/transport/lrs/lrs_stream.go +++ /dev/null @@ -1,339 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package lrs provides the implementation of an LRS (Load Reporting Service) -// stream for the xDS client. 
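The deleted grpctransport test above exercises a common Go test seam: production code calls a package-level function variable (internal.GRPCNewClient there) so a test can temporarily replace it, observe that it was used, and restore it afterwards. A minimal sketch of that pattern, with every name below illustrative rather than taken from the gRPC-Go API:

```go
package main

import "fmt"

// newClient is a package-level seam: production code calls the variable, and
// a test can swap it out to observe or stub connection creation.
var newClient = func(target string) (string, error) {
	return "conn-to-" + target, nil
}

// connect is the code under test; it always goes through the seam.
func connect(target string) (string, error) {
	return newClient(target)
}

func main() {
	// What a test would do: record the original, install a wrapper that
	// tracks the invocation, and restore the original when done.
	called := false
	orig := newClient
	newClient = func(target string) (string, error) {
		called = true
		return orig(target)
	}
	defer func() { newClient = orig }()

	conn, err := connect("example.com")
	fmt.Println(conn, err, called) // conn-to-example.com <nil> true
}
```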
-package lrs - -import ( - "context" - "fmt" - "io" - "sync" - "time" - - "google.golang.org/grpc/grpclog" - "google.golang.org/grpc/internal/backoff" - igrpclog "google.golang.org/grpc/internal/grpclog" - "google.golang.org/grpc/internal/pretty" - "google.golang.org/grpc/xds/internal" - "google.golang.org/grpc/xds/internal/xdsclient/load" - "google.golang.org/grpc/xds/internal/xdsclient/transport" - "google.golang.org/protobuf/proto" - "google.golang.org/protobuf/types/known/durationpb" - - v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" - v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" -) - -// Any per-RPC level logs which print complete request or response messages -// should be gated at this verbosity level. Other per-RPC level logs which print -// terse output should be at `INFO` and verbosity 2. -const perRPCVerbosityLevel = 9 - -// StreamImpl provides all the functionality associated with an LRS (Load Reporting -// Service) stream on the client-side. It manages the lifecycle of the LRS stream, -// including starting, stopping, and retrying the stream. It also provides a -// load.Store that can be used to report load, and a cleanup function that should -// be called when the load reporting is no longer needed. -type StreamImpl struct { - // The following fields are initialized when a Stream instance is created - // and are read-only afterwards, and hence can be accessed without a mutex. - transport transport.Transport // Transport to use for LRS stream. - backoff func(int) time.Duration // Backoff for retries, after stream failures. - nodeProto *v3corepb.Node // Identifies the gRPC application. - doneCh chan struct{} // To notify exit of LRS goroutine. - logger *igrpclog.PrefixLogger - - // Guards access to the below fields. - mu sync.Mutex - cancelStream context.CancelFunc // Cancel the stream. If nil, the stream is not active. - refCount int // Number of interested parties. - lrsStore *load.Store // Store returned to user for pushing loads. -} - -// StreamOpts holds the options for creating an lrsStream. -type StreamOpts struct { - Transport transport.Transport // xDS transport to create the stream on. - Backoff func(int) time.Duration // Backoff for retries, after stream failures. - NodeProto *v3corepb.Node // Node proto to identify the gRPC application. - LogPrefix string // Prefix to be used for log messages. -} - -// NewStreamImpl creates a new StreamImpl with the provided options. -// -// The actual streaming RPC call is initiated when the first call to ReportLoad -// is made, and is terminated when the last call to ReportLoad is canceled. -func NewStreamImpl(opts StreamOpts) *StreamImpl { - lrs := &StreamImpl{ - transport: opts.Transport, - backoff: opts.Backoff, - nodeProto: opts.NodeProto, - lrsStore: load.NewStore(), - } - - l := grpclog.Component("xds") - lrs.logger = igrpclog.NewPrefixLogger(l, opts.LogPrefix+fmt.Sprintf("[lrs-stream %p] ", lrs)) - return lrs -} - -// ReportLoad returns a load.Store that can be used to report load, and a -// cleanup function that should be called when the load reporting is no longer -// needed. -// -// The first call to ReportLoad sets the reference count to one, and starts the -// LRS streaming call. Subsequent calls increment the reference count and return -// the same load.Store. 
-// -// The cleanup function decrements the reference count and stops the LRS stream -// when the last reference is removed. -func (lrs *StreamImpl) ReportLoad() (*load.Store, func()) { - lrs.mu.Lock() - defer lrs.mu.Unlock() - - cleanup := sync.OnceFunc(func() { - lrs.mu.Lock() - defer lrs.mu.Unlock() - - if lrs.refCount == 0 { - lrs.logger.Errorf("Attempting to stop already stopped StreamImpl") - return - } - lrs.refCount-- - if lrs.refCount != 0 { - return - } - - if lrs.cancelStream == nil { - // It is possible that Stop() is called before the cleanup function - // is called, thereby setting cancelStream to nil. Hence we need a - // nil check here bofore invoking the cancel function. - return - } - lrs.cancelStream() - lrs.cancelStream = nil - lrs.logger.Infof("Stopping StreamImpl") - }) - - if lrs.refCount != 0 { - lrs.refCount++ - return lrs.lrsStore, cleanup - } - - lrs.refCount++ - ctx, cancel := context.WithCancel(context.Background()) - lrs.cancelStream = cancel - lrs.doneCh = make(chan struct{}) - go lrs.runner(ctx) - return lrs.lrsStore, cleanup -} - -// runner is responsible for managing the lifetime of an LRS streaming call. It -// creates the stream, sends the initial LoadStatsRequest, receives the first -// LoadStatsResponse, and then starts a goroutine to periodically send -// LoadStatsRequests. The runner will restart the stream if it encounters any -// errors. -func (lrs *StreamImpl) runner(ctx context.Context) { - defer close(lrs.doneCh) - - // This feature indicates that the client supports the - // LoadStatsResponse.send_all_clusters field in the LRS response. - node := proto.Clone(lrs.nodeProto).(*v3corepb.Node) - node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters") - - runLoadReportStream := func() error { - // streamCtx is created and canceled in case we terminate the stream - // early for any reason, to avoid gRPC-Go leaking the RPC's monitoring - // goroutine. - streamCtx, cancel := context.WithCancel(ctx) - defer cancel() - - stream, err := lrs.transport.CreateStreamingCall(streamCtx, "/envoy.service.load_stats.v3.LoadReportingService/StreamLoadStats") - if err != nil { - lrs.logger.Warningf("Failed to create new LRS streaming RPC: %v", err) - return nil - } - if lrs.logger.V(2) { - lrs.logger.Infof("LRS stream created") - } - - if err := lrs.sendFirstLoadStatsRequest(stream, node); err != nil { - lrs.logger.Warningf("Sending first LRS request failed: %v", err) - return nil - } - - clusters, interval, err := lrs.recvFirstLoadStatsResponse(stream) - if err != nil { - lrs.logger.Warningf("Reading from LRS streaming RPC failed: %v", err) - return nil - } - - // We reset backoff state when we successfully receive at least one - // message from the server. - lrs.sendLoads(streamCtx, stream, clusters, interval) - return backoff.ErrResetBackoff - } - backoff.RunF(ctx, runLoadReportStream, lrs.backoff) -} - -// sendLoads is responsible for periodically sending load reports to the LRS -// server at the specified interval for the specified clusters, until the passed -// in context is canceled. 
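ReportLoad above combines a reference count with a once-only cleanup so that the shared LRS stream is started by the first interested party and stopped by the last. A stripped-down sketch of that scheme, using invented names and print statements in place of real stream management:

```go
package main

import (
	"fmt"
	"sync"
)

// reporter hands out a cleanup function per caller; the first caller starts
// the shared work and the last cleanup stops it. Each cleanup is safe to call
// more than once thanks to sync.OnceFunc.
type reporter struct {
	mu       sync.Mutex
	refCount int
	stop     func() // non-nil while the shared work is running
}

func (r *reporter) start() (cleanup func()) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.refCount == 0 {
		fmt.Println("starting shared stream")
		r.stop = func() { fmt.Println("stopping shared stream") }
	}
	r.refCount++

	return sync.OnceFunc(func() {
		r.mu.Lock()
		defer r.mu.Unlock()
		r.refCount--
		if r.refCount == 0 && r.stop != nil {
			r.stop()
			r.stop = nil
		}
	})
}

func main() {
	var r reporter
	c1 := r.start() // prints "starting shared stream"
	c2 := r.start() // shares the running stream
	c1()
	c1() // second call is a no-op because of sync.OnceFunc
	c2() // prints "stopping shared stream"
}
```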
-func (lrs *StreamImpl) sendLoads(ctx context.Context, stream transport.StreamingCall, clusterNames []string, interval time.Duration) { - tick := time.NewTicker(interval) - defer tick.Stop() - for { - select { - case <-tick.C: - case <-ctx.Done(): - return - } - if err := lrs.sendLoadStatsRequest(stream, lrs.lrsStore.Stats(clusterNames)); err != nil { - lrs.logger.Warningf("Writing to LRS stream failed: %v", err) - return - } - } -} - -func (lrs *StreamImpl) sendFirstLoadStatsRequest(stream transport.StreamingCall, node *v3corepb.Node) error { - req := &v3lrspb.LoadStatsRequest{Node: node} - if lrs.logger.V(perRPCVerbosityLevel) { - lrs.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req)) - } - err := stream.Send(req) - if err == io.EOF { - return getStreamError(stream) - } - return err -} - -// recvFirstLoadStatsResponse receives the first LoadStatsResponse from the LRS -// server. Returns the following: -// - a list of cluster names requested by the server or an empty slice if the -// server requested for load from all clusters -// - the load reporting interval, and -// - any error encountered -func (lrs *StreamImpl) recvFirstLoadStatsResponse(stream transport.StreamingCall) ([]string, time.Duration, error) { - r, err := stream.Recv() - if err != nil { - return nil, 0, fmt.Errorf("lrs: failed to receive first LoadStatsResponse: %v", err) - } - resp, ok := r.(*v3lrspb.LoadStatsResponse) - if !ok { - return nil, time.Duration(0), fmt.Errorf("lrs: unexpected message type %T", r) - } - if lrs.logger.V(perRPCVerbosityLevel) { - lrs.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(resp)) - } - - interval := resp.GetLoadReportingInterval() - if err := interval.CheckValid(); err != nil { - return nil, 0, fmt.Errorf("lrs: invalid load_reporting_interval: %v", err) - } - loadReportingInterval := interval.AsDuration() - - clusters := resp.Clusters - if resp.SendAllClusters { - // Return an empty slice to send stats for all clusters. 
- clusters = []string{} - } - - return clusters, loadReportingInterval, nil -} - -func (lrs *StreamImpl) sendLoadStatsRequest(stream transport.StreamingCall, loads []*load.Data) error { - clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads)) - for _, sd := range loads { - droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.Drops)) - for category, count := range sd.Drops { - droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{ - Category: category, - DroppedCount: count, - }) - } - localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.LocalityStats)) - for l, localityData := range sd.LocalityStats { - lid, err := internal.LocalityIDFromString(l) - if err != nil { - return err - } - loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.LoadStats)) - for name, loadData := range localityData.LoadStats { - loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{ - MetricName: name, - NumRequestsFinishedWithMetric: loadData.Count, - TotalMetricValue: loadData.Sum, - }) - } - localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{ - Locality: &v3corepb.Locality{ - Region: lid.Region, - Zone: lid.Zone, - SubZone: lid.SubZone, - }, - TotalSuccessfulRequests: localityData.RequestStats.Succeeded, - TotalRequestsInProgress: localityData.RequestStats.InProgress, - TotalErrorRequests: localityData.RequestStats.Errored, - TotalIssuedRequests: localityData.RequestStats.Issued, - LoadMetricStats: loadMetricStats, - UpstreamEndpointStats: nil, // TODO: populate for per endpoint loads. - }) - } - - clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{ - ClusterName: sd.Cluster, - ClusterServiceName: sd.Service, - UpstreamLocalityStats: localityStats, - TotalDroppedRequests: sd.TotalDrops, - DroppedRequests: droppedReqs, - LoadReportInterval: durationpb.New(sd.ReportInterval), - }) - } - - req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats} - if lrs.logger.V(perRPCVerbosityLevel) { - lrs.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req)) - } - err := stream.Send(req) - if err == io.EOF { - return getStreamError(stream) - } - return err -} - -func getStreamError(stream transport.StreamingCall) error { - for { - if _, err := stream.Recv(); err != nil { - return err - } - } -} - -// Stop blocks until the stream is closed and all spawned goroutines exit. -func (lrs *StreamImpl) Stop() { - lrs.mu.Lock() - defer lrs.mu.Unlock() - - if lrs.cancelStream == nil { - return - } - lrs.cancelStream() - lrs.cancelStream = nil - lrs.logger.Infof("Stopping LRS stream") - <-lrs.doneCh -} diff --git a/xds/internal/xdsclient/transport/transport_interface.go b/xds/internal/xdsclient/transport/transport_interface.go deleted file mode 100644 index 48ce82a06e9d..000000000000 --- a/xds/internal/xdsclient/transport/transport_interface.go +++ /dev/null @@ -1,64 +0,0 @@ -/* - * - * Copyright 2024 gRPC authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Package transport defines the interface that describe the functionality -// required to communicate with an xDS server using streaming calls. -package transport - -import ( - "context" - - "google.golang.org/grpc/internal/xds/bootstrap" -) - -// Builder is an interface for building a new xDS transport. -type Builder interface { - // Build creates a new xDS transport with the provided options. - Build(opts BuildOptions) (Transport, error) -} - -// BuildOptions contains the options for building a new xDS transport. -type BuildOptions struct { - // ServerConfig contains the configuration that controls how the transport - // interacts with the xDS server. This includes the server URI and the - // credentials to use to connect to the server, among other things. - ServerConfig *bootstrap.ServerConfig -} - -// Transport provides the functionality to communicate with an xDS server using -// streaming calls. -type Transport interface { - // CreateStreamingCall creates a new streaming call to the xDS server for the - // specified method name. The returned StreamingCall interface can be used to - // send and receive messages on the stream. - CreateStreamingCall(context.Context, string) (StreamingCall, error) - - // Close closes the underlying connection and cleans up any resources used by the - // Transport. - Close() error -} - -// StreamingCall is an interface that provides a way to send and receive -// messages on a stream. The methods accept or return any.Any messages instead -// of concrete types to allow this interface to be used for both ADS and LRS. -type StreamingCall interface { - // Send sends the provided message on the stream. - Send(any) error - - // Recv block until the next message is received on the stream. - Recv() (any, error) -} diff --git a/xds/internal/xdsclient/xdsresource/cluster_resource_type.go b/xds/internal/xdsclient/xdsresource/cluster_resource_type.go index 3d85c31ff433..8c8d196e07a4 100644 --- a/xds/internal/xdsclient/xdsresource/cluster_resource_type.go +++ b/xds/internal/xdsclient/xdsresource/cluster_resource_type.go @@ -19,6 +19,8 @@ package xdsresource import ( "google.golang.org/grpc/internal/pretty" + "google.golang.org/grpc/internal/xds/bootstrap" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" @@ -150,3 +152,9 @@ func WatchCluster(p Producer, name string, w ClusterWatcher) (cancel func()) { delegator := &delegatingClusterWatcher{watcher: w} return p.WatchResource(clusterType, name, delegator) } + +// NewGenericClusterResourceTypeDecoder returns a gxdsclient.Decoder that +// wraps the xdsresource.clusterType. 
+func NewGenericClusterResourceTypeDecoder(bc *bootstrap.Config, gServerCfgMap map[gxdsclient.ServerConfig]*bootstrap.ServerConfig) gxdsclient.Decoder { + return &genericResourceTypeDecoder{xdsResourceType: clusterType, bootstrapConfig: bc, gServerConfigMap: gServerCfgMap} +} diff --git a/xds/internal/xdsclient/xdsresource/endpoints_resource_type.go b/xds/internal/xdsclient/xdsresource/endpoints_resource_type.go index de574dd8d345..fecca7bdbf7e 100644 --- a/xds/internal/xdsclient/xdsresource/endpoints_resource_type.go +++ b/xds/internal/xdsclient/xdsresource/endpoints_resource_type.go @@ -19,6 +19,7 @@ package xdsresource import ( "google.golang.org/grpc/internal/pretty" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" @@ -146,3 +147,9 @@ func WatchEndpoints(p Producer, name string, w EndpointsWatcher) (cancel func()) delegator := &delegatingEndpointsWatcher{watcher: w} return p.WatchResource(endpointsType, name, delegator) } + +// NewGenericEndpointsResourceTypeDecoder returns a gxdsclient.Decoder that +// wraps the xdsresource.endpointsType. +func NewGenericEndpointsResourceTypeDecoder() gxdsclient.Decoder { + return &genericResourceTypeDecoder{xdsResourceType: endpointsType} +} diff --git a/xds/internal/xdsclient/xdsresource/listener_resource_type.go b/xds/internal/xdsclient/xdsresource/listener_resource_type.go index 0f49e6c56a3a..dbd6bc275eee 100644 --- a/xds/internal/xdsclient/xdsresource/listener_resource_type.go +++ b/xds/internal/xdsclient/xdsresource/listener_resource_type.go @@ -22,6 +22,7 @@ import ( "google.golang.org/grpc/internal/pretty" "google.golang.org/grpc/internal/xds/bootstrap" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" @@ -103,7 +104,6 @@ func (listenerResourceType) Decode(opts *DecodeOptions, resource *anypb.Any) (*D } return &DecodeResult{Name: name, Resource: &ListenerResourceData{Resource: listener}}, nil - } // ListenerResourceData wraps the configuration of a Listener resource as @@ -127,7 +127,6 @@ func (l *ListenerResourceData) RawEqual(other ResourceData) bool { return false } return proto.Equal(l.Resource.Raw, other.Raw()) - } // ToJSON returns a JSON string representation of the resource data. @@ -182,3 +181,9 @@ func WatchListener(p Producer, name string, w ListenerWatcher) (cancel func()) { delegator := &delegatingListenerWatcher{watcher: w} return p.WatchResource(listenerType, name, delegator) } + +// NewGenericListenerResourceTypeDecoder returns a gxdsclient.Decoder that wraps +// the xdsresource.listenerType. 
+func NewGenericListenerResourceTypeDecoder(bc *bootstrap.Config) gxdsclient.Decoder { + return &genericResourceTypeDecoder{xdsResourceType: listenerType, bootstrapConfig: bc} +} diff --git a/xds/internal/xdsclient/xdsresource/resource_type.go b/xds/internal/xdsclient/xdsresource/resource_type.go index c22c5a6a3a84..db0d5eeb0f67 100644 --- a/xds/internal/xdsclient/xdsresource/resource_type.go +++ b/xds/internal/xdsclient/xdsresource/resource_type.go @@ -25,8 +25,11 @@ package xdsresource import ( + "fmt" + "google.golang.org/grpc/internal/xds/bootstrap" xdsinternal "google.golang.org/grpc/xds/internal" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/types/known/anypb" ) @@ -167,3 +170,107 @@ func (r resourceTypeState) TypeName() string { func (r resourceTypeState) AllResourcesRequiredInSotW() bool { return r.allResourcesRequiredInSotW } + +// genericResourceTypeDecoder embeds an xdsresource.Type and implements +// gxdsclient.Decoder. +type genericResourceTypeDecoder struct { + xdsResourceType Type + bootstrapConfig *bootstrap.Config + gServerConfigMap map[gxdsclient.ServerConfig]*bootstrap.ServerConfig +} + +// Decode deserializes and validates resource bytes of an xDS resource received +// from the xDS management server. +func (a *genericResourceTypeDecoder) Decode(resourceBytes []byte, gOpts gxdsclient.DecodeOptions) (*gxdsclient.DecodeResult, error) { + raw := &anypb.Any{TypeUrl: a.xdsResourceType.TypeURL(), Value: resourceBytes} + opts := &DecodeOptions{BootstrapConfig: a.bootstrapConfig} + if gOpts.ServerConfig != nil { + opts.ServerConfig = a.gServerConfigMap[*gOpts.ServerConfig] + } + + result, err := a.xdsResourceType.Decode(opts, raw) + if result == nil { + return nil, err + } + if err != nil { + return &gxdsclient.DecodeResult{Name: result.Name}, err + } + + return &gxdsclient.DecodeResult{Name: result.Name, Resource: &genericResourceData{xdsResourceData: result.Resource}}, nil +} + +// genericResourceData embeds an xdsresource.ResourceData and implements +// gxdsclient.ResourceData. +type genericResourceData struct { + xdsResourceData ResourceData +} + +// Equal returns true if the passed in gxdsclient.ResourceData +// is equal to that of the receiver. +func (a *genericResourceData) Equal(other gxdsclient.ResourceData) bool { + if other == nil { + return false + } + otherResourceData, ok := other.(*genericResourceData) + if !ok { + return false + } + return a.xdsResourceData.RawEqual(otherResourceData.xdsResourceData) +} + +// Bytes returns the underlying raw bytes of the wrapped resource. +func (a *genericResourceData) Bytes() []byte { + rawAny := a.xdsResourceData.Raw() + if rawAny == nil { + return nil + } + return rawAny.Value +} + +// genericResourceWatcher embeds ResourceWatcher and implements +// gxdsclient.ResourceWatcher. +type genericResourceWatcher struct { + xdsResourceWatcher ResourceWatcher +} + +// ResourceChanged indicates a new version of the wrapped resource is +// available. 
+func (a *genericResourceWatcher) ResourceChanged(gData gxdsclient.ResourceData, done func()) { + if gData == nil { + a.xdsResourceWatcher.ResourceChanged(nil, done) + return + } + + grd, ok := gData.(*genericResourceData) + if !ok { + err := fmt.Errorf("genericResourceWatcher received unexpected gxdsclient.ResourceData type %T, want *genericResourceData", gData) + a.xdsResourceWatcher.ResourceError(err, done) + return + } + a.xdsResourceWatcher.ResourceChanged(grd.xdsResourceData, done) +} + +// ResourceError indicates an error occurred while trying to fetch or +// decode the associated wrapped resource. The previous version of the +// wrapped resource should be considered invalid. +func (a *genericResourceWatcher) ResourceError(err error, done func()) { + a.xdsResourceWatcher.ResourceError(err, done) +} + +// AmbientError indicates an error occurred after a resource has been +// received that should not modify the use of that wrapped resource but may +// provide useful information about the state of the XDSClient for debugging +// purposes. The previous version of the wrapped resource should still be +// considered valid. +func (a *genericResourceWatcher) AmbientError(err error, done func()) { + a.xdsResourceWatcher.AmbientError(err, done) +} + +// GenericResourceWatcher returns a gxdsclient.ResourceWatcher that wraps an +// xdsresource.ResourceWatcher to make it compatible with gxdsclient.ResourceWatcher. +func GenericResourceWatcher(xdsResourceWatcher ResourceWatcher) gxdsclient.ResourceWatcher { + if xdsResourceWatcher == nil { + return nil + } + return &genericResourceWatcher{xdsResourceWatcher: xdsResourceWatcher} +} diff --git a/xds/internal/xdsclient/xdsresource/route_config_resource_type.go b/xds/internal/xdsclient/xdsresource/route_config_resource_type.go index c292b1b8ef2c..11e790cb30d8 100644 --- a/xds/internal/xdsclient/xdsresource/route_config_resource_type.go +++ b/xds/internal/xdsclient/xdsresource/route_config_resource_type.go @@ -19,6 +19,7 @@ package xdsresource import ( "google.golang.org/grpc/internal/pretty" + gxdsclient "google.golang.org/grpc/xds/internal/clients/xdsclient" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" @@ -148,3 +149,9 @@ func WatchRouteConfig(p Producer, name string, w RouteConfigWatcher) (cancel fun delegator := &delegatingRouteConfigWatcher{watcher: w} return p.WatchResource(routeConfigType, name, delegator) } + +// NewGenericRouteConfigResourceTypeDecoder returns a gxdsclient.Decoder that +// wraps the xdsresource.routeConfigType. 
+func NewGenericRouteConfigResourceTypeDecoder() gxdsclient.Decoder { + return &genericResourceTypeDecoder{xdsResourceType: routeConfigType} +} diff --git a/xds/server_ext_test.go b/xds/server_ext_test.go index cfc64d719f52..9b8e56d09e3f 100644 --- a/xds/server_ext_test.go +++ b/xds/server_ext_test.go @@ -35,19 +35,24 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/internal" "google.golang.org/grpc/internal/grpctest" "google.golang.org/grpc/internal/stubserver" "google.golang.org/grpc/internal/testutils" "google.golang.org/grpc/internal/testutils/xds/e2e" "google.golang.org/grpc/internal/xds/bootstrap" "google.golang.org/grpc/peer" + "google.golang.org/grpc/resolver" "google.golang.org/grpc/status" "google.golang.org/grpc/xds" "google.golang.org/grpc/xds/internal/xdsclient" + v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" + v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" + testgrpc "google.golang.org/grpc/interop/grpc_testing" testpb "google.golang.org/grpc/interop/grpc_testing" ) @@ -525,3 +530,159 @@ func (s) TestServer_MultipleServers_DifferentBootstrapConfigurations(t *testing. t.Errorf("Connected to wrong peer: %s, want %s", peer2.Addr, lis2.Addr()) } } + +// TestXDSRace_ClientStuckOnLDS reproduces a race condition where an xDS client +// can get stuck waiting for an LDS resource if channels are created and shut +// down rapidly for the same target. +func TestXDSRace_ClientStuckOnLDS(t *testing.T) { + wrappedLis := testutils.NewListenerWrapper(t, nil) + lis := testutils.NewRestartableListener(wrappedLis) + + mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: lis}) + + backend := stubserver.StartTestService(t, nil) + defer backend.Stop() + + nodeID := uuid.New().String() + const targetName = "my-xds-target-for-lds-race.com" + const routeConfigName = "route-for-lds-race-target" + const clusterName = "cluster-for-lds-race-target" + const edsServiceName = clusterName + + // Generate bootstrap configuration with the above two servers. + bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ + Servers: []byte(fmt.Sprintf(`[ + { + "server_uri": %q, + "channel_creds": [{"type": "insecure"}] + }]`, mgmtServer.Address)), + Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), + }) + if err != nil { + t.Fatalf("Failed to create bootstrap file: %v", err) + } + + // Create an xDS client with the above bootstrap configuration. + config, err := bootstrap.NewConfigFromContents(bootstrapContents) + if err != nil { + t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) + } + pool := xdsclient.NewPool(config) + if err != nil { + t.Fatalf("Failed to create xDS client: %v", err) + } + + // Get the xDS resolver to use the above xDS client. 
+ resolverBuilder := internal.NewXDSResolverWithPoolForTesting.(func(*xdsclient.Pool) (resolver.Builder, error)) + resolver, err := resolverBuilder(pool) + if err != nil { + t.Fatalf("Failed to create xDS resolver for testing: %v", err) + } + + listener := e2e.DefaultClientListener(targetName, routeConfigName) + routeConfig := e2e.DefaultRouteConfig(routeConfigName, targetName, clusterName) + cluster := e2e.DefaultCluster(clusterName, edsServiceName, e2e.SecurityLevelNone) + endpoints := e2e.DefaultEndpoint(edsServiceName, "localhost", []uint32{testutils.ParsePort(t, backend.Address)}) + + numIterations := 20 // Number of attempts to trigger the race. + // Each iteration can take up to ~20-25s. Add a buffer for overall test timeout. + overallTestTimeout := time.Duration(numIterations)*25*time.Second + 30*time.Second + if overallTestTimeout < 60*time.Second { // Minimum reasonable timeout + overallTestTimeout = 60 * time.Second + } + + ctx, cancel := context.WithTimeout(context.Background(), overallTestTimeout) + defer cancel() + + if err := mgmtServer.Update(ctx, e2e.UpdateOptions{ + NodeID: nodeID, + Listeners: []*v3listenerpb.Listener{listener}, + Routes: []*v3routepb.RouteConfiguration{routeConfig}, + Clusters: []*v3clusterpb.Cluster{cluster}, + Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoints}, + SkipValidation: true, + }); err != nil { + t.Fatalf("Failed to update management server: %v", err) + } + + raceConditionHit := false + var hitIteration int + + for i := 0; i < numIterations; i++ { + select { + case <-ctx.Done(): + t.Errorf("Overall test timeout reached at iteration %d before completing all iterations or hitting race.", i) + goto endLoop // Use goto to break out of the outer loop and proceed to final logging. + default: + } + + t.Logf("Starting iteration %d to trigger race condition", i) + // Iteration timeout must be > 15s (LDS timeout) + RPC processing + buffer. + iterCtx, iterCancel := context.WithTimeout(ctx, 25*time.Second) + + var cc1 *grpc.ClientConn + var errDial error + cc1, errDial = grpc.NewClient(fmt.Sprintf("xds:///%s", targetName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver)) + if errDial != nil { + t.Logf("Iteration %d: cc1.NewClient failed: %v", i, errDial) + } + + client1 := testgrpc.NewTestServiceClient(cc1) + rpcCtx, rpcCancel := context.WithTimeout(iterCtx, 5*time.Second) + _, errRPC := client1.EmptyCall(rpcCtx, &testpb.Empty{}) // Prime xDS for cc1. + rpcCancel() + if errRPC != nil { + t.Logf("Iteration %d: cc1 RPC failed: %v", i, errRPC) + } else { + t.Logf("Iteration %d: cc1 RPC successful", i) + } + go func() { + cc1.Close() // This initiates unsubscription when goroutine exits. + }() + + cc2, errDial := grpc.NewClient(fmt.Sprintf("xds:///%s", targetName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver)) // Channel 2 created "around the same time". + if errDial != nil { + t.Errorf("Iteration %d: cc2.NewClient failed: %v", i, errDial) + iterCancel() + continue + } + + client2 := testgrpc.NewTestServiceClient(cc2) + rpcAttemptCtx, rpcAttemptCancel := context.WithTimeout(iterCtx, 18*time.Second) // RPC on cc2, expect >15s hang if race hits. + startTime := time.Now() + _, rpcErr := client2.EmptyCall(rpcAttemptCtx, &testpb.Empty{}) + duration := time.Since(startTime) + rpcAttemptCancel() + + cc2.Close() // Clean up cc2. 
+ + if rpcErr != nil { + st, _ := status.FromError(rpcErr) + isUnavailable := st.Code() == codes.Unavailable + rpcContextTimedOut := rpcAttemptCtx.Err() == context.DeadlineExceeded + + t.Logf("Iteration %d: cc2 RPC failed after %v. Error: %q. Code: %s. RPC_CTX_ERR: %v", i, duration, rpcErr, st.Code(), rpcAttemptCtx.Err()) + + // Symptom of the race: RPC takes ~15s and fails (Unavailable or context timeout). + if duration > 14*time.Second && duration < 20*time.Second { + if isUnavailable || rpcContextTimedOut { + t.Errorf("Iteration %d: cc2 RPC behavior consistent with LDS timeout due to race. Duration: %v, Error: %v. RACE CONDITION HIT.", i, duration, rpcErr) + raceConditionHit = true + hitIteration = i + iterCancel() + goto endLoop // Exit loop, race condition reproduced. + } + } + } else { + t.Logf("Iteration %d: cc2 RPC succeeded in %v (race not hit this iteration)", i, duration) + } + iterCancel() // Clean up this iteration's context. + } + +endLoop: + if raceConditionHit { + t.Logf("Successfully reproduced the race condition on iteration %d where channel 2's RPC was stuck.", hitIteration) + } else { + t.Logf("Race condition not reproduced after %d iterations. The bug might be fixed or is difficult to trigger consistently.", numIterations) + } +} diff --git a/xds/server_resource_ext_test.go b/xds/server_resource_ext_test.go index 364d27a0f2f4..b396eb9ffc6c 100644 --- a/xds/server_resource_ext_test.go +++ b/xds/server_resource_ext_test.go @@ -25,6 +25,7 @@ import ( "net" "strings" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/uuid" @@ -32,14 +33,11 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/internal" "google.golang.org/grpc/internal/testutils" "google.golang.org/grpc/internal/testutils/xds/e2e" "google.golang.org/grpc/internal/xds/bootstrap" "google.golang.org/grpc/xds" - xdsinternal "google.golang.org/grpc/xds/internal" "google.golang.org/grpc/xds/internal/xdsclient" - "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" "google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version" "google.golang.org/protobuf/types/known/wrapperspb" @@ -56,12 +54,13 @@ import ( // Tests the case where an LDS points to an RDS which returns resource not // found. Before getting the resource not found, the xDS Server has not received // all configuration needed, so it should Accept and Close any new connections. -// After it has received the resource not found error, the server should move to -// serving, successfully Accept Connections, and fail at the L7 level with -// resource not found specified. +// After it has received the resource not found error (due to short watch +// expiry), the server should move to serving, successfully Accept Connections, +// and fail at the L7 level with resource not found specified. 
func (s) TestServer_RouteConfiguration_ResourceNotFound(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) defer cancel() + routeConfigNamesCh := make(chan []string, 1) managementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error { @@ -91,7 +90,6 @@ func (s) TestServer_RouteConfiguration_ResourceNotFound(t *testing.T) { if err != nil { t.Fatalf("Failed to retrieve host and port of server: %v", err) } - const routeConfigResourceName = "routeName" listener := e2e.DefaultServerListenerWithRouteConfigName(host, port, e2e.SecurityLevelNone, routeConfigResourceName) resources := e2e.UpdateOptions{ @@ -99,17 +97,29 @@ func (s) TestServer_RouteConfiguration_ResourceNotFound(t *testing.T) { Listeners: []*v3listenerpb.Listener{listener}, SkipValidation: true, } - if err := managementServer.Update(ctx, resources); err != nil { t.Fatal(err) } + modeChangeHandler := newServingModeChangeHandler(t) modeChangeOpt := xds.ServingModeCallback(modeChangeHandler.modeChangeCallback) + config, err := bootstrap.NewConfigFromContents(bootstrapContents) if err != nil { t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) } + // Create a specific xDS client instance within that pool for the server, + // configuring it with a short WatchExpiryTimeout. pool := xdsclient.NewPool(config) + _, serverXDSClientClose, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{ + Name: xdsclient.NameForServer, + WatchExpiryTimeout: 500 * time.Millisecond, + }) + if err != nil { + t.Fatalf("Failed to create xDS client for server: %v", err) + } + defer serverXDSClientClose() + // Start an xDS-enabled gRPC server using the above client from the pool. createStubServer(t, lis, modeChangeOpt, xds.ClientPoolForTesting(pool)) // Wait for the route configuration resource to be requested from the @@ -123,29 +133,20 @@ func (s) TestServer_RouteConfiguration_ResourceNotFound(t *testing.T) { t.Fatal("Timeout waiting for route config resource to be requested") } + // Do NOT send the RDS resource. The xDS client's watch expiry timer will + // fire. After the RDS resource is deemed "not found" (due to the short + // watch expiry), the server will transition to SERVING mode. + cc, err := grpc.NewClient(lis.Addr().String(), grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { t.Fatalf("failed to dial local test server: %v", err) } defer cc.Close() + // Before the watch expiry, the server is NOT_SERVING, RPCs should fail with UNAVAILABLE. waitForFailedRPCWithStatus(ctx, t, cc, codes.Unavailable, "", "") - // Lookup the xDS client in use based on the dedicated well-known key, as - // defined in A71, used by the xDS enabled gRPC server. - xdsC, close, err := pool.GetClientForTesting(xdsclient.NameForServer) - if err != nil { - t.Fatalf("Failed to find xDS client for configuration: %v", string(bootstrapContents)) - } - defer close() - - // Invoke resource not found error for the route configuration resource. - // This should cause the server to go SERVING, but fail RPCs with the - // appropriate error code. 
- triggerResourceNotFound := internal.TriggerXDSResourceNotFoundForTesting.(func(xdsclient.XDSClient, xdsresource.Type, string) error) - routeConfigResourceType := xdsinternal.ResourceTypeMapForTesting[version.V3RouteConfigURL].(xdsresource.Type) - if err := triggerResourceNotFound(xdsC, routeConfigResourceType, routeConfigResourceName); err != nil { - t.Fatalf("Failed to trigger resource name not found for testing: %v", err) - } + // Wait for the xDS-enabled gRPC server to go SERVING. This should happen + // after the RDS watch expiry timer fires. select { case <-ctx.Done(): t.Fatal("Timeout waiting for the xDS-enabled gRPC server to go SERVING") @@ -154,6 +155,8 @@ func (s) TestServer_RouteConfiguration_ResourceNotFound(t *testing.T) { t.Fatalf("Mode changed to %v, want %v", gotMode, connectivity.ServingModeServing) } } + // After watch expiry, the server should be SERVING, but RPCs should fail + // at the L7 level with resource not found. waitForFailedRPCWithStatus(ctx, t, cc, codes.Unavailable, "error from xDS configuration for matched route configuration", nodeID) }
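The updated TestServer_RouteConfiguration_ResourceNotFound relies on a short watch-expiry timeout: when the RDS resource never arrives, the expiry timer fires, the resource is treated as not found, and the server can move to SERVING while RPCs fail at the L7 level. A self-contained sketch of that timer pattern, with all names invented for illustration:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// watch models a single resource watch that either receives a response in
// time or is declared expired by a timer, similar in spirit to the watch
// expiry behavior the test depends on.
type watch struct {
	mu    sync.Mutex
	state string // "requested" -> "received" or "timeout"
	timer *time.Timer
}

// startWatch arms an expiry timer; if it fires before a response arrives,
// the watch is marked as timed out and onExpiry is invoked.
func startWatch(expiry time.Duration, onExpiry func()) *watch {
	w := &watch{state: "requested"}
	w.timer = time.AfterFunc(expiry, func() {
		w.mu.Lock()
		w.state = "timeout"
		w.timer = nil
		w.mu.Unlock()
		onExpiry()
	})
	return w
}

// onResponse stops the expiry timer when the resource arrives in time.
func (w *watch) onResponse() {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.state == "requested" {
		w.state = "received"
		if w.timer != nil {
			w.timer.Stop()
			w.timer = nil
		}
	}
}

func main() {
	done := make(chan struct{})
	// With a short expiry and no response from the management server, the
	// watch times out quickly, which is what lets the xDS-enabled server in
	// the test transition to SERVING without ever receiving the RDS resource.
	startWatch(100*time.Millisecond, func() {
		fmt.Println("resource not found: watch expired")
		close(done)
	})
	<-done
}
```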