chore(jobsdb): disable no results cache state filter optimization (#6964)

atzoum · web-flow · commit c20dad2998a6 · 2026-05-25T10:14:05.000+03:00
🔒 Scanned for secrets using gitleaks 8.30.1

# Description

Makes the per-state `noResultsCache` optimization in `jobsdb`
configurable, and **disables it by default**.

The optimization has two parts in `getJobsDS`:
1. **Read-side pruning** — `stateFilters` are narrowed against the cache
before querying, skipping states the cache reported as empty for the
current `(ds, partitions, workspace, customVals, params)` key.
2. **Write-side commit predicate** — when a queried state is absent from
the result set and no limit was reached, a "no jobs of state X" entry is
committed to the cache.

Both are now gated behind a single reloadable flag
`noResultsCacheStateOptimization`:
- `JobsDB.<tablePrefix>.noResultsCacheStateOptimization`
- `JobsDB.noResultsCacheStateOptimization`

Default is `false` (optimization disabled).

Also fixes a variable-shadowing bug in `getJobsDS` where the outer
`payloadSize` was never updated due to an inner shadow, making the
post-loop `payloadSize >= PayloadSizeLimit` check dead code. As a
result, `LimitsReached` was wrongly reported as `false` when the payload
limit was reached at the boundary (exact total) or when a single
oversize job exhausted the budget alone. This is one of the concrete
mechanisms that can poison the `noResultsCache` — see the reasoning
below.

## Reasoning

We suspect the per-state optimization is the root cause of
`noResultsCache` being populated with stale entries, which in turn leads
to **out-of-order processing**: a state that actually has jobs is
wrongly cached as empty, so subsequent reads skip those jobs from one
dataset and process later jobs from another dataset first.

The write-side commit predicate `(!ok && !limitsReached)` assumes that
when a state is missing from the result set and limits were not reached,
we have exhaustively scanned the dataset for that state. This invariant
can be broken by silent row-dropping filters that are not part of the
cache key, or by an incorrect calculation of when limits are reached, and
the resulting cache entry then poisons later reads.

The `payloadSize` shadowing bug fixed in this PR is a concrete instance
of the second class: at the payload-budget boundary `limitsReached` was
wrongly returned as `false`, so a state missing from the result set
(e.g. because the budget was exhausted before any state-X row appeared)
would satisfy `(!ok && !limitsReached)` and get committed to the cache
as "no jobs for this state" — a stale entry that subsequent reads would
honour, skipping real jobs and triggering out-of-order processing.

Disabling the per-state optimization by default keeps only the coarser
`len(jobList) == 0` commit path, which can only record "no rows at all"
results and is therefore not vulnerable to per-state misattribution.

The optimization can be re-enabled per-deployment via the reloadable
flag while we investigate and harden the invariants.

## Linear Ticket

resolves PIPE-2994

## Security

- [x] The code changed/added as part of this pull request won't create
any security issues with how the software is being used.
diff --git a/cluster/migrator/partitionmigration/server/grpc_server.go b/cluster/migrator/partitionmigration/server/grpc_server.go
@@ -2,6 +2,7 @@
 package server
 
 import (
+	"errors"
 	"fmt"
 	"net"
 	"sync"
@@ -42,7 +43,7 @@ func (s *GRPCServer) Start() error {
 		return fmt.Errorf("failed to listen: %w", err)
 	}
 	s.wg.Go(func() {
-		if err := s.server.Serve(lis); err != nil {
+		if err := s.server.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
 			// This shouldn't really happen, only in very exceptional cases.
 			// No error is returned during GracefulStop or Stop.
 			panic(fmt.Errorf("failed to serve grpc server: %w", err))
diff --git a/jobsdb/jobsdb.go b/jobsdb/jobsdb.go
@@ -588,33 +588,34 @@ type Handle struct {
 
 	config *config.Config
 	conf   struct {
-		payloadColumnType              payloadColumnType
-		maxTableSize                   config.ValueLoader[int64]
-		cacheExpiration                config.ValueLoader[time.Duration]
-		addNewDSLoopSleepDuration      config.ValueLoader[time.Duration]
-		addNewDSTimeout                config.ValueLoader[time.Duration]
-		refreshDSListLoopSleepDuration config.ValueLoader[time.Duration]
-		refreshDSTimeout               config.ValueLoader[time.Duration]
-		minDSRetentionPeriod           config.ValueLoader[time.Duration]
-		maxDSRetentionPeriod           config.ValueLoader[time.Duration]
-		jobMaxAge                      config.ValueLoader[time.Duration]
-		writeCapacity                  chan struct{}
-		readCapacity                   chan struct{}
-		enableWriterQueue              bool
-		enableReaderQueue              bool
-		clearAll                       bool
-		skipMaintenanceError           bool
-		dsLimit                        config.ValueLoader[int]
-		maxReaders                     int
-		maxWriters                     int
-		maxOpenConnections             int
-		analyzeThreshold               config.ValueLoader[int]
-		MaxDSSize                      config.ValueLoader[int]
-		numPartitions                  int // if zero or negative, no partitioning
-		partitionFunction              func(job *JobT) string
-		warnOnStatusMissingPartitionID config.ValueLoader[bool]
-		holdDSListLockDuringStore      config.ValueLoader[bool] // escape hatch: hold the dsList read lock for the entire store callback
-		dbTablesVersion                int                      // version of the database tables schema (0 means latest)
+		payloadColumnType               payloadColumnType
+		maxTableSize                    config.ValueLoader[int64]
+		cacheExpiration                 config.ValueLoader[time.Duration]
+		addNewDSLoopSleepDuration       config.ValueLoader[time.Duration]
+		addNewDSTimeout                 config.ValueLoader[time.Duration]
+		refreshDSListLoopSleepDuration  config.ValueLoader[time.Duration]
+		refreshDSTimeout                config.ValueLoader[time.Duration]
+		minDSRetentionPeriod            config.ValueLoader[time.Duration]
+		maxDSRetentionPeriod            config.ValueLoader[time.Duration]
+		jobMaxAge                       config.ValueLoader[time.Duration]
+		writeCapacity                   chan struct{}
+		readCapacity                    chan struct{}
+		enableWriterQueue               bool
+		enableReaderQueue               bool
+		clearAll                        bool
+		skipMaintenanceError            bool
+		dsLimit                         config.ValueLoader[int]
+		maxReaders                      int
+		maxWriters                      int
+		maxOpenConnections              int
+		analyzeThreshold                config.ValueLoader[int]
+		MaxDSSize                       config.ValueLoader[int]
+		numPartitions                   int // if zero or negative, no partitioning
+		partitionFunction               func(job *JobT) string
+		warnOnStatusMissingPartitionID  config.ValueLoader[bool]
+		holdDSListLockDuringStore       config.ValueLoader[bool] // escape hatch: hold the dsList read lock for the entire store callback
+		noResultsCacheStateOptimization config.ValueLoader[bool]
+		dbTablesVersion                 int // version of the database tables schema (0 means latest)
 
 		migration struct {
 			maxMigrateOnce, maxMigrateDSProbe config.ValueLoader[int]
@@ -1107,6 +1108,10 @@ func (jd *Handle) loadConfig() {
 	// so long-running stores don't block dsList writers. Flip to true to revert to holding the lock for the whole callback.
 	jd.conf.holdDSListLockDuringStore = jd.config.GetReloadableBoolVar(false, jd.configKeys("holdDSListLockDuringStore")...)
 
+	// when true, the per-state noResultsCache optimization is enabled: stateFilters are narrowed
+	// against the cache before querying, and (!ok && !limitsReached) is used as a commit predicate.
+	jd.conf.noResultsCacheStateOptimization = jd.config.GetReloadableBoolVar(false, jd.configKeys("noResultsCacheStateOptimization")...)
+
 	if jd.TriggerAddNewDS == nil {
 		jd.TriggerAddNewDS = func() <-chan time.Time {
 			return time.After(jd.conf.addNewDSLoopSleepDuration.Load())
@@ -2254,10 +2259,13 @@ func (jd *Handle) getJobsDS(ctx context.Context, ds dataSetT, lastDS bool, param
 		CustomValFilters: params.CustomValFilters,
 		WorkspaceID:      workspaceID,
 	}
+	stateFilterOptimization := jd.conf.noResultsCacheStateOptimization.Load()
 
-	stateFilters = lo.Filter(stateFilters, func(state string, _ int) bool { // exclude states for which we already know that there are no jobs
-		return !jd.noResultsCache.Get(ds.Index, partitionFilters, workspaceID, customValFilters, []string{state}, parameterFilters)
-	})
+	if stateFilterOptimization {
+		stateFilters = lo.Filter(stateFilters, func(state string, _ int) bool { // exclude states for which we already know that there are no jobs
+			return !jd.noResultsCache.Get(ds.Index, partitionFilters, workspaceID, customValFilters, []string{state}, parameterFilters)
+		})
+	}
 
 	defer jd.getTimerStat("jobsdb_get_jobs_ds_time", &tags).RecordDuration()()
 
@@ -2390,17 +2398,22 @@ func (jd *Handle) getJobsDS(ctx context.Context, ds dataSetT, lastDS bool, param
 	}
 	defer func() { _ = rows.Close() }()
 
-	var runningEventCount int
-	var runningPayloadSize int64
-
 	var jobList []*JobT
 	var limitsReached bool
 	var eventCount int
 	var payloadSize int64
+
+	// we don't need the payload_size but still need to scan it because it is part of the resultset
+	// The query uses it for limits checking. The variable is declared before the for loop to avoid extra allocations,
+	// but if we were to actually use it in the future for returning in the result, we would need to move its declaration
+	// inside the loop
+	var discardRowPayloadSize int64
+
 	resultsetStates := map[string]struct{}{}
 	for rows.Next() {
-		var payloadSize int64
 		var job JobT
+		var runningEventCount int
+		var runningPayloadSize int64
 		var payload []byte
 		var jsState sql.NullString
 		var jsAttemptNum sql.NullInt64
@@ -2410,7 +2423,7 @@ func (jd *Handle) getJobsDS(ctx context.Context, ds dataSetT, lastDS bool, param
 		var jsErrorResponse []byte
 		var jsParameters []byte
 		err := rows.Scan(&job.JobID, &job.UUID, &job.UserID, &job.Parameters, &job.CustomVal,
-			&payload, &job.EventCount, &job.CreatedAt, &job.ExpireAt, &job.WorkspaceId, &job.PartitionID, &payloadSize, &runningEventCount, &runningPayloadSize,
+			&payload, &job.EventCount, &job.CreatedAt, &job.ExpireAt, &job.WorkspaceId, &job.PartitionID, &discardRowPayloadSize, &runningEventCount, &runningPayloadSize,
 			&jsState, &jsAttemptNum,
 			&jsExecTime, &jsRetryTime,
 			&jsErrorCode, &jsErrorResponse, &jsParameters)
@@ -2466,7 +2479,9 @@ func (jd *Handle) getJobsDS(ctx context.Context, ds dataSetT, lastDS bool, param
 			// we are committing the cache Tx only if
 			// (a) no jobs are returned by the query or
 			// (b) the state is not present in the resultset and limits have not been reached
-			if _, ok := resultsetStates[state]; len(jobList) == 0 || (!ok && !limitsReached) {
+			//     (skipped when the noResultsCache state-filter optimization is disabled)
+			_, ok := resultsetStates[state]
+			if len(jobList) == 0 || (stateFilterOptimization && !ok && !limitsReached) {
 				if allEntriesCommitted := cacheTx.Commit(); !allEntriesCommitted {
 					tags := &statTags{
 						StateFilters:     []string{state},
diff --git a/jobsdb/jobsdb_test.go b/jobsdb/jobsdb_test.go
@@ -1639,6 +1639,115 @@ func TestPayloadSizeColumnQueries(t *testing.T) {
 	jobsDB.TearDown()
 }
 
+// TestGetJobsLimitsReached verifies the LimitsReached flag and the number of jobs
+// returned by GetToProcess for every combination of JobsLimit / EventsLimit /
+// PayloadSizeLimit, including exact-boundary cases and single-oversize cases.
+// It runs against both ASCII and UTF-8 multibyte payloads to validate that
+// PayloadSizeLimit is measured in bytes (octet_length) and not characters.
+func TestGetJobsLimitsReached(t *testing.T) {
+	pgContainer := startPostgres(t)
+
+	const (
+		payloadBytes  = 100
+		eventsPerJob  = 2
+		numJobs       = 10
+		totalEvents   = numJobs * eventsPerJob
+		totalPayload  = int64(numJobs * payloadBytes)
+		singlePayload = int64(payloadBytes)
+	)
+
+	// Each payload is exactly payloadBytes bytes long.
+	// "🙂" is U+1F642, 4 bytes in UTF-8 → 25 × 4 = 100 bytes.
+	payloadVariants := []struct {
+		name string
+		data []byte
+	}{
+		{"ascii", []byte(strings.Repeat("x", payloadBytes))},
+		{"utf8_multibyte", []byte(strings.Repeat("🙂", payloadBytes/4))},
+	}
+
+	for _, pv := range payloadVariants {
+		t.Run(pv.name, func(t *testing.T) {
+			require.Len(t, pv.data, payloadBytes, "payload variant must be exactly %d bytes", payloadBytes)
+
+			customVal := strings.ToUpper(rsRand.String(8))
+			workspaceID := "workspaceID"
+			tablePrefix := strings.ToLower(rsRand.String(5))
+
+			t.Setenv("RSERVER_JOBS_DB_PAYLOAD_COLUMN_TYPE", "text")
+			jobsDB := &Handle{dbHandle: pgContainer.DB}
+			require.NoError(t, jobsDB.Setup(ReadWrite, true, tablePrefix))
+			t.Cleanup(jobsDB.TearDown)
+
+			jobs := make([]*JobT, numJobs)
+			for i := range numJobs {
+				jobs[i] = &JobT{
+					Parameters:   []byte(`{}`),
+					EventPayload: pv.data,
+					UserID:       "u",
+					UUID:         uuid.New(),
+					CustomVal:    customVal,
+					EventCount:   eventsPerJob,
+					WorkspaceId:  workspaceID,
+				}
+			}
+			require.NoError(t, jobsDB.Store(context.Background(), jobs))
+
+			testCases := []struct {
+				name             string
+				jobsLimit        int
+				eventsLimit      int
+				payloadSizeLimit int64
+				expectedJobs     int
+				expectedLimits   bool
+			}{
+				// --- JobsLimit only ---
+				{"JobsLimit_under", 5, 0, 0, 5, true},
+				{"JobsLimit_exact_boundary", numJobs, 0, 0, numJobs, true},
+				{"JobsLimit_over", numJobs * 2, 0, 0, numJobs, false},
+
+				// --- EventsLimit ---
+				// running_events overflows after the 2nd job (running=6 > 4)
+				{"EventsLimit_under", 100, 4, 0, 2, true},
+				// running_events overflows after the 1st job (running=4 > 2)
+				{"EventsLimit_exact_one_job", 100, eventsPerJob, 0, 1, true},
+				// boundary: total events == limit, no in-loop overflow,
+				// post-loop check sets limitsReached because eventCount >= limit
+				{"EventsLimit_exact_total_boundary", 100, totalEvents, 0, numJobs, true},
+				{"EventsLimit_over", 100, totalEvents * 10, 0, numJobs, false},
+				// limit < any single job's events: must still return one oversize job
+				{"EventsLimit_smaller_than_single_job", 100, 1, 0, 1, true},
+
+				// --- PayloadSizeLimit ---
+				{"PayloadSizeLimit_under", 100, 0, singlePayload * 5, 5, true},
+				{"PayloadSizeLimit_exact_one_job", 100, 0, singlePayload, 1, true},
+				{"PayloadSizeLimit_exact_total_boundary", 100, 0, totalPayload, numJobs, true},
+				{"PayloadSizeLimit_over", 100, 0, totalPayload * 2, numJobs, false},
+				// limit < any single job's payload: must still return one oversize job
+				{"PayloadSizeLimit_smaller_than_single_job", 100, 0, singlePayload / 2, 1, true},
+
+				// --- Combined ---
+				{"Combined_events_binds_first", 100, 4, totalPayload, 2, true},
+				{"Combined_payload_binds_first", 100, 100, singlePayload * 3, 3, true},
+			}
+
+			for _, tc := range testCases {
+				t.Run(tc.name, func(t *testing.T) {
+					res, err := jobsDB.GetToProcess(context.Background(), GetQueryParams{
+						CustomValFilters: []string{customVal},
+						JobsLimit:        tc.jobsLimit,
+						EventsLimit:      tc.eventsLimit,
+						PayloadSizeLimit: tc.payloadSizeLimit,
+					}, nil)
+					require.NoError(t, err)
+					require.Len(t, res.Jobs, tc.expectedJobs, "unexpected number of jobs")
+					require.Equal(t, tc.expectedLimits, res.LimitsReached, "unexpected LimitsReached")
+				})
+			}
+		})
+	}
+}
+
 func TestUpdateJobStatus(t *testing.T) {
 	_ = startPostgres(t)
 	c := config.New()