-
Notifications
You must be signed in to change notification settings - Fork 1.7k
TS: integration with state-based replication #10138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -110,9 +110,7 @@ func (t *timerQueueStandbyTaskExecutor) Execute( | |
| case *tasks.ChasmTask: | ||
| err = t.executeChasmSideEffectTimerTask(ctx, task) | ||
| case *tasks.TimeSkippingTimerTask: | ||
| // todo@time-skipping: replication. The disable-after-fast-forward transition is emitted | ||
| // on the active side and will replicate; standby drops the local task. | ||
| err = nil | ||
| err = t.executeTimeSkippingTimerTask(ctx, task) | ||
| default: | ||
| err = queueserrors.NewUnprocessableTaskError("unknown task type") | ||
| } | ||
|
|
@@ -231,17 +229,53 @@ func (t *timerQueueStandbyTaskExecutor) discardChasmTask( | |
| ) | ||
| } | ||
|
|
||
|
feiyang3cat marked this conversation as resolved.
|
||
| // executeTimeSkippingTimerTask waits on the standby until the active cluster | ||
| // replicates the fast-forward transition. If the fast-forward this task was | ||
| // generated for is still pending (same source event and not yet reached), the | ||
| // task is retried until the discard delay elapses; otherwise it is acked. | ||
| func (t *timerQueueStandbyTaskExecutor) executeTimeSkippingTimerTask( | ||
| ctx context.Context, | ||
| timerTask *tasks.TimeSkippingTimerTask, | ||
| ) error { | ||
| actionFn := func(_ context.Context, wfContext historyi.WorkflowContext, mutableState historyi.MutableState, _ historyi.ReleaseWorkflowContextFunc) (any, error) { | ||
| if !mutableState.IsWorkflowExecutionRunning() { | ||
| return nil, nil | ||
| } | ||
| tsi := mutableState.GetExecutionInfo().GetTimeSkippingInfo() | ||
| ffi := tsi.GetFastForwardInfo() | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question for CGS team: Not sure if in complicated failover cases (A failover to a slow B and then A is up again as stand-by) there may be legacy timer tasks that have larger event IDs than the one of the current mutable state but even this happens already added time skipping timer task logic to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can the source event ID in the skipping info > timer task event ID? Does it mean the timer task is invalid?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| // the fast-forward this timer task is associated with is still valid and has not been reached so keep waiting | ||
| if ffi != nil && ffi.GetSourceEventId() == timerTask.EventID && !ffi.GetHasReached() { | ||
| return &struct{}{}, nil | ||
| } | ||
| return nil, nil | ||
| } | ||
|
|
||
| return t.processTimer( | ||
| ctx, | ||
| timerTask, | ||
| actionFn, | ||
| getStandbyPostActionFn( | ||
| timerTask, | ||
| t.getCurrentTime, | ||
| t.config.StandbyTaskMissingEventsDiscardDelay(timerTask.GetType()), | ||
| t.checkExecutionStillExistsOnSourceBeforeDiscard, | ||
| ), | ||
| ) | ||
| } | ||
|
|
||
| func (t *timerQueueStandbyTaskExecutor) executeUserTimerTimeoutTask( | ||
| ctx context.Context, | ||
| timerTask *tasks.UserTimerTask, | ||
| ) error { | ||
| referenceTime := t.Now() | ||
| actionFn := func(_ context.Context, wfContext historyi.WorkflowContext, mutableState historyi.MutableState, _ historyi.ReleaseWorkflowContextFunc) (any, error) { | ||
| if !mutableState.IsWorkflowExecutionRunning() { | ||
| // workflow already finished, no need to process the timer | ||
| return nil, nil | ||
| } | ||
|
|
||
| referenceTime := mutableState.Now() | ||
|
|
||
| timerSequence := t.getTimerSequence(mutableState) | ||
| timerSequenceIDs := timerSequence.LoadAndSortUserTimers() | ||
| if len(timerSequenceIDs) > 0 { | ||
|
|
@@ -253,6 +287,10 @@ func (t *timerQueueStandbyTaskExecutor) executeUserTimerTimeoutTask( | |
| return nil, serviceerror.NewInternal(errString) | ||
| } | ||
|
|
||
| // Use mutableState.Now() as reference time as a mutable state may use virtual time | ||
| // which can skip duration and be before the wall clock time. | ||
| // And when this happens the timerSequenceID.Timestamp is also virtual time and before the wall clock time, | ||
| // while the timerTask.VisibilityTimestamp uses the wall clock time that maps to the virtual time. | ||
| if queues.IsTimeExpired( | ||
| timerTask, | ||
| referenceTime, | ||
|
|
@@ -295,13 +333,14 @@ func (t *timerQueueStandbyTaskExecutor) executeActivityTimeoutTask( | |
| // | ||
| // the overall solution is to attempt to generate a new activity timer task whenever the | ||
| // task passed in is safe to be throw away. | ||
| referenceTime := t.Now() | ||
| actionFn := func(ctx context.Context, wfContext historyi.WorkflowContext, mutableState historyi.MutableState, _ historyi.ReleaseWorkflowContextFunc) (any, error) { | ||
| if !mutableState.IsWorkflowExecutionRunning() { | ||
| // workflow already finished, no need to process the timer | ||
| return nil, nil | ||
| } | ||
|
|
||
| referenceTime := mutableState.Now() | ||
|
|
||
| timerSequence := t.getTimerSequence(mutableState) | ||
| updateMutableState := false | ||
| timerSequenceIDs := timerSequence.LoadAndSortActivityTimers() | ||
|
|
@@ -314,6 +353,10 @@ func (t *timerQueueStandbyTaskExecutor) executeActivityTimeoutTask( | |
| return nil, serviceerror.NewInternal(errString) | ||
| } | ||
|
|
||
| // Use mutableState.Now() as reference time as a mutable state may use virtual time | ||
| // which can skip duration and be before the wall clock time. | ||
| // And when this happens the timerSequenceID.Timestamp is also virtual time and before the wall clock time, | ||
| // while the timerTask.VisibilityTimestamp uses the wall clock time that maps to the virtual time. | ||
| if queues.IsTimeExpired( | ||
| timerTask, | ||
| referenceTime, | ||
|
|
@@ -336,6 +379,7 @@ func (t *timerQueueStandbyTaskExecutor) executeActivityTimeoutTask( | |
| // created. | ||
| isHeartBeatTask := timerTask.TimeoutType == enumspb.TIMEOUT_TYPE_HEARTBEAT | ||
| ai, heartbeatTimeoutVis, ok := mutableState.GetActivityInfoWithTimerHeartbeat(timerTask.EventID) | ||
|
|
||
| if isHeartBeatTask && ok && queues.IsTimeExpired(timerTask, timerTask.GetVisibilityTime(), mutableState.ToRealTime(heartbeatTimeoutVis)) { | ||
| if err := mutableState.UpdateActivityTaskStatusWithTimerHeartbeat(ai.ScheduledEventId, ai.TimerTaskStatus&^workflow.TimerTaskStatusCreatedHeartbeat, nil); err != nil { | ||
| return nil, err | ||
|
|
@@ -774,6 +818,12 @@ func (t *timerQueueStandbyTaskExecutor) pushActivity( | |
| ) | ||
| } | ||
|
|
||
| // getCurrentTime returns the shard's wall-clock view of "now" for t.clusterName. | ||
| // Must stay wall-clock: it gates standby task-retry timing against VisibilityTime | ||
| // (also wall-clock); mutableState.Now() is virtual time and would force-discard | ||
| // time-skipping workflows. actionFn closures compare against virtual timestamps, | ||
| // so they use mutableState.Now() instead. | ||
| // | ||
| // TODO: deprecate this function and always use t.Now() | ||
| // Only test code sets t.clusterName to be non-current cluster name | ||
| // and advance the time by setting calling shardContext.SetCurrentTime. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.