@@ -11,10 +11,12 @@ import { groupmqLifecycleExceptionWrapper, setIntervalAsync } from "./utils.js";
 import { syncSearchContexts } from "./ee/syncSearchContexts.js";
 import { captureEvent } from "./posthog.js";
 import { PromClient } from "./promClient.js";
+import { GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS } from "./constants.js";
 
 const LOG_TAG = 'connection-manager';
 const logger = createLogger(LOG_TAG);
 const createJobLogger = (jobId: string) => createLogger(`${LOG_TAG}:job:${jobId}`);
+const QUEUE_NAME = 'connection-sync-queue';
 
 type JobPayload = {
     jobId: string,
@@ -30,19 +32,19 @@ type JobResult = {
 const JOB_TIMEOUT_MS = 1000 * 60 * 60 * 2; // 2 hour timeout
 
 export class ConnectionManager {
-    private worker: Worker;
+    private worker: Worker<JobPayload>;
     private queue: Queue<JobPayload>;
     private interval?: NodeJS.Timeout;
 
     constructor(
         private db: PrismaClient,
         private settings: Settings,
-        redis: Redis,
+        private redis: Redis,
         private promClient: PromClient,
     ) {
         this.queue = new Queue<JobPayload>({
             redis,
-            namespace: 'connection-sync-queue',
+            namespace: QUEUE_NAME,
             jobTimeoutMs: JOB_TIMEOUT_MS,
             maxAttempts: 3,
             logger: env.DEBUG_ENABLE_GROUPMQ_LOGGING === 'true',
@@ -62,6 +64,10 @@ export class ConnectionManager {
         this.worker.on('failed', this.onJobFailed.bind(this));
         this.worker.on('stalled', this.onJobStalled.bind(this));
         this.worker.on('error', this.onWorkerError.bind(this));
+        // graceful-timeout is triggered when a job is still processing after
+        // worker.close() is called and the timeout period has elapsed. In this case,
+        // we fail the job with no retry.
+        this.worker.on('graceful-timeout', this.onJobGracefulTimeout.bind(this));
     }
 
     public startScheduler() {
@@ -128,6 +134,7 @@ export class ConnectionManager {
         });
 
         for (const job of jobs) {
+            logger.info(`Scheduling job ${job.id} for connection ${job.connection.name} (id: ${job.connectionId})`);
             await this.queue.add({
                 groupId: `connection:${job.connectionId}`,
                 data: {
@@ -150,6 +157,22 @@ export class ConnectionManager {
         const logger = createJobLogger(jobId);
         logger.info(`Running connection sync job ${jobId} for connection ${connectionName} (id: ${job.data.connectionId}) (attempt ${job.attempts + 1} / ${job.maxAttempts})`);
 
+        const currentStatus = await this.db.connectionSyncJob.findUniqueOrThrow({
+            where: {
+                id: jobId,
+            },
+            select: {
+                status: true,
+            }
+        });
+
+        // Fail safe: if the job is not PENDING (first run) or IN_PROGRESS (retry), it indicates the job
+        // is in an invalid state and should be skipped.
+        if (currentStatus.status !== ConnectionSyncJobStatus.PENDING && currentStatus.status !== ConnectionSyncJobStatus.IN_PROGRESS) {
+            throw new Error(`Job ${jobId} is not in a valid state. Expected: ${ConnectionSyncJobStatus.PENDING} or ${ConnectionSyncJobStatus.IN_PROGRESS}. Actual: ${currentStatus.status}. Skipping.`);
+        }
+
+
         this.promClient.pendingConnectionSyncJobs.dec({ connection: connectionName });
         this.promClient.activeConnectionSyncJobs.inc({ connection: connectionName });
 
@@ -178,7 +201,7 @@ export class ConnectionManager {
         const result = await (async () => {
             switch (config.type) {
                 case 'github': {
-                    return await compileGithubConfig(config, job.data.connectionId, abortController);
+                    return await compileGithubConfig(config, job.data.connectionId, abortController.signal);
                 }
                 case 'gitlab': {
                     return await compileGitlabConfig(config, job.data.connectionId);
@@ -200,7 +223,7 @@ export class ConnectionManager {
                 }
             }
         })();
-
+
         let { repoData, warnings } = result;
 
         await this.db.connectionSyncJob.update({
@@ -383,6 +406,33 @@ export class ConnectionManager {
             });
         });
 
+    private onJobGracefulTimeout = async (job: Job<JobPayload>) =>
+        groupmqLifecycleExceptionWrapper('onJobGracefulTimeout', logger, async () => {
+            const logger = createJobLogger(job.id);
+
+            const { connection } = await this.db.connectionSyncJob.update({
+                where: { id: job.id },
+                data: {
+                    status: ConnectionSyncJobStatus.FAILED,
+                    completedAt: new Date(),
+                    errorMessage: 'Job timed out',
+                },
+                select: {
+                    connection: true,
+                }
+            });
+
+            this.promClient.activeConnectionSyncJobs.dec({ connection: connection.name });
+            this.promClient.connectionSyncJobFailTotal.inc({ connection: connection.name });
+
+            logger.error(`Job ${job.id} timed out for connection ${connection.name} (id: ${connection.id})`);
+
+            captureEvent('backend_connection_sync_job_failed', {
+                connectionId: connection.id,
+                error: 'Job timed out',
+            });
+        });
+
     private async onWorkerError(error: Error) {
         Sentry.captureException(error);
         logger.error(`Connection syncer worker error.`, error);
@@ -392,8 +442,28 @@ export class ConnectionManager {
         if (this.interval) {
             clearInterval(this.interval);
         }
-        await this.worker.close();
-        await this.queue.close();
+
+        const inProgressJobs = this.worker.getCurrentJobs();
+        await this.worker.close(GROUPMQ_WORKER_STOP_GRACEFUL_TIMEOUT_MS);
+
+        // Manually release group locks for in progress jobs to prevent deadlocks.
+        // @see: https://github.com/Openpanel-dev/groupmq/issues/8
+        for (const { job } of inProgressJobs) {
+            const lockKey = `groupmq:${QUEUE_NAME}:lock:${job.groupId}`;
+            logger.debug(`Releasing group lock ${lockKey} for in progress job ${job.id}`);
+            try {
+                await this.redis.del(lockKey);
+            } catch (error) {
+                Sentry.captureException(error);
+                logger.error(`Failed to release group lock ${lockKey} for in progress job ${job.id}. Error: `, error);
+            }
+        }
+
+        // @note: As of groupmq v1.0.0, queue.close() just closes the underlying
+        // redis connection. Since that redis client is shared, skip this step and
+        // close the redis client directly in index.ts.
+        // @see: https://github.com/Openpanel-dev/groupmq/blob/main/src/queue.ts#L1900
+        // await this.queue.close();
     }
 }
 
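The shutdown hunk above works around groupmq keeping its per-group lock keys when a worker is closed while jobs are still running (see the linked issue): deleting `groupmq:<namespace>:lock:<groupId>` lets a restarted worker pick those groups up again. Below is a minimal standalone sketch of that cleanup, assuming ioredis and the key layout shown in the diff; the `releaseGroupLocks` helper name and the example group id are illustrative, not part of this PR.

```ts
import Redis from "ioredis";

// Delete groupmq's per-group lock keys so a restarted worker is not blocked
// by locks left behind by jobs that were cut off mid-run.
// Key layout mirrors the diff above: groupmq:<namespace>:lock:<groupId>.
async function releaseGroupLocks(redis: Redis, namespace: string, groupIds: string[]): Promise<void> {
    for (const groupId of groupIds) {
        const lockKey = `groupmq:${namespace}:lock:${groupId}`;
        try {
            await redis.del(lockKey);
        } catch (error) {
            console.error(`Failed to release group lock ${lockKey}`, error);
        }
    }
}

// Example: release the lock for a single connection group after worker.close().
// await releaseGroupLocks(redis, "connection-sync-queue", ["connection:42"]);
```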