4 | 4 | DatabaseConnectionError, |
5 | 5 | logger as defaultLogger, |
6 | 6 | ErrorCode, |
7 | | - errors, |
8 | 7 | Logger, |
9 | 8 | ReplicationAbortedError, |
10 | 9 | ReplicationAssertionError |
@@ -100,8 +99,10 @@ export const sendKeepAlive = async (db: pgwire.PgClient) => { |
100 | 99 | }; |
101 | 100 |
102 | 101 | export class MissingReplicationSlotError extends Error { |
103 | | - constructor(message: string) { |
| 102 | + constructor(message: string, cause?: any) { |
104 | 103 | super(message); |
| 104 | + |
| 105 | + this.cause = cause; |
105 | 106 | } |
106 | 107 | } |
107 | 108 |
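
The new `cause` parameter follows the standard `Error.cause` chaining pattern, so the original error stays attached when it is wrapped in `MissingReplicationSlotError` (see `streamChanges` further down). A minimal sketch of the pattern; the `underlying` error here is a made-up sample, not a real pgwire error:

```ts
// Minimal sketch of Error.cause chaining as used by MissingReplicationSlotError.
class MissingReplicationSlotError extends Error {
  constructor(message: string, cause?: any) {
    super(message);
    this.cause = cause;
  }
}

// Hypothetical underlying failure, e.g. surfaced by the streaming connection:
const underlying = new Error('replication slot "powersync_1" does not exist');
const wrapped = new MissingReplicationSlotError(underlying.message, underlying);

console.log(wrapped.message);                  // same message as the original
console.log((wrapped.cause as Error).message); // original error kept for reporting
```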
@@ -304,135 +305,54 @@ export class WalStream { |
304 | 305 | }) |
305 | 306 | )[0]; |
306 | 307 |
| 308 | + // Previously we also used pg_catalog.pg_logical_slot_peek_binary_changes to confirm that we can query the slot. |
| 309 | + // However, in some edge cases the query timed out and was retried repeatedly, ultimately
| 310 | + // causing high load on the source database without ever recovering automatically.
| 311 | + // We now jump straight to replication if the wal_status is not "lost", and instead detect those
| 312 | + // errors during streaming replication, which is a little more robust. |
| 313 | + |
| 314 | + // We can have: |
| 315 | + // 1. needsInitialSync: true, lost slot -> MissingReplicationSlotError (starts new sync rules version). |
| 316 | + // Theoretically we could handle this the same as (2). |
| 317 | + // 2. needsInitialSync: true, no slot -> create new slot |
| 318 | + // 3. needsInitialSync: true, valid slot -> resume initial sync |
| 319 | + // 4. needsInitialSync: false, lost slot -> MissingReplicationSlotError (starts new sync rules version) |
| 320 | + // 5. needsInitialSync: false, no slot -> MissingReplicationSlotError (starts new sync rules version) |
| 321 | + // 6. needsInitialSync: false, valid slot -> resume streaming replication |
| 322 | + // The main advantages of MissingReplicationSlotError are:
| 323 | + // 1. If there was a complete snapshot already (cases 4/5), users can still sync from that snapshot while |
| 324 | + // we do the reprocessing under a new slot name. |
| 325 | + // 2. If there was a partial snapshot (case 1), we can start with the new slot faster by not waiting for |
| 326 | + // the partial data to be cleared. |
307 | 327 | if (slot != null) { |
308 | 328 | // This checks that the slot is still valid |
309 | | - const r = await this.checkReplicationSlot(slot as any); |
310 | | - if (snapshotDone && r.needsNewSlot) { |
311 | | - // We keep the current snapshot, and create a new replication slot |
312 | | - throw new MissingReplicationSlotError(`Replication slot ${slotName} is not valid anymore`); |
| 329 | + |
| 330 | + // wal_status is present in postgres 13+ |
| 331 | + // invalidation_reason is present in postgres 17+ |
| 332 | + const lost = slot.wal_status == 'lost'; |
| 333 | + if (lost) { |
| 334 | + // Case 1 / 4 |
| 335 | + throw new MissingReplicationSlotError( |
| 336 | + `Replication slot ${slotName} is not valid anymore. invalidation_reason: ${slot.invalidation_reason ?? 'unknown'}` |
| 337 | + ); |
313 | 338 | } |
314 | | - // We can have: |
315 | | - // needsInitialSync: true, needsNewSlot: true -> initial sync from scratch |
316 | | - // needsInitialSync: true, needsNewSlot: false -> resume initial sync |
317 | | - // needsInitialSync: false, needsNewSlot: true -> handled above |
318 | | - // needsInitialSync: false, needsNewSlot: false -> resume streaming replication |
| 339 | + // Case 3 / 6 |
319 | 340 | return { |
320 | 341 | needsInitialSync: !snapshotDone, |
321 | | - needsNewSlot: r.needsNewSlot |
| 342 | + needsNewSlot: false |
322 | 343 | }; |
323 | 344 | } else { |
324 | 345 | if (snapshotDone) { |
| 346 | + // Case 5 |
325 | 347 | // This will create a new slot, while keeping the current sync rules active |
326 | 348 | throw new MissingReplicationSlotError(`Replication slot ${slotName} is missing`); |
327 | 349 | } |
328 | | - // This will clear data and re-create the same slot |
| 350 | + // Case 2 |
| 351 | + // This will clear data (if any) and re-create the same slot |
329 | 352 | return { needsInitialSync: true, needsNewSlot: true }; |
330 | 353 | } |
331 | 354 | } |
332 | 355 |
333 | | - /** |
334 | | - * If a replication slot exists, check that it is healthy. |
335 | | - */ |
336 | | - private async checkReplicationSlot(slot: { |
337 | | - // postgres 13+ |
338 | | - wal_status?: string; |
339 | | - // postgres 17+ |
340 | | - invalidation_reason?: string | null; |
341 | | - }): Promise<{ needsNewSlot: boolean }> { |
342 | | - // Start with a placeholder error, should be replaced if there is an actual issue. |
343 | | - let last_error = new ReplicationAssertionError(`Slot health check failed to execute`); |
344 | | - |
345 | | - const slotName = this.slot_name; |
346 | | - |
347 | | - const lost = slot.wal_status == 'lost'; |
348 | | - if (lost) { |
349 | | - this.logger.warn( |
350 | | - `Replication slot ${slotName} is invalidated. invalidation_reason: ${slot.invalidation_reason ?? 'unknown'}` |
351 | | - ); |
352 | | - return { |
353 | | - needsNewSlot: true |
354 | | - }; |
355 | | - } |
356 | | - |
357 | | - // Check that replication slot exists, trying for up to 2 minutes. |
358 | | - const startAt = performance.now(); |
359 | | - while (performance.now() - startAt < 120_000) { |
360 | | - this.touch(); |
361 | | - |
362 | | - try { |
363 | | - // We peek a large number of changes here, to make it more likely to pick up replication slot errors. |
364 | | - // For example, "publication does not exist" only occurs here if the peek actually includes changes related |
365 | | - // to the slot. |
366 | | - this.logger.info(`Checking ${slotName}`); |
367 | | - |
368 | | - // The actual results can be quite large, so we don't actually return everything |
369 | | - // due to memory and processing overhead that would create. |
370 | | - const cursor = await this.connections.pool.stream({ |
371 | | - statement: `SELECT 1 FROM pg_catalog.pg_logical_slot_peek_binary_changes($1, NULL, 1000, 'proto_version', '1', 'publication_names', $2)`, |
372 | | - params: [ |
373 | | - { type: 'varchar', value: slotName }, |
374 | | - { type: 'varchar', value: PUBLICATION_NAME } |
375 | | - ] |
376 | | - }); |
377 | | - |
378 | | - for await (let _chunk of cursor) { |
379 | | - // No-op, just exhaust the cursor |
380 | | - } |
381 | | - |
382 | | - // Success |
383 | | - this.logger.info(`Slot ${slotName} appears healthy`); |
384 | | - return { needsNewSlot: false }; |
385 | | - } catch (e) { |
386 | | - last_error = e; |
387 | | - this.logger.warn(`Replication slot error`, e); |
388 | | - |
389 | | - if (this.stopped) { |
390 | | - throw e; |
391 | | - } |
392 | | - |
393 | | - if ( |
394 | | - /incorrect prev-link/.test(e.message) || |
395 | | - /replication slot.*does not exist/.test(e.message) || |
396 | | - /publication.*does not exist/.test(e.message) || |
397 | | - // Postgres 18 - exceeded max_slot_wal_keep_size |
398 | | - /can no longer access replication slot/.test(e.message) || |
399 | | - // Postgres 17 - exceeded max_slot_wal_keep_size |
400 | | - /can no longer get changes from replication slot/.test(e.message) |
401 | | - ) { |
402 | | - // Fatal error. In most cases since Postgres 13+, the `wal_status == 'lost'` check should pick this up, but this |
403 | | - // works as a fallback. |
404 | | - |
405 | | - container.reporter.captureException(e, { |
406 | | - level: errors.ErrorSeverity.WARNING, |
407 | | - metadata: { |
408 | | - replication_slot: slotName |
409 | | - } |
410 | | - }); |
411 | | - // Sample: record with incorrect prev-link 10000/10000 at 0/18AB778 |
412 | | - // Seen during development. Some internal error, fixed by re-creating slot. |
413 | | - // |
414 | | - // Sample: publication "powersync" does not exist |
415 | | - // Happens when publication deleted or never created. |
416 | | - // Slot must be re-created in this case. |
417 | | - this.logger.info(`${slotName} is not valid anymore`); |
418 | | - |
419 | | - return { needsNewSlot: true }; |
420 | | - } |
421 | | - // Try again after a pause |
422 | | - await new Promise((resolve) => setTimeout(resolve, 1000)); |
423 | | - } |
424 | | - } |
425 | | - |
426 | | - container.reporter.captureException(last_error, { |
427 | | - level: errors.ErrorSeverity.ERROR, |
428 | | - metadata: { |
429 | | - replication_slot: slotName |
430 | | - } |
431 | | - }); |
432 | | - |
433 | | - throw last_error; |
434 | | - } |
435 | | - |
436 | 356 | async estimatedCountNumber(db: pgwire.PgConnection, table: storage.SourceTable): Promise<number> { |
437 | 357 | const results = await db.query({ |
438 | 358 | statement: `SELECT reltuples::bigint AS estimate |
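
For context on the hunk above: `wal_status` and `invalidation_reason` are columns of `pg_catalog.pg_replication_slots`. A minimal sketch of the catalog lookup that produces the `slot` row checked above, written against node-postgres for illustration (the codebase itself queries via pgwire, and the `client` wiring here is assumed):

```ts
import { Client } from 'pg'; // illustration only; the project uses pgwire

// wal_status is available in Postgres 13+. On 17+ you could additionally
// select invalidation_reason for a more specific error message.
async function slotIsLost(client: Client, slotName: string): Promise<boolean> {
  const { rows } = await client.query(
    `SELECT wal_status
       FROM pg_catalog.pg_replication_slots
      WHERE slot_name = $1`,
    [slotName]
  );
  if (rows.length === 0) {
    // No slot at all: cases 2/5 in the comment above.
    return false;
  }
  // 'lost' means required WAL has been removed (e.g. max_slot_wal_keep_size
  // was exceeded), so the slot can never be resumed: cases 1/4 above.
  return rows[0].wal_status === 'lost';
}
```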
@@ -915,6 +835,17 @@ WHERE oid = $1::regclass`, |
915 | 835 | } |
916 | 836 |
917 | 837 | async streamChanges(replicationConnection: pgwire.PgConnection) { |
| 838 | + try { |
| 839 | + await this.streamChangesInternal(replicationConnection); |
| 840 | + } catch (e) { |
| 841 | + if (isReplicationSlotInvalidError(e)) { |
| 842 | + throw new MissingReplicationSlotError(e.message, e); |
| 843 | + } |
| 844 | + throw e; |
| 845 | + } |
| 846 | + } |
| 847 | + |
| 848 | + private async streamChangesInternal(replicationConnection: pgwire.PgConnection) { |
918 | 849 | // When changing any logic here, check /docs/wal-lsns.md. |
919 | 850 | const { createEmptyCheckpoints } = await this.ensureStorageCompatibility(); |
920 | 851 |
920 | 851 |
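
With the separate health probe removed, `MissingReplicationSlotError` becomes the single signal that replication must restart under a new slot, whether it is raised up front (cases 1/4/5) or surfaced mid-stream by the wrapper above. A hedged sketch of what a supervising caller could look like; `replicateOnce` and `recreateSlotAndResnapshot` are hypothetical names, not functions in this codebase:

```ts
// Hypothetical supervision loop, not the actual caller in this project.
declare function replicateOnce(): Promise<void>; // assumed to run WalStream.streamChanges
declare function recreateSlotAndResnapshot(): Promise<void>;

async function superviseReplication(): Promise<void> {
  while (true) {
    try {
      await replicateOnce();
      return;
    } catch (e) {
      if (e instanceof MissingReplicationSlotError) {
        // Slot is gone or invalidated. Previously synced data stays available
        // while we reprocess under a new sync rules version / slot name.
        await recreateSlotAndResnapshot();
        continue;
      }
      throw e; // anything else propagates unchanged
    }
  }
}
```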
@@ -1179,3 +1110,27 @@ WHERE oid = $1::regclass`, |
1179 | 1110 | }); |
1180 | 1111 | } |
1181 | 1112 | } |
| 1113 | + |
| 1114 | +function isReplicationSlotInvalidError(e: any) { |
| 1115 | + // We could access the error code from pgwire using this: |
| 1116 | + // e[Symbol.for('pg.ErrorCode')] |
| 1117 | + // However, we typically get a generic code such as 42704 (undefined_object), which does not |
| 1118 | + // help much. So we check the actual error message. |
| 1119 | + const message = e.message ?? ''; |
| 1120 | + |
| 1121 | + // Sample: record with incorrect prev-link 10000/10000 at 0/18AB778 |
| 1122 | + // Seen during development. Some internal error, fixed by re-creating slot. |
| 1123 | + // |
| 1124 | + // Sample: publication "powersync" does not exist |
| 1125 | + // Happens when publication deleted or never created. |
| 1126 | + // Slot must be re-created in this case. |
| 1127 | + return ( |
| 1128 | + /incorrect prev-link/.test(message) || |
| 1129 | + /replication slot.*does not exist/.test(message) || |
| 1130 | + /publication.*does not exist/.test(message) || |
| 1131 | + // Postgres 18 - exceeded max_slot_wal_keep_size |
| 1132 | + /can no longer access replication slot/.test(message) || |
| 1133 | + // Postgres 17 - exceeded max_slot_wal_keep_size |
| 1134 | + /can no longer get changes from replication slot/.test(message) |
| 1135 | + ); |
| 1136 | +} |
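
Since the classification matches on message text rather than SQLSTATE, a quick sanity check against sample messages makes the intent concrete. The strings below are illustrative samples (exact server wording varies by Postgres version); the transient ones deliberately fall through:

```ts
// Messages that should classify as a permanently invalid slot:
const invalid = [
  'record with incorrect prev-link 10000/10000 at 0/18AB778',
  'replication slot "powersync_1" does not exist',
  'publication "powersync" does not exist',
  'can no longer get changes from replication slot "powersync_1"', // Postgres 17
  'can no longer access replication slot "powersync_1"' // Postgres 18
];
// Messages that should NOT match, so streaming can simply retry:
const transient = ['connection refused', 'canceling statement due to statement timeout'];

for (const message of invalid) {
  console.assert(isReplicationSlotInvalidError(new Error(message)), message);
}
for (const message of transient) {
  console.assert(!isReplicationSlotInvalidError(new Error(message)), message);
}
```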