Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 146 additions & 11 deletions src/spock_apply.c
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,106 @@ handle_relation(StringInfo s)
(void) spock_read_rel(s);
}

/* Return the GUC name of the active exception behaviour mode. */
static const char *
get_exception_behaviour_name(void)
{
switch (exception_behaviour)
{
case DISCARD:
return "discard";
case TRANSDISCARD:
return "transdiscard";
case SUB_DISABLE:
return "sub_disable";
default:
return "unknown";
}
}

/*
* Format an error message with its SQLSTATE prefix so the root cause recorded
* in spock.exception_log (and the apply log) is unambiguous. exception_log has
* no dedicated sqlstate column, so we carry it inline in the message text.
*
* The result is allocated in ApplyOperationContext (reset per row/message)
* rather than the long-lived TopTransactionContext that is current on the
* exception path, so repeated logging within one large transaction does not
* accumulate. Callers copy it into the durable tuple immediately.
*/
static char *
errmsg_with_sqlstate(ErrorData *edata)
{
MemoryContext oldctx = MemoryContextSwitchTo(ApplyOperationContext);
char *msg;
int sqlerrcode = edata ? edata->sqlerrcode : ERRCODE_INTERNAL_ERROR;
const char *detail = (edata && edata->message) ? edata->message
: "(no error detail captured)";

/*
* Prefix the SQLSTATE only when it carries information. A bare
* elog(ERROR) defaults to ERRCODE_INTERNAL_ERROR (XX000), which is just
* noise (e.g. spock's own "did not find row to update"). Real errors --
* constraint violations, deadlocks, connection failures -- keep their
* code, which is what makes the cause unambiguous.
*/
if (sqlerrcode == ERRCODE_INTERNAL_ERROR)
msg = pstrdup(detail);
else
msg = psprintf("[SQLSTATE %s] %s", unpack_sql_state(sqlerrcode), detail);

MemoryContextSwitchTo(oldctx);
return msg;
}

/*
* Build the error_message for an entry discarded as collateral -- i.e. it was
* not itself the failing command (the caller passed a NULL message). 'what'
* names the discarded entry for the message: "tuple" on the DML path,
* "statement" on the SQL/DDL path (where no tuple is involved).
*
* Normally we just point the operator at the entry that carries the real error
* via its command_counter, which avoids repeating the same message on every
* discarded entry. But when the failure is not attributable to a specific row,
* failed_action is 0 and that pointer dangles (no entry's counter is 0), so we
* surface the captured root cause instead -- or note that none was captured.
*/
static char *
discard_collateral_message(const char *what)
{
SpockExceptionLog *e;
MemoryContext oldctx;
char *msg;

/*
* Match the paranoia level of the surrounding code: every caller reaches
* this only after exception logging has been set up for this worker.
*/
Assert(exception_log_ptr != NULL && my_exception_log_index >= 0);

e = &exception_log_ptr[my_exception_log_index];

/*
* Allocate in ApplyOperationContext (reset per row/message) so logging
* every discarded row of a large transaction does not pile up in the
* long-lived TopTransactionContext current on the exception path.
*/
oldctx = MemoryContextSwitchTo(ApplyOperationContext);

if (e->failed_action != 0)
msg = psprintf("%s: %s discarded due to exception at command_counter %u",
get_exception_behaviour_name(), what, e->failed_action);
else if (e->initial_error_message[0] != '\0')
msg = psprintf("%s: %s discarded due to exception: %s",
get_exception_behaviour_name(), what, e->initial_error_message);
else
msg = psprintf("%s: %s discarded due to exception (root cause not captured)",
get_exception_behaviour_name(), what);

MemoryContextSwitchTo(oldctx);
return msg;
}

static void
log_insert_exception(bool failed, char *errmsg, SpockRelation *rel,
SpockTupleData *oldtup, SpockTupleData *newtup,
Expand All @@ -1433,6 +1533,14 @@ log_insert_exception(bool failed, char *errmsg, SpockRelation *rel,
if (!should_log_exception(failed))
return;

/*
* A NULL message means this entry is collateral damage -- some other
* command in the same transaction failed, not this one. Build an
* informative message rather than recording the opaque "unavailable".
*/
if (errmsg == NULL)
errmsg = discard_collateral_message("tuple");

/*
* Run the exception-log work in ApplyOperationContext so its JSON and
* tuple allocations get released by the per-message reset instead of
Expand Down Expand Up @@ -1463,7 +1571,7 @@ log_insert_exception(bool failed, char *errmsg, SpockRelation *rel,
rel, localtup, oldtup, newtup,
NULL, NULL,
action_name,
(failed) ? errmsg : NULL);
errmsg);

MemoryContextSwitchTo(oldctx);
}
Expand Down Expand Up @@ -1597,7 +1705,7 @@ handle_insert(StringInfo s)

/* Let's create an exception log entry if true. */
{
char *err_msg = failed ? (edata ? edata->message : NULL) :
char *err_msg = failed ? errmsg_with_sqlstate(edata) :
(xact_action_counter ==
exception_log_ptr[my_exception_log_index].failed_action &&
exception_log_ptr[my_exception_log_index].initial_error_message[0] != '\0') ?
Expand Down Expand Up @@ -1764,7 +1872,7 @@ handle_update(StringInfo s)

/* Let's create an exception log entry if true. */
{
char *err_msg = failed ? (edata ? edata->message : NULL) :
char *err_msg = failed ? errmsg_with_sqlstate(edata) :
(xact_action_counter ==
exception_log_ptr[my_exception_log_index].failed_action &&
exception_log_ptr[my_exception_log_index].initial_error_message[0] != '\0') ?
Expand Down Expand Up @@ -1869,7 +1977,7 @@ handle_delete(StringInfo s)

/* Let's create an exception log entry if true. */
{
char *err_msg = failed ? (edata ? edata->message : NULL) :
char *err_msg = failed ? errmsg_with_sqlstate(edata) :
(xact_action_counter ==
exception_log_ptr[my_exception_log_index].failed_action &&
exception_log_ptr[my_exception_log_index].initial_error_message[0] != '\0') ?
Expand Down Expand Up @@ -2521,12 +2629,20 @@ handle_sql_or_exception(QueuedMessage *queued_message, bool tx_just_started)
if (should_log_exception(failed))
{
MemoryContext oldctx;
char *err_msg = failed ? edata->message :
char *err_msg = failed ? errmsg_with_sqlstate(edata) :
(xact_action_counter ==
exception_log_ptr[my_exception_log_index].failed_action &&
exception_log_ptr[my_exception_log_index].initial_error_message[0] != '\0') ?
exception_log_ptr[my_exception_log_index].initial_error_message : NULL;

/*
* A NULL message means this statement is collateral damage -- some
* other command in the same transaction failed. Build an
* informative message rather than recording "unavailable".
*/
if (err_msg == NULL)
err_msg = discard_collateral_message("statement");

/* See note in log_insert_exception about the context switch. */
oldctx = MemoryContextSwitchTo(ApplyOperationContext);

Expand Down Expand Up @@ -3540,20 +3656,39 @@ apply_work(PGconn *streamConn)
AbortOutOfAnyTransaction();

MemoryContextSwitchTo(MessageContext);
elog(LOG, "SPOCK: caught initial exception - %s", edata->message);
elog(LOG, "SPOCK: caught initial exception - %s",
errmsg_with_sqlstate(edata));

/*
* Save the initial error message and which action triggered it.
* On the retry pass, the matching row gets this message in
* exception_log; all other rows get NULL ("unavailable").
* The message carries its SQLSTATE (via errmsg_with_sqlstate, which
* omits the uninformative XX000) so that whichever entry surfaces it
* later (the matching row, or every collateral row when the failure is
* not attributable) carries an unambiguous root cause. On the retry
* pass, the matching row gets this message in exception_log; other
* rows get an informative collateral message built by
* discard_collateral_message().
*/
if (exception_log_ptr != NULL)
{
snprintf(exception_log_ptr[my_exception_log_index].initial_error_message,
sizeof(exception_log_ptr[my_exception_log_index].initial_error_message),
"%s", edata->message);
exception_log_ptr[my_exception_log_index].failed_action =
xact_action_counter;
"%s", errmsg_with_sqlstate(edata));

/*
* A failure during COMMIT (e.g. a deferred constraint trigger that
* fires at commit) is not attributable to any replayed row:
* handle_commit has already bumped the action counter, so no row's
* command_counter would match and the pointer would dangle. Treat
* it as non-attributable (failed_action = 0) so the replay surfaces
* the captured root cause instead of a dangling command_counter.
*/
if (errcallback_arg.action_name != NULL &&
strcmp(errcallback_arg.action_name, "COMMIT") == 0)
exception_log_ptr[my_exception_log_index].failed_action = 0;
else
exception_log_ptr[my_exception_log_index].failed_action =
xact_action_counter;
}

FlushErrorState();
Expand Down
13 changes: 9 additions & 4 deletions src/spock_exception_handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,15 @@ add_entry_to_exception_log(Oid remote_origin, TimestampTz remote_commit_ts,
values[Anum_exception_log_ddl_user - 1] = CStringGetTextDatum(ddl_user);
}

if (error_message == NULL)
values[Anum_exception_log_error_message - 1] = CStringGetTextDatum("unavailable");
else
values[Anum_exception_log_error_message - 1] = CStringGetTextDatum(error_message);
/*
* All callers now build an informative message (the failing command's
* error, or a collateral-discard message) before reaching here. Keep a
* defensive fallback so a future caller passing NULL degrades to a
* placeholder rather than crashing.
*/
Assert(error_message != NULL);
values[Anum_exception_log_error_message - 1] =
Comment on lines +195 to +196

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Assert() here is not needed and should be removed.

CStringGetTextDatum(error_message != NULL ? error_message : "unavailable");
values[Anum_exception_log_retry_errored_at - 1] = TimestampTzGetDatum(GetCurrentTimestamp());

tup = heap_form_tuple(tupDesc, values, nulls);
Expand Down
14 changes: 7 additions & 7 deletions tests/regress/expected/replication_set.out
Original file line number Diff line number Diff line change
Expand Up @@ -449,14 +449,14 @@ SELECT
remote_new_tup,error_message
FROM spock.exception_log
ORDER BY command_counter;
command_counter | table_schema | table_name | operation | remote_new_tup | error_message
-----------------+--------------+------------+-----------+----------------------------------------------------+---------------------------
command_counter | table_schema | table_name | operation | remote_new_tup | error_message
-----------------+--------------+------------+-----------+----------------------------------------------------+---------------------------------------------------------------------
1 | | | INSERT | | Spock can't find relation
2 | | | INSERT | | Spock can't find relation
3 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | unavailable
3 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | transdiscard: tuple discarded due to exception at command_counter 1
4 | | | INSERT | | Spock can't find relation
5 | | | INSERT | | Spock can't find relation
6 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | unavailable
6 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | discard: tuple discarded due to exception at command_counter 1
(6 rows)

\c :provider_dsn
Expand Down Expand Up @@ -564,13 +564,13 @@ ORDER BY command_counter;
-----------------+--------------+-------------+-----------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------
1 | | | INSERT | | Spock can't find relation
2 | | | INSERT | | Spock can't find relation
3 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | unavailable
3 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | transdiscard: tuple discarded due to exception at command_counter 1
4 | | | INSERT | | Spock can't find relation
5 | | | INSERT | | Spock can't find relation
6 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | unavailable
6 | public | spoc_102g | INSERT | [{"value": -4, "attname": "x", "atttype": "int4"}] | discard: tuple discarded due to exception at command_counter 1
7 | | | UPDATE | | Spock can't find relation
8 | | | UPDATE | | Spock can't find relation
9 | public | spoc_102g_u | UPDATE | [{"value": -3, "attname": "x", "atttype": "int4"}] | unavailable
9 | public | spoc_102g_u | UPDATE | [{"value": -3, "attname": "x", "atttype": "int4"}] | discard: tuple discarded due to exception at command_counter 2
10 | public | spoc_102l_u | UPDATE | [{"value": -3, "attname": "x", "atttype": "int4"}] | logical replication did not find row to be updated in replication target relation (public.spoc_102l_u)
(10 rows)

Expand Down
40 changes: 28 additions & 12 deletions tests/tap/t/013_exception_handling.pl
Original file line number Diff line number Diff line change
Expand Up @@ -307,12 +307,15 @@ sub set_exception_behaviour {
# ===========================================================================
# Part 5: TRANSDISCARD error_message quality (backport fix regression test)
# ===========================================================================
# Before the fix, TRANSDISCARD logged bystander rows with error_message =
# 'unknown' because the NULL fallback in add_entry_to_exception_log was the
# string "unknown". After the fix:
# - The NULL fallback is "unavailable"
# - The originally-failing row carries the real constraint-violation message
# - Bystander rows carry "unavailable"
# Backport of ed17523 + b32ef95 to v5_STABLE. Before the fix, every row of a
# discarded transaction that was not itself the failing command was logged with
# the opaque placeholder error_message = 'unavailable' (the NULL fallback in
# add_entry_to_exception_log). After the fix:
# - No row carries the opaque 'unavailable' placeholder.
# - The originally-failing row carries the real constraint-violation message,
# prefixed with its SQLSTATE so the root cause is unambiguous.
# - Bystander (collateral-discard) rows carry an informative message that
# names the active behaviour and points at the command that failed.
#
# Scenario: three-table transaction where t_ehx_b has NOT NULL on n2 only.
# The INSERT with v=NULL succeeds on n1 but fails on n2, triggering TRANSDISCARD.
Expand Down Expand Up @@ -365,28 +368,41 @@ sub set_exception_behaviour {
is($p5_unknown_cnt, '0',
"P5: no exception_log entries with error_message = 'unknown' (regression guard)");

# Failing row must carry the real NOT NULL constraint-violation message.
# Failing row must carry the real NOT NULL constraint-violation message,
# now prefixed with its SQLSTATE so the root cause is unambiguous.
my $p5_b_err = scalar_query(2,
"SELECT error_message FROM spock.exception_log " .
"WHERE table_name = 't_ehx_b' " .
"ORDER BY retry_errored_at DESC LIMIT 1");
like($p5_b_err, qr/null|not.null|violates/i,
'P5: t_ehx_b exception_log has real constraint-violation message');
# scalar_query() strips whitespace, so match the SQLSTATE prefix space-agnostically.
like($p5_b_err, qr/\[SQLSTATE\s*23502\]/,
'P5: t_ehx_b message carries SQLSTATE 23502 (not_null_violation)');

# Bystander rows must have 'unavailable', not 'unknown'.
# The opaque 'unavailable' placeholder must be gone entirely.
my $p5_unavail_cnt = scalar_query(2,
"SELECT COUNT(*) FROM spock.exception_log " .
"WHERE error_message = 'unavailable' " .
"AND table_name IN ('t_ehx_a', 't_ehx_b', 't_ehx_c')");
is($p5_unavail_cnt, '0',
"P5: no exception_log entries with opaque 'unavailable' placeholder");

# Bystander rows must now carry an informative collateral-discard message
# (names the behaviour and points at the failing command), not 'unavailable'.
my $p5_a_err = scalar_query(2,
"SELECT error_message FROM spock.exception_log " .
"WHERE table_name = 't_ehx_a' " .
"ORDER BY retry_errored_at DESC LIMIT 1");
is($p5_a_err, 'unavailable',
"P5: bystander t_ehx_a has error_message = 'unavailable'");
like($p5_a_err, qr/discarded\s*due\s*to\s*exception/,
'P5: bystander t_ehx_a has informative discard message (not unavailable)');

my $p5_c_err = scalar_query(2,
"SELECT error_message FROM spock.exception_log " .
"WHERE table_name = 't_ehx_c' " .
"ORDER BY retry_errored_at DESC LIMIT 1");
is($p5_c_err, 'unavailable',
"P5: bystander t_ehx_c has error_message = 'unavailable'");
like($p5_c_err, qr/discarded\s*due\s*to\s*exception/,
'P5: bystander t_ehx_c has informative discard message (not unavailable)');

# TRANSDISCARD rolls back the entire transaction — no rows land on n2.
sleep(3);
Expand Down
Loading
Loading