From b41dd5427379b890fc3cb84baf2d657bd590aeb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henning=20P=C3=B6ttker?=
 <25299532+hpoettker@users.noreply.github.com>
Date: Sat, 18 Apr 2026 16:11:21 +0200
Subject: [PATCH] Support deleted rows in SAS data files

---
 src/sas/readstat_sas.h           | 12 ++--
 src/sas/readstat_sas7bdat_read.c | 97 ++++++++++++++++++++++++++++----
 2 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/src/sas/readstat_sas.h b/src/sas/readstat_sas.h
index 6b80c2f4..a7efd31e 100644
--- a/src/sas/readstat_sas.h
+++ b/src/sas/readstat_sas.h
@@ -95,8 +95,9 @@ typedef struct sas_text_ref_s {
 #define SAS_PAGE_TYPE_AMD    0x0400
 #define SAS_PAGE_TYPE_MASK   0x0F00
 
-#define SAS_PAGE_TYPE_META2  0x4000
-#define SAS_PAGE_TYPE_COMP   0x9000
+#define SAS_PAGE_TYPE_DELETED 0x0080
+#define SAS_PAGE_TYPE_META2   0x4000
+#define SAS_PAGE_TYPE_COMP    0x9000
 
 #define SAS_SUBHEADER_POINTER_SIZE_32BIT    12
 #define SAS_SUBHEADER_POINTER_SIZE_64BIT    24
@@ -104,9 +105,10 @@ typedef struct sas_text_ref_s {
 #define SAS_PAGE_HEADER_SIZE_32BIT  24
 #define SAS_PAGE_HEADER_SIZE_64BIT  40
 
-#define SAS_COMPRESSION_NONE   0x00
-#define SAS_COMPRESSION_TRUNC  0x01
-#define SAS_COMPRESSION_ROW    0x04
+#define SAS_COMPRESSION_NONE        0x00
+#define SAS_COMPRESSION_TRUNC       0x01
+#define SAS_COMPRESSION_ROW         0x04
+#define SAS_COMPRESSION_ROW_DELETED 0x05
 
 #define SAS_COMPRESSION_SIGNATURE_RLE  "SASYZCRL"
 #define SAS_COMPRESSION_SIGNATURE_RDC  "SASYZCR2"
diff --git a/src/sas/readstat_sas7bdat_read.c b/src/sas/readstat_sas7bdat_read.c
index b7f02965..b2e62709 100644
--- a/src/sas/readstat_sas7bdat_read.c
+++ b/src/sas/readstat_sas7bdat_read.c
@@ -46,8 +46,10 @@ typedef struct sas7bdat_ctx_s {
     uint32_t       row_length;
     uint32_t       page_row_count;
     uint32_t       parsed_row_count;
+    uint32_t       parsed_deleted_row_count;
     uint32_t       column_count;
     uint32_t       row_limit;
+    uint32_t       deleted_row_limit;
     uint32_t       row_offset;
 
     uint64_t       header_size;
@@ -232,7 +234,7 @@ static readstat_error_t sas7bdat_parse_column_size_subheader(const char *subhead
 static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
     readstat_error_t retval = READSTAT_OK;
     uint64_t total_row_count;
-    uint64_t row_length, page_row_count;
+    uint64_t row_length, deleted_row_limit, page_row_count;
 
     if (len < (ctx->u64 ? 250: 190)) {
         retval = READSTAT_ERROR_PARSE;
@@ -242,13 +244,21 @@ static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader,
     if (ctx->u64) {
         row_length = sas_read8(&subheader[40], ctx->bswap);
         total_row_count = sas_read8(&subheader[48], ctx->bswap);
+        deleted_row_limit = sas_read8(&subheader[56], ctx->bswap);
         page_row_count = sas_read8(&subheader[120], ctx->bswap);
     } else {
         row_length = sas_read4(&subheader[20], ctx->bswap);
         total_row_count = sas_read4(&subheader[24], ctx->bswap);
+        deleted_row_limit = sas_read4(&subheader[28], ctx->bswap);
         page_row_count = sas_read4(&subheader[60], ctx->bswap);
     }
 
+    if (deleted_row_limit > total_row_count) {
+        retval = READSTAT_ERROR_PARSE;
+        goto cleanup;
+    }
+    ctx->deleted_row_limit = deleted_row_limit;
+
     sas_text_ref_t file_label_ref = sas7bdat_parse_text_ref(&subheader[len-130], ctx);
     if (file_label_ref.length) {
         if ((retval = sas7bdat_copy_text_ref(ctx->file_label, sizeof(ctx->file_label),
@@ -390,6 +400,19 @@ static readstat_error_t sas7bdat_parse_column_format_subheader(const char *subhe
     return retval;
 }
 
+static readstat_error_t sas7bdat_register_deleted_row(sas7bdat_ctx_t *ctx) {
+    if (ctx->parsed_deleted_row_count >= ctx->deleted_row_limit) { /* more than the row-size subheader declared */
+        return READSTAT_ERROR_PARSE;
+    }
+    ctx->parsed_row_count++;
+    ctx->parsed_deleted_row_count++;
+    return READSTAT_OK;
+}
+
+static uint32_t sas7bdat_get_current_row_id(sas7bdat_ctx_t *ctx) {
+    return ctx->parsed_row_count - ctx->parsed_deleted_row_count;
+}
+
 static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable,
         col_info_t *col_info, const char *col_data, sas7bdat_ctx_t *ctx) {
     readstat_error_t retval = READSTAT_OK;
@@ -406,7 +429,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
             if (ctx->handle.error) {
                 snprintf(ctx->error_buf, sizeof(ctx->error_buf),
                         "ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
-                        ctx->parsed_row_count+1, col_info->index+1, col_info->width, col_data);
+                        sas7bdat_get_current_row_id(ctx)+1, col_info->index+1, col_info->width, col_data);
                 ctx->handle.error(ctx->error_buf, ctx->user_ctx);
             }
             goto cleanup;
@@ -438,7 +461,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
             value.v.double_value = dval;
         }
     }
-    cb_retval = ctx->handle.value(ctx->parsed_row_count, variable, value, ctx->user_ctx);
+    cb_retval = ctx->handle.value(sas7bdat_get_current_row_id(ctx), variable, value, ctx->user_ctx);
 
     if (cb_retval != READSTAT_HANDLER_OK)
         retval = READSTAT_ERROR_USER_ABORT;
@@ -487,7 +510,14 @@ static readstat_error_t sas7bdat_parse_single_row(const char *data, sas7bdat_ctx
     return retval;
 }
 
-static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bdat_ctx_t *ctx) {
+static uint8_t sas7bdat_read_bitmap(const uint8_t *bitmap, int index) {
+    uint8_t current_byte = bitmap[index / 8];
+    uint8_t mask = 1 << (7 - index % 8); /* index 0 is the most significant bit of byte 0 */
+
+    return current_byte & mask;
+}
+
+static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, const uint8_t *deleted_bitmap, sas7bdat_ctx_t *ctx) {
     readstat_error_t retval = READSTAT_OK;
     int i;
     size_t row_offset=0;
@@ -496,8 +526,13 @@ static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bd
             retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
             goto cleanup;
         }
-        if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK)
+        if (deleted_bitmap != NULL && sas7bdat_read_bitmap(deleted_bitmap, i)) {
+            if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
+                goto cleanup;
+            }
+        } else if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK) {
             goto cleanup;
+        }
         row_offset += ctx->row_length;
     }
 
@@ -608,7 +643,7 @@ static readstat_error_t sas7bdat_parse_subheader_rle(const char *subheader, size
         if (ctx->handle.error) {
             snprintf(ctx->error_buf, sizeof(ctx->error_buf),
                     "ReadStat: Row #%d decompressed to %ld bytes (expected %d bytes)",
-                    ctx->parsed_row_count, (long)(bytes_decompressed), ctx->row_length);
+                    sas7bdat_get_current_row_id(ctx), (long)(bytes_decompressed), ctx->row_length);
             ctx->handle.error(ctx->error_buf, ctx->user_ctx);
         }
         goto cleanup;
@@ -735,7 +770,7 @@ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compres
     readstat_error_t retval = READSTAT_OK;
     if (ctx->handle.metadata) {
         readstat_metadata_t metadata = {
-            .row_count = ctx->row_limit,
+            .row_count = ctx->row_limit - ctx->deleted_row_limit,
             .var_count = ctx->column_count,
             .table_name = ctx->table_name,
             .file_label = ctx->file_label,
@@ -895,7 +930,7 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
                     goto cleanup;
                 }
             }
-        } else if (shp_info.compression == SAS_COMPRESSION_ROW) {
+        } else if (shp_info.compression == SAS_COMPRESSION_ROW || shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
             /* void */
         } else {
             retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
@@ -911,6 +946,26 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
     return retval;
 }
 
+/* Locates the bitmap after a page's rows and unused bytes; READSTAT_ERROR_PARSE if it overruns. */
+static readstat_error_t sas7bdat_parse_deleted_row_bitmap(const char *page, const char *data,
+        size_t page_size, const uint8_t **deleted_row_bitmap, sas7bdat_ctx_t *ctx) {
+    uint64_t page_unused_bytes = ctx->u64 ? sas_read8(&page[24], ctx->bswap)
+                                          : sas_read4(&page[12], ctx->bswap);
+    uint32_t row_count = ctx->page_row_count < ctx->row_limit ? ctx->page_row_count : ctx->row_limit;
+    uint64_t row_bytes = (uint64_t)row_count * ctx->row_length;
+    uint64_t required_bytes = row_count / 8 + (row_count % 8 == 0 ? 0 : 1);
+    uint64_t data_offset = (uint64_t)(data - page);
+
+    /* 64-bit operands compared by subtraction: file-supplied sizes cannot wrap the check. */
+    if (data_offset > page_size || row_bytes > page_size - data_offset ||
+            page_unused_bytes > page_size - data_offset - row_bytes ||
+            required_bytes > page_size - data_offset - row_bytes - page_unused_bytes) {
+        return READSTAT_ERROR_PARSE;
+    }
+    *deleted_row_bitmap = (const uint8_t *)data + row_bytes + page_unused_bytes;
+    return READSTAT_OK;
+}
+
 static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
     uint16_t page_type;
 
@@ -975,6 +1030,10 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
                     if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
                         goto cleanup;
                     }
+                } else if (shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
+                    if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
+                        goto cleanup;
+                    }
                 } else {
                     retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
                     goto cleanup;
@@ -1004,7 +1063,14 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
             goto cleanup;
         }
         if (ctx->handle.value) {
-            retval = sas7bdat_parse_rows(data, page + page_size - data, ctx);
+            const uint8_t *deleted_row_bitmap = NULL;
+            if (page_type & SAS_PAGE_TYPE_DELETED) {
+                if ((retval = sas7bdat_parse_deleted_row_bitmap(page, data, page_size,
+                        &deleted_row_bitmap, ctx)) != READSTAT_OK) {
+                    goto cleanup;
+                }
+            }
+            retval = sas7bdat_parse_rows(data, page + page_size - data, deleted_row_bitmap, ctx);
         }
     }
 cleanup:
@@ -1276,11 +1342,22 @@ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *
         goto cleanup;
     }
 
+    if (ctx->handle.value && ctx->parsed_deleted_row_count != ctx->deleted_row_limit) {
+        retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
+        if (ctx->handle.error) {
+            snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d deleted rows in file, found %d",
+                    ctx->deleted_row_limit, ctx->parsed_deleted_row_count);
+            ctx->handle.error(ctx->error_buf, ctx->user_ctx);
+        }
+        goto cleanup;
+    }
+
     if (ctx->handle.value && ctx->parsed_row_count != ctx->row_limit) {
         retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
         if (ctx->handle.error) {
             snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d rows in file, found %d",
-                    ctx->row_limit, ctx->parsed_row_count);
+                    ctx->row_limit - ctx->deleted_row_limit,
+                    ctx->parsed_row_count - ctx->parsed_deleted_row_count);
             ctx->handle.error(ctx->error_buf, ctx->user_ctx);
         }
         goto cleanup;