Skip to content

Commit 36a9a1f

Browse files
committed
Rewrite Boyer-Moore implementation
It has been externally tested with many test cases, including specific tests for the Rytter correction.
1 parent 0e6eeaf commit 36a9a1f

File tree

1 file changed

+58
-43
lines changed

1 file changed

+58
-43
lines changed

extract-xiso.c

Lines changed: 58 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -593,8 +593,8 @@ avl_result avl_insert( dir_node_avl **in_root, dir_node_avl *in_node );
593593
int avl_traverse_depth_first( dir_node_avl *in_root, traversal_callback in_callback, void *in_context, avl_traversal_method in_method, int in_depth );
594594

595595
void boyer_moore_done();
596-
char *boyer_moore_search( char *in_text, long in_text_len );
597-
int boyer_moore_init( const char *in_pattern, long in_pat_len, long in_alphabet_size );
596+
char *boyer_moore_search( char *in_text, size_t in_text_len );
597+
int boyer_moore_init( const char *in_pattern, size_t in_pat_len, size_t in_alphabet_size );
598598

599599
int free_dir_node_avl(dir_node_avl* in_dir_node_avl, void* in_context, int in_depth);
600600
int extract_file( int in_xiso, dir_node *in_file, modes in_mode, const char *path );
@@ -622,11 +622,11 @@ void write_sector( int in_xiso, xoff_t in_start, const char *in_name, const char
622622
#endif
623623

624624

625-
static long s_pat_len;
625+
static size_t s_pat_len;
626626
static bool s_quiet = false;
627627
static const char *s_pattern = NULL;
628-
static long *s_gs_table = NULL;
629-
static long *s_bc_table = NULL;
628+
static size_t *s_gs_table = NULL;
629+
static size_t *s_bc_table = NULL;
630630
static long long s_total_bytes = 0;
631631
static int s_total_files = 0;
632632
static char *s_copy_buffer = NULL;
@@ -1622,41 +1622,58 @@ int avl_traverse_depth_first( dir_node_avl *in_root, traversal_callback in_callb
16221622
#endif
16231623

16241624

1625-
int boyer_moore_init( const char *in_pattern, long in_pat_len, long in_alphabet_size ) {
1626-
long i, j, k, *backup, err = 0;
1625+
int boyer_moore_init( const char *in_pattern, size_t in_pat_len, size_t in_alphabet_size ) {
1626+
size_t j, k, t, t1, q, q1, *aux = NULL;
1627+
int err = 0;
16271628

16281629
s_pattern = in_pattern;
16291630
s_pat_len = in_pat_len;
1630-
1631-
if ( ( s_bc_table = (long *) malloc( in_alphabet_size * sizeof(long) ) ) == NULL ) mem_err();
1632-
1633-
if ( ! err ) {
1634-
for ( i = 0; i < in_alphabet_size; ++i ) s_bc_table[ i ] = in_pat_len;
1635-
for ( i = 0; i < in_pat_len - 1; ++i ) s_bc_table[ (uint8_t) in_pattern[ i ] ] = in_pat_len - i - 1;
1636-
1637-
if ( ( s_gs_table = (long *) malloc( 2 * ( in_pat_len + 1 ) * sizeof(long) ) ) == NULL ) mem_err();
1638-
}
16391631

1640-
if ( ! err ) {
1641-
backup = s_gs_table + in_pat_len + 1;
1642-
1643-
for ( i = 1; i <= in_pat_len; ++i ) s_gs_table[ i ] = 2 * in_pat_len - i;
1644-
for ( i = in_pat_len, j = in_pat_len + 1; i; --i, --j ) {
1645-
backup[ i ] = j;
1632+
boyer_moore_done(); // Prepare for a new init
1633+
1634+
if (in_pat_len == 0) return 0;
16461635

1647-
while ( j <= in_pat_len && in_pattern[ i - 1 ] != in_pattern[ j - 1 ] ) {
1648-
if ( s_gs_table[ j ] > in_pat_len - i ) s_gs_table[ j ] = in_pat_len - i;
1649-
j = backup[ j ];
1636+
// Delta1 table
1637+
if ((s_bc_table = (size_t*)malloc(in_alphabet_size * sizeof(size_t))) == NULL) mem_err();
1638+
if (!err) {
1639+
for (k = 0; k < in_alphabet_size; k++) s_bc_table[k] = in_pat_len;
1640+
for (k = 0; k < in_pat_len; k++) s_bc_table[(unsigned char)in_pattern[k]] = in_pat_len - 1 - k;
1641+
}
1642+
1643+
// Delta2 table (dd' algorithm with Rytter correction)
1644+
if (!err && (s_gs_table = (size_t*)malloc(in_pat_len * sizeof(size_t))) == NULL) mem_err();
1645+
if (!err && (aux = (size_t*)malloc(in_pat_len * sizeof(size_t))) == NULL) mem_err();
1646+
if (!err) {
1647+
// Step A1
1648+
for (k = 1; k <= in_pat_len; k++) s_gs_table[k - 1] = 2 * in_pat_len - k;
1649+
1650+
// Step A2
1651+
for (j = in_pat_len, t = in_pat_len + 1; j > 0; j--, t--) {
1652+
aux[j - 1] = t;
1653+
while (t <= in_pat_len && in_pattern[j - 1] != in_pattern[t - 1]) {
1654+
s_gs_table[t - 1] = min(s_gs_table[t - 1], in_pat_len - j);
1655+
t = aux[t - 1];
16501656
}
16511657
}
1652-
for ( i = 1; i <= j; ++i ) if ( s_gs_table[ i ] > in_pat_len + j - i ) s_gs_table[ i ] = in_pat_len + j - i;
1653-
1654-
k = backup[ j ];
1655-
1656-
for ( ; j <= in_pat_len; k = backup[ k ] ) {
1657-
for ( ; j <= k; ++j ) if ( s_gs_table[ j ] >= k - j + in_pat_len ) s_gs_table[ j ] = k - j + in_pat_len;
1658+
1659+
// Step B1
1660+
q = t; t = in_pat_len + 1 - q;
1661+
for (j = 1, t1 = 0; j <= t; t1++, j++) {
1662+
aux[j - 1] = t1;
1663+
while (t1 >= 1 && in_pattern[j - 1] != in_pattern[t1 - 1]) t1 = aux[t1 - 1];
1664+
}
1665+
1666+
// Step B2
1667+
q1 = 1;
1668+
while (q < in_pat_len) {
1669+
for (k = q1; k <= q; k++) {
1670+
s_gs_table[k - 1] = min(s_gs_table[k - 1], in_pat_len + q - k);
1671+
}
1672+
q1 = q + 1; q = q + t - aux[t - 1]; t = aux[t - 1];
16581673
}
16591674
}
1675+
1676+
if (aux) free(aux);
16601677

16611678
return err;
16621679
}
@@ -1668,22 +1685,20 @@ void boyer_moore_done() {
16681685
}
16691686

16701687

1671-
char *boyer_moore_search( char *in_text, long in_text_len ) {
1672-
long i, j, k, l;
1688+
char* boyer_moore_search(char* in_text, size_t in_text_len) {
1689+
size_t i, j;
16731690

1674-
for ( i = j = s_pat_len - 1; j < in_text_len && i >= 0; ) {
1675-
if ( in_text[ j ] == s_pattern[ i ] ) { --i; --j; }
1676-
else {
1677-
k = s_gs_table[ i + 1 ];
1678-
l = s_bc_table[ (uint8_t) in_text[ j ] ];
1691+
if (s_pat_len == 0) return in_text;
16791692

1680-
j += max( k, l );
1681-
1682-
i = s_pat_len - 1;
1693+
i = s_pat_len - 1;
1694+
while (i < in_text_len) {
1695+
for (j = s_pat_len - 1; in_text[i] == s_pattern[j]; --i, --j) {
1696+
if (j == 0) return in_text + i;
16831697
}
1698+
1699+
i += max(s_bc_table[(unsigned char)in_text[i]], s_gs_table[j]);
16841700
}
1685-
1686-
return i < 0 ? in_text + j + 1 : NULL;
1701+
return NULL;
16871702
}
16881703

16891704

0 commit comments

Comments
 (0)