2929#define XDL_GUESS_NLINES1 256
3030#define XDL_GUESS_NLINES2 20
3131
32+ #define DISCARD 0
33+ #define KEEP 1
34+ #define INVESTIGATE 2
3235
3336typedef struct s_xdlclass {
3437 struct s_xdlclass * next ;
@@ -190,15 +193,15 @@ void xdl_free_env(xdfenv_t *xe) {
190193}
191194
192195
193- static int xdl_clean_mmatch (char const * dis , long i , long s , long e ) {
196+ static bool xdl_clean_mmatch (uint8_t const * action , long i , long s , long e ) {
194197 long r , rdis0 , rpdis0 , rdis1 , rpdis1 ;
195198
196199 /*
197- * Limits the window the is examined during the similar-lines
198- * scan. The loops below stops when dis [i - r] == 1 (line that
199- * has no match), but there are corner cases where the loop
200- * proceed all the way to the extremities by causing huge
201- * performance penalties in case of big files.
200+ * Limits the window that is examined during the similar-lines
201+ * scan. The loops below stop when action [i - r] == KEEP
202+ * (line that has a match and is kept), but there are corner
203+ * cases where the loops proceed all the way to the extremities,
204+ * causing huge performance penalties in case of big files.
202205 */
203206 if (i - s > XDL_SIMSCAN_WINDOW )
204207 s = i - XDL_SIMSCAN_WINDOW ;
@@ -207,40 +210,47 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
207210
208211 /*
209212 * Scans the lines before 'i' to find a run of lines that either
210- * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
211- * Note that we always call this function with dis[i] > 1, so the
212- * current line (i) is already a multimatch line.
213+ * have no match (action[j] == DISCARD) or have multiple matches
214+ * (action[j] == INVESTIGATE). Note that we always call this
215+ * function with action[i] == INVESTIGATE, so the current line
216+ * (i) is already a multimatch line.
213217 */
214218 for (r = 1 , rdis0 = 0 , rpdis0 = 1 ; (i - r ) >= s ; r ++ ) {
215- if (! dis [i - r ])
219+ if (action [i - r ] == DISCARD )
216220 rdis0 ++ ;
217- else if (dis [i - r ] == 2 )
221+ else if (action [i - r ] == INVESTIGATE )
218222 rpdis0 ++ ;
219- else
223+ else if ( action [ i - r ] == KEEP )
220224 break ;
225+ else
226+ BUG ("Illegal value for action[i - r]" );
221227 }
222228 /*
223- * If the run before the line 'i' found only multimatch lines, we
224- * return 0 and hence we don't make the current line (i) discarded.
225- * We want to discard multimatch lines only when they appear in the
226- * middle of runs with nomatch lines (dis[j] == 0).
229+ * If the run before the line 'i' found only multimatch lines,
230+ * we return false and hence we don't make the current line (i)
231+ * discarded. We want to discard multimatch lines only when
232+ * they appear in the middle of runs with nomatch lines
233+ * (action[j] == DISCARD).
227234 */
228235 if (rdis0 == 0 )
229236 return 0 ;
230237 for (r = 1 , rdis1 = 0 , rpdis1 = 1 ; (i + r ) <= e ; r ++ ) {
231- if (! dis [i + r ])
238+ if (action [i + r ] == DISCARD )
232239 rdis1 ++ ;
233- else if (dis [i + r ] == 2 )
240+ else if (action [i + r ] == INVESTIGATE )
234241 rpdis1 ++ ;
235- else
242+ else if ( action [ i + r ] == KEEP )
236243 break ;
244+ else
245+ BUG ("Illegal value for action[i + r]" );
237246 }
238247 /*
239- * If the run after the line 'i' found only multimatch lines, we
240- * return 0 and hence we don't make the current line (i) discarded.
248+ * If the run after the line 'i' found only multimatch lines,
249+ * we return false and hence we don't make the current line (i)
250+ * discarded.
241251 */
242252 if (rdis1 == 0 )
243- return 0 ;
253+ return false ;
244254 rdis1 += rdis0 ;
245255 rpdis1 += rpdis0 ;
246256
@@ -251,59 +261,81 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
251261/*
252262 * Try to reduce the problem complexity, discard records that have no
253263 * matches on the other file. Also, lines that have multiple matches
254- * might be potentially discarded if they happear in a run of discardable.
264+ * might potentially be discarded if they appear in a run of discardable lines.
255265 */
256266static int xdl_cleanup_records (xdlclassifier_t * cf , xdfile_t * xdf1 , xdfile_t * xdf2 ) {
257267 long i , nm , nreff , mlim ;
258268 xrecord_t * recs ;
259269 xdlclass_t * rcrec ;
260- char * dis , * dis1 , * dis2 ;
261- int need_min = !!(cf -> flags & XDF_NEED_MINIMAL );
270+ uint8_t * action1 = NULL , * action2 = NULL ;
271+ bool need_min = !!(cf -> flags & XDF_NEED_MINIMAL );
272+ int ret = 0 ;
262273
263- if (!XDL_CALLOC_ARRAY (dis , xdf1 -> nrec + xdf2 -> nrec + 2 ))
264- return -1 ;
265- dis1 = dis ;
266- dis2 = dis1 + xdf1 -> nrec + 1 ;
274+ /*
275+ * Create temporary arrays that will help us decide if
276+ * changed[i] should remain 0 or become 1.
277+ */
278+ if (!XDL_CALLOC_ARRAY (action1 , xdf1 -> nrec + 1 )) {
279+ ret = -1 ;
280+ goto cleanup ;
281+ }
282+ if (!XDL_CALLOC_ARRAY (action2 , xdf2 -> nrec + 1 )) {
283+ ret = -1 ;
284+ goto cleanup ;
285+ }
267286
287+ /*
288+ * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
289+ */
268290 if ((mlim = xdl_bogosqrt (xdf1 -> nrec )) > XDL_MAX_EQLIMIT )
269291 mlim = XDL_MAX_EQLIMIT ;
270292 for (i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ]; i <= xdf1 -> dend ; i ++ , recs ++ ) {
271293 rcrec = cf -> rcrecs [recs -> ha ];
272294 nm = rcrec ? rcrec -> len2 : 0 ;
273- dis1 [i ] = (nm == 0 ) ? 0 : (nm >= mlim && !need_min ) ? 2 : 1 ;
295+ action1 [i ] = (nm == 0 ) ? DISCARD : (nm >= mlim && !need_min ) ? INVESTIGATE : KEEP ;
274296 }
275297
276298 if ((mlim = xdl_bogosqrt (xdf2 -> nrec )) > XDL_MAX_EQLIMIT )
277299 mlim = XDL_MAX_EQLIMIT ;
278300 for (i = xdf2 -> dstart , recs = & xdf2 -> recs [xdf2 -> dstart ]; i <= xdf2 -> dend ; i ++ , recs ++ ) {
279301 rcrec = cf -> rcrecs [recs -> ha ];
280302 nm = rcrec ? rcrec -> len1 : 0 ;
281- dis2 [i ] = (nm == 0 ) ? 0 : (nm >= mlim && !need_min ) ? 2 : 1 ;
303+ action2 [i ] = (nm == 0 ) ? DISCARD : (nm >= mlim && !need_min ) ? INVESTIGATE : KEEP ;
282304 }
283305
306+ /*
307+ * Use temporary arrays to decide if changed[i] should remain
308+ * 0 or become 1.
309+ */
284310 for (nreff = 0 , i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ];
285311 i <= xdf1 -> dend ; i ++ , recs ++ ) {
286- if (dis1 [i ] == 1 ||
287- (dis1 [i ] == 2 && !xdl_clean_mmatch (dis1 , i , xdf1 -> dstart , xdf1 -> dend ))) {
312+ if (action1 [i ] == KEEP ||
313+ (action1 [i ] == INVESTIGATE && !xdl_clean_mmatch (action1 , i , xdf1 -> dstart , xdf1 -> dend ))) {
288314 xdf1 -> rindex [nreff ++ ] = i ;
315+ /* changed[i] remains 0, i.e. keep */
289316 } else
290317 xdf1 -> changed [i ] = 1 ;
318+ /* i.e. discard */
291319 }
292320 xdf1 -> nreff = nreff ;
293321
294322 for (nreff = 0 , i = xdf2 -> dstart , recs = & xdf2 -> recs [xdf2 -> dstart ];
295323 i <= xdf2 -> dend ; i ++ , recs ++ ) {
296- if (dis2 [i ] == 1 ||
297- (dis2 [i ] == 2 && !xdl_clean_mmatch (dis2 , i , xdf2 -> dstart , xdf2 -> dend ))) {
324+ if (action2 [i ] == KEEP ||
325+ (action2 [i ] == INVESTIGATE && !xdl_clean_mmatch (action2 , i , xdf2 -> dstart , xdf2 -> dend ))) {
298326 xdf2 -> rindex [nreff ++ ] = i ;
327+ /* changed[i] remains 0, i.e. keep */
299328 } else
300329 xdf2 -> changed [i ] = 1 ;
330+ /* i.e. discard */
301331 }
302332 xdf2 -> nreff = nreff ;
303333
304- xdl_free (dis );
334+ cleanup :
335+ xdl_free (action1 );
336+ xdl_free (action2 );
305337
306- return 0 ;
338+ return ret ;
307339}
308340
309341
0 commit comments