Extensively commented cdparanoia's stage 1 matching. No code changes apart

from added white space for improved readability. Comments containing "???" suggest areas for further study and documentation.
2005-10-17 15:31:08 +00:00
parent 10166d3d88
commit 6bcd6c5609
1 changed files with 413 additions and 65 deletions
--- a/lib/paranoia/paranoia.c
+++ b/lib/paranoia/paranoia.c
@@ -1,5 +1,5 @@
 /*
-  $Id: paranoia.c,v 1.18 2005/10/14 02:07:06 rocky Exp $
+  $Id: paranoia.c,v 1.19 2005/10/17 15:31:08 pjcreath Exp $
  Copyright (C) 2004, 2005 Rocky Bernstein <rocky@panix.com>
  Copyright (C) 1998 Monty xiphmont@mit.edu
@@ -149,8 +149,23 @@ enum  {
 #define FLAGS_UNREAD   0x02 
 #define FLAGS_VERIFIED 0x04
 /**** matching and analysis code *****************************************/
 /* ===========================================================================
 * i_paranoia_overlap() (internal)
 *
 * This function is called when buffA[offsetA] == buffB[offsetB].  This
 * function searches backward and forward to see how many consecutive
 * samples also match.
 *
 * This function is called by do_const_sync() when we're not doing any
 * verification.  Its more complicated sibling is i_paranoia_overlap2.
 *
 * This function returns the number of consecutive matching samples.
 * If (ret_begin) or (ret_end) are not NULL, it fills them with the
 * offsets of the first and last matching samples in A.
 */
 static inline long 
 i_paranoia_overlap(int16_t *buffA,int16_t *buffB,
 		   long offsetA, long offsetB,
@@ -160,19 +175,39 @@ i_paranoia_overlap(int16_t *buffA,int16_t *buffB,
  long beginA=offsetA,endA=offsetA;
  long beginB=offsetB,endB=offsetB;
  /* Scan backward to extend the matching run in that direction. */
  for(; beginA>=0 && beginB>=0; beginA--,beginB--)
    if (buffA[beginA] != buffB[beginB]) break;
  beginA++;
  beginB++;
  /* Scan forward to extend the matching run in that direction. */
  for(; endA<sizeA && endB<sizeB; endA++,endB++)
    if (buffA[endA] != buffB[endB]) break;
  /* Return the result of our search. */
  if (ret_begin) *ret_begin = beginA;
  if (ret_end) *ret_end = endA;
  return (endA-beginA);
 }
 /* ===========================================================================
 * i_paranoia_overlap2() (internal)
 *
 * This function is called when buffA[offsetA] == buffB[offsetB].  This
 * function searches backward and forward to see how many consecutive
 * samples also match.
 *
 * This function is called by do_const_sync() when we're verifying the
 * data coming off the CD.  Its less complicated sibling is
 * i_paranoia_overlap, which is a good place to look to see the simplest
 * outline of how this function works.
 *
 * This function returns the number of consecutive matching samples.
 * If (ret_begin) or (ret_end) are not NULL, it fills them with the
 * offsets of the first and last matching samples in A.
 */
 static inline long 
 i_paranoia_overlap2(int16_t *buffA,int16_t *buffB,
 		    unsigned char *flagsA, unsigned char *flagsB,
@@ -183,45 +218,69 @@ i_paranoia_overlap2(int16_t *buffA,int16_t *buffB,
  long beginA=offsetA, endA=offsetA;
  long beginB=offsetB, endB=offsetB;
  /* Scan backward to extend the matching run in that direction. */
  for (; beginA>=0 && beginB>=0; beginA--,beginB--) {
    if (buffA[beginA] != buffB[beginB]) break;
    /* don't allow matching across matching sector boundaries */
-    /* don't allow matching through known missing data */
+    /* Stop if both samples were at the edges of a low-level read.
     * ???: What implications does this have?
     * ???: Why do we include the first sample for which this is true?
     */
    if ((flagsA[beginA]&flagsB[beginB]&FLAGS_EDGE)) {
      beginA--;
      beginB--;
      break;
    }
-    if ((flagsA[beginA]&FLAGS_UNREAD) || (flagsB[beginB]&FLAGS_UNREAD))break;
+
    /* don't allow matching through known missing data */
    if ((flagsA[beginA]&FLAGS_UNREAD) || (flagsB[beginB]&FLAGS_UNREAD))
      break;
  }
  beginA++;
  beginB++;
  /* Scan forward to extend the matching run in that direction. */
  for (; endA<sizeA && endB<sizeB; endA++,endB++) {
    if (buffA[endA] != buffB[endB]) break;
    /* don't allow matching across matching sector boundaries */
    /* Stop if both samples were at the edges of a low-level read.
     * ???: What implications does this have?
     * ???: Why do we not stop if endA == beginA?
     */
    if ((flagsA[endA]&flagsB[endB]&FLAGS_EDGE) && endA!=beginA){
      break;
    }
    /* don't allow matching through known missing data */
-    if ((flagsA[endA]&FLAGS_UNREAD) || (flagsB[endB]&FLAGS_UNREAD))break;
+    if ((flagsA[endA]&FLAGS_UNREAD) || (flagsB[endB]&FLAGS_UNREAD))
      break;
  }
  /* Return the result of our search. */
  if (ret_begin) *ret_begin = beginA;
  if (ret_end) *ret_end = endA;
  return (endA-beginA);
 }
 /* Top level of the first stage matcher */
 /* We match each analysis point of new to the preexisting blocks
 recursively.  We can also optionally maintain a list of fragments of
 the preexisting block that didn't match anything, and match them back
 afterward. */
 #define OVERLAP_ADJ (MIN_WORDS_OVERLAP/2-1)
 /* ===========================================================================
 * do_const_sync() (internal)
 *
 * This function is called when samples A[posA] == B[posB].  It tries to
 * build a matching run from that point, looking forward and backward to
 * see how many consecutive samples match.  Since the starting samples
 * might only be coincidentally identical, we only consider the run to
 * be a true match if it's longer than MIN_WORDS_SEARCH.
 *
 * This function returns the length of the run if a matching run was found,
 * or 0 otherwise.  If a matching run was found, (begin) and (end) are set
 * to the absolute positions of the beginning and ending samples of the
 * run in A, and (offset) is set to the jitter between the c_blocks.
 * (I.e., offset indicates the distance between what A considers sample N
 * on the CD and what B considers sample N.)
 */
 static inline long int 
 do_const_sync(c_block_t *A,
 	      sort_info_t *B, unsigned char *flagB,
@@ -231,6 +290,10 @@ do_const_sync(c_block_t *A,
  unsigned char *flagA=A->flags;
  long ret=0;
  /* If we're doing any verification whatsoever, we have flags and will
   * take them into account.  Otherwise, we just do the simple equality
   * test for samples on both sides of the initial match.
   */
  if (flagB==NULL)
    ret=i_paranoia_overlap(cv(A), iv(B), posA, posB,
 			   cs(A), is(B), begin, end);
@@ -240,8 +303,15 @@ do_const_sync(c_block_t *A,
 			      posA, posB, cs(A), is(B),
 			      begin, end);
  /* Small matching runs could just be coincidental.  We only consider this
   * a real match if it's long enough.
   */
  if (ret > MIN_WORDS_SEARCH) {
    *offset=+(posA+cb(A))-(posB+ib(B));
    /* ???: Contrary to the original comment, this appears to be relative to
     * A, not B.
     */
    *begin+=cb(A);
    *end+=cb(A);
    return(ret);
@@ -250,6 +320,30 @@ do_const_sync(c_block_t *A,
  return(0);
 }
 /* ===========================================================================
 * try_sort_sync() (internal)
 *
 * Starting from the sample in B with the absolute position (post), look
 * for a matching run in A.  This search will look in A for a first
 * matching sample within (p->dynoverlap) samples around (post).  If it
 * finds one, it will then determine how many consecutive samples match
 * both A and B from that point, looking backwards and forwards.  If
 * this search produces a matching run longer than MIN_WORDS_SEARCH, we
 * consider it a match.
 *
 * When used by stage 1, the "post" is planted with respect to the old
 * c_block being compared to the new c_block.  In stage 2, the "post" is
 * planted with respect to the verified root.
 *
 * This function returns 1 if a match is found and 0 if not.  When a match
 * is found, (begin) and (end) are set to the boundaries of the run, and
 * (offset) is set to the difference in position of the run in A and B.
 * (begin) and (end) are the absolute positions of the samples in
 * A.  (offset) counts from B's frame of reference.  I.e., an offset of
 * -2 would mean that A's absolute 3 is equivalent to B's 5.
 */
 /* post is w.r.t. B.  in stage one, we post from old.  In stage 2 we
   post from root. Begin, end, offset count from B's frame of
   reference */
@@ -275,11 +369,24 @@ try_sort_sync(cdrom_paranoia_t *p,
    {
      long zeropos=post-ib(A);
      if (zeropos>=0 && zeropos<is(A)) {
 	/* Before we bother with the search for matching samples,
 	 * we check the simple case.  If there's no jitter at all
 	 * (i.e. the absolute positions of A's and B's samples are
 	 * consistent), A's sample at (post) should be identical
 	 * to B's sample at the same position.
 	 */
 	if ( cv(B)[post-cb(B)] == iv(A)[zeropos] ) {
 	  /* The first sample matched, now try to grow the matching run
 	   * in both directions.  We only consider it a match if more
 	   * than MIN_WORDS_SEARCH consecutive samples match.
 	   */
 	  if (do_const_sync(B, A, Aflags,
 			    post-cb(B), zeropos,
 			    begin, end, offset) ) {
 	    /* ???: To be studied. */
 	    offset_add_value(p,&(p->stage1),*offset,callback);
 	    return(1);
@@ -290,25 +397,79 @@ try_sort_sync(cdrom_paranoia_t *p,
  } else
    return(0);
  /* If the samples with the same absolute position didn't match, it's
   * either a bad sample, or the two c_blocks are jittered with respect
   * to each other.  Now we search through A for samples that do have
   * the same value as B's post.  The search looks from first to last
   * occurrence within (dynoverlap) samples of (post).
   */
  ptr=sort_getmatch(A,post-ib(A),dynoverlap,cv(B)[post-cb(B)]);
  while (ptr){
    /* We've found a matching sample, so try to grow the matching run in
     * both directions.  If we find a long enough run (longer than
     * MIN_WORDS_SEARCH), we've found a match.
     */
    if (do_const_sync(B,A,Aflags,
 		     post-cb(B),ipos(A,ptr),
 		     begin,end,offset)){
      /* ???: To be studied. */
      offset_add_value(p,&(p->stage1),*offset,callback);
      return(1);
    }
    /* The matching sample was just a fluke -- there weren't enough adjacent
     * samples that matched to consider it a matching run.  So now we check
     * for the next occurrence of that value in A.
     */
    ptr=sort_nextmatch(A,ptr);
  }
  /* We didn't find any matches. */
  *begin=-1;
  *end=-1;
  *offset=-1;
  return(0);
 }
 /* ===========================================================================
 * STAGE 1 MATCHING
 *
 * ???: Insert high-level explanation here.
 * ===========================================================================
 */
 /* Top level of the first stage matcher */
 /* We match each analysis point of new to the preexisting blocks
 recursively.  We can also optionally maintain a list of fragments of
 the preexisting block that didn't match anything, and match them back
 afterward. */
 #define OVERLAP_ADJ (MIN_WORDS_OVERLAP/2-1)
 /* ===========================================================================
 * stage1_matched() (internal)
 *
 * This function is called whenever stage 1 verification finds two identical
 * runs of samples from different reads.  The runs must be more than
 * MIN_WORDS_SEARCH samples long.  They may be jittered (i.e. their absolute
 * positions on the CD may not match due to inaccurate seeking) with respect
 * to each other, but they have been verified to have no dropped samples
 * within them.
 *
 * This function provides feedback via the callback mechanism and marks the
 * runs as verified.  The details of the marking are somewhat subtle and
 * are described near the relevant code.
 *
 * Subsequent portions of the stage 1 code will build a verified fragment
 * from this run.  The verified fragment will eventually be merged
 * into the verified root (and its absolute position determined) in
 * stage 2.
 */
 static inline void 
 stage1_matched(c_block_t *old, c_block_t *new,
 	       long matchbegin,long matchend,
@@ -321,6 +482,15 @@ stage1_matched(c_block_t *old, c_block_t *new,
  long newadjbegin=matchbegin-matchoffset-cb(new);
  long newadjend=matchend-matchoffset-cb(new);
  /* Provide feedback via the callback about the samples we've just
   * verified.
   *
   * ???: How can matchbegin ever be < cb(old)?
   *
   * ???: Why do edge samples get logged only when there's jitter
   * between the matched runs (matchoffset != 0)?
   */
  if ( matchbegin-matchoffset<=cb(new)
       || matchbegin<=cb(old)
       || (new->flags[newadjbegin]&FLAGS_EDGE) 
@@ -341,6 +511,54 @@ stage1_matched(c_block_t *old, c_block_t *new,
    if (callback) 
      (*callback)(matchend, PARANOIA_CB_FIXUP_ATOM);
  /* Mark verified samples as "verified," but trim the verified region
   * by OVERLAP_ADJ samples on each side.  There are several significant
   * implications of this trimming:
   *
   * 1) Why we trim at all:  We have to trim to distinguish between two
   * adjacent verified runs and one long verified run.  We encounter this
   * situation when samples have been dropped:
   *
   *   matched portion of read 1 ....)(.... matched portion of read 1
   *       read 2 adjacent run  .....)(..... read 2 adjacent run
   *                                 ||
   *                      dropped samples in read 2
   *
   * So at this point, the fact that we have two adjacent runs means
   * that we have not yet verified that the two runs really are adjacent.
   * (In fact, just the opposite:  there are two runs because they were
   * matched by separate runs, indicating that some samples didn't match
   * across the length of read 2.)
   *
   * If we verify that they are actually adjacent (e.g. if the two runs
   * are simply a result of matching runs from different reads, not from
   * dropped samples), we will indeed mark them as one long merged run.
   *
   * 2) Why we trim by this amount: We want to ensure that when we
   * verify the relationship between these two runs, we do so with
   * an overlapping fragment at least OVERLAP samples long.  Following
   * from the above example:
   *
   *                (..... matched portion of read 3 .....)
   *       read 2 adjacent run  .....)(..... read 2 adjacent run
   *
   * Assuming there were no dropped samples between the adjacent runs,
   * the matching portion of read 3 will need to be at least OVERLAP
   * samples long to mark the two runs as one long verified run.
   * If there were dropped samples, read 3 wouldn't match across the
   * two runs, proving our caution worthwhile.
   *
   * 3) Why we partially discard the work we've done:  We don't.
   * When subsequently creating verified fragments from this run,
   * we compensate for this trimming.  Thus the verified fragment will
   * contain the full length of verified samples.  Only the c_blocks
   * will reflect this trimming.
   *
   * ???: The comment below indicates that the sort cache is updated in
   * some way, but this does not appear to be the case.
   */
  /* Mark the verification flags.  Don't mark the first or
     last OVERLAP/2 elements so that overlapping fragments
     have to overlap by OVERLAP to actually merge. We also
@@ -359,13 +577,44 @@ stage1_matched(c_block_t *old, c_block_t *new,
 }
 /* ===========================================================================
 * i_iterate_stage1 (internal)
 *
 * This function is called by i_stage1() to compare newly read samples with
 * previously read samples, searching for contiguous runs of identical
 * samples.  Matching runs indicate that at least two reads of the CD
 * returned identical data, with no dropped samples in that run.
 * The runs may be jittered (i.e. their absolute positions on the CD may
 * not be accurate due to inaccurate seeking) at this point.  Their
 * positions will be determined in stage 2.
 *
 * This function compares the new c_block (which has been indexed in
 * p->sortcache) to a previous c_block.  It is called for each previous
 * c_block.  It searches for runs of identical samples longer than
 * MIN_WORDS_SEARCH.  Samples in matched runs are marked as verified.
 *
 * Subsequent stage 1 code builds verified fragments from the runs of
 * verified samples.  These fragments are merged into the verified root
 * in stage 2.
 *
 * This function returns the number of distinct runs verified in the new
 * c_block when compared against this old c_block.
 */
 static long int 
 i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
 		 void(*callback)(long int, paranoia_cb_mode_t)) 
 {
  long matchbegin = -1;
  long matchend   = -1;
  long matchoffset;
-  long matchbegin=-1,matchend=-1,matchoffset;
+  /* ???: Why do we limit our search only to the samples with overlapping
-
+   * absolute positions?  It could be because it eliminates some further
   * bounds checking.
   *
   * Why do we "no longer try to spread the ... search" as mentioned below?
   */
  /* we no longer try to spread the stage one search area by dynoverlap */
  long searchend   = min(ce(old), ce(new));
  long searchbegin = max(cb(old), cb(new));
@@ -374,16 +623,36 @@ i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
  long ret = 0;
  long int j;
-  long tried=0,matched=0;
+  long tried = 0;
  long matched = 0;
-  if (searchsize<=0)return(0);
+  if (searchsize<=0)
    return(0);
  /* match return values are in terms of the new vector, not old */
  /* ???: Why 23?  */
  for (j=searchbegin; j<searchend; j+=23) {
    /* Skip past any samples verified in previous comparisons to
     * other old c_blocks.  Also, obviously, don't bother verifying
     * unread/unmatchable samples.
     */
    if ((new->flags[j-cb(new)] & (FLAGS_VERIFIED|FLAGS_UNREAD)) == 0) {
      tried++;
-      if (try_sort_sync(p,i,new->flags,old,j,&matchbegin,&matchend,&matchoffset,
+
      /* Starting from the sample in the old c_block with the absolute
       * position j, look for a matching run in the new c_block.  This
       * search will look a certain distance around j, and if successful
       * will extend the matching run as far backward and forward as
       * it can.
       *
       * The search will only return 1 if it finds a matching run long
       * enough to be deemed significant.
       */
      if (try_sort_sync(p, i, new->flags, old, j,
 			&matchbegin, &matchend, &matchoffset,
 			callback) == 1) {
 	matched+=matchend-matchbegin;
@@ -394,6 +663,11 @@ i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
 	  long j = matchbegin-cb(old);
 	  long end = matchend-cb(old);
 	  for (; j<end; j++) if (cv(old)[j]!=0) break;
 	  /* Mark the matched samples in both c_blocks as verified.
 	   * In reality, not all the samples are marked.  See
 	   * stage1_matched() for details.
 	   */
 	  if (j<end) {
 	    stage1_matched(old,new,matchbegin,matchend,matchoffset,callback);
 	  } else {
@@ -401,10 +675,14 @@ i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
 	  }
 	}
 	ret++;
-	if (matchend-1>j)j=matchend-1;
+
-      }
+	/* Skip past this verified run to look for more matches. */
 	if (matchend-1 > j)
 	  j = matchend-1;
      }
    }
  } /* end for */
 #ifdef NOISY 
  fprintf(stderr,"iterate_stage1: search area=%ld[%ld-%ld] tried=%ld matched=%ld spans=%ld\n",
 	  searchsize,searchbegin,searchend,tried,matched,ret);
@@ -413,6 +691,36 @@ i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
  return(ret);
 }
 /* ===========================================================================
 * i_stage1() (internal)
 *
 * Compare newly read samples against previously read samples, searching
 * for contiguous runs of identical samples.  Matching runs indicate that
 * at least two reads of the CD returned identical data, with no dropped
 * samples in that run.  The runs may be jittered (i.e. their absolute
 * positions on the CD may not be accurate due to inaccurate seeking) at
 * this point.  Their positions will be determined in stage 2.
 *
 * This function compares a new c_block against all other c_blocks in memory,
 * searching for sufficiently long runs of identical samples.  Since each
 * c_block represents a separate call to read_c_block, this ensures that
 * multiple reads have returned identical data.  (Additionally, read_c_block
 * varies the reads so that multiple reads are unlikely to produce identical
 * errors, so any matches between reads are considered verified.  See
 * i_read_c_block for more details.)
 *
 * Each time we find such a run (longer than MIN_WORDS_SEARCH), we mark
 * the samples as "verified" in both c_blocks.  Runs of verified samples in
 * the new c_block are promoted into verified fragments, which will later
 * be merged into the verified root in stage 2.
 *
 * In reality, not all the verified samples are marked as "verified."
 * See stage1_matched() for an explanation.
 *
 * This function returns the number of verified fragments created by the
 * stage 1 matching.
 */
 static long int
 i_stage1(cdrom_paranoia_t *p, c_block_t *p_new, 
 	 void (*callback)(long int, paranoia_cb_mode_t))
@@ -423,10 +731,23 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
  long int begin=0;
  long int end;
  /* We're going to be comparing the new c_block against the other
   * c_blocks in memory.  Initialize the "sort cache" index to allow
   * for fast searching through the new c_block.  (The index will
   * actually be built the first time we search.)
   */
  if (ptr) 
    sort_setup( p->sortcache, cv(p_new), &cb(p_new), cs(p_new), cb(p_new), 
 		ce(p_new) );
  /* Iterate from oldest to newest c_block, comparing the new c_block
   * to each, looking for a sufficiently long run of identical samples
   * (longer than MIN_WORDS_SEARCH), which will be marked as "verified"
   * in both c_blocks.
   *
   * Since the new c_block is already in the list (at the head), don't
   * compare it against itself.
   */
  while ( ptr && ptr != p_new ) {
    if (callback)
      (*callback)(cb(p_new), PARANOIA_CB_VERIFY);
@@ -437,6 +758,9 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
  /* parse the verified areas of p_new into v_fragments */
  /* Find each run of contiguous verified samples in the new c_block
   * and create a verified fragment from each run.
   */
  begin=0;
  while (begin<size) {
    for ( ; begin < size; begin++)
@@ -447,6 +771,15 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
    ret++;
    /* We create a new verified fragment from the contiguous run
     * of verified samples.
     *
     * We expand the "verified" range by OVERLAP_ADJ on each side
     * to compensate for trimming done to the verified range by
     * stage1_matched().  The samples were actually verified, and
     * hence belong in the verified fragment.  See stage1_matched()
     * for an explanation of the trimming.
     */
    new_v_fragment(p,p_new,cb(p_new)+max(0,begin-OVERLAP_ADJ),
 		   cb(p_new)+min(size,end+OVERLAP_ADJ),
 		   (end+OVERLAP_ADJ>=size && p_new->lastsector));
@@ -454,9 +787,20 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
    begin=end;
  }
  /* Return the number of distinct verified fragments we found with
   * stage 1 matching.
   */
  return(ret);
 }
 /* ===========================================================================
 * STAGE 2 MATCHING
 *
 * ???: Insert high-level explanation here.
 * ===========================================================================
 */
 typedef struct sync_result {
  long offset;
  long begin;
@@ -1273,6 +1617,10 @@ i_read_c_block(cdrom_paranoia_t *p,long beginword,long endword,
  readat+=driftcomp;
  /* Create a new, empty c_block and add it to the head of the
   * list of c_blocks in memory.  It will be empty until the end of
   * this subroutine.
   */
  if (p->enable&(PARANOIA_MODE_OVERLAP|PARANOIA_MODE_VERIFY)) {
    flags=calloc(totaltoread*CD_FRAMEWORDS, 1);
    new=new_c_block(p);
@@ -1407,9 +1755,9 @@ i_read_c_block(cdrom_paranoia_t *p,long beginword,long endword,
  } /* end while */
-  /* If we managed to read any sectors at all (anyflag), create a new
+  /* If we managed to read any sectors at all (anyflag), fill in the
-   * c_block containing the read data.  Otherwise, free our buffers and
+   * previously allocated c_block with the read data.  Otherwise, free
-   * return NULL.
+   * our buffers, dispose of the c_block, and return NULL.
   */
  if (anyflag) {
    new->vector=buffer;