Extensively commented cdparanoia's stage 1 matching. No code changes apart
from added white space for improved readability.  Comments containing "???"
suggest areas for further study and documentation.
This commit is contained in:
pjcreath
2005-10-17 15:31:08 +00:00
parent 10166d3d88
commit 6bcd6c5609

View File

@@ -1,5 +1,5 @@
/*
$Id: paranoia.c,v 1.18 2005/10/14 02:07:06 rocky Exp $
$Id: paranoia.c,v 1.19 2005/10/17 15:31:08 pjcreath Exp $
Copyright (C) 2004, 2005 Rocky Bernstein <rocky@panix.com>
Copyright (C) 1998 Monty xiphmont@mit.edu
@@ -99,7 +99,7 @@ const char *paranoia_cb_mode2str[] = {
"fixup duplicated",
"read error"
};
static inline long
re(root_block *root)
{
@@ -149,8 +149,23 @@ enum {
#define FLAGS_UNREAD 0x02
#define FLAGS_VERIFIED 0x04
/**** matching and analysis code *****************************************/
/* ===========================================================================
* i_paranoia_overlap() (internal)
*
* This function is called when buffA[offsetA] == buffB[offsetB]. This
* function searches backward and forward to see how many consecutive
* samples also match.
*
* This function is called by do_const_sync() when we're not doing any
* verification. Its more complicated sibling is i_paranoia_overlap2.
*
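* This function returns the number of consecutive matching samples.
* If (ret_begin) or (ret_end) are not NULL, it fills them with the
* offset in A of the first matching sample and the offset one past the
* last matching sample, respectively (so the return value equals
* *ret_end - *ret_begin).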
*/
static inline long
i_paranoia_overlap(int16_t *buffA,int16_t *buffB,
long offsetA, long offsetB,
@@ -160,19 +175,39 @@ i_paranoia_overlap(int16_t *buffA,int16_t *buffB,
long beginA=offsetA,endA=offsetA;
long beginB=offsetB,endB=offsetB;
for(;beginA>=0 && beginB>=0;beginA--,beginB--)
if (buffA[beginA]!=buffB[beginB])break;
/* Scan backward to extend the matching run in that direction. */
for(; beginA>=0 && beginB>=0; beginA--,beginB--)
if (buffA[beginA] != buffB[beginB]) break;
beginA++;
beginB++;
for(;endA<sizeA && endB<sizeB;endA++,endB++)
if (buffA[endA]!=buffB[endB])break;
if (ret_begin)*ret_begin=beginA;
if (ret_end)*ret_end=endA;
return(endA-beginA);
/* Scan forward to extend the matching run in that direction. */
for(; endA<sizeA && endB<sizeB; endA++,endB++)
if (buffA[endA] != buffB[endB]) break;
/* Return the result of our search. */
if (ret_begin) *ret_begin = beginA;
if (ret_end) *ret_end = endA;
return (endA-beginA);
}
/* ===========================================================================
* i_paranoia_overlap2() (internal)
*
* This function is called when buffA[offsetA] == buffB[offsetB]. This
* function searches backward and forward to see how many consecutive
* samples also match.
*
* This function is called by do_const_sync() when we're verifying the
* data coming off the CD. Its less complicated sibling is
* i_paranoia_overlap, which is a good place to look to see the simplest
* outline of how this function works.
*
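* This function returns the number of consecutive matching samples.
* If (ret_begin) or (ret_end) are not NULL, it fills them with the
* offset in A of the first sample of the matching run and the offset
* one past its last sample, respectively (so the return value equals
* *ret_end - *ret_begin).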
*/
static inline long
i_paranoia_overlap2(int16_t *buffA,int16_t *buffB,
unsigned char *flagsA, unsigned char *flagsB,
@@ -183,45 +218,69 @@ i_paranoia_overlap2(int16_t *buffA,int16_t *buffB,
long beginA=offsetA, endA=offsetA;
long beginB=offsetB, endB=offsetB;
for( ; beginA>=0 && beginB>=0; beginA--,beginB-- ) {
if ( buffA[beginA] != buffB[beginB] ) break;
/* Scan backward to extend the matching run in that direction. */
for (; beginA>=0 && beginB>=0; beginA--,beginB--) {
if (buffA[beginA] != buffB[beginB]) break;
/* don't allow matching across matching sector boundaries */
/* don't allow matching through known missing data */
if ((flagsA[beginA]&flagsB[beginB]&FLAGS_EDGE)){
/* Stop if both samples were at the edges of a low-level read.
* ???: What implications does this have?
* ???: Why do we include the first sample for which this is true?
*/
if ((flagsA[beginA]&flagsB[beginB]&FLAGS_EDGE)) {
beginA--;
beginB--;
break;
}
if ((flagsA[beginA]&FLAGS_UNREAD) || (flagsB[beginB]&FLAGS_UNREAD))break;
/* don't allow matching through known missing data */
if ((flagsA[beginA]&FLAGS_UNREAD) || (flagsB[beginB]&FLAGS_UNREAD))
break;
}
beginA++;
beginB++;
for(;endA<sizeA && endB<sizeB;endA++,endB++){
if (buffA[endA]!=buffB[endB])break;
/* Scan forward to extend the matching run in that direction. */
for (; endA<sizeA && endB<sizeB; endA++,endB++) {
if (buffA[endA] != buffB[endB]) break;
/* don't allow matching across matching sector boundaries */
/* Stop if both samples were at the edges of a low-level read.
* ???: What implications does this have?
* ???: Why do we not stop if endA == beginA?
*/
if ((flagsA[endA]&flagsB[endB]&FLAGS_EDGE) && endA!=beginA){
break;
}
/* don't allow matching through known missing data */
if ((flagsA[endA]&FLAGS_UNREAD) || (flagsB[endB]&FLAGS_UNREAD))break;
if ((flagsA[endA]&FLAGS_UNREAD) || (flagsB[endB]&FLAGS_UNREAD))
break;
}
if (ret_begin)*ret_begin=beginA;
if (ret_end)*ret_end=endA;
return(endA-beginA);
/* Return the result of our search. */
if (ret_begin) *ret_begin = beginA;
if (ret_end) *ret_end = endA;
return (endA-beginA);
}
/* Top level of the first stage matcher */
/* We match each analysis point of new to the preexisting blocks
recursively. We can also optionally maintain a list of fragments of
the preexisting block that didn't match anything, and match them back
afterward. */
#define OVERLAP_ADJ (MIN_WORDS_OVERLAP/2-1)
/* ===========================================================================
* do_const_sync() (internal)
*
* This function is called when samples A[posA] == B[posB]. It tries to
* build a matching run from that point, looking forward and backward to
* see how many consecutive samples match. Since the starting samples
* might only be coincidentally identical, we only consider the run to
* be a true match if it's longer than MIN_WORDS_SEARCH.
*
* This function returns the length of the run if a matching run was found,
* or 0 otherwise. If a matching run was found, (begin) and (end) are set
* to the absolute positions of the beginning and ending samples of the
* run in A, and (offset) is set to the jitter between the c_blocks.
* (I.e., offset indicates the distance between what A considers sample N
* on the CD and what B considers sample N.)
*/
static inline long int
do_const_sync(c_block_t *A,
sort_info_t *B, unsigned char *flagB,
@@ -231,6 +290,10 @@ do_const_sync(c_block_t *A,
unsigned char *flagA=A->flags;
long ret=0;
/* If we're doing any verification whatsoever, we have flags and will
* take them into account. Otherwise, we just do the simple equality
* test for samples on both sides of the initial match.
*/
if (flagB==NULL)
ret=i_paranoia_overlap(cv(A), iv(B), posA, posB,
cs(A), is(B), begin, end);
@@ -240,8 +303,15 @@ do_const_sync(c_block_t *A,
posA, posB, cs(A), is(B),
begin, end);
if (ret>MIN_WORDS_SEARCH){
/* Small matching runs could just be coincidental. We only consider this
* a real match if it's long enough.
*/
if (ret > MIN_WORDS_SEARCH) {
*offset=+(posA+cb(A))-(posB+ib(B));
/* ???: Contrary to the original comment, this appears to be relative to
* A, not B.
*/
*begin+=cb(A);
*end+=cb(A);
return(ret);
@@ -250,6 +320,30 @@ do_const_sync(c_block_t *A,
return(0);
}
/* ===========================================================================
* try_sort_sync() (internal)
*
* Starting from the sample in B with the absolute position (post), look
* for a matching run in A. This search will look in A for a first
* matching sample within (p->dynoverlap) samples around (post). If it
* finds one, it will then determine how many consecutive samples match
* both A and B from that point, looking backwards and forwards. If
* this search produces a matching run longer than MIN_WORDS_SEARCH, we
* consider it a match.
*
* When used by stage 1, the "post" is planted with respect to the old
* c_block being compared to the new c_block. In stage 2, the "post" is
* planted with respect to the verified root.
*
* This function returns 1 if a match is found and 0 if not. When a match
* is found, (begin) and (end) are set to the boundaries of the run, and
* (offset) is set to the difference in position of the run in A and B.
* (begin) and (end) are the absolute positions of the samples in
* A. (offset) counts from B's frame of reference. I.e., an offset of
* -2 would mean that A's absolute 3 is equivalent to B's 5.
*/
/* post is w.r.t. B. in stage one, we post from old. In stage 2 we
post from root. Begin, end, offset count from B's frame of
reference */
@@ -275,11 +369,24 @@ try_sort_sync(cdrom_paranoia_t *p,
{
long zeropos=post-ib(A);
if (zeropos>=0 && zeropos<is(A)) {
/* Before we bother with the search for matching samples,
* we check the simple case. If there's no jitter at all
* (i.e. the absolute positions of A's and B's samples are
* consistent), A's sample at (post) should be identical
* to B's sample at the same position.
*/
if ( cv(B)[post-cb(B)] == iv(A)[zeropos] ) {
/* The first sample matched, now try to grow the matching run
* in both directions. We only consider it a match if more
* than MIN_WORDS_SEARCH consecutive samples match.
*/
if (do_const_sync(B, A, Aflags,
post-cb(B), zeropos,
begin, end, offset) ) {
/* ???: To be studied. */
offset_add_value(p,&(p->stage1),*offset,callback);
return(1);
@@ -289,26 +396,80 @@ try_sort_sync(cdrom_paranoia_t *p,
}
} else
return(0);
/* If the samples with the same absolute position didn't match, it's
* either a bad sample, or the two c_blocks are jittered with respect
* to each other. Now we search through A for samples that do have
* the same value as B's post. The search looks from first to last
* occurrence within (dynoverlap) samples of (post).
*/
ptr=sort_getmatch(A,post-ib(A),dynoverlap,cv(B)[post-cb(B)]);
while (ptr){
/* We've found a matching sample, so try to grow the matching run in
* both directions. If we find a long enough run (longer than
* MIN_WORDS_SEARCH), we've found a match.
*/
if (do_const_sync(B,A,Aflags,
post-cb(B),ipos(A,ptr),
begin,end,offset)){
/* ???: To be studied. */
offset_add_value(p,&(p->stage1),*offset,callback);
return(1);
}
/* The matching sample was just a fluke -- there weren't enough adjacent
* samples that matched to consider a matching run. So now we check
* for the next occurrence of that value in A.
*/
ptr=sort_nextmatch(A,ptr);
}
/* We didn't find any matches. */
*begin=-1;
*end=-1;
*offset=-1;
return(0);
}
/* ===========================================================================
* STAGE 1 MATCHING
*
* ???: Insert high-level explanation here.
* ===========================================================================
*/
/* Top level of the first stage matcher */
/* We match each analysis point of new to the preexisting blocks
recursively. We can also optionally maintain a list of fragments of
the preexisting block that didn't match anything, and match them back
afterward. */
#define OVERLAP_ADJ (MIN_WORDS_OVERLAP/2-1)
/* ===========================================================================
* stage1_matched() (internal)
*
* This function is called whenever stage 1 verification finds two identical
* runs of samples from different reads. The runs must be more than
* MIN_WORDS_SEARCH samples long. They may be jittered (i.e. their absolute
* positions on the CD may not match due to inaccurate seeking) with respect
* to each other, but they have been verified to have no dropped samples
* within them.
*
* This function provides feedback via the callback mechanism and marks the
* runs as verified. The details of the marking are somewhat subtle and
* are described near the relevant code.
*
* Subsequent portions of the stage 1 code will build a verified fragment
* from this run. The verified fragment will eventually be merged
* into the verified root (and its absolute position determined) in
* stage 2.
*/
static inline void
stage1_matched(c_block_t *old, c_block_t *new,
long matchbegin,long matchend,
@@ -320,7 +481,16 @@ stage1_matched(c_block_t *old, c_block_t *new,
long oldadjend=matchend-cb(old);
long newadjbegin=matchbegin-matchoffset-cb(new);
long newadjend=matchend-matchoffset-cb(new);
/* Provide feedback via the callback about the samples we've just
* verified.
*
* ???: How can matchbegin ever be < cb(old)?
*
* ???: Why do edge samples get logged only when there's jitter
* between the matched runs (matchoffset != 0)?
*/
if ( matchbegin-matchoffset<=cb(new)
|| matchbegin<=cb(old)
|| (new->flags[newadjbegin]&FLAGS_EDGE)
@@ -340,13 +510,61 @@ stage1_matched(c_block_t *old, c_block_t *new,
} else
if (callback)
(*callback)(matchend, PARANOIA_CB_FIXUP_ATOM);
/* Mark verified samples as "verified," but trim the verified region
* by OVERLAP_ADJ samples on each side. There are several significant
* implications of this trimming:
*
* 1) Why we trim at all: We have to trim to distinguish between two
* adjacent verified runs and one long verified run. We encounter this
* situation when samples have been dropped:
*
* matched portion of read 1 ....)(.... matched portion of read 1
* read 2 adjacent run .....)(..... read 2 adjacent run
* ||
* dropped samples in read 2
*
* So at this point, the fact that we have two adjacent runs means
* that we have not yet verified that the two runs really are adjacent.
* (In fact, just the opposite: there are two runs because they were
* matched by separate runs, indicating that some samples didn't match
* across the length of read 2.)
*
* If we verify that they are actually adjacent (e.g. if the two runs
* are simply a result of matching runs from different reads, not from
* dropped samples), we will indeed mark them as one long merged run.
*
* 2) Why we trim by this amount: We want to ensure that when we
* verify the relationship between these two runs, we do so with
* an overlapping fragment at least OVERLAP samples long. Following
* from the above example:
*
* (..... matched portion of read 3 .....)
* read 2 adjacent run .....)(..... read 2 adjacent run
*
* Assuming there were no dropped samples between the adjacent runs,
* the matching portion of read 3 will need to be at least OVERLAP
* samples long to mark the two runs as one long verified run.
* If there were dropped samples, read 3 wouldn't match across the
* two runs, proving our caution worthwhile.
*
* 3) Why we partially discard the work we've done: We don't.
* When subsequently creating verified fragments from this run,
* we compensate for this trimming. Thus the verified fragment will
* contain the full length of verified samples. Only the c_blocks
* will reflect this trimming.
*
* ???: The comment below indicates that the sort cache is updated in
* some way, but this does not appear to be the case.
*/
/* Mark the verification flags. Don't mark the first or
last OVERLAP/2 elements so that overlapping fragments
have to overlap by OVERLAP to actually merge. We also
remove elements from the sort such that later sorts do
not have to sift through already matched data */
newadjbegin+=OVERLAP_ADJ;
newadjend-=OVERLAP_ADJ;
for(i=newadjbegin;i<newadjend;i++)
@@ -359,52 +577,112 @@ stage1_matched(c_block_t *old, c_block_t *new,
}
/* ===========================================================================
* i_iterate_stage1 (internal)
*
* This function is called by i_stage1() to compare newly read samples with
* previously read samples, searching for contiguous runs of identical
* samples. Matching runs indicate that at least two reads of the CD
* returned identical data, with no dropped samples in that run.
* The runs may be jittered (i.e. their absolute positions on the CD may
* not be accurate due to inaccurate seeking) at this point. Their
* positions will be determined in stage 2.
*
* This function compares the new c_block (which has been indexed in
* p->sortcache) to a previous c_block. It is called for each previous
* c_block. It searches for runs of identical samples longer than
* MIN_WORDS_SEARCH. Samples in matched runs are marked as verified.
*
* Subsequent stage 1 code builds verified fragments from the runs of
* verified samples. These fragments are merged into the verified root
* in stage 2.
*
* This function returns the number of distinct runs verified in the new
* c_block when compared against this old c_block.
*/
static long int
i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
void(*callback)(long int, paranoia_cb_mode_t))
{
long matchbegin = -1;
long matchend = -1;
long matchoffset;
long matchbegin=-1,matchend=-1,matchoffset;
/* ???: Why do we limit our search only to the samples with overlapping
* absolute positions? It could be because it eliminates some further
* bounds checking.
*
* Why do we "no longer try to spread the ... search" as mentioned below?
*/
/* we no longer try to spread the stage one search area by dynoverlap */
long searchend=min(ce(old),ce(new));
long searchbegin=max(cb(old),cb(new));
long searchsize=searchend-searchbegin;
sort_info_t *i=p->sortcache;
long ret=0;
long searchend = min(ce(old), ce(new));
long searchbegin = max(cb(old), cb(new));
long searchsize = searchend-searchbegin;
sort_info_t *i = p->sortcache;
long ret = 0;
long int j;
long tried=0,matched=0;
long tried = 0;
long matched = 0;
if (searchsize<=0)
return(0);
if (searchsize<=0)return(0);
/* match return values are in terms of the new vector, not old */
for(j=searchbegin;j<searchend;j+=23){
if ((new->flags[j-cb(new)]&(FLAGS_VERIFIED|FLAGS_UNREAD))==0){
/* ???: Why 23? */
for (j=searchbegin; j<searchend; j+=23) {
/* Skip past any samples verified in previous comparisons to
* other old c_blocks. Also, obviously, don't bother verifying
* unread/unmatchable samples.
*/
if ((new->flags[j-cb(new)] & (FLAGS_VERIFIED|FLAGS_UNREAD)) == 0) {
tried++;
if (try_sort_sync(p,i,new->flags,old,j,&matchbegin,&matchend,&matchoffset,
callback)==1){
/* Starting from the sample in the old c_block with the absolute
* position j, look for a matching run in the new c_block. This
* search will look a certain distance around j, and if successful
* will extend the matching run as far backward and forward as
* it can.
*
* The search will only return 1 if it finds a matching run long
* enough to be deemed significant.
*/
if (try_sort_sync(p, i, new->flags, old, j,
&matchbegin, &matchend, &matchoffset,
callback) == 1) {
matched+=matchend-matchbegin;
/* purely cosmetic: if we're matching zeros, don't use the
callback because they will appear to be all skewed */
{
long j=matchbegin-cb(old);
long end=matchend-cb(old);
for(;j<end;j++)if (cv(old)[j]!=0)break;
if (j<end){
long j = matchbegin-cb(old);
long end = matchend-cb(old);
for (; j<end; j++) if (cv(old)[j]!=0) break;
/* Mark the matched samples in both c_blocks as verified.
* In reality, not all the samples are marked. See
* stage1_matched() for details.
*/
if (j<end) {
stage1_matched(old,new,matchbegin,matchend,matchoffset,callback);
} else {
stage1_matched(old,new,matchbegin,matchend,matchoffset,NULL);
}
}
ret++;
if (matchend-1>j)j=matchend-1;
/* Skip past this verified run to look for more matches. */
if (matchend-1 > j)
j = matchend-1;
}
}
}
} /* end for */
#ifdef NOISY
fprintf(stderr,"iterate_stage1: search area=%ld[%ld-%ld] tried=%ld matched=%ld spans=%ld\n",
searchsize,searchbegin,searchend,tried,matched,ret);
@@ -413,6 +691,36 @@ i_iterate_stage1(cdrom_paranoia_t *p, c_block_t *old, c_block_t *new,
return(ret);
}
/* ===========================================================================
* i_stage1() (internal)
*
* Compare newly read samples against previously read samples, searching
* for contiguous runs of identical samples. Matching runs indicate that
* at least two reads of the CD returned identical data, with no dropped
* samples in that run. The runs may be jittered (i.e. their absolute
* positions on the CD may not be accurate due to inaccurate seeking) at
* this point. Their positions will be determined in stage 2.
*
* This function compares a new c_block against all other c_blocks in memory,
* searching for sufficiently long runs of identical samples. Since each
* c_block represents a separate call to read_c_block, this ensures that
* multiple reads have returned identical data. (Additionally, read_c_block
* varies the reads so that multiple reads are unlikely to produce identical
* errors, so any matches between reads are considered verified. See
* i_read_c_block for more details.)
*
* Each time we find such a run (longer than MIN_WORDS_SEARCH), we mark
* the samples as "verified" in both c_blocks. Runs of verified samples in
* the new c_block are promoted into verified fragments, which will later
* be merged into the verified root in stage 2.
*
* In reality, not all the verified samples are marked as "verified."
* See stage1_matched() for an explanation.
*
* This function returns the number of verified fragments created by the
* stage 1 matching.
*/
static long int
i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
void (*callback)(long int, paranoia_cb_mode_t))
@@ -423,10 +731,23 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
long int begin=0;
long int end;
/* We're going to be comparing the new c_block against the other
* c_blocks in memory. Initialize the "sort cache" index to allow
* for fast searching through the new c_block. (The index will
* actually be built the first time we search.)
*/
if (ptr)
sort_setup( p->sortcache, cv(p_new), &cb(p_new), cs(p_new), cb(p_new),
ce(p_new) );
/* Iterate from oldest to newest c_block, comparing the new c_block
* to each, looking for a sufficiently long run of identical samples
* (longer than MIN_WORDS_SEARCH), which will be marked as "verified"
* in both c_blocks.
*
* Since the new c_block is already in the list (at the head), don't
* compare it against itself.
*/
while ( ptr && ptr != p_new ) {
if (callback)
(*callback)(cb(p_new), PARANOIA_CB_VERIFY);
@@ -436,7 +757,10 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
}
/* parse the verified areas of p_new into v_fragments */
/* Find each run of contiguous verified samples in the new c_block
* and create a verified fragment from each run.
*/
begin=0;
while (begin<size) {
for ( ; begin < size; begin++)
@@ -447,16 +771,36 @@ i_stage1(cdrom_paranoia_t *p, c_block_t *p_new,
ret++;
/* We create a new verified fragment from the contiguous run
* of verified samples.
*
* We expand the "verified" range by OVERLAP_ADJ on each side
* to compensate for trimming done to the verified range by
* stage1_matched(). The samples were actually verified, and
* hence belong in the verified fragment. See stage1_matched()
* for an explanation of the trimming.
*/
new_v_fragment(p,p_new,cb(p_new)+max(0,begin-OVERLAP_ADJ),
cb(p_new)+min(size,end+OVERLAP_ADJ),
(end+OVERLAP_ADJ>=size && p_new->lastsector));
begin=end;
}
/* Return the number of distinct verified fragments we found with
* stage 1 matching.
*/
return(ret);
}
/* ===========================================================================
* STAGE 2 MATCHING
*
* ???: Insert high-level explanation here.
* ===========================================================================
*/
typedef struct sync_result {
long offset;
long begin;
@@ -1272,7 +1616,11 @@ i_read_c_block(cdrom_paranoia_t *p,long beginword,long endword,
}
readat+=driftcomp;
/* Create a new, empty c_block and add it to the head of the
* list of c_blocks in memory. It will be empty until the end of
* this subroutine.
*/
if (p->enable&(PARANOIA_MODE_OVERLAP|PARANOIA_MODE_VERIFY)) {
flags=calloc(totaltoread*CD_FRAMEWORDS, 1);
new=new_c_block(p);
@@ -1407,9 +1755,9 @@ i_read_c_block(cdrom_paranoia_t *p,long beginword,long endword,
} /* end while */
/* If we managed to read any sectors at all (anyflag), create a new
* c_block containing the read data. Otherwise, free our buffers and
* return NULL.
/* If we managed to read any sectors at all (anyflag), fill in the
* previously allocated c_block with the read data. Otherwise, free
* our buffers, dispose of the c_block, and return NULL.
*/
if (anyflag) {
new->vector=buffer;