2 * util.c : routines for doing diffs
\r
4 * ====================================================================
\r
5 * Copyright (c) 2000-2004 CollabNet. All rights reserved.
\r
7 * This software is licensed as described in the file COPYING, which
\r
8 * you should have received as part of this distribution. The terms
\r
9 * are also available at http://subversion.tigris.org/license-1.html.
\r
10 * If newer versions of this license are posted there, you may use a
\r
11 * newer version instead, at your option.
\r
13 * This software consists of voluntary contributions made by many
\r
14 * individuals. For exact contribution history, see the revision
\r
15 * history and logs, available at http://subversion.tigris.org/.
\r
16 * ====================================================================
\r
21 #include <apr_general.h>
\r
23 #include "svn_error.h"
\r
24 #include "svn_version.h"
\r
26 #include "svn_types.h"
\r
27 #include "svn_ctype.h"
\r
30 #include "svn_diff.h"
\r
32 * An Adler-32 implementation per RFC1950.
\r
34 * "The Adler-32 algorithm is much faster than the CRC32 algorithm yet
\r
35 * still provides an extremely low probability of undetected errors"
\r
39 * 65521 is the largest prime less than 65536.
\r
40 * "That 65521 is prime is important to avoid a possible large class of
\r
41 * two-byte errors that leave the check unchanged."
\r
43 #define ADLER_MOD_BASE 65521
\r
46 * "The modulo on unsigned long accumulators can be delayed for 5552 bytes,
\r
47 * so the modulo operation time is negligible."
\r
49 #define ADLER_MOD_BLOCK_SIZE 5552
\r
53 * Start with CHECKSUM and update the checksum by processing a chunk
\r
54 * of DATA sized LEN.
\r
57 svn_diff__adler32(apr_uint32_t checksum, const char *data, apr_size_t len)
\r
59 const unsigned char *input = (const unsigned char *)data;
\r
60 apr_uint32_t s1 = checksum & 0xFFFF;
\r
61 apr_uint32_t s2 = checksum >> 16;
\r
63 apr_size_t blocks = len / ADLER_MOD_BLOCK_SIZE;
\r
65 len %= ADLER_MOD_BLOCK_SIZE;
\r
69 int count = ADLER_MOD_BLOCK_SIZE;
\r
77 s1 %= ADLER_MOD_BASE;
\r
78 s2 %= ADLER_MOD_BASE;
\r
88 return ((s2 % ADLER_MOD_BASE) << 16) | (s1 % ADLER_MOD_BASE);
\r
93 svn_diff_contains_conflicts(svn_diff_t *diff)
\r
95 while (diff != NULL)
\r
97 if (diff->type == svn_diff__type_conflict)
\r
109 svn_diff_contains_diffs(svn_diff_t *diff)
\r
111 while (diff != NULL)
\r
113 if (diff->type != svn_diff__type_common)
\r
125 svn_diff_output(svn_diff_t *diff,
\r
126 void *output_baton,
\r
127 const svn_diff_output_fns_t *vtable)
\r
129 svn_error_t *(*output_fn)(void *,
\r
130 apr_off_t, apr_off_t,
\r
131 apr_off_t, apr_off_t,
\r
132 apr_off_t, apr_off_t);
\r
134 while (diff != NULL)
\r
136 switch (diff->type)
\r
138 case svn_diff__type_common:
\r
139 output_fn = vtable->output_common;
\r
142 case svn_diff__type_diff_common:
\r
143 output_fn = vtable->output_diff_common;
\r
146 case svn_diff__type_diff_modified:
\r
147 output_fn = vtable->output_diff_modified;
\r
150 case svn_diff__type_diff_latest:
\r
151 output_fn = vtable->output_diff_latest;
\r
154 case svn_diff__type_conflict:
\r
156 if (vtable->output_conflict != NULL)
\r
158 SVN_ERR(vtable->output_conflict(output_baton,
\r
159 diff->original_start, diff->original_length,
\r
160 diff->modified_start, diff->modified_length,
\r
161 diff->latest_start, diff->latest_length,
\r
162 diff->resolved_diff));
\r
171 if (output_fn != NULL)
\r
173 SVN_ERR(output_fn(output_baton,
\r
174 diff->original_start, diff->original_length,
\r
175 diff->modified_start, diff->modified_length,
\r
176 diff->latest_start, diff->latest_length));
\r
182 return SVN_NO_ERROR;
\r
187 svn_diff__normalize_buffer(char **tgt,
\r
188 apr_off_t *lengthp,
\r
189 svn_diff__normalize_state_t *statep,
\r
191 const svn_diff_file_options_t *opts)
\r
193 /* Variables for looping through BUF */
\r
194 const char *curp, *endp;
\r
196 /* Variable to record normalizing state */
\r
197 svn_diff__normalize_state_t state = *statep;
\r
199 /* Variables to track what needs copying into the target buffer */
\r
200 const char *start = buf;
\r
201 apr_size_t include_len = 0;
\r
202 svn_boolean_t last_skipped = FALSE; /* makes sure we set 'start' */
\r
204 /* Variable to record the state of the target buffer */
\r
205 char *tgt_newend = *tgt;
\r
207 /* If this is a noop, then just get out of here. */
\r
208 if (! opts->ignore_space && ! opts->ignore_eol_style)
\r
210 *tgt = (char *)buf;
\r
215 /* It only took me forever to get this routine right,
\r
216 so here my thoughts go:
\r
218 Below, we loop through the data, doing 2 things:
\r
221 - Copying other data
\r
223 The routine tries its hardest *not* to copy data, but instead
\r
224 returning a pointer into already normalized existing data.
\r
226 To this end, a block 'other data' shouldn't be copied when found,
\r
227 but only as soon as it can't be returned in-place.
\r
229 On a character level, there are 3 possible operations:
\r
231 - Skip the character (don't include in the normalized data)
\r
232 - Include the character (do include in the normalizad data)
\r
233 - Include as another character
\r
234 This is essentially the same as skipping the current character
\r
235 and inserting a given character in the output data.
\r
237 The macros below (SKIP, INCLUDE and INCLUDE_AS) are defined to
\r
238 handle the character based operations. The macros themselves
\r
239 collect character level data into blocks.
\r
241 At all times designate the START, INCLUDED_LEN and CURP pointers
\r
242 an included and and skipped block like this:
\r
244 [ start, start + included_len ) [ start + included_len, curp )
\r
247 When the routine flips from skipping to including, the last
\r
248 included block has to be flushed to the output buffer.
\r
251 /* Going from including to skipping; only schedules the current
\r
252 included section for flushing.
\r
253 Also, simply chop off the character if it's the first in the buffer,
\r
254 so we can possibly just return the remainder of the buffer */
\r
257 if (start == curp) \
\r
259 last_skipped = TRUE; \
\r
264 if (last_skipped) \
\r
265 COPY_INCLUDED_SECTION; \
\r
267 last_skipped = FALSE; \
\r
270 #define COPY_INCLUDED_SECTION \
\r
272 if (include_len > 0) \
\r
274 memmove(tgt_newend, start, include_len); \
\r
275 tgt_newend += include_len; \
\r
281 /* Include the current character as character X.
\r
282 If the current character already *is* X, add it to the
\r
283 currently included region, increasing chances for consecutive
\r
284 fully normalized blocks. */
\r
285 #define INCLUDE_AS(x) \
\r
287 if (*curp == (x)) \
\r
296 /* Insert character X in the output buffer */
\r
297 #define INSERT(x) \
\r
299 COPY_INCLUDED_SECTION; \
\r
300 *tgt_newend++ = (x); \
\r
303 for (curp = buf, endp = buf + *lengthp; curp != endp; ++curp)
\r
308 if (opts->ignore_eol_style)
\r
312 state = svn_diff__normalize_state_cr;
\r
316 if (state == svn_diff__normalize_state_cr
\r
317 && opts->ignore_eol_style)
\r
321 state = svn_diff__normalize_state_normal;
\r
325 if (svn_ctype_isspace(*curp)
\r
326 && opts->ignore_space != svn_diff_file_ignore_space_none)
\r
328 /* Whitespace but not '\r' or '\n' */
\r
329 if (state != svn_diff__normalize_state_whitespace
\r
330 && opts->ignore_space
\r
331 == svn_diff_file_ignore_space_change)
\r
332 /*### If we can postpone insertion of the space
\r
333 until the next non-whitespace character,
\r
334 we have a potential of reducing the number of copies:
\r
335 If this space is followed by more spaces,
\r
336 this will cause a block-copy.
\r
337 If the next non-space block is considered normalized
\r
338 *and* preceded by a space, we can take advantage of that. */
\r
339 /* Note, the above optimization applies to 90% of the source
\r
340 lines in our own code, since it (generally) doesn't use
\r
341 more than one space per blank section, except for the
\r
342 beginning of a line. */
\r
346 state = svn_diff__normalize_state_whitespace;
\r
350 /* Non-whitespace character, or whitespace character in
\r
351 svn_diff_file_ignore_space_none mode. */
\r
353 state = svn_diff__normalize_state_normal;
\r
358 /* If we're not in whitespace, flush the last chunk of data.
\r
359 * Note that this will work correctly when this is the last chunk of the
\r
361 * * If there is an eol, it will either have been output when we entered
\r
362 * the state_cr, or it will be output now.
\r
363 * * If there is no eol and we're not in whitespace, then we just output
\r
364 * everything below.
\r
365 * * If there's no eol and we are in whitespace, we want to ignore
\r
366 * whitespace unconditionally. */
\r
368 if (*tgt == tgt_newend)
\r
370 /* we haven't copied any data in to *tgt and our chunk consists
\r
371 only of one block of (already normalized) data.
\r
372 Just return the block. */
\r
373 *tgt = (char *)start;
\r
374 *lengthp = include_len;
\r
378 COPY_INCLUDED_SECTION;
\r
379 *lengthp = tgt_newend - *tgt;
\r
388 #undef COPY_INCLUDED_SECTION
\r
392 /* Return the library version number. */
\r
393 const svn_version_t *
\r
394 svn_diff_version(void)
\r