Update bundled PCRE2-library to version 10.23
Some manual changes done to the library were lost with this update. They will be added in the next commit.
This commit is contained in:
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
New API code Copyright (c) 2016 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -50,6 +50,10 @@ collecting data (e.g. minimum matching length). */
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/* The maximum remembered capturing brackets minimum. */
|
||||
|
||||
#define MAX_CACHE_BACKREF 128
|
||||
|
||||
/* Set a bit in the starting code unit bit map. */
|
||||
|
||||
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
|
||||
@ -59,15 +63,23 @@ collecting data (e.g. minimum matching length). */
|
||||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find the minimum subject length for a group *
|
||||
*************************************************/
|
||||
|
||||
/* Scan a parenthesized group and compute the minimum length of subject that
|
||||
is needed to match it. This is a lower bound; it does not mean there is a
|
||||
string of that length that matches. In UTF8 mode, the result is in characters
|
||||
rather than bytes.
|
||||
string of that length that matches. In UTF mode, the result is in characters
|
||||
rather than code units. The field in a compiled pattern for storing the minimum
|
||||
length is 16-bits long (on the grounds that anything longer than that is
|
||||
pathological), so we give up when we reach that amount. This also means that
|
||||
integer overflow for really crazy patterns cannot happen.
|
||||
|
||||
Backreference minimum lengths are cached to speed up multiple references. This
|
||||
function is called only when the highest back reference in the pattern is less
|
||||
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
|
||||
caching vector. The zeroth element contains the number of the highest set
|
||||
value.
|
||||
|
||||
Arguments:
|
||||
re compiled pattern block
|
||||
@ -75,35 +87,58 @@ Arguments:
|
||||
startcode pointer to start of the whole pattern's code
|
||||
utf UTF flag
|
||||
recurses chain of recurse_check to catch mutual recursion
|
||||
countptr pointer to call count (to catch over complexity)
|
||||
backref_cache vector for caching back references.
|
||||
|
||||
Returns: the minimum length
|
||||
-1 \C in UTF-8 mode
|
||||
or (*ACCEPT)
|
||||
or pattern too complicated
|
||||
or back reference to duplicate name/number
|
||||
-2 internal error (missing capturing bracket)
|
||||
-3 internal error (opcode not listed)
|
||||
*/
|
||||
|
||||
static int
|
||||
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
|
||||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
|
||||
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
|
||||
int *backref_cache)
|
||||
{
|
||||
int length = -1;
|
||||
int prev_cap_recno = -1;
|
||||
int prev_cap_d = 0;
|
||||
int prev_recurse_recno = -1;
|
||||
int prev_recurse_d = 0;
|
||||
uint32_t once_fudge = 0;
|
||||
BOOL had_recurse = FALSE;
|
||||
BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
|
||||
recurse_check this_recurse;
|
||||
register int branchlength = 0;
|
||||
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||
int branchlength = 0;
|
||||
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA ||
|
||||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
|
||||
/* If this is a "could be empty" group, its minimum length is 0. */
|
||||
|
||||
/* Scan along the opcodes for this branch. If we get to the end of the
|
||||
branch, check the length against that of the other branches. */
|
||||
if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
|
||||
|
||||
/* Skip over capturing bracket number */
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
|
||||
|
||||
/* A large and/or complex regex can take too long to process. */
|
||||
|
||||
if ((*countptr)++ > 1000) return -1;
|
||||
|
||||
/* Scan along the opcodes for this branch. If we get to the end of the branch,
|
||||
check the length against that of the other branches. If the accumulated length
|
||||
passes 16-bits, stop. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
int d, min;
|
||||
int d, min, recno;
|
||||
PCRE2_UCHAR *cs, *ce;
|
||||
register PCRE2_UCHAR op = *cc;
|
||||
PCRE2_UCHAR op = *cc;
|
||||
|
||||
if (branchlength >= UINT16_MAX) return UINT16_MAX;
|
||||
|
||||
switch (op)
|
||||
{
|
||||
@ -112,7 +147,8 @@ for (;;)
|
||||
|
||||
/* If there is only one branch in a condition, the implied branch has zero
|
||||
length, so we don't add anything. This covers the DEFINE "condition"
|
||||
automatically. */
|
||||
automatically. If there are two branches we can treat it the same as any
|
||||
other non-capturing subpattern. */
|
||||
|
||||
cs = cc + GET(cc, 1);
|
||||
if (*cs != OP_ALT)
|
||||
@ -120,23 +156,54 @@ for (;;)
|
||||
cc = cs + 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
goto PROCESS_NON_CAPTURE;
|
||||
|
||||
/* Otherwise we can fall through and treat it the same as any other
|
||||
subpattern. */
|
||||
/* There's a special case of OP_ONCE, when it is wrapped round an
|
||||
OP_RECURSE. We'd like to process the latter at this level so that
|
||||
remembering the value works for repeated cases. So we do nothing, but
|
||||
set a fudge value to skip over the OP_KET after the recurse. */
|
||||
|
||||
case OP_ONCE:
|
||||
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
|
||||
{
|
||||
once_fudge = 1 + LINK_SIZE;
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
/* Fall through */
|
||||
|
||||
case OP_ONCE_NC:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
PROCESS_NON_CAPTURE:
|
||||
d = find_minlength(re, cc, startcode, utf, recurses, countptr,
|
||||
backref_cache);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* To save time for repeated capturing subpatterns, we remember the
|
||||
length of the previous one. Unfortunately we can't do the same for
|
||||
the unnumbered ones above. Nor can we do this if (?| is present in the
|
||||
pattern because captures with the same number are not then identical. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRAPOS:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
d = find_minlength(re, cc, startcode, utf, recurses);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
recno = (int)GET2(cc, 1+LINK_SIZE);
|
||||
if (dupcapused || recno != prev_cap_recno)
|
||||
{
|
||||
prev_cap_recno = recno;
|
||||
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
|
||||
backref_cache);
|
||||
if (prev_cap_d < 0) return prev_cap_d;
|
||||
}
|
||||
branchlength += prev_cap_d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
@ -388,8 +455,12 @@ for (;;)
|
||||
matches an empty string (by default it causes a matching failure), so in
|
||||
that case we must set the minimum length to zero. */
|
||||
|
||||
case OP_DNREF: /* Duplicate named pattern back reference */
|
||||
/* Duplicate named pattern back reference. We cannot reliably find a length
|
||||
for this if duplicate numbers are present in the pattern. */
|
||||
|
||||
case OP_DNREF:
|
||||
case OP_DNREFI:
|
||||
if (dupcapused) return -1;
|
||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||
{
|
||||
int count = GET2(cc, 1+IMM2_SIZE);
|
||||
@ -399,18 +470,80 @@ for (;;)
|
||||
|
||||
d = INT_MAX;
|
||||
|
||||
/* Scan all groups with the same name */
|
||||
/* Scan all groups with the same name; find the shortest. */
|
||||
|
||||
while (count-- > 0)
|
||||
{
|
||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
|
||||
int dd, i;
|
||||
recno = GET2(slot, 0);
|
||||
|
||||
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
|
||||
dd = backref_cache[recno];
|
||||
else
|
||||
{
|
||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
{
|
||||
dd = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev)
|
||||
if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
{
|
||||
dd = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||
countptr, backref_cache);
|
||||
if (dd < 0) return dd;
|
||||
}
|
||||
}
|
||||
|
||||
backref_cache[recno] = dd;
|
||||
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
|
||||
backref_cache[0] = recno;
|
||||
}
|
||||
|
||||
if (dd < d) d = dd;
|
||||
if (d <= 0) break; /* No point looking at any more */
|
||||
slot += re->name_entry_size;
|
||||
}
|
||||
}
|
||||
else d = 0;
|
||||
cc += 1 + 2*IMM2_SIZE;
|
||||
goto REPEAT_BACK_REFERENCE;
|
||||
|
||||
/* Single back reference. We cannot find a length for this if duplicate
|
||||
numbers are present in the pattern. */
|
||||
|
||||
case OP_REF:
|
||||
case OP_REFI:
|
||||
if (dupcapused) return -1;
|
||||
recno = GET2(cc, 1);
|
||||
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
|
||||
d = backref_cache[recno];
|
||||
else
|
||||
{
|
||||
int i;
|
||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||
{
|
||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -420,54 +553,24 @@ for (;;)
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
int dd;
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
dd = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
if (dd < d) d = dd;
|
||||
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
|
||||
backref_cache);
|
||||
if (d < 0) return d;
|
||||
}
|
||||
}
|
||||
slot += re->name_entry_size;
|
||||
}
|
||||
}
|
||||
else d = 0;
|
||||
cc += 1 + 2*IMM2_SIZE;
|
||||
goto REPEAT_BACK_REFERENCE;
|
||||
else d = 0;
|
||||
|
||||
case OP_REF: /* Single back reference */
|
||||
case OP_REFI:
|
||||
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||
{
|
||||
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
d = find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
}
|
||||
}
|
||||
backref_cache[recno] = d;
|
||||
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
|
||||
backref_cache[0] = recno;
|
||||
}
|
||||
else d = 0;
|
||||
|
||||
cc += 1 + IMM2_SIZE;
|
||||
|
||||
/* Handle repeated back references */
|
||||
@ -504,28 +607,51 @@ for (;;)
|
||||
break;
|
||||
}
|
||||
|
||||
branchlength += min * d;
|
||||
/* Take care not to overflow: (1) min and d are ints, so check that their
|
||||
product is not greater than INT_MAX. (2) branchlength is limited to
|
||||
UINT16_MAX (checked at the top of the loop). */
|
||||
|
||||
if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
|
||||
branchlength = UINT16_MAX;
|
||||
else branchlength += min * d;
|
||||
break;
|
||||
|
||||
/* Recursion always refers to the first occurrence of a subpattern with a
|
||||
given number. Therefore, we can always make use of caching, even when the
|
||||
pattern contains multiple subpatterns with the same number. */
|
||||
|
||||
case OP_RECURSE:
|
||||
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
had_recurse = TRUE;
|
||||
recno = GET2(cs, 1+LINK_SIZE);
|
||||
if (recno == prev_recurse_recno)
|
||||
{
|
||||
branchlength += prev_recurse_d;
|
||||
}
|
||||
else
|
||||
{
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce) /* Simple recursion */
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
|
||||
recurse_check *r = recurses;
|
||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||
if (r != NULL) /* Mutual recursion */
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
{
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.group = cs;
|
||||
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
|
||||
countptr, backref_cache);
|
||||
if (prev_recurse_d < 0) return prev_recurse_d;
|
||||
prev_recurse_recno = recno;
|
||||
branchlength += prev_recurse_d;
|
||||
}
|
||||
}
|
||||
}
|
||||
cc += 1 + LINK_SIZE;
|
||||
cc += 1 + LINK_SIZE + once_fudge;
|
||||
once_fudge = 0;
|
||||
break;
|
||||
|
||||
/* Anything else does not or need not match a character. We can get the
|
||||
@ -708,7 +834,7 @@ Returns: nothing
|
||||
static void
|
||||
set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
|
||||
{
|
||||
register uint32_t c;
|
||||
uint32_t c;
|
||||
for (c = 0; c < table_limit; c++)
|
||||
re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
@ -749,7 +875,7 @@ Returns: nothing
|
||||
static void
|
||||
set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
|
||||
{
|
||||
register uint32_t c;
|
||||
uint32_t c;
|
||||
for (c = 0; c < table_limit; c++)
|
||||
re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
@ -789,7 +915,7 @@ Returns: SSB_FAIL => Failed to find any starting code units
|
||||
static int
|
||||
set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
|
||||
{
|
||||
register uint32_t c;
|
||||
uint32_t c;
|
||||
int yield = SSB_DONE;
|
||||
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
@ -1368,7 +1494,7 @@ do
|
||||
for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
|
||||
for (c = 128; c < 256; c++)
|
||||
{
|
||||
if ((classmap[c/8] && (1 << (c&7))) != 0)
|
||||
if ((classmap[c/8] & (1 << (c&7))) != 0)
|
||||
{
|
||||
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
|
||||
re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
|
||||
@ -1441,6 +1567,7 @@ int
|
||||
PRIV(study)(pcre2_real_code *re)
|
||||
{
|
||||
int min;
|
||||
int count = 0;
|
||||
PCRE2_UCHAR *code;
|
||||
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||
|
||||
@ -1461,22 +1588,35 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
|
||||
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
|
||||
}
|
||||
|
||||
/* Find the minimum length of subject string. */
|
||||
/* Find the minimum length of subject string. If the pattern can match an empty
|
||||
string, the minimum length is already known. If there are more back references
|
||||
than the size of the vector we are going to cache them in, do nothing. A
|
||||
pattern that complicated will probably take a long time to analyze and may in
|
||||
any case turn out to be too complicated. Note that back reference minima are
|
||||
held as 16-bit numbers. */
|
||||
|
||||
switch(min = find_minlength(re, code, code, utf, NULL))
|
||||
if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
|
||||
re->top_backref <= MAX_CACHE_BACKREF)
|
||||
{
|
||||
case -1: /* \C in UTF mode or (*ACCEPT) */
|
||||
break; /* Leave minlength unchanged (will be zero) */
|
||||
int backref_cache[MAX_CACHE_BACKREF+1];
|
||||
backref_cache[0] = 0; /* Highest one that is set */
|
||||
min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
|
||||
switch(min)
|
||||
{
|
||||
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
|
||||
break; /* Leave minlength unchanged (will be zero) */
|
||||
|
||||
case -2:
|
||||
return 2; /* missing capturing bracket */
|
||||
case -2:
|
||||
return 2; /* missing capturing bracket */
|
||||
|
||||
case -3:
|
||||
return 3; /* unrecognized opcode */
|
||||
case -3:
|
||||
return 3; /* unrecognized opcode */
|
||||
|
||||
default:
|
||||
re->minlength = min;
|
||||
break;
|
||||
default:
|
||||
if (min > UINT16_MAX) min = UINT16_MAX;
|
||||
re->minlength = min;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
Reference in New Issue
Block a user