Update bundled PCRE2-library to version 10.23

Some manual changes previously made to the library were lost with this update.
They will be re-applied in the next commit.
This commit is contained in:
Esa Korhonen
2017-05-29 15:31:42 +03:00
parent 7231563937
commit 36af74cb25
218 changed files with 49218 additions and 26130 deletions

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
New API code Copyright (c) 2016 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -371,7 +371,7 @@ internal_dfa_match(
uint32_t offsetcount,
int *workspace,
int wscount,
int rlevel)
uint32_t rlevel)
{
stateblock *active_states, *new_states, *temp_states;
stateblock *next_active_state, *next_new_state;
@ -400,8 +400,8 @@ BOOL utf = FALSE;
BOOL reset_could_continue = FALSE;
rlevel++;
offsetcount &= (-2);
if (rlevel++ > mb->match_limit_recursion) return PCRE2_ERROR_RECURSIONLIMIT;
offsetcount &= (uint32_t)(-2); /* Round down */
wscount -= 2;
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
@ -433,13 +433,13 @@ move back, and set up each alternative appropriately. */
if (*first_op == OP_REVERSE)
{
int max_back = 0;
int gone_back;
size_t max_back = 0;
size_t gone_back;
end_code = this_start_code;
do
{
int back = GET(end_code, 2+LINK_SIZE);
size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
if (back > max_back) max_back = back;
end_code += GET(end_code, 1);
}
@ -466,8 +466,8 @@ if (*first_op == OP_REVERSE)
/* In byte-mode we can do this quickly. */
{
gone_back = (current_subject - max_back < start_subject)?
(int)(current_subject - start_subject) : max_back;
size_t current_offset = (size_t)(current_subject - start_subject);
gone_back = (current_offset < max_back)? current_offset : max_back;
current_subject -= gone_back;
}
@ -481,11 +481,11 @@ if (*first_op == OP_REVERSE)
end_code = this_start_code;
do
{
int back = GET(end_code, 2+LINK_SIZE);
size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
ADD_NEW_DATA(-bstate, 0, gone_back - back);
ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
}
end_code += GET(end_code, 1);
}
@ -509,7 +509,7 @@ else
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
new_count = workspace[1];
if (!workspace[0])
memcpy(new_states, active_states, new_count * sizeof(stateblock));
memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
}
/* Not restarting */
@ -593,8 +593,9 @@ for (;;)
stateblock *current_state = active_states + i;
BOOL caseless = FALSE;
PCRE2_SPTR code;
uint32_t codevalue;
int state_offset = current_state->offset;
int codevalue, rrc;
int rrc;
int count;
/* A negative offset is a special case meaning "hold off going to this
@ -719,7 +720,7 @@ for (;;)
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
if (codevalue != OP_KET)
{
ADD_ACTIVE(state_offset - GET(code, 1), 0);
ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
}
}
else
@ -733,11 +734,12 @@ for (;;)
else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
match_count = 0;
count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(PCRE2_SIZE));
if (count > 0) memmove(offsets + 2, offsets,
(size_t)count * sizeof(PCRE2_SIZE));
if (offsetcount >= 2)
{
offsets[0] = (int)(current_subject - start_subject);
offsets[1] = (int)(ptr - start_subject);
offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
}
if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
}
@ -959,7 +961,7 @@ for (;;)
{
if (d == '_') left_word = TRUE; else
{
int cat = UCD_CATEGORY(d);
uint32_t cat = UCD_CATEGORY(d);
left_word = (cat == ucp_L || cat == ucp_N);
}
}
@ -984,7 +986,7 @@ for (;;)
{
if (c == '_') right_word = TRUE; else
{
int cat = UCD_CATEGORY(c);
uint32_t cat = UCD_CATEGORY(c);
right_word = (cat == ucp_L || cat == ucp_N);
}
}
@ -1369,7 +1371,7 @@ for (;;)
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
if (clen > 0)
{
int lgb, rgb;
uint32_t lgb, rgb;
PCRE2_SPTR nptr = ptr + clen;
int ncount = 0;
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
@ -1383,7 +1385,7 @@ for (;;)
dlen = 1;
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
rgb = UCD_GRAPHBREAK(d);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
ncount++;
lgb = rgb;
nptr += dlen;
@ -1630,7 +1632,7 @@ for (;;)
ADD_ACTIVE(state_offset + 2, 0);
if (clen > 0)
{
int lgb, rgb;
uint32_t lgb, rgb;
PCRE2_SPTR nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
@ -1645,7 +1647,7 @@ for (;;)
dlen = 1;
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
rgb = UCD_GRAPHBREAK(d);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
ncount++;
lgb = rgb;
nptr += dlen;
@ -1902,7 +1904,7 @@ for (;;)
count = current_state->count; /* Number already matched */
if (clen > 0)
{
int lgb, rgb;
uint32_t lgb, rgb;
PCRE2_SPTR nptr = ptr + clen;
int ncount = 0;
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
@ -1916,7 +1918,7 @@ for (;;)
dlen = 1;
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
rgb = UCD_GRAPHBREAK(d);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
ncount++;
lgb = rgb;
nptr += dlen;
@ -2097,7 +2099,7 @@ for (;;)
case OP_EXTUNI:
if (clen > 0)
{
int lgb, rgb;
uint32_t lgb, rgb;
PCRE2_SPTR nptr = ptr + clen;
int ncount = 0;
lgb = UCD_GRAPHBREAK(c);
@ -2106,7 +2108,7 @@ for (;;)
dlen = 1;
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
rgb = UCD_GRAPHBREAK(d);
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
ncount++;
lgb = rgb;
nptr += dlen;
@ -2582,14 +2584,14 @@ for (;;)
mb, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
rlevel); /* function recursion level */
if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
}
@ -2601,8 +2603,8 @@ for (;;)
{
PCRE2_SIZE local_offsets[1000];
int local_workspace[1000];
int codelink = GET(code, 1);
int condcode;
int codelink = (int)GET(code, 1);
PCRE2_UCHAR condcode;
/* Because of the way auto-callout works during compile, a callout item
is inserted between OP_COND and an assertion condition. This does not
@ -2611,8 +2613,10 @@ for (;;)
if (code[LINK_SIZE + 1] == OP_CALLOUT
|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)
{
unsigned int callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 2 + 3*LINK_SIZE);
PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)?
(PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
(PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE);
rrc = 0;
if (mb->callout != NULL)
{
@ -2678,7 +2682,7 @@ for (;;)
else if (condcode == OP_RREF)
{
int value = GET2(code, LINK_SIZE + 2);
unsigned int value = GET2(code, LINK_SIZE + 2);
if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
if (mb->recursive != NULL)
{ ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
@ -2699,14 +2703,14 @@ for (;;)
mb, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
local_workspace, /* workspace vector */
sizeof(local_workspace)/sizeof(int), /* size of same */
rlevel); /* function recursion level */
if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
@ -2747,7 +2751,7 @@ for (;;)
mb, /* fixed match data */
callpat, /* this subexpression's code */
ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
local_workspace, /* workspace vector */
@ -2768,18 +2772,19 @@ for (;;)
{
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
{
int charcount = local_offsets[rc+1] - local_offsets[rc];
PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf)
{
PCRE2_SPTR p = start_subject + local_offsets[rc];
PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
if (charcount > 0)
{
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
(int)(charcount - 1));
}
else
{
@ -2798,7 +2803,7 @@ for (;;)
case OP_SCBRAPOS:
case OP_BRAPOSZERO:
{
int charcount, matched_count;
PCRE2_SIZE charcount, matched_count;
PCRE2_SPTR local_ptr = ptr;
BOOL allow_zero;
@ -2821,7 +2826,7 @@ for (;;)
mb, /* fixed match data */
code, /* this subexpression's code */
local_ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
local_workspace, /* workspace vector */
@ -2872,11 +2877,11 @@ for (;;)
{
PCRE2_SPTR p = ptr;
PCRE2_SPTR pp = local_ptr;
charcount = (int)(pp - p);
charcount = (PCRE2_SIZE)(pp - p);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
}
}
}
@ -2893,7 +2898,7 @@ for (;;)
mb, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
(int)(ptr - start_subject), /* start offset */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
local_workspace, /* workspace vector */
@ -2903,7 +2908,7 @@ for (;;)
if (rc >= 0)
{
PCRE2_SPTR end_subpattern = code;
int charcount = local_offsets[1] - local_offsets[0];
PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
int next_state_offset, repeat_state_offset;
do { end_subpattern += GET(end_subpattern, 1); }
@ -2960,12 +2965,12 @@ for (;;)
{
PCRE2_SPTR p = start_subject + local_offsets[0];
PCRE2_SPTR pp = start_subject + local_offsets[1];
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
}
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
if (repeat_state_offset >= 0)
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
{ ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
}
}
else if (rc != PCRE2_ERROR_NOMATCH) return rc;
@ -3018,7 +3023,7 @@ for (;;)
return rrc; /* Abandon */
}
if (rrc == 0)
{ ADD_ACTIVE(state_offset + callout_length, 0); }
{ ADD_ACTIVE(state_offset + (int)callout_length, 0); }
}
break;
@ -3110,12 +3115,13 @@ Returns: > 0 => number of match offset pairs placed in offsets
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext, int *workspace, size_t wscount)
pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
{
const pcre2_real_code *re = (const pcre2_real_code *)code;
PCRE2_SPTR start_match;
PCRE2_SPTR end_subject;
PCRE2_SPTR bumpalong_limit;
PCRE2_SPTR req_cu_ptr;
BOOL utf, anchored, startline, firstline;
@ -3172,15 +3178,10 @@ occur. */
#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO));
options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
#undef FF
#undef OO
/* A NULL match context means "use a default context" */
if (mcontext == NULL)
mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
/* If restarting after a partial match, do some sanity checks on the contents
of the workspace. */
@ -3205,20 +3206,33 @@ where to start. */
startline = (re->flags & PCRE2_STARTLINE) != 0;
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
bumpalong_limit = end_subject;
/* Fill in the fields in the match block. */
/* Get data from the match context, if present, and fill in the fields in the
match block. It is an error to set an offset limit without setting the flag at
compile time. */
if (mcontext == NULL)
{
mb->callout = NULL;
mb->memctl = re->memctl;
mb->match_limit_recursion = PRIV(default_match_context).recursion_limit;
}
else
{
if (mcontext->offset_limit != PCRE2_UNSET)
{
if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
return PCRE2_ERROR_BADOFFSETLIMIT;
bumpalong_limit = subject + mcontext->offset_limit;
}
mb->callout = mcontext->callout;
mb->callout_data = mcontext->callout_data;
mb->memctl = mcontext->memctl;
mb->match_limit_recursion = mcontext->recursion_limit;
}
if (mb->match_limit_recursion > re->limit_recursion)
mb->match_limit_recursion = re->limit_recursion;
mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_count * re->name_entry_size;
@ -3264,18 +3278,50 @@ switch(re->newline_convention)
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
multiunit character. */
multiunit character. We check only the portion of the subject that is going to
be inspected during matching - from the offset minus the maximum lookbehind
to the given length. This saves time when a small part of a large subject is
being matched by the use of a starting offset. Note that the maximum lookbehind
is a number of characters, not code units. */
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
if (match_data->rc != 0) return match_data->rc;
PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
if (start_offset > 0)
{
#if PCRE2_CODE_UNIT_WIDTH != 32
if (start_offset > 0 && start_offset < length &&
NOT_FIRSTCHAR(subject[start_offset]))
return PCRE2_ERROR_BADUTFOFFSET;
unsigned int i;
if (start_match < end_subject && NOT_FIRSTCU(*start_match))
return PCRE2_ERROR_BADUTFOFFSET;
for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
{
check_subject--;
while (check_subject > subject &&
#if PCRE2_CODE_UNIT_WIDTH == 8
(*check_subject & 0xc0) == 0x80)
#else /* 16-bit */
(*check_subject & 0xfc00) == 0xdc00)
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
check_subject--;
}
#else /* In the 32-bit library, one code unit equals one character. */
check_subject -= re->max_lookbehind;
if (check_subject < subject) check_subject = subject;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
}
/* Validate the relevant portion of the subject. After an error, adjust the
offset to be an absolute offset in the whole string. */
match_data->rc = PRIV(valid_utf)(check_subject,
length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
if (match_data->rc != 0)
{
match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
return match_data->rc;
}
}
#endif /* SUPPORT_UNICODE */
@ -3295,7 +3341,8 @@ if (!anchored)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
if (utf && first_cu > 127)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
}
}
@ -3315,7 +3362,7 @@ if ((re->flags & PCRE2_LASTSET) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
}
}
@ -3427,7 +3474,7 @@ for (;;)
{
while (start_match < end_subject)
{
register uint32_t c = UCHAR21TEST(start_match);
uint32_t c = UCHAR21TEST(start_match);
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
#endif
@ -3467,7 +3514,7 @@ for (;;)
if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
{
register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
@ -3478,7 +3525,7 @@ for (;;)
{
while (p < end_subject)
{
register uint32_t pp = UCHAR21INCTEST(p);
uint32_t pp = UCHAR21INCTEST(p);
if (pp == req_cu || pp == req_cu2) { p--; break; }
}
}
@ -3507,6 +3554,10 @@ for (;;)
/* ------------ End of start of match optimizations ------------ */
/* Give no match if we have passed the bumpalong limit. */
if (start_match > bumpalong_limit) break;
/* OK, now we can do the business */
mb->start_used_ptr = start_match;
@ -3519,9 +3570,9 @@ for (;;)
start_match, /* where we currently are */
start_offset, /* start offset in subject */
match_data->ovector, /* offset vector */
match_data->oveccount * 2, /* actual size of same */
(uint32_t)match_data->oveccount * 2, /* actual size of same */
workspace, /* workspace vector */
wscount, /* size of same */
(int)wscount, /* size of same */
0); /* function recurse level */
/* Anything other than "no match" means we are done, always; otherwise, carry
@ -3535,7 +3586,7 @@ for (;;)
match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
}
match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
match_data->rightchar = mb->last_used_ptr - subject;
match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
match_data->rc = rc;
return rc;