diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11a55902e..85e9f8229 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,15 +46,11 @@ find_package(Avro)
find_package(GSSAPI)
find_package(SQLite)
-# Find or build PCRE2
+# Build PCRE2 so we always know the version
# Read BuildPCRE2 for details about how to add pcre2 as a dependency to a target
-find_package(PCRE2)
-if(NOT PCRE2_FOUND)
- message(STATUS "Using bundled PCRE2 library")
- include(cmake/BuildPCRE2.cmake)
-endif()
+include(cmake/BuildPCRE2.cmake)
-include_directories(${PCRE2_INCLUDE_DIRS})
+include_directories(BEFORE ${PCRE2_INCLUDE_DIRS})
# If the connector was not found, download and build it from source
if(NOT MARIADB_CONNECTOR_FOUND)
diff --git a/COPYRIGHT b/COPYRIGHT
index ddfc55378..906404381 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/Documentation/Filters/Masking.md b/Documentation/Filters/Masking.md
index 6122bd4d6..d15eea215 100644
--- a/Documentation/Filters/Masking.md
+++ b/Documentation/Filters/Masking.md
@@ -198,23 +198,21 @@ specified name.
#### `with`
-The value of this key is an object that specifies what the value
-of the matched column should be replaced with. Currently, the object
-is expected to contain either the key `value` or the key `fill`. The
-value of both must be a string. If both keys are specified, then
-`value` takes presedence.
+The value of this key is an object that specifies what the value of the matched
+column should be replaced with. Currently, the object is expected to contain
+either the key `value` or the key `fill`. The value of both must be a string
+with length greater than zero. If both keys are specified, `value` takes
+precedence. If `fill` is not specified, the default `X` is used as its value.
-If `value` is specified, then its value is used to replace the actual
-value verbatim and the length of the specified value must match the
-actual returned value (from the server) exactly. If the lengths do
-not match, then if `fill` is specified its value will be used to
-mask the actual value. Otherwise an error is logged and the value
-is *not* masked.
+If `value` is specified, then its value is used to replace the actual value
+verbatim and the length of the specified value must match the actual returned
+value (from the server) exactly. If the lengths do not match, the value of
+`fill` is used to mask the actual value.
-If `fill` is specified, then its value will be used for masking the
-value; as such if the lenghts match, by cutting it if the actual value
-is shorter, and by repeating it, fully or partially, the necessary
-amount of times, if the actual value is longer.
+When the value of `fill` (fill-value) is used for masking the returned value,
+the fill-value is used as many times as necessary to match the length of the
+returned value. If required, only a part of the fill-value may be used at the
+end of the mask value to get the lengths to match.
```
{
"rules": [
diff --git a/Documentation/Filters/Tee-Filter.md b/Documentation/Filters/Tee-Filter.md
index 8e1155351..72e7f4424 100644
--- a/Documentation/Filters/Tee-Filter.md
+++ b/Documentation/Filters/Tee-Filter.md
@@ -6,12 +6,15 @@ The tee filter is a "plumbing" fitting in the MariaDB MaxScale filter toolkit.
It can be used in a filter pipeline of a service to make copies of requests from
the client and send the copies to another service within MariaDB MaxScale.
+**Please Note:** Starting with MaxScale 2.2.0, any client that connects to a
+ service which uses a tee filter will require a grant for the loopback address,
+ i.e. `127.0.0.1`.
+
## Configuration
The configuration block for the TEE filter requires the minimal filter
parameters in its section within the MaxScale configuration file. The service to
-send the duplicates to must be defined. Currently the tee filter does not
-support multi-statements.
+send the duplicates to must be defined.
```
[DataMartFilter]
diff --git a/Documentation/REST-API/API.md b/Documentation/REST-API/API.md
index 1d3986aa9..c203e0f55 100644
--- a/Documentation/REST-API/API.md
+++ b/Documentation/REST-API/API.md
@@ -111,10 +111,16 @@ Credentials for authentication.
#### Content-Type
All PUT and POST requests must use the `Content-Type: application/json` media
-type and the request body must be a valid JSON representation of a resource. All
-PATCH requests must use the `Content-Type: application/json` media type and the
-request body must be a JSON document containing a partial definition of the
-original resource.
+type and the request body must be a complete and valid JSON representation of a
+resource. All PATCH requests must use the `Content-Type: application/json` media
+type and the request body must be a JSON document containing a partial
+definition of the original resource.
+
+The current version of the API supports PATCH-like PUT requests with
+partial definitions of resources in the request body. This is discouraged
+as it goes against the intended use of the PUT method. Future versions of
+the MaxScale REST API may remove this support, so this functionality should
+be considered deprecated.
#### Host
diff --git a/Documentation/REST-API/Resources-MaxScale.md b/Documentation/REST-API/Resources-MaxScale.md
index 6a19a52ab..5e6756e00 100644
--- a/Documentation/REST-API/Resources-MaxScale.md
+++ b/Documentation/REST-API/Resources-MaxScale.md
@@ -304,89 +304,69 @@ GET /v1/maxscale/modules
"self": "http://localhost:8989/v1/maxscale/modules/"
},
"data": {
- "id": "readwritesplit",
+ "id": "dbfwfilter",
"type": "module",
"attributes": {
- "module_type": "Router",
- "version": "V1.1.0",
- "description": "A Read/Write splitting router for enhancement read scalability",
- "api": "router",
+ "module_type": "Filter",
+ "version": "V1.2.0",
+ "description": "Firewall Filter",
+ "api": "filter",
"status": "GA",
+ "commands": [
+ {
+ "id": "rules/reload",
+ "type": "module_command",
+ "links": {
+ "self": "http://localhost:8989/v1/modules/dbfwfilter/rules/reload"
+ },
+ "attributes": {
+ "method": "POST",
+ "arg_min": 1,
+ "arg_max": 2,
+ "parameters": [
+ {
+ "description": "Filter to reload",
+ "type": "FILTER",
+ "required": true
+ },
+ {
+ "description": "Path to rule file",
+ "type": "[STRING]",
+ "required": false
+ }
+ ]
+ }
+ }
+ ],
"parameters": [
{
- "name": "use_sql_variables_in",
- "type": "enum",
- "default_value": "all",
- "enum_values": [
- "all",
- "master"
- ]
+ "name": "rules",
+ "type": "path"
},
{
- "name": "slave_selection_criteria",
- "type": "enum",
- "default_value": "LEAST_CURRENT_OPERATIONS",
- "enum_values": [
- "LEAST_GLOBAL_CONNECTIONS",
- "LEAST_ROUTER_CONNECTIONS",
- "LEAST_BEHIND_MASTER",
- "LEAST_CURRENT_OPERATIONS"
- ]
- },
- {
- "name": "master_failure_mode",
- "type": "enum",
- "default_value": "fail_instantly",
- "enum_values": [
- "fail_instantly",
- "fail_on_write",
- "error_on_write"
- ]
- },
- {
- "name": "max_slave_replication_lag",
- "type": "int",
- "default_value": "-1"
- },
- {
- "name": "max_slave_connections",
- "type": "string",
- "default_value": "255"
- },
- {
- "name": "retry_failed_reads",
- "type": "bool",
- "default_value": "true"
- },
- {
- "name": "disable_sescmd_history",
- "type": "bool",
- "default_value": "true"
- },
- {
- "name": "max_sescmd_history",
- "type": "count",
- "default_value": "0"
- },
- {
- "name": "strict_multi_stmt",
- "type": "bool",
- "default_value": "true"
- },
- {
- "name": "master_accept_reads",
+ "name": "log_match",
"type": "bool",
"default_value": "false"
},
{
- "name": "connection_keepalive",
- "type": "count",
- "default_value": "0"
+ "name": "log_no_match",
+ "type": "bool",
+ "default_value": "false"
+ },
+ {
+ "name": "action",
+ "type": "enum",
+ "default_value": "block",
+ "enum_values": [
+ "allow",
+ "block",
+ "ignore"
+ ]
}
]
},
"links": {
- "self": "http://localhost:8989/v1/modules/readwritesplit"
+ "self": "http://localhost:8989/v1/modules/dbfwfilter"
}
}
}
diff --git a/Documentation/REST-API/Resources-Monitor.md b/Documentation/REST-API/Resources-Monitor.md
index cb7a70f6e..832577c74 100644
--- a/Documentation/REST-API/Resources-Monitor.md
+++ b/Documentation/REST-API/Resources-Monitor.md
@@ -267,7 +267,7 @@ The :name in the URI must map to a monitor name with all whitespace replaced wit
hyphens. The request body must be a valid JSON document representing the modified monitor.
```
-PUT /v1/monitor/:name
+PATCH /v1/monitor/:name
```
### Modifiable Fields
diff --git a/Documentation/REST-API/Resources-Server.md b/Documentation/REST-API/Resources-Server.md
index 8254fa896..c50f186ca 100644
--- a/Documentation/REST-API/Resources-Server.md
+++ b/Documentation/REST-API/Resources-Server.md
@@ -312,7 +312,7 @@ Status: 403 Forbidden
### Update a server
```
-PUT /v1/servers/:name
+PATCH /v1/servers/:name
```
The _:name_ in the URI must map to a server name with all whitespace replaced
@@ -443,12 +443,12 @@ Request for `PUT /v1/server/server1`:
}
```
-The current implementation accepts both PUT and PATCH requests with partially
-defined resources as request body. If parts of the resource are not defined
-(e.g. the `attributes` field in the above example), those parts of the resource
-are not modified. All parts that are defined are interpreted as the new
-definition of those part of the resource. In the above example, the
-`relationships` of the resource are completely redefined.
+The current implementation accepts PATCH requests with partially defined
+resources as request body. If parts of the resource are not defined (e.g. the
+`attributes` field in the above example), those parts of the resource are not
+modified. All parts that are defined are interpreted as the new definition of
+those parts of the resource. In the above example, the `relationships` of the
+resource are completely redefined.
#### Response
diff --git a/Documentation/REST-API/Resources-Service.md b/Documentation/REST-API/Resources-Service.md
index 0f34f550b..e43c585b8 100644
--- a/Documentation/REST-API/Resources-Service.md
+++ b/Documentation/REST-API/Resources-Service.md
@@ -265,7 +265,7 @@ The _:name_ in the URI must map to a service name and the request body must be a
valid JSON Patch document which is applied to the resource.
```
-PUT /v1/services/:name
+PATCH /v1/services/:name
```
The following standard service parameters can be modified.
diff --git a/Documentation/Release-Notes/MaxScale-2.1.2-Release-Notes.md b/Documentation/Release-Notes/MaxScale-2.1.2-Release-Notes.md
index da7a52163..121ab76b9 100644
--- a/Documentation/Release-Notes/MaxScale-2.1.2-Release-Notes.md
+++ b/Documentation/Release-Notes/MaxScale-2.1.2-Release-Notes.md
@@ -1,4 +1,4 @@
-# MariaDB MaxScale 2.1.2 Release Notes
+# MariaDB MaxScale 2.1.2 Release Notes -- 2017-04-03
Release 2.1.2 is a Beta release.
diff --git a/Documentation/Release-Notes/MaxScale-2.1.3-Release-Notes.md b/Documentation/Release-Notes/MaxScale-2.1.3-Release-Notes.md
index 31071b8d9..02abaab50 100644
--- a/Documentation/Release-Notes/MaxScale-2.1.3-Release-Notes.md
+++ b/Documentation/Release-Notes/MaxScale-2.1.3-Release-Notes.md
@@ -1,4 +1,4 @@
-# MariaDB MaxScale 2.1.3 Release Notes
+# MariaDB MaxScale 2.1.3 Release Notes -- 2017-05-23
Release 2.1.3 is a GA release.
diff --git a/Documentation/Release-Notes/MaxScale-2.2.0-Release-Notes.md b/Documentation/Release-Notes/MaxScale-2.2.0-Release-Notes.md
index 781f7776f..3d4640569 100644
--- a/Documentation/Release-Notes/MaxScale-2.2.0-Release-Notes.md
+++ b/Documentation/Release-Notes/MaxScale-2.2.0-Release-Notes.md
@@ -23,6 +23,18 @@ This filter now uses the PCRE2-libarary to match queries. Previously, it used
the POSIX-version of PCRE2. The filter also accepts multiple match-server pairs.
Please see the NamedServerFilter documentation for details.
+### Tee Filter
+
+The `tee` filter has been rewritten to better suit the way MaxScale now
+functions. The filter requires that the service where the branched session is
+created has at least one network listener. The users must also be able to
+connect from the local MaxScale host. Usually this means that an extra grant for
+the loopback address is required (e.g. `myuser@127.0.0.1`).
+
+In addition to the aforementioned requirements, a failure to create a branched
+session no longer causes the actual client session to be closed. In most cases,
+this is desired behavior.
+
## Dropped Features
### MaxAdmin
diff --git a/Documentation/check_links.sh b/Documentation/check_links.sh
index 7b97efb64..5f949c782 100755
--- a/Documentation/check_links.sh
+++ b/Documentation/check_links.sh
@@ -5,7 +5,7 @@
# Use of this software is governed by the Business Source License included
# in the LICENSE.TXT file and at www.mariadb.com/bsl11.
#
-# Change Date: 2019-07-01
+# Change Date: 2020-01-01
#
# On the date above, in accordance with the Business Source License, use
# of this software will be governed by version 2 or later of the General
diff --git a/LICENSE.TXT b/LICENSE.TXT
index d781d6f9a..c2f9a61c7 100644
--- a/LICENSE.TXT
+++ b/LICENSE.TXT
@@ -4,13 +4,13 @@ License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
Parameters
Licensor: MariaDB Corporation Ab
-Licensed Work: MariaDB MaxScale (TM) v.2.1.
+Licensed Work: MariaDB MaxScale (TM) v.2.2.
The Licensed Work is (c) 2017 MariaDB Corporation Ab
Additional Use Grant: You may use the Licensed Work when your application
uses the Licensed Work with a total of less than three
server instances for any purpose.
-Change Date: 2019-07-01
+Change Date: 2020-01-01
Change License: Version 2 or later of the GNU General Public License as
published by the Free Software Foundation.
diff --git a/avro/maxavro.c b/avro/maxavro.c
index 8717ee19d..c013a3b4e 100644
--- a/avro/maxavro.c
+++ b/avro/maxavro.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro.h b/avro/maxavro.h
index 00a9649b9..f4b103354 100644
--- a/avro/maxavro.h
+++ b/avro/maxavro.h
@@ -6,7 +6,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_datablock.c b/avro/maxavro_datablock.c
index 1b21d2615..c4800d56d 100644
--- a/avro/maxavro_datablock.c
+++ b/avro/maxavro_datablock.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_file.c b/avro/maxavro_file.c
index 2a1a98213..df5292099 100644
--- a/avro/maxavro_file.c
+++ b/avro/maxavro_file.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_internal.h b/avro/maxavro_internal.h
index 8c4f4e1a5..476085be1 100644
--- a/avro/maxavro_internal.h
+++ b/avro/maxavro_internal.h
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_record.c b/avro/maxavro_record.c
index bc7bea243..0cccafc9e 100644
--- a/avro/maxavro_record.c
+++ b/avro/maxavro_record.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_schema.c b/avro/maxavro_schema.c
index 5bb6fa4a4..29215151f 100644
--- a/avro/maxavro_schema.c
+++ b/avro/maxavro_schema.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavro_write.c b/avro/maxavro_write.c
index 1ea2abc4a..1a3fb17fe 100644
--- a/avro/maxavro_write.c
+++ b/avro/maxavro_write.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/maxavrocheck.c b/avro/maxavrocheck.c
index 191802515..5cc552ac9 100644
--- a/avro/maxavrocheck.c
+++ b/avro/maxavrocheck.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/avro/test/test_values.c b/avro/test/test_values.c
index 6798d7cbe..578821b4f 100644
--- a/avro/test/test_values.c
+++ b/avro/test/test_values.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
diff --git a/client/maxadmin.c b/client/maxadmin.c
index 3a27083c0..f6ad531b8 100644
--- a/client/maxadmin.c
+++ b/client/maxadmin.c
@@ -4,7 +4,7 @@
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
- * Change Date: 2019-07-01
+ * Change Date: 2020-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
@@ -36,6 +36,7 @@
#include
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
problems, because it may leave the current matching point in the middle of a
-multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
-lock out the use of \C, causing a compile-time error if it is encountered.
+multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
+application to lock out the use of \C, causing a compile-time error if it is
+encountered. It is also possible to build PCRE2 with the use of \C permanently
+disabled.
Another way that performance can be hit is by running a pattern that has a very
@@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
-Last updated: 13 April 2015
+Last updated: 16 October 2015
+Return to the PCRE2 index page.
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+#include <pcre2.h>
+
+pcre2_code *pcre2_code_copy(const pcre2_code *code);
+
+This function makes a copy of the memory used for a compiled pattern, excluding
+any memory used by the JIT compiler. Without a subsequent call to
+pcre2_jit_compile(), the copy can be used only for non-JIT matching. The
+pointer to the character tables is copied, not the tables themselves (see
+pcre2_code_copy_with_tables()). The yield of the function is NULL if
+code is NULL or if sufficient memory cannot be obtained.
+
+There is a complete description of the PCRE2 native API in the
+pcre2api
+page and a description of the POSIX API in the
+pcre2posix
+page.
+
+Return to the PCRE2 index page.
+
+Return to the PCRE2 index page.
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+#include <pcre2.h>
+
+pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
+
+This function makes a copy of the memory used for a compiled pattern, excluding
+any memory used by the JIT compiler. Without a subsequent call to
+pcre2_jit_compile(), the copy can be used only for non-JIT matching.
+Unlike pcre2_code_copy(), a separate copy of the character tables is also
+made, with the new code pointing to it. This memory will be automatically freed
+when pcre2_code_free() is called. The yield of the function is NULL if
+code is NULL or if sufficient memory cannot be obtained.
+
+There is a complete description of the PCRE2 native API in the
+pcre2api
+page and a description of the POSIX API in the
+pcre2posix
+page.
+
+Return to the PCRE2 index page.
+
$title
\n",
- $ref, $ref);
+ $ref);
$ref++;
}
else
diff --git a/pcre2/AUTHORS b/pcre2/AUTHORS
index 14a1a19fd..e056ad686 100644
--- a/pcre2/AUTHORS
+++ b/pcre2/AUTHORS
@@ -8,7 +8,7 @@ Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England.
-Copyright (c) 1997-2015 University of Cambridge
+Copyright (c) 1997-2017 University of Cambridge
All rights reserved
@@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Emain domain: freemail.hu
-Copyright(c) 2010-2015 Zoltan Herczeg
+Copyright(c) 2010-2017 Zoltan Herczeg
All rights reserved.
@@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Emain domain: freemail.hu
-Copyright(c) 2009-2015 Zoltan Herczeg
+Copyright(c) 2009-2017 Zoltan Herczeg
All rights reserved.
####
diff --git a/pcre2/CMakeLists.txt b/pcre2/CMakeLists.txt
index b625873cc..883e947b0 100644
--- a/pcre2/CMakeLists.txt
+++ b/pcre2/CMakeLists.txt
@@ -67,7 +67,16 @@
# 2013-10-08 PH got rid of the "source" command, which is a bash-ism (use ".")
# 2013-11-05 PH added support for PARENS_NEST_LIMIT
# 2014-08-29 PH converted the file for PCRE2 (which has no C++).
-# 2015-04024 PH added support for PCRE2_DEBUG
+# 2015-04-24 PH added support for PCRE2_DEBUG
+# 2015-07-16 PH updated for new pcre2_find_bracket source module
+# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
+# 2015-10-16 PH added support for never-backslash-C
+# 2016-03-01 PH applied Chris Wilson's patch for MSVC static
+# 2016-06-24 PH applied Chris Wilson's second patch, putting the first under
+# a new option instead of being unconditional.
+# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch
+# fix by David Gaussmann
+# 2016-10-07 PH added PCREGREP_MAX_BUFSIZE
PROJECT(PCRE2 C)
@@ -79,7 +88,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
-SET(CMAKE_C_FLAGS "-I${PROJECT_SOURCE_DIR}/src ${CMAKE_C_FLAGS}")
+SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
# external packages
FIND_PACKAGE( BZip2 )
@@ -140,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING
- "Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
+ "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
+
+SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING
+ "Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.")
SET(PCRE2_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
@@ -154,12 +166,18 @@ SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcre2grep.")
+SET(PCRE2_SUPPORT_PCRE2GREP_CALLOUT ON CACHE BOOL
+ "Enable callout string support in pcre2grep.")
+
SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
+SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
+ "If ON, backslash-C (upper case C) is locked out.")
+
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
"Enable Valgrind support.")
@@ -178,6 +196,9 @@ IF (MINGW)
ENDIF(MINGW)
IF(MSVC)
+ OPTION(PCRE2_STATIC_RUNTIME
+ "ON=Compile against the static runtime (/MT)."
+ OFF)
OPTION(INSTALL_MSVC_PDB
"ON=Install .pdb files built by MSVC, if generated"
OFF)
@@ -250,6 +271,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1)
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
+IF(PCRE2_NEVER_BACKSLASH_C)
+ SET(NEVER_BACKSLASH_C 1)
+ENDIF(PCRE2_NEVER_BACKSLASH_C)
+
IF(PCRE2_SUPPORT_UNICODE)
SET(SUPPORT_UNICODE 1)
ENDIF(PCRE2_SUPPORT_UNICODE)
@@ -262,6 +287,10 @@ IF(PCRE2_SUPPORT_PCRE2GREP_JIT)
SET(SUPPORT_PCRE2GREP_JIT 1)
ENDIF(PCRE2_SUPPORT_PCRE2GREP_JIT)
+IF(PCRE2_SUPPORT_PCRE2GREP_CALLOUT)
+ SET(SUPPORT_PCRE2GREP_CALLOUT 1)
+ENDIF(PCRE2_SUPPORT_PCRE2GREP_CALLOUT)
+
IF(PCRE2_SUPPORT_VALGRIND)
SET(SUPPORT_VALGRIND 1)
ENDIF(PCRE2_SUPPORT_VALGRIND)
@@ -390,6 +419,7 @@ SET(PCRE2_SOURCES
src/pcre2_context.c
src/pcre2_dfa_match.c
src/pcre2_error.c
+ src/pcre2_find_bracket.c
src/pcre2_jit_compile.c
src/pcre2_maketables.c
src/pcre2_match.c
@@ -445,6 +475,18 @@ SET(PCRE2POSIX_SOURCES
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
ENDIF(MSVC AND NOT PCRE2_STATIC)
+# Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
+# This code was taken from the CMake wiki, not from WebM.
+
+IF(MSVC AND PCRE2_STATIC_RUNTIME)
+ MESSAGE(STATUS "** MSVC and PCRE2_STATIC_RUNTIME: modifying compiler flags to use static runtime library")
+ foreach(flag_var
+ CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+ CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+ string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+ endforeach()
+ENDIF(MSVC AND PCRE2_STATIC_RUNTIME)
+
# Build setup
ADD_DEFINITIONS(-DHAVE_CONFIG_H)
@@ -468,21 +510,19 @@ IF(PCRE2_BUILD_PCRE2_8)
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
SET_PROPERTY(TARGET pcre2-8
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
-SET_PROPERTY(TARGET pcre2-8
- PROPERTY VERSION 1.0.0)
SET(targets ${targets} pcre2-8)
-ADD_LIBRARY(pcre2posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
-SET_PROPERTY(TARGET pcre2posix
+ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
+SET_PROPERTY(TARGET pcre2-posix
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
-SET(targets ${targets} pcre2posix)
-TARGET_LINK_LIBRARIES(pcre2posix pcre2-8)
+SET(targets ${targets} pcre2-posix)
+TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
IF(MINGW AND NOT PCRE2_STATIC)
IF(NON_STANDARD_LIB_PREFIX)
- SET_TARGET_PROPERTIES(pcre2-8 pcre2posix PROPERTIES PREFIX "")
+ SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
ENDIF(NON_STANDARD_LIB_PREFIX)
IF(NON_STANDARD_LIB_SUFFIX)
- SET_TARGET_PROPERTIES(pcre2-8 pcre2posix PROPERTIES SUFFIX "-0.dll")
+ SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
ENDIF(NON_STANDARD_LIB_SUFFIX)
ENDIF(MINGW AND NOT PCRE2_STATIC)
ENDIF(PCRE2_BUILD_PCRE2_8)
@@ -530,7 +570,7 @@ IF(PCRE2_BUILD_PCRE2GREP)
SET_PROPERTY(TARGET pcre2grep
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
SET(targets ${targets} pcre2grep)
- TARGET_LINK_LIBRARIES(pcre2grep pcre2posix ${PCRE2GREP_LIBS})
+ TARGET_LINK_LIBRARIES(pcre2grep pcre2-posix ${PCRE2GREP_LIBS})
ENDIF(PCRE2_BUILD_PCRE2GREP)
# Testing
@@ -543,7 +583,7 @@ IF(PCRE2_BUILD_TESTS)
ADD_EXECUTABLE(pcre2test ${PCRE2TEST_SOURCES})
SET(targets ${targets} pcre2test)
IF(PCRE2_BUILD_PCRE2_8)
- LIST(APPEND PCRE2TEST_LIBS pcre2posix pcre2-8)
+ LIST(APPEND PCRE2TEST_LIBS pcre2-posix pcre2-8)
ENDIF(PCRE2_BUILD_PCRE2_8)
IF(PCRE2_BUILD_PCRE2_16)
LIST(APPEND PCRE2TEST_LIBS pcre2-16)
@@ -718,6 +758,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
+ MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
@@ -730,6 +771,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2_SUPPORT_PCRE2GREP_JIT}")
+ MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2_SUPPORT_PCRE2GREP_CALLOUT}")
MESSAGE(STATUS " Buffer size for pcre2grep ....... : ${PCRE2GREP_BUFSIZE}")
MESSAGE(STATUS " Build tests (implies pcre2test . : ${PCRE2_BUILD_TESTS}")
MESSAGE(STATUS " and pcre2grep)")
diff --git a/pcre2/COPYING b/pcre2/COPYING
index 94a9ed024..c233950f6 100644
--- a/pcre2/COPYING
+++ b/pcre2/COPYING
@@ -1,674 +1,5 @@
- GNU GENERAL PUBLIC LICENSE
- Version 3, 29 June 2007
+PCRE2 LICENCE
- Copyright (C) 2007 Free Software Foundation, Inc. pcre2_callout_enumerate
Enumerate callouts in a compiled pattern
+
+
+pcre2_code_copy
+ Copy a compiled pattern
+
pcre2_code_copy_with_tables
+ Copy a compiled pattern and its character tables
@@ -210,9 +216,15 @@ in the library.
pcre2_code_free
Free a compiled pattern
+pcre2_set_match_limit
Set the match limit
+
pcre2_set_max_pattern_length
+ Set the maximum length of pattern
+pcre2_set_newline
Set the newline convention
+
pcre2_set_offset_limit
+ Set the offset limit
diff --git a/pcre2/doc/html/pcre2.html b/pcre2/doc/html/pcre2.html
index e94b355a3..07ab8e9e8 100644
--- a/pcre2/doc/html/pcre2.html
+++ b/pcre2/doc/html/pcre2.html
@@ -126,8 +126,10 @@ running redundant checks.
pcre2_set_parens_nest_limit
Set the parentheses nesting limit
REVISION
Copyright © 1997-2015 University of Cambridge.
diff --git a/pcre2/doc/html/pcre2_code_copy.html b/pcre2/doc/html/pcre2_code_copy.html
new file mode 100644
index 000000000..667d7b7ff
--- /dev/null
+++ b/pcre2/doc/html/pcre2_code_copy.html
@@ -0,0 +1,43 @@
+
+pcre2_code_copy man page
+
+
+SYNOPSIS
+
+
+DESCRIPTION
+
+pcre2_code_copy_with_tables man page
+
+
+SYNOPSIS
+
+
+DESCRIPTION
+
+
-pcre2_code_free(pcre2_code *code); +void pcre2_code_free(pcre2_code *code);
PCRE2_ANCHORED Match only at the first position PCRE2_NOTBOL Subject is not the beginning of a line diff --git a/pcre2/doc/html/pcre2_get_error_message.html b/pcre2/doc/html/pcre2_get_error_message.html index 5d422913e..26c80febe 100644 --- a/pcre2/doc/html/pcre2_get_error_message.html +++ b/pcre2/doc/html/pcre2_get_error_message.html @@ -35,7 +35,10 @@ errors are negative numbers. The arguments are: bufflen the length of the buffer (code units)The function returns the length of the message, excluding the trailing zero, or -a negative error code if the buffer is too small. +the negative error code PCRE2_ERROR_NOMEMORY if the buffer is too small. In +this case, the returned message is truncated (but still with a trailing zero). +If errorcode does not contain a recognized error code number, the +negative value PCRE2_ERROR_BADDATA is returned.
There is a complete description of the PCRE2 native API in the diff --git a/pcre2/doc/html/pcre2_match_data_create.html b/pcre2/doc/html/pcre2_match_data_create.html index 03cbe244a..8d0321b55 100644 --- a/pcre2/doc/html/pcre2_match_data_create.html +++ b/pcre2/doc/html/pcre2_match_data_create.html @@ -19,7 +19,7 @@ SYNOPSIS #include <pcre2.h>
-pcre2_match_data_create(uint32_t ovecsize, +pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext);
-pcre2_match_data_create_from_pattern(const pcre2_code *code, - pcre2_general_context *gcontext); +pcre2_match_data *pcre2_match_data_create_from_pattern( + const pcre2_code *code, pcre2_general_context *gcontext);
int32_t pcre2_serialize_decode(pcre2_code **codes, - int32_t number_of_codes, const uint32_t *bytes, + int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext);
-int32_t pcre2_serialize_encode(pcre2_code **codes, - int32_t number_of_codes, uint32_t **serialized_bytes, +int32_t pcre2_serialize_encode(const pcre2_code **codes, + int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
+Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, + PCRE2_SIZE value); +
++This function sets, in a compile context, the maximum text length (in code +units) of the pattern that can be compiled. The result is always zero. If a +longer pattern is passed to pcre2_compile() there is an immediate error +return. The default is effectively unlimited, being the largest value a +PCRE2_SIZE variable can hold. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/pcre2/doc/html/pcre2_set_offset_limit.html b/pcre2/doc/html/pcre2_set_offset_limit.html new file mode 100644 index 000000000..6d9a85c64 --- /dev/null +++ b/pcre2/doc/html/pcre2_set_offset_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_offset_limit(pcre2_match_context *mcontext, + PCRE2_SIZE value); +
++This function sets the offset limit field in a match context. The result is +always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/pcre2/doc/html/pcre2_substitute.html b/pcre2/doc/html/pcre2_substitute.html index 0976947dd..2dfd09475 100644 --- a/pcre2/doc/html/pcre2_substitute.html +++ b/pcre2/doc/html/pcre2_substitute.html @@ -59,20 +59,25 @@ units, not characters, as is the contents of the variable pointed at by outlengthptr, which is updated to the actual length of the new string. The options are:- PCRE2_ANCHORED Match only at the first position - PCRE2_NOTBOL Subject string is not the beginning of a line - PCRE2_NOTEOL Subject string is not the end of a line - PCRE2_NOTEMPTY An empty string is not a valid match - PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject - is not a valid match - PCRE2_NO_UTF_CHECK Do not check the subject or replacement for - UTF validity (only relevant if PCRE2_UTF - was set at compile time) - PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_ANCHORED Match only at the first position + PCRE2_NOTBOL Subject is not the beginning of a line + PCRE2_NOTEOL Subject is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the + subject is not a valid match + PCRE2_NO_UTF_CHECK Do not check the subject or replacement + for UTF validity (only relevant if + PCRE2_UTF was set at compile time) + PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing + PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length + PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset + PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty stringThe function returns the number of substitutions, which may be zero if there were no matches. The result can be greater than one only when -PCRE2_SUBSTITUTE_GLOBAL is set. +PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code +is returned.
There is a complete description of the PCRE2 native API in the diff --git a/pcre2/doc/html/pcre2api.html b/pcre2/doc/html/pcre2api.html index 60d2bf569..6ffa69fb6 100644 --- a/pcre2/doc/html/pcre2api.html +++ b/pcre2/doc/html/pcre2api.html @@ -43,16 +43,17 @@ please consult the man page, in case the conversion went wrong.
#include <pcre2.h>
@@ -70,15 +71,15 @@ document for an overview of all the PCRE2 documentation.
pcre2_compile_context *ccontext);
-pcre2_code_free(pcre2_code *code);
+void pcre2_code_free(pcre2_code *code);
-pcre2_match_data_create(uint32_t ovecsize,
+pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
pcre2_general_context *gcontext);
-pcre2_match_data_create_from_pattern(const pcre2_code *code,
- pcre2_general_context *gcontext);
+pcre2_match_data *pcre2_match_data_create_from_pattern(
+ const pcre2_code *code, pcre2_general_context *gcontext);
int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
@@ -143,6 +144,10 @@ document for an overview of all the PCRE2 documentation.
const unsigned char *tables);
+int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
+ PCRE2_SIZE value);
+
+
int pcre2_set_newline(pcre2_compile_context *ccontext,
uint32_t value);
@@ -176,6 +181,10 @@ document for an overview of all the PCRE2 documentation.
uint32_t value);
+int pcre2_set_offset_limit(pcre2_match_context *mcontext,
+ PCRE2_SIZE value);
+
+
int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
uint32_t value);
@@ -266,12 +275,12 @@ document for an overview of all the PCRE2 documentation.
PCRE2 NATIVE API SERIALIZATION FUNCTIONS
int32_t pcre2_serialize_decode(pcre2_code **codes,
- int32_t number_of_codes, const uint32_t *bytes,
+ int32_t number_of_codes, const uint8_t *bytes,
pcre2_general_context *gcontext);
-int32_t pcre2_serialize_encode(pcre2_code **codes,
- int32_t number_of_codes, uint32_t **serialized_bytes,
+int32_t pcre2_serialize_encode(const pcre2_code **codes,
+ int32_t number_of_codes, uint8_t **serialized_bytes,
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
@@ -282,6 +291,12 @@ document for an overview of all the PCRE2 documentation.
+pcre2_code *pcre2_code_copy(const pcre2_code *code);
+
+
+pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
+
+
int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
PCRE2_SIZE bufflen);
@@ -406,9 +421,10 @@ More complicated programs might need to make use of the specialist functions
pcre2_jit_stack_assign() in order to control the JIT code's memory usage.
-JIT matching is automatically used by pcre2_match() if it is available. -There is also a direct interface for JIT matching, which gives improved -performance. The JIT-specific functions are discussed in the +JIT matching is automatically used by pcre2_match() if it is available, +unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT +matching, which gives improved performance. The JIT-specific functions are +discussed in the pcre2jit documentation.
@@ -447,10 +463,19 @@ return a copy of the subject string with substitutions for parts that were matched.+Functions whose names begin with pcre2_serialize_ are used for saving +compiled patterns on disc or elsewhere, and reloading them later. +
+Finally, there are functions for finding out information about a compiled pattern (pcre2_pattern_info()) and about the configuration with which PCRE2 was built (pcre2_config()).
++Functions with names ending with _free() are used for freeing memory +blocks of various sorts. In all cases, if one of these functions is called with +a NULL argument, it does nothing. +
The PCRE2 API uses string lengths and offsets into strings of code units in @@ -508,20 +533,52 @@ time ensuring that multithreaded applications can use it. There are several different blocks of data that are used to pass information between the application and the PCRE2 libraries.
+-(1) A pointer to the compiled form of a pattern is returned to the user when +A pointer to the compiled form of a pattern is returned to the user when pcre2_compile() is successful. The data in the compiled pattern is fixed, and does not change when the pattern is matched. Therefore, it is thread-safe, that is, the same compiled pattern can be used by more than one thread -simultaneously. An application can compile all its patterns at the start, -before forking off multiple threads that use them. However, if the just-in-time -optimization feature is being used, it needs separate memory stack areas for -each thread. See the +simultaneously. For example, an application can compile all its patterns at the +start, before forking off multiple threads that use them. However, if the +just-in-time optimization feature is being used, it needs separate memory stack +areas for each thread. See the pcre2jit documentation for more details.
-(2) The next section below introduces the idea of "contexts" in which PCRE2 +In a more complicated situation, where patterns are compiled only when they are +first needed, but are still shared between threads, pointers to compiled +patterns must be protected from simultaneous writing by multiple threads, at +least until a pattern has been compiled. The logic can be something like this: +
+ Get a read-only (shared) lock (mutex) for pointer
+ if (pointer == NULL)
+ {
+ Get a write (unique) lock for pointer
+ pointer = pcre2_compile(...
+ }
+ Release the lock
+ Use pointer in pcre2_match()
+
+Of course, testing for compilation errors should also be included in the code.
+
++If JIT is being used, but the JIT compilation is not being done immediately +(perhaps waiting to see if the pattern is used often enough), similar logic is +required. JIT compilation updates a pointer within the compiled code block, so +a thread must gain unique write access to the pointer before calling +pcre2_jit_compile(). Alternatively, pcre2_code_copy() or +pcre2_code_copy_with_tables() can be used to obtain a private copy of the +compiled code. +
++The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. Grouping a number of parameters together in a context is a convenient way of passing them to a PCRE2 function without @@ -535,11 +592,14 @@ are never changed, the same context can be used by all the threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy.
+-(3) The matching functions need a block of memory for working space and for -storing the results of a match. This includes details of what was matched, as -well as additional information such as the name of a (*MARK) setting. Each -thread must provide its own version of this memory. +The matching functions need a block of memory for working space and for storing +the results of a match. This includes details of what was matched, as well as +additional information such as the name of a (*MARK) setting. Each thread must +provide its own copy of this memory.
@@ -610,6 +670,7 @@ of the following compile-time parameters:
PCRE2's character tables
The newline character sequence
The compile time nested parentheses limit
+ The maximum length of the pattern string
An external function for stack checking
A compile context is also required if you are using custom memory management.
@@ -648,6 +709,15 @@ interpreted matching functions, pcre2_match() and
The value must be the result of a call to pcre2_maketables(), whose only
argument is a general context. This function builds a set of character tables
in the current locale.
+int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
+ PCRE2_SIZE value);
+
+
+This sets a maximum length, in code units, for the pattern string that is to be
+compiled. If the pattern is longer, an error is generated. This facility is
+provided so that applications that accept patterns from external sources can
+limit their size. The default is the largest number that a PCRE2_SIZE variable
+can hold, which is effectively unlimited.
int pcre2_set_newline(pcre2_compile_context *ccontext,
uint32_t value);
@@ -670,7 +740,8 @@ functions, pcre2_match() and pcre2_dfa_match().
This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
-using up too much system stack when being compiled.
+using up too much system stack when being compiled. The limit applies to
+parentheses of all kinds, not just capturing parentheses.
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
int (*guard_function)(uint32_t, void *), void *user_data);
@@ -697,8 +768,9 @@ A match context is required if you want to change the default values of any
of the following match-time parameters:
A callout function - The limit for calling match() - The limit for calling match() recursively + The offset limit for matching an unanchored pattern + The limit for calling match() (see below) + The limit for calling match() recursivelyA match context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of @@ -729,6 +801,32 @@ This sets up a "callout" function, which PCRE2 will call at specified points during a matching operation. Details are given in the pcre2callout documentation. +int pcre2_set_offset_limit(pcre2_match_context *mcontext, + PCRE2_SIZE value); +
+When using this facility, you must set PCRE2_USE_OFFSET_LIMIT when calling +pcre2_compile() so that when JIT is in use, different code can be +compiled. If a match is started with a non-default offset limit when +PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. +
+
+The offset limit facility can be used to track progress when searching large
+subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
+start within the first line of the subject. If this is set with an offset
+limit, a match must occur in the first line and also within the offset limit.
+In other words, whichever limit comes first is used.
int pcre2_set_match_limit(pcre2_match_context *mcontext,
uint32_t value);
@@ -781,21 +879,23 @@ This limit is of use only if it is set smaller than match_limit.
Limiting the recursion depth limits the amount of system stack that can be
used, or, when PCRE2 has been compiled to use memory on the heap instead of the
stack, the amount of heap memory that can be used. This limit is not relevant,
-and is ignored, when matching is done using JIT compiled code or by the
-pcre2_dfa_match() function.
+and is ignored, when matching is done using JIT compiled code. However, it is
+supported by pcre2_dfa_match(), which uses recursive function calls less
+frequently than pcre2_match(), but which can be caused to use a lot of
+stack by a recursive pattern such as /(.)(?1)/ matched to a very long string.
The default value for recursion_limit can be set when PCRE2 is built; the default default is the same value as the default for match_limit. If the -limit is exceeded, pcre2_match() returns PCRE2_ERROR_RECURSIONLIMIT. A -value for the recursion limit may also be supplied by an item at the start of a -pattern of the form +limit is exceeded, pcre2_match() and pcre2_dfa_match() return +PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be +supplied by an item at the start of a pattern of the form
(*LIMIT_RECURSION=ddd)where ddd is a decimal number. However, such a setting is ignored unless ddd is -less than the limit set by the caller of pcre2_match() or, if no such -limit is set, less than the default. +less than the limit set by the caller of pcre2_match() or +pcre2_dfa_match() or, if no such limit is set, less than the default. int pcre2_set_recursion_memory_management( pcre2_match_context *mcontext, void *(*private_malloc)(PCRE2_SIZE, void *), @@ -936,7 +1036,7 @@ The where argument should point to a buffer that is at least 24 code units long. (The exact length required can be found by calling pcre2_config() with where set to NULL.) If PCRE2 has been compiled without Unicode support, the buffer is filled with the text "Unicode not -supported". Otherwise, the Unicode version string (for example, "7.0.0") is +supported". Otherwise, the Unicode version string (for example, "8.0.0") is inserted. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero.
@@ -961,35 +1061,67 @@ zero. pcre2_compile_context *ccontext);The internal recursion limit was reached. + +
-pcre2_code_free(pcre2_code *code); +void pcre2_code_free(pcre2_code *code); +
+
+pcre2_code *pcre2_code_copy(const pcre2_code *code); +
+
+pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);The pcre2_compile() function compiles a pattern into an internal form. -The pattern is defined by a pointer to a string of code units and a length, If +The pattern is defined by a pointer to a string of code units and a length. If the pattern is zero-terminated, the length can be specified as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of memory that -contains the compiled pattern and related data. The caller must free the memory -by calling pcre2_code_free() when it is no longer needed. -
--NOTE: When one of the matching functions is called, pointers to the compiled -pattern and the subject string are set in the match data block so that they can -be referenced by the extraction functions. After running a match, you must not -free a compiled pattern (or a subject string) until after all operations on the -match data block -have taken place. +contains the compiled pattern and related data, or NULL if an error occurred.
If the compile context argument ccontext is NULL, memory for the compiled pattern is obtained by calling malloc(). Otherwise, it is obtained from -the same memory function that was used for the compile context. +the same memory function that was used for the compile context. The caller must +free the memory by calling pcre2_code_free() when it is no longer needed.
-The options argument contains various bit settings that affect the -compilation. It should be zero if no options are required. The available -options are described below. Some of them (in particular, those that are -compatible with Perl, but some others as well) can also be set and unset from -within the pattern (see the detailed description in the +The function pcre2_code_copy() makes a copy of the compiled code in new +memory, using the same memory allocator as was used for the original. However, +if the code has been processed by the JIT compiler (see +below), +the JIT information cannot be copied (because it is position-dependent). +The new copy can initially be used only for non-JIT matching, though it can be +passed to pcre2_jit_compile() if required. +
++The pcre2_code_copy() function provides a way for individual threads in a +multithreaded application to acquire a private copy of shared compiled code. +However, it does not make a copy of the character tables used by the compiled +pattern; the new pattern code points to the same tables as the original code. +(See +"Locale Support" +below for details of these character tables.) In many applications the same +tables are used throughout, so this behaviour is appropriate. Nevertheless, +there are occasions when a copy of a compiled pattern and the relevant tables +are needed. The pcre2_code_copy_with_tables() provides this facility. +Copies of both the code and the tables are made, with the new code pointing to +the new tables. The memory for the new tables is automatically freed when +pcre2_code_free() is called for the new copy of the compiled code. +
++NOTE: When one of the matching functions is called, pointers to the compiled +pattern and the subject string are set in the match data block so that they can +be referenced by the substring extraction functions. After running a match, you +must not free a compiled pattern (or a subject string) until after all +operations on the +match data block +have taken place. +
++The options argument for pcre2_compile() contains various bit +settings that affect the compilation. It should be zero if no options are +required. The available options are described below. Some of them (in +particular, those that are compatible with Perl, but some others as well) can +also be set and unset from within the pattern (see the detailed description in +the pcre2pattern documentation).
@@ -1006,13 +1138,27 @@ newline setting) can be provided in a compile context (as describedIf errorcode or erroroffset is NULL, pcre2_compile() returns -NULL immediately. Otherwise, if compilation of a pattern fails, -pcre2_compile() returns NULL, having set these variables to an error code -and an offset (number of code units) within the pattern, respectively. The -pcre2_get_error_message() function provides a textual message for each -error code. Compilation errors are positive numbers, but UTF formatting errors -are negative numbers. For an invalid UTF-8 or UTF-16 string, the offset is that -of the first code unit of the failing character. +NULL immediately. Otherwise, the variables to which these point are set to an +error code and an offset (number of code units) within the pattern, +respectively, when pcre2_compile() returns NULL because a compilation +error has occurred. The values are not defined when compilation is successful +and pcre2_compile() returns a non-NULL value. +
++The value returned in erroroffset is an indication of where in the +pattern the error occurred. It is not necessarily the furthest point in the +pattern that was read. For example, after the error "lookbehind assertion is +not fixed length", the error offset points to the start of the failing +assertion. +
++The pcre2_get_error_message() function (see "Obtaining a textual error +message" +below) +provides a textual message for each error code. Compilation errors have +positive error codes; UTF formatting error codes are negative. For an invalid +UTF-8 or UTF-16 string, the offset is that of the first code unit of the +failing character.
Some errors are not detected until the whole pattern has been scanned; in these @@ -1083,12 +1229,24 @@ after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +
+ PCRE2_ALT_VERBNAMES ++By default, for compatibility with Perl, the name in any verb sequence such as +(*MARK:NAME) is any sequence of characters that does not include a closing +parenthesis. The name is not processed in any way, and it is not possible to +include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES +option is set, normal backslash processing is applied to verb names and only an +unescaped closing parenthesis terminates the name. A closing parenthesis can be +included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED +option is set, unescaped whitespace in verb names is skipped and #-comments are +recognized, exactly as in the rest of the pattern.PCRE2_AUTO_CALLOUTIf this bit is set, pcre2_compile() automatically inserts callout items, -all with number 255, before each pattern item. For discussion of the callout -facility, see the +all with number 255, before each pattern item, except immediately before or +after a callout in the pattern. For discussion of the callout facility, see the pcre2callout documentation.@@ -1156,7 +1314,10 @@ built.If this option is set, an unanchored pattern is required to match before or at the first newline in the subject string, though the matched text may continue -over the newline. +over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more +general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a +match must occur in the first line and also within the offset limit. In other +words, whichever limit comes first is used.PCRE2_MATCH_UNSET_BACKREF@@ -1195,7 +1356,8 @@ This option locks out the use of \C in the pattern that is being compiled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in applications that process patterns from -external sources. +external sources. 
Note that there is also a build-time option that permanently +locks out the use of \C.PCRE2_NEVER_UCP@@ -1221,7 +1383,9 @@ If this option is set, it disables the use of numbered capturing parentheses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). There is no equivalent of this option -in Perl. +in Perl. Note that, if this option is set, references to capturing groups (back +references or recursion/subroutine calls) may only refer to named groups, +though the reference can be by name or by number.PCRE2_NO_AUTO_POSSESS@@ -1338,6 +1502,17 @@ support. This option inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by "?". It is not compatible with Perl. It can also be set by a (?U) option setting within the pattern. ++ PCRE2_USE_OFFSET_LIMIT ++This option must be set for pcre2_compile() if +pcre2_set_offset_limit() is going to be used to set a non-default offset +limit in a match context for matches that use this pattern. An error is +generated if an offset limit is set without this option. For more details, see +the description of pcre2_set_offset_limit() in the +section +that describes match contexts. See also the PCRE2_FIRSTLINE +option above.PCRE2_UTF@@ -1352,14 +1527,17 @@ page.
COMPILATION ERROR CODES
-There are over 80 positive error codes that pcre2_compile() may return if -it finds an error in the pattern. There are also some negative error codes that -are used for invalid UTF strings. These are the same as given by -pcre2_match() and pcre2_dfa_match(), and are described in the +There are over 80 positive error codes that pcre2_compile() may return +(via errorcode) if it finds an error in the pattern. There are also some +negative error codes that are used for invalid UTF strings. These are the same +as given by pcre2_match() and pcre2_dfa_match(), and are described +in the pcre2unicode -page. The pcre2_get_error_message() function can be called to obtain a -textual error message from any error code. -
+page. The pcre2_get_error_message() function (see "Obtaining a textual +error message" +below) +can be called to obtain a textual error message from any error code. +
JUST-IN-TIME (JIT) COMPILATION
int pcre2_jit_compile(pcre2_code *code, uint32_t options); @@ -1495,11 +1673,15 @@ are as follows: Return a copy of the pattern's options. The third argument should point to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOPTIONS returns -the compile options as modified by any top-level option settings at the start -of the pattern itself. In other words, they are the options that will be in -force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is -compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS, -PCRE2_MULTILINE, and PCRE2_EXTENDED. +the compile options as modified by any top-level (*XXX) option settings such as +(*UTF) at the start of the pattern itself. +
++For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED +option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. +Option settings such as (?i) that can change within a pattern do not affect the +result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the +pattern. (This was different in some earlier releases.)
A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if @@ -1541,18 +1723,27 @@ matches only CR, LF, or CRLF.
PCRE2_INFO_CAPTURECOUNT-Return the number of capturing subpatterns in the pattern. The third argument -should point to an uint32_t variable. +Return the highest capturing subpattern number in the pattern. In patterns +where (?| is not used, this is also the total number of capturing subpatterns. +The third argument should point to an uint32_t variable. ++ PCRE2_INFO_FIRSTBITMAP ++In the absence of a single first code unit for a non-anchored pattern, +pcre2_compile() may construct a 256-bit table that defines a fixed set of +values for the first code unit in any match. For example, a pattern that starts +with [abc] results in a table with three bits set. When code unit values +greater than 255 are supported, the flag bit for 255 means "any code unit of +value 255 or above". If such a table was constructed, a pointer to it is +returned. Otherwise NULL is returned. The third argument should point to an +const uint8_t * variable.PCRE2_INFO_FIRSTCODETYPEReturn information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to an uint32_t -variable. - --If there is a fixed first value, for example, the letter "c" from a pattern -such as (cat|cow|coyote), 1 is returned, and the character value can be +variable. If there is a fixed first value, for example, the letter "c" from a +pattern such as (cat|cow|coyote), 1 is returned, and the character value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is known that a match can occur only at the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored @@ -1567,16 +1758,10 @@ value is always less than 256. In the 16-bit library the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode.
- PCRE2_INFO_FIRSTBITMAP + PCRE2_INFO_HASBACKSLASHC-In the absence of a single first code unit for a non-anchored pattern, -pcre2_compile() may construct a 256-bit table that defines a fixed set of -values for the first code unit in any match. For example, a pattern that starts -with [abc] results in a table with three bits set. When code unit values -greater than 255 are supported, the flag bit for 255 means "any code unit of -value 255 or above". If such a table was constructed, a pointer to it is -returned. Otherwise NULL is returned. The third argument should point to an -const uint8_t * variable. +Return 1 if the pattern contains any instances of \C, otherwise 0. The third +argument should point to an uint32_t variable.PCRE2_INFO_HASCRORLF@@ -1602,13 +1787,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any matched string, other than at its start. The third argument should point to an uint32_t variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using -PCRE2_INFO_LASTCODEUNIT. - --For anchored patterns, a last literal value is recorded only if it follows -something of variable length. For example, for the pattern /^a\d+z\d+/ the -returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for -/^a\dz\d/ the returned value is 0. +PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is +recorded only if it follows something of variable length. For example, for the +pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from +PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
PCRE2_INFO_LASTCODEUNIT@@ -1619,8 +1801,11 @@ value, 0 is returned.PCRE2_INFO_MATCHEMPTY-Return 1 if the pattern can match an empty string, otherwise 0. The third -argument should point to an uint32_t variable. +Return 1 if the pattern might match an empty string, otherwise 0. The third +argument should point to an uint32_t variable. When a pattern contains +recursive subroutine calls it is not always possible to determine whether or +not it can match an empty string. PCRE2 takes a cautious approach and returns 1 +in such cases.PCRE2_INFO_MATCHLIMIT@@ -1778,12 +1963,12 @@ documentation.
THE MATCH DATA BLOCK
-pcre2_match_data_create(uint32_t ovecsize, +pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext);
-pcre2_match_data_create_from_pattern(const pcre2_code *code, - pcre2_general_context *gcontext); +pcre2_match_data *pcre2_match_data_create_from_pattern( + const pcre2_code *code, pcre2_general_context *gcontext);
void pcre2_match_data_free(pcre2_match_data *match_data); @@ -1793,7 +1978,7 @@ Information about a successful or unsuccessful match is placed in a match data block, which is an opaque structure that is accessed by function calls. In particular, the match data block contains a vector of offsets into the subject string that define the matched part of the subject and any substrings that were -captured. This is know as the ovector. +captured. This is known as the ovector.Before calling pcre2_match(), pcre2_dfa_match(), or @@ -1951,14 +2136,15 @@ Option bits for pcre2_match()
The unused bits of the options argument for pcre2_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, -PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, -PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. +PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, +PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is +described below.
Setting PCRE2_ANCHORED at match time is not supported by the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the normal interpretive -code in pcre2_match() is run. The remaining options are supported for JIT -matching. +code in pcre2_match() is run. Apart from PCRE2_NO_JIT (obviously), the +remaining options are supported for JIT matching.
PCRE2_ANCHORED@@ -2005,17 +2191,32 @@ only at the first matching position, that is, at the start of the subject plus the starting offset. An empty string match later in the subject is permitted. If the pattern is anchored, such a match can occur only if the pattern contains \K. ++ PCRE2_NO_JIT ++By default, if a pattern has been successfully processed by +pcre2_jit_compile(), JIT is automatically used when pcre2_match() +is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use +of JIT; it forces matching to be done by the interpreter.PCRE2_NO_UTF_CHECKWhen PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked by default when pcre2_match() is subsequently called. -The entire string is checked before any other processing takes place, and a +If a non-zero starting offset is given, the check is applied only to that part +of the subject that could be inspected during matching, and there is a check +that the starting offset points to the first code unit of a character or to the +end of the subject. If there are no lookbehind assertions in the pattern, the +check starts at the starting offset. Otherwise, it starts at the length of the +longest lookbehind before the starting offset, or at the start of the subject +if there are not that many characters before the starting offset. Note that the +sequences \b and \B are one-character lookbehinds. + ++The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the -code unit sequence. The value of startoffset is also checked, to ensure -that it points to the start of a character or to the end of the subject. There -are discussions about the validity of +code unit sequence. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and @@ -2066,8 +2267,13 @@ documentation. 
When PCRE2 is built, a default newline convention is set; this is usually the standard convention for the operating system. The default can be overridden in a -compile context. -During matching, the newline choice affects the behaviour of the dot, +compile context +by calling pcre2_set_newline(). It can also be overridden by starting a +pattern string with, for example, (*CRLF), as described in the +section on newline conventions +in the +pcre2pattern +page. During matching, the newline choice affects the behaviour of the dot, circumflex, and dollar metacharacters. It may also alter the way the match starting position is advanced after a match failure for an unanchored pattern.
@@ -2115,19 +2321,20 @@ function can be used to find out how many capturing subpatterns there are in a compiled pattern.-A successful match returns the overall matched string and any captured -substrings to the caller via a vector of PCRE2_SIZE values. This is called the -ovector, and is contained within the -match data block. -You can obtain direct access to the ovector by calling -pcre2_get_ovector_pointer() to find its address, and -pcre2_get_ovector_count() to find the number of pairs of values it -contains. Alternatively, you can use the auxiliary functions for accessing -captured substrings +You can use auxiliary functions for accessing captured substrings by number or -by name -(see below). +by name, +as described in sections below. +
++Alternatively, you can make direct use of the vector of PCRE2_SIZE values, +called the ovector, which contains the offsets of captured strings. It is +part of the +match data block. +The function pcre2_get_ovector_pointer() returns the address of the +ovector, and pcre2_get_ovector_count() returns the number of pairs of +values it contains.
Within the ovector, the first in each pair of values is set to the offset of @@ -2216,7 +2423,13 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and pcre2_get_mark() can be called. It returns a pointer to the zero-terminated name, which is within the compiled pattern. Otherwise NULL is -returned. After a successful match, the (*MARK) name that is returned is the +returned. The length of the (*MARK) name (excluding the terminating zero) is +stored in the code unit that precedes the name. You should use this instead of +relying on the terminating zero if the (*MARK) name might contain a binary +zero. +
++After a successful match, the (*MARK) name that is returned is the last one encountered on the matching path through the pattern. After a "no match" or a partial match, the last encountered (*MARK) name is returned. For example, consider this pattern: @@ -2237,7 +2450,7 @@ escape sequence. After a partial match, however, this value is always the same as ovector[0] because \K does not affect the result of a partial match.
-After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain +After a UTF check failure, pcre2_get_startchar() can be used to obtain the code unit offset of the invalid UTF character. Details are given in the pcre2unicode page. @@ -2245,11 +2458,13 @@ page.
ERROR RETURNS FROM pcre2_match()
If pcre2_match() fails, it returns a negative number. This can be -converted to a text string by calling pcre2_get_error_message(). Negative -error codes are also returned by other functions, and are documented with them. -The codes are given names in the header file. If UTF checking is in force and -an invalid UTF subject string is detected, one of a number of UTF-specific -negative error codes is returned. Details are given in the +converted to a text string by calling the pcre2_get_error_message() +function (see "Obtaining a textual error message" +below). +Negative error codes are also returned by other functions, and are documented +with them. The codes are given names in the header file. If UTF checking is in +force and an invalid UTF subject string is detected, one of a number of +UTF-specific negative error codes is returned. Details are given in the pcre2unicode page. The following are the other errors that may be returned by pcre2_match(): @@ -2350,8 +2565,29 @@ is attempted. PCRE2_ERROR_RECURSIONLIMIT
+int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, + PCRE2_SIZE bufflen); +
++A text message for an error code from any PCRE2 function (compile, match, or +auxiliary) can be obtained by calling pcre2_get_error_message(). The code +is passed as the first argument, with the remaining two arguments specifying a +code unit buffer and its length, into which the text message is placed. Note +that the message is returned in code units of the appropriate width for the +library that is being used. +
++The returned message is terminated with a trailing zero, and the function +returns the number of code units used, excluding the trailing zero. If the +error number is unknown, the negative error code PCRE2_ERROR_BADDATA is +returned. If the buffer is too small, the message is truncated (but still with +a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. +None of the messages are very long; a buffer size of 120 code units is ample.
-int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length); @@ -2448,7 +2684,7 @@ The substring did not participate in the match. For example, if the pattern is (abc)|(def) and the subject is "def", and the ovector contains at least two capturing slots, substring number 1 is unset.
-int pcre2_substring_list_get(pcre2_match_data *match_data, " PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); @@ -2487,7 +2723,7 @@ can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contain PCRE2_UNSET for unset substrings, or by calling pcre2_substring_length_bynumber().
-int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name); @@ -2547,37 +2783,22 @@ names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for subpatterns of the same number causes an error at compile time.
-int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext, PCRE2_SPTR \fIreplacementzfP, + pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *\fIoutputbuffer\zfP, PCRE2_SIZE *outlengthptr); +
+This function calls pcre2_match() and then makes a copy of the subject string in outputbuffer, replacing the part that was matched with the replacement string, whose length is supplied in rlength. This can -be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. -
--In the replacement string, which is interpreted as a UTF string in UTF mode, -and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a -dollar character is an escape character that can specify the insertion of -characters from capturing groups in the pattern. The following forms are -recognized: -
- $$ insert a dollar character
- $<n> insert the contents of group <n>
- ${<n>} insert the contents of group <n>
-
-Either a group number or a group name can be given for <n>. Curly brackets are
-required only if the following character would be interpreted as part of the
-number or name. The number may be zero to include the entire matched string.
-For example, if the pattern a(b)c is matched with "=abc=" and the replacement
-string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
-calling pcre2_copy_byname() or pcre2_copy_bynumber() as
-appropriate.
+be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
+which a \K item in a lookahead in the pattern causes the match to end before
+it starts are not supported, and give rise to an error return.
The first seven arguments of pcre2_substitute() are the same as for @@ -2588,27 +2809,188 @@ functions from the match context, if provided, or else those that were used to allocate memory for the compiled code.
-There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the -function to iterate over the subject string, replacing every matching -substring. If this is not set, only the first matching substring is replaced. -
-The outlengthptr argument must point to a variable that contains the -length, in code units, of the output buffer. It is updated to contain the -length of the new string, excluding the trailing zero that is automatically -added. +length, in code units, of the output buffer. If the function is successful, the +value is updated to contain the length of the new string, excluding the +trailing zero that is automatically added.
-The function returns the number of replacements that were made. This may be -zero if no matches were found, and is never greater than 1 unless -PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code -is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any -errors from pcre2_match() or the substring copying functions are passed -straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid -replacement string (unrecognized sequence following a dollar sign), and -PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. +If the function is not successful, the value set via outlengthptr depends +on the type of error. For syntax errors in the replacement string, the value is +the offset in the replacement string where the error was detected. For other +errors, the value is PCRE2_UNSET by default. This includes the case of the +output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set +(see below), in which case the value is the minimum length needed, including +space for the trailing zero. Note that in order to compute the required length, +pcre2_substitute() has to simulate all the matching and copying, instead +of giving an error return as soon as the buffer overflows. Note also that the +length is in code units, not bytes.
-+In the replacement string, which is interpreted as a UTF string in UTF mode, +and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a +dollar character is an escape character that can specify the insertion of +characters from capturing groups or (*MARK) items in the pattern. The following +forms are always recognized: +
+ $$ insert a dollar character
+ $<n> or ${<n>} insert the contents of group <n>
+ $*MARK or ${*MARK} insert the name of the last (*MARK) encountered
+
+Either a group number or a group name can be given for <n>. Curly brackets are
+required only if the following character would be interpreted as part of the
+number or name. The number may be zero to include the entire matched string.
+For example, if the pattern a(b)c is matched with "=abc=" and the replacement
+string "+$1$0$1+", the result is "=+babcb+=".
+
++The facility for inserting a (*MARK) name can be used to perform simple +simultaneous substitutions, as this pcre2test example shows: +
+ /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
+ apple lemon
+ 2: pear orange
+
+As well as the usual options for pcre2_match(), a number of additional
+options can be set in the options argument.
+
++PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string, +replacing every matching substring. If this is not set, only the first matching +substring is replaced. If any matched substring has zero length, after the +substitution has happened, an attempt to find a non-empty match at the same +position is performed. If this is not successful, the current position is +advanced by one character except when CRLF is a valid newline sequence and the +next two characters are CR, LF. In this case, the current position is advanced +by two characters. +
++PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is +too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If +this option is set, however, pcre2_substitute() continues to go through +the motions of matching and substituting (without, of course, writing anything) +in order to compute the size of buffer that is needed. This value is passed +back via the outlengthptr variable, with the result of the function still +being PCRE2_ERROR_NOMEMORY. +
++Passing a buffer size of zero is a permitted way of finding out how much memory +is needed for a given substitution. However, this does mean that the entire +operation is carried out twice. Depending on the application, it may be more +efficient to allocate a large buffer and free the excess afterwards, instead of +using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. +
++PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do +not appear in the pattern to be treated as unset groups. This option should be +used with care, because it means that a typo in a group name or number no +longer causes the PCRE2_ERROR_NOSUBSTRING error. +
++PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown +groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty +strings when inserted as described above. If this option is not set, an attempt +to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does +not influence the extended substitution syntax described below. +
++PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the +replacement string. Without this option, only the dollar character is special, +and only the group insertion forms listed above are valid. When +PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +
++Firstly, backslash in a replacement string is interpreted as an escape +character. The usual forms such as \n or \x{ddd} can be used to specify +particular character codes, and backslash followed by any non-alphanumeric +character quotes that character. Extended quoting can be coded using \Q...\E, +exactly as in pattern strings. +
++There are also four escape sequences for forcing the case of inserted letters. +The insertion mechanism has three states: no case forcing, force upper case, +and force lower case. The escape sequences change the current state: \U and +\L change to upper or lower case forcing, respectively, and \E (when not +terminating a \Q quoted sequence) reverts to no case forcing. The sequences +\u and \l force the next character (if it is a letter) to upper or lower +case, respectively, and then the state automatically reverts to no case +forcing. Case forcing applies to all inserted characters, including those from +captured groups and letters within \Q...\E quoted sequences. +
++Note that case forcing sequences such as \U...\E do not nest. For example, +the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no +effect. +
++The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +flexibility to group substitution. The syntax is similar to that used by Bash: +
+ ${<n>:-<string>}
+ ${<n>:+<string1>:<string2>}
+
+As before, <n> may be a group number or a name. The first form specifies a
+default value. If group <n> is set, its value is inserted; if not, <string> is
+expanded and the result inserted. The second form specifies strings that are
+expanded and inserted when group <n> is set or unset, respectively. The first
+form is just a convenient shorthand for
+
+ ${<n>:+${<n>}:<string>}
+
+Backslash can be used to escape colons and closing curly brackets in the
+replacement strings. A change of the case forcing state within a replacement
+string remains in force afterwards, as shown in this pcre2test example:
+
+ /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
+ body
+ 1: hello
+ somebody
+ 1: HELLO
+
+The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
+substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
+groups in the extended syntax forms to be treated as unset.
+
++If successful, pcre2_substitute() returns the number of replacements that +were made. This may be zero if no matches were found, and is never greater than +1 unless PCRE2_SUBSTITUTE_GLOBAL is set. +
++In the event of an error, a negative error code is returned. Except for +PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() +are passed straight back. +
++PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion, +unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. +
++PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an +unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple +(non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set. +
++PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is +needed is returned via outlengthptr. Note that this does not happen by +default. +
++PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the +replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE +(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket +not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group +substitution), and PCRE2_ERROR_BADSUBPATTERN (the pattern match ended before it +started, which can happen if \K is used in an assertion). +
++As for all PCRE2 errors, a text message that describes the error can be +obtained by calling the pcre2_get_error_message() function (see +"Obtaining a textual error message" +above). +
+int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); @@ -2647,13 +3029,13 @@ function returns the length of each entry in code units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
-The format of the name table is described above in the section entitled -Information about a pattern -above. -Given all the relevant entries for the name, you can extract each of their -numbers, and hence the captured data. +The format of the name table is described +above +in the section entitled Information about a pattern. Given all the +relevant entries for the name, you can extract each of their numbers, and hence +the captured data.
-The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the subject. If you want to @@ -2671,7 +3053,7 @@ substring. Then return 1, which forces pcre2_match() to backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
-int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -2838,8 +3220,8 @@ There are in addition the following errors that are specific to PCRE2_ERROR_DFA_UITEM This return is given if pcre2_dfa_match() encounters an item in the -pattern that it does not support, for instance, the use of \C or a back -reference. +pattern that it does not support, for instance, the use of \C in a UTF mode or +a back reference.
PCRE2_ERROR_DFA_UCOND@@ -2866,13 +3248,13 @@ some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given. -
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3), pcre2unicode(3).
-
Philip Hazel
@@ -2881,11 +3263,11 @@ University Computing Service
Cambridge, England.
-Last updated: 22 April 2015
+Last updated: 23 December 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2build.html b/pcre2/doc/html/pcre2build.html index 8d9f9ce9e..2e75505f6 100644 --- a/pcre2/doc/html/pcre2build.html +++ b/pcre2/doc/html/pcre2build.html @@ -18,23 +18,26 @@ please consult the man page, in case the conversion went wrong.
@@ -148,13 +151,19 @@ properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP).
+The \C escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the current matching -point in the middle of a multi-code-unit character. It can be locked out by -setting the PCRE2_NEVER_BACKSLASH_C option. +point in the middle of a multi-code-unit character. The application can lock it +out by setting the PCRE2_NEVER_BACKSLASH_C option when calling +pcre2_compile(). There is also a build-time option +
+ --enable-never-backslash-C ++(note the upper case C) which locks out the use of \C entirely. -
Just-in-time compiler support is included in the build by specifying
@@ -171,7 +180,7 @@ pcre2grep automatically makes use of it, unless you addto the "configure" command. -
By default, PCRE2 interprets the linefeed (LF) character as indicating the end of a line. This is the normal newline character on Unix-like systems. You can @@ -208,7 +217,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be overridden by applications that use the library. At build time it is conventional to use the standard for your operating system.
-By default, the sequence \R in a pattern matches any Unicode newline sequence, independently of what has been selected as the line ending sequence. If you @@ -220,7 +229,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is selected when PCRE2 is built can be overridden by applications that use the called.
-Within a compiled pattern, offset values are used to point from one part to another (for example, from an opening parenthesis to an alternation @@ -239,7 +248,7 @@ longer offsets slows down the operation of PCRE2 because it has to load additional data when handling them. For the 32-bit library the value is always 4 and cannot be overridden; the value of --with-link-size is ignored.
-When matching with the pcre2_match() function, PCRE2 implements backtracking by making recursive calls to an internal function called @@ -261,7 +270,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably more slowly when built in this way. This option affects only the pcre2_match() function; it is not relevant for pcre2_dfa_match().
-Internally, PCRE2 has a function called match(), which it calls repeatedly (sometimes recursively) when matching a pattern with the @@ -290,7 +299,7 @@ constraints. However, you can set a lower limit by adding, for example, to the configure command. This value can also be overridden at run time.
-PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are distributed @@ -307,7 +316,7 @@ compiling, because dftables is run on the local host. If you need to create alternative tables when cross compiling, you will have to do so "by hand".)
-PCRE2 assumes by default that it will run in an environment where the character code is ASCII or Unicode, which is a superset of ASCII. This is the case for @@ -342,7 +351,16 @@ The options that select newline behaviour, such as --enable-newline-is-cr, and equivalent run-time options, refer to these character values in an EBCDIC environment.
-+By default, on non-Windows systems, pcre2grep supports the use of +callouts with string arguments within the patterns it is matching, in order to +run external scripts. For details, see the +pcre2grep +documentation. This support can be disabled by adding +--disable-pcre2grep-callout to the configure command. +
+By default, pcre2grep reads all files as plain text. You can build it so that it recognizes files whose names end in .gz or .bz2, and reads @@ -355,22 +373,25 @@ to the configure command. These options naturally require that the relevant libraries are installed on your system. Configuration will fail if they are not.
-pcre2grep uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it -finds a match. The size of the buffer is controlled by a parameter whose -default value is 20K. The buffer itself is three times this size, but because -of the way it is used for holding "before" lines, the longest line that is -guaranteed to be processable is the parameter size. You can change the default -parameter value by adding, for example, +finds a match. The starting size of the buffer is controlled by a parameter +whose default value is 20K. The buffer itself is three times this size, but +because of the way it is used for holding "before" lines, the longest line that +is guaranteed to be processable is the parameter size. If a longer line is +encountered, pcre2grep automatically expands the buffer, up to a +specified maximum size, whose default is 1M or the starting size, whichever is +the larger. You can change the default parameter values by adding, for example,
- --with-pcre2grep-bufsize=50K + --with-pcre2grep-bufsize=51200 + --with-pcre2grep-max-bufsize=2097152-to the configure command. The caller of \fPpcre2grep\fP can override this -value by using --buffer-size on the command line.. +to the configure command. The caller of pcre2grep can override +these values by using --buffer-size and --max-buffer-size on the command line. -
If you add one of
@@ -404,7 +425,7 @@ automatically included, you may need to add something likeimmediately before the configure command. -
If you add
@@ -413,7 +434,7 @@ If you add to the configure command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers. -This indicates that when matching [bc] fails, there is no backtracking into a+ -and therefore the callouts that would be taken for the backtracks do not occur. -You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to -pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In this -case, the output changes to this: +(because it is being treated as a++) and therefore the callouts that would be +taken for the backtracks do not occur. You can disable the auto-possessify +feature by passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting +the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
DEBUGGING WITH VALGRIND SUPPORT
+
DEBUGGING WITH VALGRIND SUPPORT
If you add
@@ -423,7 +444,7 @@ to the configure command, PCRE2 will use valgrind annotations to mark certain memory regions as unaddressable. This allows it to detect invalid memory accesses, and is mostly useful for debugging PCRE2 itself. -If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each item in the -pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern +pattern except for immediately before or after a callout item in the pattern. +For example, if PCRE2_AUTO_CALLOUT is used with the pattern +
CODE COVERAGE REPORTING
+
CODE COVERAGE REPORTING
If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install @@ -480,11 +501,32 @@ This cleans all coverage data including the generated coverage report. For more information about code coverage, see the gcov and lcov documentation.
-
SEE ALSO
+
SUPPORT FOR FUZZERS
++There is a special option for use by people who want to run fuzzing tests on +PCRE2: +
+ --enable-fuzz-support ++At present this applies only to the 8-bit library. If set, it causes an extra +library called libpcre2-fuzzsupport.a to be built, but not installed. This +contains a single function called LLVMFuzzerTestOneInput() whose arguments are +a pointer to a string and the length of the string. When called, this function +tries to compile the string as a pattern, and if that succeeds, to match it. +This is done both with no options and with some random option bits that are +generated from the string. Setting --enable-fuzz-support also causes a binary +called pcre2fuzzcheck to be created. This is normally run under valgrind +or used when PCRE2 is compiled with address sanitizing enabled. It calls the +fuzzing function and outputs information about what it is doing. The input strings +are specified by arguments: if an argument starts with "=" the rest of it is a +literal input string. Otherwise, it is assumed to be a file name, and the +contents of the file are the test string. + +
SEE ALSO
pcre2api(3), pcre2-config(3).
-
AUTHOR
+
AUTHOR
Philip Hazel
-
@@ -493,11 +535,11 @@ University Computing Service Cambridge, England.
REVISION
+
REVISION
-Last updated: 24 April 2015 +Last updated: 01 November 2016
-Copyright © 1997-2015 University of Cambridge. +Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2callout.html b/pcre2/doc/html/pcre2callout.html index 7e85c9a39..4e307f778 100644 --- a/pcre2/doc/html/pcre2callout.html +++ b/pcre2/doc/html/pcre2callout.html @@ -57,11 +57,20 @@ two callout points:
+ A(?C3)B ++it is processed as if it were ++ (?C255)A(?C3)B(?C255) ++Here is a more complicated example:A(\d{2}|--)-it is processed as if it were +With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) @@ -107,10 +116,10 @@ with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string No match
--->aaaa
+0 ^ a+
@@ -235,8 +244,8 @@ Fields for numerical callouts
For a numerical callout, callout_string is NULL, and callout_number
contains the number of the callout, in the range 0-255. This is the number
-that follows (?C for manual callouts; it is 255 for automatically generated
-callouts.
+that follows (?C for callouts that are part of the pattern; it is 255 for
+automatically generated callouts.
Fields for string callouts
@@ -310,10 +319,15 @@ the next item to be matched.
The next_item_length field contains the length of the next item to be
-matched in the pattern string. When the callout immediately precedes an
-alternation bar, a closing parenthesis, or the end of the pattern, the length
-is zero. When the callout precedes an opening parenthesis, the length is that
-of the entire subpattern.
+processed in the pattern string. When the callout is at the end of the pattern,
+the length is zero. When the callout precedes an opening parenthesis, the
+length includes meta characters that follow the parenthesis. For example, in a
+callout before an assertion such as (?=ab) the length is 3. For an
+alternation bar or a closing parenthesis, the length is one, unless a closing
+parenthesis is followed by a quantifier, in which case its length is included.
+(This changed in release 10.23. In earlier releases, before an opening
+parenthesis the length was that of the entire subpattern, and before an
+alternation bar or a closing parenthesis the length was zero.)
The pattern_position and next_item_length fields are intended to
@@ -399,9 +413,9 @@ Cambridge, England.
REVISION
-Last updated: 23 March 2015
+Last updated: 29 September 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page.
diff --git a/pcre2/doc/html/pcre2compat.html b/pcre2/doc/html/pcre2compat.html
index 3b29e6fa2..993dfd1d0 100644
--- a/pcre2/doc/html/pcre2compat.html
+++ b/pcre2/doc/html/pcre2compat.html
@@ -107,7 +107,7 @@ processed as anchored at the point where they are tested.
one that is backtracked onto acts. For example, in the pattern
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
-same as PCRE2, but there are examples where it differs.
+same as PCRE2, but there are cases where it differs.
11. Most backtracking verbs in assertions have their normal actions. They are
@@ -123,7 +123,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern
names is not as general as Perl's. This is a consequence of the fact the PCRE2
works internally just with numbers, using an external table to translate
-between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b)B),
+between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B),
where the two capturing parentheses have the same number but different names,
is not supported, and causes an error at compile time. If it were allowed, it
would not be possible to distinguish which parentheses matched, because both
@@ -131,10 +131,11 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
an error is given at compile time.
-14. Perl recognizes comments in some places that PCRE2 does not, for example,
-between the ( and ? at the start of a subpattern. If the /x modifier is set,
-Perl allows white space between ( and ? (though current Perls warn that this is
-deprecated) but PCRE2 never does, even if the PCRE2_EXTENDED option is set.
+14. Perl used to recognize comments in some places that PCRE2 does not, for
+example, between the ( and ? at the start of a subpattern. If the /x modifier
+is set, Perl allowed white space between ( and ? though the latest Perls give
+an error (for a while it was just deprecated). There may still be some cases
+where Perl behaves differently.
15. Perl, when in warning mode, gives warnings for character classes such as
@@ -161,42 +162,47 @@ each alternative branch of a lookbehind assertion can match a different length
of string. Perl requires them all to have the same length.
-(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
+(b) From PCRE2 10.23, back references to groups of fixed length are supported
+in lookbehinds, provided that there is no possibility of referencing a
+non-unique number or name. Perl does not support backreferences in lookbehinds.
+
+
+(c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
-(c) A backslash followed by a letter with no special meaning is faulted. (Perl
+(d) A backslash followed by a letter with no special meaning is faulted. (Perl
can be made to issue a warning.)
-(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
+(e) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
inverted, that is, by default they are not greedy, but if followed by a
question mark they are.
-(e) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
+(f) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
only at the first matching position in the subject string.
-(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
+(g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents.
-(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
+(h) The \R escape sequence can be restricted to match only CR, LF, or CRLF
by the PCRE2_BSR_ANYCRLF option.
-(h) The callout facility is PCRE2-specific.
+(i) The callout facility is PCRE2-specific.
-(i) The partial matching facility is PCRE2-specific.
+(j) The partial matching facility is PCRE2-specific.
-(j) The alternative matching function (pcre2_dfa_match() matches in a
+(k) The alternative matching function (pcre2_dfa_match()) matches in a
different way and is not Perl-compatible.
-(k) PCRE2 recognizes some special sequences such as (*CR) at the start of
+(l) PCRE2 recognizes some special sequences such as (*CR) at the start of
a pattern that set overall options that cannot be changed within the pattern.
@@ -214,9 +220,9 @@ Cambridge, England.
REVISION
-Last updated: 15 March 2015
+Last updated: 18 October 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page.
diff --git a/pcre2/doc/html/pcre2demo.html b/pcre2/doc/html/pcre2demo.html
index 5919117a1..d64e16be1 100644
--- a/pcre2/doc/html/pcre2demo.html
+++ b/pcre2/doc/html/pcre2demo.html
@@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
-calling the PCRE2 regular expression library from a C program. See the
+using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
-width. This demonstration program uses the 8-bit library.
+width. This demonstration program uses the 8-bit library. The default is to
+process each code unit as a separate character, but if the pattern begins with
+"(*UTF)", both it and the subject are treated as UTF-8 strings, where
+characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
-gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
+cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
-gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
+cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
-If you do not have pkg-config, you may have to use this:
+If you do not have pkg-config, you may have to use something like this:
-gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
+cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */
-/* This macro must be defined before including pcre2.h. For a program that uses
-only one code unit width, it makes it possible to use generic function names
-such as pcre2_compile(). */
+/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
+For a program that uses only one code unit width, setting it to 8, 16, or 32
+makes it possible to use generic function names such as pcre2_compile(). Note
+that just changing 8 to 16 (for example) is not sufficient to convert this
+program to process 16-bit characters. Even in a fully 16-bit environment, where
+string-handling functions such as strcmp() and printf() work with 16-bit
+characters, the code for handling the table of named substrings will still need
+to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@@ -79,19 +87,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
-PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
+PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
-int namecount;
-int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
+uint32_t namecount;
+uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@@ -106,15 +114,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
-* if the -g option is present. Apart from that, there must be exactly two *
-* arguments. *
+* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
- else break;
+ else if (argv[i][0] == '-')
+ {
+ printf("Unrecognised option %s\n", argv[i]);
+ return 1;
+ }
+ else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
@@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2)
{
- printf("Two arguments required: a regex and a subject string\n");
+ printf("Exactly two arguments required: a regex and a subject string\n");
return 1;
}
@@ -201,7 +213,7 @@ if (rc < 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
-printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
+printf("Match succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
@@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
-if (namecount <= 0) printf("No named substrings\n"); else
+if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
@@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;)
{
- uint32_t options = 0; /* Normally no options */
- PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
+ uint32_t options = 0; /* Normally no options */
+ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
@@ -371,7 +383,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
- if (crlf_is_newline && /* If CRLF is newline & */
+ if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\r' &&
subject[start_offset + 1] == '\n')
@@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
- if (namecount <= 0) printf("No named substrings\n"); else
+ if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");
diff --git a/pcre2/doc/html/pcre2grep.html b/pcre2/doc/html/pcre2grep.html
index dcfb96f34..c5d1a33d7 100644
--- a/pcre2/doc/html/pcre2grep.html
+++ b/pcre2/doc/html/pcre2grep.html
@@ -22,11 +22,12 @@ please consult the man page, in case the conversion went wrong.
@@ -79,11 +80,19 @@ span line boundaries. What defines a line boundary is controlled by the
The amount of memory used for buffering files that are being scanned is -controlled by a parameter that can be set by the --buffer-size option. -The default value for this parameter is specified when pcre2grep is -built, with the default default being 20K. A block of memory three times this -size is used (to allow for buffering "before" and "after" lines). An error -occurs if a line overflows the buffer. +controlled by parameters that can be set by the --buffer-size and +--max-buffer-size options. The first of these sets the size of buffer +that is obtained at the start of processing. If an input file contains very +long lines, a larger buffer may be needed; this is handled by automatically +extending the buffer, up to the limit specified by --max-buffer-size. The +default values for these parameters are specified when pcre2grep is +built, with the default defaults being 20K and 1M respectively. An error occurs +if a line is too long and the buffer can no longer be expanded. +
++The block of memory that is actually used is three times the "buffer size", to +allow for buffering "before" and "after" lines. If the buffer size is too +small, fewer than requested "before" and "after" lines may be output.
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater. @@ -154,12 +163,13 @@ processing of patterns and file names that start with hyphens.
-A number, --after-context=number -Output number lines of context after each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of number is expected to be relatively small. However, pcre2grep -guarantees to have up to 8K of following text available for context output. +Output up to number lines of context after each matching line. Fewer +lines are output if the next match or the end of the file is reached, or if the +processing buffer size has been set too small. If file names and/or line +numbers are being output, a hyphen separator is used instead of a colon for the +context lines. A line containing "--" is output between each group of lines, +unless they are in fact contiguous in the input file. The value of number +is expected to be relatively small. When -c is used, -A is ignored.
-a, --text @@ -168,12 +178,14 @@ Treat binary files as text. This is equivalent to
-B number, --before-context=number -Output number lines of context before each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of number is expected to be relatively small. However, pcre2grep -guarantees to have up to 8K of preceding text available for context output. +Output up to number lines of context before each matching line. Fewer +lines are output if the previous match or the start of the file is within +number lines, or if the processing buffer size has been set too small. If +file names and/or line numbers are being output, a hyphen separator is used +instead of a colon for the context lines. A line containing "--" is output +between each group of lines, unless they are in fact contiguous in the input +file. The value of number is expected to be relatively small. When +-c is used, -B is ignored.
--binary-files=word @@ -190,8 +202,9 @@ return code.
--buffer-size=number -Set the parameter that controls how much memory is used for buffering files -that are being scanned. +Set the parameter that controls how much memory is obtained at the start of +processing for buffering files that are being scanned. See also +--max-buffer-size below.
-C number, --context=number @@ -201,14 +214,16 @@ This is equivalent to setting both -A and -B to the same value.
-c, --count
Do not output lines from the files that are being scanned; instead output the
-number of matches (or non-matches if -v is used) that would otherwise
-have caused lines to be shown. By default, this count is the same as the number
-of suppressed lines, but if the -M (multiline) option is used (without
--v), there may be more suppressed lines than the number of matches.
+number of lines that would have been shown, either because they matched, or, if
+-v is set, because they failed to match. By default, this count is
+exactly the same as the number of lines that would have been output, but if the
+-M (multiline) option is used (without -v), there may be more
+suppressed lines than the count (that is, the number of matches).
If no lines are selected, the number zero is output. If several files are are
-being scanned, a count is output for each of them. However, if the
+being scanned, a count is output for each of them and the -t option can
+be used to cause a total to be output at the end. However, if the
--files-with-matches option is also used, only those files whose counts
are greater than zero are listed. When -c is used, the -A,
-B, and -C options are ignored.
@@ -230,12 +245,23 @@ because pcre2grep has to search for all possible matches in a line, not
just one, in order to colour them all.
-The colour that is used can be specified by setting the environment variable
-PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
-string of two numbers, separated by a semicolon. They are copied directly into
-the control string for setting colour on a terminal, so it is your
-responsibility to ensure that they make sense. If neither of the environment
-variables is set, the default is "1;31", which gives red.
+The colour that is used can be specified by setting one of the environment
+variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or
+PCREGREP_COLOR, which are checked in that order. If none of these are set,
+pcre2grep looks for GREP_COLORS or GREP_COLOR (in that order). The value
+of the variable should be a string of two numbers, separated by a semicolon,
+except in the case of GREP_COLORS, which must start with "ms=" or "mt="
+followed by two semicolon-separated colours, terminated by the end of the
+string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is
+ignored, and GREP_COLOR is checked.
+
+
+If the string obtained from one of the above variables contains any characters
+other than semicolon or digits, the setting is ignored and the default colour
+is used. The string is copied directly into the control string for setting
+colour on a terminal, so it is your responsibility to ensure that the values
+make sense. If no relevant environment variable is set, the default is "1;31",
+which gives red.
-D action, --devices=action @@ -320,18 +346,18 @@ files; it does not apply to patterns specified by any of the --include or
-f filename, --file=filename
-Read patterns from the file, one per line, and match them against
-each line of input. What constitutes a newline when reading the file is the
-operating system's default. The --newline option has no effect on this
-option. Trailing white space is removed from each line, and blank lines are
-ignored. An empty file contains no patterns and therefore matches nothing. See
-also the comments about multiple patterns versus a single pattern with
-alternatives in the description of -e above.
+Read patterns from the file, one per line, and match them against each line of
+input. What constitutes a newline when reading the file is the operating
+system's default. The --newline option has no effect on this option.
+Trailing white space is removed from each line, and blank lines are ignored. An
+empty file contains no patterns and therefore matches nothing. See also the
+comments about multiple patterns versus a single pattern with alternatives in
+the description of -e above.
-If this option is given more than once, all the specified files are
-read. A data line is output if any of the patterns match it. A file name can
-be given as "-" to refer to the standard input. When -f is used, patterns
+If this option is given more than once, all the specified files are read. A
+data line is output if any of the patterns match it. A file name can be given
+as "-" to refer to the standard input. When -f is used, patterns
specified on the command line using -e may also be present; they are
tested before the file's patterns. However, no other pattern is taken from the
command line; all arguments are treated as the names of paths to be searched.
@@ -501,19 +527,27 @@ There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million.
+--max-buffer-size=number +This limits the expansion of the processing buffer, whose initial size can be +set by --buffer-size. The maximum buffer size is silently forced to be no +smaller than the starting buffer size. +
+
-M, --multiline
-Allow patterns to match more than one line. When this option is given, patterns
-may usefully contain literal newline characters and internal occurrences of ^
-and $ characters. The output for a successful match may consist of more than
-one line. The first is the line in which the match started, and the last is the
-line in which the match ended. If the matched string ends with a newline
-sequence the output ends at the end of that line.
+Allow patterns to match more than one line. When this option is set, the PCRE2
+library is called in "multiline" mode. This allows a matched string to extend
+past the end of a line and continue on one or more subsequent lines. Patterns
+used with -M may usefully contain literal newline characters and internal
+occurrences of ^ and $ characters. The output for a successful match may
+consist of more than one line. The first line is the line in which the match
+started, and the last line is the line in which the match ended. If the matched
+string ends with a newline sequence, the output ends at the end of that line.
+If -v is set, none of the lines in a multi-line match are output. Once a
+match has been handled, scanning restarts at the beginning of the line after
+the one in which the match ended.
-When this option is set, the PCRE2 library is called in "multiline" mode.
-However, pcre2grep still processes the input line by line. The difference
-is that a matched string may extend past the end of a line and continue on
-one or more subsequent lines. The newline sequence must be matched as part of
+The newline sequence that separates multiple lines must be matched as part of
the pattern. For example, to find the phrase "regular expression" in a file
where "regular" might be at the end of a line and "expression" at the start of
the next line, you could use this command:
@@ -526,11 +560,8 @@ well as possibly handling a two-character newline sequence.
There is a limit to the number of lines that can be matched, imposed by the way
-that pcre2grep buffers the input file as it scans it. However,
-pcre2grep ensures that at least 8K characters or the rest of the file
-(whichever is the shorter) are available for forward matching, and similarly
-the previous 8K characters (or all the previous characters, if fewer than 8K)
-are guaranteed to be available for lookbehind assertions. The -M option
+that pcre2grep buffers the input file as it scans it. With a sufficiently
+large processing buffer, this should not be a problem, but the -M option
does not work when input is read line by line (see \fP--line-buffered\fP.)
@@ -578,12 +609,13 @@ It should never be needed in normal use. Show only the part of the line that matched a pattern instead of the whole line. In this mode, no context is shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each -of them is shown separately. If -o is combined with -v (invert the -sense of the match to find non-matching lines), no output is generated, but the -return code is set appropriately. If the matched portion of the line is empty, -nothing is output unless the file name or line number are being printed, in -which case they are shown on an otherwise empty line. This option is mutually -exclusive with --file-offsets and --line-offsets. +of them is shown separately, on a separate line of output. If -o is +combined with -v (invert the sense of the match to find non-matching +lines), no output is generated, but the return code is set appropriately. If +the matched portion of the line is empty, nothing is output unless the file +name or line number are being printed, in which case they are shown on an +otherwise empty line. This option is mutually exclusive with +--file-offsets and --line-offsets.
-onumber, --only-matching=number
@@ -597,10 +629,11 @@ capturing parentheses do not exist in the pattern, or were not set in the
match, nothing is output unless the file name or line number are being output.
-If this option is given multiple times, multiple substrings are output, in the
-order the options are given. For example, -o3 -o1 -o3 causes the substrings
-matched by capturing parentheses 3 and 1 and then 3 again to be output. By
-default, there is no separator (but see the next option).
+If this option is given multiple times, multiple substrings are output for each
+match, in the order the options are given, and all on one line. For example,
+-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
+then 3 again to be output. By default, there is no separator (but see the next
+option).
--om-separator=text @@ -631,6 +664,18 @@ quietly skipped. However, the return code is still 2, even if matches were found in other files.
+-t, --total-count +This option is useful when scanning more than one file. If used on its own, +-t suppresses all output except for a grand total number of matching +lines (or non-matching lines if -v is used) in all the files. If -t +is used with -c, a grand total is output except when the previous output +is just one line. In other words, it is not output when just one file's count +is listed. If file names are being output, the grand total is preceded by +"TOTAL:". Otherwise, it appears as just another number. The -t option is +ignored when used with -L (list files without matches), because the grand +total would always be zero. +
+-u, --utf-8 Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including those for any --exclude and @@ -658,11 +703,12 @@ specified by any of the --include or --exclude options.
-x, --line-regex, --line-regexp Force the patterns to be anchored (each must start matching at the beginning of -a line) and in addition, require them to match entire lines. This is equivalent -to having ^ and $ characters at the start and end of each alternative top-level -branch in every pattern. This option applies only to the patterns that are -matched against the contents of files; it does not apply to patterns specified -by any of the --include or --exclude options. +a line) and in addition, require them to match entire lines. In multiline mode +the match may be more than one line. This is equivalent to having \A and \Z +characters at the start and end of each alternative top-level branch in every +pattern. This option applies only to the patterns that are matched against the +contents of files; it does not apply to patterns specified by any of the +--include or --exclude options.
@@ -735,7 +781,57 @@ The exceptions to the above are the --colour (or --color) and options does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data.
-+pcre2grep has, by default, support for calling external programs or +scripts during matching by making use of PCRE2's callout facility. However, +this support can be disabled when pcre2grep is built. You can find out +whether your binary has support for callouts by running it with the --help +option. If the support is not enabled, all callouts in patterns are ignored by +pcre2grep. +
++A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is +either a number or a quoted string (see the +pcre2callout +documentation for details). Numbered callouts are ignored by pcre2grep. +String arguments are parsed as a list of substrings separated by pipe (vertical +bar) characters. The first substring must be an executable name, with the +following substrings specifying arguments: +
+ executable_name|arg1|arg2|... ++Any substring (including the executable name) may contain escape sequences +started by a dollar character: $<digits> or ${<digits>} is replaced by the +captured substring of the given decimal number, which must be greater than +zero. If the number is greater than the number of capturing substrings, or if +the capture is unset, the replacement is empty. + +
+Any other character is substituted by itself. In particular, $$ is replaced by +a single dollar and $| is replaced by a pipe character. Here is an example: +
+ echo -e "abcde\n12345" | pcre2grep \
+ '(?x)(.)(..(.))
+ (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -
+
+ Output:
+
+ Arg1: [a] [bcd] [d] Arg2: |a| ()
+ abcde
+ Arg1: [1] [234] [4] Arg2: |1| ()
+ 12345
+
+The parameters for the execv() system call that is used to run the
+program or script are zero-terminated strings. This means that binary zero
+characters in the callout argument will cause premature termination of their
+substrings, and therefore should not be present. Any syntax errors in the
+string (for example, a dollar not followed by another character) cause the
+callout to be ignored. If running the program fails for any reason (including
+the non-existence of the executable), a local matching failure occurs and the
+matcher backtracks in the normal way.
+
+It is possible to supply a regular expression that takes a very long time to fail to match certain lines. Such patterns normally involve nested indefinite @@ -751,7 +847,7 @@ overall resource limit; there is a second option called --recursion-limit that sets a limit on the amount of memory (usually stack) that is used (see the discussion of these options above).
-Exit status is 0 if any matches were found, 1 if no matches were found, and 2 for syntax errors, overlong lines, non-existent or inaccessible files (even if @@ -759,11 +855,11 @@ matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessible files does not affect the return code.
--pcre2pattern(3), pcre2syntax(3). +pcre2pattern(3), pcre2syntax(3), pcre2callout(3).
-
Philip Hazel
@@ -772,11 +868,11 @@ University Computing Service
Cambridge, England.
-Last updated: 03 January 2015
+Last updated: 31 December 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2jit.html b/pcre2/doc/html/pcre2jit.html index 9e3207340..4a6d4ff37 100644 --- a/pcre2/doc/html/pcre2jit.html +++ b/pcre2/doc/html/pcre2jit.html @@ -86,6 +86,13 @@ results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code.
+There is a limit to the size of pattern that JIT supports, imposed by the size +of machine stack that it uses. The exact rules are not documented because they +may change at any time, in particular, when new optimizations are introduced. +If a pattern is too big, a call to pcre2_jit_compile() returns +PCRE2_ERROR_NOMEMORY. +
+PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should set one or both @@ -145,6 +152,10 @@ PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED option is not supported at match time.
+If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the +use of JIT, forcing matching by the interpreter code. +
+The only unsupported pattern items are \C (match a single data unit) when running in a UTF mode, and a callout immediately before an assertion condition in a conditional group. @@ -224,8 +235,14 @@ whether a match operation was executed by JIT or by the interpreter.
You may safely use the same JIT stack for more than one pattern (either by -assigning directly or by callback), as long as the patterns are all matched -sequentially in the same thread. In a multithread application, if you do not +assigning directly or by callback), as long as the patterns are matched +sequentially in the same thread. Currently, the only way to set up +non-sequential matches in one thread is to use callouts: if a callout function +starts another match, that match must use a different JIT stack to the one used +for currently suspended match(es). +
++In a multithread application, if you do not specify a JIT stack, or if you assign or pass back NULL from a callback, that is thread-safe, because each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for @@ -390,7 +407,7 @@ The fast path function is called pcre2_jit_match(), and it takes exactly the same arguments as pcre2_match(). The return values are also the same, plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. Unsupported option bits (for example, -PCRE2_ANCHORED) are ignored. +PCRE2_ANCHORED) are ignored, as is the PCRE2_NO_JIT option.
When you call pcre2_match(), as well as testing for invalid options, a @@ -419,9 +436,9 @@ Cambridge, England.
-Last updated: 27 November 2014
+Last updated: 05 June 2016
-Copyright © 1997-2014 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2limits.html b/pcre2/doc/html/pcre2limits.html index b1c06f55f..d7e382bfc 100644 --- a/pcre2/doc/html/pcre2limits.html +++ b/pcre2/doc/html/pcre2limits.html @@ -32,6 +32,11 @@ However, the speed of execution is slower. In the 32-bit library, the internal linkage size is always 4.
+The maximum length of a source pattern string is essentially unlimited; it is +the largest number a PCRE2_SIZE variable can hold. However, the program that +calls pcre2_compile() can specify a smaller limit. +
+The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as size_t. Its maximum value (that is @@ -50,17 +55,16 @@ documentation. All values in repeating quantifiers must be less than 65536.
+The maximum length of a lookbehind assertion is 65535 characters. +
+There is no limit to the number of parenthesized subpatterns, but there can be no more than 65535 capturing subpatterns. There is, however, a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in -order to limit the amount of system stack used at compile time. The limit can -be specified when PCRE2 is built; the default is 250. -
--There is a limit to the number of forward references to subsequent subpatterns -of around 200,000. Repeated forward references with fixed upper limits, for -example, (?2){0,100} when subpattern number 2 is to the right, are included in -the count. There is no limit to the number of backward references. +order to limit the amount of system stack used at compile time. The default +limit can be specified when PCRE2 is built; if it is not, the default is 250. An +application can change this limit by calling pcre2_set_parens_nest_limit() to +set the limit in a compile context.
The maximum length of name for a named subpattern is 32 code units, and the @@ -68,7 +72,12 @@ maximum number of named subpatterns is 10000.
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb -is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries. +is 255 code units for the 8-bit library and 65535 code units for the 16-bit and +32-bit libraries. +
++The maximum length of a string argument to a callout is the largest number a +32-bit unsigned integer can hold.
-Last updated: 25 November 2014
+Last updated: 26 October 2016
-Copyright © 1997-2014 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2pattern.html b/pcre2/doc/html/pcre2pattern.html index a9ca60e62..58eb0e6d5 100644 --- a/pcre2/doc/html/pcre2pattern.html +++ b/pcre2/doc/html/pcre2pattern.html @@ -190,6 +190,12 @@ be less than the value set (or defaulted) by the caller of pcre2_match() for it to have any effect. In other words, the pattern writer can lower the limits set by the programmer, but not raise them. If there is more than one setting of one of these limits, the lower value is used. +
++The match limit is used (but in a different way) when JIT is being used, but it +is not relevant, and is ignored, when matching with pcre2_dfa_match(). +However, the recursion limit is relevant for DFA matching, which does use some +function recursion, in particular, for recursions within the pattern.
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c escape is processed as specified for Perl in the perlebcdic document. The only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any -other character provokes a compile-time error. The sequence \@ encodes -character code 0; the letters (in either case) encode characters 1-26 (hex 01 -to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and -\? becomes either 255 (hex FF) or 95 (hex 5F). +other character provokes a compile-time error. The sequence \c@ encodes +character code 0; after \c the letters (in either case) encode characters 1-26 +(hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex +1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).
-Thus, apart from \?, these escapes generate the same character code values as +Thus, apart from \c?, these escapes generate the same character code values as they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \G always generates code value 7, which is BEL in ASCII +differ. For example, \cG always generates code value 7, which is BEL in ASCII but DEL in EBCDIC.
-The sequence \? generates DEL (127, hex 7F) in an ASCII environment, but +The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but because 127 is not a control character in EBCDIC, Perl makes it generate the APC character. Unfortunately, there are several variants of EBCDIC. In most of them the APC character has the value 255 (hex FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \? generate 95; otherwise it generates 255. +values, PCRE2 makes \c? generate 95; otherwise it generates 255.
After \0 up to two further octal digits are read. If there are fewer than two
@@ -526,9 +531,9 @@ by code point, as described in the previous section.
Absolute and relative back references
-The sequence \g followed by an unsigned or a negative number, optionally -enclosed in braces, is an absolute or relative back reference. A named back -reference can be coded as \g{name}. Back references are discussed +The sequence \g followed by a signed or unsigned number, optionally enclosed +in braces, is an absolute or relative back reference. A named back reference +can be coded as \g{name}. Back references are discussed later, following the discussion of parenthesized subpatterns. @@ -669,8 +674,8 @@ This is an example of an "atomic group", details of which are given This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next -line, U+0085). The two-character sequence is treated as a single unit that -cannot be split. +line, U+0085). Because this is an atomic group, the two-character sequence is +treated as a single unit that cannot be split.
In other modes, two additional characters whose codepoints are greater than 255 @@ -736,6 +741,8 @@ Those that are not part of an identified script are lumped together as "Common". The current list of scripts is:
+Ahom, +Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, @@ -776,6 +783,7 @@ Gurmukhi, Han, Hangul, Hanunoo, +Hatran, Hebrew, Hiragana, Imperial_Aramaic, @@ -812,12 +820,14 @@ Miao, Modi, Mongolian, Mro, +Multani, Myanmar, Nabataean, New_Tai_Lue, Nko, Ogham, Ol_Chiki, +Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, @@ -839,6 +849,7 @@ Saurashtra, Sharada, Shavian, Siddham, +SignWriting, Sinhala, Sora_Sompeng, Sundanese, @@ -1180,6 +1191,16 @@ when the startoffset argument of pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.
+When the newline convention (see +"Newline conventions" +below) recognizes the two-character sequence CRLF as a newline, this is +preferred, even if the single characters CR and LF are also recognized as +newlines. For example, if the newline convention is "any", a multiline mode +circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after +CR, even though CR on its own is a valid newline. (It also matches at the very +start of the string, of course.) +
+Note that the sequences \A, \Z, and \z can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \A it is always anchored, whether or not PCRE2_MULTILINE is set. @@ -1230,20 +1251,32 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. This has undefined results, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's validity at the start of processing -unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the -use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. +unless the PCRE2_NO_UTF_CHECK option is used). +
++An application can lock out the use of \C by setting the +PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to +build PCRE2 with the use of \C permanently disabled.
PCRE2 does not allow \C to appear in lookbehind assertions (described below) -in a UTF mode, because this would make it impossible to calculate the length of -the lookbehind. +in UTF-8 or UTF-16 modes, because this would make it impossible to calculate +the length of the lookbehind. Neither the alternative matching function +pcre2_dfa_match() nor the JIT optimizer support \C in these UTF modes. +The former gives a match-time error; the latter fails to optimize and so the +match is always run using the interpreter. +
++In the 32-bit library, however, \C is always supported (when not explicitly +locked out) because it always matches a single code unit, whether or not UTF-32 +is specified.
In general, the \C escape sequence is best avoided. However, one way of using -it that avoids the problem of malformed UTF characters is to use a lookahead to -check the length of the next character, as in this pattern, which could be used -with a UTF-8 string (ignore white space and line breaks): +it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a +lookahead to check the length of the next character, as in this pattern, which +could be used with a UTF-8 string (ignore white space and line breaks):
(?| (?=[\x00-\x7f])(\C) |
(?=[\x80-\x{7ff}])(\C)(\C) |
@@ -1298,42 +1331,6 @@ whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A
class such as [^a] always matches one of these characters.
-The minus (hyphen) character can be used to specify a range of characters in a
-character class. For example, [d-m] matches any letter between d and m,
-inclusive. If a minus character is required in a class, it must be escaped with
-a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class, or
-immediately after a range. For example, [b-d-z] matches letters in the range b
-to d, a hyphen character, or z.
-
-
-It is not possible to have the literal character "]" as the end character of a
-range. A pattern such as [W-]46] is interpreted as a class of two characters
-("W" and "-") followed by a literal string "46]", so it would match "W46]" or
-"-46]". However, if the "]" is escaped with a backslash it is interpreted as
-the end of range, so [W-\]46] is interpreted as a class containing a range
-followed by two other characters. The octal or hexadecimal representation of
-"]" can also be used to end a range.
-
-
-An error is generated if a POSIX character class (see below) or an escape
-sequence other than one that defines a single character appears at a point
-where a range ending character is expected. For example, [z-\xff] is valid,
-but [A-\d] and [A-[:digit:]] are not.
-
-
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\000-\037]. Ranges
-can include any characters that are valid for the current mode.
-
-
-If a range that includes letters is used when caseless matching is set, it
-matches the letters in either case. For example, [W-c] is equivalent to
-[][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character
-tables for a French locale are in use, [\xc8-\xcb] matches accented E
-characters in both cases.
-
-
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v,
\V, \w, and \W may appear in a character class, and add the characters that
they match to the class. For example, [\dABCDEF] matches any hexadecimal
@@ -1347,6 +1344,52 @@ are not special inside a character class. Like any other unrecognized escape
sequences, they cause an error.
+The minus (hyphen) character can be used to specify a range of characters in a
+character class. For example, [d-m] matches any letter between d and m,
+inclusive. If a minus character is required in a class, it must be escaped with
+a backslash or appear in a position where it cannot be interpreted as
+indicating a range, typically as the first or last character in the class,
+or immediately after a range. For example, [b-d-z] matches letters in the range
+b to d, a hyphen character, or z.
+
+
+Perl treats a hyphen as a literal if it appears before or after a POSIX class
+(see below) or a character type escape such as \d, but gives a warning in
+its warning mode, as this is most likely a user error. As PCRE2 has no facility
+for warning, an error is given in these cases.
+
+
+It is not possible to have the literal character "]" as the end character of a
+range. A pattern such as [W-]46] is interpreted as a class of two characters
+("W" and "-") followed by a literal string "46]", so it would match "W46]" or
+"-46]". However, if the "]" is escaped with a backslash it is interpreted as
+the end of range, so [W-\]46] is interpreted as a class containing a range
+followed by two other characters. The octal or hexadecimal representation of
+"]" can also be used to end a range.
+
+
+Ranges normally include all code points between the start and end characters,
+inclusive. They can also be used for code points specified numerically, for
+example [\000-\037]. Ranges can include any characters that are valid for the
+current mode.
+
+
+There is a special case in EBCDIC environments for ranges whose end points are
+both specified as literal letters in the same case. For compatibility with
+Perl, EBCDIC code points within the range that are not letters are omitted. For
+example, [h-k] matches only four characters, even though the codes for h and k
+are 0x88 and 0x92, a range of 11 code points. However, if the range is
+specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
+are included.
+
+
+If a range that includes letters is used when caseless matching is set, it
+matches the letters in either case. For example, [W-c] is equivalent to
+[][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character
+tables for a French locale are in use, [\xc8-\xcb] matches accented E
+characters in both cases.
+
+
A circumflex can conveniently be used with the upper case character types to
specify a more restricted set of characters than the matching lower case type.
For example, the class [^\W_] matches any letter or digit, but not underscore,
@@ -1514,13 +1557,8 @@ respectively.
When one of these option changes occurs at top level (that is, not inside
subpattern parentheses), the change applies to the remainder of the pattern
-that follows. If the change is placed right at the start of a pattern, PCRE2
-extracts it into the global options (and it will therefore show up in data
-extracted by the pcre2_pattern_info() function).
-
-
-An option change within a subpattern (see below for a description of
-subpatterns) affects only that part of the subpattern that follows it, so
+that follows. An option change within a subpattern (see below for a description
+of subpatterns) affects only that part of the subpattern that follows it, so
(a(?i)b)c
@@ -1649,6 +1687,10 @@ first one in the pattern with the given number. The following pattern matches
/(?|(abc)|(def))(?1)/
+A relative reference such as (?-1) is no different: it is just a convenient way
+of computing an absolute group number.
+
+
If a
condition test
for a subpattern's having matched refers to a non-unique number, the test is
@@ -2051,9 +2093,9 @@ subpattern is possible using named parentheses (see below).
Another way of avoiding the ambiguity inherent in the use of digits following a
-backslash is to use the \g escape sequence. This escape must be followed by an
-unsigned number or a negative number, optionally enclosed in braces. These
-examples are all identical:
+backslash is to use the \g escape sequence. This escape must be followed by a
+signed or unsigned number, optionally enclosed in braces. These examples are
+all identical:
(ring), \1
(ring), \g1
@@ -2061,8 +2103,7 @@ examples are all identical:
An unsigned number specifies an absolute reference without the ambiguity that
is present in the older syntax. It is also useful when literal digits follow
-the reference. A negative number is a relative reference. Consider this
-example:
+the reference. A signed number is a relative reference. Consider this example:
(abc(def)ghi)\g{-1}
@@ -2073,6 +2114,11 @@ can be helpful in long patterns, and also in patterns that are created by
joining together fragments that contain references within themselves.
+The sequence \g{+1} is a reference to the next capturing subpattern. This kind
+of forward reference can be useful in patterns that repeat. Perl does not
+support the use of + in this way.
+
+
A back reference matches whatever actually matched the capturing subpattern in
the current subject string, rather than anything matching the subpattern
itself (see
@@ -2172,6 +2218,14 @@ capturing is carried out only for positive assertions. (Perl sometimes, but not
always, does do capturing in negative assertions.)
+WARNING: If a positive assertion containing one or more capturing subpatterns
+succeeds, but failure to match later in the pattern causes backtracking over
+this assertion, the captures within the assertion are reset only if no higher
+numbered captures are already set. This is, unfortunately, a fundamental
+limitation of the current implementation; it may get removed in a future
+reworking.
+
+
For compatibility with Perl, most assertion subpatterns may be repeated; though
it makes no sense to assert the same thing several times, the side effect of
capturing parentheses may occasionally be useful. However, an assertion that
@@ -2268,18 +2322,31 @@ match. If there are insufficient characters before the current position, the
assertion fails.
-In a UTF mode, PCRE2 does not allow the \C escape (which matches a single code
-unit even in a UTF mode) to appear in lookbehind assertions, because it makes
-it impossible to calculate the length of the lookbehind. The \X and \R
-escapes, which can match different numbers of code units, are also not
-permitted.
+In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which matches a
+single code unit even in a UTF mode) to appear in lookbehind assertions,
+because it makes it impossible to calculate the length of the lookbehind. The
+\X and \R escapes, which can match different numbers of code units, are never
+permitted in lookbehinds.
"Subroutine"
calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long
-as the subpattern matches a fixed-length string.
-Recursion,
-however, is not supported.
+as the subpattern matches a fixed-length string. However,
+recursion,
+that is, a "subroutine" call into a group that is already active,
+is not supported.
+
+
+Perl does not support back references in lookbehinds. PCRE2 does support them,
+but only if certain conditions are met. The PCRE2_MATCH_UNSET_BACKREF option
+must not be set, there must be no use of (?| in the pattern (it creates
+duplicate subpattern numbers), and if the back reference is by name, the name
+must be unique. Of course, the referenced subpattern must itself be of fixed
+length. The following pattern matches words containing at least two characters
+that begin and end with the same character:
+
+ \b(\w)\w++(?<=\1)
+
Possessive quantifiers can be used in conjunction with lookbehind assertions to
@@ -2417,7 +2484,9 @@ Checking for a used subpattern by name
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
subpattern by name. For compatibility with earlier versions of PCRE1, which had
-this facility before Perl, the syntax (?(name)...) is also recognized.
+this facility before Perl, the syntax (?(name)...) is also recognized. Note,
+however, that undelimited names consisting of the letter R followed by digits
+are ambiguous (see the following section).
Rewriting the above example to use a named subpattern gives this:
@@ -2432,30 +2501,52 @@ matched.
Checking for pattern recursion
-If the condition is the string (R), and there is no subpattern with the name R,
-the condition is true if a recursive call to the whole pattern or any
-subpattern has been made. If digits or a name preceded by ampersand follow the
-letter R, for example:
+"Recursion" in this sense refers to any subroutine-like call from one part of
+the pattern to another, whether or not it is actually recursive. See the
+sections entitled
+"Recursive patterns"
+and
+"Subpatterns as subroutines"
+below for details of recursion and subpattern calls.
+
+
+If a condition is the string (R), and there is no subpattern with the name R,
+the condition is true if matching is currently in a recursion or subroutine
+call to the whole pattern or any subpattern. If digits follow the letter R, and
+there is no subpattern with that name, the condition is true if the most recent
+call is into a subpattern with the given number, which must exist somewhere in
+the overall pattern. This is a contrived example that is equivalent to a+b:
- (?(R3)...) or (?(R&name)...)
+ ((?(R1)a+|(?1)b))
-the condition is true if the most recent recursion is into a subpattern whose
-number or name is given. This condition does not check the entire recursion
-stack. If the name used in a condition of this kind is a duplicate, the test is
-applied to all subpatterns of the same name, and is true if any one of them is
-the most recent recursion.
+However, in both cases, if there is a subpattern with a matching name, the
+condition tests for its being set, as described in the section above, instead
+of testing for recursion. For example, creating a group with the name R1 by
+adding (?<R1>) to the above pattern completely changes its meaning.
+
+
+If a name preceded by ampersand follows the letter R, for example:
+
+ (?(R&name)...)
+
+the condition is true if the most recent recursion is into a subpattern of that
+name (which must exist within the pattern).
+
+
+This condition does not check the entire recursion stack. It tests only the
+current level. If the name used in a condition of this kind is a duplicate, the
+test is applied to all subpatterns of the same name, and is true if any one of
+them is the most recent recursion.
At "top level", all these recursion test conditions are false.
-The syntax for recursive patterns
-is described below.
Defining subpatterns for use by reference only
-If the condition is the string (DEFINE), and there is no subpattern with the
-name DEFINE, the condition is always false. In this case, there may be only one
+If the condition is the string (DEFINE), the condition is always false, even if
+there is a group with the name DEFINE. In this case, there may be only one
alternative in the subpattern. It is always skipped if control reaches this
point in the pattern; the idea of DEFINE is that it can be used to define
subroutines that can be referenced from elsewhere. (The use of
@@ -2489,7 +2580,8 @@ For example:
(?(VERSION>=10.4)yes|no)
This pattern matches "yes" if the PCRE2 version is greater or equal to 10.4, or
-"no" otherwise.
+"no" otherwise. The fractional part of the version number may not contain more
+than two digits.
+Be aware however, that if +duplicate subpattern numbers +are in use, relative references refer to the earliest subpattern with the +appropriate number. Consider, for example: +
+ (?|(a)|(b)) (c) (?-2) ++The first two capturing groups (a) and (b) are both numbered 1, and group (c) +is number 2. When the reference (?-2) is encountered, the second most recently +opened parentheses has the number 1, but it is the first such group (the (a) +group) to which the recursion refers. This would be the same if an absolute +reference (?1) was used. In other words, relative references are just a +shorthand for computing a group number. + +
It is also possible to refer to subsequently opened parentheses, by writing references such as (?+2). However, these cannot be recursive because the reference is not inside the parentheses that are referenced. They are always @@ -2899,14 +3006,36 @@ remarks apply to the PCRE2 features described in this section.
The new verbs make use of what was previously invalid syntax: an opening -parenthesis followed by an asterisk. They are generally of the form -(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving -differently depending on whether or not a name is present. A name is any -sequence of characters that does not include a closing parenthesis. The maximum -length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit -libraries. If the name is empty, that is, if the closing parenthesis -immediately follows the colon, the effect is as if the colon were not there. -Any number of these verbs may occur in a pattern. +parenthesis followed by an asterisk. They are generally of the form (*VERB) or +(*VERB:NAME). Some verbs take either form, possibly behaving differently +depending on whether or not a name is present. +
++By default, for compatibility with Perl, a name is any sequence of characters +that does not include a closing parenthesis. The name is not processed in +any way, and it is not possible to include a closing parenthesis in the name. +This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result +is no longer Perl-compatible. +
++When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names +and only an unescaped closing parenthesis terminates the name. However, the +only backslash items that are permitted are \Q, \E, and sequences such as +\x{100} that define character code points. Character type escapes such as \d +are faulted. +
++A closing parenthesis can be included in a name either as \) or between \Q +and \E. In addition to backslash processing, if the PCRE2_EXTENDED option is +also set, unescaped whitespace in verb names is skipped, and #-comments are +recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED does not +affect verb names unless PCRE2_ALT_VERBNAMES is also set. +
++The maximum length of a name is 255 in the 8-bit library and 65535 in the +16-bit and 32-bit libraries. If the name is empty, that is, if the closing +parenthesis immediately follows the colon, the effect is as if the colon were +not there. Any number of these verbs may occur in a pattern.
Since these verbs are specifically related to backtracking, most of them can be @@ -3323,9 +3452,9 @@ Cambridge, England.
-Last updated: 13 June 2015
+Last updated: 27 December 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page.
diff --git a/pcre2/doc/html/pcre2perform.html b/pcre2/doc/html/pcre2perform.html
index 3b6a4a6c8..ac9d23cd8 100644
--- a/pcre2/doc/html/pcre2perform.html
+++ b/pcre2/doc/html/pcre2perform.html
@@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
-
-PCRE2 PERFORMANCE
-
+
Two aspects of performance are discussed below: memory usage and processing time. The way you express your pattern as a regular expression can affect both of them.
-Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, so that most simple patterns do not use much memory. However, there is one case @@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of speed is acceptable, this kind of rewriting will allow you to process patterns that PCRE2 cannot otherwise handle.
-When pcre2_match() is used for matching, certain kinds of pattern can cause it to use large amounts of the process stack. In some environments the @@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The pcre2stack documentation discusses this issue in detail.
-Certain items in regular expression patterns are processed more efficiently than others. It is more efficient to use a character class like [aeiou] than a @@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters. In many cases, the solution to this kind of performance issue is to use an atomic group or a possessive quantifier.
-
Philip Hazel
@@ -188,9 +186,7 @@ University Computing Service
Cambridge, England.
Last updated: 02 January 2015
diff --git a/pcre2/doc/html/pcre2posix.html b/pcre2/doc/html/pcre2posix.html
index 5e4b5a3b4..1d5fe6356 100644
--- a/pcre2/doc/html/pcre2posix.html
+++ b/pcre2/doc/html/pcre2posix.html
@@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the
pcre2api
documentation for a description of PCRE2's native API, which contains much
-additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
+additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries.
@@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined.
-There are also some other options that are not defined by POSIX. These have -been added at the request of users who want to make use of certain -PCRE2-specific features via the POSIX calling interface. +There are also some options that are not defined by POSIX. These have been +added at the request of users who want to make use of certain PCRE2-specific +features via the POSIX calling interface.
When PCRE2 is called via these functions, it is only the API that is POSIX-like @@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
REG_NOSUB-The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed -for compilation to the native function. In addition, when a pattern that is -compiled with this flag is passed to regexec() for matching, the -nmatch and pmatch arguments are ignored, and no captured strings -are returned. +When a pattern that is compiled with this flag is passed to regexec() for +matching, the nmatch and pmatch arguments are ignored, and no +captured strings are returned. Versions of the PCRE library prior to 10.22 used +to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens +because it disables the use of back references.
REG_UCP@@ -170,7 +170,7 @@ use the contents of the preg structure. If, for example, you pass it to This area is not simple, because POSIX and Perl take different views of things. It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was never intended to be a POSIX engine. The following table lists the different -possibilities for matching newline characters in PCRE2: +possibilities for matching newline characters in Perl and PCRE2:
Default Change with
@@ -180,7 +180,7 @@ possibilities for matching newline characters in PCRE2:
$ matches \n in middle no PCRE2_MULTILINE
^ matches \n in middle no PCRE2_MULTILINE
-This is the equivalent table for POSIX:
+This is the equivalent table for a POSIX-compatible pattern matcher:
Default Change with
@@ -190,14 +190,18 @@ This is the equivalent table for POSIX:
$ matches \n in middle no REG_NEWLINE
^ matches \n in middle no REG_NEWLINE
-PCRE2's behaviour is the same as Perl's, except that there is no equivalent for
-PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there is no way to stop
-newline from matching [^a].
+This behaviour is not what happens when PCRE2 is called via its POSIX
+API. By default, PCRE2's behaviour is the same as Perl's, except that there is
+no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there
+is no way to stop newline from matching [^a].
-The default POSIX newline handling can be obtained by setting PCRE2_DOTALL and -PCRE2_DOLLAR_ENDONLY, but there is no way to make PCRE2 behave exactly as for -the REG_NEWLINE action. +Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and +PCRE2_DOLLAR_ENDONLY when calling pcre2_compile() directly, but there is +no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using +the POSIX API, passing REG_NEWLINE to PCRE2's regcomp() function +causes PCRE2_MULTILINE to be passed to pcre2_compile(), and REG_DOTALL +passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
@@ -231,19 +235,21 @@ to have a terminating NUL located at string + pmatch[0].rm_eo IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software intended to be portable to other systems. Note that a non-zero rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not -how it is matched. +how it is matched. Setting REG_STARTEND and passing pmatch as NULL are +mutually exclusive; the error REG_INVARG is returned.
If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The nmatch and pmatch arguments of -regexec() are ignored. +regexec() are ignored (except possibly as input for REG_STARTEND).
-If the value of nmatch is zero, or if the value pmatch is NULL, -no data about any matched strings is returned. +The value of nmatch may be zero, and the value of pmatch may be NULL +(unless REG_STARTEND is set); in both these cases no data about any matched +strings is returned.
-Otherwise,the portion of the string that was matched, and also any captured +Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the pmatch argument, which points to an array of nmatch structures of type regmatch_t, containing the members rm_so and rm_eo. These contain the byte offset to the first @@ -262,9 +268,11 @@ header file, of which REG_NOMATCH is the "expected" failure code. The regerror() function maps a non-zero errorcode from either regcomp() or regexec() to a printable message. If preg is not NULL, the error should have arisen from the use of that structure. A message -terminated by a binary zero is placed in errbuf. The length of the -message, including the zero, is limited to errbuf_size. The yield of the -function is the size of buffer needed to hold the whole message. +terminated by a binary zero is placed in errbuf. If the buffer is too +short, only the first errbuf_size - 1 characters of the error message are +used. The yield of the function is the size of buffer needed to hold the whole +message, including the terminating zero. This value is greater than +errbuf_size if the message was truncated.
@@ -283,9 +291,9 @@ Cambridge, England.
-Last updated: 20 October 2014
+Last updated: 31 January 2016
-Copyright © 1997-2014 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2sample.html b/pcre2/doc/html/pcre2sample.html index 60a928bcc..2b36f1fc6 100644 --- a/pcre2/doc/html/pcre2sample.html +++ b/pcre2/doc/html/pcre2sample.html @@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can save this listing to re-create the contents of pcre2demo.c.
-The demonstration program, which uses the PCRE2 8-bit library, compiles the -regular expression that is its first argument, and matches it against the -subject string in its second argument. No PCRE2 options are set, and default -character tables are used. If matching succeeds, the program outputs the -portion of the subject that matched, together with the contents of any captured -substrings. +The demonstration program compiles the regular expression that is its +first argument, and matches it against the subject string in its second +argument. No PCRE2 options are set, and default character tables are used. If +matching succeeds, the program outputs the portion of the subject that matched, +together with the contents of any captured substrings.
If the -g option is given on the command line, the program then goes on to @@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching an empty string. Comments in the code explain what is going on.
+The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit +library. It handles strings and characters that are stored in 8-bit code units. +By default, one character corresponds to one code unit, but if the pattern +starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, +where characters may occupy multiple code units. +
+If PCRE2 is installed in the standard include and library directories for your operating system, you should be able to compile the demonstration program using -this command: +a command like this:
- gcc -o pcre2demo pcre2demo.c -lpcre2-8 + cc -o pcre2demo pcre2demo.c -lpcre2-8If PCRE2 is installed elsewhere, you may need to add additional options to the command line. For example, on a Unix-like system that has PCRE2 installed in /usr/local, you can compile the demonstration program using a command like this:
- gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8 - -- -
-Once you have compiled and linked the demonstration program, you can run simple -tests like this: + cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8 + +Once you have built the demonstration program, you can run simple tests like +this:
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'Note that there is a much more comprehensive test program, called pcre2test, -which supports many more facilities for testing regular expressions using the -PCRE2 libraries. The +which supports many more facilities for testing regular expressions using all +three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be +installed). The pcre2demo -program is provided as a simple coding example. +program is provided as a relatively simple coding example.
If you try to run @@ -73,7 +77,7 @@ If you try to run when PCRE2 is not installed in the standard library directory, you may get an error like this on some operating systems (e.g. Solaris):
- ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory + ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directoryThis is caused by the way shared library support works on those systems. You need to add @@ -97,9 +101,9 @@ Cambridge, England. REVISION
-Last updated: 20 October 2014
+Last updated: 02 February 2016
-Copyright © 1997-2014 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page.
diff --git a/pcre2/doc/html/pcre2serialize.html b/pcre2/doc/html/pcre2serialize.html
index c32ebe034..edf415afd 100644
--- a/pcre2/doc/html/pcre2serialize.html
+++ b/pcre2/doc/html/pcre2serialize.html
@@ -14,10 +14,11 @@ please consult the man page, in case the conversion went wrong.
@@ -41,14 +42,22 @@ If you are running an application that uses a large number of regular expression patterns, it may be useful to store them in a precompiled form instead of having to compile them every time the application is run. However, if you are using the just-in-time optimization feature, it is not possible to -save and reload the JIT data, because it is position-dependent. In addition, -the host on which the patterns are reloaded must be running the same version of -PCRE2, with the same code unit width, and must also have the same endianness, -pointer width and PCRE2_SIZE type. For example, patterns compiled on a 32-bit -system using PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor -can they be reloaded using the 8-bit library. +save and reload the JIT data, because it is position-dependent. The host on +which the patterns are reloaded must be running the same version of PCRE2, with +the same code unit width, and must also have the same endianness, pointer width +and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using +PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be +reloaded using the 8-bit library.
-+The facility for saving and restoring compiled patterns is intended for use +within individual applications. As such, the data supplied to +pcre2_serialize_decode() is expected to be trusted data, not data from +arbitrary external sources. There is only some simple consistency checking, not +complete validation of what is being re-loaded. +
+Before compiled patterns can be saved they must be serialized, that is, converted to a stream of bytes. A single byte stream may contain any number of @@ -110,7 +119,7 @@ still be used for matching. Their memory must eventually be freed in the usual way by calling pcre2_code_free(). When you have finished with the byte stream, it too must be freed by calling pcre2_serialize_free().
-In order to re-use a set of saved patterns you must first make the serialized byte stream available in main memory (for example, by reading from a file). The @@ -142,21 +151,27 @@ is filled with those that fit, and the remainder are ignored. The yield of the function is the number of decoded patterns, or one of the following negative error codes:
- PCRE2_ERROR_BADDATA second argument is zero or less - PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data - PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE2 version - PCRE2_ERROR_MEMORY memory allocation failed - PCRE2_ERROR_NULL first or third argument is NULL + PCRE2_ERROR_BADDATA second argument is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure + PCRE2_ERROR_MEMORY memory allocation failed + PCRE2_ERROR_NULL first or third argument is NULLPCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness.
Decoded patterns can be used for matching in the usual way, and must be freed -by calling pcre2_code_free() as normal. A single copy of the character -tables is used by all the decoded patterns. A reference count is used to +by calling pcre2_code_free(). However, be aware that there is a potential +race issue if you are using multiple patterns that were decoded from a single +byte stream in a multithreaded application. A single copy of the character +tables is used by all the decoded patterns and a reference count is used to arrange for its memory to be automatically freed when the last pattern is -freed. +freed, but there is no locking on this reference count. Therefore, if you want +to call pcre2_code_free() for these patterns in different threads, you +must arrange your own locking, and ensure that pcre2_code_free() cannot +be called by two threads at the same time.
If a pattern was processed by pcre2_jit_compile() before being @@ -164,7 +179,7 @@ serialized, the JIT data is discarded and so is no longer available after a save/restore cycle. You can, however, process a restored pattern with pcre2_jit_compile() if you wish.
-
Philip Hazel
@@ -173,11 +188,11 @@ University Computing Service
Cambridge, England.
-Last updated: 20 January 2015
+Last updated: 24 May 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2stack.html b/pcre2/doc/html/pcre2stack.html index 2942c7a7a..8b5c783de 100644 --- a/pcre2/doc/html/pcre2stack.html +++ b/pcre2/doc/html/pcre2stack.html @@ -57,12 +57,13 @@ assertion and "once-only" subpatterns, which are handled like subroutine calls. Normally, these are never very deep, and the limit on the complexity of pcre2_dfa_match() is controlled by the amount of workspace it is given. However, it is possible to write patterns with runaway infinite recursions; -such patterns will cause pcre2_dfa_match() to run out of stack. At -present, there is no protection against this. +such patterns will cause pcre2_dfa_match() to run out of stack unless a +limit is applied (see below).
-The comments that follow do NOT apply to pcre2_dfa_match(); they are -relevant only for pcre2_match() without the JIT optimization. +The comments in the next three sections do not apply to +pcre2_dfa_match(); they are relevant only for pcre2_match() without +the JIT optimization.
+The recursion limit, as described above for pcre2_match(), also applies +to pcre2_dfa_match(), whose use of recursive function calls for +recursions in the pattern can lead to runaway stack usage. The non-recursive +match limit is not relevant for DFA matching, and is ignored. +
+
@@ -198,9 +208,9 @@ Cambridge, England.
REVISION
-Last updated: 21 November 2014
+Last updated: 23 December 2016
-Copyright © 1997-2014 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2syntax.html b/pcre2/doc/html/pcre2syntax.html index 28ba02362..4cbbba7b0 100644 --- a/pcre2/doc/html/pcre2syntax.html +++ b/pcre2/doc/html/pcre2syntax.html @@ -111,9 +111,10 @@ it matches a literal "u". \W a "non-word" character \X a Unicode extended grapheme cluster -The application can lock out the use of \C by setting the -PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the -current matching point in the middle of a UTF-8 or UTF-16 character. +\C is dangerous because it may leave the current matching point in the middle +of a UTF-8 or UTF-16 character. The application can lock out the use of \C by +setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 +with the use of \C permanently disabled.
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode @@ -187,6 +188,8 @@ at release 5.18.
+Ahom, +Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, @@ -227,6 +230,7 @@ Gurmukhi, Han, Hangul, Hanunoo, +Hatran, Hebrew, Hiragana, Imperial_Aramaic, @@ -263,12 +267,14 @@ Miao, Modi, Mongolian, Mro, +Multani, Myanmar, Nabataean, New_Tai_Lue, Nko, Ogham, Ol_Chiki, +Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, @@ -290,6 +296,7 @@ Saurashtra, Sharada, Shavian, Siddham, +SignWriting, Sinhala, Sora_Sompeng, Sundanese, @@ -444,9 +451,10 @@ appear. (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of the -limits set by the caller of pcre2_match(), not increase them. The application -can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or -PCRE2_NEVER_UCP options, respectively, at compile time. +limits set by the caller of pcre2_match() or pcre2_dfa_match(), not +increase them. The application can lock out the use of (*UTF) and (*UCP) by +setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at +compile time.
@@ -485,6 +493,9 @@ Each top-level branch of a look behind must be of a fixed length. \n reference by number (can be ambiguous) \gn reference by number \g{n} reference by number + \g+n relative reference by number (PCRE2 extension) + \g-n relative reference by number + \g{+n} relative reference by number (PCRE2 extension) \g{-n} relative reference by number \k<name> reference by name (Perl) \k'name' reference by name (Perl) @@ -523,14 +534,17 @@ Each top-level branch of a look behind must be of a fixed length. (?(-n) relative reference condition (?(<name>) named reference condition (Perl) (?('name') named reference condition (Perl) - (?(name) named reference condition (PCRE2) + (?(name) named reference condition (PCRE2, deprecated) (?(R) overall recursion condition - (?(Rn) specific group recursion condition - (?(R&name) specific recursion condition + (?(Rn) specific numbered group recursion condition + (?(R&name) specific named group recursion condition (?(DEFINE) define subpattern for reference (?(VERSION[>]=n.m) test PCRE2 version (?(assert) assertion condition - + +Note the ambiguity of (?(R) and (?(Rn) which might be named reference +conditions or recursion tests. Such a condition is interpreted as a reference +condition if the relevant named group exists.
@@ -582,9 +596,9 @@ Cambridge, England.
-Last updated: 13 June 2015
+Last updated: 23 December 2016
-Copyright © 1997-2015 University of Cambridge.
+Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2test.html b/pcre2/doc/html/pcre2test.html index 5165c1e54..ee41e4303 100644 --- a/pcre2/doc/html/pcre2test.html +++ b/pcre2/doc/html/pcre2test.html @@ -61,7 +61,7 @@ subject is processed, and what output is produced.
As the original fairly simple PCRE library evolved, it acquired many different features, and as a result, the original pcretest program ended up with a -lot of options in a messy, arcane syntax, for testing all the features. The +lot of options in a messy, arcane syntax for testing all the features. The move to the new PCRE2 API provided an opportunity to re-implement the test program as pcre2test, with a cleaner modifier syntax. Nevertheless, there are still many obscure modifiers, some of which are specifically designed for @@ -77,31 +77,61 @@ strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or all three of these libraries may be simultaneously installed. The pcre2test program can be used to test all the libraries. However, its own input and output are always in 8-bit format. When testing the 16-bit or 32-bit -libraries, patterns and subject strings are converted to 16- or 32-bit format -before being passed to the library functions. Results are converted back to -8-bit code units for output. +libraries, patterns and subject strings are converted to 16-bit or 32-bit +format before being passed to the library functions. Results are converted back +to 8-bit code units for output.
In the rest of this document, the names of library functions and structures are given in generic form, for example, pcre2_compile(). The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate. -
+Input to pcre2test is processed line by line, either by calling the C -library's fgets() function, or via the libreadline library (see -below). The input is processed using using C's string functions, so must not -contain binary zeroes, even though in Unix-like environments, fgets() -treats any bytes other than newline as data characters. In some Windows -environments character 26 (hex 1A) causes an immediate end of file, and no -further data is read. +library's fgets() function, or via the libreadline library. In some +Windows environments character 26 (hex 1A) causes an immediate end of file, and +no further data is read, so this character should be avoided unless you really +want that action.
-For maximum portability, therefore, it is safest to avoid non-printing -characters in pcre2test input files. There is a facility for specifying a -pattern's characters as hexadecimal pairs, thus making it possible to include -binary zeroes in a pattern for testing purposes. Subject lines are processed -for backslash escapes, which makes it possible to include any data value. +The input is processed using C's string functions, so must not +contain binary zeroes, even though in Unix-like environments, fgets() +treats any bytes other than newline as data characters. An error is generated +if a binary zero is encountered. Subject lines are processed for backslash +escapes, which makes it possible to include any data value in strings that are +passed to the library for matching. For patterns, there is a facility for +specifying some or all of the 8-bit input characters as hexadecimal pairs, +which makes it possible to include binary zeros. +
++When testing the 16-bit or 32-bit libraries, there is a need to be able to +generate character code points greater than 255 in the strings that are passed +to the library. For subject lines, backslash escapes can be used. In addition, +when the utf modifier (see +"Setting compilation options" +below) is set, the pattern and any following subject lines are interpreted as +UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. +
++For non-UTF testing of wide characters, the utf8_input modifier can be +used. This is mutually exclusive with utf, and is allowed only in 16-bit +or 32-bit mode. It causes the pattern and following subject lines to be treated +as UTF-8 according to the original definition (RFC 2279), which allows for +character values up to 0x7fffffff. Each character is placed in one 16-bit or +32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error +to occur). +
++UTF-8 is not capable of encoding values greater than 0x7fffffff, but such +values can be handled by the 32-bit library. When testing this library in +non-UTF mode with utf8_input set, if any character is preceded by the +byte 0xff (which is an illegal byte in UTF-8) 0x80000000 is added to the +character's value. This is the only way of passing such code points in a +pattern string. For subject strings, using an escape sequence is preferable.
@@ -123,8 +153,13 @@ the 32-bit library has been built, this is the default. If the 32-bit library has not been built, this option causes an error.
+-ac +Behave as if each pattern has the auto_callout modifier, that is, insert +automatic callouts into every pattern that is compiled. +
+-b -Behave as if each pattern has the /fullbincode modifier; the full +Behave as if each pattern has the fullbincode modifier; the full internal binary form of the pattern is output after compilation.
@@ -155,12 +190,13 @@ following options output the value and set the exit code as indicated: The following options output 1 for true or 0 for false, and set the exit code to the same value:
- ebcdic compiled for an EBCDIC environment - jit just-in-time support is available - pcre2-16 the 16-bit library was built - pcre2-32 the 32-bit library was built - pcre2-8 the 8-bit library was built - unicode Unicode support is available + backslash-C \C is supported (not locked out) + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre2-16 the 16-bit library was built + pcre2-32 the 32-bit library was built + pcre2-8 the 8-bit library was built + unicode Unicode support is availableIf an unknown option is given, an error message is output; the exit code is 0. @@ -177,12 +213,19 @@ using the pcre2_dfa_match() function instead of the default pcre2_match().
+-error number[,number,...] +Call pcre2_get_error_message() for each of the error numbers in the +comma-separated list, display the resulting messages on the standard output, +then exit with zero exit code. The numbers may be positive or negative. This is +a convenience facility for PCRE2 maintainers. +
+-help Output a brief summary of these options and then exit.
-i -Behave as if each pattern has the /info modifier; information about the +Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation.
@@ -265,9 +308,9 @@ Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r or \r\n, etc., depending on the newline setting) in a single line of input to encode the newline sequences. There is no limit on the length of subject lines; the input -buffer is automatically extended if it is too small. There is a replication -feature that makes it possible to generate long subject lines without having to -supply them explicitly. +buffer is automatically extended if it is too small. There are replication +features that make it possible to generate long repetitive pattern or subject +lines without having to supply them explicitly.
An empty line or the end of the file signals the end of the subject lines for a @@ -304,6 +347,36 @@ output. This command is used to load a set of precompiled patterns from a file, as described in the section entitled "Saving and restoring compiled patterns" below. +
+ #newline_default [<newline-list>] ++When PCRE2 is built, a default newline convention can be specified. This +determines which characters and/or character pairs are recognized as indicating +a newline in a pattern or subject string. The default can be overridden when a +pattern is compiled. The standard test files contain tests of various newline +conventions, but the majority of the tests expect a single linefeed to be +recognized as a newline by default. Without special action the tests would fail +when PCRE2 is compiled with either CR or CRLF as the default newline. + +
+The #newline_default command specifies a list of newline types that are +acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, or +ANY (in upper or lower case), for example: +
+ #newline_default LF Any anyCRLF ++If the default newline is in the list, this command has no effect. Otherwise, +except when testing the POSIX API, a newline modifier that specifies the +first newline convention in the list (LF in the above example) is added to any +pattern that does not already have a newline modifier. If the newline +list is empty, the feature is turned off. This command is present in a number +of the standard test input files. + +
+When the POSIX API is being tested there is no way to override the default +newline convention, though it is possible to set the newline convention from +within the pattern. A warning is given if the posix modifier is used when +#newline_default would set a default for the non-POSIX API.
#pattern <modifier-list>@@ -321,9 +394,10 @@ test files that are also processed by perltest.sh. The #perltest command helps detect tests that are accidentally put in the wrong file.
#pop [<modifiers>] + #popcopy [<modifiers>]-This command is used to manipulate the stack of compiled patterns, as described -in the section entitled "Saving and restoring compiled patterns" +These commands are used to manipulate the stack of compiled patterns, as +described in the section entitled "Saving and restoring compiled patterns" below.
#save <filename> @@ -340,12 +414,13 @@ subject lines. Modifiers on a subject line can change these settings.The effects of these modifiers are described in the following sections. @@ -604,40 +695,145 @@ is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern.
MODIFIER SYNTAX
Modifier lists are used with both pattern and subject lines. Items in a list -are separated by commas and optional white space. Some modifiers may be given -for both patterns and subject lines, whereas others are valid for one or the -other only. Each modifier has a long name, for example "anchored", and some of -them must be followed by an equals sign and a value, for example, "offset=12". -Modifiers that do not take values may be preceded by a minus sign to turn off a -previous setting. +are separated by commas followed by optional white space. Trailing whitespace +in a modifier list is ignored. Some modifiers may be given for both patterns +and subject lines, whereas others are valid only for one or the other. Each +modifier has a long name, for example "anchored", and some of them must be +followed by an equals sign and a value, for example, "offset=12". Values cannot +contain comma characters, but may contain spaces. Modifiers that do not take +values may be preceded by a minus sign to turn off a previous setting.
A few of the more common modifiers can also be specified as single letters, for @@ -454,6 +529,12 @@ the start of a modifier list. For example:
abc\=notbol,notempty+If the subject string is empty and \= is followed by whitespace, the line is +treated as a comment line, and is not used for matching. For example: ++ \= This is a comment. + abc\= This is an invalid modifier list. +A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash (and there is no modifier @@ -462,10 +543,10 @@ a real empty line terminates the data input.
PATTERN MODIFIERS
-There are three types of modifier that can appear in pattern lines, two of -which may also be used in a #pattern command. A pattern's modifier list -can add to or override default modifiers that were set by a previous -#pattern command. +There are several types of modifier that can appear in pattern lines. Except +where noted below, they may also be used in #pattern commands. A +pattern's modifier list can add to or override default modifiers that were set +by a previous #pattern command.
Setting compilation options @@ -473,12 +554,13 @@ Setting compilation optionsThe following modifiers set options for pcre2_compile(). The most common ones have single-letter abbreviations. See -pcreapi +pcre2api for a description of their effects.
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED auto_callout set PCRE2_AUTO_CALLOUT /i caseless set PCRE2_CASELESS @@ -499,12 +581,15 @@ for a description of their effects. no_utf_check set PCRE2_NO_UTF_CHECK ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY + use_offset_limit set PCRE2_USE_OFFSET_LIMIT utf set PCRE2_UTFAs well as turning on the PCRE2_UTF option, the utf modifier causes all non-printing characters in output strings to be printed using the \x{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly -brackets. +brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and +subject strings to be translated to UTF-16 or UTF-32, respectively, before +being passed to library functions.
Setting compilation controls @@ -519,18 +604,24 @@ about the pattern: debug same as info,fullbincode fullbincode show binary code with lengths /I info show info about compiled pattern - hex pattern is coded in hexadecimal + hex unquoted characters are hexadecimal jit[=<number>] use JIT jitfast use JIT fast path jitverify verify JIT use locale=<name> use this locale + max_pattern_length=<n> set the maximum pattern length memory show memory used newline=<type> set newline type + null_context compile with a NULL context parens_nest_limit=<n> set maximum parentheses depth posix use the POSIX API + posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack + pushcopy push a copy onto the stack stackguard=<number> test the stackguard feature tables=[0|1|2] select internal tables + use_length do not zero-terminate the pattern + utf8_input treat input as UTF-8
-The hex modifier specifies that the characters of the pattern are to be -interpreted as pairs of hexadecimal digits. White space is permitted between -pairs. For example: +Normally, pcre2test passes a context block to pcre2_compile(). If +the null_context modifier is set, however, NULL is passed. This is for +testing that pcre2_compile() behaves correctly in this case (it uses +default values). +
++By default, patterns are passed to the compiling functions as zero-terminated +strings. When using the POSIX wrapper API, there is no other option. However, +when using PCRE2's native API, patterns can be passed by length instead of +being zero-terminated. The use_length modifier causes this to happen. +Using a length happens automatically (whether or not use_length is set) +when hex is set, because patterns specified in hexadecimal may contain +binary zeros. +
++The hex modifier specifies that the characters of the pattern, except for +substrings enclosed in single or double quotes, are to be interpreted as pairs +of hexadecimal digits. This feature is provided as a way of creating patterns +that contain binary zeros and other non-printing characters. White space is +permitted between pairs of digits. For example, this pattern contains three +characters:
/ab 32 59/hex-This feature is provided as a way of creating patterns that contain binary zero -and other non-printing characters. By default, pcre2test passes patterns -as zero-terminated strings to pcre2_compile(), giving the length as -PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the -actual length of the pattern is passed. +Parts of such a pattern are taken literally if quoted. This pattern contains +nine characters, only two of which are specified in hexadecimal: +
+ /ab "literal" 32/hex ++Either single or double quotes may be used. There is no way of including +the delimiter within a substring. The hex and expand modifiers are +mutually exclusive. + +
+The POSIX API cannot be used with patterns specified in hexadecimal because +they may contain binary zeros, which conflicts with regcomp()'s +requirement for a zero-terminated string. Such patterns are always passed to +pcre2_compile() as a string with a length, not as zero-terminated. +
++In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and +translated to UTF-16 or UTF-32 when the utf modifier is set. For testing +the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input modifier +can be used. It is mutually exclusive with utf. Input lines are +interpreted as UTF-8 as a means of specifying wide characters. More details are +given in +"Input encoding" +above. +
++Some tests use long patterns that are very repetitive. Instead of creating a +very long input line for such a pattern, you can use a special repetition +feature, similar to the one described for subject lines above. If the +expand modifier is present on a pattern, parts of the pattern that have +the form +
+ \[<characters>]{<count>}
+
+are expanded before the pattern is passed to pcre2_compile(). For
+example, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
+cannot be nested. An initial "\[" sequence is recognized only if "]{" followed
+by decimal digits and "}" is found later in the pattern. If not, the characters
+remain in the pattern unaltered. The expand and hex modifiers are
+mutually exclusive.
+
++If part of an expanded pattern looks like an expansion, but is really part of +the actual pattern, unwanted expansion can be avoided by giving two values in +the quantifier. For example, \[AB]{6000,6000} is not recognized as an +expansion item. +
++If the info modifier is set on an expanded pattern, the result of the +expansion is included in the information that is output.
-The /jit modifier may optionally be followed by an equals sign and a -number in the range 0 to 7: +Just-in-time (JIT) compiling is a heavyweight optimization that can greatly +speed up pattern matching. See the +pcre2jit +documentation for details. JIT compiling happens, optionally, after a pattern +has been successfully compiled into an internal form. The JIT compiler converts +this to optimized machine code. It needs to know whether the match-time options +PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because +different code is generated for the different cases. See the partial +modifier in "Subject Modifiers" +below +for details of how these options are specified for each match attempt. +
++JIT compilation is requested by the /jit pattern modifier, which may +optionally be followed by an equals sign and a number in the range 0 to 7. +The three bits that make up the number specify which of the three JIT operating +modes are to be compiled: +
+ 1 compile JIT code for non-partial matching + 2 compile JIT code for soft partial matching + 4 compile JIT code for hard partial matching ++The possible values for the jit modifier are therefore:
0 disable JIT - 1 use JIT for normal match only - 2 use JIT for soft partial match only - 3 use JIT for normal match and soft partial match - 4 use JIT for hard partial match only - 6 use JIT for soft and hard partial match + 1 normal matching only + 2 soft partial matching only + 3 normal and soft partial matching + 4 hard partial matching only + 6 soft and hard partial matching only 7 all three modes-If no number is given, 7 is assumed. If JIT compilation is successful, the -compiled JIT code will automatically be used when pcre2_match() is run -for the appropriate type of match, except when incompatible run-time options -are specified. For more details, see the +If no number is given, 7 is assumed. The phrase "partial matching" means a call +to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the +PCRE2_PARTIAL_HARD option set. Note that such a call may return a complete +match; the options enable the possibility of a partial match, but do not +require it. Note also that if you request JIT compilation only for partial +matching (for example, /jit=2) but do not set the partial modifier on a +subject line, that match will not use JIT code because none was compiled for +non-partial matching. + +
+If JIT compilation is successful, the compiled JIT code will automatically be
+used when an appropriate type of match is run, except when incompatible
+run-time options are specified. For more details, see the
pcre2jit
documentation. See also the jitstack modifier below for a way of
setting the size of the JIT stack.
@@ -661,14 +857,14 @@ code was actually used in the match.
Setting a locale
-The /locale modifier must specify the name of a locale, for example: +The locale modifier must specify the name of a locale, for example:
/pattern/locale=fr_FRThe given locale is set, pcre2_maketables() is called to build a set of character tables for the locale, and this is then passed to pcre2_compile() when compiling the regular expression. The same tables -are used when matching the following subject lines. The /locale modifier +are used when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern command if a default is needed. Setting a locale and alternate character tables are mutually exclusive. @@ -677,7 +873,7 @@ character tables are mutually exclusive. Showing pattern memory
-The /memory modifier causes the size in bytes of the memory used to hold +The memory modifier causes the size in bytes of the memory used to hold the compiled pattern to be output. This does not include the size of the pcre2_code block; it is just the actual compiled data. If the pattern is subsequently passed to the JIT compiler, the size of the JIT compiled code is @@ -700,30 +896,53 @@ sets its own default of 220, which is required for running the standard test suite.
+The max_pattern_length modifier sets a limit, in code units, to the +length of pattern that pcre2_compile() will accept. Breaching the limit +causes a compilation error. The default is the largest number a PCRE2_SIZE +variable can hold (essentially unlimited). +
+-The /posix modifier causes pcre2test to call PCRE2 via the POSIX -wrapper API rather than its native API. This supports only the 8-bit library. -When the POSIX API is being used, the following pattern modifiers set options -for the regcomp() function: +The /posix and posix_nosub modifiers cause pcre2test to call +PCRE2 via the POSIX wrapper API rather than its native API. When +posix_nosub is used, the POSIX option REG_NOSUB is passed to +regcomp(). The POSIX wrapper supports only the 8-bit library. Note that +it does not imply POSIX matching semantics; for more detail see the +pcre2posix +documentation. The following pattern modifiers set options for the +regcomp() function:
caseless REG_ICASE multiline REG_NEWLINE - no_auto_capture REG_NOSUB dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard utf REG_UTF8 )+The regerror_buffsize modifier specifies a size for the error buffer that +is passed to regerror() in the event of a compilation error. For example: +
+ /abc/posix,regerror_buffsize=20 ++This provides a means of testing the behaviour of regerror() when the +buffer is too small for the error message. If this modifier has not been set, a +large buffer is used. + +
The aftertext and allaftertext subject modifiers work as described -below. All other modifiers cause an error. +below. All other modifiers are either ignored, with a warning message, or cause +an error.
-The /stackguard modifier is used to test the use of
+The stackguard modifier is used to test the use of
pcre2_set_compile_recursion_guard(), a function that is provided to
enable stack availability to be checked during compilation (see the
pcre2api
@@ -738,7 +957,7 @@ be aborted.
Using alternative character tables
-The value specified for the /tables modifier must be one of the digits 0, +The value specified for the tables modifier must be one of the digits 0, 1, or 2. It causes a specific set of built-in character tables to be passed to pcre2_compile(). This is used in the PCRE2 tests to check behaviour with different character tables. The digit specifies the tables as follows: @@ -758,17 +977,22 @@ Setting certain match controls
The following modifiers are really subject modifiers, and are described below. However, they may be included in a pattern's modifier list, in which case they -are applied to every subject line that is processed with that pattern. They do -not affect the compilation process. +are applied to every subject line that is processed with that pattern. They may +not appear in #pattern commands. These modifiers do not affect the +compilation process.
- aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text - /g global global matching - mark show mark values - replace=<string> specify a replacement string - startchar show starting character when relevant + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + /g global global matching + mark show mark values + replace=<string> specify a replacement string + startchar show starting character when relevant + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTYThese modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. @@ -782,13 +1006,17 @@ pushed onto a stack of compiled patterns, and pcre2test expects the next line to contain a new pattern (or a command) instead of a subject line. This facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" -below. -The push modifier is incompatible with compilation modifiers such as -global that act at match time. Any that are specified are ignored, with a -warning message, except for replace, which causes an error. Note that, -jitverify, which is allowed, does not carry through to any subsequent -matching that uses this pattern. - +below. If pushcopy is used instead of push, a copy of the compiled +pattern is stacked, leaving the original as current, ready to match the +following input lines. This provides a way of testing the +pcre2_code_copy() function. +The push and pushcopy modifiers are incompatible with compilation +modifiers such as global that act at match time. 
Any that are specified +are ignored (for the stacked copy), with a warning message, except for +replace, which causes an error. Note that jitverify, which is +allowed, does not carry through to any subsequent matching that uses a stacked +pattern. +
The modifiers that can appear in subject lines and the #subject @@ -806,6 +1034,7 @@ for a description of their effects. anchored set PCRE2_ANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST + no_jit set PCRE2_NO_JIT no_utf_check set PCRE2_NO_UTF_CHECK notbol set PCRE2_NOTBOL notempty set PCRE2_NOTEMPTY @@ -818,11 +1047,11 @@ The partial matching modifiers are provided with abbreviations because they appear frequently in tests.
-If the /posix modifier was present on the pattern, causing the POSIX +If the posix modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). -Any other modifiers cause an error. +The other modifiers are ignored, with a warning message.
- aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text (non-JIT only) - altglobal alternative global matching - callout_capture show captures at callout time - callout_data=<n> set a value to pass via callouts - callout_fail=<n>[:<m>] control callout failure - callout_none do not supply a callout function - copy=<number or name> copy captured substring - dfa use pcre2_dfa_match() - find_limits find match and recursion limits - get=<number or name> extract captured substring - getall extract all captured substrings - /g global global matching - jitstack=<n> set size of JIT stack - mark show mark values - match_limit=>n> set a match limit - memory show memory usage - offset=<n> set starting offset - ovector=<n> set size of output vector - recursion_limit=<n> set a recursion limit - replace=<string> specify a replacement string - startchar show startchar when relevant - zero_terminate pass the subject as zero-terminated + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text (non-JIT only) + altglobal alternative global matching + callout_capture show captures at callout time + callout_data=<n> set a value to pass via callouts + callout_error=<n>[:<m>] control callout error + callout_fail=<n>[:<m>] control callout failure + callout_none do not supply a callout function + copy=<number or name> copy captured substring + dfa use pcre2_dfa_match() + find_limits find match and recursion limits + get=<number or name> extract captured substring + getall extract all captured substrings + /g global global matching + jitstack=<n> set size of JIT stack + mark show mark values + match_limit=<n> set a match limit + memory show memory usage + null_context match with a NULL context + offset=<n> set starting offset + offset_limit=<n> set offset limit + ovector=<n> set size of output vector + 
                      recursion_limit=<n> set a recursion limit + replace=<string> specify a replacement string + startchar show startchar when relevant + startoffset=<n> same as offset=<n> + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY + zero_terminate pass the subject as zero-terminated-The effects of these modifiers are described in the following sections. +The effects of these modifiers are described in the following sections. When +matching via the POSIX wrapper API, the aftertext, allaftertext, +and ovector subject modifiers work as described below. All other +modifiers are either ignored, with a warning message, or cause an error. 
A callout function is supplied when pcre2test calls the library matching functions, unless callout_none is specified. If callout_capture is -set, the current captured groups are output when a callout occurs. +set, the current captured groups are output when a callout occurs. The default +return from the callout function is zero, which allows matching to continue.
The callout_fail modifier can be given one or two numbers. If there is -only one number, 1 is returned instead of 0 when a callout of that number is -reached. If two numbers are given, 1 is returned when callout <n> is reached -for the <m>th time. Note that callouts with string arguments are always given -the number zero. See "Callouts" below for a description of the output when a -callout it taken. +only one number, 1 is returned instead of 0 (causing matching to backtrack) +when a callout of that number is reached. If two numbers (<n>:<m>) are given, 1 +is returned when callout <n> is reached and there have been at least <m> +callouts. The callout_error modifier is similar, except that +PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be +aborted. If both these modifiers are set for the same callout number, +callout_error takes precedence. +
++Note that callouts with string arguments are always given the number zero. See +"Callouts" below for a description of the output when a callout is taken. 
The callout_data modifier can be given an unsigned or a negative number.
@@ -945,7 +1193,7 @@ Finding all matches in a string
Searching for all possible matches within a subject can be requested by the
-global or /altglobal modifier. After finding a match, the matching
+global or altglobal modifier. After finding a match, the matching
function is called again to search the remainder of the subject. The difference
between global and altglobal is that the former uses the
start_offset argument to pcre2_match() or pcre2_dfa_match()
@@ -996,19 +1244,34 @@ Testing the substitution function
If the replace modifier is set, the pcre2_substitute() function is -called instead of one of the matching functions. Unlike subject strings, -pcre2test does not process replacement strings for escape sequences. In -UTF mode, a replacement string is checked to see if it is a valid UTF-8 string. -If so, it is correctly converted to a UTF string of the appropriate code unit -width. If it is not a valid UTF-8 string, the individual code units are copied -directly. This provides a means of passing an invalid UTF-8 string for testing -purposes. +called instead of one of the matching functions. Note that replacement strings +cannot contain commas, because a comma signifies the end of a modifier. This is +not thought to be an issue in a test program.
-If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to -pcre2_substitute(). After a successful substitution, the modified string -is output, preceded by the number of replacements. This may be zero if there -were no matches. Here is a simple example of a substitution test: +Unlike subject strings, pcre2test does not process replacement strings +for escape sequences. In UTF mode, a replacement string is checked to see if it +is a valid UTF-8 string. If so, it is correctly converted to a UTF string of +the appropriate code unit width. If it is not a valid UTF-8 string, the +individual code units are copied directly. This provides a means of passing an +invalid UTF-8 string for testing purposes. +
++The following modifiers set options (in addition to the normal match options) +for pcre2_substitute(): +
+ global PCRE2_SUBSTITUTE_GLOBAL + substitute_extended PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY + ++ +
+After a successful substitution, the modified string is output, preceded by the +number of replacements. This may be zero if there were no matches. Here is a +simple example of a substitution test:
/abc/replace=xxx
=abc=abc=
@@ -1016,12 +1279,12 @@ were no matches. Here is a simple example of a substitution test:
=abc=abc=\=global
2: =xxx=xxx=
-Subject and replacement strings should be kept relatively short for
-substitution tests, as fixed-size buffers are used. To make it easy to test for
-buffer overflow, if the replacement string starts with a number in square
-brackets, that number is passed to pcre2_substitute() as the size of the
-output buffer, with the replacement string starting at the next character. Here
-is an example that tests the edge case:
+Subject and replacement strings should be kept relatively short (fewer than 256
+characters) for substitution tests, as fixed-size buffers are used. To make it
+easy to test for buffer overflow, if the replacement string starts with a
+number in square brackets, that number is passed to pcre2_substitute() as
+the size of the output buffer, with the replacement string starting at the next
+character. Here is an example that tests the edge case:
/abc/
123abc123\=replace=[10]XYZ
@@ -1029,6 +1292,19 @@ is an example that tests the edge case:
123abc123\=replace=[9]XYZ
Failed: error -47: no more memory
+The default action of pcre2_substitute() is to return
+PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
+PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
+substitute_overflow_length modifier), pcre2_substitute() continues
+to go through the motions of matching and substituting, in order to compute the
+size of buffer that is required. When this happens, pcre2test shows the
+required buffer length (which includes space for the trailing zero) as part of
+the error message. For example:
++ /abc/substitute_overflow_length + 123abc123\=replace=[9]XYZ + Failed: error -47: no more memory: 10 code units are needed +A replacement string is ignored with POSIX and DFA matching. Specifying partial matching provokes an error return ("bad option value") from pcre2_substitute(). @@ -1100,6 +1376,16 @@ The offset modifier sets an offset in the subject string at which matching starts. Its value is a number of code units, not characters.
+The offset_limit modifier sets a limit for unanchored matches. If a match +cannot be found starting at or before this offset in the subject, a "no match" +return is given. The data value is a number of code units, not characters. When +this modifier is used, the use_offset_limit modifier must have been set +for the pattern; if not, an error is generated. +
+@@ -1131,6 +1417,17 @@ this modifier has no effect, as there is no facility for passing a length.) When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated.
++Normally, pcre2test passes a context block to pcre2_match(), +pcre2_dfa_match() or pcre2_jit_match(). If the null_context +modifier is set, however, NULL is passed. This is for testing that the matching +functions behave correctly in this case (they use default values). This +modifier cannot be used with the find_limits modifier or when testing the +substitution function. +
 By default, pcre2test uses the standard PCRE2 matching function, @@ -1196,7 +1493,7 @@ unset substring is shown as "<unset>", as for the second data line. If the strings contain any non-printing characters, they are output as \xhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the definition of non-printing -characters. If the /aftertext modifier is set, the output for substring +characters. If the aftertext modifier is set, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like this: 
@@ -1321,7 +1618,9 @@ item to be tested. For example: This output indicates that callout number 0 occurred for a match attempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \d. Just -one circumflex is output if the start and current positions are the same. +one circumflex is output if the start and current positions are the same, or if +the current position precedes the start position, which can happen if the +callout is in a lookbehind assertion.Callouts numbered 255 are assumed to be automatic callouts, inserted as a @@ -1387,7 +1686,7 @@ therefore shown as hex escapes.
When pcre2test is outputting text that is a matched part of a subject string, it behaves in the same way, unless a different locale has been set for -the pattern (using the /locale modifier). In this case, the +the pattern (using the locale modifier). In this case, the isprint() function is used to distinguish printing and non-printing characters.
@@ -1413,11 +1712,16 @@ can be used to test these functions.When a pattern with push modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and pcre2test expects the next line to -contain a new pattern (or command) instead of a subject line. By this means, a -number of patterns can be compiled and retained. The push modifier is -incompatible with posix, and control modifiers that act at match time are -ignored (with a message). The jitverify modifier applies only at compile -time. The command +contain a new pattern (or command) instead of a subject line. By contrast, +the pushcopy modifier causes a copy of the compiled pattern to be +stacked, leaving the original available for immediate matching. By using +push and/or pushcopy, a number of patterns can be compiled and +retained. These modifiers are incompatible with posix, and control +modifiers that act at match time are ignored (with a message) for the stacked +patterns. The jitverify modifier applies only at compile time. +
++The command
#save <filename>@@ -1434,7 +1738,8 @@ usual by an empty line or end of file. This command may be followed by a modifier list containing only control modifiers that act after a pattern has been compiled. In particular, hex, -posix, and push are not allowed, nor are any +posix, posix_nosub, push, and pushcopy are not allowed, +nor are any option-setting modifiers. The JIT modifiers are, however permitted. Here is an example that saves and reloads two patterns. @@ -1452,6 +1757,11 @@ reloads two patterns. If jitverify is used with #pop, it does not automatically imply jit, which is different behaviour from when it is used on a pattern. ++The #popcopy command is analogous to the pushcopy modifier in that it +makes current a copy of the topmost stack pattern, leaving the original still +on the stack. +
SEE ALSO
pcre2(3), pcre2api(3), pcre2callout(3), @@ -1469,9 +1779,9 @@ Cambridge, England.
REVISION
-Last updated: 20 May 2015 +Last updated: 28 December 2016
-Copyright © 1997-2015 University of Cambridge. +Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2unicode.html b/pcre2/doc/html/pcre2unicode.html index 22c1792d9..6ca367f4e 100644 --- a/pcre2/doc/html/pcre2unicode.html +++ b/pcre2/doc/html/pcre2unicode.html @@ -67,15 +67,20 @@ In UTF modes, the dot metacharacter matches one UTF character instead of a single code unit.
-The escape sequence \C can be used to match a single code unit, in a UTF mode, +The escape sequence \C can be used to match a single code unit in a UTF mode, but its use can lead to some strange effects because it breaks up multi-unit characters (see the description of \C in the pcre2pattern -documentation). The use of \C is not supported in the alternative matching -function pcre2_dfa_match(), nor is it supported in UTF mode by the JIT -optimization. If JIT optimization is requested for a UTF pattern that contains -\C, it will not succeed, and so the matching will be carried out by the normal -interpretive function. +documentation). +
++The use of \C is not supported by the alternative matching function +pcre2_dfa_match() when in UTF-8 or UTF-16 mode, that is, when a character +may consist of more than one code unit. The use of \C in these modes provokes +a match-time error. Also, the JIT optimization does not support \C in these +modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that +contains \C, it will not succeed, and so when pcre2_match() is called, +the matching will be carried out by the normal interpretive function.
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test @@ -126,11 +131,22 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order.
-The entire string is checked before any other processing takes place. In -addition to checking the format of the string, there is a check to ensure that -all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. -The so-called "non-character" code points are not excluded because Unicode -corrigendum #9 makes it clear that they should not be. +A UTF string is checked before any other processing takes place. In the case of +pcre2_match() and pcre2_dfa_match() calls with a non-zero starting +offset, the check is applied only to that part of the subject that could be +inspected during matching, and there is a check that the starting offset points +to the first code unit of a character or to the end of the subject. If there +are no lookbehind assertions in the pattern, the check starts at the starting +offset. Otherwise, it starts at the length of the longest lookbehind before the +starting offset, or at the start of the subject if there are not that many +characters before the starting offset. Note that the sequences \b and \B are +one-character lookbehinds. +
++In addition to checking the format of the string, there is a check to ensure +that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate +area. The so-called "non-character" code points are not excluded because +Unicode corrigendum #9 makes it clear that they should not be.
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, @@ -232,9 +248,9 @@ Errors in UTF-16 strings
The following negative error codes are given for invalid UTF-16 strings:
- PCRE_UTF16_ERR1 Missing low surrogate at end of string - PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate - PCRE_UTF16_ERR3 Isolated low surrogate + PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string + PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate@@ -244,8 +260,8 @@ Errors in UTF-32 stringsThe following negative error codes are given for invalid UTF-32 strings:
- PCRE_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) - PCRE_UTF32_ERR2 Code point is greater than 0x10ffff + PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) + PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff@@ -264,9 +280,9 @@ Cambridge, England. REVISION
-Last updated: 23 November 2014 +Last updated: 03 July 2016
-Copyright © 1997-2014 University of Cambridge. +Copyright © 1997-2016 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/index.html.src b/pcre2/doc/index.html.src index ae938fd3c..eebb80b77 100644 --- a/pcre2/doc/index.html.src +++ b/pcre2/doc/index.html.src @@ -91,6 +91,12 @@ in the library.
+ pcre2_callout_enumerate Enumerate callouts in a compiled pattern + + pcre2_code_copy +Copy a compiled pattern + pcre2_code_copy_with_tables +Copy a compiled pattern and its character tables @@ -210,9 +216,15 @@ in the library. pcre2_code_free Free a compiled pattern + pcre2_set_match_limit Set the match limit + pcre2_set_max_pattern_length +Set the maximum length of pattern + pcre2_set_newline Set the newline convention + pcre2_set_offset_limit +Set the offset limit diff --git a/pcre2/doc/pcre2.3 b/pcre2/doc/pcre2.3 index e315bbb60..9a84ce31e 100644 --- a/pcre2/doc/pcre2.3 +++ b/pcre2/doc/pcre2.3 @@ -1,4 +1,4 @@ -.TH PCRE2 3 "13 April 2015" "PCRE2 10.20" +.TH PCRE2 3 "16 October 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH INTRODUCTION @@ -118,8 +118,10 @@ running redundant checks. .P The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to problems, because it may leave the current matching point in the middle of a -multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to -lock out the use of \eC, causing a compile-time error if it is encountered. +multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an +application to lock out the use of \eC, causing a compile-time error if it is +encountered. It is also possible to build PCRE2 with the use of \eC permanently +disabled. .P Another way that performance can be hit is by running a pattern that has a very large search tree against a string that will never match. Nested unlimited @@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk. .rs .sp .nf -Last updated: 13 April 2015 +Last updated: 16 October 2015 Copyright (c) 1997-2015 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2.txt b/pcre2/doc/pcre2.txt index 3193d3054..650660957 100644 --- a/pcre2/doc/pcre2.txt +++ b/pcre2/doc/pcre2.txt @@ -104,26 +104,27 @@ SECURITY CONSIDERATIONS The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to problems, because it may leave the current matching point in the middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C - option can be used to lock out the use of \C, causing a compile-time - error if it is encountered. + option can be used by an application to lock out the use of \C, causing + a compile-time error if it is encountered. It is also possible to build + PCRE2 with the use of \C permanently disabled. - Another way that performance can be hit is by running a pattern that - has a very large search tree against a string that will never match. - Nested unlimited repeats in a pattern are a common example. PCRE2 pro- - vides some protection against this: see the pcre2_set_match_limit() + Another way that performance can be hit is by running a pattern that + has a very large search tree against a string that will never match. + Nested unlimited repeats in a pattern are a common example. PCRE2 pro- + vides some protection against this: see the pcre2_set_match_limit() function in the pcre2api page. USER DOCUMENTATION - The user documentation for PCRE2 comprises a number of different sec- - tions. In the "man" format, each of these is a separate "man page". In - the HTML format, each is a separate page, linked from the index page. - In the plain text format, the descriptions of the pcre2grep and + The user documentation for PCRE2 comprises a number of different sec- + tions. In the "man" format, each of these is a separate "man page". In + the HTML format, each is a separate page, linked from the index page. + In the plain text format, the descriptions of the pcre2grep and pcre2test programs are in files called pcre2grep.txt and pcre2test.txt, - respectively. 
The remaining sections, except for the pcre2demo section - (which is a program listing), and the short pages for individual func- - tions, are concatenated in pcre2.txt, for ease of searching. The sec- + respectively. The remaining sections, except for the pcre2demo section + (which is a program listing), and the short pages for individual func- + tions, are concatenated in pcre2.txt, for ease of searching. The sec- tions are as follows: pcre2 this document @@ -148,7 +149,7 @@ USER DOCUMENTATION pcre2test description of the pcre2test command pcre2unicode discussion of Unicode and UTF support - In the "man" and HTML formats, there is also a short page for each C + In the "man" and HTML formats, there is also a short page for each C library function, listing its arguments and results. @@ -158,14 +159,14 @@ AUTHOR University Computing Service Cambridge, England. - Putting an actual email address here is a spam magnet. If you want to - email me, use my two initials, followed by the two digits 10, at the + Putting an actual email address here is a spam magnet. If you want to + email me, use my two initials, followed by the two digits 10, at the domain cam.ac.uk. REVISION - Last updated: 13 April 2015 + Last updated: 16 October 2015 Copyright (c) 1997-2015 University of Cambridge. 
------------------------------------------------------------------------------ @@ -190,13 +191,13 @@ PCRE2 NATIVE API BASIC FUNCTIONS uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext); - pcre2_code_free(pcre2_code *code); + void pcre2_code_free(pcre2_code *code); - pcre2_match_data_create(uint32_t ovecsize, + pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); - pcre2_match_data_create_from_pattern(const pcre2_code *code, - pcre2_general_context *gcontext); + pcre2_match_data *pcre2_match_data_create_from_pattern( + const pcre2_code *code, pcre2_general_context *gcontext); int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -251,6 +252,9 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS int pcre2_set_character_tables(pcre2_compile_context *ccontext, const unsigned char *tables); + int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, + PCRE2_SIZE value); + int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value); @@ -278,6 +282,9 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value); + int pcre2_set_offset_limit(pcre2_match_context *mcontext, + PCRE2_SIZE value); + int pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t value); @@ -356,11 +363,11 @@ PCRE2 NATIVE API JIT FUNCTIONS PCRE2 NATIVE API SERIALIZATION FUNCTIONS int32_t pcre2_serialize_decode(pcre2_code **codes, - int32_t number_of_codes, const uint32_t *bytes, + int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext); - int32_t pcre2_serialize_encode(pcre2_code **codes, - int32_t number_of_codes, uint32_t **serialized_bytes, + int32_t pcre2_serialize_encode(const pcre2_code **codes, + int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); void pcre2_serialize_free(uint8_t *bytes); @@ 
-370,6 +377,10 @@ PCRE2 NATIVE API SERIALIZATION FUNCTIONS PCRE2 NATIVE API AUXILIARY FUNCTIONS + pcre2_code *pcre2_code_copy(const pcre2_code *code); + + pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); + int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen); @@ -480,10 +491,10 @@ PCRE2 API OVERVIEW pcre2_jit_stack_assign() in order to control the JIT code's memory usage. - JIT matching is automatically used by pcre2_match() if it is available. - There is also a direct interface for JIT matching, which gives improved - performance. The JIT-specific functions are discussed in the pcre2jit - documentation. + JIT matching is automatically used by pcre2_match() if it is available, + unless the PCRE2_NO_JIT option is set. There is also a direct interface + for JIT matching, which gives improved performance. The JIT-specific + functions are discussed in the pcre2jit documentation. A second matching function, pcre2_dfa_match(), which is not Perl-com- patible, is also provided. This uses a different algorithm for the @@ -516,76 +527,113 @@ PCRE2 API OVERVIEW return a copy of the subject string with substitutions for parts that were matched. - Finally, there are functions for finding out information about a com- - piled pattern (pcre2_pattern_info()) and about the configuration with + Functions whose names begin with pcre2_serialize_ are used for saving + compiled patterns on disc or elsewhere, and reloading them later. + + Finally, there are functions for finding out information about a com- + piled pattern (pcre2_pattern_info()) and about the configuration with which PCRE2 was built (pcre2_config()). + Functions with names ending with _free() are used for freeing memory + blocks of various sorts. In all cases, if one of these functions is + called with a NULL argument, it does nothing. + STRING LENGTHS AND OFFSETS - The PCRE2 API uses string lengths and offsets into strings of code - units in several places. 
These values are always of type PCRE2_SIZE, - which is an unsigned integer type, currently always defined as size_t. - The largest value that can be stored in such a type (that is - ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated - strings and unset offsets. Therefore, the longest string that can be + The PCRE2 API uses string lengths and offsets into strings of code + units in several places. These values are always of type PCRE2_SIZE, + which is an unsigned integer type, currently always defined as size_t. + The largest value that can be stored in such a type (that is + ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated + strings and unset offsets. Therefore, the longest string that can be handled is one less than this maximum. NEWLINES PCRE2 supports five different conventions for indicating line breaks in - strings: a single CR (carriage return) character, a single LF (line- + strings: a single CR (carriage return) character, a single LF (line- feed) character, the two-character sequence CRLF, any of the three pre- - ceding, or any Unicode newline sequence. The Unicode newline sequences - are the three just mentioned, plus the single characters VT (vertical + ceding, or any Unicode newline sequence. The Unicode newline sequences + are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). - Each of the first three conventions is used by at least one operating + Each of the first three conventions is used by at least one operating system as its standard newline sequence. When PCRE2 is built, a default - can be specified. The default default is LF, which is the Unix stan- - dard. However, the newline convention can be changed by an application + can be specified. The default default is LF, which is the Unix stan- + dard. 
However, the newline convention can be changed by an application when calling pcre2_compile(), or it can be specified by special text at the start of the pattern itself; this overrides any other settings. See the pcre2pattern page for details of the special character sequences. - In the PCRE2 documentation the word "newline" is used to mean "the + In the PCRE2 documentation the word "newline" is used to mean "the character or pair of characters that indicate a line break". The choice - of newline convention affects the handling of the dot, circumflex, and + of newline convention affects the handling of the dot, circumflex, and dollar metacharacters, the handling of #-comments in /x mode, and, when - CRLF is a recognized line ending sequence, the match position advance- + CRLF is a recognized line ending sequence, the match position advance- ment for a non-anchored pattern. There is more detail about this in the section on pcre2_match() options below. - The choice of newline convention does not affect the interpretation of + The choice of newline convention does not affect the interpretation of the \n or \r escape sequences, nor does it affect what \R matches; this has its own separate convention. MULTITHREADING - In a multithreaded application it is important to keep thread-specific - data separate from data that can be shared between threads. The PCRE2 - library code itself is thread-safe: it contains no static or global - variables. The API is designed to be fairly simple for non-threaded - applications while at the same time ensuring that multithreaded appli- + In a multithreaded application it is important to keep thread-specific + data separate from data that can be shared between threads. The PCRE2 + library code itself is thread-safe: it contains no static or global + variables. The API is designed to be fairly simple for non-threaded + applications while at the same time ensuring that multithreaded appli- cations can use it. 
There are several different blocks of data that are used to pass infor- mation between the application and the PCRE2 libraries. - (1) A pointer to the compiled form of a pattern is returned to the user - when pcre2_compile() is successful. The data in the compiled pattern is - fixed, and does not change when the pattern is matched. Therefore, it - is thread-safe, that is, the same compiled pattern can be used by more - than one thread simultaneously. An application can compile all its pat- - terns at the start, before forking off multiple threads that use them. - However, if the just-in-time optimization feature is being used, it - needs separate memory stack areas for each thread. See the pcre2jit - documentation for more details. + The compiled pattern - (2) The next section below introduces the idea of "contexts" in which + A pointer to the compiled form of a pattern is returned to the user + when pcre2_compile() is successful. The data in the compiled pattern is + fixed, and does not change when the pattern is matched. Therefore, it + is thread-safe, that is, the same compiled pattern can be used by more + than one thread simultaneously. For example, an application can compile + all its patterns at the start, before forking off multiple threads that + use them. However, if the just-in-time optimization feature is being + used, it needs separate memory stack areas for each thread. See the + pcre2jit documentation for more details. + + In a more complicated situation, where patterns are compiled only when + they are first needed, but are still shared between threads, pointers + to compiled patterns must be protected from simultaneous writing by + multiple threads, at least until a pattern has been compiled. The logic + can be something like this: + + Get a read-only (shared) lock (mutex) for pointer + if (pointer == NULL) + { + Get a write (unique) lock for pointer + pointer = pcre2_compile(... 
+ } + Release the lock + Use pointer in pcre2_match() + + Of course, testing for compilation errors should also be included in + the code. + + If JIT is being used, but the JIT compilation is not being done immedi- + ately, (perhaps waiting to see if the pattern is used often enough) + similar logic is required. JIT compilation updates a pointer within the + compiled code block, so a thread must gain unique write access to the + pointer before calling pcre2_jit_compile(). Alternatively, + pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to + obtain a private copy of the compiled code. + + Context blocks + + The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. Grouping a number of parameters together in a context is a convenient way of passing them to @@ -598,44 +646,45 @@ MULTITHREADING threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy. - (3) The matching functions need a block of memory for working space and - for storing the results of a match. This includes details of what was + Match blocks + + The matching functions need a block of memory for working space and for + storing the results of a match. This includes details of what was matched, as well as additional information such as the name of a - (*MARK) setting. Each thread must provide its own version of this mem- - ory. + (*MARK) setting. Each thread must provide its own copy of this memory. PCRE2 CONTEXTS - Some PCRE2 functions have a lot of parameters, many of which are used - only by specialist applications, for example, those that use custom - memory management or non-standard character tables. To keep function - argument lists at a reasonable size, and at the same time to keep the - API extensible, "uncommon" parameters are passed to certain functions - in a context instead of directly. 
A context is just a block of memory - that holds the parameter values. Applications that do not need to - adjust any of the context parameters can pass NULL when a context + Some PCRE2 functions have a lot of parameters, many of which are used + only by specialist applications, for example, those that use custom + memory management or non-standard character tables. To keep function + argument lists at a reasonable size, and at the same time to keep the + API extensible, "uncommon" parameters are passed to certain functions + in a context instead of directly. A context is just a block of memory + that holds the parameter values. Applications that do not need to + adjust any of the context parameters can pass NULL when a context pointer is required. - There are three different types of context: a general context that is - relevant for several PCRE2 operations, a compile-time context, and a + There are three different types of context: a general context that is + relevant for several PCRE2 operations, a compile-time context, and a match-time context. The general context - At present, this context just contains pointers to (and data for) - external memory management functions that are called from several + At present, this context just contains pointers to (and data for) + external memory management functions that are called from several places in the PCRE2 library. The context is named `general' rather than - specifically `memory' because in future other fields may be added. If - you do not want to supply your own custom memory management functions, - you do not need to bother with a general context. A general context is + specifically `memory' because in future other fields may be added. If + you do not want to supply your own custom memory management functions, + you do not need to bother with a general context. 
A general context is created by: pcre2_general_context *pcre2_general_context_create( void *(*private_malloc)(PCRE2_SIZE, void *), void (*private_free)(void *, void *), void *memory_data); - The two function pointers specify custom memory management functions, + The two function pointers specify custom memory management functions, whose prototypes are: void *private_malloc(PCRE2_SIZE, void *); @@ -643,16 +692,16 @@ PCRE2 CONTEXTS Whenever code in PCRE2 calls these functions, the final argument is the value of memory_data. Either of the first two arguments of the creation - function may be NULL, in which case the system memory management func- - tions malloc() and free() are used. (This is not currently useful, as - there are no other fields in a general context, but in future there - might be.) The private_malloc() function is used (if supplied) to - obtain memory for storing the context, and all three values are saved + function may be NULL, in which case the system memory management func- + tions malloc() and free() are used. (This is not currently useful, as + there are no other fields in a general context, but in future there + might be.) The private_malloc() function is used (if supplied) to + obtain memory for storing the context, and all three values are saved as part of the context. - Whenever PCRE2 creates a data block of any kind, the block contains a - pointer to the free() function that matches the malloc() function that - was used. When the time comes to free the block, this function is + Whenever PCRE2 creates a data block of any kind, the block contains a + pointer to the free() function that matches the malloc() function that + was used. When the time comes to free the block, this function is called. 
A general context can be copied by calling: @@ -667,20 +716,21 @@ PCRE2 CONTEXTS The compile context - A compile context is required if you want to change the default values + A compile context is required if you want to change the default values of any of the following compile-time parameters: What \R matches (Unicode newlines or CR, LF, CRLF only) PCRE2's character tables The newline character sequence The compile time nested parentheses limit + The maximum length of the pattern string An external function for stack checking - A compile context is also required if you are using custom memory man- - agement. If none of these apply, just pass NULL as the context argu- + A compile context is also required if you are using custom memory man- + agement. If none of these apply, just pass NULL as the context argu- ment of pcre2_compile(). - A compile context is created, copied, and freed by the following func- + A compile context is created, copied, and freed by the following func- tions: pcre2_compile_context *pcre2_compile_context_create( @@ -691,26 +741,36 @@ PCRE2 CONTEXTS void pcre2_compile_context_free(pcre2_compile_context *ccontext); - A compile context is created with default values for its parameters. + A compile context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. int pcre2_set_bsr(pcre2_compile_context *ccontext, uint32_t value); - The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only - CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any + The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only + CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line ending sequence. 
The value is used by the JIT compiler and - by the two interpreted matching functions, pcre2_match() and + by the two interpreted matching functions, pcre2_match() and pcre2_dfa_match(). int pcre2_set_character_tables(pcre2_compile_context *ccontext, const unsigned char *tables); - The value must be the result of a call to pcre2_maketables(), whose + The value must be the result of a call to pcre2_maketables(), whose only argument is a general context. This function builds a set of char- acter tables in the current locale. + int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, + PCRE2_SIZE value); + + This sets a maximum length, in code units, for the pattern string that + is to be compiled. If the pattern is longer, an error is generated. + This facility is provided so that applications that accept patterns + from external sources can limit their size. The default is the largest + number that a PCRE2_SIZE variable can hold, which is effectively unlim- + ited. + int pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t value); @@ -731,7 +791,9 @@ PCRE2 CONTEXTS This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops - rogue patterns using up too much system stack when being compiled. + rogue patterns using up too much system stack when being compiled. The + limit applies to parentheses of all kinds, not just capturing parenthe- + ses. 
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, int (*guard_function)(uint32_t, void *), void *user_data); @@ -755,7 +817,8 @@ PCRE2 CONTEXTS any of the following match-time parameters: A callout function - The limit for calling match() + The offset limit for matching an unanchored pattern + The limit for calling match() (see below) The limit for calling match() recursively A match context is also required if you are using custom memory manage- @@ -785,6 +848,31 @@ PCRE2 CONTEXTS points during a matching operation. Details are given in the pcre2call- out documentation. + int pcre2_set_offset_limit(pcre2_match_context *mcontext, + PCRE2_SIZE value); + + The offset_limit parameter limits how far an unanchored search can + advance in the subject string. The default value is PCRE2_UNSET. The + pcre2_match() and pcre2_dfa_match() functions return + PCRE2_ERROR_NOMATCH if a match with a starting point before or at the + given offset is not found. For example, if the pattern /abc/ is matched + against "123abc" with an offset limit less than 3, the result is + PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset + argument of pcre2_match() or pcre2_dfa_match() is greater than the off- + set limit. + + When using this facility, you must set PCRE2_USE_OFFSET_LIMIT when + calling pcre2_compile() so that when JIT is in use, different code can + be compiled. If a match is started with a non-default offset limit when + PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. + + The offset limit facility can be used to track progress when searching + large subject strings. See also the PCRE2_FIRSTLINE option, which + requires a match to start within the first line of the subject. If this + is set with an offset limit, a match must occur in the first line and + also within the offset limit. In other words, whichever limit comes + first is used. 
+ int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value); @@ -834,19 +922,22 @@ PCRE2 CONTEXTS be used, or, when PCRE2 has been compiled to use memory on the heap instead of the stack, the amount of heap memory that can be used. This limit is not relevant, and is ignored, when matching is done using JIT - compiled code or by the pcre2_dfa_match() function. + compiled code. However, it is supported by pcre2_dfa_match(), which + uses recursive function calls less frequently than pcre2_match(), but + which can be caused to use a lot of stack by a recursive pattern such + as /(.)(?1)/ matched to a very long string. - The default value for recursion_limit can be set when PCRE2 is built; - the default default is the same value as the default for match_limit. - If the limit is exceeded, pcre2_match() returns PCRE2_ERROR_RECURSION- - LIMIT. A value for the recursion limit may also be supplied by an item - at the start of a pattern of the form + The default value for recursion_limit can be set when PCRE2 is built; + the default default is the same value as the default for match_limit. + If the limit is exceeded, pcre2_match() and pcre2_dfa_match() return + PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be + supplied by an item at the start of a pattern of the form (*LIMIT_RECURSION=ddd) - where ddd is a decimal number. However, such a setting is ignored - unless ddd is less than the limit set by the caller of pcre2_match() - or, if no such limit is set, less than the default. + where ddd is a decimal number. However, such a setting is ignored + unless ddd is less than the limit set by the caller of pcre2_match() or + pcre2_dfa_match() or, if no such limit is set, less than the default. 
int pcre2_set_recursion_memory_management( pcre2_match_context *mcontext, @@ -854,21 +945,21 @@ PCRE2 CONTEXTS void (*private_free)(void *, void *), void *memory_data); This function sets up two additional custom memory management functions - for use by pcre2_match() when PCRE2 is compiled to use the heap for + for use by pcre2_match() when PCRE2 is compiled to use the heap for remembering backtracking data, instead of recursive function calls that - use the system stack. There is a discussion about PCRE2's stack usage - in the pcre2stack documentation. See the pcre2build documentation for + use the system stack. There is a discussion about PCRE2's stack usage + in the pcre2stack documentation. See the pcre2build documentation for details of how to build PCRE2. - Using the heap for recursion is a non-standard way of building PCRE2, - for use in environments that have limited stacks. Because of the + Using the heap for recursion is a non-standard way of building PCRE2, + for use in environments that have limited stacks. Because of the greater use of memory management, pcre2_match() runs more slowly. Func- - tions that are different to the general custom memory functions are - provided so that special-purpose external code can be used for this - case, because the memory blocks are all the same size. The blocks are + tions that are different to the general custom memory functions are + provided so that special-purpose external code can be used for this + case, because the memory blocks are all the same size. The blocks are retained by pcre2_match() until it is about to exit so that they can be - re-used when possible during the match. In the absence of these func- - tions, the normal custom memory management functions are used, if sup- + re-used when possible during the match. In the absence of these func- + tions, the normal custom memory management functions are used, if sup- plied, otherwise the system functions. 
@@ -876,75 +967,75 @@ CHECKING BUILD-TIME OPTIONS int pcre2_config(uint32_t what, void *where); - The function pcre2_config() makes it possible for a PCRE2 client to - discover which optional features have been compiled into the PCRE2 - library. The pcre2build documentation has more details about these + The function pcre2_config() makes it possible for a PCRE2 client to + discover which optional features have been compiled into the PCRE2 + library. The pcre2build documentation has more details about these optional features. - The first argument for pcre2_config() specifies which information is - required. The second argument is a pointer to memory into which the - information is placed. If NULL is passed, the function returns the - amount of memory that is needed for the requested information. For - calls that return numerical values, the value is in bytes; when - requesting these values, where should point to appropriately aligned - memory. For calls that return strings, the required length is given in + The first argument for pcre2_config() specifies which information is + required. The second argument is a pointer to memory into which the + information is placed. If NULL is passed, the function returns the + amount of memory that is needed for the requested information. For + calls that return numerical values, the value is in bytes; when + requesting these values, where should point to appropriately aligned + memory. For calls that return strings, the required length is given in code units, not counting the terminating zero. - When requesting information, the returned value from pcre2_config() is - non-negative on success, or the negative error code PCRE2_ERROR_BADOP- - TION if the value in the first argument is not recognized. The follow- + When requesting information, the returned value from pcre2_config() is + non-negative on success, or the negative error code PCRE2_ERROR_BADOP- + TION if the value in the first argument is not recognized. 
The follow- ing information is available: PCRE2_CONFIG_BSR - The output is a uint32_t integer whose value indicates what character - sequences the \R escape sequence matches by default. A value of + The output is a uint32_t integer whose value indicates what character + sequences the \R escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line ending - sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, + sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The default can be overridden when a pattern is compiled. PCRE2_CONFIG_JIT - The output is a uint32_t integer that is set to one if support for + The output is a uint32_t integer that is set to one if support for just-in-time compiling is available; otherwise it is set to zero. PCRE2_CONFIG_JITTARGET - The where argument should point to a buffer that is at least 48 code - units long. (The exact length required can be found by calling - pcre2_config() with where set to NULL.) The buffer is filled with a - string that contains the name of the architecture for which the JIT - compiler is configured, for example "x86 32bit (little endian + - unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is - returned, otherwise the number of code units used is returned. This is + The where argument should point to a buffer that is at least 48 code + units long. (The exact length required can be found by calling + pcre2_config() with where set to NULL.) The buffer is filled with a + string that contains the name of the architecture for which the JIT + compiler is configured, for example "x86 32bit (little endian + + unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is + returned, otherwise the number of code units used is returned. This is the length of the string, plus one unit for the terminating zero. 
PCRE2_CONFIG_LINKSIZE The output is a uint32_t integer that contains the number of bytes used - for internal linkage in compiled regular expressions. When PCRE2 is - configured, the value can be set to 2, 3, or 4, with the default being - 2. This is the value that is returned by pcre2_config(). However, when - the 16-bit library is compiled, a value of 3 is rounded up to 4, and - when the 32-bit library is compiled, internal linkages always use 4 + for internal linkage in compiled regular expressions. When PCRE2 is + configured, the value can be set to 2, 3, or 4, with the default being + 2. This is the value that is returned by pcre2_config(). However, when + the 16-bit library is compiled, a value of 3 is rounded up to 4, and + when the 32-bit library is compiled, internal linkages always use 4 bytes, so the configured value is not relevant. The default value of 2 for the 8-bit and 16-bit libraries is sufficient - for all but the most massive patterns, since it allows the size of the + for all but the most massive patterns, since it allows the size of the compiled pattern to be up to 64K code units. Larger values allow larger - regular expressions to be compiled by those two libraries, but at the + regular expressions to be compiled by those two libraries, but at the expense of slower matching. PCRE2_CONFIG_MATCHLIMIT - The output is a uint32_t integer that gives the default limit for the - number of internal matching function calls in a pcre2_match() execu- + The output is a uint32_t integer that gives the default limit for the + number of internal matching function calls in a pcre2_match() execu- tion. Further details are given with pcre2_match() below. PCRE2_CONFIG_NEWLINE - The output is a uint32_t integer whose value specifies the default - character sequence that is recognized as meaning "newline". The values + The output is a uint32_t integer whose value specifies the default + character sequence that is recognized as meaning "newline". 
The values are: PCRE2_NEWLINE_CR Carriage return (CR) @@ -953,56 +1044,56 @@ CHECKING BUILD-TIME OPTIONS PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF - The default should normally correspond to the standard sequence for + The default should normally correspond to the standard sequence for your operating system. PCRE2_CONFIG_PARENSLIMIT - The output is a uint32_t integer that gives the maximum depth of nest- + The output is a uint32_t integer that gives the maximum depth of nest- ing of parentheses (of any kind) in a pattern. This limit is imposed to - cap the amount of system stack used when a pattern is compiled. It is - specified when PCRE2 is built; the default is 250. This limit does not - take into account the stack that may already be used by the calling - application. For finer control over compilation stack usage, see + cap the amount of system stack used when a pattern is compiled. It is + specified when PCRE2 is built; the default is 250. This limit does not + take into account the stack that may already be used by the calling + application. For finer control over compilation stack usage, see pcre2_set_compile_recursion_guard(). PCRE2_CONFIG_RECURSIONLIMIT - The output is a uint32_t integer that gives the default limit for the - depth of recursion when calling the internal matching function in a - pcre2_match() execution. Further details are given with pcre2_match() + The output is a uint32_t integer that gives the default limit for the + depth of recursion when calling the internal matching function in a + pcre2_match() execution. Further details are given with pcre2_match() below. PCRE2_CONFIG_STACKRECURSE - The output is a uint32_t integer that is set to one if internal recur- - sion when running pcre2_match() is implemented by recursive function - calls that use the system stack to remember their state. This is the - usual way that PCRE2 is compiled. 
The output is zero if PCRE2 was com- - piled to use blocks of data on the heap instead of recursive function + The output is a uint32_t integer that is set to one if internal recur- + sion when running pcre2_match() is implemented by recursive function + calls that use the system stack to remember their state. This is the + usual way that PCRE2 is compiled. The output is zero if PCRE2 was com- + piled to use blocks of data on the heap instead of recursive function calls. PCRE2_CONFIG_UNICODE_VERSION - The where argument should point to a buffer that is at least 24 code - units long. (The exact length required can be found by calling - pcre2_config() with where set to NULL.) If PCRE2 has been compiled - without Unicode support, the buffer is filled with the text "Unicode - not supported". Otherwise, the Unicode version string (for example, - "7.0.0") is inserted. The number of code units used is returned. This + The where argument should point to a buffer that is at least 24 code + units long. (The exact length required can be found by calling + pcre2_config() with where set to NULL.) If PCRE2 has been compiled + without Unicode support, the buffer is filled with the text "Unicode + not supported". Otherwise, the Unicode version string (for example, + "8.0.0") is inserted. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. PCRE2_CONFIG_UNICODE - The output is a uint32_t integer that is set to one if Unicode support - is available; otherwise it is set to zero. Unicode support implies UTF + The output is a uint32_t integer that is set to one if Unicode support + is available; otherwise it is set to zero. Unicode support implies UTF support. PCRE2_CONFIG_VERSION - The where argument should point to a buffer that is at least 12 code - units long. (The exact length required can be found by calling - pcre2_config() with where set to NULL.) 
The buffer is filled with the + The where argument should point to a buffer that is at least 12 code + units long. (The exact length required can be found by calling + pcre2_config() with where set to NULL.) The buffer is filled with the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the termi- nating zero. @@ -1014,34 +1105,59 @@ COMPILING A PATTERN uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext); - pcre2_code_free(pcre2_code *code); + void pcre2_code_free(pcre2_code *code); - The pcre2_compile() function compiles a pattern into an internal form. - The pattern is defined by a pointer to a string of code units and a - length, If the pattern is zero-terminated, the length can be specified - as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of - memory that contains the compiled pattern and related data. The caller - must free the memory by calling pcre2_code_free() when it is no longer - needed. + pcre2_code *pcre2_code_copy(const pcre2_code *code); - NOTE: When one of the matching functions is called, pointers to the + pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); + + The pcre2_compile() function compiles a pattern into an internal form. + The pattern is defined by a pointer to a string of code units and a + length. If the pattern is zero-terminated, the length can be specified + as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of + memory that contains the compiled pattern and related data, or NULL if + an error occurred. + + If the compile context argument ccontext is NULL, memory for the com- + piled pattern is obtained by calling malloc(). Otherwise, it is + obtained from the same memory function that was used for the compile + context. The caller must free the memory by calling pcre2_code_free() + when it is no longer needed. 
+ + The function pcre2_code_copy() makes a copy of the compiled code in new + memory, using the same memory allocator as was used for the original. + However, if the code has been processed by the JIT compiler (see + below), the JIT information cannot be copied (because it is position- + dependent). The new copy can initially be used only for non-JIT match- + ing, though it can be passed to pcre2_jit_compile() if required. + + The pcre2_code_copy() function provides a way for individual threads in + a multithreaded application to acquire a private copy of shared com- + piled code. However, it does not make a copy of the character tables + used by the compiled pattern; the new pattern code points to the same + tables as the original code. (See "Locale Support" below for details + of these character tables.) In many applications the same tables are + used throughout, so this behaviour is appropriate. Nevertheless, there + are occasions when a copy of a compiled pattern and the relevant tables + are needed. The pcre2_code_copy_with_tables() provides this facility. + Copies of both the code and the tables are made, with the new code + pointing to the new tables. The memory for the new tables is automati- + cally freed when pcre2_code_free() is called for the new copy of the + compiled code. + + NOTE: When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block - so that they can be referenced by the extraction functions. After run- - ning a match, you must not free a compiled pattern (or a subject - string) until after all operations on the match data block have taken - place. + so that they can be referenced by the substring extraction functions. + After running a match, you must not free a compiled pattern (or a sub- + ject string) until after all operations on the match data block have + taken place. 
- If the compile context argument ccontext is NULL, memory for the com- - piled pattern is obtained by calling malloc(). Otherwise, it is - obtained from the same memory function that was used for the compile - context. - - The options argument contains various bit settings that affect the com- - pilation. It should be zero if no options are required. The available - options are described below. Some of them (in particular, those that - are compatible with Perl, but some others as well) can also be set and - unset from within the pattern (see the detailed description in the - pcre2pattern documentation). + The options argument for pcre2_compile() contains various bit settings + that affect the compilation. It should be zero if no options are + required. The available options are described below. Some of them (in + particular, those that are compatible with Perl, but some others as + well) can also be set and unset from within the pattern (see the + detailed description in the pcre2pattern documentation). For those options that can be different in different parts of the pat- tern, the contents of the options argument specifies their settings at @@ -1053,13 +1169,23 @@ COMPILING A PATTERN above). If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme- - diately. Otherwise, if compilation of a pattern fails, pcre2_compile() - returns NULL, having set these variables to an error code and an offset - (number of code units) within the pattern, respectively. The - pcre2_get_error_message() function provides a textual message for each - error code. Compilation errors are positive numbers, but UTF formatting - errors are negative numbers. For an invalid UTF-8 or UTF-16 string, the - offset is that of the first code unit of the failing character. + diately. 
Otherwise, the variables to which these point are set to an + error code and an offset (number of code units) within the pattern, + respectively, when pcre2_compile() returns NULL because a compilation + error has occurred. The values are not defined when compilation is suc- + cessful and pcre2_compile() returns a non-NULL value. + + The value returned in erroroffset is an indication of where in the pat- + tern the error occurred. It is not necessarily the furthest point in + the pattern that was read. For example, after the error "lookbehind + assertion is not fixed length", the error offset points to the start of + the failing assertion. + + The pcre2_get_error_message() function (see "Obtaining a textual error + message" below) provides a textual message for each error code. Compi- + lation errors have positive error codes; UTF formatting error codes are + negative. For an invalid UTF-8 or UTF-16 string, the offset is that of + the first code unit of the failing character. Some errors are not detected until the whole pattern has been scanned; in these cases, the offset passed back is the length of the pattern. @@ -1128,11 +1254,25 @@ COMPILING A PATTERN Perl. If you want a multiline circumflex also to match after a termi- nating newline, you must set PCRE2_ALT_CIRCUMFLEX. + PCRE2_ALT_VERBNAMES + + By default, for compatibility with Perl, the name in any verb sequence + such as (*MARK:NAME) is any sequence of characters that does not + include a closing parenthesis. The name is not processed in any way, + and it is not possible to include a closing parenthesis in the name. + However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash + processing is applied to verb names and only an unescaped closing + parenthesis terminates the name. A closing parenthesis can be included + in a name either as \) or between \Q and \E. 
If the PCRE2_EXTENDED + option is set, unescaped whitespace in verb names is skipped and #-com- + ments are recognized, exactly as in the rest of the pattern. + PCRE2_AUTO_CALLOUT - If this bit is set, pcre2_compile() automatically inserts callout - items, all with number 255, before each pattern item. For discussion of - the callout facility, see the pcre2callout documentation. + If this bit is set, pcre2_compile() automatically inserts callout + items, all with number 255, before each pattern item, except immedi- + ately before or after a callout in the pattern. For discussion of the + callout facility, see the pcre2callout documentation. PCRE2_CASELESS @@ -1196,7 +1336,11 @@ COMPILING A PATTERN If this option is set, an unanchored pattern is required to match before or at the first newline in the subject string, though the - matched text may continue over the newline. + matched text may continue over the newline. See also PCRE2_USE_OFF- + SET_LIMIT, which provides a more general limiting facility. If + PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the + first line and also within the offset limit. In other words, whichever + limit comes first is used. PCRE2_MATCH_UNSET_BACKREF @@ -1236,7 +1380,9 @@ COMPILING A PATTERN piled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in - applications that process patterns from external sources. + applications that process patterns from external sources. Note that + there is also a build-time option that permanently locks out the use of + \C. PCRE2_NEVER_UCP @@ -1263,118 +1409,130 @@ COMPILING A PATTERN theses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). - There is no equivalent of this option in Perl. 
+ There is no equivalent of this option in Perl. Note that, if this + option is set, references to capturing groups (back references or + recursion/subroutine calls) may only refer to named groups, though the + reference can be by name or by number. PCRE2_NO_AUTO_POSSESS If this option is set, it disables "auto-possessification", which is an - optimization that, for example, turns a+b into a++b in order to avoid - backtracks into a+ that can never be successful. However, if callouts - are in use, auto-possessification means that some callouts are never + optimization that, for example, turns a+b into a++b in order to avoid + backtracks into a+ that can never be successful. However, if callouts + are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do - a full unoptimized search and run all the callouts, but it is mainly + a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. PCRE2_NO_DOTSTAR_ANCHOR If this option is set, it disables an optimization that is applied when - .* is the first significant item in a top-level branch of a pattern, - and all the other branches also start with .* or with \A or \G or ^. - The optimization is automatically disabled for .* if it is inside an - atomic group or a capturing group that is the subject of a back refer- - ence, or if the pattern contains (*PRUNE) or (*SKIP). When the opti- - mization is not disabled, such a pattern is automatically anchored if + .* is the first significant item in a top-level branch of a pattern, + and all the other branches also start with .* or with \A or \G or ^. + The optimization is automatically disabled for .* if it is inside an + atomic group or a capturing group that is the subject of a back refer- + ence, or if the pattern contains (*PRUNE) or (*SKIP). 
When the opti- + mization is not disabled, such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set - for any ^ items. Otherwise, the fact that any match must start either - at the start of the subject or following a newline is remembered. Like + for any ^ items. Otherwise, the fact that any match must start either + at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. PCRE2_NO_START_OPTIMIZE - This is an option whose main effect is at matching time. It does not + This is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT compiler. - There are a number of optimizations that may occur at the start of a - match, in order to speed up the process. For example, if it is known - that an unanchored match must start with a specific character, the - matching code searches the subject for that character, and fails imme- - diately if it cannot find it, without actually running the main match- - ing function. This means that a special item such as (*COMMIT) at the - start of a pattern is not considered until after a suitable starting - point for the match has been found. Also, when callouts or (*MARK) - items are in use, these "start-up" optimizations can cause them to be - skipped if the pattern is never actually used. The start-up optimiza- - tions are in effect a pre-scan of the subject that takes place before + There are a number of optimizations that may occur at the start of a + match, in order to speed up the process. For example, if it is known + that an unanchored match must start with a specific character, the + matching code searches the subject for that character, and fails imme- + diately if it cannot find it, without actually running the main match- + ing function. 
This means that a special item such as (*COMMIT) at the + start of a pattern is not considered until after a suitable starting + point for the match has been found. Also, when callouts or (*MARK) + items are in use, these "start-up" optimizations can cause them to be + skipped if the pattern is never actually used. The start-up optimiza- + tions are in effect a pre-scan of the subject that takes place before the pattern is run. The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, - possibly causing performance to suffer, but ensuring that in cases - where the result is "no match", the callouts do occur, and that items + possibly causing performance to suffer, but ensuring that in cases + where the result is "no match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are considered at every possible starting position in the subject string. - Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching + Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. Consider the pattern (*COMMIT)ABC - When this is compiled, PCRE2 records the fact that a match must start - with the character "A". Suppose the subject string is "DEFABC". The - start-up optimization scans along the subject, finds "A" and runs the - first match attempt from there. The (*COMMIT) item means that the pat- - tern must match the current starting position, which in this case, it - does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE - set, the initial scan along the subject string does not happen. The - first match attempt is run starting from "D" and when this fails, - (*COMMIT) prevents any further matches being tried, so the overall + When this is compiled, PCRE2 records the fact that a match must start + with the character "A". Suppose the subject string is "DEFABC". The + start-up optimization scans along the subject, finds "A" and runs the + first match attempt from there. 
The (*COMMIT) item means that the pat- + tern must match the current starting position, which in this case, it + does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE + set, the initial scan along the subject string does not happen. The + first match attempt is run starting from "D" and when this fails, + (*COMMIT) prevents any further matches being tried, so the overall result is "no match". There are also other start-up optimizations. For example, a minimum length for the subject may be recorded. Consider the pattern (*MARK:A)(X|Y) - The minimum length for a match is one character. If the subject is + The minimum length for a match is one character. If the subject is "ABC", there will be attempts to match "ABC", "BC", and "C". An attempt to match an empty string at the end of the subject does not take place, - because PCRE2 knows that the subject is now too short, and so the - (*MARK) is never encountered. In this case, the optimization does not + because PCRE2 knows that the subject is now too short, and so the + (*MARK) is never encountered. In this case, the optimization does not affect the overall match result, which is still "no match", but it does affect the auxiliary information that is returned. PCRE2_NO_UTF_CHECK - When PCRE2_UTF is set, the validity of the pattern as a UTF string is - automatically checked. There are discussions about the validity of - UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode + When PCRE2_UTF is set, the validity of the pattern as a UTF string is + automatically checked. There are discussions about the validity of + UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode document. If an invalid UTF sequence is found, pcre2_compile() returns a negative error code. If you know that your pattern is valid, and you want to skip this check - for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. 
- When it is set, the effect of passing an invalid UTF string as a pat- - tern is undefined. It may cause your program to crash or loop. Note - that this option can also be passed to pcre2_match() and + for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. + When it is set, the effect of passing an invalid UTF string as a pat- + tern is undefined. It may cause your program to crash or loop. Note + that this option can also be passed to pcre2_match() and pcre_dfa_match(), to suppress validity checking of the subject string. PCRE2_UCP This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W, - \w, and some of the POSIX character classes. By default, only ASCII - characters are recognized, but if PCRE2_UCP is set, Unicode properties - are used instead to classify characters. More details are given in the + \w, and some of the POSIX character classes. By default, only ASCII + characters are recognized, but if PCRE2_UCP is set, Unicode properties + are used instead to classify characters. More details are given in the section on generic character types in the pcre2pattern page. If you set - PCRE2_UCP, matching one of the items it affects takes much longer. The - option is available only if PCRE2 has been compiled with Unicode sup- + PCRE2_UCP, matching one of the items it affects takes much longer. The + option is available only if PCRE2 has been compiled with Unicode sup- port. PCRE2_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. 
+ PCRE2_USE_OFFSET_LIMIT + + This option must be set for pcre2_compile() if pcre2_set_offset_limit() + is going to be used to set a non-default offset limit in a match con- + text for matches that use this pattern. An error is generated if an + offset limit is set without this option. For more details, see the + description of pcre2_set_offset_limit() in the section that describes + match contexts. See also the PCRE2_FIRSTLINE option above. + PCRE2_UTF This option causes PCRE2 to regard both the pattern and the subject @@ -1389,11 +1547,12 @@ COMPILING A PATTERN COMPILATION ERROR CODES There are over 80 positive error codes that pcre2_compile() may return - if it finds an error in the pattern. There are also some negative error - codes that are used for invalid UTF strings. These are the same as - given by pcre2_match() and pcre2_dfa_match(), and are described in the - pcre2unicode page. The pcre2_get_error_message() function can be called - to obtain a textual error message from any error code. + (via errorcode) if it finds an error in the pattern. There are also + some negative error codes that are used for invalid UTF strings. These + are the same as given by pcre2_match() and pcre2_dfa_match(), and are + described in the pcre2unicode page. The pcre2_get_error_message() func- + tion (see "Obtaining a textual error message" below) can be called to + obtain a textual error message from any error code. JUST-IN-TIME (JIT) COMPILATION @@ -1415,53 +1574,53 @@ JUST-IN-TIME (JIT) COMPILATION void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); - These functions provide support for JIT compilation, which, if the - just-in-time compiler is available, further processes a compiled pat- + These functions provide support for JIT compilation, which, if the + just-in-time compiler is available, further processes a compiled pat- tern into machine code that executes much faster than the pcre2_match() - interpretive matching function. 
Full details are given in the pcre2jit + interpretive matching function. Full details are given in the pcre2jit documentation. - JIT compilation is a heavyweight optimization. It can take some time - for patterns to be analyzed, and for one-off matches and simple pat- - terns the benefit of faster execution might be offset by a much slower - compilation time. Most, but not all patterns can be optimized by the + JIT compilation is a heavyweight optimization. It can take some time + for patterns to be analyzed, and for one-off matches and simple pat- + terns the benefit of faster execution might be offset by a much slower + compilation time. Most, but not all patterns can be optimized by the JIT compiler. LOCALE SUPPORT - PCRE2 handles caseless matching, and determines whether characters are - letters, digits, or whatever, by reference to a set of tables, indexed - by character code point. This applies only to characters whose code - points are less than 256. By default, higher-valued code points never - match escapes such as \w or \d. However, if PCRE2 is built with UTF - support, all characters can be tested with \p and \P, or, alterna- - tively, the PCRE2_UCP option can be set when a pattern is compiled; - this causes \w and friends to use Unicode property support instead of + PCRE2 handles caseless matching, and determines whether characters are + letters, digits, or whatever, by reference to a set of tables, indexed + by character code point. This applies only to characters whose code + points are less than 256. By default, higher-valued code points never + match escapes such as \w or \d. However, if PCRE2 is built with UTF + support, all characters can be tested with \p and \P, or, alterna- + tively, the PCRE2_UCP option can be set when a pattern is compiled; + this causes \w and friends to use Unicode property support instead of the built-in tables. - The use of locales with Unicode is discouraged. 
If you are handling - characters with code points greater than 128, you should either use + The use of locales with Unicode is discouraged. If you are handling + characters with code points greater than 128, you should either use Unicode support, or use locales, but not try to mix the two. - PCRE2 contains an internal set of character tables that are used by - default. These are sufficient for many applications. Normally, the + PCRE2 contains an internal set of character tables that are used by + default. These are sufficient for many applications. Normally, the internal tables recognize only ASCII characters. However, when PCRE2 is built, it is possible to cause the internal tables to be rebuilt in the default "C" locale of the local system, which may cause them to be dif- ferent. - The internal tables can be overridden by tables supplied by the appli- - cation that calls PCRE2. These may be created in a different locale - from the default. As more and more applications change to using Uni- + The internal tables can be overridden by tables supplied by the appli- + cation that calls PCRE2. These may be created in a different locale + from the default. As more and more applications change to using Uni- code, the need for this locale support is expected to die away. - External tables are built by calling the pcre2_maketables() function, - in the relevant locale. The result can be passed to pcre2_compile() as - often as necessary, by creating a compile context and calling - pcre2_set_character_tables() to set the tables pointer therein. For - example, to build and use tables that are appropriate for the French - locale (where accented characters with values greater than 128 are + External tables are built by calling the pcre2_maketables() function, + in the relevant locale. The result can be passed to pcre2_compile() as + often as necessary, by creating a compile context and calling + pcre2_set_character_tables() to set the tables pointer therein. 
For + example, to build and use tables that are appropriate for the French + locale (where accented characters with values greater than 128 are treated as letters), the following code could be used: setlocale(LC_CTYPE, "fr_FR"); @@ -1470,15 +1629,15 @@ LOCALE SUPPORT pcre2_set_character_tables(ccontext, tables); re = pcre2_compile(..., ccontext); - The locale name "fr_FR" is used on Linux and other Unix-like systems; - if you are using Windows, the name for the French locale is "french". - It is the caller's responsibility to ensure that the memory containing + The locale name "fr_FR" is used on Linux and other Unix-like systems; + if you are using Windows, the name for the French locale is "french". + It is the caller's responsibility to ensure that the memory containing the tables remains available for as long as it is needed. The pointer that is passed (via the compile context) to pcre2_compile() - is saved with the compiled pattern, and the same tables are used by - pcre2_match() and pcre_dfa_match(). Thus, for any single pattern, com- - pilation, and matching all happen in the same locale, but different + is saved with the compiled pattern, and the same tables are used by + pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com- + pilation, and matching all happen in the same locale, but different patterns can be processed in different locales. @@ -1486,13 +1645,13 @@ INFORMATION ABOUT A COMPILED PATTERN int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where); - The pcre2_pattern_info() function returns general information about a + The pcre2_pattern_info() function returns general information about a compiled pattern. For information about callouts, see the next section. - The first argument for pcre2_pattern_info() is a pointer to the com- + The first argument for pcre2_pattern_info() is a pointer to the com- piled pattern. 
The second argument specifies which piece of information - is required, and the third argument is a pointer to a variable to - receive the data. If the third argument is NULL, the first argument is - ignored, and the function returns the size in bytes of the variable + is required, and the third argument is a pointer to a variable to + receive the data. If the third argument is NULL, the first argument is + ignored, and the function returns the size in bytes of the variable that is required for the information requested. Otherwise, The yield of the function is zero for success, or one of the following negative num- bers: @@ -1502,9 +1661,9 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_ERROR_BADOPTION the value of what was invalid PCRE2_ERROR_UNSET the requested field is not set - The "magic number" is placed at the start of each compiled pattern as - an simple check against passing an arbitrary memory pointer. Here is a - typical call of pcre2_pattern_info(), to obtain the length of the com- + The "magic number" is placed at the start of each compiled pattern as + a simple check against passing an arbitrary memory pointer. Here is a + typical call of pcre2_pattern_info(), to obtain the length of the com- piled pattern: int rc; @@ -1521,14 +1680,17 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_INFO_ARGOPTIONS Return a copy of the pattern's options. The third argument should point - to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the - options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP- - TIONS returns the compile options as modified by any top-level option - settings at the start of the pattern itself. In other words, they are - the options that will be in force when matching starts. For example, if - the pattern /(?im)abc(?-i)d/ is compiled with the PCRE2_EXTENDED - option, the result is PCRE2_CASELESS, PCRE2_MULTILINE, and - PCRE2_EXTENDED. + to a uint32_t variable. 
PCRE2_INFO_ARGOPTIONS returns exactly the + options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP- + TIONS returns the compile options as modified by any top-level (*XXX) + option settings such as (*UTF) at the start of the pattern itself. + + For example, if the pattern /(*UTF)abc/ is compiled with the + PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is + PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can + change within a pattern do not affect the result of PCRE2_INFO_ALLOP- + TIONS, even if they appear right at the start of the pattern. (This was + different in some earlier releases.) A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if the first significant item in every top-level branch is one of @@ -1572,20 +1734,30 @@ INFORMATION ABOUT A COMPILED PATTERN PCRE2_INFO_CAPTURECOUNT - Return the number of capturing subpatterns in the pattern. The third - argument should point to an uint32_t variable. + Return the highest capturing subpattern number in the pattern. In pat- + terns where (?| is not used, this is also the total number of capturing + subpatterns. The third argument should point to an uint32_t variable. + + PCRE2_INFO_FIRSTBITMAP + + In the absence of a single first code unit for a non-anchored pattern, + pcre2_compile() may construct a 256-bit table that defines a fixed set + of values for the first code unit in any match. For example, a pattern + that starts with [abc] results in a table with three bits set. When + code unit values greater than 255 are supported, the flag bit for 255 + means "any code unit of value 255 or above". If such a table was con- + structed, a pointer to it is returned. Otherwise NULL is returned. The + third argument should point to an const uint8_t * variable. PCRE2_INFO_FIRSTCODETYPE Return information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to an uint32_t - variable. 
- - If there is a fixed first value, for example, the letter "c" from a - pattern such as (cat|cow|coyote), 1 is returned, and the character - value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no - fixed first value, but it is known that a match can occur only at the - start of the subject or following a newline in the subject, 2 is + variable. If there is a fixed first value, for example, the letter "c" + from a pattern such as (cat|cow|coyote), 1 is returned, and the charac- + ter value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is + no fixed first value, but it is known that a match can occur only at + the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0 is returned. PCRE2_INFO_FIRSTCODEUNIT @@ -1598,16 +1770,10 @@ INFORMATION ABOUT A COMPILED PATTERN value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode. - PCRE2_INFO_FIRSTBITMAP + PCRE2_INFO_HASBACKSLASHC - In the absence of a single first code unit for a non-anchored pattern, - pcre2_compile() may construct a 256-bit table that defines a fixed set - of values for the first code unit in any match. For example, a pattern - that starts with [abc] results in a table with three bits set. When - code unit values greater than 255 are supported, the flag bit for 255 - means "any code unit of value 255 or above". If such a table was con- - structed, a pointer to it is returned. Otherwise NULL is returned. The - third argument should point to an const uint8_t * variable. + Return 1 if the pattern contains any instances of \C, otherwise 0. The + third argument should point to an uint32_t variable. PCRE2_INFO_HASCRORLF @@ -1635,24 +1801,26 @@ INFORMATION ABOUT A COMPILED PATTERN any matched string, other than at its start. The third argument should point to an uint32_t variable. If there is no such value, 0 is returned. 
When 1 is returned, the code unit value itself can be - retrieved using PCRE2_INFO_LASTCODEUNIT. - - For anchored patterns, a last literal value is recorded only if it fol- - lows something of variable length. For example, for the pattern - /^a\d+z\d+/ the returned value is 1 (with "z" returned from - PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0. + retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last + literal value is recorded only if it follows something of variable + length. For example, for the pattern /^a\d+z\d+/ the returned value is + 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ + the returned value is 0. PCRE2_INFO_LASTCODEUNIT - Return the value of the rightmost literal data unit that must exist in - any matched string, other than at its start, if such a value has been - recorded. The third argument should point to an uint32_t variable. If + Return the value of the rightmost literal data unit that must exist in + any matched string, other than at its start, if such a value has been + recorded. The third argument should point to an uint32_t variable. If there is no such value, 0 is returned. PCRE2_INFO_MATCHEMPTY - Return 1 if the pattern can match an empty string, otherwise 0. The - third argument should point to an uint32_t variable. + Return 1 if the pattern might match an empty string, otherwise 0. The + third argument should point to an uint32_t variable. When a pattern + contains recursive subroutine calls it is not always possible to deter- + mine whether or not it can match an empty string. PCRE2 takes a cau- + tious approach and returns 1 in such cases. 
PCRE2_INFO_MATCHLIMIT @@ -1809,11 +1977,11 @@ SERIALIZATION AND PRECOMPILING THE MATCH DATA BLOCK - pcre2_match_data_create(uint32_t ovecsize, + pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); - pcre2_match_data_create_from_pattern(const pcre2_code *code, - pcre2_general_context *gcontext); + pcre2_match_data *pcre2_match_data_create_from_pattern( + const pcre2_code *code, pcre2_general_context *gcontext); void pcre2_match_data_free(pcre2_match_data *match_data); @@ -1821,7 +1989,7 @@ THE MATCH DATA BLOCK match data block, which is an opaque structure that is accessed by function calls. In particular, the match data block contains a vector of offsets into the subject string that define the matched part of the - subject and any substrings that were captured. This is know as the + subject and any substrings that were captured. This is known as the ovector. Before calling pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match() @@ -1962,72 +2130,88 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION The unused bits of the options argument for pcre2_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, - PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, + PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. Setting PCRE2_ANCHORED at match time is not supported by the just-in- time (JIT) compiler. If it is set, JIT matching is disabled and the - normal interpretive code in pcre2_match() is run. The remaining options - are supported for JIT matching. + normal interpretive code in pcre2_match() is run. Apart from + PCRE2_NO_JIT (obviously), the remaining options are supported for JIT + matching. PCRE2_ANCHORED The PCRE2_ANCHORED option limits pcre2_match() to matching at the first - matching position. 
If a pattern was compiled with PCRE2_ANCHORED, or - turned out to be anchored by virtue of its contents, it cannot be made - unachored at matching time. Note that setting the option at match time + matching position. If a pattern was compiled with PCRE2_ANCHORED, or + turned out to be anchored by virtue of its contents, it cannot be made + unanchored at matching time. Note that setting the option at match time disables JIT matching. PCRE2_NOTBOL This option specifies that first character of the subject string is not - the beginning of a line, so the circumflex metacharacter should not - match before it. Setting this without having set PCRE2_MULTILINE at + the beginning of a line, so the circumflex metacharacter should not + match before it. Setting this without having set PCRE2_MULTILINE at compile time causes circumflex never to match. This option affects only the behaviour of the circumflex metacharacter. It does not affect \A. PCRE2_NOTEOL This option specifies that the end of the subject string is not the end - of a line, so the dollar metacharacter should not match it nor (except - in multiline mode) a newline immediately before it. Setting this with- - out having set PCRE2_MULTILINE at compile time causes dollar never to + of a line, so the dollar metacharacter should not match it nor (except + in multiline mode) a newline immediately before it. Setting this with- + out having set PCRE2_MULTILINE at compile time causes dollar never to match. This option affects only the behaviour of the dollar metacharac- ter. It does not affect \Z or \z. PCRE2_NOTEMPTY An empty string is not considered to be a valid match if this option is - set. If there are alternatives in the pattern, they are tried. If all - the alternatives match the empty string, the entire match fails. For + set. If there are alternatives in the pattern, they are tried. If all + the alternatives match the empty string, the entire match fails. For example, if the pattern a?b? 
- is applied to a string not beginning with "a" or "b", it matches an + is applied to a string not beginning with "a" or "b", it matches an empty string at the start of the subject. With PCRE2_NOTEMPTY set, this - match is not valid, so pcre2_match() searches further into the string + match is not valid, so pcre2_match() searches further into the string for occurrences of "a" or "b". PCRE2_NOTEMPTY_ATSTART - This is like PCRE2_NOTEMPTY, except that it locks out an empty string + This is like PCRE2_NOTEMPTY, except that it locks out an empty string match only at the first matching position, that is, at the start of the - subject plus the starting offset. An empty string match later in the - subject is permitted. If the pattern is anchored, such a match can + subject plus the starting offset. An empty string match later in the + subject is permitted. If the pattern is anchored, such a match can occur only if the pattern contains \K. + PCRE2_NO_JIT + + By default, if a pattern has been successfully processed by + pcre2_jit_compile(), JIT is automatically used when pcre2_match() is + called with options that JIT supports. Setting PCRE2_NO_JIT disables + the use of JIT; it forces matching to be done by the interpreter. + PCRE2_NO_UTF_CHECK When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked by default when pcre2_match() is subsequently - called. The entire string is checked before any other processing takes - place, and a negative error code is returned if the check fails. There - are several UTF error codes for each code unit width, corresponding to - different problems with the code unit sequence. The value of startoff- - set is also checked, to ensure that it points to the start of a charac- - ter or to the end of the subject. There are discussions about the + called. 
If a non-zero starting offset is given, the check is applied + only to that part of the subject that could be inspected during match- + ing, and there is a check that the starting offset points to the first + code unit of a character or to the end of the subject. If there are no + lookbehind assertions in the pattern, the check starts at the starting + offset. Otherwise, it starts at the length of the longest lookbehind + before the starting offset, or at the start of the subject if there are + not that many characters before the starting offset. Note that the + sequences \b and \B are one-character lookbehinds. + + The check is carried out before any other processing takes place, and a + negative error code is returned if the check fails. There are several + UTF error codes for each code unit width, corresponding to different + problems with the code unit sequence. There are discussions about the validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode page. @@ -2068,32 +2252,35 @@ NEWLINE HANDLING WHEN MATCHING When PCRE2 is built, a default newline convention is set; this is usu- ally the standard convention for the operating system. The default can - be overridden in a compile context. During matching, the newline - choice affects the behaviour of the dot, circumflex, and dollar - metacharacters. It may also alter the way the match starting position - is advanced after a match failure for an unanchored pattern. + be overridden in a compile context by calling pcre2_set_newline(). It + can also be overridden by starting a pattern string with, for example, + (*CRLF), as described in the section on newline conventions in the + pcre2pattern page. During matching, the newline choice affects the be- + haviour of the dot, circumflex, and dollar metacharacters. It may also + alter the way the match starting position is advanced after a match + failure for an unanchored pattern. 
When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is - set as the newline convention, and a match attempt for an unanchored + set as the newline convention, and a match attempt for an unanchored pattern fails when the current starting position is at a CRLF sequence, - and the pattern contains no explicit matches for CR or LF characters, - the match position is advanced by two characters instead of one, in + and the pattern contains no explicit matches for CR or LF characters, + the match position is advanced by two characters instead of one, in other words, to after the CRLF. The above rule is a compromise that makes the most common cases work as - expected. For example, if the pattern is .+A (and the PCRE2_DOTALL + expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is not set), it does not match the string "\r\nA" because, after - failing at the start, it skips both the CR and the LF before retrying. - However, the pattern [\r\n]A does match that string, because it con- + failing at the start, it skips both the CR and the LF before retrying. + However, the pattern [\r\n]A does match that string, because it con- tains an explicit CR or LF reference, and so advances only by one char- acter after the first failure. An explicit match for CR of LF is either a literal appearance of one of - those characters in the pattern, or one of the \r or \n escape - sequences. Implicit matches such as [^X] do not count, nor does \s, + those characters in the pattern, or one of the \r or \n escape + sequences. Implicit matches such as [^X] do not count, nor does \s, even though it includes CR and LF in the characters that it matches. - Notwithstanding the above, anomalous effects may still occur when CRLF + Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern. 
@@ -2104,24 +2291,25 @@ HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); - In general, a pattern matches a certain portion of the subject, and in - addition, further substrings from the subject may be picked out by - parenthesized parts of the pattern. Following the usage in Jeffrey - Friedl's book, this is called "capturing" in what follows, and the - phrase "capturing subpattern" or "capturing group" is used for a frag- - ment of a pattern that picks out a substring. PCRE2 supports several + In general, a pattern matches a certain portion of the subject, and in + addition, further substrings from the subject may be picked out by + parenthesized parts of the pattern. Following the usage in Jeffrey + Friedl's book, this is called "capturing" in what follows, and the + phrase "capturing subpattern" or "capturing group" is used for a frag- + ment of a pattern that picks out a substring. PCRE2 supports several other kinds of parenthesized subpattern that do not cause substrings to - be captured. The pcre2_pattern_info() function can be used to find out + be captured. The pcre2_pattern_info() function can be used to find out how many capturing subpatterns there are in a compiled pattern. - A successful match returns the overall matched string and any captured - substrings to the caller via a vector of PCRE2_SIZE values. This is - called the ovector, and is contained within the match data block. You - can obtain direct access to the ovector by calling pcre2_get_ovec- - tor_pointer() to find its address, and pcre2_get_ovector_count() to - find the number of pairs of values it contains. Alternatively, you can - use the auxiliary functions for accessing captured substrings by number - or by name (see below). + You can use auxiliary functions for accessing captured substrings by + number or by name, as described in sections below. 
+ + Alternatively, you can make direct use of the vector of PCRE2_SIZE val- + ues, called the ovector, which contains the offsets of captured + strings. It is part of the match data block. The function + pcre2_get_ovector_pointer() returns the address of the ovector, and + pcre2_get_ovector_count() returns the number of pairs of values it con- + tains. Within the ovector, the first in each pair of values is set to the off- set of the first code unit of a substring, and the second is set to the @@ -2200,42 +2388,48 @@ OTHER INFORMATION ABOUT A MATCH failure to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be avail- able, and pcre2_get_mark() can be called. It returns a pointer to the zero-terminated name, which is within the compiled pattern. Otherwise - NULL is returned. After a successful match, the (*MARK) name that is - returned is the last one encountered on the matching path through the - pattern. After a "no match" or a partial match, the last encountered - (*MARK) name is returned. For example, consider this pattern: + NULL is returned. The length of the (*MARK) name (excluding the termi- + nating zero) is stored in the code unit that precedes the name. You + should use this instead of relying on the terminating zero if the + (*MARK) name might contain a binary zero. + + After a successful match, the (*MARK) name that is returned is the last + one encountered on the matching path through the pattern. After a "no + match" or a partial match, the last encountered (*MARK) name is + returned. For example, consider this pattern: ^(*MARK:A)((*MARK:B)a|b)c - When it matches "bc", the returned mark is A. The B mark is "seen" in - the first branch of the group, but it is not on the matching path. On - the other hand, when this pattern fails to match "bx", the returned + When it matches "bc", the returned mark is A. The B mark is "seen" in + the first branch of the group, but it is not on the matching path.
On + the other hand, when this pattern fails to match "bx", the returned mark is B. - After a successful match, a partial match, or one of the invalid UTF - errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can + After a successful match, a partial match, or one of the invalid UTF + errors (for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can be called. After a successful or partial match it returns the code unit - offset of the character at which the match started. For a non-partial - match, this can be different to the value of ovector[0] if the pattern - contains the \K escape sequence. After a partial match, however, this - value is always the same as ovector[0] because \K does not affect the + offset of the character at which the match started. For a non-partial + match, this can be different to the value of ovector[0] if the pattern + contains the \K escape sequence. After a partial match, however, this + value is always the same as ovector[0] because \K does not affect the result of a partial match. - After a UTF check failure, pcre2_get_startchar() can be used to obtain + After a UTF check failure, pcre2_get_startchar() can be used to obtain the code unit offset of the invalid UTF character. Details are given in the pcre2unicode page. ERROR RETURNS FROM pcre2_match() - If pcre2_match() fails, it returns a negative number. This can be con- - verted to a text string by calling pcre2_get_error_message(). Negative - error codes are also returned by other functions, and are documented - with them. The codes are given names in the header file. If UTF check- - ing is in force and an invalid UTF subject string is detected, one of a - number of UTF-specific negative error codes is returned. Details are - given in the pcre2unicode page. The following are the other errors that - may be returned by pcre2_match(): + If pcre2_match() fails, it returns a negative number. 
This can be con- + verted to a text string by calling the pcre2_get_error_message() func- + tion (see "Obtaining a textual error message" below). Negative error + codes are also returned by other functions, and are documented with + them. The codes are given names in the header file. If UTF checking is + in force and an invalid UTF subject string is detected, one of a number + of UTF-specific negative error codes is returned. Details are given in + the pcre2unicode page. The following are the other errors that may be + returned by pcre2_match(): PCRE2_ERROR_NOMATCH @@ -2331,6 +2525,27 @@ ERROR RETURNS FROM pcre2_match() The internal recursion limit was reached. +OBTAINING A TEXTUAL ERROR MESSAGE + + int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, + PCRE2_SIZE bufflen); + + A text message for an error code from any PCRE2 function (compile, + match, or auxiliary) can be obtained by calling pcre2_get_error_mes- + sage(). The code is passed as the first argument, with the remaining + two arguments specifying a code unit buffer and its length, into which + the text message is placed. Note that the message is returned in code + units of the appropriate width for the library that is being used. + + The returned message is terminated with a trailing zero, and the func- + tion returns the number of code units used, excluding the trailing + zero. If the error number is unknown, the negative error code + PCRE2_ERROR_BADDATA is returned. If the buffer is too small, the mes- + sage is truncated (but still with a trailing zero), and the negative + error code PCRE2_ERROR_NOMEMORY is returned. None of the messages are + very long; a buffer size of 120 code units is ample. 
+ + EXTRACTING CAPTURED SUBSTRINGS BY NUMBER int pcre2_substring_length_bynumber(pcre2_match_data *match_data, @@ -2346,39 +2561,39 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER void pcre2_substring_free(PCRE2_UCHAR *buffer); - Captured substrings can be accessed directly by using the ovector as + Captured substrings can be accessed directly by using the ovector as described above. For convenience, auxiliary functions are provided for - extracting captured substrings as new, separate, zero-terminated + extracting captured substrings as new, separate, zero-terminated strings. A substring that contains a binary zero is correctly extracted - and has a further zero added on the end, but the result is not, of + and has a further zero added on the end, but the result is not, of course, a C string. The functions in this section identify substrings by number. The number zero refers to the entire matched substring, with higher numbers refer- - ring to substrings captured by parenthesized groups. After a partial - match, only substring zero is available. An attempt to extract any - other substring gives the error PCRE2_ERROR_PARTIAL. The next section + ring to substrings captured by parenthesized groups. After a partial + match, only substring zero is available. An attempt to extract any + other substring gives the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for extracting captured substrings by name. - If a pattern uses the \K escape sequence within a positive assertion, + If a pattern uses the \K escape sequence within a positive assertion, the reported start of a successful match can be greater than the end of - the match. For example, if the pattern (?=ab\K) is matched against - "ab", the start and end offset values for the match are 2 and 0. In - this situation, calling these functions with a zero substring number + the match. 
For example, if the pattern (?=ab\K) is matched against + "ab", the start and end offset values for the match are 2 and 0. In + this situation, calling these functions with a zero substring number extracts a zero-length empty string. - You can find the length in code units of a captured substring without - extracting it by calling pcre2_substring_length_bynumber(). The first - argument is a pointer to the match data block, the second is the group - number, and the third is a pointer to a variable into which the length - is placed. If you just want to know whether or not the substring has + You can find the length in code units of a captured substring without + extracting it by calling pcre2_substring_length_bynumber(). The first + argument is a pointer to the match data block, the second is the group + number, and the third is a pointer to a variable into which the length + is placed. If you just want to know whether or not the substring has been captured, you can pass the third argument as NULL. - The pcre2_substring_copy_bynumber() function copies a captured sub- - string into a supplied buffer, whereas pcre2_substring_get_bynumber() - copies it into new memory, obtained using the same memory allocation - function that was used for the match data block. The first two argu- - ments of these functions are a pointer to the match data block and a + The pcre2_substring_copy_bynumber() function copies a captured sub- + string into a supplied buffer, whereas pcre2_substring_get_bynumber() + copies it into new memory, obtained using the same memory allocation + function that was used for the match data block. The first two argu- + ments of these functions are a pointer to the match data block and a capturing group number. The final arguments of pcre2_substring_copy_bynumber() are a pointer to @@ -2387,25 +2602,25 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER for the extracted substring, excluding the terminating zero. 
For pcre2_substring_get_bynumber() the third and fourth arguments point - to variables that are updated with a pointer to the new memory and the - number of code units that comprise the substring, again excluding the - terminating zero. When the substring is no longer needed, the memory + to variables that are updated with a pointer to the new memory and the + number of code units that comprise the substring, again excluding the + terminating zero. When the substring is no longer needed, the memory should be freed by calling pcre2_substring_free(). - The return value from all these functions is zero for success, or a - negative error code. If the pattern match failed, the match failure - code is returned. If a substring number greater than zero is used - after a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible + The return value from all these functions is zero for success, or a + negative error code. If the pattern match failed, the match failure + code is returned. If a substring number greater than zero is used + after a partial match, PCRE2_ERROR_PARTIAL is returned. Other possible error codes are: PCRE2_ERROR_NOMEMORY - The buffer was too small for pcre2_substring_copy_bynumber(), or the + The buffer was too small for pcre2_substring_copy_bynumber(), or the attempt to get memory failed for pcre2_substring_get_bynumber(). PCRE2_ERROR_NOSUBSTRING - There is no substring with that number in the pattern, that is, the + There is no substring with that number in the pattern, that is, the number is greater than the number of capturing parentheses. PCRE2_ERROR_UNAVAILABLE @@ -2416,8 +2631,8 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER PCRE2_ERROR_UNSET - The substring did not participate in the match. For example, if the - pattern is (abc)|(def) and the subject is "def", and the ovector con- + The substring did not participate in the match. 
For example, if the + pattern is (abc)|(def) and the subject is "def", and the ovector con- tains at least two capturing slots, substring number 1 is unset. @@ -2428,32 +2643,32 @@ EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS void pcre2_substring_list_free(PCRE2_SPTR *list); - The pcre2_substring_list_get() function extracts all available sub- - strings and builds a list of pointers to them. It also (optionally) - builds a second list that contains their lengths (in code units), + The pcre2_substring_list_get() function extracts all available sub- + strings and builds a list of pointers to them. It also (optionally) + builds a second list that contains their lengths (in code units), excluding a terminating zero that is added to each of them. All this is done in a single block of memory that is obtained using the same memory allocation function that was used to get the match data block. - This function must be called only after a successful match. If called + This function must be called only after a successful match. If called after a partial match, the error code PCRE2_ERROR_PARTIAL is returned. - The address of the memory block is returned via listptr, which is also + The address of the memory block is returned via listptr, which is also the start of the list of string pointers. The end of the list is marked - by a NULL pointer. The address of the list of lengths is returned via - lengthsptr. If your strings do not contain binary zeros and you do not + by a NULL pointer. The address of the list of lengths is returned via + lengthsptr. If your strings do not contain binary zeros and you do not therefore need the lengths, you may supply NULL as the lengthsptr argu- - ment to disable the creation of a list of lengths. The yield of the - function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem- - ory block could not be obtained. When the list is no longer needed, it + ment to disable the creation of a list of lengths. 
The yield of the + function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the mem- + ory block could not be obtained. When the list is no longer needed, it should be freed by calling pcre2_substring_list_free(). If this function encounters a substring that is unset, which can happen - when capturing subpattern number n+1 matches some part of the subject, - but subpattern n has not been used at all, it returns an empty string. - This can be distinguished from a genuine zero-length substring by + when capturing subpattern number n+1 matches some part of the subject, + but subpattern n has not been used at all, it returns an empty string. + This can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contain - PCRE2_UNSET for unset substrings, or by calling pcre2_sub- + PCRE2_UNSET for unset substrings, or by calling pcre2_sub- string_length_bynumber(). @@ -2473,39 +2688,39 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME void pcre2_substring_free(PCRE2_UCHAR *buffer); - To extract a substring by name, you first have to find associated num- + To extract a substring by name, you first have to find associated num- ber. For example, for this pattern: (a+)b(?<xxx>\d+)... the number of the subpattern called "xxx" is 2. If the name is known to - be unique (PCRE2_DUPNAMES was not set), you can find the number from + be unique (PCRE2_DUPNAMES was not set), you can find the number from the name by calling pcre2_substring_number_from_name(). The first argu- - ment is the compiled pattern, and the second is the name. The yield of + ment is the compiled pattern, and the second is the name. The yield of the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there - is no subpattern of that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if - there is more than one subpattern of that name.
Given the number, you - can extract the substring directly, or use one of the functions + is no subpattern of that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if + there is more than one subpattern of that name. Given the number, you + can extract the substring directly, or use one of the functions described above. - For convenience, there are also "byname" functions that correspond to - the "bynumber" functions, the only difference being that the second - argument is a name instead of a number. If PCRE2_DUPNAMES is set and + For convenience, there are also "byname" functions that correspond to + the "bynumber" functions, the only difference being that the second + argument is a name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate names, these functions scan all the groups with the given name, and return the first named string that is set. - If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is - returned. If all groups with the name have numbers that are greater - than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is - returned. If there is at least one group with a slot in the ovector, + If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is + returned. If all groups with the name have numbers that are greater + than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is + returned. If there is at least one group with a slot in the ovector, but no group is found to be set, PCRE2_ERROR_UNSET is returned. Warning: If the pattern uses the (?| feature to set up multiple subpat- - terns with the same number, as described in the section on duplicate - subpattern numbers in the pcre2pattern page, you cannot use names to - distinguish the different subpatterns, because names are not included - in the compiled code. The matching process uses only numbers. 
For this - reason, the use of different names for subpatterns of the same number + terns with the same number, as described in the section on duplicate + subpattern numbers in the pcre2pattern page, you cannot use names to + distinguish the different subpatterns, because names are not included + in the compiled code. The matching process uses only numbers. For this + reason, the use of different names for subpatterns of the same number causes an error at compile time. @@ -2514,58 +2729,195 @@ CREATING A NEW STRING WITH SUBSTITUTIONS int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP, + pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbufferP, PCRE2_SIZE *outlengthptr); - This function calls pcre2_match() and then makes a copy of the subject - string in outputbuffer, replacing the part that was matched with the - replacement string, whose length is supplied in rlength. This can be - given as PCRE2_ZERO_TERMINATED for a zero-terminated string. - In the replacement string, which is interpreted as a UTF string in UTF - mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK - option is set, a dollar character is an escape character that can spec- - ify the insertion of characters from capturing groups in the pattern. - The following forms are recognized: + This function calls pcre2_match() and then makes a copy of the subject + string in outputbuffer, replacing the part that was matched with the + replacement string, whose length is supplied in rlength. This can be + given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in + which a \K item in a lookahead in the pattern causes the match to end + before it starts are not supported, and give rise to an error return. 
- $$ insert a dollar character - $ insert the contents of group - ${ } insert the contents of group - - Either a group number or a group name can be given for . Curly - brackets are required only if the following character would be inter- - preted as part of the number or name. The number may be zero to include - the entire matched string. For example, if the pattern a(b)c is - matched with "=abc=" and the replacement string "+$1$0$1+", the result - is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname() - or pcre2_copy_bynumber() as appropriate. - - The first seven arguments of pcre2_substitute() are the same as for + The first seven arguments of pcre2_substitute() are the same as for pcre2_match(), except that the partial matching options are not permit- - ted, and match_data may be passed as NULL, in which case a match data - block is obtained and freed within this function, using memory manage- - ment functions from the match context, if provided, or else those that + ted, and match_data may be passed as NULL, in which case a match data + block is obtained and freed within this function, using memory manage- + ment functions from the match context, if provided, or else those that were used to allocate memory for the compiled code. - There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes - the function to iterate over the subject string, replacing every match- - ing substring. If this is not set, only the first matching substring is - replaced. - The outlengthptr argument must point to a variable that contains the - length, in code units, of the output buffer. It is updated to contain - the length of the new string, excluding the trailing zero that is auto- - matically added. + length, in code units, of the output buffer. If the function is suc- + cessful, the value is updated to contain the length of the new string, + excluding the trailing zero that is automatically added. 
- The function returns the number of replacements that were made. This - may be zero if no matches were found, and is never greater than 1 - unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg- - ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is - never returned), any errors from pcre2_match() or the substring copying - functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is - returned for an invalid replacement string (unrecognized sequence fol- - lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out- - put buffer is not big enough. + If the function is not successful, the value set via outlengthptr + depends on the type of error. For syntax errors in the replacement + string, the value is the offset in the replacement string where the + error was detected. For other errors, the value is PCRE2_UNSET by + default. This includes the case of the output buffer being too small, + unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which + case the value is the minimum length needed, including space for the + trailing zero. Note that in order to compute the required length, + pcre2_substitute() has to simulate all the matching and copying, + instead of giving an error return as soon as the buffer overflows. Note + also that the length is in code units, not bytes. + + In the replacement string, which is interpreted as a UTF string in UTF + mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK + option is set, a dollar character is an escape character that can spec- + ify the insertion of characters from capturing groups or (*MARK) items + in the pattern. The following forms are always recognized: + + $$ insert a dollar character + $<n> or ${<n>} insert the contents of group <n> + $*MARK or ${*MARK} insert the name of the last (*MARK) encountered + + Either a group number or a group name can be given for <n>.
Curly + brackets are required only if the following character would be inter- + preted as part of the number or name. The number may be zero to include + the entire matched string. For example, if the pattern a(b)c is + matched with "=abc=" and the replacement string "+$1$0$1+", the result + is "=+babcb+=". + + The facility for inserting a (*MARK) name can be used to perform simple + simultaneous substitutions, as this pcre2test example shows: + + /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK} + apple lemon + 2: pear orange + + As well as the usual options for pcre2_match(), a number of additional + options can be set in the options argument. + + PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject + string, replacing every matching substring. If this is not set, only + the first matching substring is replaced. If any matched substring has + zero length, after the substitution has happened, an attempt to find a + non-empty match at the same position is performed. If this is not suc- + cessful, the current position is advanced by one character except when + CRLF is a valid newline sequence and the next two characters are CR, + LF. In this case, the current position is advanced by two characters. + + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output + buffer is too small. The default action is to return PCRE2_ERROR_NOMEM- + ORY immediately. If this option is set, however, pcre2_substitute() + continues to go through the motions of matching and substituting (with- + out, of course, writing anything) in order to compute the size of buf- + fer that is needed. This value is passed back via the outlengthptr + variable, with the result of the function still being + PCRE2_ERROR_NOMEMORY. + + Passing a buffer size of zero is a permitted way of finding out how + much memory is needed for given substitution. However, this does mean + that the entire operation is carried out twice. 
Depending on the appli- + cation, it may be more efficient to allocate a large buffer and free + the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- + FLOW_LENGTH. + + PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups + that do not appear in the pattern to be treated as unset groups. This + option should be used with care, because it means that a typo in a + group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING + error. + + PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including + unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be + treated as empty strings when inserted as described above. If this + option is not set, an attempt to insert an unset group causes the + PCRE2_ERROR_UNSET error. This option does not influence the extended + substitution syntax described below. + + PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the + replacement string. Without this option, only the dollar character is + special, and only the group insertion forms listed above are valid. + When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: + + Firstly, backslash in a replacement string is interpreted as an escape + character. The usual forms such as \n or \x{ddd} can be used to specify + particular character codes, and backslash followed by any non-alphanu- + meric character quotes that character. Extended quoting can be coded + using \Q...\E, exactly as in pattern strings. + + There are also four escape sequences for forcing the case of inserted + letters. The insertion mechanism has three states: no case forcing, + force upper case, and force lower case. The escape sequences change the + current state: \U and \L change to upper or lower case forcing, respec- + tively, and \E (when not terminating a \Q quoted sequence) reverts to + no case forcing. 
The sequences \u and \l force the next character (if + it is a letter) to upper or lower case, respectively, and then the + state automatically reverts to no case forcing. Case forcing applies to + all inserted characters, including those from captured groups and let- + ters within \Q...\E quoted sequences. + + Note that case forcing sequences such as \U...\E do not nest. For exam- + ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final + \E has no effect. + + The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more + flexibility to group substitution. The syntax is similar to that used + by Bash: + + ${<n>:-<string>} + ${<n>:+<string1>:<string2>} + + As before, <n> may be a group number or a name. The first form speci- + fies a default value. If group <n> is set, its value is inserted; if + not, <string> is expanded and the result inserted. The second form + specifies strings that are expanded and inserted when group <n> is set + or unset, respectively. The first form is just a convenient shorthand + for + + ${<n>:+${<n>}:<string>} + + Backslash can be used to escape colons and closing curly brackets in + the replacement strings. A change of the case forcing state within a + replacement string remains in force afterwards, as shown in this + pcre2test example: + + /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo + body + 1: hello + somebody + 1: HELLO + + The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended + substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause + unknown groups in the extended syntax forms to be treated as unset. + + If successful, pcre2_substitute() returns the number of replacements + that were made. This may be zero if no matches were found, and is never + greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set. + + In the event of an error, a negative error code is returned. Except for + PCRE2_ERROR_NOMATCH (which is never returned), errors from + pcre2_match() are passed straight back.
+ + PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- + tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. + + PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- + ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) + when the simple (non-extended) syntax is used and PCRE2_SUBSTI- + TUTE_UNSET_EMPTY is not set. + + PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big + enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size + of buffer that is needed is returned via outlengthptr. Note that this + does not happen by default. + + PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in + the replacement string, with more particular errors being + PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP- + MISSINGBRACE (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION + (syntax error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN + (the pattern match ended before it started, which can happen if \K is + used in an assertion). + + As for all PCRE2 errors, a text message that describes the error can be + obtained by calling the pcre2_get_error_message() function (see + "Obtaining a textual error message" above). DUPLICATE SUBPATTERN NAMES @@ -2604,8 +2956,8 @@ DUPLICATE SUBPATTERN NAMES no entries for the given name. The format of the name table is described above in the section entitled - Information about a pattern above. Given all the relevant entries for - the name, you can extract each of their numbers, and hence the captured + Information about a pattern. Given all the relevant entries for the + name, you can extract each of their numbers, and hence the captured data. @@ -2781,8 +3133,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION PCRE2_ERROR_DFA_UITEM This return is given if pcre2_dfa_match() encounters an item in the - pattern that it does not support, for instance, the use of \C or a back - reference.
+ pattern that it does not support, for instance, the use of \C in a UTF + mode or a back reference. PCRE2_ERROR_DFA_UCOND @@ -2826,8 +3178,8 @@ AUTHOR REVISION - Last updated: 22 April 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 23 December 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -2946,10 +3298,18 @@ UNICODE AND UTF SUPPORT PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP). + +DISABLING THE USE OF \C + The \C escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the cur- - rent matching point in the middle of a multi-code-unit character. It - can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option. + rent matching point in the middle of a multi-code-unit character. The + application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C + option when calling pcre2_compile(). There is also a build-time option + + --enable-never-backslash-C + + (note the upper case C) which locks out the use of \C entirely. JUST-IN-TIME COMPILER SUPPORT @@ -2958,10 +3318,10 @@ JUST-IN-TIME COMPILER SUPPORT --enable-jit - This support is available only for certain hardware architectures. If - this option is set for an unsupported architecture, a building error - occurs. See the pcre2jit documentation for a discussion of JIT usage. - When JIT support is enabled, pcre2grep automatically makes use of it, + This support is available only for certain hardware architectures. If + this option is set for an unsupported architecture, a building error + occurs. See the pcre2jit documentation for a discussion of JIT usage. 
+ When JIT support is enabled, pcre2grep automatically makes use of it, unless you add --disable-pcre2grep-jit @@ -2971,14 +3331,14 @@ JUST-IN-TIME COMPILER SUPPORT NEWLINE RECOGNITION - By default, PCRE2 interprets the linefeed (LF) character as indicating - the end of a line. This is the normal newline character on Unix-like - systems. You can compile PCRE2 to use carriage return (CR) instead, by + By default, PCRE2 interprets the linefeed (LF) character as indicating + the end of a line. This is the normal newline character on Unix-like + systems. You can compile PCRE2 to use carriage return (CR) instead, by adding --enable-newline-is-cr - to the configure command. There is also an --enable-newline-is-lf + to the configure command. There is also an --enable-newline-is-lf option, which explicitly specifies linefeed as the newline character. Alternatively, you can specify that line endings are to be indicated by @@ -2991,76 +3351,76 @@ NEWLINE RECOGNITION --enable-newline-is-anycrlf - which causes PCRE2 to recognize any of the three sequences CR, LF, or + which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as indicating a line ending. Finally, a fifth option, specified by --enable-newline-is-any - causes PCRE2 to recognize any Unicode newline sequence. The Unicode + causes PCRE2 to recognize any Unicode newline sequence. The Unicode newline sequences are the three just mentioned, plus the single charac- ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, - U+0085), LS (line separator, U+2028), and PS (paragraph separator, + U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). Whatever default line ending convention is selected when PCRE2 is built - can be overridden by applications that use the library. At build time + can be overridden by applications that use the library. At build time it is conventional to use the standard for your operating system. 
WHAT \R MATCHES - By default, the sequence \R in a pattern matches any Unicode newline - sequence, independently of what has been selected as the line ending + By default, the sequence \R in a pattern matches any Unicode newline + sequence, independently of what has been selected as the line ending sequence. If you specify --enable-bsr-anycrlf - the default is changed so that \R matches only CR, LF, or CRLF. What- - ever is selected when PCRE2 is built can be overridden by applications + the default is changed so that \R matches only CR, LF, or CRLF. What- + ever is selected when PCRE2 is built can be overridden by applications that use the called. HANDLING VERY LARGE PATTERNS - Within a compiled pattern, offset values are used to point from one - part to another (for example, from an opening parenthesis to an alter- - nation metacharacter). By default, in the 8-bit and 16-bit libraries, - two-byte values are used for these offsets, leading to a maximum size - for a compiled pattern of around 64K code units. This is sufficient to + Within a compiled pattern, offset values are used to point from one + part to another (for example, from an opening parenthesis to an alter- + nation metacharacter). By default, in the 8-bit and 16-bit libraries, + two-byte values are used for these offsets, leading to a maximum size + for a compiled pattern of around 64K code units. This is sufficient to handle all but the most gigantic patterns. Nevertheless, some people do - want to process truly enormous patterns, so it is possible to compile - PCRE2 to use three-byte or four-byte offsets by adding a setting such + want to process truly enormous patterns, so it is possible to compile + PCRE2 to use three-byte or four-byte offsets by adding a setting such as --with-link-size=3 - to the configure command. The value given must be 2, 3, or 4. For the - 16-bit library, a value of 3 is rounded up to 4. 
In these libraries, - using longer offsets slows down the operation of PCRE2 because it has - to load additional data when handling them. For the 32-bit library the - value is always 4 and cannot be overridden; the value of --with-link- + to the configure command. The value given must be 2, 3, or 4. For the + 16-bit library, a value of 3 is rounded up to 4. In these libraries, + using longer offsets slows down the operation of PCRE2 because it has + to load additional data when handling them. For the 32-bit library the + value is always 4 and cannot be overridden; the value of --with-link- size is ignored. AVOIDING EXCESSIVE STACK USAGE - When matching with the pcre2_match() function, PCRE2 implements back- - tracking by making recursive calls to an internal function called - match(). In environments where the size of the stack is limited, this - can severely limit PCRE2's operation. (The Unix environment does not - usually suffer from this problem, but it may sometimes be necessary to + When matching with the pcre2_match() function, PCRE2 implements back- + tracking by making recursive calls to an internal function called + match(). In environments where the size of the stack is limited, this + can severely limit PCRE2's operation. (The Unix environment does not + usually suffer from this problem, but it may sometimes be necessary to increase the maximum stack size. There is a discussion in the - pcre2stack documentation.) An alternative approach to recursion that - uses memory from the heap to remember data, instead of using recursive - function calls, has been implemented to work round the problem of lim- - ited stack size. If you want to build a version of PCRE2 that works + pcre2stack documentation.) An alternative approach to recursion that + uses memory from the heap to remember data, instead of using recursive + function calls, has been implemented to work round the problem of lim- + ited stack size. 
If you want to build a version of PCRE2 that works this way, add --disable-stack-for-recursion to the configure command. By default, the system functions malloc() and - free() are called to manage the heap memory that is required, but cus- - tom memory management functions can be called instead. PCRE2 runs + free() are called to manage the heap memory that is required, but cus- + tom memory management functions can be called instead. PCRE2 runs noticeably more slowly when built in this way. This option affects only the pcre2_match() function; it is not relevant for pcre2_dfa_match(). @@ -3068,30 +3428,30 @@ AVOIDING EXCESSIVE STACK USAGE LIMITING PCRE2 RESOURCE USAGE Internally, PCRE2 has a function called match(), which it calls repeat- - edly (sometimes recursively) when matching a pattern with the + edly (sometimes recursively) when matching a pattern with the pcre2_match() function. By controlling the maximum number of times this - function may be called during a single matching operation, a limit can - be placed on the resources used by a single call to pcre2_match(). The + function may be called during a single matching operation, a limit can + be placed on the resources used by a single call to pcre2_match(). The limit can be changed at run time, as described in the pcre2api documen- - tation. The default is 10 million, but this can be changed by adding a + tation. The default is 10 million, but this can be changed by adding a setting such as --with-match-limit=500000 - to the configure command. This setting has no effect on the + to the configure command. This setting has no effect on the pcre2_dfa_match() matching function. 
- In some environments it is desirable to limit the depth of recursive + In some environments it is desirable to limit the depth of recursive calls of match() more strictly than the total number of calls, in order - to restrict the maximum amount of stack (or heap, if --disable-stack- + to restrict the maximum amount of stack (or heap, if --disable-stack- for-recursion is specified) that is used. A second limit controls this; - it defaults to the value that is set for --with-match-limit, which - imposes no additional constraints. However, you can set a lower limit + it defaults to the value that is set for --with-match-limit, which + imposes no additional constraints. However, you can set a lower limit by adding, for example, --with-match-limit-recursion=10000 - to the configure command. This value can also be overridden at run + to the configure command. This value can also be overridden at run time. @@ -3099,45 +3459,45 @@ CREATING CHARACTER TABLES AT BUILD TIME PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are - distributed in the file src/pcre2_chartables.c.dist. These tables are + distributed in the file src/pcre2_chartables.c.dist. These tables are for ASCII codes only. If you add --enable-rebuild-chartables - to the configure command, the distributed tables are no longer used. - Instead, a program called dftables is compiled and run. This outputs + to the configure command, the distributed tables are no longer used. + Instead, a program called dftables is compiled and run. This outputs the source for new set of tables, created in the default locale of your - C run-time system. (This method of replacing the tables does not work - if you are cross compiling, because dftables is run on the local host. + C run-time system. (This method of replacing the tables does not work + if you are cross compiling, because dftables is run on the local host. 
If you need to create alternative tables when cross compiling, you will have to do so "by hand".) USING EBCDIC CODE - PCRE2 assumes by default that it will run in an environment where the - character code is ASCII or Unicode, which is a superset of ASCII. This + PCRE2 assumes by default that it will run in an environment where the + character code is ASCII or Unicode, which is a superset of ASCII. This is the case for most computer operating systems. PCRE2 can, however, be compiled to run in an 8-bit EBCDIC environment by adding --enable-ebcdic --disable-unicode to the configure command. This setting implies --enable-rebuild-charta- - bles. You should only use it if you know that you are in an EBCDIC + bles. You should only use it if you know that you are in an EBCDIC environment (for example, an IBM mainframe operating system). - It is not possible to support both EBCDIC and UTF-8 codes in the same - version of the library. Consequently, --enable-unicode and --enable- + It is not possible to support both EBCDIC and UTF-8 codes in the same + version of the library. Consequently, --enable-unicode and --enable- ebcdic are mutually exclusive. The EBCDIC character that corresponds to an ASCII LF is assumed to have - the value 0x15 by default. However, in some EBCDIC environments, 0x25 + the value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In such an environment you should use --enable-ebcdic-nl25 as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR - has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and + has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is not chosen as LF is made to correspond to the Unicode NEL char- acter (which, in Unicode, is 0x85). @@ -3146,34 +3506,48 @@ USING EBCDIC CODE an EBCDIC environment. 
+PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS + + By default, on non-Windows systems, pcre2grep supports the use of call- + outs with string arguments within the patterns it is matching, in order + to run external scripts. For details, see the pcre2grep documentation. + This support can be disabled by adding --disable-pcre2grep-callout to + the configure command. + + PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT - By default, pcre2grep reads all files as plain text. You can build it - so that it recognizes files whose names end in .gz or .bz2, and reads + By default, pcre2grep reads all files as plain text. You can build it + so that it recognizes files whose names end in .gz or .bz2, and reads them with libz or libbz2, respectively, by adding one or both of --enable-pcre2grep-libz --enable-pcre2grep-libbz2 to the configure command. These options naturally require that the rel- - evant libraries are installed on your system. Configuration will fail + evant libraries are installed on your system. Configuration will fail if they are not. PCRE2GREP BUFFER SIZE - pcre2grep uses an internal buffer to hold a "window" on the file it is + pcre2grep uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when - it finds a match. The size of the buffer is controlled by a parameter - whose default value is 20K. The buffer itself is three times this size, - but because of the way it is used for holding "before" lines, the long- - est line that is guaranteed to be processable is the parameter size. - You can change the default parameter value by adding, for example, + it finds a match. The starting size of the buffer is controlled by a + parameter whose default value is 20K. The buffer itself is three times + this size, but because of the way it is used for holding "before" + lines, the longest line that is guaranteed to be processable is the + parameter size. 
If a longer line is encountered, pcre2grep automati- + cally expands the buffer, up to a specified maximum size, whose default + is 1M or the starting size, whichever is the larger. You can change the + default parameter values by adding, for example, - --with-pcre2grep-bufsize=50K + --with-pcre2grep-bufsize=51200 + --with-pcre2grep-max-bufsize=2097152 - to the configure command. The caller of pcre2grep can override this - value by using --buffer-size on the command line.. + to the configure command. The caller of pcre2grep can override these + values by using --buffer-size and --max-buffer-size on the command + line. PCRE2TEST OPTION FOR LIBREADLINE SUPPORT @@ -3183,26 +3557,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT --enable-pcre2test-libreadline --enable-pcre2test-libedit - to the configure command, pcre2test is linked with the libreadline + to the configure command, pcre2test is linked with the libreadline orlibedit library, respectively, and when its input is from a terminal, - it reads it using the readline() function. This provides line-editing - and history facilities. Note that libreadline is GPL-licensed, so if - you distribute a binary of pcre2test linked in this way, there may be + it reads it using the readline() function. This provides line-editing + and history facilities. Note that libreadline is GPL-licensed, so if + you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking instead with libedit, which has a BSD licence. - Setting --enable-pcre2test-libreadline causes the -lreadline option to - be added to the pcre2test build. In many operating environments with a - sytem-installed readline library this is sufficient. However, in some + Setting --enable-pcre2test-libreadline causes the -lreadline option to + be added to the pcre2test build. In many operating environments with a + sytem-installed readline library this is sufficient. However, in some environments (e.g. 
if an unmodified distribution version of readline is - in use), some extra configuration may be necessary. The INSTALL file + in use), some extra configuration may be necessary. The INSTALL file for libreadline says this: "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the to choose an appropriate library." - If your environment has not been set up so that an appropriate library + If your environment has not been set up so that an appropriate library is automatically included, you may need to add something like LIBS="-ncurses" @@ -3216,7 +3590,7 @@ INCLUDING DEBUGGING CODE --enable-debug - to the configure command, additional debugging code is included in the + to the configure command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers. @@ -3226,15 +3600,15 @@ DEBUGGING WITH VALGRIND SUPPORT --enable-valgrind - to the configure command, PCRE2 will use valgrind annotations to mark - certain memory regions as unaddressable. This allows it to detect - invalid memory accesses, and is mostly useful for debugging PCRE2 + to the configure command, PCRE2 will use valgrind annotations to mark + certain memory regions as unaddressable. This allows it to detect + invalid memory accesses, and is mostly useful for debugging PCRE2 itself. CODE COVERAGE REPORTING - If your C compiler is gcc, you can build a version of PCRE2 that can + If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install lcov version 1.6 or above. Then specify @@ -3243,20 +3617,20 @@ CODE COVERAGE REPORTING to the configure command and build PCRE2 in the usual way. Note that using ccache (a caching C compiler) is incompatible with code - coverage reporting. If you have configured ccache to run automatically + coverage reporting. 
If you have configured ccache to run automatically on your system, you must set the environment variable CCACHE_DISABLE=1 before running make to build PCRE2, so that ccache is not used. - When --enable-coverage is used, the following addition targets are + When --enable-coverage is used, the following addition targets are added to the Makefile: make coverage - This creates a fresh coverage report for the PCRE2 test suite. It is - equivalent to running "make coverage-reset", "make coverage-baseline", + This creates a fresh coverage report for the PCRE2 test suite. It is + equivalent to running "make coverage-reset", "make coverage-baseline", "make check", and then "make coverage-report". make coverage-reset @@ -3273,21 +3647,44 @@ CODE COVERAGE REPORTING make coverage-clean-report - This removes the generated coverage report without cleaning the cover- + This removes the generated coverage report without cleaning the cover- age data itself. make coverage-clean-data - This removes the captured coverage data without removing the coverage + This removes the captured coverage data without removing the coverage files created at compile time (*.gcno). make coverage-clean - This cleans all coverage data including the generated coverage report. - For more information about code coverage, see the gcov and lcov docu- + This cleans all coverage data including the generated coverage report. + For more information about code coverage, see the gcov and lcov docu- mentation. +SUPPORT FOR FUZZERS + + There is a special option for use by people who want to run fuzzing + tests on PCRE2: + + --enable-fuzz-support + + At present this applies only to the 8-bit library. If set, it causes an + extra library called libpcre2-fuzzsupport.a to be built, but not + installed. This contains a single function called LLVMFuzzerTestOneIn- + put() whose arguments are a pointer to a string and the length of the + string. 
When called, this function tries to compile the string as a + pattern, and if that succeeds, to match it. This is done both with no + options and with some random options bits that are generated from the + string. Setting --enable-fuzz-support also causes a binary called + pcre2fuzzcheck to be created. This is normally run under valgrind or + used when PCRE2 is compiled with address sanitizing enabled. It calls + the fuzzing function and outputs information about what it is doing. The + input strings are specified by arguments: if an argument starts with + "=" the rest of it is a literal input string. Otherwise, it is assumed + to be a file name, and the contents of the file are the test string. + + SEE ALSO pcre2api(3), pcre2-config(3). @@ -3302,8 +3699,8 @@ AUTHOR REVISION - Last updated: 24 April 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 01 November 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -3347,45 +3744,54 @@ DESCRIPTION If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each - item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with + item in the pattern except for immediately before or after a callout + item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern - A(\d{2}|--) + A(?C3)B it is processed as if it were + (?C255)A(?C3)B(?C255) + + Here is a more complicated example: + + A(\d{2}|--) + + With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were + (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) - Notice that there is a callout before and after each parenthesis and + Notice that there is a callout before and after each parenthesis and alternation bar.
If the pattern contains a conditional group whose con- - dition is an assertion, an automatic callout is inserted immediately - before the condition. Such a callout may also be inserted explicitly, + dition is an assertion, an automatic callout is inserted immediately + before the condition. Such a callout may also be inserted explicitly, for example: (?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de) - This applies only to assertion conditions (because they are themselves + This applies only to assertion conditions (because they are themselves independent groups). - Callouts can be useful for tracking the progress of pattern matching. + Callouts can be useful for tracking the progress of pattern matching. The pcre2test program has a pattern qualifier (/auto_callout) that sets - automatic callouts. When any callouts are present, the output from - pcre2test indicates how the pattern is being matched. This is useful - information when you are trying to optimize the performance of a par- + automatic callouts. When any callouts are present, the output from + pcre2test indicates how the pattern is being matched. This is useful + information when you are trying to optimize the performance of a par- ticular pattern. MISSING CALLOUTS - You should be aware that, because of optimizations in the way PCRE2 + You should be aware that, because of optimizations in the way PCRE2 compiles and matches patterns, callouts sometimes do not happen exactly as you might expect. Auto-possessification At compile time, PCRE2 "auto-possessifies" repeated items when it knows - that what follows cannot be part of the repeat. For example, a+[bc] is - compiled as if it were a++[bc]. The pcre2test output when this pattern + that what follows cannot be part of the repeat. For example, a+[bc] is + compiled as if it were a++[bc]. 
The pcre2test output when this pattern is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string "aaaa" is: @@ -3394,11 +3800,12 @@ MISSING CALLOUTS +2 ^ ^ [bc] No match - This indicates that when matching [bc] fails, there is no backtracking - into a+ and therefore the callouts that would be taken for the back- - tracks do not occur. You can disable the auto-possessify feature by - passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat- - tern with (*NO_AUTO_POSSESS). In this case, the output changes to this: + This indicates that when matching [bc] fails, there is no backtracking + into a+ (because it is being treated as a++) and therefore the callouts + that would be taken for the backtracks do not occur. You can disable + the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to + pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In + this case, the output changes to this: --->aaaa +0 ^ a+ @@ -3517,8 +3924,8 @@ THE CALLOUT INTERFACE For a numerical callout, callout_string is NULL, and callout_number contains the number of the callout, in the range 0-255. This is the - number that follows (?C for manual callouts; it is 255 for automati- - cally generated callouts. + number that follows (?C for callouts that are part of the pattern; it is + 255 for automatically generated callouts. Fields for string callouts @@ -3579,10 +3986,16 @@ THE CALLOUT INTERFACE the next item to be matched. The next_item_length field contains the length of the next item to be - matched in the pattern string. When the callout immediately precedes an - alternation bar, a closing parenthesis, or the end of the pattern, the - length is zero. When the callout precedes an opening parenthesis, the - length is that of the entire subpattern. + processed in the pattern string. When the callout is at the end of the + pattern, the length is zero.
When the callout precedes an opening + parenthesis, the length includes meta characters that follow the paren- + thesis. For example, in a callout before an assertion such as (?=ab) + the length is 3. For an alternation bar or a closing parenthesis, + the length is one, unless a closing parenthesis is followed by a quan- + tifier, in which case its length is included. (This changed in release + 10.23. In earlier releases, before an opening parenthesis the length + was that of the entire subpattern, and before an alternation bar or a + closing parenthesis the length was zero.) The pattern_position and next_item_length fields are intended to help in distinguishing between different automatic callouts, which all have @@ -3666,8 +4079,8 @@ AUTHOR REVISION - Last updated: 23 March 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 29 September 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -3761,7 +4174,7 @@ DIFFERENCES BETWEEN PCRE2 AND PERL first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases - it is the same as PCRE2, but there are examples where it differs. + it is the same as PCRE2, but there are cases where it differs. 11. Most backtracking verbs in assertions have their normal actions. They are not confined to the assertion. @@ -3775,18 +4188,18 @@ DIFFERENCES BETWEEN PCRE2 AND PERL pattern names is not as general as Perl's. This is a consequence of the fact the PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern - such as (?|(?<a>A)|(?<b>B), where the two capturing parentheses have + such as (?|(?<a>A)|(?<b>B), where the two capturing parentheses have the same number but different names, is not supported, and causes an error at compile time.
If it were allowed, it would not be possible to distinguish which parentheses matched, because both names map to cap- turing subpattern number 1. To avoid this confusing situation, an error is given at compile time. - 14. Perl recognizes comments in some places that PCRE2 does not, for - example, between the ( and ? at the start of a subpattern. If the /x - modifier is set, Perl allows white space between ( and ? (though cur- - rent Perls warn that this is deprecated) but PCRE2 never does, even if - the PCRE2_EXTENDED option is set. + 14. Perl used to recognize comments in some places that PCRE2 does not, + for example, between the ( and ? at the start of a subpattern. If the + /x modifier is set, Perl allowed white space between ( and ? though the + latest Perls give an error (for a while it was just deprecated). There + may still be some cases where Perl behaves differently. 15. Perl, when in warning mode, gives warnings for character classes such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- @@ -3810,34 +4223,39 @@ DIFFERENCES BETWEEN PCRE2 AND PERL different length of string. Perl requires them all to have the same length. - (b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the + (b) From PCRE2 10.23, back references to groups of fixed length are + supported in lookbehinds, provided that there is no possibility of ref- + erencing a non-unique number or name. Perl does not support backrefer- + ences in lookbehinds. + + (c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. - (c) A backslash followed by a letter with no special meaning is + (d) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) 
- (d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- + (e) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti- fiers is inverted, that is, by default they are not greedy, but if fol- lowed by a question mark they are. - (e) PCRE2_ANCHORED can be used at matching time to force a pattern to + (f) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. - (f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, - PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl + (g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, + PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents. - (g) The \R escape sequence can be restricted to match only CR, LF, or + (h) The \R escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. - (h) The callout facility is PCRE2-specific. + (i) The callout facility is PCRE2-specific. - (i) The partial matching facility is PCRE2-specific. + (j) The partial matching facility is PCRE2-specific. - (j) The alternative matching function (pcre2_dfa_match() matches in a + (k) The alternative matching function (pcre2_dfa_match() matches in a different way and is not Perl-compatible. - (k) PCRE2 recognizes some special sequences such as (*CR) at the start + (l) PCRE2 recognizes some special sequences such as (*CR) at the start of a pattern that set overall options that cannot be changed within the pattern. @@ -3851,8 +4269,8 @@ AUTHOR REVISION - Last updated: 15 March 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 18 October 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -3922,6 +4340,12 @@ SIMPLE USE OF JIT exactly the same results. The returned value from pcre2_jit_compile() is zero on success, or a negative error code. 
+ There is a limit to the size of pattern that JIT supports, imposed by + the size of machine stack that it uses. The exact rules are not docu- + mented because they may change at any time, in particular, when new + optimizations are introduced. If a pattern is too big, a call to + pcre2_jit_compile() returns PCRE2_ERROR_NOMEMORY. + PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for com- plete matches. If you want to run partial matches using the PCRE2_PAR- TIAL_HARD or PCRE2_PARTIAL_SOFT options of pcre2_match(), you should @@ -3975,49 +4399,52 @@ UNSUPPORTED OPTIONS AND PATTERN ITEMS PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED option is not supported at match time. - The only unsupported pattern items are \C (match a single data unit) - when running in a UTF mode, and a callout immediately before an asser- + If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the + use of JIT, forcing matching by the interpreter code. + + The only unsupported pattern items are \C (match a single data unit) + when running in a UTF mode, and a callout immediately before an asser- tion condition in a conditional group. RETURN VALUES FROM JIT MATCHING When a pattern is matched using JIT matching, the return values are the - same as those given by the interpretive pcre2_match() code, with the - addition of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means - that the memory used for the JIT stack was insufficient. See "Control- + same as those given by the interpretive pcre2_match() code, with the + addition of one new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means + that the memory used for the JIT stack was insufficient. See "Control- ling the JIT stack" below for a discussion of JIT stack usage. 
- The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if - searching a very large pattern tree goes on for too long, as it is in - the same circumstance when JIT is not used, but the details of exactly - what is counted are not the same. The PCRE2_ERROR_RECURSIONLIMIT error + The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if + searching a very large pattern tree goes on for too long, as it is in + the same circumstance when JIT is not used, but the details of exactly + what is counted are not the same. The PCRE2_ERROR_RECURSIONLIMIT error code is never returned when JIT matching is used. CONTROLLING THE JIT STACK When the compiled JIT code runs, it needs a block of memory to use as a - stack. By default, it uses 32K on the machine stack. However, some - large or complicated patterns need more than this. The error - PCRE2_ERROR_JIT_STACKLIMIT is given when there is not enough stack. - Three functions are provided for managing blocks of memory for use as - JIT stacks. There is further discussion about the use of JIT stacks in + stack. By default, it uses 32K on the machine stack. However, some + large or complicated patterns need more than this. The error + PCRE2_ERROR_JIT_STACKLIMIT is given when there is not enough stack. + Three functions are provided for managing blocks of memory for use as + JIT stacks. There is further discussion about the use of JIT stacks in the section entitled "JIT stack FAQ" below. - The pcre2_jit_stack_create() function creates a JIT stack. Its argu- - ments are a starting size, a maximum size, and a general context (for - memory allocation functions, or NULL for standard memory allocation). + The pcre2_jit_stack_create() function creates a JIT stack. Its argu- + ments are a starting size, a maximum size, and a general context (for + memory allocation functions, or NULL for standard memory allocation). It returns a pointer to an opaque structure of type pcre2_jit_stack, or - NULL if there is an error. 
The pcre2_jit_stack_free() function is used - to free a stack that is no longer needed. (For the technically minded: + NULL if there is an error. The pcre2_jit_stack_free() function is used + to free a stack that is no longer needed. (For the technically minded: the address space is allocated by mmap or VirtualAlloc.) - JIT uses far less memory for recursion than the interpretive code, and - a maximum stack size of 512K to 1M should be more than enough for any + JIT uses far less memory for recursion than the interpretive code, and + a maximum stack size of 512K to 1M should be more than enough for any pattern. - The pcre2_jit_stack_assign() function specifies which stack JIT code + The pcre2_jit_stack_assign() function specifies which stack JIT code should use. Its arguments are as follows: pcre2_match_context *mcontext @@ -4026,7 +4453,7 @@ CONTROLLING THE JIT STACK The first argument is a pointer to a match context. When this is subse- quently passed to a matching function, its information determines which - JIT stack is used. There are three cases for the values of the other + JIT stack is used. There are three cases for the values of the other two options: (1) If callback is NULL and data is NULL, an internal 32K block @@ -4044,30 +4471,34 @@ CONTROLLING THE JIT STACK return value must be a valid JIT stack, the result of calling pcre2_jit_stack_create(). - A callback function is obeyed whenever JIT code is about to be run; it + A callback function is obeyed whenever JIT code is about to be run; it is not obeyed when pcre2_match() is called with options that are incom- - patible for JIT matching. A callback function can therefore be used to - determine whether a match operation was executed by JIT or by the + patible for JIT matching. A callback function can therefore be used to + determine whether a match operation was executed by JIT or by the interpreter. 
You may safely use the same JIT stack for more than one pattern (either - by assigning directly or by callback), as long as the patterns are all - matched sequentially in the same thread. In a multithread application, - if you do not specify a JIT stack, or if you assign or pass back NULL - from a callback, that is thread-safe, because each thread has its own - machine stack. However, if you assign or pass back a non-NULL JIT - stack, this must be a different stack for each thread so that the - application is thread-safe. + by assigning directly or by callback), as long as the patterns are + matched sequentially in the same thread. Currently, the only way to set + up non-sequential matches in one thread is to use callouts: if a call- + out function starts another match, that match must use a different JIT + stack to the one used for currently suspended match(es). - Strictly speaking, even more is allowed. You can assign the same non- - NULL stack to a match context that is used by any number of patterns, - as long as they are not used for matching by multiple threads at the - same time. For example, you could use the same stack in all compiled - patterns, with a global mutex in the callback to wait until the stack + In a multithread application, if you do not specify a JIT stack, or if + you assign or pass back NULL from a callback, that is thread-safe, + because each thread has its own machine stack. However, if you assign + or pass back a non-NULL JIT stack, this must be a different stack for + each thread so that the application is thread-safe. + + Strictly speaking, even more is allowed. You can assign the same non- + NULL stack to a match context that is used by any number of patterns, + as long as they are not used for matching by multiple threads at the + same time. For example, you could use the same stack in all compiled + patterns, with a global mutex in the callback to wait until the stack is available for use. 
However, this is an inefficient solution, and not recommended. - This is a suggestion for how a multithreaded program that needs to set + This is a suggestion for how a multithreaded program that needs to set up non-default JIT stacks might operate: During thread initalization @@ -4079,7 +4510,7 @@ CONTROLLING THE JIT STACK Use a one-line callback function return thread_local_var - All the functions described in this section do nothing if JIT is not + All the functions described in this section do nothing if JIT is not available. @@ -4088,20 +4519,20 @@ JIT STACK FAQ (1) Why do we need JIT stacks? PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack - where the local data of the current node is pushed before checking its + where the local data of the current node is pushed before checking its child nodes. Allocating real machine stack on some platforms is diffi- cult. For example, the stack chain needs to be updated every time if we - extend the stack on PowerPC. Although it is possible, its updating + extend the stack on PowerPC. Although it is possible, its updating time overhead decreases performance. So we do the recursion in memory. (2) Why don't we simply allocate blocks of memory with malloc()? - Modern operating systems have a nice feature: they can reserve an + Modern operating systems have a nice feature: they can reserve an address space instead of allocating memory. We can safely allocate mem- - ory pages inside this address space, so the stack could grow without + ory pages inside this address space, so the stack could grow without moving memory data (this is important because of pointers). Thus we can - allocate 1M address space, and use only a single memory page (usually - 4K) if that is enough. However, we can still grow up to 1M anytime if + allocate 1M address space, and use only a single memory page (usually + 4K) if that is enough. However, we can still grow up to 1M anytime if needed. (3) Who "owns" a JIT stack? 
@@ -4109,8 +4540,8 @@ JIT STACK FAQ The owner of the stack is the user program, not the JIT studied pattern or anything else. The user program must ensure that if a stack is being used by pcre2_match(), (that is, it is assigned to a match context that - is passed to the pattern currently running), that stack must not be - used by any other threads (to avoid overwriting the same memory area). + is passed to the pattern currently running), that stack must not be + used by any other threads (to avoid overwriting the same memory area). The best practice for multithreaded programs is to allocate a stack for each thread, and return this stack through the JIT callback function. @@ -4118,36 +4549,36 @@ JIT STACK FAQ You can free a JIT stack at any time, as long as it will not be used by pcre2_match() again. When you assign the stack to a match context, only - a pointer is set. There is no reference counting or any other magic. + a pointer is set. There is no reference counting or any other magic. You can free compiled patterns, contexts, and stacks in any order, any- - time. Just do not call pcre2_match() with a match context pointing to + time. Just do not call pcre2_match() with a match context pointing to an already freed stack, as that will cause SEGFAULT. (Also, do not free - a stack currently used by pcre2_match() in another thread). You can - also replace the stack in a context at any time when it is not in use. + a stack currently used by pcre2_match() in another thread). You can + also replace the stack in a context at any time when it is not in use. You should free the previous stack before assigning a replacement. - (5) Should I allocate/free a stack every time before/after calling + (5) Should I allocate/free a stack every time before/after calling pcre2_match()? - No, because this is too costly in terms of resources. However, you - could implement some clever idea which release the stack if it is not - used in let's say two minutes. 
The JIT callback can help to achieve + No, because this is too costly in terms of resources. However, you + could implement some clever idea which releases the stack if it is not + used in let's say two minutes. The JIT callback can help to achieve this without keeping a list of patterns. - (6) OK, the stack is for long term memory allocation. But what happens - if a pattern causes stack overflow with a stack of 1M? Is that 1M kept + (6) OK, the stack is for long term memory allocation. But what happens + if a pattern causes stack overflow with a stack of 1M? Is that 1M kept until the stack is freed? - Especially on embedded sytems, it might be a good idea to release mem- - ory sometimes without freeing the stack. There is no API for this at - the moment. Probably a function call which returns with the currently - allocated memory for any stack and another which allows releasing mem- + Especially on embedded systems, it might be a good idea to release mem- + ory sometimes without freeing the stack. There is no API for this at + the moment. Probably a function call which returns with the currently + allocated memory for any stack and another which allows releasing mem- ory (shrinking the stack) would be a good idea if someone needs this. (7) This is too much of a headache. Isn't there any better solution for JIT stack handling? - No, thanks to Windows. If POSIX threads were used everywhere, we could + No, thanks to Windows. If POSIX threads were used everywhere, we could throw out this complicated API. @@ -4156,18 +4587,18 @@ FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- - ble. It expects new allocations, and keeps some free memory around to - improve allocation speed. However, in low memory conditions, it might - be better to free all possible memory. You can cause this to happen by - calling pcre2_jit_free_unused_memory(). 
Its argument is a general con- + ble. It expects new allocations, and keeps some free memory around to + improve allocation speed. However, in low memory conditions, it might + be better to free all possible memory. You can cause this to happen by + calling pcre2_jit_free_unused_memory(). Its argument is a general con- text, for custom memory management, or NULL for standard memory manage- ment. EXAMPLE CODE - This is a single-threaded example that specifies a JIT stack without - using a callback. A real program should include error checking after + This is a single-threaded example that specifies a JIT stack without + using a callback. A real program should include error checking after all the function calls. int rc; @@ -4195,19 +4626,20 @@ EXAMPLE CODE JIT FAST PATH API Because the API described above falls back to interpreted matching when - JIT is not available, it is convenient for programs that are written + JIT is not available, it is convenient for programs that are written for general use in many environments. However, calling JIT via pcre2_match() does have a performance impact. Programs that are written - for use where JIT is known to be available, and which need the best - possible performance, can instead use a "fast path" API to call JIT - matching directly instead of calling pcre2_match() (obviously only for + for use where JIT is known to be available, and which need the best + possible performance, can instead use a "fast path" API to call JIT + matching directly instead of calling pcre2_match() (obviously only for patterns that have been successfully processed by pcre2_jit_compile()). - The fast path function is called pcre2_jit_match(), and it takes + The fast path function is called pcre2_jit_match(), and it takes exactly the same arguments as pcre2_match(). The return values are also the same, plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or - complete) is requested that was not compiled. 
Unsupported option bits - (for example, PCRE2_ANCHORED) are ignored. + complete) is requested that was not compiled. Unsupported option bits + (for example, PCRE2_ANCHORED) are ignored, as is the PCRE2_NO_JIT + option. When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- @@ -4234,8 +4666,8 @@ AUTHOR REVISION - Last updated: 27 November 2014 - Copyright (c) 1997-2014 University of Cambridge. + Last updated: 05 June 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -4262,6 +4694,10 @@ SIZE AND OTHER LIMITATIONS of execution is slower. In the 32-bit library, the internal linkage size is always 4. + The maximum length of a source pattern string is essentially unlimited; + it is the largest number a PCRE2_SIZE variable can hold. However, the + program that calls pcre2_compile() can specify a smaller limit. + The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as size_t. Its maximum value @@ -4276,25 +4712,26 @@ SIZE AND OTHER LIMITATIONS All values in repeating quantifiers must be less than 65536. + The maximum length of a lookbehind assertion is 65535 characters. + There is no limit to the number of parenthesized subpatterns, but there can be no more than 65535 capturing subpatterns. There is, however, a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in order to limit the amount of system stack - used at compile time. The limit can be specified when PCRE2 is built; - the default is 250. - - There is a limit to the number of forward references to subsequent sub- - patterns of around 200,000. 
Repeated forward references with fixed - upper limits, for example, (?2){0,100} when subpattern number 2 is to - the right, are included in the count. There is no limit to the number - of backward references. + used at compile time. The default limit can be specified when PCRE2 is + built; if it is not specified, the default is 250. An application can change this limit + by calling pcre2_set_parens_nest_limit() to set the limit in a compile + context. The maximum length of name for a named subpattern is 32 code units, and the maximum number of named subpatterns is 10000. The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or - (*THEN) verb is 255 for the 8-bit library and 65535 for the 16-bit and - 32-bit libraries. + (*THEN) verb is 255 code units for the 8-bit library and 65535 code + units for the 16-bit and 32-bit libraries. + + The maximum length of a string argument to a callout is the largest + number a 32-bit unsigned integer can hold. AUTHOR @@ -4306,8 +4743,8 @@ AUTHOR REVISION - Last updated: 25 November 2014 - Copyright (c) 1997-2014 University of Cambridge. + Last updated: 26 October 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ @@ -4970,6 +5407,4432 @@ REVISION ------------------------------------------------------------------------------ +PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +PCRE2 REGULAR EXPRESSION DETAILS + + The syntax and semantics of the regular expressions that are supported + by PCRE2 are described in detail below. There is a quick-reference syn- + tax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax + and semantics as closely as it can. PCRE2 also supports some alterna- + tive regular expression syntax (which does not conflict with the Perl + syntax) in order to provide some compatibility with regular expressions + in Python, .NET, and Oniguruma. 
+ + Perl's regular expressions are described in its own documentation, and + regular expressions in general are covered in a number of books, some + of which have copious examples. Jeffrey Friedl's "Mastering Regular + Expressions", published by O'Reilly, covers regular expressions in + great detail. This description of PCRE2's regular expressions is + intended as reference material. + + This document discusses the patterns that are supported by PCRE2 when + its main matching function, pcre2_match(), is used. PCRE2 also has an + alternative matching function, pcre2_dfa_match(), which matches using a + different algorithm that is not Perl-compatible. Some of the features + discussed below are not available when DFA matching is used. The advan- + tages and disadvantages of the alternative function, and how it differs + from the normal function, are discussed in the pcre2matching page. + + +SPECIAL START-OF-PATTERN ITEMS + + A number of options that can be passed to pcre2_compile() can also be + set by special items at the start of a pattern. These are not Perl-com- + patible, but are provided to make these options accessible to pattern + writers who are not able to change the program that processes the pat- + tern. Any number of these items may appear, but they must all be + together right at the start of the pattern string, and the letters must + be in upper case. + + UTF support + + In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either + as single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 + can be specified for the 32-bit library, in which case it constrains + the character values to valid Unicode code points. To process UTF + strings, PCRE2 must be built to include Unicode support (which is the + default). When using UTF strings you must either call the compiling + function with the PCRE2_UTF option, or the pattern must start with the + special sequence (*UTF), which is equivalent to setting the relevant + option. 
How setting a UTF mode affects pattern matching is mentioned in + several places below. There is also a summary of features in the + pcre2unicode page. + + Some applications that allow their users to supply patterns may wish to + restrict them to non-UTF data for security reasons. If the + PCRE2_NEVER_UTF option is passed to pcre2_compile(), (*UTF) is not + allowed, and its appearance in a pattern causes an error. + + Unicode property support + + Another special sequence that may appear at the start of a pattern is + (*UCP). This has the same effect as setting the PCRE2_UCP option: it + causes sequences such as \d and \w to use Unicode properties to deter- + mine character types, instead of recognizing only characters with codes + less than 256 via a lookup table. + + Some applications that allow their users to supply patterns may wish to + restrict them for security reasons. If the PCRE2_NEVER_UCP option is + passed to pcre2_compile(), (*UCP) is not allowed, and its appearance in + a pattern causes an error. + + Locking out empty string matching + + Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same + effect as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option + to whichever matching function is subsequently called to match the pat- + tern. These options lock out the matching of empty strings, either + entirely, or only at the start of the subject. + + Disabling auto-possessification + + If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as + setting the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making + quantifiers possessive when what follows cannot match the repeated + item. For example, by default a+b is treated as a++b. For more details, + see the pcre2api documentation. + + Disabling start-up optimizations + + If a pattern starts with (*NO_START_OPT), it has the same effect as + setting the PCRE2_NO_START_OPTIMIZE option. This disables several opti- + mizations for quickly reaching "no match" results. 
For more details, + see the pcre2api documentation. + + Disabling automatic anchoring + + If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect + as setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimiza- + tions that apply to patterns whose top-level branches all start with .* + (match any number of arbitrary characters). For more details, see the + pcre2api documentation. + + Disabling JIT compilation + + If a pattern that starts with (*NO_JIT) is successfully compiled, an + attempt by the application to apply the JIT optimization by calling + pcre2_jit_compile() is ignored. + + Setting match and recursion limits + + The caller of pcre2_match() can set a limit on the number of times the + internal match() function is called and on the maximum depth of recur- + sive calls. These facilities are provided to catch runaway matches that + are provoked by patterns with huge matching trees (a typical example is + a pattern with nested unlimited repeats) and to avoid running out of + system stack by too much recursion. When one of these limits is + reached, pcre2_match() gives an error return. The limits can also be + set by items at the start of the pattern of the form + + (*LIMIT_MATCH=d) + (*LIMIT_RECURSION=d) + + where d is any number of decimal digits. However, the value of the set- + ting must be less than the value set (or defaulted) by the caller of + pcre2_match() for it to have any effect. In other words, the pattern + writer can lower the limits set by the programmer, but not raise them. + If there is more than one setting of one of these limits, the lower + value is used. + + The match limit is used (but in a different way) when JIT is being + used, but it is not relevant, and is ignored, when matching with + pcre2_dfa_match(). However, the recursion limit is relevant for DFA + matching, which does use some function recursion, in particular, for + recursions within the pattern. 
+ + Newline conventions + + PCRE2 supports five different conventions for indicating line breaks in + strings: a single CR (carriage return) character, a single LF (line- + feed) character, the two-character sequence CRLF, any of the three pre- + ceding, or any Unicode newline sequence. The pcre2api page has further + discussion about newlines, and shows how to set the newline convention + when calling pcre2_compile(). + + It is also possible to specify a newline convention by starting a pat- + tern string with one of the following five sequences: + + (*CR) carriage return + (*LF) linefeed + (*CRLF) carriage return, followed by linefeed + (*ANYCRLF) any of the three above + (*ANY) all Unicode newline sequences + + These override the default and the options given to the compiling func- + tion. For example, on a Unix system where LF is the default newline + sequence, the pattern + + (*CR)a.b + + changes the convention to CR. That pattern matches "a\nb" because LF is + no longer a newline. If more than one of these settings is present, the + last one is used. + + The newline convention affects where the circumflex and dollar asser- + tions are true. It also affects the interpretation of the dot metachar- + acter when PCRE2_DOTALL is not set, and the behaviour of \N. However, + it does not affect what the \R escape sequence matches. By default, + this is any Unicode newline sequence, for Perl compatibility. However, + this can be changed; see the description of \R in the section entitled + "Newline sequences" below. A change of \R setting can be combined with + a change of newline convention. + + Specifying what \R matches + + It is possible to restrict \R to match only CR, LF, or CRLF (instead of + the complete set of Unicode line endings) by setting the option + PCRE2_BSR_ANYCRLF at compile time. This effect can also be achieved by + starting a pattern with (*BSR_ANYCRLF). For completeness, (*BSR_UNI- + CODE) is also recognized, corresponding to PCRE2_BSR_UNICODE. 
+ + +EBCDIC CHARACTER CODES + + PCRE2 can be compiled to run in an environment that uses EBCDIC as its + character code rather than ASCII or Unicode (typically a mainframe sys- + tem). In the sections below, character code values are ASCII or Uni- + code; in an EBCDIC environment these characters may have different code + values, and there are no code points greater than 255. + + +CHARACTERS AND METACHARACTERS + + A regular expression is a pattern that is matched against a subject + string from left to right. Most characters stand for themselves in a + pattern, and match the corresponding characters in the subject. As a + trivial example, the pattern + + The quick brown fox + + matches a portion of a subject string that is identical to itself. When + caseless matching is specified (the PCRE2_CASELESS option), letters are + matched independently of case. + + The power of regular expressions comes from the ability to include + alternatives and repetitions in the pattern. These are encoded in the + pattern by the use of metacharacters, which do not stand for themselves + but instead are interpreted in some special way. + + There are two different sets of metacharacters: those that are recog- + nized anywhere in the pattern except within square brackets, and those + that are recognized within square brackets. Outside square brackets, + the metacharacters are as follows: + + \ general escape character with several uses + ^ assert start of string (or line, in multiline mode) + $ assert end of string (or line, in multiline mode) + . match any character except newline (by default) + [ start character class definition + | start of alternative branch + ( start subpattern + ) end subpattern + ? extends the meaning of ( + also 0 or 1 quantifier + also quantifier minimizer + * 0 or more quantifier + + 1 or more quantifier + also "possessive quantifier" + { start min/max quantifier + + Part of a pattern that is in square brackets is called a "character + class". 
In a character class the only metacharacters are: + + \ general escape character + ^ negate the class, but only if the first character + - indicates character range + [ POSIX character class (only if followed by POSIX + syntax) + ] terminates the character class + + The following sections describe the use of each of the metacharacters. + + +BACKSLASH + + The backslash character has several uses. Firstly, if it is followed by + a character that is not a number or a letter, it takes away any special + meaning that character may have. This use of backslash as an escape + character applies both inside and outside character classes. + + For example, if you want to match a * character, you write \* in the + pattern. This escaping action applies whether or not the following + character would otherwise be interpreted as a metacharacter, so it is + always safe to precede a non-alphanumeric with backslash to specify + that it stands for itself. In particular, if you want to match a back- + slash, you write \\. + + In a UTF mode, only ASCII numbers and letters have any special meaning + after a backslash. All other characters (in particular, those whose + codepoints are greater than 127) are treated as literals. + + If a pattern is compiled with the PCRE2_EXTENDED option, most white + space in the pattern (other than in a character class), and characters + between a # outside a character class and the next newline, inclusive, + are ignored. An escaping backslash can be used to include a white space + or # character as part of the pattern. + + If you want to remove the special meaning from a sequence of charac- + ters, you can do so by putting them between \Q and \E. This is differ- + ent from Perl in that $ and @ are handled as literals in \Q...\E + sequences in PCRE2, whereas in Perl, $ and @ cause variable interpola- + tion. 
Note the following examples: + + Pattern PCRE2 matches Perl matches + + \Qabc$xyz\E abc$xyz abc followed by the + contents of $xyz + \Qabc\$xyz\E abc\$xyz abc\$xyz + \Qabc\E\$\Qxyz\E abc$xyz abc$xyz + + The \Q...\E sequence is recognized both inside and outside character + classes. An isolated \E that is not preceded by \Q is ignored. If \Q + is not followed by \E later in the pattern, the literal interpretation + continues to the end of the pattern (that is, \E is assumed at the + end). If the isolated \Q is inside a character class, this causes an + error, because the character class is not terminated. + + Non-printing characters + + A second use of backslash provides a way of encoding non-printing char- + acters in patterns in a visible manner. There is no restriction on the + appearance of non-printing characters in a pattern, but when a pattern + is being prepared by text editing, it is often easier to use one of the + following escape sequences than the binary character it represents. In + an ASCII or Unicode environment, these escapes are as follows: + + \a alarm, that is, the BEL character (hex 07) + \cx "control-x", where x is any printable ASCII character + \e escape (hex 1B) + \f form feed (hex 0C) + \n linefeed (hex 0A) + \r carriage return (hex 0D) + \t tab (hex 09) + \0dd character with octal code 0dd + \ddd character with octal code ddd, or back reference + \o{ddd..} character with octal code ddd.. + \xhh character with hex code hh + \x{hhh..} character with hex code hhh.. (default mode) + \uhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set) + + The precise effect of \cx on ASCII characters is as follows: if x is a + lower case letter, it is converted to upper case. Then bit 6 of the + character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A + (A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes + hex 7B (; is 3B). 
If the code unit following \c has a value less than + 32 or greater than 126, a compile-time error occurs. + + When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gen- + erate the appropriate EBCDIC code values. The \c escape is processed as + specified for Perl in the perlebcdic document. The only characters that + are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. + Any other character provokes a compile-time error. The sequence \c@ + encodes character code 0; after \c the letters (in either case) encode + characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters + 27-31 (hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 + (hex 5F). + + Thus, apart from \c?, these escapes generate the same character code + values as they do in an ASCII environment, though the meanings of the + values mostly differ. For example, \cG always generates code value 7, + which is BEL in ASCII but DEL in EBCDIC. + + The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, + but because 127 is not a control character in EBCDIC, Perl makes it + generate the APC character. Unfortunately, there are several variants + of EBCDIC. In most of them the APC character has the value 255 (hex + FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If + certain other characters have POSIX-BC values, PCRE2 makes \c? generate + 95; otherwise it generates 255. + + After \0 up to two further octal digits are read. If there are fewer + than two digits, just those that are present are used. Thus the + sequence \0\x\015 specifies two binary zeros followed by a CR character + (code value 13). Make sure you supply two digits after the initial zero + if the pattern character that follows is itself an octal digit. + + The escape \o must be followed by a sequence of octal digits, enclosed + in braces. An error occurs if this is not the case. 
This escape is a + recent addition to Perl; it provides a way of specifying character code + points as octal numbers greater than 0777, and it also allows octal + numbers and back references to be unambiguously specified. + + For greater clarity and unambiguity, it is best to avoid following \ by + a digit greater than zero. Instead, use \o{} or \x{} to specify charac- + ter numbers, and \g{} to specify back references. The following para- + graphs describe the old, ambiguous syntax. + + The handling of a backslash followed by a digit other than 0 is compli- + cated, and Perl has changed over time, causing PCRE2 also to change. + + Outside a character class, PCRE2 reads the digit and any following dig- + its as a decimal number. If the number is less than 10, begins with the + digit 8 or 9, or if there are at least that many previous capturing + left parentheses in the expression, the entire sequence is taken as a + back reference. A description of how this works is given later, follow- + ing the discussion of parenthesized subpatterns. Otherwise, up to + three octal digits are read to form a character code. + + Inside a character class, PCRE2 handles \8 and \9 as the literal char- + acters "8" and "9", and otherwise reads up to three octal digits fol- + lowing the backslash, using them to generate a data character. Any sub- + sequent digits stand for themselves. 
For example, outside a character + class: + + \040 is another way of writing an ASCII space + \40 is the same, provided there are fewer than 40 + previous capturing subpatterns + \7 is always a back reference + \11 might be a back reference, or another way of + writing a tab + \011 is always a tab + \0113 is a tab followed by the character "3" + \113 might be a back reference, otherwise the + character with octal code 113 + \377 might be a back reference, otherwise + the value 255 (decimal) + \81 is always a back reference + + Note that octal values of 100 or greater that are specified using this + syntax must not be introduced by a leading zero, because no more than + three octal digits are ever read. + + By default, after \x that is not followed by {, from zero to two hexa- + decimal digits are read (letters can be in upper or lower case). Any + number of hexadecimal digits may appear between \x{ and }. If a charac- + ter other than a hexadecimal digit appears between \x{ and }, or if + there is no terminating }, an error occurs. + + If the PCRE2_ALT_BSUX option is set, the interpretation of \x is as + just described only when it is followed by two hexadecimal digits. Oth- + erwise, it matches a literal "x" character. In this mode, support + for code points greater than 256 is provided by \u, which must be fol- + lowed by four hexadecimal digits; otherwise it matches a literal "u" + character. + + Characters whose value is less than 256 can be defined by either of the + two syntaxes for \x (or by \u in PCRE2_ALT_BSUX mode). There is no dif- + ference in the way they are handled. For example, \xdc is exactly the + same as \x{dc} (or \u00dc in PCRE2_ALT_BSUX mode). 
+ + Constraints on character values + + Characters that are specified using octal or hexadecimal numbers are + limited to certain values, as follows: + + 8-bit non-UTF mode less than 0x100 + 8-bit UTF-8 mode less than 0x10ffff and a valid codepoint + 16-bit non-UTF mode less than 0x10000 + 16-bit UTF-16 mode less than 0x10ffff and a valid codepoint + 32-bit non-UTF mode less than 0x100000000 + 32-bit UTF-32 mode less than 0x10ffff and a valid codepoint + + Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so- + called "surrogate" codepoints), and 0xffef. + + Escape sequences in character classes + + All the sequences that define a single character value can be used both + inside and outside character classes. In addition, inside a character + class, \b is interpreted as the backspace character (hex 08). + + \N is not allowed in a character class. \B, \R, and \X are not special + inside a character class. Like other unrecognized alphabetic escape + sequences, they cause an error. Outside a character class, these + sequences have different meanings. + + Unsupported escape sequences + + In Perl, the sequences \l, \L, \u, and \U are recognized by its string + handler and used to modify the case of following characters. By + default, PCRE2 does not support these escape sequences. However, if the + PCRE2_ALT_BSUX option is set, \U matches a "U" character, and \u can be + used to define a character by code point, as described in the previous + section. + + Absolute and relative back references + + The sequence \g followed by a signed or unsigned number, optionally + enclosed in braces, is an absolute or relative back reference. A named + back reference can be coded as \g{name}. Back references are discussed + later, following the discussion of parenthesized subpatterns. 
+ + Absolute and relative subroutine calls + + For compatibility with Oniguruma, the non-Perl syntax \g followed by a + name or a number enclosed either in angle brackets or single quotes, is + an alternative syntax for referencing a subpattern as a "subroutine". + Details are discussed later. Note that \g{...} (Perl syntax) and + \g<...> (Oniguruma syntax) are not synonymous. The former is a back + reference; the latter is a subroutine call. + + Generic character types + + Another use of backslash is for specifying generic character types: + + \d any decimal digit + \D any character that is not a decimal digit + \h any horizontal white space character + \H any character that is not a horizontal white space character + \s any white space character + \S any character that is not a white space character + \v any vertical white space character + \V any character that is not a vertical white space character + \w any "word" character + \W any "non-word" character + + There is also the single sequence \N, which matches a non-newline char- + acter. This is the same as the "." metacharacter when PCRE2_DOTALL is + not set. Perl also uses \N to match characters by name; PCRE2 does not + support this. + + Each pair of lower and upper case escape sequences partitions the com- + plete set of characters into two disjoint sets. Any given character + matches one, and only one, of each pair. The sequences can appear both + inside and outside character classes. They each match one character of + the appropriate type. If the current matching point is at the end of + the subject string, all of them fail, because there is no character to + match. + + The default \s characters are HT (9), LF (10), VT (11), FF (12), CR + (13), and space (32), which are defined as white space in the "C" + locale. This list may vary if locale-specific matching is taking place. 
+ For example, in some locales the "non-breaking space" character (\xA0) + is recognized as white space, and in others the VT character is not. + + A "word" character is an underscore or any character that is a letter + or digit. By default, the definition of letters and digits is con- + trolled by PCRE2's low-valued character tables, and may vary if locale- + specific matching is taking place (see "Locale support" in the pcre2api + page). For example, in a French locale such as "fr_FR" in Unix-like + systems, or "french" in Windows, some character codes greater than 127 + are used for accented letters, and these are then matched by \w. The + use of locales with Unicode is discouraged. + + By default, characters whose code points are greater than 127 never + match \d, \s, or \w, and always match \D, \S, and \W, although this may + be different for characters in the range 128-255 when locale-specific + matching is happening. These escape sequences retain their original + meanings from before Unicode support was available, mainly for effi- + ciency reasons. If the PCRE2_UCP option is set, the behaviour is + changed so that Unicode properties are used to determine character + types, as follows: + + \d any character that matches \p{Nd} (decimal digit) + \s any character that matches \p{Z} or \h or \v + \w any character that matches \p{L} or \p{N}, plus underscore + + The upper case escapes match the inverse sets of characters. Note that + \d matches only decimal digits, whereas \w matches any Unicode digit, + as well as any Unicode letter, and underscore. Note also that PCRE2_UCP + affects \b, and \B because they are defined in terms of \w and \W. + Matching these sequences is noticeably slower when PCRE2_UCP is set. + + The sequences \h, \H, \v, and \V, in contrast to the other sequences, + which match only ASCII characters by default, always match a specific + list of code points, whether or not PCRE2_UCP is set. 
The horizontal + space characters are: + + U+0009 Horizontal tab (HT) + U+0020 Space + U+00A0 Non-break space + U+1680 Ogham space mark + U+180E Mongolian vowel separator + U+2000 En quad + U+2001 Em quad + U+2002 En space + U+2003 Em space + U+2004 Three-per-em space + U+2005 Four-per-em space + U+2006 Six-per-em space + U+2007 Figure space + U+2008 Punctuation space + U+2009 Thin space + U+200A Hair space + U+202F Narrow no-break space + U+205F Medium mathematical space + U+3000 Ideographic space + + The vertical space characters are: + + U+000A Linefeed (LF) + U+000B Vertical tab (VT) + U+000C Form feed (FF) + U+000D Carriage return (CR) + U+0085 Next line (NEL) + U+2028 Line separator + U+2029 Paragraph separator + + In 8-bit, non-UTF-8 mode, only the characters with code points less + than 256 are relevant. + + Newline sequences + + Outside a character class, by default, the escape sequence \R matches + any Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent + to the following: + + (?>\r\n|\n|\x0b|\f|\r|\x85) + + This is an example of an "atomic group", details of which are given + below. This particular group matches either the two-character sequence + CR followed by LF, or one of the single characters LF (linefeed, + U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (car- + riage return, U+000D), or NEL (next line, U+0085). Because this is an + atomic group, the two-character sequence is treated as a single unit + that cannot be split. + + In other modes, two additional characters whose codepoints are greater + than 255 are added: LS (line separator, U+2028) and PS (paragraph sepa- + rator, U+2029). Unicode support is not needed for these characters to + be recognized. + + It is possible to restrict \R to match only CR, LF, or CRLF (instead of + the complete set of Unicode line endings) by setting the option + PCRE2_BSR_ANYCRLF at compile time. (BSR is an abbreviation for "back- + slash R".) 
This can be made the default when PCRE2 is built; if this is + the case, the other behaviour can be requested via the PCRE2_BSR_UNI- + CODE option. It is also possible to specify these settings by starting + a pattern string with one of the following sequences: + + (*BSR_ANYCRLF) CR, LF, or CRLF only + (*BSR_UNICODE) any Unicode newline sequence + + These override the default and the options given to the compiling func- + tion. Note that these special settings, which are not Perl-compatible, + are recognized only at the very start of a pattern, and that they must + be in upper case. If more than one of them is present, the last one is + used. They can be combined with a change of newline convention; for + example, a pattern can start with: + + (*ANY)(*BSR_ANYCRLF) + + They can also be combined with the (*UTF) or (*UCP) special sequences. + Inside a character class, \R is treated as an unrecognized escape + sequence, and causes an error. + + Unicode character properties + + When PCRE2 is built with Unicode support (the default), three addi- + tional escape sequences that match characters with specific properties + are available. In 8-bit non-UTF-8 mode, these sequences are of course + limited to testing characters whose codepoints are less than 256, but + they do work in this mode. The extra escape sequences are: + + \p{xx} a character with the xx property + \P{xx} a character without the xx property + \X a Unicode extended grapheme cluster + + The property names represented by xx above are limited to the Unicode + script names, the general category properties, "Any", which matches any + character (including newline), and some special PCRE2 properties + (described in the next section). Other Perl properties such as "InMu- + sicalSymbols" are not supported by PCRE2. Note that \P{Any} does not + match any characters, so always causes a match failure. + + Sets of Unicode characters are defined as belonging to certain scripts. 
+ A character from one of these sets can be matched using a script name. + For example: + + \p{Greek} + \P{Han} + + Those that are not part of an identified script are lumped together as + "Common". The current list of scripts is: + + Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Balinese, + Bamum, Bassa_Vah, Batak, Bengali, Bopomofo, Brahmi, Braille, Buginese, + Buhid, Canadian_Aboriginal, Carian, Caucasian_Albanian, Chakma, Cham, + Cherokee, Common, Coptic, Cuneiform, Cypriot, Cyrillic, Deseret, + Devanagari, Duployan, Egyptian_Hieroglyphs, Elbasan, Ethiopic, Geor- + gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gurmukhi, Han, + Hangul, Hanunoo, Hatran, Hebrew, Hiragana, Imperial_Aramaic, Inherited, + Inscriptional_Pahlavi, Inscriptional_Parthian, Javanese, Kaithi, Kan- + nada, Katakana, Kayah_Li, Kharoshthi, Khmer, Khojki, Khudawadi, Lao, + Latin, Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Maha- + jani, Malayalam, Mandaic, Manichaean, Meetei_Mayek, Mende_Kikakui, + Meroitic_Cursive, Meroitic_Hieroglyphs, Miao, Modi, Mongolian, Mro, + Multani, Myanmar, Nabataean, New_Tai_Lue, Nko, Ogham, Ol_Chiki, + Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, Old_Persian, + Old_South_Arabian, Old_Turkic, Oriya, Osmanya, Pahawh_Hmong, Palmyrene, + Pau_Cin_Hau, Phags_Pa, Phoenician, Psalter_Pahlavi, Rejang, Runic, + Samaritan, Saurashtra, Sharada, Shavian, Siddham, SignWriting, Sinhala, + Sora_Sompeng, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, + Tai_Le, Tai_Tham, Tai_Viet, Takri, Tamil, Telugu, Thaana, Thai, + Tibetan, Tifinagh, Tirhuta, Ugaritic, Vai, Warang_Citi, Yi. + + Each character has exactly one Unicode general category property, spec- + ified by a two-letter abbreviation. For compatibility with Perl, nega- + tion can be specified by including a circumflex between the opening + brace and the property name. For example, \p{^Lu} is the same as + \P{Lu}. 
+ + If only one letter is specified with \p or \P, it includes all the gen- + eral category properties that start with that letter. In this case, in + the absence of negation, the curly brackets in the escape sequence are + optional; these two examples have the same effect: + + \p{L} + \pL + + The following general category property codes are supported: + + C Other + Cc Control + Cf Format + Cn Unassigned + Co Private use + Cs Surrogate + + L Letter + Ll Lower case letter + Lm Modifier letter + Lo Other letter + Lt Title case letter + Lu Upper case letter + + M Mark + Mc Spacing mark + Me Enclosing mark + Mn Non-spacing mark + + N Number + Nd Decimal number + Nl Letter number + No Other number + + P Punctuation + Pc Connector punctuation + Pd Dash punctuation + Pe Close punctuation + Pf Final punctuation + Pi Initial punctuation + Po Other punctuation + Ps Open punctuation + + S Symbol + Sc Currency symbol + Sk Modifier symbol + Sm Mathematical symbol + So Other symbol + + Z Separator + Zl Line separator + Zp Paragraph separator + Zs Space separator + + The special property L& is also supported: it matches a character that + has the Lu, Ll, or Lt property, in other words, a letter that is not + classified as a modifier or "other". + + The Cs (Surrogate) property applies only to characters in the range + U+D800 to U+DFFF. Such characters are not valid in Unicode strings and + so cannot be tested by PCRE2, unless UTF validity checking has been + turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api + page). Perl does not support the Cs property. + + The long synonyms for property names that Perl supports (such as + \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix + any of these properties with "Is". + + No character that is in the Unicode table has the Cn (unassigned) prop- + erty. Instead, this property is assumed for any code point that is not + in the Unicode table. 
+ + Specifying caseless matching does not affect these escape sequences. + For example, \p{Lu} always matches only upper case letters. This is + different from the behaviour of current versions of Perl. + + Matching characters by Unicode property is not fast, because PCRE2 has + to do a multistage table lookup in order to find a character's prop- + erty. That is why the traditional escape sequences such as \d and \w do + not use Unicode properties in PCRE2 by default, though you can make + them do so by setting the PCRE2_UCP option or by starting the pattern + with (*UCP). + + Extended grapheme clusters + + The \X escape matches any number of Unicode characters that form an + "extended grapheme cluster", and treats the sequence as an atomic group + (see below). Unicode supports various kinds of composite character by + giving each character a grapheme breaking property, and having rules + that use these properties to define the boundaries of extended grapheme + clusters. \X always matches at least one character. Then it decides + whether to add additional characters according to the following rules + for ending a cluster: + + 1. End at the end of the subject string. + + 2. Do not end between CR and LF; otherwise end after any control char- + acter. + + 3. Do not break Hangul (a Korean script) syllable sequences. Hangul + characters are of five types: L, V, T, LV, and LVT. An L character may + be followed by an L, V, LV, or LVT character; an LV or V character may + be followed by a V or T character; an LVT or T character may be followed + only by a T character. + + 4. Do not end before extending characters or spacing marks. Characters + with the "mark" property always have the "extend" grapheme breaking + property. + + 5. Do not end after prepend characters. + + 6. Otherwise, end the cluster. 
+ + PCRE2's additional properties + + As well as the standard Unicode properties described above, PCRE2 sup- + ports four more that make it possible to convert traditional escape + sequences such as \w and \s to use Unicode properties. PCRE2 uses these + non-standard, non-Perl properties internally when PCRE2_UCP is set. + However, they may also be used explicitly. These properties are: + + Xan Any alphanumeric character + Xps Any POSIX space character + Xsp Any Perl space character + Xwd Any Perl "word" character + + Xan matches characters that have either the L (letter) or the N (num- + ber) property. Xps matches the characters tab, linefeed, vertical tab, + form feed, or carriage return, and any other character that has the Z + (separator) property. Xsp is the same as Xps; in PCRE1 it used to + exclude vertical tab, for Perl compatibility, but Perl changed. Xwd + matches the same characters as Xan, plus underscore. + + There is another non-standard property, Xuc, which matches any charac- + ter that can be represented by a Universal Character Name in C++ and + other programming languages. These are the characters $, @, ` (grave + accent), and all characters with Unicode code points greater than or + equal to U+00A0, except for the surrogates U+D800 to U+DFFF. Note that + most base (ASCII) characters are excluded. (Universal Character Names + are of the form \uHHHH or \UHHHHHHHH where H is a hexadecimal digit. + Note that the Xuc property does not match these sequences but the char- + acters that they represent.) + + Resetting the match start + + The escape sequence \K causes any previously matched characters not to + be included in the final matched sequence. For example, the pattern: + + foo\Kbar + + matches "foobar", but reports that it has matched "bar". This feature + is similar to a lookbehind assertion (described below). 
However, in + this case, the part of the subject before the real match does not have + to be of fixed length, as lookbehind assertions do. The use of \K does + not interfere with the setting of captured substrings. For example, + when the pattern + + (foo)\Kbar + + matches "foobar", the first substring is still set to "foo". + + Perl documents that the use of \K within assertions is "not well + defined". In PCRE2, \K is acted upon when it occurs inside positive + assertions, but is ignored in negative assertions. Note that when a + pattern such as (?=ab\K) matches, the reported start of the match can + be greater than the end of the match. + + Simple assertions + + The final use of backslash is for certain simple assertions. An asser- + tion specifies a condition that has to be met at a particular point in + a match, without consuming any characters from the subject string. The + use of subpatterns for more complicated assertions is described below. + The backslashed assertions are: + + \b matches at a word boundary + \B matches when not at a word boundary + \A matches at the start of the subject + \Z matches at the end of the subject + also matches before a newline at the end of the subject + \z matches only at the end of the subject + \G matches at the first matching position in the subject + + Inside a character class, \b has a different meaning; it matches the + backspace character. If any other of these assertions appears in a + character class, an "invalid escape sequence" error is generated. + + A word boundary is a position in the subject string where the current + character and the previous character do not both match \w or \W (i.e. + one matches \w and the other matches \W), or the start or end of the + string if the first or last character matches \w, respectively. In a + UTF mode, the meanings of \w and \W can be changed by setting the + PCRE2_UCP option. When this is done, it also affects \b and \B. 
Neither + PCRE2 nor Perl has a separate "start of word" or "end of word" metase- + quence. However, whatever follows \b normally determines which it is. + For example, the fragment \ba matches "a" at the start of a word. + + The \A, \Z, and \z assertions differ from the traditional circumflex + and dollar (described in the next section) in that they only ever match + at the very start and end of the subject string, whatever options are + set. Thus, they are independent of multiline mode. These three asser- + tions are not affected by the PCRE2_NOTBOL or PCRE2_NOTEOL options, + which affect only the behaviour of the circumflex and dollar metachar- + acters. However, if the startoffset argument of pcre2_match() is non- + zero, indicating that matching is to start at a point other than the + beginning of the subject, \A can never match. The difference between + \Z and \z is that \Z matches before a newline at the end of the string + as well as at the very end, whereas \z matches only at the end. + + The \G assertion is true only when the current matching position is at + the start point of the match, as specified by the startoffset argument + of pcre2_match(). It differs from \A when the value of startoffset is + non-zero. By calling pcre2_match() multiple times with appropriate + arguments, you can mimic Perl's /g option, and it is in this kind of + implementation where \G can be useful. + + Note, however, that PCRE2's interpretation of \G, as the start of the + current match, is subtly different from Perl's, which defines it as the + end of the previous match. In Perl, these can be different when the + previously matched string was empty. Because PCRE2 does just one match + at a time, it cannot reproduce this behaviour. + + If all the alternatives of a pattern begin with \G, the expression is + anchored to the starting match position, and the "anchored" flag is set + in the compiled regular expression. 
+ + +CIRCUMFLEX AND DOLLAR + + The circumflex and dollar metacharacters are zero-width assertions. + That is, they test for a particular condition being true without con- + suming any characters from the subject string. These two metacharacters + are concerned with matching the starts and ends of lines. If the new- + line convention is set so that only the two-character sequence CRLF is + recognized as a newline, isolated CR and LF characters are treated as + ordinary data characters, and are not recognized as newlines. + + Outside a character class, in the default matching mode, the circumflex + character is an assertion that is true only if the current matching + point is at the start of the subject string. If the startoffset argu- + ment of pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circum- + flex can never match if the PCRE2_MULTILINE option is unset. Inside a + character class, circumflex has an entirely different meaning (see + below). + + Circumflex need not be the first character of the pattern if a number + of alternatives are involved, but it should be the first thing in each + alternative in which it appears if the pattern is ever to match that + branch. If all possible alternatives start with a circumflex, that is, + if the pattern is constrained to match only at the start of the sub- + ject, it is said to be an "anchored" pattern. (There are also other + constructs that can cause a pattern to be anchored.) + + The dollar character is an assertion that is true only if the current + matching point is at the end of the subject string, or immediately + before a newline at the end of the string (by default), unless + PCRE2_NOTEOL is set. Note, however, that it does not actually match the + newline. Dollar need not be the last character of the pattern if a num- + ber of alternatives are involved, but it should be the last item in any + branch in which it appears. Dollar has no special meaning in a charac- + ter class. 
+ + The meaning of dollar can be changed so that it matches only at the + very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at + compile time. This does not affect the \Z assertion. + + The meanings of the circumflex and dollar metacharacters are changed if + the PCRE2_MULTILINE option is set. When this is the case, a dollar + character matches before any newlines in the string, as well as at the + very end, and a circumflex matches immediately after internal newlines + as well as at the start of the subject string. It does not match after + a newline that ends the string, for compatibility with Perl. However, + this can be changed by setting the PCRE2_ALT_CIRCUMFLEX option. + + For example, the pattern /^abc$/ matches the subject string "def\nabc" + (where \n represents a newline) in multiline mode, but not otherwise. + Consequently, patterns that are anchored in single line mode because + all branches start with ^ are not anchored in multiline mode, and a + match for circumflex is possible when the startoffset argument of + pcre2_match() is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored + if PCRE2_MULTILINE is set. + + When the newline convention (see "Newline conventions" below) recog- + nizes the two-character sequence CRLF as a newline, this is preferred, + even if the single characters CR and LF are also recognized as new- + lines. For example, if the newline convention is "any", a multiline + mode circumflex matches before "xyz" in the string "abc\r\nxyz" rather + than after CR, even though CR on its own is a valid newline. (It also + matches at the very start of the string, of course.) + + Note that the sequences \A, \Z, and \z can be used to match the start + and end of the subject in both modes, and if all branches of a pattern + start with \A it is always anchored, whether or not PCRE2_MULTILINE is + set. 
+ + +FULL STOP (PERIOD, DOT) AND \N + + Outside a character class, a dot in the pattern matches any one charac- + ter in the subject string except (by default) a character that signi- + fies the end of a line. + + When a line ending is defined as a single character, dot never matches + that character; when the two-character sequence CRLF is used, dot does + not match CR if it is immediately followed by LF, but otherwise it + matches all characters (including isolated CRs and LFs). When any Uni- + code line endings are being recognized, dot does not match CR or LF or + any of the other line ending characters. + + The behaviour of dot with regard to newlines can be changed. If the + PCRE2_DOTALL option is set, a dot matches any one character, without + exception. If the two-character sequence CRLF is present in the sub- + ject string, it takes two dots to match it. + + The handling of dot is entirely independent of the handling of circum- + flex and dollar, the only relationship being that they both involve + newlines. Dot has no special meaning in a character class. + + The escape sequence \N behaves like a dot, except that it is not + affected by the PCRE2_DOTALL option. In other words, it matches any + character except one that signifies the end of a line. Perl also uses + \N to match characters by name; PCRE2 does not support this. + + +MATCHING A SINGLE CODE UNIT + + Outside a character class, the escape sequence \C matches any one code + unit, whether or not a UTF mode is set. In the 8-bit library, one code + unit is one byte; in the 16-bit library it is a 16-bit unit; in the + 32-bit library it is a 32-bit unit. Unlike a dot, \C always matches + line-ending characters. The feature is provided in Perl in order to + match individual bytes in UTF-8 mode, but it is unclear how it can use- + fully be used. 
+ + Because \C breaks up characters into individual code units, matching + one unit with \C in UTF-8 or UTF-16 mode means that the rest of the + string may start with a malformed UTF character. This has undefined + results, because PCRE2 assumes that it is matching character by charac- + ter in a valid UTF string (by default it checks the subject string's + validity at the start of processing unless the PCRE2_NO_UTF_CHECK + option is used). + + An application can lock out the use of \C by setting the + PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also + possible to build PCRE2 with the use of \C permanently disabled. + + PCRE2 does not allow \C to appear in lookbehind assertions (described + below) in UTF-8 or UTF-16 modes, because this would make it impossible + to calculate the length of the lookbehind. Neither the alternative + matching function pcre2_dfa_match() nor the JIT optimizer support \C in + these UTF modes. The former gives a match-time error; the latter fails + to optimize and so the match is always run using the interpreter. + + In the 32-bit library, however, \C is always supported (when not + explicitly locked out) because it always matches a single code unit, + whether or not UTF-32 is specified. + + In general, the \C escape sequence is best avoided. However, one way of + using it that avoids the problem of malformed UTF-8 or UTF-16 charac- + ters is to use a lookahead to check the length of the next character, + as in this pattern, which could be used with a UTF-8 string (ignore + white space and line breaks): + + (?| (?=[\x00-\x7f])(\C) | + (?=[\x80-\x{7ff}])(\C)(\C) | + (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) | + (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C)) + + In this example, a group that starts with (?| resets the capturing + parentheses numbers in each alternative (see "Duplicate Subpattern Num- + bers" below). 
The assertions at the start of each branch check the next + UTF-8 character for values whose encoding uses 1, 2, 3, or 4 bytes, + respectively. The character's individual bytes are then captured by the + appropriate number of \C groups. + + +SQUARE BRACKETS AND CHARACTER CLASSES + + An opening square bracket introduces a character class, terminated by a + closing square bracket. A closing square bracket on its own is not spe- + cial by default. If a closing square bracket is required as a member + of the class, it should be the first data character in the class (after + an initial circumflex, if present) or escaped with a backslash. This + means that, by default, an empty class cannot be defined. However, if + the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing square bracket at + the start does end the (empty) class. + + A character class matches a single character in the subject. A matched + character must be in the set of characters defined by the class, unless + the first character in the class definition is a circumflex, in which + case the subject character must not be in the set defined by the class. + If a circumflex is actually required as a member of the class, ensure + it is not the first character, or escape it with a backslash. + + For example, the character class [aeiou] matches any lower case vowel, + while [^aeiou] matches any character that is not a lower case vowel. + Note that a circumflex is just a convenient notation for specifying the + characters that are in the class by enumerating those that are not. A + class that starts with a circumflex is not an assertion; it still con- + sumes a character from the subject string, and therefore it fails if + the current pointer is at the end of the string. 
+ + When caseless matching is set, any letters in a class represent both + their upper case and lower case versions, so for example, a caseless + [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not + match "A", whereas a caseful version would. + + Characters that might indicate line breaks are never treated in any + special way when matching character classes, whatever line-ending + sequence is in use, and whatever setting of the PCRE2_DOTALL and + PCRE2_MULTILINE options is used. A class such as [^a] always matches + one of these characters. + + The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V, + \w, and \W may appear in a character class, and add the characters that + they match to the class. For example, [\dABCDEF] matches any hexadeci- + mal digit. In UTF modes, the PCRE2_UCP option affects the meanings of + \d, \s, \w and their upper case partners, just as it does when they + appear outside a character class, as described in the section entitled + "Generic character types" above. The escape sequence \b has a different + meaning inside a character class; it matches the backspace character. + The sequences \B, \N, \R, and \X are not special inside a character + class. Like any other unrecognized escape sequences, they cause an + error. + + The minus (hyphen) character can be used to specify a range of charac- + ters in a character class. For example, [d-m] matches any letter + between d and m, inclusive. If a minus character is required in a + class, it must be escaped with a backslash or appear in a position + where it cannot be interpreted as indicating a range, typically as the + first or last character in the class, or immediately after a range. For + example, [b-d-z] matches letters in the range b to d, a hyphen charac- + ter, or z. 
+ + Perl treats a hyphen as a literal if it appears before or after a POSIX + class (see below) or a character type escape such as \d, but gives a + warning in its warning mode, as this is most likely a user error. As + PCRE2 has no facility for warning, an error is given in these cases. + + It is not possible to have the literal character "]" as the end charac- + ter of a range. A pattern such as [W-]46] is interpreted as a class of + two characters ("W" and "-") followed by a literal string "46]", so it + would match "W46]" or "-46]". However, if the "]" is escaped with a + backslash it is interpreted as the end of range, so [W-\]46] is inter- + preted as a class containing a range followed by two other characters. + The octal or hexadecimal representation of "]" can also be used to end + a range. + + Ranges normally include all code points between the start and end char- + acters, inclusive. They can also be used for code points specified + numerically, for example [\000-\037]. Ranges can include any characters + that are valid for the current mode. + + There is a special case in EBCDIC environments for ranges whose end + points are both specified as literal letters in the same case. For com- + patibility with Perl, EBCDIC code points within the range that are not + letters are omitted. For example, [h-k] matches only four characters, + even though the codes for h and k are 0x88 and 0x92, a range of 11 code + points. However, if the range is specified numerically, for example, + [\x88-\x92] or [h-\x92], all code points are included. + + If a range that includes letters is used when caseless matching is set, + it matches the letters in either case. For example, [W-c] is equivalent + to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if + character tables for a French locale are in use, [\xc8-\xcb] matches + accented E characters in both cases.
+ + A circumflex can conveniently be used with the upper case character + types to specify a more restricted set of characters than the matching + lower case type. For example, the class [^\W_] matches any letter or + digit, but not underscore, whereas [\w] includes underscore. A positive + character class should be read as "something OR something OR ..." and a + negative class as "NOT something AND NOT something AND NOT ...". + + The only metacharacters that are recognized in character classes are + backslash, hyphen (only where it can be interpreted as specifying a + range), circumflex (only at the start), opening square bracket (only + when it can be interpreted as introducing a POSIX class name, or for a + special compatibility feature - see the next two sections), and the + terminating closing square bracket. However, escaping other non- + alphanumeric characters does no harm. + + +POSIX CHARACTER CLASSES + + Perl supports the POSIX notation for character classes. This uses names + enclosed by [: and :] within the enclosing square brackets. PCRE2 also + supports this notation. For example, + + [01[:alpha:]%] + + matches "0", "1", any alphabetic character, or "%". The supported class + names are: + + alnum letters and digits + alpha letters + ascii character codes 0 - 127 + blank space or tab only + cntrl control characters + digit decimal digits (same as \d) + graph printing characters, excluding space + lower lower case letters + print printing characters, including space + punct printing characters, excluding letters and digits and space + space white space (the same as \s from PCRE2 8.34) + upper upper case letters + word "word" characters (same as \w) + xdigit hexadecimal digits + + The default "space" characters are HT (9), LF (10), VT (11), FF (12), + CR (13), and space (32). If locale-specific matching is taking place, + the list of space characters may be different; there may be fewer or + more of them. "Space" and \s match the same set of characters. 
+ + The name "word" is a Perl extension, and "blank" is a GNU extension + from Perl 5.8. Another Perl extension is negation, which is indicated + by a ^ character after the colon. For example, + + [12[:^digit:]] + + matches "1", "2", or any non-digit. PCRE2 (and Perl) also recognize the + POSIX syntax [.ch.] and [=ch=] where "ch" is a "collating element", but + these are not supported, and an error is given if they are encountered. + + By default, characters with values greater than 127 do not match any of + the POSIX character classes, although this may be different for charac- + ters in the range 128-255 when locale-specific matching is happening. + However, if the PCRE2_UCP option is passed to pcre2_compile(), some of + the classes are changed so that Unicode character properties are used. + This is achieved by replacing certain POSIX classes with other + sequences, as follows: + + [:alnum:] becomes \p{Xan} + [:alpha:] becomes \p{L} + [:blank:] becomes \h + [:cntrl:] becomes \p{Cc} + [:digit:] becomes \p{Nd} + [:lower:] becomes \p{Ll} + [:space:] becomes \p{Xps} + [:upper:] becomes \p{Lu} + [:word:] becomes \p{Xwd} + + Negated versions, such as [:^alpha:] use \P instead of \p. Three other + POSIX classes are handled specially in UCP mode: + + [:graph:] This matches characters that have glyphs that mark the page + when printed. In Unicode property terms, it matches all char- + acters with the L, M, N, P, S, or Cf properties, except for: + + U+061C Arabic Letter Mark + U+180E Mongolian Vowel Separator + U+2066 - U+2069 Various "isolate"s + + + [:print:] This matches the same characters as [:graph:] plus space + characters that are not controls, that is, characters with + the Zs property. + + [:punct:] This matches all characters that have the Unicode P (punctua- + tion) property, plus those characters with code points less + than 256 that have the S (Symbol) property. 
+ + The other POSIX classes are unchanged, and match only characters with + code points less than 256. + + +COMPATIBILITY FEATURE FOR WORD BOUNDARIES + + In the POSIX.2 compliant library that was included in 4.4BSD Unix, the + ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" + and "end of word". PCRE2 treats these items as follows: + + [[:<:]] is converted to \b(?=\w) + [[:>:]] is converted to \b(?<=\w) + + Only these exact character sequences are recognized. A sequence such as + [a[:<:]b] provokes error for an unrecognized POSIX class name. This + support is not compatible with Perl. It is provided to help migrations + from other environments, and is best not used in any new patterns. Note + that \b matches at the start and the end of a word (see "Simple asser- + tions" above), and in a Perl-style pattern the preceding or following + character normally shows which is wanted, without the need for the + assertions that are used above in order to give exactly the POSIX be- + haviour. + + +VERTICAL BAR + + Vertical bar characters are used to separate alternative patterns. For + example, the pattern + + gilbert|sullivan + + matches either "gilbert" or "sullivan". Any number of alternatives may + appear, and an empty alternative is permitted (matching the empty + string). The matching process tries each alternative in turn, from left + to right, and the first one that succeeds is used. If the alternatives + are within a subpattern (defined below), "succeeds" means matching the + rest of the main pattern as well as the alternative in the subpattern. + + +INTERNAL OPTION SETTING + + The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and + PCRE2_EXTENDED options (which are Perl-compatible) can be changed from + within the pattern by a sequence of Perl option letters enclosed + between "(?" and ")". 
The option letters are + + i for PCRE2_CASELESS + m for PCRE2_MULTILINE + s for PCRE2_DOTALL + x for PCRE2_EXTENDED + + For example, (?im) sets caseless, multiline matching. It is also possi- + ble to unset these options by preceding the letter with a hyphen, and a + combined setting and unsetting such as (?im-sx), which sets PCRE2_CASE- + LESS and PCRE2_MULTILINE while unsetting PCRE2_DOTALL and + PCRE2_EXTENDED, is also permitted. If a letter appears both before and + after the hyphen, the option is unset. An empty options setting "(?)" + is allowed. Needless to say, it has no effect. + + The PCRE2-specific options PCRE2_DUPNAMES and PCRE2_UNGREEDY can be + changed in the same way as the Perl-compatible options by using the + characters J and U respectively. + + When one of these option changes occurs at top level (that is, not + inside subpattern parentheses), the change applies to the remainder of + the pattern that follows. An option change within a subpattern (see + below for a description of subpatterns) affects only that part of the + subpattern that follows it, so + + (a(?i)b)c + + matches abc and aBc and no other strings (assuming PCRE2_CASELESS is + not used). By this means, options can be made to have different set- + tings in different parts of the pattern. Any changes made in one alter- + native do carry on into subsequent branches within the same subpattern. + For example, + + (a(?i)b|c) + + matches "ab", "aB", "c", and "C", even though when matching "C" the + first branch is abandoned before the option setting. This is because + the effects of option settings happen at compile time. There would be + some very weird behaviour otherwise. + + As a convenient shorthand, if any option settings are required at the + start of a non-capturing subpattern (see the next section), the option + letters may appear between the "?" and the ":". Thus the two patterns + + (?i:saturday|sunday) + (?:(?i)saturday|sunday) + + match exactly the same set of strings. 
+ + Note: There are other PCRE2-specific options that can be set by the + application when the compiling function is called. The pattern can con- + tain special leading sequences such as (*CRLF) to override what the + application has set or what has been defaulted. Details are given in + the section entitled "Newline sequences" above. There are also the + (*UTF) and (*UCP) leading sequences that can be used to set UTF and + Unicode property modes; they are equivalent to setting the PCRE2_UTF + and PCRE2_UCP options, respectively. However, the application can set + the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP options, which lock out the use + of the (*UTF) and (*UCP) sequences. + + +SUBPATTERNS + + Subpatterns are delimited by parentheses (round brackets), which can be + nested. Turning part of a pattern into a subpattern does two things: + + 1. It localizes a set of alternatives. For example, the pattern + + cat(aract|erpillar|) + + matches "cataract", "caterpillar", or "cat". Without the parentheses, + it would match "cataract", "erpillar" or an empty string. + + 2. It sets up the subpattern as a capturing subpattern. This means + that, when the whole pattern matches, the portion of the subject string + that matched the subpattern is passed back to the caller, separately + from the portion that matched the whole pattern. (This applies only to + the traditional matching function; the DFA matching function does not + support capturing.) + + Opening parentheses are counted from left to right (starting from 1) to + obtain numbers for the capturing subpatterns. For example, if the + string "the red king" is matched against the pattern + + the ((red|white) (king|queen)) + + the captured substrings are "red king", "red", and "king", and are num- + bered 1, 2, and 3, respectively. + + The fact that plain parentheses fulfil two functions is not always + helpful. There are often times when a grouping subpattern is required + without a capturing requirement. 
If an opening parenthesis is followed + by a question mark and a colon, the subpattern does not do any captur- + ing, and is not counted when computing the number of any subsequent + capturing subpatterns. For example, if the string "the white queen" is + matched against the pattern + + the ((?:red|white) (king|queen)) + + the captured substrings are "white queen" and "queen", and are numbered + 1 and 2. The maximum number of capturing subpatterns is 65535. + + As a convenient shorthand, if any option settings are required at the + start of a non-capturing subpattern, the option letters may appear + between the "?" and the ":". Thus the two patterns + + (?i:saturday|sunday) + (?:(?i)saturday|sunday) + + match exactly the same set of strings. Because alternative branches are + tried from left to right, and options are not reset until the end of + the subpattern is reached, an option setting in one branch does affect + subsequent branches, so the above patterns match "SUNDAY" as well as + "Saturday". + + +DUPLICATE SUBPATTERN NUMBERS + + Perl 5.10 introduced a feature whereby each alternative in a subpattern + uses the same numbers for its capturing parentheses. Such a subpattern + starts with (?| and is itself a non-capturing subpattern. For example, + consider this pattern: + + (?|(Sat)ur|(Sun))day + + Because the two alternatives are inside a (?| group, both sets of cap- + turing parentheses are numbered one. Thus, when the pattern matches, + you can look at captured substring number one, whichever alternative + matched. This construct is useful when you want to capture part, but + not all, of one of a number of alternatives. Inside a (?| group, paren- + theses are numbered as usual, but the number is reset at the start of + each branch. The numbers of any capturing parentheses that follow the + subpattern start after the highest number used in any branch. The fol- + lowing example is taken from the Perl documentation. 
The numbers under- + neath show in which buffer the captured content will be stored. + + # before ---------------branch-reset----------- after + / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x + # 1 2 2 3 2 3 4 + + A back reference to a numbered subpattern uses the most recent value + that is set for that number by any subpattern. The following pattern + matches "abcabc" or "defdef": + + /(?|(abc)|(def))\1/ + + In contrast, a subroutine call to a numbered subpattern always refers + to the first one in the pattern with the given number. The following + pattern matches "abcabc" or "defabc": + + /(?|(abc)|(def))(?1)/ + + A relative reference such as (?-1) is no different: it is just a conve- + nient way of computing an absolute group number. + + If a condition test for a subpattern's having matched refers to a non- + unique number, the test is true if any of the subpatterns of that num- + ber have matched. + + An alternative approach to using this "branch reset" feature is to use + duplicate named subpatterns, as described in the next section. + + +NAMED SUBPATTERNS + + Identifying capturing parentheses by number is simple, but it can be + very hard to keep track of the numbers in complicated regular expres- + sions. Furthermore, if an expression is modified, the numbers may + change. To help with this difficulty, PCRE2 supports the naming of sub- + patterns. This feature was not added to Perl until release 5.10. Python + had the feature earlier, and PCRE1 introduced it at release 4.0, using + the Python syntax. PCRE2 supports both the Perl and the Python syntax. + Perl allows identically numbered subpatterns to have different names, + but PCRE2 does not. + + In PCRE2, a subpattern can be named in one of three ways: (?<name>...) + or (?'name'...) as in Perl, or (?P<name>...) as in Python. References + to capturing parentheses from other parts of the pattern, such as back + references, recursion, and conditions, can be made by name as well as + by number.
+ + Names consist of up to 32 alphanumeric characters and underscores, but + must start with a non-digit. Named capturing parentheses are still + allocated numbers as well as names, exactly as if the names were not + present. The PCRE2 API provides function calls for extracting the name- + to-number translation table from a compiled pattern. There are also + convenience functions for extracting a captured substring by name. + + By default, a name must be unique within a pattern, but it is possible + to relax this constraint by setting the PCRE2_DUPNAMES option at com- + pile time. (Duplicate names are also always permitted for subpatterns + with the same number, set up as described in the previous section.) + Duplicate names can be useful for patterns where only one instance of + the named parentheses can match. Suppose you want to match the name of + a weekday, either as a 3-letter abbreviation or as the full name, and + in both cases you want to extract the abbreviation. This pattern + (ignoring the line breaks) does the job: + + (?<DN>Mon|Fri|Sun)(?:day)?| + (?<DN>Tue)(?:sday)?| + (?<DN>Wed)(?:nesday)?| + (?<DN>Thu)(?:rsday)?| + (?<DN>Sat)(?:urday)? + + There are five capturing substrings, but only one is ever set after a + match. (An alternative way of solving this problem is to use a "branch + reset" subpattern, as described in the previous section.) + + The convenience functions for extracting the data by name return the + substring for the first (and in this example, the only) subpattern of + that name that matched. This saves searching to find which numbered + subpattern it was. + + If you make a back reference to a non-unique named subpattern from + elsewhere in the pattern, the subpatterns to which the name refers are + checked in the order in which they appear in the overall pattern. The + first one that is set is used for the reference. For example, this pat- + tern matches both "foofoo" and "barbar" but not "foobar" or "barfoo": + + (?:(?<n>foo)|(?<n>
bar))\k<n> + + + If you make a subroutine call to a non-unique named subpattern, the one + that corresponds to the first occurrence of the name is used. In the + absence of duplicate numbers (see the previous section) this is the one + with the lowest number. + + If you use a named reference in a condition test (see the section about + conditions below), either to check whether a subpattern has matched, or + to check for recursion, all subpatterns with the same name are tested. + If the condition is true for any one of them, the overall condition is + true. This is the same behaviour as testing by number. For further + details of the interfaces for handling named subpatterns, see the + pcre2api documentation. + + Warning: You cannot use different names to distinguish between two sub- + patterns with the same number because PCRE2 uses only the numbers when + matching. For this reason, an error is given at compile time if differ- + ent names are given to subpatterns with the same number. However, you + can always give the same name to subpatterns with the same number, even + when PCRE2_DUPNAMES is not set. + + +REPETITION + + Repetition is specified by quantifiers, which can follow any of the + following items: + + a literal data character + the dot metacharacter + the \C escape sequence + the \X escape sequence + the \R escape sequence + an escape such as \d or \pL that matches a single character + a character class + a back reference + a parenthesized subpattern (including most assertions) + a subroutine call to a subpattern (recursive or otherwise) + + The general repetition quantifier specifies a minimum and maximum num- + ber of permitted matches, by giving the two numbers in curly brackets + (braces), separated by a comma. The numbers must be less than 65536, + and the first must be less than or equal to the second. For example: + + z{2,4} + + matches "zz", "zzz", or "zzzz". A closing brace on its own is not a + special character.
If the second number is omitted, but the comma is + present, there is no upper limit; if the second number and the comma + are both omitted, the quantifier specifies an exact number of required + matches. Thus + + [aeiou]{3,} + + matches at least 3 successive vowels, but may match many more, whereas + + \d{8} + + matches exactly 8 digits. An opening curly bracket that appears in a + position where a quantifier is not allowed, or one that does not match + the syntax of a quantifier, is taken as a literal character. For exam- + ple, {,6} is not a quantifier, but a literal string of four characters. + + In UTF modes, quantifiers apply to characters rather than to individual + code units. Thus, for example, \x{100}{2} matches two characters, each + of which is represented by a two-byte sequence in a UTF-8 string. Simi- + larly, \X{3} matches three Unicode extended grapheme clusters, each of + which may be several code units long (and they may be of different + lengths). + + The quantifier {0} is permitted, causing the expression to behave as if + the previous item and the quantifier were not present. This may be use- + ful for subpatterns that are referenced as subroutines from elsewhere + in the pattern (but see also the section entitled "Defining subpatterns + for use by reference only" below). Items other than subpatterns that + have a {0} quantifier are omitted from the compiled pattern. + + For convenience, the three most common quantifiers have single-charac- + ter abbreviations: + + * is equivalent to {0,} + + is equivalent to {1,} + ? is equivalent to {0,1} + + It is possible to construct infinite loops by following a subpattern + that can match no characters with a quantifier that has no upper limit, + for example: + + (a?)* + + Earlier versions of Perl and PCRE1 used to give an error at compile + time for such patterns. 
However, because there are cases where this can + be useful, such patterns are now accepted, but if any repetition of the + subpattern does in fact match no characters, the loop is forcibly bro- + ken. + + By default, the quantifiers are "greedy", that is, they match as much + as possible (up to the maximum number of permitted times), without + causing the rest of the pattern to fail. The classic example of where + this gives problems is in trying to match comments in C programs. These + appear between /* and */ and within the comment, individual * and / + characters may appear. An attempt to match C comments by applying the + pattern + + /\*.*\*/ + + to the string + + /* first comment */ not comment /* second comment */ + + fails, because it matches the entire string owing to the greediness of + the .* item. + + If a quantifier is followed by a question mark, it ceases to be greedy, + and instead matches the minimum number of times possible, so the pat- + tern + + /\*.*?\*/ + + does the right thing with the C comments. The meaning of the various + quantifiers is not otherwise changed, just the preferred number of + matches. Do not confuse this use of question mark with its use as a + quantifier in its own right. Because it has two uses, it can sometimes + appear doubled, as in + + \d??\d + + which matches one digit by preference, but can match two if that is the + only way the rest of the pattern matches. + + If the PCRE2_UNGREEDY option is set (an option that is not available in + Perl), the quantifiers are not greedy by default, but individual ones + can be made greedy by following them with a question mark. In other + words, it inverts the default behaviour. + + When a parenthesized subpattern is quantified with a minimum repeat + count that is greater than 1 or with a limited maximum, more memory is + required for the compiled pattern, in proportion to the size of the + minimum or maximum. 
+ + If a pattern starts with .* or .{0,} and the PCRE2_DOTALL option + (equivalent to Perl's /s) is set, thus allowing the dot to match new- + lines, the pattern is implicitly anchored, because whatever follows + will be tried against every character position in the subject string, + so there is no point in retrying the overall match at any position + after the first. PCRE2 normally treats such a pattern as though it were + preceded by \A. + + In cases where it is known that the subject string contains no new- + lines, it is worth setting PCRE2_DOTALL in order to obtain this opti- + mization, or alternatively, using ^ to indicate anchoring explicitly. + + However, there are some cases where the optimization cannot be used. + When .* is inside capturing parentheses that are the subject of a back + reference elsewhere in the pattern, a match at the start may fail where + a later one succeeds. Consider, for example: + + (.*)abc\1 + + If the subject is "xyz123abc123" the match point is the fourth charac- + ter. For this reason, such a pattern is not implicitly anchored. + + Another case where implicit anchoring is not applied is when the lead- + ing .* is inside an atomic group. Once again, a match at the start may + fail where a later one succeeds. Consider this pattern: + + (?>.*?a)b + + It matches "ab" in the subject "aab". The use of the backtracking con- + trol verbs (*PRUNE) and (*SKIP) also disable this optimization, and + there is an option, PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. + + When a capturing subpattern is repeated, the value captured is the sub- + string that matched the final iteration. For example, after + + (tweedle[dume]{3}\s*)+ + + has matched "tweedledum tweedledee" the value of the captured substring + is "tweedledee". However, if there are nested capturing subpatterns, + the corresponding captured values may have been set in previous itera- + tions. 
For example, after + + (a|(b))+ + + matches "aba" the value of the second captured substring is "b". + + +ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS + + With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") + repetition, failure of what follows normally causes the repeated item + to be re-evaluated to see if a different number of repeats allows the + rest of the pattern to match. Sometimes it is useful to prevent this, + either to change the nature of the match, or to cause it to fail earlier + than it otherwise might, when the author of the pattern knows there is + no point in carrying on. + + Consider, for example, the pattern \d+foo when applied to the subject + line + + 123456bar + + After matching all 6 digits and then failing to match "foo", the normal + action of the matcher is to try again with only 5 digits matching the + \d+ item, and then with 4, and so on, before ultimately failing. + "Atomic grouping" (a term taken from Jeffrey Friedl's book) provides + the means for specifying that once a subpattern has matched, it is not + to be re-evaluated in this way. + + If we use atomic grouping for the previous example, the matcher gives + up immediately on failing to match "foo" the first time. The notation + is a kind of special parenthesis, starting with (?> as in this example: + + (?>\d+)foo + + This kind of parenthesis "locks up" the part of the pattern it con- + tains once it has matched, and a failure further into the pattern is + prevented from backtracking into it. Backtracking past it to previous + items, however, works as normal. + + An alternative description is that a subpattern of this type matches + exactly the string of characters that an identical standalone pattern + would match, if anchored at the current point in the subject string. + + Atomic grouping subpatterns are not capturing subpatterns. Simple cases + such as the above example can be thought of as a maximizing repeat that + must swallow everything it can.
So, while both \d+ and \d+? are pre- + pared to adjust the number of digits they match in order to make the + rest of the pattern match, (?>\d+) can only match an entire sequence of + digits. + + Atomic groups in general can of course contain arbitrarily complicated + subpatterns, and can be nested. However, when the subpattern for an + atomic group is just a single repeated item, as in the example above, a + simpler notation, called a "possessive quantifier" can be used. This + consists of an additional + character following a quantifier. Using + this notation, the previous example can be rewritten as + + \d++foo + + Note that a possessive quantifier can be used with an entire group, for + example: + + (abc|xyz){2,3}+ + + Possessive quantifiers are always greedy; the setting of the + PCRE2_UNGREEDY option is ignored. They are a convenient notation for + the simpler forms of atomic group. However, there is no difference in + the meaning of a possessive quantifier and the equivalent atomic group, + though there may be a performance difference; possessive quantifiers + should be slightly faster. + + The possessive quantifier syntax is an extension to the Perl 5.8 syn- + tax. Jeffrey Friedl originated the idea (and the name) in the first + edition of his book. Mike McCloskey liked it, so implemented it when he + built Sun's Java package, and PCRE1 copied it from there. It ultimately + found its way into Perl at release 5.10. + + PCRE2 has an optimization that automatically "possessifies" certain + simple pattern constructs. For example, the sequence A+B is treated as + A++B because there is no point in backtracking into a sequence of A's + when B must follow. This feature can be disabled by the PCRE2_NO_AUTO- + POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS). 
+ + When a pattern contains an unlimited repeat inside a subpattern that + can itself be repeated an unlimited number of times, the use of an + atomic group is the only way to avoid some failing matches taking a + very long time indeed. The pattern + + (\D+|<\d+>)*[!?] + + matches an unlimited number of substrings that either consist of non- + digits, or digits enclosed in <>, followed by either ! or ?. When it + matches, it runs quickly. However, if it is applied to + + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + + it takes a long time before reporting failure. This is because the + string can be divided between the internal \D+ repeat and the external + * repeat in a large number of ways, and all have to be tried. (The + example uses [!?] rather than a single character at the end, because + both PCRE2 and Perl have an optimization that allows for fast failure + when a single character is used. They remember the last single charac- + ter that is required for a match, and fail early if it is not present + in the string.) If the pattern is changed so that it uses an atomic + group, like this: + + ((?>\D+)|<\d+>)*[!?] + + sequences of non-digits cannot be broken, and failure happens quickly. + + +BACK REFERENCES + + Outside a character class, a backslash followed by a digit greater than + 0 (and possibly further digits) is a back reference to a capturing sub- + pattern earlier (that is, to its left) in the pattern, provided there + have been that many previous capturing left parentheses. + + However, if the decimal number following the backslash is less than 8, + it is always taken as a back reference, and causes an error only if + there are not that many capturing left parentheses in the entire pat- + tern. In other words, the parentheses that are referenced need not be + to the left of the reference for numbers less than 8. 
A "forward back + reference" of this type can make sense when a repetition is involved + and the subpattern to the right has participated in an earlier itera- + tion. + + It is not possible to have a numerical "forward back reference" to a + subpattern whose number is 8 or more using this syntax because a + sequence such as \50 is interpreted as a character defined in octal. + See the subsection entitled "Non-printing characters" above for further + details of the handling of digits following a backslash. There is no + such problem when named parentheses are used. A back reference to any + subpattern is possible using named parentheses (see below). + + Another way of avoiding the ambiguity inherent in the use of digits + following a backslash is to use the \g escape sequence. This escape + must be followed by a signed or unsigned number, optionally enclosed in + braces. These examples are all identical: + + (ring), \1 + (ring), \g1 + (ring), \g{1} + + An unsigned number specifies an absolute reference without the ambigu- + ity that is present in the older syntax. It is also useful when literal + digits follow the reference. A signed number is a relative reference. + Consider this example: + + (abc(def)ghi)\g{-1} + + The sequence \g{-1} is a reference to the most recently started captur- + ing subpattern before \g, that is, it is equivalent to \2 in this exam- + ple. Similarly, \g{-2} would be equivalent to \1. The use of relative + references can be helpful in long patterns, and also in patterns that + are created by joining together fragments that contain references + within themselves. + + The sequence \g{+1} is a reference to the next capturing subpattern. + This kind of forward reference can be useful in patterns that repeat. + Perl does not support the use of + in this way. 
+ + A back reference matches whatever actually matched the capturing sub- + pattern in the current subject string, rather than anything matching + the subpattern itself (see "Subpatterns as subroutines" below for a way + of doing that). So the pattern + + (sens|respons)e and \1ibility + + matches "sense and sensibility" and "response and responsibility", but + not "sense and responsibility". If caseful matching is in force at the + time of the back reference, the case of letters is relevant. For exam- + ple, + + ((?i)rah)\s+\1 + + matches "rah rah" and "RAH RAH", but not "RAH rah", even though the + original capturing subpattern is matched caselessly. + + There are several different ways of writing back references to named + subpatterns. The .NET syntax \k{name} and the Perl syntax \k<name> or + \k'name' are supported, as is the Python syntax (?P=name). Perl 5.10's + unified back reference syntax, in which \g can be used for both numeric + and named references, is also supported. We could rewrite the above + example in any of the following ways: + + (?<p1>(?i)rah)\s+\k<p1> + (?'p1'(?i)rah)\s+\k{p1} + (?P<p1>(?i)rah)\s+(?P=p1) + (?<p1>(?i)rah)\s+\g{p1} + + A subpattern that is referenced by name may appear in the pattern + before or after the reference. + + There may be more than one back reference to the same subpattern. If a + subpattern has not actually been used in a particular match, any back + references to it always fail by default. For example, the pattern + + (a|(bc))\2 + + always fails if it starts to match "a" rather than "bc". However, if + the PCRE2_MATCH_UNSET_BACKREF option is set at compile time, a back + reference to an unset value matches an empty string. + + Because there may be many capturing parentheses in a pattern, all dig- + its following a backslash are taken as part of a potential back refer- + ence number. If the pattern continues with a digit character, some + delimiter must be used to terminate the back reference. 
If the + PCRE2_EXTENDED option is set, this can be white space. Otherwise, the + \g{ syntax or an empty comment (see "Comments" below) can be used. + + Recursive back references + + A back reference that occurs inside the parentheses to which it refers + fails when the subpattern is first used, so, for example, (a\1) never + matches. However, such references can be useful inside repeated sub- + patterns. For example, the pattern + + (a|b\1)+ + + matches any number of "a"s and also "aba", "ababbaa" etc. At each iter- + ation of the subpattern, the back reference matches the character + string corresponding to the previous iteration. In order for this to + work, the pattern must be such that the first iteration does not need + to match the back reference. This can be done using alternation, as in + the example above, or by a quantifier with a minimum of zero. + + Back references of this type cause the group that they reference to be + treated as an atomic group. Once the whole group has been matched, a + subsequent matching failure cannot cause backtracking into the middle + of the group. + + +ASSERTIONS + + An assertion is a test on the characters following or preceding the + current matching point that does not consume any characters. The simple + assertions coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described + above. + + More complicated assertions are coded as subpatterns. There are two + kinds: those that look ahead of the current position in the subject + string, and those that look behind it. An assertion subpattern is + matched in the normal way, except that it does not cause the current + matching position to be changed. + + Assertion subpatterns are not capturing subpatterns. If such an asser- + tion contains capturing subpatterns within it, these are counted for + the purposes of numbering the capturing subpatterns in the whole pat- + tern. However, substring capturing is carried out only for positive + assertions. 
(Perl sometimes, but not always, does do capturing in nega- + tive assertions.) + + WARNING: If a positive assertion containing one or more capturing sub- + patterns succeeds, but failure to match later in the pattern causes + backtracking over this assertion, the captures within the assertion are + reset only if no higher numbered captures are already set. This is, + unfortunately, a fundamental limitation of the current implementation; + it may get removed in a future reworking. + + For compatibility with Perl, most assertion subpatterns may be + repeated; though it makes no sense to assert the same thing several + times, the side effect of capturing parentheses may occasionally be + useful. However, an assertion that forms the condition for a condi- + tional subpattern may not be quantified. In practice, for other asser- + tions, there are only three cases: + + (1) If the quantifier is {0}, the assertion is never obeyed during + matching. However, it may contain internal capturing parenthesized + groups that are called from elsewhere via the subroutine mechanism. + + (2) If the quantifier is {0,n} where n is greater than zero, it is treated + as if it were {0,1}. At run time, the rest of the pattern match is + tried with and without the assertion, the order depending on the greed- + iness of the quantifier. + + (3) If the minimum repetition is greater than zero, the quantifier is + ignored. The assertion is obeyed just once when encountered during + matching. + + Lookahead assertions + + Lookahead assertions start with (?= for positive assertions and (?! for + negative assertions. For example, + + \w+(?=;) + + matches a word followed by a semicolon, but does not include the semi- + colon in the match, and + + foo(?!bar) + + matches any occurrence of "foo" that is not followed by "bar". 
Note + that the apparently similar pattern + + (?!foo)bar + + does not find an occurrence of "bar" that is preceded by something + other than "foo"; it finds any occurrence of "bar" whatsoever, because + the assertion (?!foo) is always true when the next three characters are + "bar". A lookbehind assertion is needed to achieve the other effect. + + If you want to force a matching failure at some point in a pattern, the + most convenient way to do it is with (?!) because an empty string + always matches, so an assertion that requires there not to be an empty + string must always fail. The backtracking control verb (*FAIL) or (*F) + is a synonym for (?!). + + Lookbehind assertions + + Lookbehind assertions start with (?<= for positive assertions and (?<! + for negative assertions. + + Checking for a used subpattern by name + + Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a + used subpattern by name. For compatibility with earlier versions of + PCRE1, which had this facility before Perl, the syntax (?(name)...) is + also recognized. Note, however, that undelimited names consisting of + the letter R followed by digits are ambiguous (see the following sec- + tion). + + Rewriting the above example to use a named subpattern gives this: + + (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) ) + + If the name used in a condition of this kind is a duplicate, the test + is applied to all subpatterns of the same name, and is true if any one + of them has matched. + + Checking for pattern recursion + + "Recursion" in this sense refers to any subroutine-like call from one + part of the pattern to another, whether or not it is actually recur- + sive. See the sections entitled "Recursive patterns" and "Subpatterns + as subroutines" below for details of recursion and subpattern calls. + + If a condition is the string (R), and there is no subpattern with the + name R, the condition is true if matching is currently in a recursion + or subroutine call to the whole pattern or any subpattern. 
If digits + follow the letter R, and there is no subpattern with that name, the + condition is true if the most recent call is into a subpattern with the + given number, which must exist somewhere in the overall pattern. This + is a contrived example that is equivalent to a+b: + + ((?(R1)a+|(?1)b)) + + However, in both cases, if there is a subpattern with a matching name, + the condition tests for its being set, as described in the section + above, instead of testing for recursion. For example, creating a group + with the name R1 by adding (?<R1>) to the above pattern completely + changes its meaning. + + If a name preceded by ampersand follows the letter R, for example: + + (?(R&name)...) + + the condition is true if the most recent recursion is into a subpattern + of that name (which must exist within the pattern). + + This condition does not check the entire recursion stack. It tests only + the current level. If the name used in a condition of this kind is a + duplicate, the test is applied to all subpatterns of the same name, and + is true if any one of them is the most recent recursion. + + At "top level", all these recursion test conditions are false. + + Defining subpatterns for use by reference only + + If the condition is the string (DEFINE), the condition is always false, + even if there is a group with the name DEFINE. In this case, there may + be only one alternative in the subpattern. It is always skipped if con- + trol reaches this point in the pattern; the idea of DEFINE is that it + can be used to define subroutines that can be referenced from else- + where. (The use of subroutines is described below.) For example, a pat- + tern to match an IPv4 address such as "192.168.23.245" could be written + like this (ignore white space and line breaks): + + (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) ) + \b (?&byte) (\.(?&byte)){3} \b + + The first part of the pattern is a DEFINE group inside which another + group named "byte" is defined. 
This matches an individual component of + an IPv4 address (a number less than 256). When matching takes place, + this part of the pattern is skipped because DEFINE acts like a false + condition. The rest of the pattern uses references to the named group + to match the four dot-separated components of an IPv4 address, insist- + ing on a word boundary at each end. + + Checking the PCRE2 version + + Programs that link with a PCRE2 library can check the version by call- + ing pcre2_config() with appropriate arguments. Users of applications + that do not have access to the underlying code cannot do this. A spe- + cial "condition" called VERSION exists to allow such users to discover + which version of PCRE2 they are dealing with by using this condition to + match a string such as "yesno". VERSION must be followed either by "=" + or ">=" and a version number. For example: + + (?(VERSION>=10.4)yes|no) + + This pattern matches "yes" if the PCRE2 version is greater or equal to + 10.4, or "no" otherwise. The fractional part of the version number may + not contain more than two digits. + + Assertion conditions + + If the condition is not in any of the above formats, it must be an + assertion. This may be a positive or negative lookahead or lookbehind + assertion. Consider this pattern, again containing non-significant + white space, and with the two alternatives on the second line: + + (?(?=[^a-z]*[a-z]) + \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) + + The condition is a positive lookahead assertion that matches an + optional sequence of non-letters followed by a letter. In other words, + it tests for the presence of at least one letter in the subject. If a + letter is found, the subject is matched against the first alternative; + otherwise it is matched against the second. This pattern matches + strings in one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are + letters and dd are digits. 
+ + +COMMENTS + + There are two ways of including comments in patterns that are processed + by PCRE2. In both cases, the start of the comment must not be in a + character class, nor in the middle of any other sequence of related + characters such as (?: or a subpattern name or number. The characters + that make up a comment play no part in the pattern matching. + + The sequence (?# marks the start of a comment that continues up to the + next closing parenthesis. Nested parentheses are not permitted. If the + PCRE2_EXTENDED option is set, an unescaped # character also introduces + a comment, which in this case continues to immediately after the next + newline character or character sequence in the pattern. Which charac- + ters are interpreted as newlines is controlled by an option passed to + the compiling function or by a special sequence at the start of the + pattern, as described in the section entitled "Newline conventions" + above. Note that the end of this type of comment is a literal newline + sequence in the pattern; escape sequences that happen to represent a + newline do not count. For example, consider this pattern when + PCRE2_EXTENDED is set, and the default newline convention (a single + linefeed character) is in force: + + abc #comment \n still comment + + On encountering the # character, pcre2_compile() skips along, looking + for a newline in the pattern. The sequence \n is still literal at this + stage, so it does not terminate the comment. Only an actual character + with the code value 0x0a (the default newline) does so. + + +RECURSIVE PATTERNS + + Consider the problem of matching a string in parentheses, allowing for + unlimited nested parentheses. Without the use of recursion, the best + that can be done is to use a pattern that matches up to some fixed + depth of nesting. It is not possible to handle an arbitrary nesting + depth. 
+ + For some time, Perl has provided a facility that allows regular expres- + sions to recurse (amongst other things). It does this by interpolating + Perl code in the expression at run time, and the code can refer to the + expression itself. A Perl pattern using code interpolation to solve the + parentheses problem can be created like this: + + $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x; + + The (?p{...}) item interpolates Perl code at run time, and in this case + refers recursively to the pattern in which it appears. + + Obviously, PCRE2 cannot support the interpolation of Perl code. + Instead, it supports special syntax for recursion of the entire pat- + tern, and also for individual subpattern recursion. After its introduc- + tion in PCRE1 and Python, this kind of recursion was subsequently + introduced into Perl at release 5.10. + + A special item that consists of (? followed by a number greater than + zero and a closing parenthesis is a recursive subroutine call of the + subpattern of the given number, provided that it occurs inside that + subpattern. (If not, it is a non-recursive subroutine call, which is + described in the next section.) The special item (?R) or (?0) is a + recursive call of the entire regular expression. + + This PCRE2 pattern solves the nested parentheses problem (assume the + PCRE2_EXTENDED option is set so that white space is ignored): + + \( ( [^()]++ | (?R) )* \) + + First it matches an opening parenthesis. Then it matches any number of + substrings which can either be a sequence of non-parentheses, or a + recursive match of the pattern itself (that is, a correctly parenthe- + sized substring). Finally there is a closing parenthesis. Note the use + of a possessive quantifier to avoid backtracking into sequences of non- + parentheses. 
+ + If this were part of a larger pattern, you would not want to recurse + the entire pattern, so instead you could use this: + + ( \( ( [^()]++ | (?1) )* \) ) + + We have put the pattern into parentheses, and caused the recursion to + refer to them instead of the whole pattern. + + In a larger pattern, keeping track of parenthesis numbers can be + tricky. This is made easier by the use of relative references. Instead + of (?1) in the pattern above you can write (?-2) to refer to the second + most recently opened parentheses preceding the recursion. In other + words, a negative number counts capturing parentheses leftwards from + the point at which it is encountered. + + Be aware, however, that if duplicate subpattern numbers are in use, rel- + ative references refer to the earliest subpattern with the appropriate + number. Consider, for example: + + (?|(a)|(b)) (c) (?-2) + + The first two capturing groups (a) and (b) are both numbered 1, and + group (c) is number 2. When the reference (?-2) is encountered, the + second most recently opened parentheses has the number 1, but it is the + first such group (the (a) group) to which the recursion refers. This + would be the same if an absolute reference (?1) was used. In other + words, relative references are just a shorthand for computing a group + number. + + It is also possible to refer to subsequently opened parentheses, by + writing references such as (?+2). However, these cannot be recursive + because the reference is not inside the parentheses that are refer- + enced. They are always non-recursive subroutine calls, as described in + the next section. + + An alternative approach is to use named parentheses. The Perl syntax + for this is (?&name); PCRE1's earlier syntax (?P>name) is also sup- + ported. We could rewrite the above example as follows: + + (?<pn> \( ( [^()]++ | (?&pn) )* \) ) + + If there is more than one subpattern with the same name, the earliest + one is used. 
+ + The example pattern that we have been looking at contains nested unlim- + ited repeats, and so the use of a possessive quantifier for matching + strings of non-parentheses is important when applying the pattern to + strings that do not match. For example, when this pattern is applied to + + (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() + + it yields "no match" quickly. However, if a possessive quantifier is + not used, the match runs for a very long time indeed because there are + so many different ways the + and * repeats can carve up the subject, + and all have to be tested before failure can be reported. + + At the end of a match, the values of capturing parentheses are those + from the outermost level. If you want to obtain intermediate values, a + callout function can be used (see below and the pcre2callout documenta- + tion). If the pattern above is matched against + + (ab(cd)ef) + + the value for the inner capturing parentheses (numbered 2) is "ef", + which is the last value taken on at the top level. If a capturing sub- + pattern is not matched at the top level, its final captured value is + unset, even if it was (temporarily) set at a deeper level during the + matching process. + + If there are more than 15 capturing parentheses in a pattern, PCRE2 has + to obtain extra memory from the heap to store data during a recursion. + If no memory can be obtained, the match fails with the + PCRE2_ERROR_NOMEMORY error. + + Do not confuse the (?R) item with the condition (R), which tests for + recursion. Consider this pattern, which matches text in angle brack- + ets, allowing for arbitrary nesting. Only digits are allowed in nested + brackets (that is, when recursing), whereas any characters are permit- + ted at the outer level. + + < (?: (?(R) \d++ | [^<>]*+) | (?R)) * > + + In this pattern, (?(R) is the start of a conditional subpattern, with + two different alternatives for the recursive and non-recursive cases. 
+ The (?R) item is the actual recursive call. + + Differences in recursion processing between PCRE2 and Perl + + Recursion processing in PCRE2 differs from Perl in two important ways. + In PCRE2 (like Python, but unlike Perl), a recursive subpattern call is + always treated as an atomic group. That is, once it has matched some of + the subject string, it is never re-entered, even if it contains untried + alternatives and there is a subsequent matching failure. This can be + illustrated by the following pattern, which purports to match a palin- + dromic string that contains an odd number of characters (for example, + "a", "aba", "abcba", "abcdcba"): + + ^(.|(.)(?1)\2)$ + + The idea is that it either matches a single character, or two identical + characters surrounding a sub-palindrome. In Perl, this pattern works; + in PCRE2 it does not if the pattern is longer than three characters. + Consider the subject string "abcba": + + At the top level, the first character is matched, but as it is not at + the end of the string, the first alternative fails; the second alterna- + tive is taken and the recursion kicks in. The recursive call to subpat- + tern 1 successfully matches the next character ("b"). (Note that the + beginning and end of line tests are not part of the recursion). + + Back at the top level, the next character ("c") is compared with what + subpattern 2 matched, which was "a". This fails. Because the recursion + is treated as an atomic group, there are now no backtracking points, + and so the entire match fails. (Perl is able, at this point, to re- + enter the recursion and try the second alternative.) However, if the + pattern is written with the alternatives in the other order, things are + different: + + ^((.)(?1)\2|.)$ + + This time, the recursing alternative is tried first, and continues to + recurse until it runs out of characters, at which point the recursion + fails. But this time we do have another alternative to try at the + higher level. 
That is the big difference: in the previous case the + remaining alternative is at a deeper recursion level, which PCRE2 can- + not use. + + To change the pattern so that it matches all palindromic strings, not + just those with an odd number of characters, it is tempting to change + the pattern to this: + + ^((.)(?1)\2|.?)$ + + Again, this works in Perl, but not in PCRE2, and for the same reason. + When a deeper recursion has matched a single character, it cannot be + entered again in order to match an empty string. The solution is to + separate the two cases, and write out the odd and even cases as alter- + natives at the higher level: + + ^(?:((.)(?1)\2|)|((.)(?3)\4|.)) + + If you want to match typical palindromic phrases, the pattern has to + ignore all non-word characters, which can be done like this: + + ^\W*+(?:((.)\W*+(?1)\W*+\2|)|((.)\W*+(?3)\W*+\4|\W*+.\W*+))\W*+$ + + If run with the PCRE2_CASELESS option, this pattern matches phrases + such as "A man, a plan, a canal: Panama!" and it works in both PCRE2 + and Perl. Note the use of the possessive quantifier *+ to avoid back- + tracking into sequences of non-word characters. Without this, PCRE2 + takes a great deal longer (ten times or more) to match typical phrases, + and Perl takes so long that you think it has gone into a loop. + + WARNING: The palindrome-matching patterns above work only if the sub- + ject string does not start with a palindrome that is shorter than the + entire string. For example, although "abcba" is correctly matched, if + the subject is "ababa", PCRE2 finds the palindrome "aba" at the start, + then fails at top level because the end of the string does not follow. + Once again, it cannot jump back into the recursion to try other alter- + natives, so the entire match fails. + + The second way in which PCRE2 and Perl differ in their recursion pro- + cessing is in the handling of captured values. 
In Perl, when a subpat- + tern is called recursively or as a subpattern (see the next section), + it has no access to any values that were captured outside the recur- + sion, whereas in PCRE2 these values can be referenced. Consider this + pattern: + + ^(.)(\1|a(?2)) + + In PCRE2, this pattern matches "bab". The first capturing parentheses + match "b", then in the second group, when the back reference \1 fails + to match "b", the second alternative matches "a" and then recurses. In + the recursion, \1 does now match "b" and so the whole match succeeds. + In Perl, the pattern fails to match because inside the recursive call + \1 cannot access the externally set value. + + +SUBPATTERNS AS SUBROUTINES + + If the syntax for a recursive subpattern call (either by number or by + name) is used outside the parentheses to which it refers, it operates + like a subroutine in a programming language. The called subpattern may + be defined before or after the reference. A numbered reference can be + absolute or relative, as in these examples: + + (...(absolute)...)...(?2)... + (...(relative)...)...(?-1)... + (...(?+1)...(relative)... + + An earlier example pointed out that the pattern + + (sens|respons)e and \1ibility + + matches "sense and sensibility" and "response and responsibility", but + not "sense and responsibility". If instead the pattern + + (sens|respons)e and (?1)ibility + + is used, it does match "sense and responsibility" as well as the other + two strings. Another example is given in the discussion of DEFINE + above. + + All subroutine calls, whether recursive or not, are always treated as + atomic groups. That is, once a subroutine has matched some of the sub- + ject string, it is never re-entered, even if it contains untried alter- + natives and there is a subsequent matching failure. Any capturing + parentheses that are set during the subroutine call revert to their + previous values afterwards. 
+ + Processing options such as case-independence are fixed when a subpat- + tern is defined, so if it is used as a subroutine, such options cannot + be changed for different calls. For example, consider this pattern: + + (abc)(?i:(?-1)) + + It matches "abcabc". It does not match "abcABC" because the change of + processing option does not affect the called subpattern. + + +ONIGURUMA SUBROUTINE SYNTAX + + For compatibility with Oniguruma, the non-Perl syntax \g followed by a + name or a number enclosed either in angle brackets or single quotes, is + an alternative syntax for referencing a subpattern as a subroutine, + possibly recursively. Here are two of the examples used above, rewrit- + ten using this syntax: + + (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) ) + (sens|respons)e and \g'1'ibility + + PCRE2 supports an extension to Oniguruma: if a number is preceded by a + plus or a minus sign it is taken as a relative reference. For example: + + (abc)(?i:\g<-1>) + + Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not + synonymous. The former is a back reference; the latter is a subroutine + call. + + +CALLOUTS + + Perl has a feature whereby using the sequence (?{...}) causes arbitrary + Perl code to be obeyed in the middle of matching a regular expression. + This makes it possible, amongst other things, to extract different sub- + strings that match the same pair of parentheses when there is a repeti- + tion. + + PCRE2 provides a similar feature, but of course it cannot obey arbi- + trary Perl code. The feature is called "callout". The caller of PCRE2 + provides an external function by putting its entry point in a match + context using the function pcre2_set_callout(), and then passing that + context to pcre2_match() or pcre2_dfa_match(). If no match context is + passed, or if the callout entry point is set to NULL, callouts are dis- + abled. + + Within a regular expression, (?C<arg>) indicates a point at which the + external function is to be called. 
There are two kinds of callout: + those with a numerical argument and those with a string argument. (?C) + on its own with no argument is treated as (?C0). A numerical argument + allows the application to distinguish between different callouts. + String arguments were added for release 10.20 to make it possible for + script languages that use PCRE2 to embed short scripts within patterns + in a similar way to Perl. + + During matching, when PCRE2 reaches a callout point, the external func- + tion is called. It is provided with the number or string argument of + the callout, the position in the pattern, and one item of data that is + also set in the match block. The callout function may cause matching to + proceed, to backtrack, or to fail. + + By default, PCRE2 implements a number of optimizations at matching + time, and one side-effect is that sometimes callouts are skipped. If + you need all possible callouts to happen, you need to set options that + disable the relevant optimizations. More details, including a complete + description of the programming interface to the callout function, are + given in the pcre2callout documentation. + + Callouts with numerical arguments + + If you just want to have a means of identifying different callout + points, put a number less than 256 after the letter C. For example, + this pattern has two callout points: + + (?C1)abc(?C2)def + + If the PCRE2_AUTO_CALLOUT flag is passed to pcre2_compile(), numerical + callouts are automatically installed before each item in the pattern. + They are all numbered 255. If there is a conditional group in the pat- + tern whose condition is an assertion, an additional callout is inserted + just before the condition. An explicit callout may also be set at this + position, as in this example: + + (?(?C9)(?=a)abc|def) + + Note that this applies only to assertion conditions, not to other types + of condition. 
+ + Callouts with string arguments + + A delimited string may be used instead of a number as a callout argu- + ment. The starting delimiter must be one of ` ' " ^ % # $ { and the + ending delimiter is the same as the start, except for {, where the end- + ing delimiter is }. If the ending delimiter is needed within the + string, it must be doubled. For example: + + (?C'ab ''c'' d')xyz(?C{any text})pqr + + The doubling is removed before the string is passed to the callout + function. + + +BACKTRACKING CONTROL + + Perl 5.10 introduced a number of "Special Backtracking Control Verbs", + which are still described in the Perl documentation as "experimental + and subject to change or removal in a future version of Perl". It goes + on to say: "Their usage in production code should be noted to avoid + problems during upgrades." The same remarks apply to the PCRE2 features + described in this section. + + The new verbs make use of what was previously invalid syntax: an open- + ing parenthesis followed by an asterisk. They are generally of the form + (*VERB) or (*VERB:NAME). Some verbs take either form, possibly behaving + differently depending on whether or not a name is present. + + By default, for compatibility with Perl, a name is any sequence of + characters that does not include a closing parenthesis. The name is not + processed in any way, and it is not possible to include a closing + parenthesis in the name. This can be changed by setting the + PCRE2_ALT_VERBNAMES option, but the result is no longer Perl-compati- + ble. + + When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to + verb names and only an unescaped closing parenthesis terminates the + name. However, the only backslash items that are permitted are \Q, \E, + and sequences such as \x{100} that define character code points. Char- + acter type escapes such as \d are faulted. + + A closing parenthesis can be included in a name either as \) or between + \Q and \E. 
In addition to backslash processing, if the PCRE2_EXTENDED + option is also set, unescaped whitespace in verb names is skipped, and + #-comments are recognized, exactly as in the rest of the pattern. + PCRE2_EXTENDED does not affect verb names unless PCRE2_ALT_VERBNAMES is + also set. + + The maximum length of a name is 255 in the 8-bit library and 65535 in + the 16-bit and 32-bit libraries. If the name is empty, that is, if the + closing parenthesis immediately follows the colon, the effect is as if + the colon were not there. Any number of these verbs may occur in a pat- + tern. + + Since these verbs are specifically related to backtracking, most of + them can be used only when the pattern is to be matched using the tra- + ditional matching function, because these use a backtracking algorithm. + With the exception of (*FAIL), which behaves like a failing negative + assertion, the backtracking control verbs cause an error if encountered + by the DFA matching function. + + The behaviour of these verbs in repeated groups, assertions, and in + subpatterns called as subroutines (whether or not recursively) is docu- + mented below. + + Optimizations that affect backtracking verbs + + PCRE2 contains some optimizations that are used to speed up matching by + running some checks at the start of each match attempt. For example, it + may know the minimum length of matching subject, or that a particular + character must be present. When one of these optimizations bypasses the + running of a match, any included backtracking verbs will not, of + course, be processed. You can suppress the start-of-match optimizations + by setting the PCRE2_NO_START_OPTIMIZE option when calling pcre2_com- + pile(), or by starting the pattern with (*NO_START_OPT). There is more + discussion of this option in the section entitled "Compiling a pattern" + in the pcre2api documentation. + + Experiments with Perl suggest that it too has similar optimizations, + sometimes leading to anomalous results. 
+ + Verbs that act immediately + + The following verbs act as soon as they are encountered. They may not + be followed by a name. + + (*ACCEPT) + + This verb causes the match to end successfully, skipping the remainder + of the pattern. However, when it is inside a subpattern that is called + as a subroutine, only that subpattern is ended successfully. Matching + then continues at the outer level. If (*ACCEPT) is triggered in a posi- + tive assertion, the assertion succeeds; in a negative assertion, the + assertion fails. + + If (*ACCEPT) is inside capturing parentheses, the data so far is cap- + tured. For example: + + A((?:A|B(*ACCEPT)|C)D) + + This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is cap- + tured by the outer parentheses. + + (*FAIL) or (*F) + + This verb causes a matching failure, forcing backtracking to occur. It + is equivalent to (?!) but easier to read. The Perl documentation notes + that it is probably useful only when combined with (?{}) or (??{}). + Those are, of course, Perl features that are not present in PCRE2. The + nearest equivalent is the callout feature, as for example in this pat- + tern: + + a+(?C)(*FAIL) + + A match with the string "aaaa" always fails, but the callout is taken + before each backtrack happens (in this example, 10 times). + + Recording which path was taken + + There is one verb whose main purpose is to track how a match was + arrived at, though it also has a secondary use in conjunction with + advancing the match starting point (see (*SKIP) below). + + (*MARK:NAME) or (*:NAME) + + A name is always required with this verb. There may be as many + instances of (*MARK) as you like in a pattern, and their names do not + have to be unique. + + When a match succeeds, the name of the last-encountered (*MARK:NAME), + (*PRUNE:NAME), or (*THEN:NAME) on the matching path is passed back to + the caller as described in the section entitled "Other information + about the match" in the pcre2api documentation. 
Here is an example of + pcre2test output, where the "mark" modifier requests the retrieval and + outputting of (*MARK) data: + + re> /X(*MARK:A)Y|X(*MARK:B)Z/mark + data> XY + 0: XY + MK: A + XZ + 0: XZ + MK: B + + The (*MARK) name is tagged with "MK:" in this output, and in this exam- + ple it indicates which of the two alternatives matched. This is a more + efficient way of obtaining this information than putting each alterna- + tive in its own capturing parentheses. + + If a verb with a name is encountered in a positive assertion that is + true, the name is recorded and passed back if it is the last-encoun- + tered. This does not happen for negative assertions or failing positive + assertions. + + After a partial match or a failed match, the last encountered name in + the entire match process is returned. For example: + + re> /X(*MARK:A)Y|X(*MARK:B)Z/mark + data> XP + No match, mark = B + + Note that in this unanchored example the mark is retained from the + match attempt that started at the letter "X" in the subject. Subsequent + match attempts starting at "P" and then with an empty string do not get + as far as the (*MARK) item, but nevertheless do not reset it. + + If you are interested in (*MARK) values after failed matches, you + should probably set the PCRE2_NO_START_OPTIMIZE option (see above) to + ensure that the match is always attempted. + + Verbs that act after backtracking + + The following verbs do nothing when they are encountered. Matching con- + tinues with what follows, but if there is no subsequent match, causing + a backtrack to the verb, a failure is forced. That is, backtracking + cannot pass to the left of the verb. However, when one of these verbs + appears inside an atomic group (which includes any group that is called + as a subroutine) or in an assertion that is true, its effect is con- + fined to that group, because once the group has been matched, there is + never any backtracking into it. 
In this situation, backtracking has to + jump to the left of the entire atomic group or assertion. + + These verbs differ in exactly what kind of failure occurs when back- + tracking reaches them. The behaviour described below is what happens + when the verb is not in a subroutine or an assertion. Subsequent sec- + tions cover these special cases. + + (*COMMIT) + + This verb, which may not be followed by a name, causes the whole match + to fail outright if there is a later matching failure that causes back- + tracking to reach it. Even if the pattern is unanchored, no further + attempts to find a match by advancing the starting point take place. If + (*COMMIT) is the only backtracking verb that is encountered, once it + has been passed pcre2_match() is committed to finding a match at the + current starting point, or not at all. For example: + + a+(*COMMIT)b + + This matches "xxaab" but not "aacaab". It can be thought of as a kind + of dynamic anchor, or "I've started, so I must finish." The name of the + most recently passed (*MARK) in the path is passed back when (*COMMIT) + forces a match failure. + + If there is more than one backtracking verb in a pattern, a different + one that follows (*COMMIT) may be triggered first, so merely passing + (*COMMIT) during a match does not always guarantee that a match must be + at this starting point. + + Note that (*COMMIT) at the start of a pattern is not the same as an + anchor, unless PCRE2's start-of-match optimizations are turned off, as + shown in this output from pcre2test: + + re> /(*COMMIT)abc/ + data> xyzabc + 0: abc + data> + re> /(*COMMIT)abc/no_start_optimize + data> xyzabc + No match + + For the first pattern, PCRE2 knows that any match must start with "a", + so the optimization skips along the subject to "a" before applying the + pattern to the first set of data. The match attempt then succeeds. The + second pattern disables the optimization that skips along to the first + character. 
The pattern is now applied starting at "x", and so the + (*COMMIT) causes the match to fail without trying any other starting + points. + + (*PRUNE) or (*PRUNE:NAME) + + This verb causes the match to fail at the current starting position in + the subject if there is a later matching failure that causes backtrack- + ing to reach it. If the pattern is unanchored, the normal "bumpalong" + advance to the next starting character then happens. Backtracking can + occur as usual to the left of (*PRUNE), before it is reached, or when + matching to the right of (*PRUNE), but if there is no match to the + right, backtracking cannot cross (*PRUNE). In simple cases, the use of + (*PRUNE) is just an alternative to an atomic group or possessive quan- + tifier, but there are some uses of (*PRUNE) that cannot be expressed in + any other way. In an anchored pattern (*PRUNE) has the same effect as + (*COMMIT). + + The behaviour of (*PRUNE:NAME) is not the same as + (*MARK:NAME)(*PRUNE). It is like (*MARK:NAME) in that the name is + remembered for passing back to the caller. However, (*SKIP:NAME) + searches only for names set with (*MARK), ignoring those set by + (*PRUNE) or (*THEN). + + (*SKIP) + + This verb, when given without a name, is like (*PRUNE), except that if + the pattern is unanchored, the "bumpalong" advance is not to the next + character, but to the position in the subject where (*SKIP) was encoun- + tered. (*SKIP) signifies that whatever text was matched leading up to + it cannot be part of a successful match. Consider: + + a+(*SKIP)b + + If the subject is "aaaac...", after the first match attempt fails + (starting at the first character in the string), the starting point + skips on to start the next attempt at "c". Note that a possessive quan- + tifier does not have the same effect as this example; although it would + suppress backtracking during the first match attempt, the second + attempt would start at the second character instead of skipping on to + "c". 
+ + (*SKIP:NAME) + + When (*SKIP) has an associated name, its behaviour is modified. When it + is triggered, the previous path through the pattern is searched for the + most recent (*MARK) that has the same name. If one is found, the + "bumpalong" advance is to the subject position that corresponds to that + (*MARK) instead of to where (*SKIP) was encountered. If no (*MARK) with + a matching name is found, the (*SKIP) is ignored. + + Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It + ignores names that are set by (*PRUNE:NAME) or (*THEN:NAME). + + (*THEN) or (*THEN:NAME) + + This verb causes a skip to the next innermost alternative when back- + tracking reaches it. That is, it cancels any further backtracking + within the current alternative. Its name comes from the observation + that it can be used for a pattern-based if-then-else block: + + ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... + + If the COND1 pattern matches, FOO is tried (and possibly further items + after the end of the group if FOO succeeds); on failure, the matcher + skips to the second alternative and tries COND2, without backtracking + into COND1. If that succeeds and BAR fails, COND3 is tried. If subse- + quently BAZ fails, there are no more alternatives, so there is a back- + track to whatever came before the entire group. If (*THEN) is not + inside an alternation, it acts like (*PRUNE). + + The behaviour of (*THEN:NAME) is not the same as + (*MARK:NAME)(*THEN). It is like (*MARK:NAME) in that the name is + remembered for passing back to the caller. However, (*SKIP:NAME) + searches only for names set with (*MARK), ignoring those set by + (*PRUNE) and (*THEN). + + A subpattern that does not contain a | character is just a part of the + enclosing alternative; it is not a nested alternation with only one + alternative. The effect of (*THEN) extends beyond such a subpattern to + the enclosing alternative. Consider this pattern, where A, B, etc. 
are + complex pattern fragments that do not contain any | characters at this + level: + + A (B(*THEN)C) | D + + If A and B are matched, but there is a failure in C, matching does not + backtrack into A; instead it moves to the next alternative, that is, D. + However, if the subpattern containing (*THEN) is given an alternative, + it behaves differently: + + A (B(*THEN)C | (*FAIL)) | D + + The effect of (*THEN) is now confined to the inner subpattern. After a + failure in C, matching moves to (*FAIL), which causes the whole subpat- + tern to fail because there are no more alternatives to try. In this + case, matching does now backtrack into A. + + Note that a conditional subpattern is not considered as having two + alternatives, because only one is ever used. In other words, the | + character in a conditional subpattern has a different meaning. Ignoring + white space, consider: + + ^.*? (?(?=a) a | b(*THEN)c ) + + If the subject is "ba", this pattern does not match. Because .*? is + ungreedy, it initially matches zero characters. The condition (?=a) + then fails, the character "b" is matched, but "c" is not. At this + point, matching does not backtrack to .*? as might perhaps be expected + from the presence of the | character. The conditional subpattern is + part of the single alternative that comprises the whole pattern, and so + the match fails. (If there was a backtrack into .*?, allowing it to + match "b", the match would succeed.) + + The verbs just described provide four different "strengths" of control + when subsequent matching fails. (*THEN) is the weakest, carrying on the + match at the next alternative. (*PRUNE) comes next, failing the match + at the current starting position, but allowing an advance to the next + character (for an unanchored pattern). (*SKIP) is similar, except that + the advance may be more than one character. (*COMMIT) is the strongest, + causing the entire match to fail. 
+ + More than one backtracking verb + + If more than one backtracking verb is present in a pattern, the one + that is backtracked onto first acts. For example, consider this pat- + tern, where A, B, etc. are complex pattern fragments: + + (A(*COMMIT)B(*THEN)C|ABD) + + If A matches but B fails, the backtrack to (*COMMIT) causes the entire + match to fail. However, if A and B match, but C fails, the backtrack to + (*THEN) causes the next alternative (ABD) to be tried. This behaviour + is consistent, but is not always the same as Perl's. It means that if + two or more backtracking verbs appear in succession, all but the last + of them has no effect. Consider this example: + + ...(*COMMIT)(*PRUNE)... + + If there is a matching failure to the right, backtracking onto (*PRUNE) + causes it to be triggered, and its action is taken. There can never be + a backtrack onto (*COMMIT). + + Backtracking verbs in repeated groups + + PCRE2 differs from Perl in its handling of backtracking verbs in + repeated groups. For example, consider: + + /(a(*COMMIT)b)+ac/ + + If the subject is "abac", Perl matches, but PCRE2 fails because the + (*COMMIT) in the second repeat of the group acts. + + Backtracking verbs in assertions + + (*FAIL) in an assertion has its normal effect: it forces an immediate + backtrack. + + (*ACCEPT) in a positive assertion causes the assertion to succeed with- + out any further processing. In a negative assertion, (*ACCEPT) causes + the assertion to fail without any further processing. + + The other backtracking verbs are not treated specially if they appear + in a positive assertion. In particular, (*THEN) skips to the next + alternative in the innermost enclosing group that has alternations, + whether or not this is within the assertion. + + Negative assertions are, however, different, in order to ensure that + changing a positive assertion into a negative assertion changes its + result. 
Backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes a neg- + ative assertion to be true, without considering any further alternative + branches in the assertion. Backtracking into (*THEN) causes it to skip + to the next enclosing alternative within the assertion (the normal be- + haviour), but if the assertion does not have such an alternative, + (*THEN) behaves like (*PRUNE). + + Backtracking verbs in subroutines + + These behaviours occur whether or not the subpattern is called recur- + sively. Perl's treatment of subroutines is different in some cases. + + (*FAIL) in a subpattern called as a subroutine has its normal effect: + it forces an immediate backtrack. + + (*ACCEPT) in a subpattern called as a subroutine causes the subroutine + match to succeed without any further processing. Matching then contin- + ues after the subroutine call. + + (*COMMIT), (*SKIP), and (*PRUNE) in a subpattern called as a subroutine + cause the subroutine match to fail. + + (*THEN) skips to the next alternative in the innermost enclosing group + within the subpattern that has alternatives. If there is no such group + within the subpattern, (*THEN) causes the subroutine match to fail. + + +SEE ALSO + + pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), + pcre2(3). + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 27 December 2016 + Copyright (c) 1997-2016 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +PCRE2 PERFORMANCE + + Two aspects of performance are discussed below: memory usage and pro- + cessing time. The way you express your pattern as a regular expression + can affect both of them. 
+ + +COMPILED PATTERN MEMORY USAGE + + Patterns are compiled by PCRE2 into a reasonably efficient interpretive + code, so that most simple patterns do not use much memory. However, + there is one case where the memory usage of a compiled pattern can be + unexpectedly large. If a parenthesized subpattern has a quantifier with + a minimum greater than 1 and/or a limited maximum, the whole subpattern + is repeated in the compiled code. For example, the pattern + + (abc|def){2,4} + + is compiled as if it were + + (abc|def)(abc|def)((abc|def)(abc|def)?)? + + (Technical aside: It is done this way so that backtrack points within + each of the repetitions can be independently maintained.) + + For regular expressions whose quantifiers use only small numbers, this + is not usually a problem. However, if the numbers are large, and par- + ticularly if such repetitions are nested, the memory usage can become + an embarrassment. For example, the very simple pattern + + ((ab){1,1000}c){1,3} + + uses 51K bytes when compiled using the 8-bit library. When PCRE2 is + compiled with its default internal pointer size of two bytes, the size + limit on a compiled pattern is 64K code units in the 8-bit and 16-bit + libraries, and this is reached with the above pattern if the outer rep- + etition is increased from 3 to 4. PCRE2 can be compiled to use larger + internal pointers and thus handle larger compiled patterns, but it is + better to try to rewrite your pattern to use less memory if you can. + + One way of reducing the memory usage for such patterns is to make use + of PCRE2's "subroutine" facility. Re-writing the above pattern as + + ((ab)(?2){0,999}c)(?1){0,2} + + reduces the memory requirements to 18K, and indeed it remains under 20K + even with the outer repetition increased to 100. However, this pattern + is not exactly equivalent, because the "subroutine" calls are treated + as atomic groups into which there can be no backtracking if there is a + subsequent matching failure. 
Therefore, PCRE2 cannot do this kind of + rewriting automatically. Furthermore, there is a noticeable loss of + speed when executing the modified pattern. Nevertheless, if the atomic + grouping is not a problem and the loss of speed is acceptable, this + kind of rewriting will allow you to process patterns that PCRE2 cannot + otherwise handle. + + +STACK USAGE AT RUN TIME + + When pcre2_match() is used for matching, certain kinds of pattern can + cause it to use large amounts of the process stack. In some environ- + ments the default process stack is quite small, and if it runs out the + result is often SIGSEGV. Rewriting your pattern can often help. The + pcre2stack documentation discusses this issue in detail. + + +PROCESSING TIME + + Certain items in regular expression patterns are processed more effi- + ciently than others. It is more efficient to use a character class like + [aeiou] than a set of single-character alternatives such as + (a|e|i|o|u). In general, the simplest construction that provides the + required behaviour is usually the most efficient. Jeffrey Friedl's book + contains a lot of useful general discussion about optimizing regular + expressions for efficient performance. This document contains a few + observations about PCRE2. + + Using Unicode character properties (the \p, \P, and \X escapes) is + slow, because PCRE2 has to use a multi-stage table lookup whenever it + needs a character's property. If you can find an alternative pattern + that does not use character properties, it will probably be faster. + + By default, the escape sequences \b, \d, \s, and \w, and the POSIX + character classes such as [:alpha:] do not use Unicode properties, + partly for backwards compatibility, and partly for performance reasons. + However, you can set the PCRE2_UCP option or start the pattern with + (*UCP) if you want Unicode character properties to be used. 
This can + double the matching time for items such as \d, when matched with + pcre2_match(); the performance loss is less with a DFA matching func- + tion, and in both cases there is not much difference for \b. + + When a pattern begins with .* not in atomic parentheses, nor in paren- + theses that are the subject of a backreference, and the PCRE2_DOTALL + option is set, the pattern is implicitly anchored by PCRE2, since it + can match only at the start of a subject string. If the pattern has + multiple top-level branches, they must all be anchorable. The optimiza- + tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is + automatically disabled if the pattern contains (*PRUNE) or (*SKIP). + + If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, + because the dot metacharacter does not then match a newline, and if the + subject string contains newlines, the pattern may match from the char- + acter immediately following one of them instead of from the very start. + For example, the pattern + + .*second + + matches the subject "first\nand second" (where \n stands for a newline + character), with the match starting at the seventh character. In order + to do this, PCRE2 has to retry the match starting after every newline + in the subject. + + If you are using such a pattern with subject strings that do not con- + tain newlines, the best performance is obtained by setting + PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate + explicit anchoring. That saves PCRE2 from having to scan along the sub- + ject looking for a newline to restart at. + + Beware of patterns that contain nested indefinite repeats. These can + take a long time to run when applied to a string that does not match. + Consider the pattern fragment + + ^(a+)* + + This can match "aaaa" in 16 different ways, and this number increases + very rapidly as the string gets longer. 
(The * repeat can match 0, 1, + 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + + repeats can match different numbers of times.) When the remainder of + the pattern is such that the entire match is going to fail, PCRE2 has + in principle to try every possible variation, and this can take an + extremely long time, even for relatively short strings. + + An optimization catches some of the more simple cases such as + + (a+)*b + + where a literal character follows. Before embarking on the standard + matching procedure, PCRE2 checks that there is a "b" later in the sub- + ject string, and if there is not, it fails the match immediately. How- + ever, when there is no following literal this optimization cannot be + used. You can see the difference by comparing the behaviour of + + (a+)*\d + + with the pattern above. The former gives a failure almost instantly + when applied to a whole line of "a" characters, whereas the latter + takes an appreciable time with strings longer than about 20 characters. + + In many cases, the solution to this kind of performance issue is to use + an atomic group or a possessive quantifier. + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 02 January 2015 + Copyright (c) 1997-2015 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +SYNOPSIS + + #include <pcre2posix.h> + + int regcomp(regex_t *preg, const char *pattern, + int cflags); + + int regexec(const regex_t *preg, const char *string, + size_t nmatch, regmatch_t pmatch[], int eflags); + + size_t regerror(int errcode, const regex_t *preg, + char *errbuf, size_t errbuf_size); + + void regfree(regex_t *preg); + + +DESCRIPTION + + This set of functions provides a POSIX-style API for the PCRE2 regular + expression 8-bit library. 
See the pcre2api documentation for a descrip- + tion of PCRE2's native API, which contains much additional functional- + ity. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit + libraries. + + The functions described here are just wrapper functions that ultimately + call the PCRE2 native API. Their prototypes are defined in the + pcre2posix.h header file, and on Unix systems the library itself is + called libpcre2-posix.a, so can be accessed by adding -lpcre2-posix to + the command for linking an application that uses them. Because the + POSIX functions call the native ones, it is also necessary to add + -lpcre2-8. + + Those POSIX option bits that can reasonably be mapped to PCRE2 native + options have been implemented. In addition, the option REG_EXTENDED is + defined with the value zero. This has no effect, but since programs + that are written to the POSIX interface often use it, this makes it + easier to slot in PCRE2 as a replacement library. Other POSIX options + are not even defined. + + There are also some options that are not defined by POSIX. These have + been added at the request of users who want to make use of certain + PCRE2-specific features via the POSIX calling interface. + + When PCRE2 is called via these functions, it is only the API that is + POSIX-like in style. The syntax and semantics of the regular expres- + sions themselves are still those of Perl, subject to the setting of + various PCRE2 options, as described below. "POSIX-like in style" means + that the API approximates to the POSIX definition; it is not fully + POSIX-compatible, and in multi-unit encoding domains it is probably + even less compatible. + + The header for these functions is supplied as pcre2posix.h to avoid any + potential clash with other POSIX libraries. It can, of course, be + renamed or aliased as regex.h, which is the "correct" name. 
It provides + two structure types, regex_t for compiled internal forms, and reg- + match_t for returning captured substrings. It also defines some con- + stants whose names start with "REG_"; these are used for setting + options and identifying error codes. + + +COMPILING A PATTERN + + The function regcomp() is called to compile a pattern into an internal + form. The pattern is a C string terminated by a binary zero, and is + passed in the argument pattern. The preg argument is a pointer to a + regex_t structure that is used as a base for storing information about + the compiled regular expression. + + The argument cflags is either zero, or contains one or more of the bits + defined by the following macros: + + REG_DOTALL + + The PCRE2_DOTALL option is set when the regular expression is passed + for compilation to the native function. Note that REG_DOTALL is not + part of the POSIX standard. + + REG_ICASE + + The PCRE2_CASELESS option is set when the regular expression is passed + for compilation to the native function. + + REG_NEWLINE + + The PCRE2_MULTILINE option is set when the regular expression is passed + for compilation to the native function. Note that this does not mimic + the defined POSIX behaviour for REG_NEWLINE (see the following sec- + tion). + + REG_NOSUB + + When a pattern that is compiled with this flag is passed to regexec() + for matching, the nmatch and pmatch arguments are ignored, and no cap- + tured strings are returned. Versions of the PCRE library prior to 10.22 + used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no + longer happens because it disables the use of back references. + + REG_UCP + + The PCRE2_UCP option is set when the regular expression is passed for + compilation to the native function. This causes PCRE2 to use Unicode + properties when matching \d, \w, etc., instead of just recognizing + ASCII values. Note that REG_UCP is not part of the POSIX standard. 
+ + REG_UNGREEDY + + The PCRE2_UNGREEDY option is set when the regular expression is passed + for compilation to the native function. Note that REG_UNGREEDY is not + part of the POSIX standard. + + REG_UTF + + The PCRE2_UTF option is set when the regular expression is passed for + compilation to the native function. This causes the pattern itself and + all data strings used for matching it to be treated as UTF-8 strings. + Note that REG_UTF is not part of the POSIX standard. + + In the absence of these flags, no options are passed to the native + function. This means that the regex is compiled with PCRE2 default + semantics. In particular, the way it handles newline characters in the + subject string is the Perl way, not the POSIX way. Note that setting + PCRE2_MULTILINE has only some of the effects specified for REG_NEWLINE. + It does not affect the way newlines are matched by the dot metacharac- + ter (they are not) or by a negative class such as [^a] (they are). + + The yield of regcomp() is zero on success, and non-zero otherwise. The + preg structure is filled in on success, and one member of the structure + is public: re_nsub contains the number of capturing subpatterns in the + regular expression. Various error codes are defined in the header file. + + NOTE: If the yield of regcomp() is non-zero, you must not attempt to + use the contents of the preg structure. If, for example, you pass it to + regexec(), the result is undefined and your program is likely to crash. + + +MATCHING NEWLINE CHARACTERS + + This area is not simple, because POSIX and Perl take different views of + things. It is not possible to get PCRE2 to obey POSIX semantics, but + then PCRE2 was never intended to be a POSIX engine. The following table + lists the different possibilities for matching newline characters in + Perl and PCRE2: + + Default Change with + + . 
matches newline no PCRE2_DOTALL + newline matches [^a] yes not changeable + $ matches \n at end yes PCRE2_DOLLAR_ENDONLY + $ matches \n in middle no PCRE2_MULTILINE + ^ matches \n in middle no PCRE2_MULTILINE + + This is the equivalent table for a POSIX-compatible pattern matcher: + + Default Change with + + . matches newline yes REG_NEWLINE + newline matches [^a] yes REG_NEWLINE + $ matches \n at end no REG_NEWLINE + $ matches \n in middle no REG_NEWLINE + ^ matches \n in middle no REG_NEWLINE + + This behaviour is not what happens when PCRE2 is called via its POSIX + API. By default, PCRE2's behaviour is the same as Perl's, except that + there is no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 + and Perl, there is no way to stop newline from matching [^a]. + + Default POSIX newline handling can be obtained by setting PCRE2_DOTALL + and PCRE2_DOLLAR_ENDONLY when calling pcre2_compile() directly, but + there is no way to make PCRE2 behave exactly as for the REG_NEWLINE + action. When using the POSIX API, passing REG_NEWLINE to PCRE2's reg- + comp() function causes PCRE2_MULTILINE to be passed to pcre2_compile(), + and REG_DOTALL passes PCRE2_DOTALL. There is no way to pass PCRE2_DOL- + LAR_ENDONLY. + + +MATCHING A PATTERN + + The function regexec() is called to match a compiled pattern preg + against a given string, which is by default terminated by a zero byte + (but see REG_STARTEND below), subject to the options in eflags. These + can be: + + REG_NOTBOL + + The PCRE2_NOTBOL option is set when calling the underlying PCRE2 match- + ing function. + + REG_NOTEMPTY + + The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 + matching function. Note that REG_NOTEMPTY is not part of the POSIX + standard. However, setting this option can give more POSIX-like behav- + iour in some situations. + + REG_NOTEOL + + The PCRE2_NOTEOL option is set when calling the underlying PCRE2 match- + ing function. 
+ + REG_STARTEND + + The string is considered to start at string + pmatch[0].rm_so and to + have a terminating NUL located at string + pmatch[0].rm_eo (there need + not actually be a NUL at that location), regardless of the value of + nmatch. This is a BSD extension, compatible with but not specified by + IEEE Standard 1003.2 (POSIX.2), and should be used with caution in + software intended to be portable to other systems. Note that a non-zero + rm_so does not imply REG_NOTBOL; REG_STARTEND affects only the location + of the string, not how it is matched. Setting REG_STARTEND and passing + pmatch as NULL are mutually exclusive; the error REG_INVARG is + returned. + + If the pattern was compiled with the REG_NOSUB flag, no data about any + matched strings is returned. The nmatch and pmatch arguments of + regexec() are ignored (except possibly as input for REG_STARTEND). + + The value of nmatch may be zero, and the value pmatch may be NULL + (unless REG_STARTEND is set); in both these cases no data about any + matched strings is returned. + + Otherwise, the portion of the string that was matched, and also any + captured substrings, are returned via the pmatch argument, which points + to an array of nmatch structures of type regmatch_t, containing the + members rm_so and rm_eo. These contain the byte offset to the first + character of each substring and the offset to the first character after + the end of each substring, respectively. The 0th element of the vector + relates to the entire portion of string that was matched; subsequent + elements relate to the capturing subpatterns of the regular expression. + Unused entries in the array have both structure members set to -1. + + A successful match yields a zero return; various error codes are + defined in the header file, of which REG_NOMATCH is the "expected" + failure code. + + +ERROR MESSAGES + + The regerror() function maps a non-zero errorcode from either regcomp() + or regexec() to a printable message. 
If preg is not NULL, the error + should have arisen from the use of that structure. A message terminated + by a binary zero is placed in errbuf. If the buffer is too short, only + the first errbuf_size - 1 characters of the error message are used. The + yield of the function is the size of buffer needed to hold the whole + message, including the terminating zero. This value is greater than + errbuf_size if the message was truncated. + + +MEMORY USAGE + + Compiling a regular expression causes memory to be allocated and asso- + ciated with the preg structure. The function regfree() frees all such + memory, after which preg may no longer be used as a compiled expres- + sion. + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 31 January 2016 + Copyright (c) 1997-2016 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +PCRE2 SAMPLE PROGRAM + + A simple, complete demonstration program to get you started with using + PCRE2 is supplied in the file pcre2demo.c in the src directory in the + PCRE2 distribution. A listing of this program is given in the pcre2demo + documentation. If you do not have a copy of the PCRE2 distribution, you + can save this listing to re-create the contents of pcre2demo.c. + + The demonstration program compiles the regular expression that is its + first argument, and matches it against the subject string in its second + argument. No PCRE2 options are set, and default character tables are + used. If matching succeeds, the program outputs the portion of the sub- + ject that matched, together with the contents of any captured sub- + strings. + + If the -g option is given on the command line, the program then goes on + to check for further matches of the same regular expression in the same + subject string. 
The logic is a little bit tricky because of the possi- + bility of matching an empty string. Comments in the code explain what + is going on. + + The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit + library. It handles strings and characters that are stored in 8-bit + code units. By default, one character corresponds to one code unit, + but if the pattern starts with "(*UTF)", both it and the subject are + treated as UTF-8 strings, where characters may occupy multiple code + units. + + If PCRE2 is installed in the standard include and library directories + for your operating system, you should be able to compile the demonstra- + tion program using a command like this: + + cc -o pcre2demo pcre2demo.c -lpcre2-8 + + If PCRE2 is installed elsewhere, you may need to add additional options + to the command line. For example, on a Unix-like system that has PCRE2 + installed in /usr/local, you can compile the demonstration program + using a command like this: + + cc -o pcre2demo -I/usr/local/include pcre2demo.c \ + -L/usr/local/lib -lpcre2-8 + + Once you have built the demonstration program, you can run simple tests + like this: + + ./pcre2demo 'cat|dog' 'the cat sat on the mat' + ./pcre2demo -g 'cat|dog' 'the dog sat on the cat' + + Note that there is a much more comprehensive test program, called + pcre2test, which supports many more facilities for testing regular + expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit, + though not all three need be installed). The pcre2demo program is pro- + vided as a relatively simple coding example. + + If you try to run pcre2demo when PCRE2 is not installed in the standard + library directory, you may get an error like this on some operating + systems (e.g. Solaris): + + ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file + or directory + + This is caused by the way shared library support works on those sys- + tems. 
You need to add + + -R/usr/local/lib + + (for example) to the compile command to get round this problem. + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 02 February 2016 + Copyright (c) 1997-2016 University of Cambridge. +------------------------------------------------------------------------------ +PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS + + int32_t pcre2_serialize_decode(pcre2_code **codes, + int32_t number_of_codes, const uint8_t *bytes, + pcre2_general_context *gcontext); + + int32_t pcre2_serialize_encode(pcre2_code **codes, + int32_t number_of_codes, uint8_t **serialized_bytes, + PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); + + void pcre2_serialize_free(uint8_t *bytes); + + int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes); + + If you are running an application that uses a large number of regular + expression patterns, it may be useful to store them in a precompiled + form instead of having to compile them every time the application is + run. However, if you are using the just-in-time optimization feature, + it is not possible to save and reload the JIT data, because it is posi- + tion-dependent. The host on which the patterns are reloaded must be + running the same version of PCRE2, with the same code unit width, and + must also have the same endianness, pointer width and PCRE2_SIZE type. + For example, patterns compiled on a 32-bit system using PCRE2's 16-bit + library cannot be reloaded on a 64-bit system, nor can they be reloaded + using the 8-bit library. + + +SECURITY CONCERNS + + The facility for saving and restoring compiled patterns is intended for + use within individual applications. 
As such, the data supplied to + pcre2_serialize_decode() is expected to be trusted data, not data from + arbitrary external sources. There is only some simple consistency + checking, not complete validation of what is being re-loaded. + + +SAVING COMPILED PATTERNS + + Before compiled patterns can be saved they must be serialized, that is, + converted to a stream of bytes. A single byte stream may contain any + number of compiled patterns, but they must all use the same character + tables. A single copy of the tables is included in the byte stream (its + size is 1088 bytes). For more details of character tables, see the sec- + tion on locale support in the pcre2api documentation. + + The function pcre2_serialize_encode() creates a serialized byte stream + from a list of compiled patterns. Its first two arguments specify the + list, being a pointer to a vector of pointers to compiled patterns, and + the length of the vector. The third and fourth arguments point to vari- + ables which are set to point to the created byte stream and its length, + respectively. The final argument is a pointer to a general context, + which can be used to specify custom memory management functions. If + this argument is NULL, malloc() is used to obtain memory for the byte + stream. The yield of the function is the number of serialized patterns, + or one of the following negative error codes: + + PCRE2_ERROR_BADDATA the number of patterns is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns + PCRE2_ERROR_MEMORY memory allocation failed + PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables + PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL + + PCRE2_ERROR_BADMAGIC means either that a pattern's code has been cor- + rupted, or that a slot in the vector does not point to a compiled pat- + tern. + + Once a set of patterns has been serialized you can save the data in any + appropriate manner. 
Here is sample code that compiles two patterns and + writes them to a file. It assumes that the variable fd refers to a file + that is open for output. The error checking that should be present in a + real application has been omitted for simplicity. + + int errorcode; + uint8_t *bytes; + PCRE2_SIZE erroroffset; + PCRE2_SIZE bytescount; + pcre2_code *list_of_codes[2]; + list_of_codes[0] = pcre2_compile("first pattern", + PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); + list_of_codes[1] = pcre2_compile("second pattern", + PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL); + errorcode = pcre2_serialize_encode(list_of_codes, 2, &bytes, + &bytescount, NULL); + errorcode = fwrite(bytes, 1, bytescount, fd); + + Note that the serialized data is binary data that may contain any of + the 256 possible byte values. On systems that make a distinction + between binary and non-binary data, be sure that the file is opened for + binary output. + + Serializing a set of patterns leaves the original data untouched, so + they can still be used for matching. Their memory must eventually be + freed in the usual way by calling pcre2_code_free(). When you have fin- + ished with the byte stream, it too must be freed by calling pcre2_seri- + alize_free(). + + +RE-USING PRECOMPILED PATTERNS + + In order to re-use a set of saved patterns you must first make the + serialized byte stream available in main memory (for example, by read- + ing from a file). The management of this memory block is up to the + application. You can use the pcre2_serialize_get_number_of_codes() + function to find out how many compiled patterns are in the serialized + data without actually decoding the patterns: + + uint8_t *bytes = <serialized data>; + int32_t number_of_codes = pcre2_serialize_get_number_of_codes(bytes); + + The pcre2_serialize_decode() function reads a byte stream and recreates + the compiled patterns in new memory blocks, setting pointers to them in + a vector. 
The first two arguments are a pointer to a suitable vector + and its length, and the third argument points to a byte stream. The + final argument is a pointer to a general context, which can be used to + specify custom memory management functions for the decoded patterns. + If this argument is NULL, malloc() and free() are used. After deserial- + ization, the byte stream is no longer needed and can be discarded. + + int32_t number_of_codes; + pcre2_code *list_of_codes[2]; + uint8_t *bytes = <serialized data>; + int32_t number_of_codes = + pcre2_serialize_decode(list_of_codes, 2, bytes, NULL); + + If the vector is not large enough for all the patterns in the byte + stream, it is filled with those that fit, and the remainder are + ignored. The yield of the function is the number of decoded patterns, + or one of the following negative error codes: + + PCRE2_ERROR_BADDATA second argument is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure + PCRE2_ERROR_MEMORY memory allocation failed + PCRE2_ERROR_NULL first or third argument is NULL + + PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was + compiled on a system with different endianness. + + Decoded patterns can be used for matching in the usual way, and must be + freed by calling pcre2_code_free(). However, be aware that there is a + potential race issue if you are using multiple patterns that were + decoded from a single byte stream in a multithreaded application. A + single copy of the character tables is used by all the decoded patterns + and a reference count is used to arrange for its memory to be automati- + cally freed when the last pattern is freed, but there is no locking on + this reference count. 
Therefore, if you want to call pcre2_code_free() + for these patterns in different threads, you must arrange your own + locking, and ensure that pcre2_code_free() cannot be called by two + threads at the same time. + + If a pattern was processed by pcre2_jit_compile() before being serial- + ized, the JIT data is discarded and so is no longer available after a + save/restore cycle. You can, however, process a restored pattern with + pcre2_jit_compile() if you wish. + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 24 May 2016 + Copyright (c) 1997-2016 University of Cambridge. +------------------------------------------------------------------------------ + + +PCRE2STACK(3) Library Functions Manual PCRE2STACK(3) + + + +NAME + PCRE2 - Perl-compatible regular expressions (revised API) + +PCRE2 DISCUSSION OF STACK USAGE + + When you call pcre2_match(), it makes use of an internal function + called match(). This calls itself recursively at branch points in the + pattern, in order to remember the state of the match so that it can + back up and try a different alternative after a failure. As matching + proceeds deeper and deeper into the tree of possibilities, the recur- + sion depth increases. The match() function is also called in other cir- + cumstances, for example, whenever a parenthesized sub-pattern is + entered, and in certain cases of repetition. + + Not all calls of match() increase the recursion depth; for an item such + as a* it may be called several times at the same level, after matching + different numbers of a's. Furthermore, in a number of cases where the + result of the recursive call would immediately be passed back as the + result of the current call (a "tail recursion"), the function is just + restarted instead. + + Each time the internal match() function is called recursively, it uses + memory from the process stack. 
For certain kinds of pattern and data, + very large amounts of stack may be needed, despite the recognition of + "tail recursion". Note that if PCRE2 is compiled with the -fsani- + tize=address option of the GCC compiler, the stack requirements are + greatly increased. + + The above comments apply when pcre2_match() is run in its normal inter- + pretive manner. If the compiled pattern was processed by pcre2_jit_com- + pile(), and just-in-time compiling was successful, and the options + passed to pcre2_match() were not incompatible, the matching process + uses the JIT-compiled code instead of the match() function. In this + case, the memory requirements are handled entirely differently. See the + pcre2jit documentation for details. + + The pcre2_dfa_match() function operates in a different way to + pcre2_match(), and uses recursion only when there is a regular expres- + sion recursion or subroutine call in the pattern. This includes the + processing of assertion and "once-only" subpatterns, which are handled + like subroutine calls. Normally, these are never very deep, and the + limit on the complexity of pcre2_dfa_match() is controlled by the + amount of workspace it is given. However, it is possible to write pat- + terns with runaway infinite recursions; such patterns will cause + pcre2_dfa_match() to run out of stack unless a limit is applied (see + below). + + The comments in the next three sections do not apply to + pcre2_dfa_match(); they are relevant only for pcre2_match() without the + JIT optimization. + + Reducing pcre2_match()'s stack usage + + You can often reduce the amount of recursion, and therefore the amount + of stack used, by modifying the pattern that is being matched. Con- + sider, for example, this pattern: + + ([^<]|<(?!inet))+ + + It matches from wherever it starts until it encounters " ...) named capturing group (Perl) + (?'name'...) named capturing group (Perl) + (?P ...) named capturing group (Python) + (?:...) 
non-capturing group + (?|...) non-capturing group; reset group numbers for + capturing groups in each alternative + + +ATOMIC GROUPS + + (?>...) atomic, non-capturing group + + +COMMENT + + (?#....) comment (not nestable) + + +OPTION SETTING + + (?i) caseless + (?J) allow duplicate names + (?m) multiline + (?s) single line (dotall) + (?U) default ungreedy (lazy) + (?x) extended (ignore white space) + (?-...) unset option(s) + + The following are recognized only at the very start of a pattern or + after one of the newline or \R options with similar syntax. More than + one of them may appear. + + (*LIMIT_MATCH=d) set the match limit to d (decimal number) + (*LIMIT_RECURSION=d) set the recursion limit to d (decimal number) + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) + (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) + + Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of + the limits set by the caller of pcre2_match() or pcre2_dfa_match(), not + increase them. The application can lock out the use of (*UTF) and + (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, + respectively, at compile time. + + +NEWLINE CONVENTION + + These are recognized only at the very start of the pattern or after + option settings with a similar syntax. + + (*CR) carriage return only + (*LF) linefeed only + (*CRLF) carriage return followed by linefeed + (*ANYCRLF) all three of the above + (*ANY) any Unicode newline sequence + + +WHAT \R MATCHES + + These are recognized only at the very start of the pattern or after + option setting with a similar syntax. 
+ + (*BSR_ANYCRLF) CR, LF, or CRLF + (*BSR_UNICODE) any Unicode newline sequence + + +LOOKAHEAD AND LOOKBEHIND ASSERTIONS + + (?=...) positive look ahead + (?!...) negative look ahead + (?<=...) positive look behind + (? reference by name (Perl) + \k'name' reference by name (Perl) + \g{name} reference by name (Perl) + \k{name} reference by name (.NET) + (?P=name) reference by name (Python) + + +SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) + + (?R) recurse whole pattern + (?n) call subpattern by absolute number + (?+n) call subpattern by relative number + (?-n) call subpattern by relative number + (?&name) call subpattern by name (Perl) + (?P>name) call subpattern by name (Python) + \g call subpattern by name (Oniguruma) + \g'name' call subpattern by name (Oniguruma) + \g call subpattern by absolute number (Oniguruma) + \g'n' call subpattern by absolute number (Oniguruma) + \g<+n> call subpattern by relative number (PCRE2 extension) + \g'+n' call subpattern by relative number (PCRE2 extension) + \g<-n> call subpattern by relative number (PCRE2 extension) + \g'-n' call subpattern by relative number (PCRE2 extension) + + +CONDITIONAL PATTERNS + + (?(condition)yes-pattern) + (?(condition)yes-pattern|no-pattern) + + (?(n) absolute reference condition + (?(+n) relative reference condition + (?(-n) relative reference condition + (?( ) named reference condition (Perl) + (?('name') named reference condition (Perl) + (?(name) named reference condition (PCRE2, deprecated) + (?(R) overall recursion condition + (?(Rn) specific numbered group recursion condition + (?(R&name) specific named group recursion condition + (?(DEFINE) define subpattern for reference + (?(VERSION[>]=n.m) test PCRE2 version + (?(assert) assertion condition + + Note the ambiguity of (?(R) and (?(Rn) which might be named reference + conditions or recursion tests. Such a condition is interpreted as a + reference condition if the relevant named group exists. 
+ + +BACKTRACKING CONTROL + + The following act immediately they are reached: + + (*ACCEPT) force successful match + (*FAIL) force backtrack; synonym (*F) + (*MARK:NAME) set name to be passed back; synonym (*:NAME) + + The following act only when a subsequent match failure causes a back- + track to reach them. They all force a match failure, but they differ in + what happens afterwards. Those that advance the start-of-match point do + so only if the pattern is not anchored. + + (*COMMIT) overall failure, no advance of starting point + (*PRUNE) advance to next starting character + (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE) + (*SKIP) advance to current matching position + (*SKIP:NAME) advance to position corresponding to an earlier + (*MARK:NAME); if not found, the (*SKIP) is ignored + (*THEN) local failure, backtrack to next alternation + (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN) + + +CALLOUTS + + (?C) callout (assumed number 0) + (?Cn) callout with numerical data n + (?C"text") callout with string data + + The allowed string delimiters are ` ' " ^ % # $ (which are the same for + the start and the end), and the starting delimiter { matched with the + ending delimiter }. To encode the ending delimiter within the string, + double it. + + +SEE ALSO + + pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), + pcre2(3). + + +AUTHOR + + Philip Hazel + University Computing Service + Cambridge, England. + + +REVISION + + Last updated: 23 December 2016 + Copyright (c) 1997-2016 University of Cambridge. +------------------------------------------------------------------------------ + + PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) @@ -5021,58 +9884,74 @@ WIDE CHARACTERS AND UTF MODES In UTF modes, the dot metacharacter matches one UTF character instead of a single code unit. 
- The escape sequence \C can be used to match a single code unit, in a - UTF mode, but its use can lead to some strange effects because it - breaks up multi-unit characters (see the description of \C in the - pcre2pattern documentation). The use of \C is not supported in the - alternative matching function pcre2_dfa_match(), nor is it supported in - UTF mode by the JIT optimization. If JIT optimization is requested for - a UTF pattern that contains \C, it will not succeed, and so the match- - ing will be carried out by the normal interpretive function. + The escape sequence \C can be used to match a single code unit in a UTF + mode, but its use can lead to some strange effects because it breaks up + multi-unit characters (see the description of \C in the pcre2pattern + documentation). + + The use of \C is not supported by the alternative matching function + pcre2_dfa_match() when in UTF-8 or UTF-16 mode, that is, when a charac- + ter may consist of more than one code unit. The use of \C in these + modes provokes a match-time error. Also, the JIT optimization does not + support \C in these modes. If JIT optimization is requested for a UTF-8 + or UTF-16 pattern that contains \C, it will not succeed, and so when + pcre2_match() is called, the matching will be carried out by the normal + interpretive function. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test - characters of any code value, but, by default, the characters that - PCRE2 recognizes as digits, spaces, or word characters remain the same - set as in non-UTF mode, all with code points less than 256. This - remains true even when PCRE2 is built to include Unicode support, - because to do otherwise would slow down matching in many common cases. - Note that this also applies to \b and \B, because they are defined in - terms of \w and \W. If you want to test for a wider sense of, say, - "digit", you can use explicit Unicode property tests such as \p{Nd}. 
- Alternatively, if you set the PCRE2_UCP option, the way that the char- - acter escapes work is changed so that Unicode properties are used to + characters of any code value, but, by default, the characters that + PCRE2 recognizes as digits, spaces, or word characters remain the same + set as in non-UTF mode, all with code points less than 256. This + remains true even when PCRE2 is built to include Unicode support, + because to do otherwise would slow down matching in many common cases. + Note that this also applies to \b and \B, because they are defined in + terms of \w and \W. If you want to test for a wider sense of, say, + "digit", you can use explicit Unicode property tests such as \p{Nd}. + Alternatively, if you set the PCRE2_UCP option, the way that the char- + acter escapes work is changed so that Unicode properties are used to determine which characters match. There are more details in the section on generic character types in the pcre2pattern documentation. - Similarly, characters that match the POSIX named character classes are + Similarly, characters that match the POSIX named character classes are all low-valued characters, unless the PCRE2_UCP option is set. - However, the special horizontal and vertical white space matching + However, the special horizontal and vertical white space matching escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char- acters, whether or not PCRE2_UCP is set. - Case-insensitive matching in UTF mode makes use of Unicode properties. - A few Unicode characters such as Greek sigma have more than two code- + Case-insensitive matching in UTF mode makes use of Unicode properties. + A few Unicode characters such as Greek sigma have more than two code- points that are case-equivalent, and these are treated as such. 
VALIDITY OF UTF STRINGS - When the PCRE2_UTF option is set, the strings passed as patterns and + When the PCRE2_UTF option is set, the strings passed as patterns and subjects are (by default) checked for validity on entry to the relevant - functions. If an invalid UTF string is passed, an negative error code - is returned. The code unit offset to the offending character can be - extracted from the match data block by calling pcre2_get_startchar(), + functions. If an invalid UTF string is passed, an negative error code + is returned. The code unit offset to the offending character can be + extracted from the match data block by calling pcre2_get_startchar(), which is used for this purpose after a UTF error. UTF-16 and UTF-32 strings can indicate their endianness by special code - knows as a byte-order mark (BOM). The PCRE2 functions do not handle + knows as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. - The entire string is checked before any other processing takes place. + A UTF string is checked before any other processing takes place. In the + case of pcre2_match() and pcre2_dfa_match() calls with a non-zero + starting offset, the check is applied only to that part of the subject + that could be inspected during matching, and there is a check that the + starting offset points to the first code unit of a character or to the + end of the subject. If there are no lookbehind assertions in the pat- + tern, the check starts at the starting offset. Otherwise, it starts at + the length of the longest lookbehind before the starting offset, or at + the start of the subject if there are not that many characters before + the starting offset. Note that the sequences \b and \B are one-charac- + ter lookbehinds. + In addition to checking the format of the string, there is a check to ensure that all code points lie in the range U+0 to U+10FFFF, excluding - the surrogate area. 
The so-called "non-character" code points are not + the surrogate area. The so-called "non-character" code points are not excluded because Unicode corrigendum #9 makes it clear that they should not be. @@ -5169,9 +10048,9 @@ VALIDITY OF UTF STRINGS The following negative error codes are given for invalid UTF-16 strings: - PCRE_UTF16_ERR1 Missing low surrogate at end of string - PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate - PCRE_UTF16_ERR3 Isolated low surrogate + PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string + PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate Errors in UTF-32 strings @@ -5179,8 +10058,8 @@ VALIDITY OF UTF STRINGS The following negative error codes are given for invalid UTF-32 strings: - PCRE_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) - PCRE_UTF32_ERR2 Code point is greater than 0x10ffff + PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) + PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff AUTHOR @@ -5192,8 +10071,8 @@ AUTHOR REVISION - Last updated: 23 November 2014 - Copyright (c) 1997-2014 University of Cambridge. + Last updated: 03 July 2016 + Copyright (c) 1997-2016 University of Cambridge. ------------------------------------------------------------------------------ diff --git a/pcre2/doc/pcre2_code_copy.3 b/pcre2/doc/pcre2_code_copy.3 new file mode 100644 index 000000000..09b47054d --- /dev/null +++ b/pcre2/doc/pcre2_code_copy.3 @@ -0,0 +1,31 @@ +.TH PCRE2_CODE_COPY 3 "22 November 2016" "PCRE2 10.23" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); +.fi +. +.SH DESCRIPTION +.rs +.sp +This function makes a copy of the memory used for a compiled pattern, excluding +any memory used by the JIT compiler. 
Without a subsequent call to +\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. The +pointer to the character tables is copied, not the tables themselves (see +\fBpcre2_code_copy_with_tables()\fP). The yield of the function is NULL if +\fIcode\fP is NULL or if sufficient memory cannot be obtained. +.P +There is a complete description of the PCRE2 native API in the +.\" HREF +\fBpcre2api\fP +.\" +page and a description of the POSIX API in the +.\" HREF +\fBpcre2posix\fP +.\" +page. diff --git a/pcre2/doc/pcre2_code_copy_with_tables.3 b/pcre2/doc/pcre2_code_copy_with_tables.3 new file mode 100644 index 000000000..cfbddb330 --- /dev/null +++ b/pcre2/doc/pcre2_code_copy_with_tables.3 @@ -0,0 +1,32 @@ +.TH PCRE2_CODE_COPY_WITH_TABLES 3 "22 November 2016" "PCRE2 10.23" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include <pcre2.h> +.PP +.nf +.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); +.fi +. +.SH DESCRIPTION +.rs +.sp +This function makes a copy of the memory used for a compiled pattern, excluding +any memory used by the JIT compiler. Without a subsequent call to +\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. +Unlike \fBpcre2_code_copy()\fP, a separate copy of the character tables is also +made, with the new code pointing to it. This memory will be automatically freed +when \fBpcre2_code_free()\fP is called. The yield of the function is NULL if +\fIcode\fP is NULL or if sufficient memory cannot be obtained. +.P +There is a complete description of the PCRE2 native API in the +.\" HREF +\fBpcre2api\fP +.\" +page and a description of the POSIX API in the +.\" HREF +\fBpcre2posix\fP +.\" +page. 
diff --git a/pcre2/doc/pcre2_code_free.3 b/pcre2/doc/pcre2_code_free.3 index 3a1c7d885..5127081e3 100644 --- a/pcre2/doc/pcre2_code_free.3 +++ b/pcre2/doc/pcre2_code_free.3 @@ -1,4 +1,4 @@ -.TH PCRE2_CODE_FREE 3 "21 October 2014" "PCRE2 10.00" +.TH PCRE2_CODE_FREE 3 "29 July 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -7,7 +7,7 @@ PCRE2 - Perl-compatible regular expressions (revised API) .B #include .PP .nf -.B pcre2_code_free(pcre2_code *\fIcode\fP); +.B void pcre2_code_free(pcre2_code *\fIcode\fP); .fi . .SH DESCRIPTION diff --git a/pcre2/doc/pcre2_dfa_match.3 b/pcre2/doc/pcre2_dfa_match.3 index f45da0df7..d2132d514 100644 --- a/pcre2/doc/pcre2_dfa_match.3 +++ b/pcre2/doc/pcre2_dfa_match.3 @@ -1,4 +1,4 @@ -.TH PCRE2_DFA_MATCH 3 "12 May 2013" "PCRE2 10.00" +.TH PCRE2_DFA_MATCH 3 "23 December 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -33,8 +33,8 @@ is \fBpcre2_match()\fP.) The arguments for this function are: \fIwscount\fP Number of elements in the vector .sp For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set -up a callout function. The \fIlength\fP and \fIstartoffset\fP values are code -units, not characters. The options are: +up a callout function or specify the recursion limit. The \fIlength\fP and +\fIstartoffset\fP values are code units, not characters. 
The options are: .sp PCRE2_ANCHORED Match only at the first position PCRE2_NOTBOL Subject is not the beginning of a line diff --git a/pcre2/doc/pcre2_get_error_message.3 b/pcre2/doc/pcre2_get_error_message.3 index 9ff53420d..9378b1835 100644 --- a/pcre2/doc/pcre2_get_error_message.3 +++ b/pcre2/doc/pcre2_get_error_message.3 @@ -1,4 +1,4 @@ -.TH PCRE2_GET_ERROR_MESSAGE 3 "21 October 2014" "PCRE2 10.00" +.TH PCRE2_GET_ERROR_MESSAGE 3 "17 June 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -23,7 +23,10 @@ errors are negative numbers. The arguments are: \fIbufflen\fP the length of the buffer (code units) .sp The function returns the length of the message, excluding the trailing zero, or -a negative error code if the buffer is too small. +the negative error code PCRE2_ERROR_NOMEMORY if the buffer is too small. In +this case, the returned message is truncated (but still with a trailing zero). +If \fIerrorcode\fP does not contain a recognized error code number, the +negative value PCRE2_ERROR_BADDATA is returned. .P There is a complete description of the PCRE2 native API in the .\" HREF diff --git a/pcre2/doc/pcre2_match_data_create.3 b/pcre2/doc/pcre2_match_data_create.3 index 2a92f0bcb..3b0a29e19 100644 --- a/pcre2/doc/pcre2_match_data_create.3 +++ b/pcre2/doc/pcre2_match_data_create.3 @@ -1,4 +1,4 @@ -.TH PCRE2_MATCH_DATA_CREATE 3 "22 October 2014" "PCRE2 10.00" +.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -7,7 +7,7 @@ PCRE2 - Perl-compatible regular expressions (revised API) .B #include .PP .nf -.B pcre2_match_data_create(uint32_t \fIovecsize\fP, +.B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .fi . 
diff --git a/pcre2/doc/pcre2_match_data_create_from_pattern.3 b/pcre2/doc/pcre2_match_data_create_from_pattern.3 index 83267d6f7..60bf77cc6 100644 --- a/pcre2/doc/pcre2_match_data_create_from_pattern.3 +++ b/pcre2/doc/pcre2_match_data_create_from_pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "24 October 2014" "PCRE2 10.00" +.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -7,8 +7,8 @@ PCRE2 - Perl-compatible regular expressions (revised API) .B #include .PP .nf -.B pcre2_match_data_create_from_pattern(const pcre2_code *\fIcode\fP, -.B " pcre2_general_context *\fIgcontext\fP);" +.B pcre2_match_data *pcre2_match_data_create_from_pattern( +.B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .fi . .SH DESCRIPTION diff --git a/pcre2/doc/pcre2_pattern_info.3 b/pcre2/doc/pcre2_pattern_info.3 index 8424e6f58..575840bbf 100644 --- a/pcre2/doc/pcre2_pattern_info.3 +++ b/pcre2/doc/pcre2_pattern_info.3 @@ -1,4 +1,4 @@ -.TH PCRE2_PATTERN_INFO 3 "01 December 2014" "PCRE2 10.00" +.TH PCRE2_PATTERN_INFO 3 "21 November 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -30,19 +30,20 @@ request are as follows: PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL - PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1 PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information 0 nothing set 1 first code unit is set 2 start of string or after newline + PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1 + PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \eC PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist in the pattern PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0 - PCRE2_INFO_LASTCODEUNIT Last 
code unit when type is 1 PCRE2_INFO_LASTCODETYPE Type of must-be-present information 0 nothing set 1 code unit is set + PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1 PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an empty string, 0 otherwise PCRE2_INFO_MATCHLIMIT Match limit if set, @@ -50,8 +51,8 @@ request are as follows: PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest lookbehind assertion PCRE2_INFO_MINLENGTH Lower bound length of matching strings - PCRE2_INFO_NAMEENTRYSIZE Size of name table entries PCRE2_INFO_NAMECOUNT Number of named subpatterns + PCRE2_INFO_NAMEENTRYSIZE Size of name table entries PCRE2_INFO_NAMETABLE Pointer to name table PCRE2_INFO_NEWLINE Code for the newline sequence: PCRE2_NEWLINE_CR diff --git a/pcre2/doc/pcre2_serialize_decode.3 b/pcre2/doc/pcre2_serialize_decode.3 index b362fcdff..57304a59c 100644 --- a/pcre2/doc/pcre2_serialize_decode.3 +++ b/pcre2/doc/pcre2_serialize_decode.3 @@ -1,4 +1,4 @@ -.TH PCRE2_SERIALIZE_DECODE 3 "19 January 2015" "PCRE2 10.10" +.TH PCRE2_SERIALIZE_DECODE 3 "02 September 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -8,7 +8,7 @@ PCRE2 - Perl-compatible regular expressions (revised API) .PP .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP," +.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .fi . 
diff --git a/pcre2/doc/pcre2_serialize_encode.3 b/pcre2/doc/pcre2_serialize_encode.3 index 57077eb1e..9c2963318 100644 --- a/pcre2/doc/pcre2_serialize_encode.3 +++ b/pcre2/doc/pcre2_serialize_encode.3 @@ -1,4 +1,4 @@ -.TH PCRE2_SERIALIZE_ENCODE 3 "19 January 2015" "PCRE2 10.10" +.TH PCRE2_SERIALIZE_ENCODE 3 "02 September 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -7,8 +7,8 @@ PCRE2 - Perl-compatible regular expressions (revised API) .B #include .PP .nf -.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP," +.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, +.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .fi . diff --git a/pcre2/doc/pcre2_set_max_pattern_length.3 b/pcre2/doc/pcre2_set_max_pattern_length.3 new file mode 100644 index 000000000..7aa01c775 --- /dev/null +++ b/pcre2/doc/pcre2_set_max_pattern_length.3 @@ -0,0 +1,31 @@ +.TH PCRE2_SET_MAX_PATTERN_LENGTH 3 "05 October 2016" "PCRE2 10.23" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function sets, in a compile context, the maximum text length (in code +units) of the pattern that can be compiled. The result is always zero. If a +longer pattern is passed to \fBpcre2_compile()\fP there is an immediate error +return. The default is effectively unlimited, being the largest value a +PCRE2_SIZE variable can hold. +.P +There is a complete description of the PCRE2 native API in the +.\" HREF +\fBpcre2api\fP +.\" +page and a description of the POSIX API in the +.\" HREF +\fBpcre2posix\fP +.\" +page. 
diff --git a/pcre2/doc/pcre2_set_offset_limit.3 b/pcre2/doc/pcre2_set_offset_limit.3 new file mode 100644 index 000000000..20fa1045d --- /dev/null +++ b/pcre2/doc/pcre2_set_offset_limit.3 @@ -0,0 +1,28 @@ +.TH PCRE2_SET_OFFSET_LIMIT 3 "22 September 2015" "PCRE2 10.21" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function sets the offset limit field in a match context. The result is +always zero. +.P +There is a complete description of the PCRE2 native API in the +.\" HREF +\fBpcre2api\fP +.\" +page and a description of the POSIX API in the +.\" HREF +\fBpcre2posix\fP +.\" +page. diff --git a/pcre2/doc/pcre2_substitute.3 b/pcre2/doc/pcre2_substitute.3 index edfcb0432..e69e0ccc0 100644 --- a/pcre2/doc/pcre2_substitute.3 +++ b/pcre2/doc/pcre2_substitute.3 @@ -1,4 +1,4 @@ -.TH PCRE2_SUBSTITUTE 3 "11 November 2014" "PCRE2 10.00" +.TH PCRE2_SUBSTITUTE 3 "12 December 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -47,20 +47,25 @@ units, not characters, as is the contents of the variable pointed at by \fIoutlengthptr\fP, which is updated to the actual length of the new string. 
The options are: .sp - PCRE2_ANCHORED Match only at the first position - PCRE2_NOTBOL Subject string is not the beginning of a line - PCRE2_NOTEOL Subject string is not the end of a line - PCRE2_NOTEMPTY An empty string is not a valid match - PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject - is not a valid match - PCRE2_NO_UTF_CHECK Do not check the subject or replacement for - UTF validity (only relevant if PCRE2_UTF - was set at compile time) - PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_ANCHORED Match only at the first position + PCRE2_NOTBOL Subject is not the beginning of a line + PCRE2_NOTEOL Subject is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the + subject is not a valid match + PCRE2_NO_UTF_CHECK Do not check the subject or replacement + for UTF validity (only relevant if + PCRE2_UTF was set at compile time) + PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing + PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length + PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset + PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string .sp The function returns the number of substitutions, which may be zero if there were no matches. The result can be greater than one only when -PCRE2_SUBSTITUTE_GLOBAL is set. +PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code +is returned. 
.P There is a complete description of the PCRE2 native API in the .\" HREF diff --git a/pcre2/doc/pcre2api.3 b/pcre2/doc/pcre2api.3 index 1147f89d4..e0a434af4 100644 --- a/pcre2/doc/pcre2api.3 +++ b/pcre2/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "22 April 2015" "PCRE2 10.20" +.TH PCRE2API 3 "24 December 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -20,13 +20,13 @@ document for an overview of all the PCRE2 documentation. .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP" .B " pcre2_compile_context *\fIccontext\fP);" .sp -.B pcre2_code_free(pcre2_code *\fIcode\fP); +.B void pcre2_code_free(pcre2_code *\fIcode\fP); .sp -.B pcre2_match_data_create(uint32_t \fIovecsize\fP, +.B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .sp -.B pcre2_match_data_create_from_pattern(const pcre2_code *\fIcode\fP, -.B " pcre2_general_context *\fIgcontext\fP);" +.B pcre2_match_data *pcre2_match_data_create_from_pattern( +.B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B int pcre2_match(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," @@ -90,6 +90,9 @@ document for an overview of all the PCRE2 documentation. .B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP, .B " const unsigned char *\fItables\fP);" .sp +.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.sp .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .sp @@ -120,6 +123,9 @@ document for an overview of all the PCRE2 documentation. 
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .sp +.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.sp .B int pcre2_set_recursion_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .sp @@ -210,11 +216,11 @@ document for an overview of all the PCRE2 documentation. .sp .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP," +.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .sp -.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP," +.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, +.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_serialize_free(uint8_t *\fIbytes\fP); @@ -227,6 +233,10 @@ document for an overview of all the PCRE2 documentation. .rs .sp .nf +.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); +.sp +.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); +.sp .B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP, .B " PCRE2_SIZE \fIbufflen\fP);" .sp @@ -346,9 +356,10 @@ More complicated programs might need to make use of the specialist functions \fBpcre2_jit_stack_create()\fP, \fBpcre2_jit_stack_free()\fP, and \fBpcre2_jit_stack_assign()\fP in order to control the JIT code's memory usage. .P -JIT matching is automatically used by \fBpcre2_match()\fP if it is available. -There is also a direct interface for JIT matching, which gives improved -performance. The JIT-specific functions are discussed in the +JIT matching is automatically used by \fBpcre2_match()\fP if it is available, +unless the PCRE2_NO_JIT option is set. 
There is also a direct interface for JIT +matching, which gives improved performance. The JIT-specific functions are +discussed in the .\" HREF \fBpcre2jit\fP .\" @@ -387,9 +398,16 @@ The function \fBpcre2_substitute()\fP can be called to match a pattern and return a copy of the subject string with substitutions for parts that were matched. .P +Functions whose names begin with \fBpcre2_serialize_\fP are used for saving +compiled patterns on disc or elsewhere, and reloading them later. +.P Finally, there are functions for finding out information about a compiled pattern (\fBpcre2_pattern_info()\fP) and about the configuration with which PCRE2 was built (\fBpcre2_config()\fP). +.P +Functions with names ending with \fB_free()\fP are used for freeing memory +blocks of various sorts. In all cases, if one of these functions is called with +a NULL argument, it does nothing. . . .SH "STRING LENGTHS AND OFFSETS" @@ -455,21 +473,53 @@ time ensuring that multithreaded applications can use it. .P There are several different blocks of data that are used to pass information between the application and the PCRE2 libraries. -.P -(1) A pointer to the compiled form of a pattern is returned to the user when +. +. +.SS "The compiled pattern" +.rs +.sp +A pointer to the compiled form of a pattern is returned to the user when \fBpcre2_compile()\fP is successful. The data in the compiled pattern is fixed, and does not change when the pattern is matched. Therefore, it is thread-safe, that is, the same compiled pattern can be used by more than one thread -simultaneously. An application can compile all its patterns at the start, -before forking off multiple threads that use them. However, if the just-in-time -optimization feature is being used, it needs separate memory stack areas for -each thread. See the +simultaneously. For example, an application can compile all its patterns at the +start, before forking off multiple threads that use them. 
However, if the +just-in-time optimization feature is being used, it needs separate memory stack +areas for each thread. See the .\" HREF \fBpcre2jit\fP .\" documentation for more details. .P -(2) The next section below introduces the idea of "contexts" in which PCRE2 +In a more complicated situation, where patterns are compiled only when they are +first needed, but are still shared between threads, pointers to compiled +patterns must be protected from simultaneous writing by multiple threads, at +least until a pattern has been compiled. The logic can be something like this: +.sp + Get a read-only (shared) lock (mutex) for pointer + if (pointer == NULL) + { + Get a write (unique) lock for pointer + pointer = pcre2_compile(... + } + Release the lock + Use pointer in pcre2_match() +.sp +Of course, testing for compilation errors should also be included in the code. +.P +If JIT is being used, but the JIT compilation is not being done immediately, +(perhaps waiting to see if the pattern is used often enough) similar logic is +required. JIT compilation updates a pointer within the compiled code block, so +a thread must gain unique write access to the pointer before calling +\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or +\fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the +compiled code. +. +. +.SS "Context blocks" +.rs +.sp +The next main section below introduces the idea of "contexts" in which PCRE2 functions are called. A context is nothing more than a collection of parameters that control the way PCRE2 operates. Grouping a number of parameters together in a context is a convenient way of passing them to a PCRE2 function without @@ -481,11 +531,15 @@ In a multithreaded application, if the parameters in a context are values that are never changed, the same context can be used by all the threads. However, if any thread needs to change any value in a context, it must make its own thread-specific copy. 
-.P -(3) The matching functions need a block of memory for working space and for -storing the results of a match. This includes details of what was matched, as -well as additional information such as the name of a (*MARK) setting. Each -thread must provide its own version of this memory. +. +. +.SS "Match blocks" +.rs +.sp +The matching functions need a block of memory for working space and for storing +the results of a match. This includes details of what was matched, as well as +additional information such as the name of a (*MARK) setting. Each thread must +provide its own copy of this memory. . . .SH "PCRE2 CONTEXTS" @@ -564,6 +618,7 @@ of the following compile-time parameters: PCRE2's character tables The newline character sequence The compile time nested parentheses limit + The maximum length of the pattern string An external function for stack checking .sp A compile context is also required if you are using custom memory management. @@ -607,6 +662,17 @@ argument is a general context. This function builds a set of character tables in the current locale. .sp .nf +.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.fi +.sp +This sets a maximum length, in code units, for the pattern string that is to be +compiled. If the pattern is longer, an error is generated. This facility is +provided so that applications that accept patterns from external sources can +limit their size. The default is the largest number that a PCRE2_SIZE variable +can hold, which is effectively unlimited. +.sp +.nf .B int pcre2_set_newline(pcre2_compile_context *\fIccontext\fP, .B " uint32_t \fIvalue\fP);" .fi @@ -630,7 +696,8 @@ functions, \fIpcre2_match()\fP and \fIpcre2_dfa_match()\fP. .sp This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops rogue patterns -using up too much system stack when being compiled. 
+using up too much system stack when being compiled. The limit applies to +parentheses of all kinds, not just capturing parentheses. .sp .nf .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, @@ -659,8 +726,9 @@ A match context is required if you want to change the default values of any of the following match-time parameters: .sp A callout function - The limit for calling \fImatch()\fP - The limit for calling \fImatch()\fP recursively + The offset limit for matching an unanchored pattern + The limit for calling \fBmatch()\fP (see below) + The limit for calling \fBmatch()\fP recursively .sp A match context is also required if you are using custom memory management. If none of these apply, just pass NULL as the context argument of @@ -696,6 +764,32 @@ during a matching operation. Details are given in the documentation. .sp .nf +.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP, +.B " PCRE2_SIZE \fIvalue\fP);" +.fi +.sp +The \fIoffset_limit\fP parameter limits how far an unanchored search can +advance in the subject string. The default value is PCRE2_UNSET. The +\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP functions return +PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given +offset is not found. For example, if the pattern /abc/ is matched against +"123abc" with an offset limit less than 3, the result is PCRE2_ERROR_NOMATCH. +A match can never be found if the \fIstartoffset\fP argument of +\fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP is greater than the offset +limit. +.P +When using this facility, you must set PCRE2_USE_OFFSET_LIMIT when calling +\fBpcre2_compile()\fP so that when JIT is in use, different code can be +compiled. If a match is started with a non-default offset limit when +PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. +.P +The offset limit facility can be used to track progress when searching large +subject strings. 
See also the PCRE2_FIRSTLINE option, which requires a match to +start within the first line of the subject. If this is set with an offset +limit, a match must occur in the first line and also within the offset limit. +In other words, whichever limit comes first is used. +.sp +.nf .B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP, .B " uint32_t \fIvalue\fP);" .fi @@ -746,20 +840,22 @@ This limit is of use only if it is set smaller than \fImatch_limit\fP. Limiting the recursion depth limits the amount of system stack that can be used, or, when PCRE2 has been compiled to use memory on the heap instead of the stack, the amount of heap memory that can be used. This limit is not relevant, -and is ignored, when matching is done using JIT compiled code or by the -\fBpcre2_dfa_match()\fP function. +and is ignored, when matching is done using JIT compiled code. However, it is +supported by \fBpcre2_dfa_match()\fP, which uses recursive function calls less +frequently than \fBpcre2_match()\fP, but which can be caused to use a lot of +stack by a recursive pattern such as /(.)(?1)/ matched to a very long string. .P The default value for \fIrecursion_limit\fP can be set when PCRE2 is built; the default default is the same value as the default for \fImatch_limit\fP. If the -limit is exceeded, \fBpcre2_match()\fP returns PCRE2_ERROR_RECURSIONLIMIT. A -value for the recursion limit may also be supplied by an item at the start of a -pattern of the form +limit is exceeded, \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP return +PCRE2_ERROR_RECURSIONLIMIT. A value for the recursion limit may also be +supplied by an item at the start of a pattern of the form .sp (*LIMIT_RECURSION=ddd) .sp where ddd is a decimal number. However, such a setting is ignored unless ddd is -less than the limit set by the caller of \fBpcre2_match()\fP or, if no such -limit is set, less than the default. 
+less than the limit set by the caller of \fBpcre2_match()\fP or +\fBpcre2_dfa_match()\fP or, if no such limit is set, less than the default. .sp .nf .B int pcre2_set_recursion_memory_management( @@ -905,7 +1001,7 @@ The \fIwhere\fP argument should point to a buffer that is at least 24 code units long. (The exact length required can be found by calling \fBpcre2_config()\fP with \fBwhere\fP set to NULL.) If PCRE2 has been compiled without Unicode support, the buffer is filled with the text "Unicode not -supported". Otherwise, the Unicode version string (for example, "7.0.0") is +supported". Otherwise, the Unicode version string (for example, "8.0.0") is inserted. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero. .sp @@ -933,35 +1029,69 @@ zero. .B " uint32_t \fIoptions\fP, int *\fIerrorcode\fP, PCRE2_SIZE *\fIerroroffset,\fP" .B " pcre2_compile_context *\fIccontext\fP);" .sp -.B pcre2_code_free(pcre2_code *\fIcode\fP); +.B void pcre2_code_free(pcre2_code *\fIcode\fP); +.sp +.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP); +.sp +.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP); .fi .P The \fBpcre2_compile()\fP function compiles a pattern into an internal form. -The pattern is defined by a pointer to a string of code units and a length, If +The pattern is defined by a pointer to a string of code units and a length. If the pattern is zero-terminated, the length can be specified as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of memory that -contains the compiled pattern and related data. The caller must free the memory -by calling \fBpcre2_code_free()\fP when it is no longer needed. +contains the compiled pattern and related data, or NULL if an error occurred. +.P +If the compile context argument \fIccontext\fP is NULL, memory for the compiled +pattern is obtained by calling \fBmalloc()\fP. 
Otherwise, it is obtained from +the same memory function that was used for the compile context. The caller must +free the memory by calling \fBpcre2_code_free()\fP when it is no longer needed. +.P +The function \fBpcre2_code_copy()\fP makes a copy of the compiled code in new +memory, using the same memory allocator as was used for the original. However, +if the code has been processed by the JIT compiler (see +.\" HTML +.\" +below), +.\" +the JIT information cannot be copied (because it is position-dependent). +The new copy can initially be used only for non-JIT matching, though it can be +passed to \fBpcre2_jit_compile()\fP if required. +.P +The \fBpcre2_code_copy()\fP function provides a way for individual threads in a +multithreaded application to acquire a private copy of shared compiled code. +However, it does not make a copy of the character tables used by the compiled +pattern; the new pattern code points to the same tables as the original code. +(See +.\" HTML +.\" +"Locale Support" +.\" +below for details of these character tables.) In many applications the same +tables are used throughout, so this behaviour is appropriate. Nevertheless, +there are occasions when a copy of a compiled pattern and the relevant tables +are needed. The \fBpcre2_code_copy_with_tables()\fP provides this facility. +Copies of both the code and the tables are made, with the new code pointing to +the new tables. The memory for the new tables is automatically freed when +\fBpcre2_code_free()\fP is called for the new copy of the compiled code. .P NOTE: When one of the matching functions is called, pointers to the compiled pattern and the subject string are set in the match data block so that they can -be referenced by the extraction functions. After running a match, you must not -free a compiled pattern (or a subject string) until after all operations on the +be referenced by the substring extraction functions. 
After running a match, you +must not free a compiled pattern (or a subject string) until after all +operations on the .\" HTML .\" match data block .\" have taken place. .P -If the compile context argument \fIccontext\fP is NULL, memory for the compiled -pattern is obtained by calling \fBmalloc()\fP. Otherwise, it is obtained from -the same memory function that was used for the compile context. -.P -The \fIoptions\fP argument contains various bit settings that affect the -compilation. It should be zero if no options are required. The available -options are described below. Some of them (in particular, those that are -compatible with Perl, but some others as well) can also be set and unset from -within the pattern (see the detailed description in the +The \fIoptions\fP argument for \fBpcre2_compile()\fP contains various bit +settings that affect the compilation. It should be zero if no options are +required. The available options are described below. Some of them (in +particular, those that are compatible with Perl, but some others as well) can +also be set and unset from within the pattern (see the detailed description in +the .\" HREF \fBpcre2pattern\fP .\" @@ -980,13 +1110,28 @@ above). .\" .P If \fIerrorcode\fP or \fIerroroffset\fP is NULL, \fBpcre2_compile()\fP returns -NULL immediately. Otherwise, if compilation of a pattern fails, -\fBpcre2_compile()\fP returns NULL, having set these variables to an error code -and an offset (number of code units) within the pattern, respectively. The -\fBpcre2_get_error_message()\fP function provides a textual message for each -error code. Compilation errors are positive numbers, but UTF formatting errors -are negative numbers. For an invalid UTF-8 or UTF-16 string, the offset is that -of the first code unit of the failing character. +NULL immediately. 
Otherwise, the variables to which these point are set to an +error code and an offset (number of code units) within the pattern, +respectively, when \fBpcre2_compile()\fP returns NULL because a compilation +error has occurred. The values are not defined when compilation is successful +and \fBpcre2_compile()\fP returns a non-NULL value. +.P +The value returned in \fIerroroffset\fP is an indication of where in the +pattern the error occurred. It is not necessarily the furthest point in the +pattern that was read. For example, after the error "lookbehind assertion is +not fixed length", the error offset points to the start of the failing +assertion. +.P +The \fBpcre2_get_error_message()\fP function (see "Obtaining a textual error +message" +.\" HTML +.\" +below) +.\" +provides a textual message for each error code. Compilation errors have +positive error codes; UTF formatting error codes are negative. For an invalid +UTF-8 or UTF-16 string, the offset is that of the first code unit of the +failing character. .P Some errors are not detected until the whole pattern has been scanned; in these cases, the offset passed back is the length of the pattern. Note that the @@ -1052,12 +1197,24 @@ after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +.sp + PCRE2_ALT_VERBNAMES +.sp +By default, for compatibility with Perl, the name in any verb sequence such as +(*MARK:NAME) is any sequence of characters that does not include a closing +parenthesis. The name is not processed in any way, and it is not possible to +include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES +option is set, normal backslash processing is applied to verb names and only an +unescaped closing parenthesis terminates the name. 
A closing parenthesis can be +included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED +option is set, unescaped whitespace in verb names is skipped and #-comments are +recognized, exactly as in the rest of the pattern. .sp PCRE2_AUTO_CALLOUT .sp If this bit is set, \fBpcre2_compile()\fP automatically inserts callout items, -all with number 255, before each pattern item. For discussion of the callout -facility, see the +all with number 255, before each pattern item, except immediately before or +after a callout in the pattern. For discussion of the callout facility, see the .\" HREF \fBpcre2callout\fP .\" @@ -1130,7 +1287,10 @@ built. .sp If this option is set, an unanchored pattern is required to match before or at the first newline in the subject string, though the matched text may continue -over the newline. +over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more +general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a +match must occur in the first line and also within the offset limit. In other +words, whichever limit comes first is used. .sp PCRE2_MATCH_UNSET_BACKREF .sp @@ -1168,7 +1328,8 @@ This option locks out the use of \eC in the pattern that is being compiled. This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because it may leave the current matching point in the middle of a multi-code-unit character. This option may be useful in applications that process patterns from -external sources. +external sources. Note that there is also a build-time option that permanently +locks out the use of \eC. .sp PCRE2_NEVER_UCP .sp @@ -1194,7 +1355,9 @@ If this option is set, it disables the use of numbered capturing parentheses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). There is no equivalent of this option -in Perl. +in Perl. 
Note that, if this option is set, references to capturing groups (back +references or recursion/subroutine calls) may only refer to named groups, +though the reference can be by name or by number. .sp PCRE2_NO_AUTO_POSSESS .sp @@ -1323,6 +1486,20 @@ support. This option inverts the "greediness" of the quantifiers so that they are not greedy by default, but become greedy if followed by "?". It is not compatible with Perl. It can also be set by a (?U) option setting within the pattern. +.sp + PCRE2_USE_OFFSET_LIMIT +.sp +This option must be set for \fBpcre2_compile()\fP if +\fBpcre2_set_offset_limit()\fP is going to be used to set a non-default offset +limit in a match context for matches that use this pattern. An error is +generated if an offset limit is set without this option. For more details, see +the description of \fBpcre2_set_offset_limit()\fP in the +.\" HTML +.\" +section +.\" +that describes match contexts. See also the PCRE2_FIRSTLINE +option above. .sp PCRE2_UTF .sp @@ -1341,17 +1518,24 @@ page. .SH "COMPILATION ERROR CODES" .rs .sp -There are over 80 positive error codes that \fBpcre2_compile()\fP may return if -it finds an error in the pattern. There are also some negative error codes that -are used for invalid UTF strings. These are the same as given by -\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, and are described in the +There are over 80 positive error codes that \fBpcre2_compile()\fP may return +(via \fIerrorcode\fP) if it finds an error in the pattern. There are also some +negative error codes that are used for invalid UTF strings. These are the same +as given by \fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP, and are described +in the .\" HREF \fBpcre2unicode\fP .\" -page. The \fBpcre2_get_error_message()\fP function can be called to obtain a -textual error message from any error code. +page. 
The \fBpcre2_get_error_message()\fP function (see "Obtaining a textual +error message" +.\" HTML +.\" +below) +.\" +can be called to obtain a textual error message from any error code. . . +.\" HTML .SH "JUST-IN-TIME (JIT) COMPILATION" .rs .sp @@ -1490,11 +1674,14 @@ are as follows: Return a copy of the pattern's options. The third argument should point to a \fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns -the compile options as modified by any top-level option settings at the start -of the pattern itself. In other words, they are the options that will be in -force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is -compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS, -PCRE2_MULTILINE, and PCRE2_EXTENDED. +the compile options as modified by any top-level (*XXX) option settings such as +(*UTF) at the start of the pattern itself. +.P +For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED +option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. +Option settings such as (?i) that can change within a pattern do not affect the +result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the +pattern. (This was different in some earlier releases.) .P A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if the first significant item in every top-level branch is one of the following: @@ -1537,17 +1724,27 @@ matches only CR, LF, or CRLF. .sp PCRE2_INFO_CAPTURECOUNT .sp -Return the number of capturing subpatterns in the pattern. The third argument -should point to an \fBuint32_t\fP variable. +Return the highest capturing subpattern number in the pattern. In patterns +where (?| is not used, this is also the total number of capturing subpatterns. +The third argument should point to an \fBuint32_t\fP variable. 
+.sp + PCRE2_INFO_FIRSTBITMAP +.sp +In the absence of a single first code unit for a non-anchored pattern, +\fBpcre2_compile()\fP may construct a 256-bit table that defines a fixed set of +values for the first code unit in any match. For example, a pattern that starts +with [abc] results in a table with three bits set. When code unit values +greater than 255 are supported, the flag bit for 255 means "any code unit of +value 255 or above". If such a table was constructed, a pointer to it is +returned. Otherwise NULL is returned. The third argument should point to an +\fBconst uint8_t *\fP variable. .sp PCRE2_INFO_FIRSTCODETYPE .sp Return information about the first code unit of any matched string, for a non-anchored pattern. The third argument should point to an \fBuint32_t\fP -variable. -.P -If there is a fixed first value, for example, the letter "c" from a pattern -such as (cat|cow|coyote), 1 is returned, and the character value can be +variable. If there is a fixed first value, for example, the letter "c" from a +pattern such as (cat|cow|coyote), 1 is returned, and the character value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is known that a match can occur only at the start of the subject or following a newline in the subject, 2 is returned. Otherwise, and for anchored @@ -1562,16 +1759,10 @@ value is always less than 256. In the 16-bit library the value can be up to 0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32 mode. .sp - PCRE2_INFO_FIRSTBITMAP + PCRE2_INFO_HASBACKSLASHC .sp -In the absence of a single first code unit for a non-anchored pattern, -\fBpcre2_compile()\fP may construct a 256-bit table that defines a fixed set of -values for the first code unit in any match. For example, a pattern that starts -with [abc] results in a table with three bits set. 
When code unit values -greater than 255 are supported, the flag bit for 255 means "any code unit of -value 255 or above". If such a table was constructed, a pointer to it is -returned. Otherwise NULL is returned. The third argument should point to an -\fBconst uint8_t *\fP variable. +Return 1 if the pattern contains any instances of \eC, otherwise 0. The third +argument should point to an \fBuint32_t\fP variable. .sp PCRE2_INFO_HASCRORLF .sp @@ -1597,12 +1788,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any matched string, other than at its start. The third argument should point to an \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using -PCRE2_INFO_LASTCODEUNIT. -.P -For anchored patterns, a last literal value is recorded only if it follows -something of variable length. For example, for the pattern /^a\ed+z\ed+/ the -returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for -/^a\edz\ed/ the returned value is 0. +PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is +recorded only if it follows something of variable length. For example, for the +pattern /^a\ed+z\ed+/ the returned value is 1 (with "z" returned from +PCRE2_INFO_LASTCODEUNIT), but for /^a\edz\ed/ the returned value is 0. .sp PCRE2_INFO_LASTCODEUNIT .sp @@ -1613,8 +1802,11 @@ value, 0 is returned. .sp PCRE2_INFO_MATCHEMPTY .sp -Return 1 if the pattern can match an empty string, otherwise 0. The third -argument should point to an \fBuint32_t\fP variable. +Return 1 if the pattern might match an empty string, otherwise 0. The third +argument should point to an \fBuint32_t\fP variable. When a pattern contains +recursive subroutine calls it is not always possible to determine whether or +not it can match an empty string. PCRE2 takes a cautious approach and returns 1 +in such cases. .sp PCRE2_INFO_MATCHLIMIT .sp @@ -1788,11 +1980,11 @@ documentation. 
.rs .sp .nf -.B pcre2_match_data_create(uint32_t \fIovecsize\fP, +.B pcre2_match_data *pcre2_match_data_create(uint32_t \fIovecsize\fP, .B " pcre2_general_context *\fIgcontext\fP);" .sp -.B pcre2_match_data_create_from_pattern(const pcre2_code *\fIcode\fP, -.B " pcre2_general_context *\fIgcontext\fP);" +.B pcre2_match_data *pcre2_match_data_create_from_pattern( +.B " const pcre2_code *\fIcode\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_match_data_free(pcre2_match_data *\fImatch_data\fP); .fi @@ -1801,7 +1993,7 @@ Information about a successful or unsuccessful match is placed in a match data block, which is an opaque structure that is accessed by function calls. In particular, the match data block contains a vector of offsets into the subject string that define the matched part of the subject and any substrings that were -captured. This is know as the \fIovector\fP. +captured. This is known as the \fIovector\fP. .P Before calling \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP, or \fBpcre2_jit_match()\fP you must create a match data block by calling one of @@ -1964,13 +2156,14 @@ pattern does not require the match to be at the start of the subject. .sp The unused bits of the \fIoptions\fP argument for \fBpcre2_match()\fP must be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL, -PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, -PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. +PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, +PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is +described below. .P Setting PCRE2_ANCHORED at match time is not supported by the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the normal interpretive -code in \fBpcre2_match()\fP is run. The remaining options are supported for JIT -matching. +code in \fBpcre2_match()\fP is run. 
Apart from PCRE2_NO_JIT (obviously), the +remaining options are supported for JIT matching. .sp PCRE2_ANCHORED .sp @@ -2017,17 +2210,31 @@ only at the first matching position, that is, at the start of the subject plus the starting offset. An empty string match later in the subject is permitted. If the pattern is anchored, such a match can occur only if the pattern contains \eK. +.sp + PCRE2_NO_JIT +.sp +By default, if a pattern has been successfully processed by +\fBpcre2_jit_compile()\fP, JIT is automatically used when \fBpcre2_match()\fP +is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use +of JIT; it forces matching to be done by the interpreter. .sp PCRE2_NO_UTF_CHECK .sp When PCRE2_UTF is set at compile time, the validity of the subject as a UTF string is checked by default when \fBpcre2_match()\fP is subsequently called. -The entire string is checked before any other processing takes place, and a +If a non-zero starting offset is given, the check is applied only to that part +of the subject that could be inspected during matching, and there is a check +that the starting offset points to the first code unit of a character or to the +end of the subject. If there are no lookbehind assertions in the pattern, the +check starts at the starting offset. Otherwise, it starts at the length of the +longest lookbehind before the starting offset, or at the start of the subject +if there are not that many characters before the starting offset. Note that the +sequences \eb and \eB are one-character lookbehinds. +.P +The check is carried out before any other processing takes place, and a negative error code is returned if the check fails. There are several UTF error codes for each code unit width, corresponding to different problems with the -code unit sequence. The value of \fIstartoffset\fP is also checked, to ensure -that it points to the start of a character or to the end of the subject. 
There -are discussions about the validity of +code unit sequence. There are discussions about the validity of .\" HTML .\" UTF-8 strings, @@ -2092,9 +2299,19 @@ standard convention for the operating system. The default can be overridden in a .\" HTML .\" -compile context. +compile context .\" -During matching, the newline choice affects the behaviour of the dot, +by calling \fBpcre2_set_newline()\fP. It can also be overridden by starting a +pattern string with, for example, (*CRLF), as described in the +.\" HTML +.\" +section on newline conventions +.\" +in the +.\" HREF +\fBpcre2pattern\fP +.\" +page. During matching, the newline choice affects the behaviour of the dot, circumflex, and dollar metacharacters. It may also alter the way the match starting position is advanced after a match failure for an unanchored pattern. .P @@ -2140,18 +2357,7 @@ that do not cause substrings to be captured. The \fBpcre2_pattern_info()\fP function can be used to find out how many capturing subpatterns there are in a compiled pattern. .P -A successful match returns the overall matched string and any captured -substrings to the caller via a vector of PCRE2_SIZE values. This is called the -\fBovector\fP, and is contained within the -.\" HTML -.\" -match data block. -.\" -You can obtain direct access to the ovector by calling -\fBpcre2_get_ovector_pointer()\fP to find its address, and -\fBpcre2_get_ovector_count()\fP to find the number of pairs of values it -contains. Alternatively, you can use the auxiliary functions for accessing -captured substrings +You can use auxiliary functions for accessing captured substrings .\" HTML .\" by number @@ -2159,9 +2365,20 @@ by number or .\" HTML .\" -by name +by name, .\" -(see below). +as described in sections below. +.P +Alternatively, you can make direct use of the vector of PCRE2_SIZE values, +called the \fBovector\fP, which contains the offsets of captured strings. It is +part of the +.\" HTML +.\" +match data block. 
+.\" +The function \fBpcre2_get_ovector_pointer()\fP returns the address of the +ovector, and \fBpcre2_get_ovector_count()\fP returns the number of pairs of +values it contains. .P Within the ovector, the first in each pair of values is set to the offset of the first code unit of a substring, and the second is set to the offset of the @@ -2245,7 +2462,12 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and \fBpcre2_get_mark()\fP can be called. It returns a pointer to the zero-terminated name, which is within the compiled pattern. Otherwise NULL is -returned. After a successful match, the (*MARK) name that is returned is the +returned. The length of the (*MARK) name (excluding the terminating zero) is +stored in the code unit that precedes the name. You should use this instead of +relying on the terminating zero if the (*MARK) name might contain a binary +zero. +.P +After a successful match, the (*MARK) name that is returned is the last one encountered on the matching path through the pattern. After a "no match" or a partial match, the last encountered (*MARK) name is returned. For example, consider this pattern: @@ -2264,7 +2486,7 @@ different to the value of \fIovector[0]\fP if the pattern contains the \eK escape sequence. After a partial match, however, this value is always the same as \fIovector[0]\fP because \eK does not affect the result of a partial match. .P -After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain +After a UTF check failure, \fBpcre2_get_startchar()\fP can be used to obtain the code unit offset of the invalid UTF character. Details are given in the .\" HREF \fBpcre2unicode\fP @@ -2277,11 +2499,16 @@ page. .rs .sp If \fBpcre2_match()\fP fails, it returns a negative number. This can be -converted to a text string by calling \fBpcre2_get_error_message()\fP. 
Negative -error codes are also returned by other functions, and are documented with them. -The codes are given names in the header file. If UTF checking is in force and -an invalid UTF subject string is detected, one of a number of UTF-specific -negative error codes is returned. Details are given in the +converted to a text string by calling the \fBpcre2_get_error_message()\fP +function (see "Obtaining a textual error message" +.\" HTML +.\" +below). +.\" +Negative error codes are also returned by other functions, and are documented +with them. The codes are given names in the header file. If UTF checking is in +force and an invalid UTF subject string is detected, one of a number of +UTF-specific negative error codes is returned. Details are given in the .\" HREF \fBpcre2unicode\fP .\" @@ -2394,6 +2621,30 @@ is attempted. The internal recursion limit was reached. . . +.\" HTML +.SH "OBTAINING A TEXTUAL ERROR MESSAGE" +.rs +.sp +.nf +.B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP, +.B " PCRE2_SIZE \fIbufflen\fP);" +.fi +.P +A text message for an error code from any PCRE2 function (compile, match, or +auxiliary) can be obtained by calling \fBpcre2_get_error_message()\fP. The code +is passed as the first argument, with the remaining two arguments specifying a +code unit buffer and its length, into which the text message is placed. Note +that the message is returned in code units of the appropriate width for the +library that is being used. +.P +The returned message is terminated with a trailing zero, and the function +returns the number of code units used, excluding the trailing zero. If the +error number is unknown, the negative error code PCRE2_ERROR_BADDATA is +returned. If the buffer is too small, the message is truncated (but still with +a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. +None of the messages are very long; a buffer size of 120 code units is ample. +. +. 
.\" HTML .SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER" .rs @@ -2595,32 +2846,17 @@ same number causes an error at compile time. .B int pcre2_substitute(const pcre2_code *\fIcode\fP, PCRE2_SPTR \fIsubject\fP, .B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP," .B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP," -.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacementzfP," +.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP," .B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\zfP," .B " PCRE2_SIZE *\fIoutlengthptr\fP);" .fi +.P This function calls \fBpcre2_match()\fP and then makes a copy of the subject string in \fIoutputbuffer\fP, replacing the part that was matched with the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can -be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. -.P -In the replacement string, which is interpreted as a UTF string in UTF mode, -and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a -dollar character is an escape character that can specify the insertion of -characters from capturing groups in the pattern. The following forms are -recognized: -.sp - $$ insert a dollar character - $ insert the contents of group - ${ } insert the contents of group -.sp -Either a group number or a group name can be given for . Curly brackets are -required only if the following character would be interpreted as part of the -number or name. The number may be zero to include the entire matched string. -For example, if the pattern a(b)c is matched with "=abc=" and the replacement -string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by -calling \fBpcre2_copy_byname()\fP or \fBpcre2_copy_bynumber()\fP as -appropriate. +be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. 
Matches in +which a \eK item in a lookahead in the pattern causes the match to end before +it starts are not supported, and give rise to an error return. .P The first seven arguments of \fBpcre2_substitute()\fP are the same as for \fBpcre2_match()\fP, except that the partial matching options are not @@ -2629,23 +2865,169 @@ data block is obtained and freed within this function, using memory management functions from the match context, if provided, or else those that were used to allocate memory for the compiled code. .P -There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the -function to iterate over the subject string, replacing every matching -substring. If this is not set, only the first matching substring is replaced. -.P The \fIoutlengthptr\fP argument must point to a variable that contains the -length, in code units, of the output buffer. It is updated to contain the -length of the new string, excluding the trailing zero that is automatically -added. +length, in code units, of the output buffer. If the function is successful, the +value is updated to contain the length of the new string, excluding the +trailing zero that is automatically added. .P -The function returns the number of replacements that were made. This may be -zero if no matches were found, and is never greater than 1 unless -PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code -is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any -errors from \fBpcre2_match()\fP or the substring copying functions are passed -straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid -replacement string (unrecognized sequence following a dollar sign), and -PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. +If the function is not successful, the value set via \fIoutlengthptr\fP depends +on the type of error. 
For syntax errors in the replacement string, the value is +the offset in the replacement string where the error was detected. For other +errors, the value is PCRE2_UNSET by default. This includes the case of the +output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set +(see below), in which case the value is the minimum length needed, including +space for the trailing zero. Note that in order to compute the required length, +\fBpcre2_substitute()\fP has to simulate all the matching and copying, instead +of giving an error return as soon as the buffer overflows. Note also that the +length is in code units, not bytes. +.P +In the replacement string, which is interpreted as a UTF string in UTF mode, +and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a +dollar character is an escape character that can specify the insertion of +characters from capturing groups or (*MARK) items in the pattern. The following +forms are always recognized: +.sp + $$ insert a dollar character + $<n> or ${<n>} insert the contents of group <n> + $*MARK or ${*MARK} insert the name of the last (*MARK) encountered +.sp +Either a group number or a group name can be given for <n>. Curly brackets are +required only if the following character would be interpreted as part of the +number or name. The number may be zero to include the entire matched string. +For example, if the pattern a(b)c is matched with "=abc=" and the replacement +string "+$1$0$1+", the result is "=+babcb+=". +.P +The facility for inserting a (*MARK) name can be used to perform simple +simultaneous substitutions, as this \fBpcre2test\fP example shows: +.sp + /(*:pear)apple|(*:orange)lemon/g,replace=${*MARK} + apple lemon + 2: pear orange +.sp +As well as the usual options for \fBpcre2_match()\fP, a number of additional +options can be set in the \fIoptions\fP argument. +.P +PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string, +replacing every matching substring. 
If this is not set, only the first matching +substring is replaced. If any matched substring has zero length, after the +substitution has happened, an attempt to find a non-empty match at the same +position is performed. If this is not successful, the current position is +advanced by one character except when CRLF is a valid newline sequence and the +next two characters are CR, LF. In this case, the current position is advanced +by two characters. +.P +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is +too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If +this option is set, however, \fBpcre2_substitute()\fP continues to go through +the motions of matching and substituting (without, of course, writing anything) +in order to compute the size of buffer that is needed. This value is passed +back via the \fIoutlengthptr\fP variable, with the result of the function still +being PCRE2_ERROR_NOMEMORY. +.P +Passing a buffer size of zero is a permitted way of finding out how much memory +is needed for a given substitution. However, this does mean that the entire +operation is carried out twice. Depending on the application, it may be more +efficient to allocate a large buffer and free the excess afterwards, instead of +using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. +.P +PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do +not appear in the pattern to be treated as unset groups. This option should be +used with care, because it means that a typo in a group name or number no +longer causes the PCRE2_ERROR_NOSUBSTRING error. +.P +PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown +groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty +strings when inserted as described above. If this option is not set, an attempt +to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does +not influence the extended substitution syntax described below. 
+.P +PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the +replacement string. Without this option, only the dollar character is special, +and only the group insertion forms listed above are valid. When +PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +.P +Firstly, backslash in a replacement string is interpreted as an escape +character. The usual forms such as \en or \ex{ddd} can be used to specify +particular character codes, and backslash followed by any non-alphanumeric +character quotes that character. Extended quoting can be coded using \eQ...\eE, +exactly as in pattern strings. +.P +There are also four escape sequences for forcing the case of inserted letters. +The insertion mechanism has three states: no case forcing, force upper case, +and force lower case. The escape sequences change the current state: \eU and +\eL change to upper or lower case forcing, respectively, and \eE (when not +terminating a \eQ quoted sequence) reverts to no case forcing. The sequences +\eu and \el force the next character (if it is a letter) to upper or lower +case, respectively, and then the state automatically reverts to no case +forcing. Case forcing applies to all inserted characters, including those from +captured groups and letters within \eQ...\eE quoted sequences. +.P +Note that case forcing sequences such as \eU...\eE do not nest. For example, +the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no +effect. +.P +The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +flexibility to group substitution. The syntax is similar to that used by Bash: +.sp + ${<n>:-<string>} + ${<n>:+<string1>:<string2>} +.sp +As before, <n> may be a group number or a name. The first form specifies a +default value. If group <n> is set, its value is inserted; if not, <string> is +expanded and the result inserted. The second form specifies strings that are +expanded and inserted when group <n> is set or unset, respectively. 
The first +form is just a convenient shorthand for +.sp + ${<n>:+${<n>}:<string>} +.sp +Backslash can be used to escape colons and closing curly brackets in the +replacement strings. A change of the case forcing state within a replacement +string remains in force afterwards, as shown in this \fBpcre2test\fP example: +.sp + /(some)?(body)/substitute_extended,replace=${1:+\eU:\eL}HeLLo + body + 1: hello + somebody + 1: HELLO +.sp +The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended +substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown +groups in the extended syntax forms to be treated as unset. +.P +If successful, \fBpcre2_substitute()\fP returns the number of replacements that +were made. This may be zero if no matches were found, and is never greater than +1 unless PCRE2_SUBSTITUTE_GLOBAL is set. +.P +In the event of an error, a negative error code is returned. Except for +PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP +are passed straight back. +.P +PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion, +unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. +.P +PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an +unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple +(non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set. +.P +PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is +needed is returned via \fIoutlengthptr\fP. Note that this does not happen by +default. 
+.P +PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the +replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE +(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket +not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group +substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it +started, which can happen if \eK is used in an assertion). +.P +As for all PCRE2 errors, a text message that describes the error can be +obtained by calling the \fBpcre2_get_error_message()\fP function (see +"Obtaining a textual error message" +.\" HTML +.\" +above). +.\" . . .SH "DUPLICATE SUBPATTERN NAMES" @@ -2686,14 +3068,14 @@ first and last entries in the name-to-number table for the given name, and the function returns the length of each entry in code units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. .P -The format of the name table is described above in the section entitled -\fIInformation about a pattern\fP +The format of the name table is described .\" HTML .\" -above. +above .\" -Given all the relevant entries for the name, you can extract each of their -numbers, and hence the captured data. +in the section entitled \fIInformation about a pattern\fP. Given all the +relevant entries for the name, you can extract each of their numbers, and hence +the captured data. . . .SH "FINDING ALL POSSIBLE MATCHES AT ONE POSITION" @@ -2888,8 +3270,8 @@ There are in addition the following errors that are specific to PCRE2_ERROR_DFA_UITEM .sp This return is given if \fBpcre2_dfa_match()\fP encounters an item in the -pattern that it does not support, for instance, the use of \eC or a back -reference. +pattern that it does not support, for instance, the use of \eC in a UTF mode or +a back reference. .sp PCRE2_ERROR_DFA_UCOND .sp @@ -2939,6 +3321,6 @@ Cambridge, England. 
.rs .sp .nf -Last updated: 22 April 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 23 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2build.3 b/pcre2/doc/pcre2build.3 index 8f74e9b6b..ea9d8a97b 100644 --- a/pcre2/doc/pcre2build.3 +++ b/pcre2/doc/pcre2build.3 @@ -1,4 +1,4 @@ -.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20" +.TH PCRE2BUILD 3 "01 November 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) . @@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP). -.P +. +. +.SH "DISABLING THE USE OF \eC" +.rs +.sp The \eC escape sequence, which matches a single code unit, even in a UTF mode, can cause unpredictable behaviour because it may leave the current matching -point in the middle of a multi-code-unit character. It can be locked out by -setting the PCRE2_NEVER_BACKSLASH_C option. +point in the middle of a multi-code-unit character. The application can lock it +out by setting the PCRE2_NEVER_BACKSLASH_C option when calling +\fBpcre2_compile()\fP. There is also a build-time option +.sp + --enable-never-backslash-C +.sp +(note the upper case C) which locks out the use of \eC entirely. . . .SH "JUST-IN-TIME COMPILER SUPPORT" @@ -343,6 +352,19 @@ and equivalent run-time options, refer to these character values in an EBCDIC environment. . . +.SH "PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS" +.rs +.sp +By default, on non-Windows systems, \fBpcre2grep\fP supports the use of +callouts with string arguments within the patterns it is matching, in order to +run external scripts. For details, see the +.\" HREF +\fBpcre2grep\fP +.\" +documentation. This support can be disabled by adding +--disable-pcre2grep-callout to the \fBconfigure\fP command. +. 
+. .SH "PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT" .rs .sp @@ -363,16 +385,19 @@ they are not. .sp \fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it -finds a match. The size of the buffer is controlled by a parameter whose -default value is 20K. The buffer itself is three times this size, but because -of the way it is used for holding "before" lines, the longest line that is -guaranteed to be processable is the parameter size. You can change the default -parameter value by adding, for example, +finds a match. The starting size of the buffer is controlled by a parameter +whose default value is 20K. The buffer itself is three times this size, but +because of the way it is used for holding "before" lines, the longest line that +is guaranteed to be processable is the parameter size. If a longer line is +encountered, \fBpcre2grep\fP automatically expands the buffer, up to a +specified maximum size, whose default is 1M or the starting size, whichever is +the larger. You can change the default parameter values by adding, for example, .sp - --with-pcre2grep-bufsize=50K + --with-pcre2grep-bufsize=51200 + --with-pcre2grep-max-bufsize=2097152 .sp -to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this -value by using --buffer-size on the command line.. +to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override +these values by using --buffer-size and --max-buffer-size on the command line. . . .SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT" @@ -490,6 +515,28 @@ information about code coverage, see the \fBgcov\fP and \fBlcov\fP documentation. . . +.SH "SUPPORT FOR FUZZERS" +.rs +.sp +There is a special option for use by people who want to run fuzzing tests on +PCRE2: +.sp + --enable-fuzz-support +.sp +At present this applies only to the 8-bit library. 
If set, it causes an extra +library called libpcre2-fuzzsupport.a to be built, but not installed. This +contains a single function called LLVMFuzzerTestOneInput() whose arguments are +a pointer to a string and the length of the string. When called, this function +tries to compile the string as a pattern, and if that succeeds, to match it. +This is done both with no options and with some random options bits that are +generated from the string. Setting --enable-fuzz-support also causes a binary +called \fBpcre2fuzzcheck\fP to be created. This is normally run under valgrind +or used when PCRE2 is compiled with address sanitizing enabled. It calls the +fuzzing function and outputs information about what it is doing. The input strings +are specified by arguments: if an argument starts with "=" the rest of it is a +literal input string. Otherwise, it is assumed to be a file name, and the +contents of the file are the test string. +. .SH "SEE ALSO" .rs .sp @@ -510,6 +557,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 24 April 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 01 November 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2callout.3 b/pcre2/doc/pcre2callout.3 index 6919f5a61..001796d68 100644 --- a/pcre2/doc/pcre2callout.3 +++ b/pcre2/doc/pcre2callout.3 @@ -1,4 +1,4 @@ -.TH PCRE2CALLOUT 3 "23 March 2015" "PCRE2 10.20" +.TH PCRE2CALLOUT 3 "29 September 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -40,11 +40,20 @@ two callout points: .sp If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 automatically inserts callouts, all with number 255, before each item in the -pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern +pattern except for immediately before or after a callout item in the pattern. 
+For example, if PCRE2_AUTO_CALLOUT is used with the pattern +.sp + A(?C3)B +.sp +it is processed as if it were +.sp + (?C255)A(?C3)B(?C255) +.sp +Here is a more complicated example: .sp A(\ed{2}|--) .sp -it is processed as if it were +With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were .sp (?C255)A(?C255)((?C255)\ed{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255) .sp @@ -91,10 +100,10 @@ with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string No match .sp This indicates that when matching [bc] fails, there is no backtracking into a+ -and therefore the callouts that would be taken for the backtracks do not occur. -You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to -\fBpcre2_compile()\fP, or starting the pattern with (*NO_AUTO_POSSESS). In this -case, the output changes to this: +(because it is being treated as a++) and therefore the callouts that would be +taken for the backtracks do not occur. You can disable the auto-possessify +feature by passing PCRE2_NO_AUTO_POSSESS to \fBpcre2_compile()\fP, or starting +the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this: .sp --->aaaa +0 ^ a+ @@ -220,8 +229,8 @@ but the intention is never to remove any of the existing fields. .sp For a numerical callout, \fIcallout_string\fP is NULL, and \fIcallout_number\fP contains the number of the callout, in the range 0-255. This is the number -that follows (?C for manual callouts; it is 255 for automatically generated -callouts. +that follows (?C for callouts that are part of the pattern; it is 255 for +automatically generated callouts. . . .SS "Fields for string callouts" @@ -286,10 +295,15 @@ The \fIpattern_position\fP field contains the offset in the pattern string to the next item to be matched. .P The \fInext_item_length\fP field contains the length of the next item to be -matched in the pattern string. 
When the callout immediately precedes an -alternation bar, a closing parenthesis, or the end of the pattern, the length -is zero. When the callout precedes an opening parenthesis, the length is that -of the entire subpattern. +processed in the pattern string. When the callout is at the end of the pattern, +the length is zero. When the callout precedes an opening parenthesis, the +length includes meta characters that follow the parenthesis. For example, in a +callout before an assertion such as (?=ab) the length is 3. For an +alternation bar or a closing parenthesis, the length is one, unless a closing +parenthesis is followed by a quantifier, in which case its length is included. +(This changed in release 10.23. In earlier releases, before an opening +parenthesis the length was that of the entire subpattern, and before an +alternation bar or a closing parenthesis the length was zero.) .P The \fIpattern_position\fP and \fInext_item_length\fP fields are intended to help in distinguishing between different automatic callouts, which all have the @@ -382,6 +396,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 March 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 29 September 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2compat.3 b/pcre2/doc/pcre2compat.3 index a3306d782..64d0df3d3 100644 --- a/pcre2/doc/pcre2compat.3 +++ b/pcre2/doc/pcre2compat.3 @@ -1,4 +1,4 @@ -.TH PCRE2COMPAT 3 "15 March 2015" "PCRE2 10.20" +.TH PCRE2COMPAT 3 "18 October 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "DIFFERENCES BETWEEN PCRE2 AND PERL" @@ -96,7 +96,7 @@ processed as anchored at the point where they are tested. one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). 
Perl's behaviour is more complex; in many cases it is the -same as PCRE2, but there are examples where it differs. +same as PCRE2, but there are cases where it differs. .P 11. Most backtracking verbs in assertions have their normal actions. They are not confined to the assertion. @@ -109,17 +109,18 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to 13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern names is not as general as Perl's. This is a consequence of the fact the PCRE2 works internally just with numbers, using an external table to translate -between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b)B), +between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two capturing parentheses have the same number but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which parentheses matched, because both names map to capturing subpattern number 1. To avoid this confusing situation, an error is given at compile time. .P -14. Perl recognizes comments in some places that PCRE2 does not, for example, -between the ( and ? at the start of a subpattern. If the /x modifier is set, -Perl allows white space between ( and ? (though current Perls warn that this is -deprecated) but PCRE2 never does, even if the PCRE2_EXTENDED option is set. +14. Perl used to recognize comments in some places that PCRE2 does not, for +example, between the ( and ? at the start of a subpattern. If the /x modifier +is set, Perl allowed white space between ( and ? though the latest Perls give +an error (for a while it was just deprecated). There may still be some cases +where Perl behaves differently. .P 15. Perl, when in warning mode, gives warnings for character classes such as [A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no 
PCRE2 has no @@ -141,33 +142,37 @@ list is with respect to Perl 5.10: each alternative branch of a lookbehind assertion can match a different length of string. Perl requires them all to have the same length. .sp -(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ +(b) From PCRE2 10.23, back references to groups of fixed length are supported +in lookbehinds, provided that there is no possibility of referencing a +non-unique number or name. Perl does not support backreferences in lookbehinds. +.sp +(c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $ meta-character matches only at the very end of the string. .sp -(c) A backslash followed by a letter with no special meaning is faulted. (Perl +(d) A backslash followed by a letter with no special meaning is faulted. (Perl can be made to issue a warning.) .sp -(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is +(e) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is inverted, that is, by default they are not greedy, but if followed by a question mark they are. .sp -(e) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried +(f) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried only at the first matching position in the subject string. .sp -(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and +(g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents. .sp -(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF +(h) The \eR escape sequence can be restricted to match only CR, LF, or CRLF by the PCRE2_BSR_ANYCRLF option. .sp -(h) The callout facility is PCRE2-specific. +(i) The callout facility is PCRE2-specific. .sp -(i) The partial matching facility is PCRE2-specific. +(j) The partial matching facility is PCRE2-specific. 
.sp -(j) The alternative matching function (\fBpcre2_dfa_match()\fP matches in a +(k) The alternative matching function (\fBpcre2_dfa_match()\fP matches in a different way and is not Perl-compatible. .sp -(k) PCRE2 recognizes some special sequences such as (*CR) at the start of +(l) PCRE2 recognizes some special sequences such as (*CR) at the start of a pattern that set overall options that cannot be changed within the pattern. . . @@ -185,6 +190,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 15 March 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 18 October 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2demo.3 b/pcre2/doc/pcre2demo.3 index 5deed0a05..c02dcd95c 100644 --- a/pcre2/doc/pcre2demo.3 +++ b/pcre2/doc/pcre2demo.3 @@ -20,28 +20,31 @@ *************************************************/ /* This is a demonstration program to illustrate a straightforward way of -calling the PCRE2 regular expression library from a C program. See the +using the PCRE2 regular expression library from a C program. See the pcre2sample documentation for a short discussion ("man pcre2sample" if you have the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is incompatible with the original PCRE API. There are actually three libraries, each supporting a different code unit -width. This demonstration program uses the 8-bit library. +width. This demonstration program uses the 8-bit library. The default is to +process each code unit as a separate character, but if the pattern begins with +"(*UTF)", both it and the subject are treated as UTF-8 strings, where +characters may occupy multiple code units. 
In Unix-like environments, if PCRE2 is installed in your standard system libraries, you should be able to compile this program using this command: -gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo +cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo If PCRE2 is not installed in a standard place, it is likely to be installed with support for the pkg-config mechanism. If you have pkg-config, you can compile this program using this command: -gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo +cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo -If you do not have pkg-config, you may have to use this: +If you do not have pkg-config, you may have to use something like this: -gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e +cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e -R/usr/local/lib -lpcre2-8 -o pcre2demo Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and @@ -56,9 +59,14 @@ the following line. */ /* #define PCRE2_STATIC */ -/* This macro must be defined before including pcre2.h. For a program that uses -only one code unit width, it makes it possible to use generic function names -such as pcre2_compile(). */ +/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. +For a program that uses only one code unit width, setting it to 8, 16, or 32 +makes it possible to use generic function names such as pcre2_compile(). Note +that just changing 8 to 16 (for example) is not sufficient to convert this +program to process 16-bit characters. Even in a fully 16-bit environment, where +string-handling functions such as strcmp() and printf() work with 16-bit +characters, the code for handling the table of named substrings will still need +to be modified. 
*/ #define PCRE2_CODE_UNIT_WIDTH 8 @@ -79,19 +87,19 @@ int main(int argc, char **argv) { pcre2_code *re; PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ -PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ +PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ PCRE2_SPTR name_table; int crlf_is_newline; int errornumber; int find_all; int i; -int namecount; -int name_entry_size; int rc; int utf8; uint32_t option_bits; +uint32_t namecount; +uint32_t name_entry_size; uint32_t newline; PCRE2_SIZE erroroffset; @@ -106,15 +114,19 @@ pcre2_match_data *match_data; * First, sort out the command line. There is only one possible option at * * the moment, "-g" to request repeated matching to find all occurrences, * * like Perl's /g option. We set the variable find_all to a non-zero value * -* if the -g option is present. Apart from that, there must be exactly two * -* arguments. * +* if the -g option is present. * **************************************************************************/ find_all = 0; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-g") == 0) find_all = 1; - else break; + else if (argv[i][0] == '-') + { + printf("Unrecognised option %s\en", argv[i]); + return 1; + } + else break; } /* After the options, we require exactly two arguments, which are the pattern, @@ -122,7 +134,7 @@ and the subject string. */ if (argc - i != 2) { - printf("Two arguments required: a regex and a subject string\en"); + printf("Exactly two arguments required: a regex and a subject string\en"); return 1; } @@ -201,7 +213,7 @@ if (rc < 0) stored. */ ovector = pcre2_get_ovector_pointer(match_data); -printf("\enMatch succeeded at offset %d\en", (int)ovector[0]); +printf("Match succeeded at offset %d\en", (int)ovector[0]); /************************************************************************* @@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. 
*/ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ &namecount); /* where to put the answer */ -if (namecount <= 0) printf("No named substrings\en"); else +if (namecount == 0) printf("No named substrings\en"); else { PCRE2_SPTR tabptr; printf("Named substrings\en"); @@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY || for (;;) { - uint32_t options = 0; /* Normally no options */ - PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ + uint32_t options = 0; /* Normally no options */ + PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ /* If the previous match was for an empty string, we are finished if we are at the end of the subject. Otherwise, arrange to run another match at the @@ -371,7 +383,7 @@ for (;;) { if (options == 0) break; /* All matches found */ ovector[1] = start_offset + 1; /* Advance one code unit */ - if (crlf_is_newline && /* If CRLF is newline & */ + if (crlf_is_newline && /* If CRLF is a newline & */ start_offset < subject_length - 1 && /* we are at CRLF, */ subject[start_offset] == '\er' && subject[start_offset + 1] == '\en') @@ -417,7 +429,7 @@ for (;;) printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); } - if (namecount <= 0) printf("No named substrings\en"); else + if (namecount == 0) printf("No named substrings\en"); else { PCRE2_SPTR tabptr = name_table; printf("Named substrings\en"); diff --git a/pcre2/doc/pcre2grep.1 b/pcre2/doc/pcre2grep.1 index 028a91e4e..80e8899e7 100644 --- a/pcre2/doc/pcre2grep.1 +++ b/pcre2/doc/pcre2grep.1 @@ -1,4 +1,4 @@ -.TH PCRE2GREP 1 "03 January 2015" "PCRE2 10.00" +.TH PCRE2GREP 1 "31 December 2016" "PCRE2 10.23" .SH NAME pcre2grep - a grep with Perl-compatible regular expressions. .SH SYNOPSIS @@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the \fB-N\fP (\fB--newline\fP) option. 
.P The amount of memory used for buffering files that are being scanned is -controlled by a parameter that can be set by the \fB--buffer-size\fP option. -The default value for this parameter is specified when \fBpcre2grep\fP is -built, with the default default being 20K. A block of memory three times this -size is used (to allow for buffering "before" and "after" lines). An error -occurs if a line overflows the buffer. +controlled by parameters that can be set by the \fB--buffer-size\fP and +\fB--max-buffer-size\fP options. The first of these sets the size of buffer +that is obtained at the start of processing. If an input file contains very +long lines, a larger buffer may be needed; this is handled by automatically +extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The +default values for these parameters are specified when \fBpcre2grep\fP is +built, with the default defaults being 20K and 1M respectively. An error occurs +if a line is too long and the buffer can no longer be expanded. +.P +The block of memory that is actually used is three times the "buffer size", to +allow for buffering "before" and "after" lines. If the buffer size is too +small, fewer than requested "before" and "after" lines may be output. .P Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater. BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern @@ -126,24 +133,27 @@ command line starts with a hyphen but is not an option. This allows for the processing of patterns and file names that start with hyphens. .TP \fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP -Output \fInumber\fP lines of context after each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of \fInumber\fP is expected to be relatively small. 
However, \fBpcre2grep\fP -guarantees to have up to 8K of following text available for context output. +Output up to \fInumber\fP lines of context after each matching line. Fewer +lines are output if the next match or the end of the file is reached, or if the +processing buffer size has been set too small. If file names and/or line +numbers are being output, a hyphen separator is used instead of a colon for the +context lines. A line containing "--" is output between each group of lines, +unless they are in fact contiguous in the input file. The value of \fInumber\fP +is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored. .TP \fB-a\fP, \fB--text\fP Treat binary files as text. This is equivalent to \fB--binary-files\fP=\fItext\fP. .TP \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP -Output \fInumber\fP lines of context before each matching line. If file names -and/or line numbers are being output, a hyphen separator is used instead of a -colon for the context lines. A line containing "--" is output between each -group of lines, unless they are in fact contiguous in the input file. The value -of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP -guarantees to have up to 8K of preceding text available for context output. +Output up to \fInumber\fP lines of context before each matching line. Fewer +lines are output if the previous match or the start of the file is within +\fInumber\fP lines, or if the processing buffer size has been set too small. If +file names and/or line numbers are being output, a hyphen separator is used +instead of a colon for the context lines. A line containing "--" is output +between each group of lines, unless they are in fact contiguous in the input +file. The value of \fInumber\fP is expected to be relatively small. When +\fB-c\fP is used, \fB-B\fP is ignored. .TP \fB--binary-files=\fP\fIword\fP Specify how binary files are to be processed. 
If the word is "binary" (the @@ -158,8 +168,9 @@ be of interest and are skipped without causing any output or affecting the return code. .TP \fB--buffer-size=\fP\fInumber\fP -Set the parameter that controls how much memory is used for buffering files -that are being scanned. +Set the parameter that controls how much memory is obtained at the start of +processing for buffering files that are being scanned. See also +\fB--max-buffer-size\fP below. .TP \fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP Output \fInumber\fP lines of context both before and after each matching line. @@ -167,13 +178,15 @@ This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value. .TP \fB-c\fP, \fB--count\fP Do not output lines from the files that are being scanned; instead output the -number of matches (or non-matches if \fB-v\fP is used) that would otherwise -have caused lines to be shown. By default, this count is the same as the number -of suppressed lines, but if the \fB-M\fP (multiline) option is used (without -\fB-v\fP), there may be more suppressed lines than the number of matches. +number of lines that would have been shown, either because they matched, or, if +\fB-v\fP is set, because they failed to match. By default, this count is +exactly the same as the number of lines that would have been output, but if the +\fB-M\fP (multiline) option is used (without \fB-v\fP), there may be more +suppressed lines than the count (that is, the number of matches). .sp If no lines are selected, the number zero is output. If several files are are -being scanned, a count is output for each of them. However, if the +being scanned, a count is output for each of them and the \fB-t\fP option can +be used to cause a total to be output at the end. However, if the \fB--files-with-matches\fP option is also used, only those files whose counts are greater than zero are listed. When \fB-c\fP is used, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. 
@@ -192,12 +205,22 @@ connected to a terminal. More resources are used when colouring is enabled, because \fBpcre2grep\fP has to search for all possible matches in a line, not just one, in order to colour them all. .sp -The colour that is used can be specified by setting the environment variable -PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a -string of two numbers, separated by a semicolon. They are copied directly into -the control string for setting colour on a terminal, so it is your -responsibility to ensure that they make sense. If neither of the environment -variables is set, the default is "1;31", which gives red. +The colour that is used can be specified by setting one of the environment +variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or +PCREGREP_COLOR, which are checked in that order. If none of these are set, +\fBpcre2grep\fP looks for GREP_COLORS or GREP_COLOR (in that order). The value +of the variable should be a string of two numbers, separated by a semicolon, +except in the case of GREP_COLORS, which must start with "ms=" or "mt=" +followed by two semicolon-separated colours, terminated by the end of the +string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is +ignored, and GREP_COLOR is checked. +.sp +If the string obtained from one of the above variables contains any characters +other than semicolon or digits, the setting is ignored and the default colour +is used. The string is copied directly into the control string for setting +colour on a terminal, so it is your responsibility to ensure that the values +make sense. If no relevant environment variable is set, the default is "1;31", +which gives red. .TP \fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP If an input path is not a regular file or a directory, "action" specifies how @@ -273,17 +296,17 @@ files; it does not apply to patterns specified by any of the \fB--include\fP or \fB--exclude\fP options. 
.TP \fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP -Read patterns from the file, one per line, and match them against -each line of input. What constitutes a newline when reading the file is the -operating system's default. The \fB--newline\fP option has no effect on this -option. Trailing white space is removed from each line, and blank lines are -ignored. An empty file contains no patterns and therefore matches nothing. See -also the comments about multiple patterns versus a single pattern with -alternatives in the description of \fB-e\fP above. +Read patterns from the file, one per line, and match them against each line of +input. What constitutes a newline when reading the file is the operating +system's default. The \fB--newline\fP option has no effect on this option. +Trailing white space is removed from each line, and blank lines are ignored. An +empty file contains no patterns and therefore matches nothing. See also the +comments about multiple patterns versus a single pattern with alternatives in +the description of \fB-e\fP above. .sp -If this option is given more than once, all the specified files are -read. A data line is output if any of the patterns match it. A file name can -be given as "-" to refer to the standard input. When \fB-f\fP is used, patterns +If this option is given more than once, all the specified files are read. A +data line is output if any of the patterns match it. A file name can be given +as "-" to refer to the standard input. When \fB-f\fP is used, patterns specified on the command line using \fB-e\fP may also be present; they are tested before the file's patterns. However, no other pattern is taken from the command line; all arguments are treated as the names of paths to be searched. @@ -432,18 +455,25 @@ of use only if it is set smaller than \fB--match-limit\fP. There are no short forms for these options. The default settings are specified when the PCRE2 library is compiled, with the default default being 10 million. 
.TP +\fB--max-buffer-size=\fInumber\fP +This limits the expansion of the processing buffer, whose initial size can be +set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no +smaller than the starting buffer size. +.TP \fB-M\fP, \fB--multiline\fP -Allow patterns to match more than one line. When this option is given, patterns -may usefully contain literal newline characters and internal occurrences of ^ -and $ characters. The output for a successful match may consist of more than -one line. The first is the line in which the match started, and the last is the -line in which the match ended. If the matched string ends with a newline -sequence the output ends at the end of that line. +Allow patterns to match more than one line. When this option is set, the PCRE2 +library is called in "multiline" mode. This allows a matched string to extend +past the end of a line and continue on one or more subsequent lines. Patterns +used with \fB-M\fP may usefully contain literal newline characters and internal +occurrences of ^ and $ characters. The output for a successful match may +consist of more than one line. The first line is the line in which the match +started, and the last line is the line in which the match ended. If the matched +string ends with a newline sequence, the output ends at the end of that line. +If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a +match has been handled, scanning restarts at the beginning of the line after +the one in which the match ended. .sp -When this option is set, the PCRE2 library is called in "multiline" mode. -However, \fBpcre2grep\fP still processes the input line by line. The difference -is that a matched string may extend past the end of a line and continue on -one or more subsequent lines. The newline sequence must be matched as part of +The newline sequence that separates multiple lines must be matched as part of the pattern. 
For example, to find the phrase "regular expression" in a file where "regular" might be at the end of a line and "expression" at the start of the next line, you could use this command: @@ -455,11 +485,8 @@ and is followed by + so as to match trailing white space on the first line as well as possibly handling a two-character newline sequence. .sp There is a limit to the number of lines that can be matched, imposed by the way -that \fBpcre2grep\fP buffers the input file as it scans it. However, -\fBpcre2grep\fP ensures that at least 8K characters or the rest of the file -(whichever is the shorter) are available for forward matching, and similarly -the previous 8K characters (or all the previous characters, if fewer than 8K) -are guaranteed to be available for lookbehind assertions. The \fB-M\fP option +that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently +large processing buffer, this should not be a problem, but the \fB-M\fP option does not work when input is read line by line (see \fP--line-buffered\fP.) .TP \fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP @@ -502,12 +529,13 @@ It should never be needed in normal use. Show only the part of the line that matched a pattern instead of the whole line. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is more than one match in a line, each -of them is shown separately. If \fB-o\fP is combined with \fB-v\fP (invert the -sense of the match to find non-matching lines), no output is generated, but the -return code is set appropriately. If the matched portion of the line is empty, -nothing is output unless the file name or line number are being printed, in -which case they are shown on an otherwise empty line. This option is mutually -exclusive with \fB--file-offsets\fP and \fB--line-offsets\fP. +of them is shown separately, on a separate line of output. 
If \fB-o\fP is +combined with \fB-v\fP (invert the sense of the match to find non-matching +lines), no output is generated, but the return code is set appropriately. If +the matched portion of the line is empty, nothing is output unless the file +name or line number are being printed, in which case they are shown on an +otherwise empty line. This option is mutually exclusive with +\fB--file-offsets\fP and \fB--line-offsets\fP. .TP \fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP Show only the part of the line that matched the capturing parentheses of the @@ -519,10 +547,11 @@ for the non-argument case above also apply to this case. If the specified capturing parentheses do not exist in the pattern, or were not set in the match, nothing is output unless the file name or line number are being output. .sp -If this option is given multiple times, multiple substrings are output, in the -order the options are given. For example, -o3 -o1 -o3 causes the substrings -matched by capturing parentheses 3 and 1 and then 3 again to be output. By -default, there is no separator (but see the next option). +If this option is given multiple times, multiple substrings are output for each +match, in the order the options are given, and all on one line. For example, +-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and +then 3 again to be output. By default, there is no separator (but see the next +option). .TP \fB--om-separator\fP=\fItext\fP Specify a separating string for multiple occurrences of \fB-o\fP. The default @@ -547,6 +576,17 @@ Suppress error messages about non-existent or unreadable files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. .TP +\fB-t\fP, \fB--total-count\fP +This option is useful when scanning more than one file. 
If used on its own, +\fB-t\fP suppresses all output except for a grand total number of matching +lines (or non-matching lines if \fB-v\fP is used) in all the files. If \fB-t\fP +is used with \fB-c\fP, a grand total is output except when the previous output +is just one line. In other words, it is not output when just one file's count +is listed. If file names are being output, the grand total is preceded by +"TOTAL:". Otherwise, it appears as just another number. The \fB-t\fP option is +ignored when used with \fB-L\fP (list files without matches), because the grand +total would always be zero. +.TP \fB-u\fP, \fB--utf-8\fP Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including those for any \fB--exclude\fP and @@ -570,11 +610,12 @@ specified by any of the \fB--include\fP or \fB--exclude\fP options. .TP \fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP Force the patterns to be anchored (each must start matching at the beginning of -a line) and in addition, require them to match entire lines. This is equivalent -to having ^ and $ characters at the start and end of each alternative top-level -branch in every pattern. This option applies only to the patterns that are -matched against the contents of files; it does not apply to patterns specified -by any of the \fB--include\fP or \fB--exclude\fP options. +a line) and in addition, require them to match entire lines. In multiline mode +the match may be more than one line. This is equivalent to having \eA and \eZ +characters at the start and end of each alternative top-level branch in every +pattern. This option applies only to the patterns that are matched against the +contents of files; it does not apply to patterns specified by any of the +\fB--include\fP or \fB--exclude\fP options. . . .SH "ENVIRONMENT VARIABLES" @@ -653,6 +694,58 @@ options does have data, it must be given in the first form, using an equals character. 
Otherwise \fBpcre2grep\fP will assume that it has no data. . . +.SH "CALLING EXTERNAL SCRIPTS" +.rs +.sp +\fBpcre2grep\fP has, by default, support for calling external programs or +scripts during matching by making use of PCRE2's callout facility. However, +this support can be disabled when \fBpcre2grep\fP is built. You can find out +whether your binary has support for callouts by running it with the \fB--help\fP +option. If the support is not enabled, all callouts in patterns are ignored by +\fBpcre2grep\fP. +.P +A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is +either a number or a quoted string (see the +.\" HREF +\fBpcre2callout\fP +.\" +documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP. +String arguments are parsed as a list of substrings separated by pipe (vertical +bar) characters. The first substring must be an executable name, with the +following substrings specifying arguments: +.sp + executable_name|arg1|arg2|... +.sp +Any substring (including the executable name) may contain escape sequences +started by a dollar character: $<digits> or ${<digits>} is replaced by the +captured substring of the given decimal number, which must be greater than +zero. If the number is greater than the number of capturing substrings, or if +the capture is unset, the replacement is empty. +.P +Any other character is substituted by itself. In particular, $$ is replaced by +a single dollar and $| is replaced by a pipe character. Here is an example: +.sp + echo -e "abcde\en12345" | pcre2grep \e + '(?x)(.)(..(.)) + (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' - +.sp + Output: +.sp + Arg1: [a] [bcd] [d] Arg2: |a| () + abcde + Arg1: [1] [234] [4] Arg2: |1| () + 12345 +.sp +The parameters for the \fBexecv()\fP system call that is used to run the +program or script are zero-terminated strings. 
This means that binary zero +characters in the callout argument will cause premature termination of their +substrings, and therefore should not be present. Any syntax errors in the +string (for example, a dollar not followed by another character) cause the +callout to be ignored. If running the program fails for any reason (including +the non-existence of the executable), a local matching failure occurs and the +matcher backtracks in the normal way. +. +. .SH "MATCHING ERRORS" .rs .sp @@ -683,7 +776,7 @@ affect the return code. .SH "SEE ALSO" .rs .sp -\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3). +\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3). . . .SH AUTHOR @@ -700,6 +793,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 03 January 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 31 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2grep.txt b/pcre2/doc/pcre2grep.txt index 29cd75cfa..76c9cc1f3 100644 --- a/pcre2/doc/pcre2grep.txt +++ b/pcre2/doc/pcre2grep.txt @@ -51,103 +51,115 @@ DESCRIPTION boundary is controlled by the -N (--newline) option. The amount of memory used for buffering files that are being scanned is - controlled by a parameter that can be set by the --buffer-size option. - The default value for this parameter is specified when pcre2grep is - built, with the default default being 20K. A block of memory three - times this size is used (to allow for buffering "before" and "after" - lines). An error occurs if a line overflows the buffer. + controlled by parameters that can be set by the --buffer-size and + --max-buffer-size options. The first of these sets the size of buffer + that is obtained at the start of processing. If an input file contains + very long lines, a larger buffer may be needed; this is handled by + automatically extending the buffer, up to the limit specified by --max- + buffer-size. 
The default values for these parameters are specified when + pcre2grep is built, with the default defaults being 20K and 1M respec- + tively. An error occurs if a line is too long and the buffer can no + longer be expanded. - Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the - greater. BUFSIZ is defined in <stdio.h>. When there is more than one + The block of memory that is actually used is three times the "buffer + size", to allow for buffering "before" and "after" lines. If the buffer + size is too small, fewer than requested "before" and "after" lines may + be output. + + Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the + greater. BUFSIZ is defined in <stdio.h>. When there is more than one pattern (specified by the use of -e and/or -f), each pattern is applied - to each line in the order in which they are defined, except that all + to each line in the order in which they are defined, except that all the -e patterns are tried before the -f patterns. - By default, as soon as one pattern matches a line, no further patterns + By default, as soon as one pattern matches a line, no further patterns are considered. However, if --colour (or --color) is used to colour the - matching substrings, or if --only-matching, --file-offsets, or --line- - offsets is used to output only the part of the line that matched + matching substrings, or if --only-matching, --file-offsets, or --line- + offsets is used to output only the part of the line that matched (either shown literally, or as an offset), scanning resumes immediately - following the match, so that further matches on the same line can be - found. If there are multiple patterns, they are all tried on the - remainder of the line, but patterns that follow the one that matched + following the match, so that further matches on the same line can be + found. 
If there are multiple patterns, they are all tried on the + remainder of the line, but patterns that follow the one that matched are not tried on the earlier part of the line. - This behaviour means that the order in which multiple patterns are - specified can affect the output when one of the above options is used. - This is no longer the same behaviour as GNU grep, which now manages to - display earlier matches for later patterns (as long as there is no + This behaviour means that the order in which multiple patterns are + specified can affect the output when one of the above options is used. + This is no longer the same behaviour as GNU grep, which now manages to + display earlier matches for later patterns (as long as there is no overlap). - Patterns that can match an empty string are accepted, but empty string + Patterns that can match an empty string are accepted, but empty string matches are never recognized. An example is the pattern - "(super)?(man)?", in which all components are optional. This pattern - finds all occurrences of both "super" and "man"; the output differs - from matching with "super|man" when only the matching substrings are + "(super)?(man)?", in which all components are optional. This pattern + finds all occurrences of both "super" and "man"; the output differs + from matching with "super|man" when only the matching substrings are being shown. - If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses + If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses the value to set a locale when calling the PCRE2 library. The --locale option can be used to override this. SUPPORT FOR COMPRESSED FILES - It is possible to compile pcre2grep so that it uses libz or libbz2 to - read files whose names end in .gz or .bz2, respectively. You can find + It is possible to compile pcre2grep so that it uses libz or libbz2 to + read files whose names end in .gz or .bz2, respectively. 
You can find out whether your binary has support for one or both of these file types by running it with the --help option. If the appropriate support is not - present, files are treated as plain text. The standard input is always + present, files are treated as plain text. The standard input is always so treated. BINARY FILES - By default, a file that contains a binary zero byte within the first - 1024 bytes is identified as a binary file, and is processed specially. - (GNU grep also identifies binary files in this manner.) See the - --binary-files option for a means of changing the way binary files are + By default, a file that contains a binary zero byte within the first + 1024 bytes is identified as a binary file, and is processed specially. + (GNU grep also identifies binary files in this manner.) See the + --binary-files option for a means of changing the way binary files are handled. OPTIONS - The order in which some of the options appear can affect the output. - For example, both the -h and -l options affect the printing of file - names. Whichever comes later in the command line will be the one that - takes effect. Similarly, except where noted below, if an option is - given twice, the later setting is used. Numerical values for options - may be followed by K or M, to signify multiplication by 1024 or + The order in which some of the options appear can affect the output. + For example, both the -h and -l options affect the printing of file + names. Whichever comes later in the command line will be the one that + takes effect. Similarly, except where noted below, if an option is + given twice, the later setting is used. Numerical values for options + may be followed by K or M, to signify multiplication by 1024 or 1024*1024 respectively. -- This terminates the list of options. It is useful if the next - item on the command line starts with a hyphen but is not an - option. 
This allows for the processing of patterns and file + item on the command line starts with a hyphen but is not an + option. This allows for the processing of patterns and file names that start with hyphens. -A number, --after-context=number - Output number lines of context after each matching line. If - file names and/or line numbers are being output, a hyphen - separator is used instead of a colon for the context lines. A - line containing "--" is output between each group of lines, - unless they are in fact contiguous in the input file. The - value of number is expected to be relatively small. However, - pcre2grep guarantees to have up to 8K of following text - available for context output. + Output up to number lines of context after each matching + line. Fewer lines are output if the next match or the end of + the file is reached, or if the processing buffer size has + been set too small. If file names and/or line numbers are + being output, a hyphen separator is used instead of a colon + for the context lines. A line containing "--" is output + between each group of lines, unless they are in fact contigu- + ous in the input file. The value of number is expected to be + relatively small. When -c is used, -A is ignored. -a, --text Treat binary files as text. This is equivalent to --binary- files=text. -B number, --before-context=number - Output number lines of context before each matching line. If - file names and/or line numbers are being output, a hyphen - separator is used instead of a colon for the context lines. A - line containing "--" is output between each group of lines, - unless they are in fact contiguous in the input file. The - value of number is expected to be relatively small. However, - pcre2grep guarantees to have up to 8K of preceding text - available for context output. + Output up to number lines of context before each matching + line. 
Fewer lines are output if the previous match or the + start of the file is within number lines, or if the process- + ing buffer size has been set too small. If file names and/or + line numbers are being output, a hyphen separator is used + instead of a colon for the context lines. A line containing + "--" is output between each group of lines, unless they are + in fact contiguous in the input file. The value of number is + expected to be relatively small. When -c is used, -B is + ignored. --binary-files=word Specify how binary files are to be processed. If the word is @@ -164,54 +176,68 @@ OPTIONS any output or affecting the return code. --buffer-size=number - Set the parameter that controls how much memory is used for - buffering files that are being scanned. + Set the parameter that controls how much memory is obtained + at the start of processing for buffering files that are being + scanned. See also --max-buffer-size below. -C number, --context=number - Output number lines of context both before and after each - matching line. This is equivalent to setting both -A and -B + Output number lines of context both before and after each + matching line. This is equivalent to setting both -A and -B to the same value. -c, --count - Do not output lines from the files that are being scanned; - instead output the number of matches (or non-matches if -v is - used) that would otherwise have caused lines to be shown. By - default, this count is the same as the number of suppressed - lines, but if the -M (multiline) option is used (without -v), - there may be more suppressed lines than the number of - matches. + Do not output lines from the files that are being scanned; + instead output the number of lines that would have been + shown, either because they matched, or, if -v is set, because + they failed to match. 
By default, this count is exactly the + same as the number of lines that would have been output, but + if the -M (multiline) option is used (without -v), there may + be more suppressed lines than the count (that is, the number + of matches). If no lines are selected, the number zero is output. If sev- eral files are are being scanned, a count is output for each - of them. However, if the --files-with-matches option is also - used, only those files whose counts are greater than zero are - listed. When -c is used, the -A, -B, and -C options are - ignored. + of them and the -t option can be used to cause a total to be + output at the end. However, if the --files-with-matches + option is also used, only those files whose counts are + greater than zero are listed. When -c is used, the -A, -B, + and -C options are ignored. --colour, --color If this option is given without any data, it is equivalent to - "--colour=auto". If data is required, it must be given in + "--colour=auto". If data is required, it must be given in the same shell item, separated by an equals sign. --colour=value, --color=value This option specifies under what circumstances the parts of a line that matched a pattern should be coloured in the output. - By default, the output is not coloured. The value (which is - optional, see above) may be "never", "always", or "auto". In - the latter case, colouring happens only if the standard out- - put is connected to a terminal. More resources are used when + By default, the output is not coloured. The value (which is + optional, see above) may be "never", "always", or "auto". In + the latter case, colouring happens only if the standard out- + put is connected to a terminal. More resources are used when colouring is enabled, because pcre2grep has to search for all - possible matches in a line, not just one, in order to colour + possible matches in a line, not just one, in order to colour them all. 
- The colour that is used can be specified by setting the envi- - ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The - value of this variable should be a string of two numbers, - separated by a semicolon. They are copied directly into the - control string for setting colour on a terminal, so it is - your responsibility to ensure that they make sense. If nei- - ther of the environment variables is set, the default is - "1;31", which gives red. + The colour that is used can be specified by setting one of + the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, + PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that + order. If none of these are set, pcre2grep looks for + GREP_COLORS or GREP_COLOR (in that order). The value of the + variable should be a string of two numbers, separated by a + semicolon, except in the case of GREP_COLORS, which must + start with "ms=" or "mt=" followed by two semicolon-separated + colours, terminated by the end of the string or by a colon. + If GREP_COLORS does not start with "ms=" or "mt=" it is + ignored, and GREP_COLOR is checked. + + If the string obtained from one of the above variables con- + tains any characters other than semicolon or digits, the set- + ting is ignored and the default colour is used. The string is + copied directly into the control string for setting colour on + a terminal, so it is your responsibility to ensure that the + values make sense. If no relevant environment variable is + set, the default is "1;31", which gives red. -D action, --devices=action If an input path is not a regular file or a directory, @@ -299,12 +325,12 @@ OPTIONS Read patterns from the file, one per line, and match them against each line of input. What constitutes a newline when reading the file is the operating system's default. The - --newline option has no effect on this option. Trailing white - space is removed from each line, and blank lines are ignored. 
- An empty file contains no patterns and therefore matches - nothing. See also the comments about multiple patterns versus - a single pattern with alternatives in the description of -e - above. + --newline option has no effect on this option. Trailing + white space is removed from each line, and blank lines are + ignored. An empty file contains no patterns and therefore + matches nothing. See also the comments about multiple pat- + terns versus a single pattern with alternatives in the + description of -e above. If this option is given more than once, all the specified files are read. A data line is output if any of the patterns @@ -482,96 +508,101 @@ OPTIONS tings are specified when the PCRE2 library is compiled, with the default default being 10 million. - -M, --multiline - Allow patterns to match more than one line. When this option - is given, patterns may usefully contain literal newline char- - acters and internal occurrences of ^ and $ characters. The - output for a successful match may consist of more than one - line. The first is the line in which the match started, and - the last is the line in which the match ended. If the matched - string ends with a newline sequence the output ends at the - end of that line. + --max-buffer-size=number + This limits the expansion of the processing buffer, whose + initial size can be set by --buffer-size. The maximum buffer + size is silently forced to be no smaller than the starting + buffer size. - When this option is set, the PCRE2 library is called in "mul- - tiline" mode. However, pcre2grep still processes the input - line by line. The difference is that a matched string may - extend past the end of a line and continue on one or more - subsequent lines. The newline sequence must be matched as - part of the pattern. 
For example, to find the phrase "regular - expression" in a file where "regular" might be at the end of - a line and "expression" at the start of the next line, you - could use this command: + -M, --multiline + Allow patterns to match more than one line. When this option + is set, the PCRE2 library is called in "multiline" mode. This + allows a matched string to extend past the end of a line and + continue on one or more subsequent lines. Patterns used with + -M may usefully contain literal newline characters and inter- + nal occurrences of ^ and $ characters. The output for a suc- + cessful match may consist of more than one line. The first + line is the line in which the match started, and the last + line is the line in which the match ended. If the matched + string ends with a newline sequence, the output ends at the + end of that line. If -v is set, none of the lines in a + multi-line match are output. Once a match has been handled, + scanning restarts at the beginning of the line after the one + in which the match ended. + + The newline sequence that separates multiple lines must be + matched as part of the pattern. For example, to find the + phrase "regular expression" in a file where "regular" might + be at the end of a line and "expression" at the start of the + next line, you could use this command: pcre2grep -M 'regular\s+expression' - The \s escape sequence matches any white space character, - including newlines, and is followed by + so as to match - trailing white space on the first line as well as possibly + The \s escape sequence matches any white space character, + including newlines, and is followed by + so as to match + trailing white space on the first line as well as possibly handling a two-character newline sequence. - There is a limit to the number of lines that can be matched, - imposed by the way that pcre2grep buffers the input file as - it scans it. 
However, pcre2grep ensures that at least 8K - characters or the rest of the file (whichever is the shorter) - are available for forward matching, and similarly the previ- - ous 8K characters (or all the previous characters, if fewer - than 8K) are guaranteed to be available for lookbehind asser- - tions. The -M option does not work when input is read line by - line (see --line-buffered.) + There is a limit to the number of lines that can be matched, + imposed by the way that pcre2grep buffers the input file as + it scans it. With a sufficiently large processing buffer, + this should not be a problem, but the -M option does not work + when input is read line by line (see --line-buffered.) -N newline-type, --newline=newline-type - The PCRE2 library supports five different conventions for - indicating the ends of lines. They are the single-character - sequences CR (carriage return) and LF (linefeed), the two- - character sequence CRLF, an "anycrlf" convention, which rec- - ognizes any of the preceding three types, and an "any" con- + The PCRE2 library supports five different conventions for + indicating the ends of lines. They are the single-character + sequences CR (carriage return) and LF (linefeed), the two- + character sequence CRLF, an "anycrlf" convention, which rec- + ognizes any of the preceding three types, and an "any" con- vention, in which any Unicode line ending sequence is assumed - to end a line. The Unicode sequences are the three just men- - tioned, plus VT (vertical tab, U+000B), FF (form feed, - U+000C), NEL (next line, U+0085), LS (line separator, + to end a line. The Unicode sequences are the three just men- + tioned, plus VT (vertical tab, U+000B), FF (form feed, + U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). - When the PCRE2 library is built, a default line-ending - sequence is specified. 
This is normally the standard + When the PCRE2 library is built, a default line-ending + sequence is specified. This is normally the standard sequence for the operating system. Unless otherwise specified - by this option, pcre2grep uses the library's default. The + by this option, pcre2grep uses the library's default. The possible values for this option are CR, LF, CRLF, ANYCRLF, or - ANY. This makes it possible to use pcre2grep to scan files + ANY. This makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- - ify their line endings. If the data that is being scanned - does not agree with the convention set by this option, - pcre2grep may behave in strange ways. Note that this option - does not apply to files specified by the -f, --exclude-from, - or --include-from options, which are expected to use the + ify their line endings. If the data that is being scanned + does not agree with the convention set by this option, + pcre2grep may behave in strange ways. Note that this option + does not apply to files specified by the -f, --exclude-from, + or --include-from options, which are expected to use the operating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- - lowed by a colon for matching lines or a hyphen for context + lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the - line number. When the -M option causes a pattern to match - more than one line, only the first is preceded by its line + line number. When the -M option causes a pattern to match + more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. 
- --no-jit If the PCRE2 library is built with support for just-in-time + --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build - time. This option can be used to disable the use of JIT at - run time. It is provided for testing and working round prob- + time. This option can be used to disable the use of JIT at + run time. It is provided for testing and working round prob- lems. It should never be needed in normal use. -o, --only-matching Show only the part of the line that matched a pattern instead - of the whole line. In this mode, no context is shown. That - is, the -A, -B, and -C options are ignored. If there is more - than one match in a line, each of them is shown separately. - If -o is combined with -v (invert the sense of the match to - find non-matching lines), no output is generated, but the - return code is set appropriately. If the matched portion of - the line is empty, nothing is output unless the file name or - line number are being printed, in which case they are shown - on an otherwise empty line. This option is mutually exclusive - with --file-offsets and --line-offsets. + of the whole line. In this mode, no context is shown. That + is, the -A, -B, and -C options are ignored. If there is more + than one match in a line, each of them is shown separately, + on a separate line of output. If -o is combined with -v + (invert the sense of the match to find non-matching lines), + no output is generated, but the return code is set appropri- + ately. If the matched portion of the line is empty, nothing + is output unless the file name or line number are being + printed, in which case they are shown on an otherwise empty + line. This option is mutually exclusive with --file-offsets + and --line-offsets. 
-onumber, --only-matching=number Show only the part of the line that matched the capturing @@ -587,65 +618,80 @@ OPTIONS put. If this option is given multiple times, multiple substrings - are output, in the order the options are given. For example, - -o3 -o1 -o3 causes the substrings matched by capturing paren- - theses 3 and 1 and then 3 again to be output. By default, - there is no separator (but see the next option). + are output for each match, in the order the options are + given, and all on one line. For example, -o3 -o1 -o3 causes + the substrings matched by capturing parentheses 3 and 1 and + then 3 again to be output. By default, there is no separator + (but see the next option). --om-separator=text - Specify a separating string for multiple occurrences of -o. - The default is an empty string. Separating strings are never + Specify a separating string for multiple occurrences of -o. + The default is an empty string. Separating strings are never coloured. -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. -r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. + This option is a shorthand for setting the -d option to "recurse". --recursion-limit=number See --match-limit above. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. 
However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. + -t, --total-count + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v + is used) in all the files. If -t is used with -c, a grand + total is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would + always be zero. + -u, --utf-8 Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including - those for any --exclude and --include options) and all sub- - ject lines that are scanned must be valid strings of UTF-8 + those for any --exclude and --include options) and all sub- + ject lines that are scanned must be valid strings of UTF-8 characters. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. -v, --invert-match - Invert the sense of the match, so that lines which do not + Invert the sense of the match, so that lines which do not match any of the patterns are the ones that are found. -w, --word-regex, --word-regexp Force the patterns to match only whole words. This is equiva- - lent to having \b at the start and end of the pattern. 
This - option applies only to the patterns that are matched against - the contents of files; it does not apply to patterns speci- + lent to having \b at the start and end of the pattern. This + option applies only to the patterns that are matched against + the contents of files; it does not apply to patterns speci- fied by any of the --include or --exclude options. -x, --line-regex, --line-regexp - Force the patterns to be anchored (each must start matching - at the beginning of a line) and in addition, require them to - match entire lines. This is equivalent to having ^ and $ - characters at the start and end of each alternative top-level + Force the patterns to be anchored (each must start matching + at the beginning of a line) and in addition, require them to + match entire lines. In multiline mode the match may be more + than one line. This is equivalent to having \A and \Z charac- + ters at the start and end of each alternative top-level branch in every pattern. This option applies only to the pat- terns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or @@ -725,35 +771,86 @@ OPTIONS WITH DATA equals character. Otherwise pcre2grep will assume that it has no data. +CALLING EXTERNAL SCRIPTS + + pcre2grep has, by default, support for calling external programs or + scripts during matching by making use of PCRE2's callout facility. How- + ever, this support can be disabled when pcre2grep is built. You can + find out whether your binary has support for callouts by running it + with the --help option. If the support is not enabled, all callouts in + patterns are ignored by pcre2grep. + + A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- + ment is either a number or a quoted string (see the pcre2callout docu- + mentation for details). Numbered callouts are ignored by pcre2grep. + String arguments are parsed as a list of substrings separated by pipe + (vertical bar) characters. 
The first substring must be an executable + name, with the following substrings specifying arguments: + + executable_name|arg1|arg2|... + + Any substring (including the executable name) may contain escape + sequences started by a dollar character: $<digits> or ${<digits>} is + replaced by the captured substring of the given decimal number, which + must be greater than zero. If the number is greater than the number of + capturing substrings, or if the capture is unset, the replacement is + empty. + + Any other character is substituted by itself. In particular, $$ is + replaced by a single dollar and $| is replaced by a pipe character. + Here is an example: + + echo -e "abcde\n12345" | pcre2grep \ + '(?x)(.)(..(.)) + (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' - + + Output: + + Arg1: [a] [bcd] [d] Arg2: |a| () + abcde + Arg1: [1] [234] [4] Arg2: |1| () + 12345 + + The parameters for the execv() system call that is used to run the pro- + gram or script are zero-terminated strings. This means that binary zero + characters in the callout argument will cause premature termination of + their substrings, and therefore should not be present. Any syntax + errors in the string (for example, a dollar not followed by another + character) cause the callout to be ignored. If running the program + fails for any reason (including the non-existence of the executable), a + local matching failure occurs and the matcher backtracks in the normal + way. + + MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a - resource limit that causes it to abort in these circumstances. If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. 
If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. The PCRE2 matching function has a + resource limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit; there is a second option called --recursion-limit that - sets a limit on the amount of memory (usually stack) that is used (see + The --match-limit option of pcre2grep can be used to set the overall + resource limit; there is a second option called --recursion-limit that + sets a limit on the amount of memory (usually stack) that is used (see the discussion of these options above). DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. SEE ALSO - pcre2pattern(3), pcre2syntax(3). + pcre2pattern(3), pcre2syntax(3), pcre2callout(3). AUTHOR @@ -765,5 +862,5 @@ AUTHOR REVISION - Last updated: 03 January 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 31 December 2016 + Copyright (c) 1997-2016 University of Cambridge. 
diff --git a/pcre2/doc/pcre2jit.3 b/pcre2/doc/pcre2jit.3 index 3f2071ba2..0b95b4dc6 100644 --- a/pcre2/doc/pcre2jit.3 +++ b/pcre2/doc/pcre2jit.3 @@ -1,4 +1,4 @@ -.TH PCRE2JIT 3 "27 November 2014" "PCRE2 10.00" +.TH PCRE2JIT 3 "05 June 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT" @@ -61,6 +61,12 @@ much faster than the normal interpretive code, but yields exactly the same results. The returned value from \fBpcre2_jit_compile()\fP is zero on success, or a negative error code. .P +There is a limit to the size of pattern that JIT supports, imposed by the size +of machine stack that it uses. The exact rules are not documented because they +may change at any time, in particular, when new optimizations are introduced. +If a pattern is too big, a call to \fBpcre2_jit_compile()\fP returns +PCRE2_ERROR_NOMEMORY. +.P PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT options of \fBpcre2_match()\fP, you should set one or both @@ -122,6 +128,9 @@ PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED option is not supported at match time. .P +If the PCRE2_NO_JIT option is passed to \fBpcre2_match()\fP it disables the +use of JIT, forcing matching by the interpreter code. +.P The only unsupported pattern items are \eC (match a single data unit) when running in a UTF mode, and a callout immediately before an assertion condition in a conditional group. @@ -207,8 +216,13 @@ for JIT matching. A callback function can therefore be used to determine whether a match operation was executed by JIT or by the interpreter. 
.P You may safely use the same JIT stack for more than one pattern (either by -assigning directly or by callback), as long as the patterns are all matched -sequentially in the same thread. In a multithread application, if you do not +assigning directly or by callback), as long as the patterns are matched +sequentially in the same thread. Currently, the only way to set up +non-sequential matches in one thread is to use callouts: if a callout function +starts another match, that match must use a different JIT stack to the one used +for currently suspended match(es). +.P +In a multithread application, if you do not specify a JIT stack, or if you assign or pass back NULL from a callback, that is thread-safe, because each thread has its own machine stack. However, if you assign or pass back a non-NULL JIT stack, this must be a different stack for @@ -366,7 +380,7 @@ The fast path function is called \fBpcre2_jit_match()\fP, and it takes exactly the same arguments as \fBpcre2_match()\fP. The return values are also the same, plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested that was not compiled. Unsupported option bits (for example, -PCRE2_ANCHORED) are ignored. +PCRE2_ANCHORED) are ignored, as is the PCRE2_NO_JIT option. .P When you call \fBpcre2_match()\fP, as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For example, if @@ -399,6 +413,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 27 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 05 June 2016 +Copyright (c) 1997-2016 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2limits.3 b/pcre2/doc/pcre2limits.3 index 898254618..805e42af7 100644 --- a/pcre2/doc/pcre2limits.3 +++ b/pcre2/doc/pcre2limits.3 @@ -1,4 +1,4 @@ -.TH PCRE2LIMITS 3 "25 November 2014" "PCRE2 10.00" +.TH PCRE2LIMITS 3 "26 October 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SIZE AND OTHER LIMITATIONS" @@ -20,6 +20,10 @@ documentation for details. In these cases the limit is substantially larger. However, the speed of execution is slower. In the 32-bit library, the internal linkage size is always 4. .P +The maximum length of a source pattern string is essentially unlimited; it is +the largest number a PCRE2_SIZE variable can hold. However, the program that +calls \fBpcre2_compile()\fP can specify a smaller limit. +.P The maximum length (in code units) of a subject string is one less than the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as size_t. Its maximum value (that is @@ -37,22 +41,25 @@ documentation. .P All values in repeating quantifiers must be less than 65536. .P +The maximum length of a lookbehind assertion is 65535 characters. +.P There is no limit to the number of parenthesized subpatterns, but there can be no more than 65535 capturing subpatterns. There is, however, a limit to the depth of nesting of parenthesized subpatterns of all kinds. This is imposed in -order to limit the amount of system stack used at compile time. The limit can -be specified when PCRE2 is built; the default is 250. -.P -There is a limit to the number of forward references to subsequent subpatterns -of around 200,000. Repeated forward references with fixed upper limits, for -example, (?2){0,100} when subpattern number 2 is to the right, are included in -the count. There is no limit to the number of backward references. +order to limit the amount of system stack used at compile time. 
The default +limit can be specified when PCRE2 is built; the default default is 250. An +application can change this limit by calling pcre2_set_parens_nest_limit() to +set the limit in a compile context. .P The maximum length of name for a named subpattern is 32 code units, and the maximum number of named subpatterns is 10000. .P The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb -is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries. +is 255 code units for the 8-bit library and 65535 code units for the 16-bit and +32-bit libraries. +.P +The maximum length of a string argument to a callout is the largest number a +32-bit unsigned integer can hold. . . .SH AUTHOR @@ -69,6 +76,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 25 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 26 October 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2pattern.3 b/pcre2/doc/pcre2pattern.3 index 192859dd3..4c869c1b7 100644 --- a/pcre2/doc/pcre2pattern.3 +++ b/pcre2/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "13 June 2015" "PCRE2 10.20" +.TH PCRE2PATTERN 3 "27 December 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -158,6 +158,11 @@ be less than the value set (or defaulted) by the caller of \fBpcre2_match()\fP for it to have any effect. In other words, the pattern writer can lower the limits set by the programmer, but not raise them. If there is more than one setting of one of these limits, the lower value is used. +.P +The match limit is used (but in a different way) when JIT is being used, but it +is not relevant, and is ignored, when matching with \fBpcre2_dfa_match()\fP. +However, the recursion limit is relevant for DFA matching, which does use some +function recursion, in particular, for recursions within the pattern. . . 
.\" HTML @@ -359,29 +364,28 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex 40) is inverted. Thus \ecA to \ecZ become hex 01 to hex 1A (A is 41, Z is 5A), but \ec{ becomes hex 3B ({ is 7B), and \ec; becomes hex 7B (; is 3B). If the code unit following \ec has a value less than 32 or greater than 126, a -compile-time error occurs. This locks out non-printable ASCII characters in all -modes. +compile-time error occurs. .P When PCRE2 is compiled in EBCDIC mode, \ea, \ee, \ef, \en, \er, and \et generate the appropriate EBCDIC code values. The \ec escape is processed as specified for Perl in the \fBperlebcdic\fP document. The only characters that are allowed after \ec are A-Z, a-z, or one of @, [, \e, ], ^, _, or ?. Any -other character provokes a compile-time error. The sequence \e@ encodes -character code 0; the letters (in either case) encode characters 1-26 (hex 01 -to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and -\e? becomes either 255 (hex FF) or 95 (hex 5F). +other character provokes a compile-time error. The sequence \ec@ encodes +character code 0; after \ec the letters (in either case) encode characters 1-26 +(hex 01 to hex 1A); [, \e, ], ^, and _ encode characters 27-31 (hex 1B to hex +1F), and \ec? becomes either 255 (hex FF) or 95 (hex 5F). .P -Thus, apart from \e?, these escapes generate the same character code values as +Thus, apart from \ec?, these escapes generate the same character code values as they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \eG always generates code value 7, which is BEL in ASCII +differ. For example, \ecG always generates code value 7, which is BEL in ASCII but DEL in EBCDIC. .P -The sequence \e? generates DEL (127, hex 7F) in an ASCII environment, but +The sequence \ec? generates DEL (127, hex 7F) in an ASCII environment, but because 127 is not a control character in EBCDIC, Perl makes it generate the APC character. 
Unfortunately, there are several variants of EBCDIC. In most of them the APC character has the value 255 (hex FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \e? generate 95; otherwise it generates 255. +values, PCRE2 makes \ec? generate 95; otherwise it generates 255. .P After \e0 up to two further octal digits are read. If there are fewer than two digits, just those that are present are used. Thus the sequence \e0\ex\e015 @@ -508,9 +512,9 @@ by code point, as described in the previous section. .SS "Absolute and relative back references" .rs .sp -The sequence \eg followed by an unsigned or a negative number, optionally -enclosed in braces, is an absolute or relative back reference. A named back -reference can be coded as \eg{name}. Back references are discussed +The sequence \eg followed by a signed or unsigned number, optionally enclosed +in braces, is an absolute or relative back reference. A named back reference +can be coded as \eg{name}. Back references are discussed .\" HTML .\" later, @@ -671,8 +675,8 @@ below. This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next -line, U+0085). The two-character sequence is treated as a single unit that -cannot be split. +line, U+0085). Because this is an atomic group, the two-character sequence is +treated as a single unit that cannot be split. .P In other modes, two additional characters whose codepoints are greater than 255 are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029). @@ -738,6 +742,8 @@ example: Those that are not part of an identified script are lumped together as "Common". 
The current list of scripts is: .P +Ahom, +Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, @@ -778,6 +784,7 @@ Gurmukhi, Han, Hangul, Hanunoo, +Hatran, Hebrew, Hiragana, Imperial_Aramaic, @@ -814,12 +821,14 @@ Miao, Modi, Mongolian, Mro, +Multani, Myanmar, Nabataean, New_Tai_Lue, Nko, Ogham, Ol_Chiki, +Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, @@ -841,6 +850,7 @@ Saurashtra, Sharada, Shavian, Siddham, +SignWriting, Sinhala, Sora_Sompeng, Sundanese, @@ -1177,6 +1187,18 @@ patterns that are anchored in single line mode because all branches start with when the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. .P +When the newline convention (see +.\" HTML +.\" +"Newline conventions" +.\" +below) recognizes the two-character sequence CRLF as a newline, this is +preferred, even if the single characters CR and LF are also recognized as +newlines. For example, if the newline convention is "any", a multiline mode +circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after +CR, even though CR on its own is a valid newline. (It also matches at the very +start of the string, of course.) +.P Note that the sequences \eA, \eZ, and \ez can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \eA it is always anchored, whether or not PCRE2_MULTILINE is set. @@ -1227,21 +1249,31 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start with a malformed UTF character. This has undefined results, because PCRE2 assumes that it is matching character by character in a valid UTF string (by default it checks the subject string's validity at the start of processing -unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the -use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option. +unless the PCRE2_NO_UTF_CHECK option is used). 
+.P +An application can lock out the use of \eC by setting the +PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to +build PCRE2 with the use of \eC permanently disabled. .P PCRE2 does not allow \eC to appear in lookbehind assertions .\" HTML .\" (described below) .\" -in a UTF mode, because this would make it impossible to calculate the length of -the lookbehind. +in UTF-8 or UTF-16 modes, because this would make it impossible to calculate +the length of the lookbehind. Neither the alternative matching function +\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in these UTF modes. +The former gives a match-time error; the latter fails to optimize and so the +match is always run using the interpreter. +.P +In the 32-bit library, however, \eC is always supported (when not explicitly +locked out) because it always matches a single code unit, whether or not UTF-32 +is specified. .P In general, the \eC escape sequence is best avoided. However, one way of using -it that avoids the problem of malformed UTF characters is to use a lookahead to -check the length of the next character, as in this pattern, which could be used -with a UTF-8 string (ignore white space and line breaks): +it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a +lookahead to check the length of the next character, as in this pattern, which +could be used with a UTF-8 string (ignore white space and line breaks): .sp (?| (?=[\ex00-\ex7f])(\eC) | (?=[\ex80-\ex{7ff}])(\eC)(\eC) | @@ -1297,37 +1329,6 @@ when matching character classes, whatever line-ending sequence is in use, and whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A class such as [^a] always matches one of these characters. .P -The minus (hyphen) character can be used to specify a range of characters in a -character class. For example, [d-m] matches any letter between d and m, -inclusive. 
If a minus character is required in a class, it must be escaped with -a backslash or appear in a position where it cannot be interpreted as -indicating a range, typically as the first or last character in the class, or -immediately after a range. For example, [b-d-z] matches letters in the range b -to d, a hyphen character, or z. -.P -It is not possible to have the literal character "]" as the end character of a -range. A pattern such as [W-]46] is interpreted as a class of two characters -("W" and "-") followed by a literal string "46]", so it would match "W46]" or -"-46]". However, if the "]" is escaped with a backslash it is interpreted as -the end of range, so [W-\e]46] is interpreted as a class containing a range -followed by two other characters. The octal or hexadecimal representation of -"]" can also be used to end a range. -.P -An error is generated if a POSIX character class (see below) or an escape -sequence other than one that defines a single character appears at a point -where a range ending character is expected. For example, [z-\exff] is valid, -but [A-\ed] and [A-[:digit:]] are not. -.P -Ranges operate in the collating sequence of character values. They can also be -used for characters specified numerically, for example [\e000-\e037]. Ranges -can include any characters that are valid for the current mode. -.P -If a range that includes letters is used when caseless matching is set, it -matches the letters in either case. For example, [W-c] is equivalent to -[][\e\e^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character -tables for a French locale are in use, [\exc8-\excb] matches accented E -characters in both cases. -.P The character escape sequences \ed, \eD, \eh, \eH, \ep, \eP, \es, \eS, \ev, \eV, \ew, and \eW may appear in a character class, and add the characters that they match to the class. For example, [\edABCDEF] matches any hexadecimal @@ -1343,6 +1344,46 @@ class; it matches the backspace character. 
The sequences \eB, \eN, \eR, and \eX are not special inside a character class. Like any other unrecognized escape sequences, they cause an error. .P +The minus (hyphen) character can be used to specify a range of characters in a +character class. For example, [d-m] matches any letter between d and m, +inclusive. If a minus character is required in a class, it must be escaped with +a backslash or appear in a position where it cannot be interpreted as +indicating a range, typically as the first or last character in the class, +or immediately after a range. For example, [b-d-z] matches letters in the range +b to d, a hyphen character, or z. +.P +Perl treats a hyphen as a literal if it appears before or after a POSIX class +(see below) or a character type escape such as \ed, but gives a warning in +its warning mode, as this is most likely a user error. As PCRE2 has no facility +for warning, an error is given in these cases. +.P +It is not possible to have the literal character "]" as the end character of a +range. A pattern such as [W-]46] is interpreted as a class of two characters +("W" and "-") followed by a literal string "46]", so it would match "W46]" or +"-46]". However, if the "]" is escaped with a backslash it is interpreted as +the end of range, so [W-\e]46] is interpreted as a class containing a range +followed by two other characters. The octal or hexadecimal representation of +"]" can also be used to end a range. +.P +Ranges normally include all code points between the start and end characters, +inclusive. They can also be used for code points specified numerically, for +example [\e000-\e037]. Ranges can include any characters that are valid for the +current mode. +.P +There is a special case in EBCDIC environments for ranges whose end points are +both specified as literal letters in the same case. For compatibility with +Perl, EBCDIC code points within the range that are not letters are omitted.
For +example, [h-k] matches only four characters, even though the codes for h and k +are 0x88 and 0x92, a range of 11 code points. However, if the range is +specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points +are included. +.P +If a range that includes letters is used when caseless matching is set, it +matches the letters in either case. For example, [W-c] is equivalent to +[][\e\e^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character +tables for a French locale are in use, [\exc8-\excb] matches accented E +characters in both cases. +.P A circumflex can conveniently be used with the upper case character types to specify a more restricted set of characters than the matching lower case type. For example, the class [^\eW_] matches any letter or digit, but not underscore, @@ -1514,12 +1555,8 @@ respectively. .P When one of these option changes occurs at top level (that is, not inside subpattern parentheses), the change applies to the remainder of the pattern -that follows. If the change is placed right at the start of a pattern, PCRE2 -extracts it into the global options (and it will therefore show up in data -extracted by the \fBpcre2_pattern_info()\fP function). -.P -An option change within a subpattern (see below for a description of -subpatterns) affects only that part of the subpattern that follows it, so +that follows. An option change within a subpattern (see below for a description +of subpatterns) affects only that part of the subpattern that follows it, so .sp (a(?i)b)c .sp @@ -1650,6 +1687,9 @@ first one in the pattern with the given number. The following pattern matches .sp /(?|(abc)|(def))(?1)/ .sp +A relative reference such as (?-1) is no different: it is just a convenient way +of computing an absolute group number. +.P If a .\" HTML .\" @@ -2056,9 +2096,9 @@ no such problem when named parentheses are used. A back reference to any subpattern is possible using named parentheses (see below). 
.P Another way of avoiding the ambiguity inherent in the use of digits following a -backslash is to use the \eg escape sequence. This escape must be followed by an -unsigned number or a negative number, optionally enclosed in braces. These -examples are all identical: +backslash is to use the \eg escape sequence. This escape must be followed by a +signed or unsigned number, optionally enclosed in braces. These examples are +all identical: .sp (ring), \e1 (ring), \eg1 @@ -2066,8 +2106,7 @@ examples are all identical: .sp An unsigned number specifies an absolute reference without the ambiguity that is present in the older syntax. It is also useful when literal digits follow -the reference. A negative number is a relative reference. Consider this -example: +the reference. A signed number is a relative reference. Consider this example: .sp (abc(def)ghi)\eg{-1} .sp @@ -2077,6 +2116,10 @@ Similarly, \eg{-2} would be equivalent to \e1. The use of relative references can be helpful in long patterns, and also in patterns that are created by joining together fragments that contain references within themselves. .P +The sequence \eg{+1} is a reference to the next capturing subpattern. This kind +of forward reference can be useful in patterns that repeat. Perl does not +support the use of + in this way. +.P A back reference matches whatever actually matched the capturing subpattern in the current subject string, rather than anything matching the subpattern itself (see @@ -2184,6 +2227,13 @@ numbering the capturing subpatterns in the whole pattern. However, substring capturing is carried out only for positive assertions. (Perl sometimes, but not always, does do capturing in negative assertions.) .P +WARNING: If a positive assertion containing one or more capturing subpatterns +succeeds, but failure to match later in the pattern causes backtracking over +this assertion, the captures within the assertion are reset only if no higher +numbered captures are already set.
This is, unfortunately, a fundamental +limitation of the current implementation; it may get removed in a future +reworking. +.P For compatibility with Perl, most assertion subpatterns may be repeated; though it makes no sense to assert the same thing several times, the side effect of capturing parentheses may occasionally be useful. However, an assertion that @@ -2281,23 +2331,34 @@ temporarily move the current position back by the fixed length and then try to match. If there are insufficient characters before the current position, the assertion fails. .P -In a UTF mode, PCRE2 does not allow the \eC escape (which matches a single code -unit even in a UTF mode) to appear in lookbehind assertions, because it makes -it impossible to calculate the length of the lookbehind. The \eX and \eR -escapes, which can match different numbers of code units, are also not -permitted. +In UTF-8 and UTF-16 modes, PCRE2 does not allow the \eC escape (which matches a +single code unit even in a UTF mode) to appear in lookbehind assertions, +because it makes it impossible to calculate the length of the lookbehind. The +\eX and \eR escapes, which can match different numbers of code units, are never +permitted in lookbehinds. .P .\" HTML .\" "Subroutine" .\" calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long -as the subpattern matches a fixed-length string. +as the subpattern matches a fixed-length string. However, .\" HTML .\" -Recursion, +recursion, .\" -however, is not supported. +that is, a "subroutine" call into a group that is already active, +is not supported. +.P +Perl does not support back references in lookbehinds. PCRE2 does support them, +but only if certain conditions are met. The PCRE2_MATCH_UNSET_BACKREF option +must not be set, there must be no use of (?| in the pattern (it creates +duplicate subpattern numbers), and if the back reference is by name, the name +must be unique. Of course, the referenced subpattern must itself be of fixed +length. 
The following pattern matches words containing at least two characters +that begin and end with the same character: +.sp + \eb(\ew)\ew++(?<=\e1) .P Possessive quantifiers can be used in conjunction with lookbehind assertions to specify efficient matching of fixed-length strings at the end of subject @@ -2436,7 +2497,9 @@ This makes the fragment independent of the parentheses in the larger pattern. .sp Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used subpattern by name. For compatibility with earlier versions of PCRE1, which had -this facility before Perl, the syntax (?(name)...) is also recognized. +this facility before Perl, the syntax (?(name)...) is also recognized. Note, +however, that undelimited names consisting of the letter R followed by digits +are ambiguous (see the following section). .P Rewriting the above example to use a named subpattern gives this: .sp @@ -2450,33 +2513,55 @@ matched. .SS "Checking for pattern recursion" .rs .sp -If the condition is the string (R), and there is no subpattern with the name R, -the condition is true if a recursive call to the whole pattern or any -subpattern has been made. If digits or a name preceded by ampersand follow the -letter R, for example: -.sp - (?(R3)...) or (?(R&name)...) -.sp -the condition is true if the most recent recursion is into a subpattern whose -number or name is given. This condition does not check the entire recursion -stack. If the name used in a condition of this kind is a duplicate, the test is -applied to all subpatterns of the same name, and is true if any one of them is -the most recent recursion. -.P -At "top level", all these recursion test conditions are false. +"Recursion" in this sense refers to any subroutine-like call from one part of +the pattern to another, whether or not it is actually recursive. See the +sections entitled .\" HTML .\" -The syntax for recursive patterns +"Recursive patterns" .\" -is described below.
+and +.\" HTML +.\" +"Subpatterns as subroutines" +.\" +below for details of recursion and subpattern calls. +.P +If a condition is the string (R), and there is no subpattern with the name R, +the condition is true if matching is currently in a recursion or subroutine +call to the whole pattern or any subpattern. If digits follow the letter R, and +there is no subpattern with that name, the condition is true if the most recent +call is into a subpattern with the given number, which must exist somewhere in +the overall pattern. This is a contrived example that is equivalent to a+b: +.sp + ((?(R1)a+|(?1)b)) +.sp +However, in both cases, if there is a subpattern with a matching name, the +condition tests for its being set, as described in the section above, instead +of testing for recursion. For example, creating a group with the name R1 by +adding (?<R1>) to the above pattern completely changes its meaning. +.P +If a name preceded by ampersand follows the letter R, for example: +.sp + (?(R&name)...) +.sp +the condition is true if the most recent recursion is into a subpattern of that +name (which must exist within the pattern). +.P +This condition does not check the entire recursion stack. It tests only the +current level. If the name used in a condition of this kind is a duplicate, the +test is applied to all subpatterns of the same name, and is true if any one of +them is the most recent recursion. +.P +At "top level", all these recursion test conditions are false. . . .\" HTML .SS "Defining subpatterns for use by reference only" .rs .sp -If the condition is the string (DEFINE), and there is no subpattern with the -name DEFINE, the condition is always false. In this case, there may be only one +If the condition is the string (DEFINE), the condition is always false, even if +there is a group with the name DEFINE. In this case, there may be only one alternative in the subpattern.
It is always skipped if control reaches this point in the pattern; the idea of DEFINE is that it can be used to define subroutines that can be referenced from elsewhere. (The use of @@ -2513,7 +2598,8 @@ For example: (?(VERSION>=10.4)yes|no) .sp This pattern matches "yes" if the PCRE2 version is greater or equal to 10.4, or -"no" otherwise. +"no" otherwise. The fractional part of the version number may not contain more +than two digits. . . .SS "Assertion conditions" @@ -2630,6 +2716,23 @@ pattern above you can write (?-2) to refer to the second most recently opened parentheses preceding the recursion. In other words, a negative number counts capturing parentheses leftwards from the point at which it is encountered. .P +Be aware however, that if +.\" HTML +.\" +duplicate subpattern numbers +.\" +are in use, relative references refer to the earliest subpattern with the +appropriate number. Consider, for example: +.sp + (?|(a)|(b)) (c) (?-2) +.sp +The first two capturing groups (a) and (b) are both numbered 1, and group (c) +is number 2. When the reference (?-2) is encountered, the second most recently +opened parentheses has the number 1, but it is the first such group (the (a) +group) to which the recursion refers. This would be the same if an absolute +reference (?1) was used. In other words, relative references are just a +shorthand for computing a group number. +.P It is also possible to refer to subsequently opened parentheses, by writing references such as (?+2). However, these cannot be recursive because the reference is not inside the parentheses that are referenced. They are always @@ -2929,14 +3032,32 @@ in production code should be noted to avoid problems during upgrades." The same remarks apply to the PCRE2 features described in this section. .P The new verbs make use of what was previously invalid syntax: an opening -parenthesis followed by an asterisk. They are generally of the form -(*VERB) or (*VERB:NAME). 
Some may take either form, possibly behaving -differently depending on whether or not a name is present. A name is any -sequence of characters that does not include a closing parenthesis. The maximum -length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit -libraries. If the name is empty, that is, if the closing parenthesis -immediately follows the colon, the effect is as if the colon were not there. -Any number of these verbs may occur in a pattern. +parenthesis followed by an asterisk. They are generally of the form (*VERB) or +(*VERB:NAME). Some verbs take either form, possibly behaving differently +depending on whether or not a name is present. +.P +By default, for compatibility with Perl, a name is any sequence of characters +that does not include a closing parenthesis. The name is not processed in +any way, and it is not possible to include a closing parenthesis in the name. +This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result +is no longer Perl-compatible. +.P +When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names +and only an unescaped closing parenthesis terminates the name. However, the +only backslash items that are permitted are \eQ, \eE, and sequences such as +\ex{100} that define character code points. Character type escapes such as \ed +are faulted. +.P +A closing parenthesis can be included in a name either as \e) or between \eQ +and \eE. In addition to backslash processing, if the PCRE2_EXTENDED option is +also set, unescaped whitespace in verb names is skipped, and #-comments are +recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED does not +affect verb names unless PCRE2_ALT_VERBNAMES is also set. +.P +The maximum length of a name is 255 in the 8-bit library and 65535 in the +16-bit and 32-bit libraries. If the name is empty, that is, if the closing +parenthesis immediately follows the colon, the effect is as if the colon were +not there. 
Any number of these verbs may occur in a pattern. .P Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching @@ -3361,6 +3482,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 June 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 27 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2posix.3 b/pcre2/doc/pcre2posix.3 index b6669752f..70a86d81d 100644 --- a/pcre2/doc/pcre2posix.3 +++ b/pcre2/doc/pcre2posix.3 @@ -1,4 +1,4 @@ -.TH PCRE2POSIX 3 "20 October 2014" "PCRE2 10.00" +.TH PCRE2POSIX 3 "31 January 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SYNOPSIS" @@ -28,7 +28,7 @@ expression 8-bit library. See the \fBpcre2api\fP .\" documentation for a description of PCRE2's native API, which contains much -additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit +additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit libraries. .P The functions described here are just wrapper functions that ultimately call @@ -44,9 +44,9 @@ value zero. This has no effect, but since programs that are written to the POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined. .P -There are also some other options that are not defined by POSIX. These have -been added at the request of users who want to make use of certain -PCRE2-specific features via the POSIX calling interface. +There are also some options that are not defined by POSIX. These have been +added at the request of users who want to make use of certain PCRE2-specific +features via the POSIX calling interface. .P When PCRE2 is called via these functions, it is only the API that is POSIX-like in style. 
The syntax and semantics of the regular expressions themselves are @@ -95,11 +95,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section). .sp REG_NOSUB .sp -The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed -for compilation to the native function. In addition, when a pattern that is -compiled with this flag is passed to \fBregexec()\fP for matching, the -\fInmatch\fP and \fIpmatch\fP arguments are ignored, and no captured strings -are returned. +When a pattern that is compiled with this flag is passed to \fBregexec()\fP for +matching, the \fInmatch\fP and \fIpmatch\fP arguments are ignored, and no +captured strings are returned. Versions of the PCRE library prior to 10.22 used +to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens +because it disables the use of back references. .sp REG_UCP .sp @@ -145,7 +145,7 @@ use the contents of the \fIpreg\fP structure. If, for example, you pass it to This area is not simple, because POSIX and Perl take different views of things. It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was never intended to be a POSIX engine. The following table lists the different -possibilities for matching newline characters in PCRE2: +possibilities for matching newline characters in Perl and PCRE2: .sp Default Change with .sp @@ -155,7 +155,7 @@ possibilities for matching newline characters in PCRE2: $ matches \en in middle no PCRE2_MULTILINE ^ matches \en in middle no PCRE2_MULTILINE .sp -This is the equivalent table for POSIX: +This is the equivalent table for a POSIX-compatible pattern matcher: .sp Default Change with .sp @@ -165,13 +165,17 @@ This is the equivalent table for POSIX: $ matches \en in middle no REG_NEWLINE ^ matches \en in middle no REG_NEWLINE .sp -PCRE2's behaviour is the same as Perl's, except that there is no equivalent for -PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there is no way to stop -newline from matching [^a]. 
+This behaviour is not what happens when PCRE2 is called via its POSIX +API. By default, PCRE2's behaviour is the same as Perl's, except that there is +no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there +is no way to stop newline from matching [^a]. .P -The default POSIX newline handling can be obtained by setting PCRE2_DOTALL and -PCRE2_DOLLAR_ENDONLY, but there is no way to make PCRE2 behave exactly as for -the REG_NEWLINE action. +Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and +PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is +no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using +the POSIX API, passing REG_NEWLINE to PCRE2's \fBregcomp()\fP function +causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL +passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY. . . .SH "MATCHING A PATTERN" @@ -207,16 +211,18 @@ to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not -how it is matched. +how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are +mutually exclusive; the error REG_INVARG is returned. .P If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of -\fBregexec()\fP are ignored. +\fBregexec()\fP are ignored (except possibly as input for REG_STARTEND). .P -If the value of \fInmatch\fP is zero, or if the value \fIpmatch\fP is NULL, -no data about any matched strings is returned. +The value of \fInmatch\fP may be zero, and the value \fIpmatch\fP may be NULL +(unless REG_STARTEND is set); in both these cases no data about any matched +strings is returned. 
.P -Otherwise,the portion of the string that was matched, and also any captured +Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the \fIpmatch\fP argument, which points to an array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the members \fIrm_so\fP and \fIrm_eo\fP. These contain the byte offset to the first @@ -236,9 +242,11 @@ header file, of which REG_NOMATCH is the "expected" failure code. The \fBregerror()\fP function maps a non-zero errorcode from either \fBregcomp()\fP or \fBregexec()\fP to a printable message. If \fIpreg\fP is not NULL, the error should have arisen from the use of that structure. A message -terminated by a binary zero is placed in \fIerrbuf\fP. The length of the -message, including the zero, is limited to \fIerrbuf_size\fP. The yield of the -function is the size of buffer needed to hold the whole message. +terminated by a binary zero is placed in \fIerrbuf\fP. If the buffer is too +short, only the first \fIerrbuf_size\fP - 1 characters of the error message are +used. The yield of the function is the size of buffer needed to hold the whole +message, including the terminating zero. This value is greater than +\fIerrbuf_size\fP if the message was truncated. . . .SH MEMORY USAGE @@ -263,6 +271,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 20 October 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 31 January 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2sample.3 b/pcre2/doc/pcre2sample.3 index 7b469356b..661e39274 100644 --- a/pcre2/doc/pcre2sample.3 +++ b/pcre2/doc/pcre2sample.3 @@ -1,4 +1,4 @@ -.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00" +.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 SAMPLE PROGRAM" @@ -13,23 +13,28 @@ distribution. A listing of this program is given in the documentation. 
If you do not have a copy of the PCRE2 distribution, you can save this listing to re-create the contents of \fIpcre2demo.c\fP. .P -The demonstration program, which uses the PCRE2 8-bit library, compiles the -regular expression that is its first argument, and matches it against the -subject string in its second argument. No PCRE2 options are set, and default -character tables are used. If matching succeeds, the program outputs the -portion of the subject that matched, together with the contents of any captured -substrings. +The demonstration program compiles the regular expression that is its +first argument, and matches it against the subject string in its second +argument. No PCRE2 options are set, and default character tables are used. If +matching succeeds, the program outputs the portion of the subject that matched, +together with the contents of any captured substrings. .P If the -g option is given on the command line, the program then goes on to check for further matches of the same regular expression in the same subject string. The logic is a little bit tricky because of the possibility of matching an empty string. Comments in the code explain what is going on. .P +The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit +library. It handles strings and characters that are stored in 8-bit code units. +By default, one character corresponds to one code unit, but if the pattern +starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, +where characters may occupy multiple code units. +.P If PCRE2 is installed in the standard include and library directories for your operating system, you should be able to compile the demonstration program using -this command: +a command like this: .sp - gcc -o pcre2demo pcre2demo.c -lpcre2-8 + cc -o pcre2demo pcre2demo.c -lpcre2-8 .sp If PCRE2 is installed elsewhere, you may need to add additional options to the command line. 
For example, on a Unix-like system that has PCRE2 installed in @@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in like this: .sp .\" JOINSH - gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e - -L/usr/local/lib -lpcre2-8 + cc -o pcre2demo -I/usr/local/include pcre2demo.c \e + -L/usr/local/lib -lpcre2-8 .sp -.P -Once you have compiled and linked the demonstration program, you can run simple -tests like this: +Once you have built the demonstration program, you can run simple tests like +this: .sp ./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat' @@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called .\" HREF \fBpcre2test\fP, .\" -which supports many more facilities for testing regular expressions using the -PCRE2 libraries. The +which supports many more facilities for testing regular expressions using all +three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be +installed). The .\" HREF \fBpcre2demo\fP .\" -program is provided as a simple coding example. +program is provided as a relatively simple coding example. .P If you try to run .\" HREF @@ -65,7 +70,7 @@ If you try to run when PCRE2 is not installed in the standard library directory, you may get an error like this on some operating systems (e.g. Solaris): .sp - ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory + ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory .sp This is caused by the way shared library support works on those systems. You need to add @@ -89,6 +94,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 20 October 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 02 February 2016 +Copyright (c) 1997-2016 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2serialize.3 b/pcre2/doc/pcre2serialize.3 index a76272b70..664c1db56 100644 --- a/pcre2/doc/pcre2serialize.3 +++ b/pcre2/doc/pcre2serialize.3 @@ -1,4 +1,4 @@ -.TH PCRE2SERIALIZE 3 "20 January 2015" "PCRE2 10.10" +.TH PCRE2SERIALIZE 3 "24 May 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS" @@ -22,12 +22,22 @@ If you are running an application that uses a large number of regular expression patterns, it may be useful to store them in a precompiled form instead of having to compile them every time the application is run. However, if you are using the just-in-time optimization feature, it is not possible to -save and reload the JIT data, because it is position-dependent. In addition, -the host on which the patterns are reloaded must be running the same version of -PCRE2, with the same code unit width, and must also have the same endianness, -pointer width and PCRE2_SIZE type. For example, patterns compiled on a 32-bit -system using PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor -can they be reloaded using the 8-bit library. +save and reload the JIT data, because it is position-dependent. The host on +which the patterns are reloaded must be running the same version of PCRE2, with +the same code unit width, and must also have the same endianness, pointer width +and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using +PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be +reloaded using the 8-bit library. +. +. +.SH "SECURITY CONCERNS" +.rs +.sp +The facility for saving and restoring compiled patterns is intended for use +within individual applications. As such, the data supplied to +\fBpcre2_serialize_decode()\fP is expected to be trusted data, not data from +arbitrary external sources. There is only some simple consistency checking, not +complete validation of what is being re-loaded. . 
. .SH "SAVING COMPILED PATTERNS" @@ -129,20 +139,26 @@ is filled with those that fit, and the remainder are ignored. The yield of the function is the number of decoded patterns, or one of the following negative error codes: .sp - PCRE2_ERROR_BADDATA second argument is zero or less - PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data - PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE2 version - PCRE2_ERROR_MEMORY memory allocation failed - PCRE2_ERROR_NULL first or third argument is NULL + PCRE2_ERROR_BADDATA second argument is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure + PCRE2_ERROR_MEMORY memory allocation failed + PCRE2_ERROR_NULL first or third argument is NULL .sp PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled on a system with different endianness. .P Decoded patterns can be used for matching in the usual way, and must be freed -by calling \fBpcre2_code_free()\fP as normal. A single copy of the character -tables is used by all the decoded patterns. A reference count is used to +by calling \fBpcre2_code_free()\fP. However, be aware that there is a potential +race issue if you are using multiple patterns that were decoded from a single +byte stream in a multithreaded application. A single copy of the character +tables is used by all the decoded patterns and a reference count is used to arrange for its memory to be automatically freed when the last pattern is -freed. +freed, but there is no locking on this reference count. Therefore, if you want +to call \fBpcre2_code_free()\fP for these patterns in different threads, you +must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot +be called by two threads at the same time. 
.P If a pattern was processed by \fBpcre2_jit_compile()\fP before being serialized, the JIT data is discarded and so is no longer available after a @@ -165,6 +181,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 20 January 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 24 May 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2stack.3 b/pcre2/doc/pcre2stack.3 index 871126353..89d101bbc 100644 --- a/pcre2/doc/pcre2stack.3 +++ b/pcre2/doc/pcre2stack.3 @@ -1,4 +1,4 @@ -.TH PCRE2STACK 3 "21 November 2014" "PCRE2 10.00" +.TH PCRE2STACK 3 "23 December 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 DISCUSSION OF STACK USAGE" @@ -43,11 +43,12 @@ assertion and "once-only" subpatterns, which are handled like subroutine calls. Normally, these are never very deep, and the limit on the complexity of \fBpcre2_dfa_match()\fP is controlled by the amount of workspace it is given. However, it is possible to write patterns with runaway infinite recursions; -such patterns will cause \fBpcre2_dfa_match()\fP to run out of stack. At -present, there is no protection against this. +such patterns will cause \fBpcre2_dfa_match()\fP to run out of stack unless a +limit is applied (see below). .P -The comments that follow do NOT apply to \fBpcre2_dfa_match()\fP; they are -relevant only for \fBpcre2_match()\fP without the JIT optimization. +The comments in the next three sections do not apply to +\fBpcre2_dfa_match()\fP; they are relevant only for \fBpcre2_match()\fP without +the JIT optimization. . . .SS "Reducing \fBpcre2_match()\fP's stack usage" @@ -106,7 +107,7 @@ in the \fBpcre2api\fP .\" documentation. Since the block sizes are always the same, it may be possible to -implement customized a memory handler that is more efficient than the standard +implement a customized memory handler that is more efficient than the standard function. 
The memory blocks obtained for this purpose are retained and re-used if possible while \fBpcre2_match()\fP is running. They are all freed just before it exits. @@ -147,6 +148,15 @@ pattern to match. This is done by calling \fBpcre2_match()\fP repeatedly with different limits. . . +.SS "Limiting \fBpcre2_dfa_match()\fP's stack usage" +.rs +.sp +The recursion limit, as described above for \fBpcre2_match()\fP, also applies +to \fBpcre2_dfa_match()\fP, whose use of recursive function calls for +recursions in the pattern can lead to runaway stack usage. The non-recursive +match limit is not relevant for DFA matching, and is ignored. +. +. .SS "Changing stack size in Unix-like systems" .rs .sp @@ -197,6 +207,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 21 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 23 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2syntax.3 b/pcre2/doc/pcre2syntax.3 index dc34538cc..29a956255 100644 --- a/pcre2/doc/pcre2syntax.3 +++ b/pcre2/doc/pcre2syntax.3 @@ -1,4 +1,4 @@ -.TH PCRE2SYNTAX 3 "13 June 2015" "PCRE2 10.20" +.TH PCRE2SYNTAX 3 "23 December 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" @@ -81,9 +81,10 @@ it matches a literal "u". \eW a "non-word" character \eX a Unicode extended grapheme cluster .sp -The application can lock out the use of \eC by setting the -PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the -current matching point in the middle of a UTF-8 or UTF-16 character. +\eC is dangerous because it may leave the current matching point in the middle +of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by +setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2 +with the use of \eC permanently disabled. 
.P By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific matching is @@ -159,6 +160,8 @@ at release 5.18. .SH "SCRIPT NAMES FOR \ep AND \eP" .rs .sp +Ahom, +Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, @@ -199,6 +202,7 @@ Gurmukhi, Han, Hangul, Hanunoo, +Hatran, Hebrew, Hiragana, Imperial_Aramaic, @@ -235,12 +239,14 @@ Miao, Modi, Mongolian, Mro, +Multani, Myanmar, Nabataean, New_Tai_Lue, Nko, Ogham, Ol_Chiki, +Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, @@ -262,6 +268,7 @@ Saurashtra, Sharada, Shavian, Siddham, +SignWriting, Sinhala, Sora_Sompeng, Sundanese, @@ -421,9 +428,10 @@ appear. (*UCP) set PCRE2_UCP (use Unicode properties for \ed etc) .sp Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of the -limits set by the caller of pcre2_match(), not increase them. The application -can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or -PCRE2_NEVER_UCP options, respectively, at compile time. +limits set by the caller of \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP, not +increase them. The application can lock out the use of (*UTF) and (*UCP) by +setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at +compile time. . . .SH "NEWLINE CONVENTION" @@ -466,6 +474,9 @@ Each top-level branch of a look behind must be of a fixed length. \en reference by number (can be ambiguous) \egn reference by number \eg{n} reference by number + \eg+n relative reference by number (PCRE2 extension) + \eg-n relative reference by number + \eg{+n} relative reference by number (PCRE2 extension) \eg{-n} relative reference by number \ek reference by name (Perl) \ek'name' reference by name (Perl) @@ -504,13 +515,17 @@ Each top-level branch of a look behind must be of a fixed length. 
(?(-n) relative reference condition (?( ) named reference condition (Perl) (?('name') named reference condition (Perl) - (?(name) named reference condition (PCRE2) + (?(name) named reference condition (PCRE2, deprecated) (?(R) overall recursion condition - (?(Rn) specific group recursion condition - (?(R&name) specific recursion condition + (?(Rn) specific numbered group recursion condition + (?(R&name) specific named group recursion condition (?(DEFINE) define subpattern for reference (?(VERSION[>]=n.m) test PCRE2 version (?(assert) assertion condition +.sp +Note the ambiguity of (?(R) and (?(Rn) which might be named reference +conditions or recursion tests. Such a condition is interpreted as a reference +condition if the relevant named group exists. . . .SH "BACKTRACKING CONTROL" @@ -570,6 +585,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 June 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 23 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2test.1 b/pcre2/doc/pcre2test.1 index 857adc354..bd7383e69 100644 --- a/pcre2/doc/pcre2test.1 +++ b/pcre2/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "20 May 2015" "PCRE 10.20" +.TH PCRE2TEST 1 "28 December 2016" "PCRE 10.23" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -29,7 +29,7 @@ subject is processed, and what output is produced. .P As the original fairly simple PCRE library evolved, it acquired many different features, and as a result, the original \fBpcretest\fP program ended up with a -lot of options in a messy, arcane syntax, for testing all the features. The +lot of options in a messy, arcane syntax for testing all the features. The move to the new PCRE2 API provided an opportunity to re-implement the test program as \fBpcre2test\fP, with a cleaner modifier syntax. 
Nevertheless, there are still many obscure modifiers, some of which are specifically designed for @@ -47,31 +47,63 @@ strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or all three of these libraries may be simultaneously installed. The \fBpcre2test\fP program can be used to test all the libraries. However, its own input and output are always in 8-bit format. When testing the 16-bit or 32-bit -libraries, patterns and subject strings are converted to 16- or 32-bit format -before being passed to the library functions. Results are converted back to -8-bit code units for output. +libraries, patterns and subject strings are converted to 16-bit or 32-bit +format before being passed to the library functions. Results are converted back +to 8-bit code units for output. .P In the rest of this document, the names of library functions and structures are given in generic form, for example, \fBpcre_compile()\fP. The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate. . . +.\" HTML .SH "INPUT ENCODING" .rs .sp Input to \fBpcre2test\fP is processed line by line, either by calling the C -library's \fBfgets()\fP function, or via the \fBlibreadline\fP library (see -below). The input is processed using using C's string functions, so must not -contain binary zeroes, even though in Unix-like environments, \fBfgets()\fP -treats any bytes other than newline as data characters. In some Windows -environments character 26 (hex 1A) causes an immediate end of file, and no -further data is read. +library's \fBfgets()\fP function, or via the \fBlibreadline\fP library. In some +Windows environments character 26 (hex 1A) causes an immediate end of file, and +no further data is read, so this character should be avoided unless you really +want that action. .P -For maximum portability, therefore, it is safest to avoid non-printing -characters in \fBpcre2test\fP input files. 
There is a facility for specifying a -pattern's characters as hexadecimal pairs, thus making it possible to include -binary zeroes in a pattern for testing purposes. Subject lines are processed -for backslash escapes, which makes it possible to include any data value. +The input is processed using C's string functions, so must not +contain binary zeroes, even though in Unix-like environments, \fBfgets()\fP +treats any bytes other than newline as data characters. An error is generated +if a binary zero is encountered. Subject lines are processed for backslash +escapes, which makes it possible to include any data value in strings that are +passed to the library for matching. For patterns, there is a facility for +specifying some or all of the 8-bit input characters as hexadecimal pairs, +which makes it possible to include binary zeros. +. +. +.SS "Input for the 16-bit and 32-bit libraries" +.rs +.sp +When testing the 16-bit or 32-bit libraries, there is a need to be able to +generate character code points greater than 255 in the strings that are passed +to the library. For subject lines, backslash escapes can be used. In addition, +when the \fButf\fP modifier (see +.\" HTML +.\" +"Setting compilation options" +.\" +below) is set, the pattern and any following subject lines are interpreted as +UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. +.P +For non-UTF testing of wide characters, the \fButf8_input\fP modifier can be +used. This is mutually exclusive with \fButf\fP, and is allowed only in 16-bit +or 32-bit mode. It causes the pattern and following subject lines to be treated +as UTF-8 according to the original definition (RFC 2279), which allows for +character values up to 0x7fffffff. Each character is placed in one 16-bit or +32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error +to occur). +.P +UTF-8 is not capable of encoding values greater than 0x7fffffff, but such +values can be handled by the 32-bit library. 
When testing this library in +non-UTF mode with \fButf8_input\fP set, if any character is preceded by the +byte 0xff (which is an illegal byte in UTF-8) 0x80000000 is added to the +character's value. This is the only way of passing such code points in a +pattern string. For subject strings, using an escape sequence is preferable. . . .SH "COMMAND LINE OPTIONS" @@ -92,8 +124,12 @@ If the 32-bit library has been built, this option causes it to be used. If only the 32-bit library has been built, this is the default. If the 32-bit library has not been built, this option causes an error. .TP 10 +\fB-ac\fP +Behave as if each pattern has the \fBauto_callout\fP modifier, that is, insert +automatic callouts into every pattern that is compiled. +.TP 10 \fB-b\fP -Behave as if each pattern has the \fB/fullbincode\fP modifier; the full +Behave as if each pattern has the \fBfullbincode\fP modifier; the full internal binary form of the pattern is output after compilation. .TP 10 \fB-C\fP @@ -122,12 +158,13 @@ following options output the value and set the exit code as indicated: The following options output 1 for true or 0 for false, and set the exit code to the same value: .sp - ebcdic compiled for an EBCDIC environment - jit just-in-time support is available - pcre2-16 the 16-bit library was built - pcre2-32 the 32-bit library was built - pcre2-8 the 8-bit library was built - unicode Unicode support is available + backslash-C \eC is supported (not locked out) + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre2-16 the 16-bit library was built + pcre2-32 the 32-bit library was built + pcre2-8 the 8-bit library was built + unicode Unicode support is available .sp If an unknown option is given, an error message is output; the exit code is 0. .TP 10 @@ -141,11 +178,17 @@ Behave as if each subject line has the \fBdfa\fP modifier; matching is done using the \fBpcre2_dfa_match()\fP function instead of the default \fBpcre2_match()\fP. 
.TP 10 +\fB-error\fP \fInumber[,number,...]\fP +Call \fBpcre2_get_error_message()\fP for each of the error numbers in the +comma-separated list, display the resulting messages on the standard output, +then exit with zero exit code. The numbers may be positive or negative. This is +a convenience facility for PCRE2 maintainers. +.TP 10 \fB-help\fP Output a brief summary these options and then exit. .TP 10 \fB-i\fP -Behave as if each pattern has the \fB/info\fP modifier; information about the +Behave as if each pattern has the \fBinfo\fP modifier; information about the compiled pattern is given after compilation. .TP 10 \fB-jit\fP @@ -217,9 +260,9 @@ Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \en escape sequence (or \er or \er\en, etc., depending on the newline setting) in a single line of input to encode the newline sequences. There is no limit on the length of subject lines; the input -buffer is automatically extended if it is too small. There is a replication -feature that makes it possible to generate long subject lines without having to -supply them explicitly. +buffer is automatically extended if it is too small. There are replication +features that make it possible to generate long repetitive pattern or subject +lines without having to supply them explicitly. .P An empty line or the end of the file signals the end of the subject lines for a test, at which point a new pattern or command line is expected if there is @@ -259,6 +302,34 @@ described in the section entitled "Saving and restoring compiled patterns" .\" ] +.sp +When PCRE2 is built, a default newline convention can be specified. This +determines which characters and/or character pairs are recognized as indicating +a newline in a pattern or subject string. The default can be overridden when a +pattern is compiled. 
The standard test files contain tests of various newline +conventions, but the majority of the tests expect a single linefeed to be +recognized as a newline by default. Without special action the tests would fail +when PCRE2 is compiled with either CR or CRLF as the default newline. +.P +The #newline_default command specifies a list of newline types that are +acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, or +ANY (in upper or lower case), for example: +.sp + #newline_default LF Any anyCRLF +.sp +If the default newline is in the list, this command has no effect. Otherwise, +except when testing the POSIX API, a \fBnewline\fP modifier that specifies the +first newline convention in the list (LF in the above example) is added to any +pattern that does not already have a \fBnewline\fP modifier. If the newline +list is empty, the feature is turned off. This command is present in a number +of the standard test input files. +.P +When the POSIX API is being tested there is no way to override the default +newline convention, though it is possible to set the newline convention from +within the pattern. A warning is given if the \fBposix\fP modifier is used when +\fB#newline_default\fP would set a default for the non-POSIX API. .sp #pattern .sp @@ -276,9 +347,10 @@ test files that are also processed by \fBperltest.sh\fP. The \fB#perltest\fP command helps detect tests that are accidentally put in the wrong file. .sp #pop [ ] + #popcopy [ ] .sp -This command is used to manipulate the stack of compiled patterns, as described -in the section entitled "Saving and restoring compiled patterns" +These commands are used to manipulate the stack of compiled patterns, as +described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" below. @@ -303,12 +375,13 @@ subject lines. Modifiers on a subject line can change these settings. .rs .sp Modifier lists are used with both pattern and subject lines. 
Items in a list -are separated by commas and optional white space. Some modifiers may be given -for both patterns and subject lines, whereas others are valid for one or the -other only. Each modifier has a long name, for example "anchored", and some of -them must be followed by an equals sign and a value, for example, "offset=12". -Modifiers that do not take values may be preceded by a minus sign to turn off a -previous setting. +are separated by commas followed by optional white space. Trailing whitespace +in a modifier list is ignored. Some modifiers may be given for both patterns +and subject lines, whereas others are valid only for one or the other. Each +modifier has a long name, for example "anchored", and some of them must be +followed by an equals sign and a value, for example, "offset=12". Values cannot +contain comma characters, but may contain spaces. Modifiers that do not take +values may be preceded by a minus sign to turn off a previous setting. .P A few of the more common modifiers can also be specified as single letters, for example "i" for "caseless". In documentation, following the Perl convention, @@ -414,6 +487,12 @@ the start of a modifier list. For example: .sp abc\e=notbol,notempty .sp +If the subject string is empty and \e= is followed by whitespace, the line is +treated as a comment line, and is not used for matching. For example: +.sp + \e= This is a comment. + abc\e= This is an invalid modifier list. +.sp A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash (and there is no modifier @@ -424,10 +503,10 @@ a real empty line terminates the data input. .SH "PATTERN MODIFIERS" .rs .sp -There are three types of modifier that can appear in pattern lines, two of -which may also be used in a \fB#pattern\fP command. 
A pattern's modifier list -can add to or override default modifiers that were set by a previous -\fB#pattern\fP command. +There are several types of modifier that can appear in pattern lines. Except +where noted below, they may also be used in \fB#pattern\fP commands. A +pattern's modifier list can add to or override default modifiers that were set +by a previous \fB#pattern\fP command. . . .\" HTML @@ -437,13 +516,14 @@ can add to or override default modifiers that were set by a previous The following modifiers set options for \fBpcre2_compile()\fP. The most common ones have single-letter abbreviations. See .\" HREF -\fBpcreapi\fP +\fBpcre2api\fP .\" for a description of their effects. .sp allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED auto_callout set PCRE2_AUTO_CALLOUT /i caseless set PCRE2_CASELESS @@ -464,12 +544,15 @@ for a description of their effects. no_utf_check set PCRE2_NO_UTF_CHECK ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY + use_offset_limit set PCRE2_USE_OFFSET_LIMIT utf set PCRE2_UTF .sp As well as turning on the PCRE2_UTF option, the \fButf\fP modifier causes all non-printing characters in output strings to be printed using the \ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly -brackets. +brackets. Setting \fButf\fP in 16-bit or 32-bit mode also causes pattern and +subject strings to be translated to UTF-16 or UTF-32, respectively, before +being passed to library functions. . . 
.\" HTML @@ -485,18 +568,24 @@ about the pattern: debug same as info,fullbincode fullbincode show binary code with lengths /I info show info about compiled pattern - hex pattern is coded in hexadecimal + hex unquoted characters are hexadecimal jit[= ] use JIT jitfast use JIT fast path jitverify verify JIT use locale= use this locale + max_pattern_length= set the maximum pattern length memory show memory used newline= set newline type + null_context compile with a NULL context parens_nest_limit= set maximum parentheses depth posix use the POSIX API + posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack + pushcopy push a copy onto the stack stackguard= test the stackguard feature tables=[0|1|2] select internal tables + use_length do not zero-terminate the pattern + utf8_input treat input as UTF-8 .sp The effects of these modifiers are described in the following sections. . @@ -565,40 +654,148 @@ is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. . . -.SS "Specifying a pattern in hex" +.SS "Passing a NULL context" .rs .sp -The \fBhex\fP modifier specifies that the characters of the pattern are to be -interpreted as pairs of hexadecimal digits. White space is permitted between -pairs. For example: +Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If +the \fBnull_context\fP modifier is set, however, NULL is passed. This is for +testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses +default values). +. +. +.SS "Specifying the pattern's length" +.rs +.sp +By default, patterns are passed to the compiling functions as zero-terminated +strings. When using the POSIX wrapper API, there is no other option. However, +when using PCRE2's native API, patterns can be passed by length instead of +being zero-terminated. The \fBuse_length\fP modifier causes this to happen. 
+Using a length happens automatically (whether or not \fBuse_length\fP is set) +when \fBhex\fP is set, because patterns specified in hexadecimal may contain +binary zeros. +. +. +.SS "Specifying pattern characters in hexadecimal" +.rs +.sp +The \fBhex\fP modifier specifies that the characters of the pattern, except for +substrings enclosed in single or double quotes, are to be interpreted as pairs +of hexadecimal digits. This feature is provided as a way of creating patterns +that contain binary zeros and other non-printing characters. White space is +permitted between pairs of digits. For example, this pattern contains three +characters: .sp /ab 32 59/hex .sp -This feature is provided as a way of creating patterns that contain binary zero -and other non-printing characters. By default, \fBpcre2test\fP passes patterns -as zero-terminated strings to \fBpcre2_compile()\fP, giving the length as -PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the -actual length of the pattern is passed. +Parts of such a pattern are taken literally if quoted. This pattern contains +nine characters, only two of which are specified in hexadecimal: +.sp + /ab "literal" 32/hex +.sp +Either single or double quotes may be used. There is no way of including +the delimiter within a substring. The \fBhex\fP and \fBexpand\fP modifiers are +mutually exclusive. +.P +The POSIX API cannot be used with patterns specified in hexadecimal because +they may contain binary zeros, which conflicts with \fBregcomp()\fP's +requirement for a zero-terminated string. Such patterns are always passed to +\fBpcre2_compile()\fP as a string with a length, not as zero-terminated. +. +. +.SS "Specifying wide characters in 16-bit and 32-bit modes" +.rs +.sp +In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and +translated to UTF-16 or UTF-32 when the \fButf\fP modifier is set. 
For testing +the 16-bit and 32-bit libraries in non-UTF mode, the \fButf8_input\fP modifier +can be used. It is mutually exclusive with \fButf\fP. Input lines are +interpreted as UTF-8 as a means of specifying wide characters. More details are +given in +.\" HTML +.\" +"Input encoding" +.\" +above. +. +. +.SS "Generating long repetitive patterns" +.rs +.sp +Some tests use long patterns that are very repetitive. Instead of creating a +very long input line for such a pattern, you can use a special repetition +feature, similar to the one described for subject lines above. If the +\fBexpand\fP modifier is present on a pattern, parts of the pattern that have +the form +.sp + \e[ ]{ } +.sp +are expanded before the pattern is passed to \fBpcre2_compile()\fP. For +example, \e[AB]{6000} is expanded to "ABAB..." 6000 times. This construction +cannot be nested. An initial "\e[" sequence is recognized only if "]{" followed +by decimal digits and "}" is found later in the pattern. If not, the characters +remain in the pattern unaltered. The \fBexpand\fP and \fBhex\fP modifiers are +mutually exclusive. +.P +If part of an expanded pattern looks like an expansion, but is really part of +the actual pattern, unwanted expansion can be avoided by giving two values in +the quantifier. For example, \e[AB]{6000,6000} is not recognized as an +expansion item. +.P +If the \fBinfo\fP modifier is set on an expanded pattern, the result of the +expansion is included in the information that is output. . . .SS "JIT compilation" .rs .sp -The \fB/jit\fP modifier may optionally be followed by an equals sign and a -number in the range 0 to 7: +Just-in-time (JIT) compiling is a heavyweight optimization that can greatly +speed up pattern matching. See the +.\" HREF +\fBpcre2jit\fP +.\" +documentation for details. JIT compiling happens, optionally, after a pattern +has been successfully compiled into an internal form. The JIT compiler converts +this to optimized machine code. 
It needs to know whether the match-time options +PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because +different code is generated for the different cases. See the \fBpartial\fP +modifier in "Subject Modifiers" +.\" HTML +.\" +below +.\" +for details of how these options are specified for each match attempt. +.P +JIT compilation is requested by the \fB/jit\fP pattern modifier, which may +optionally be followed by an equals sign and a number in the range 0 to 7. +The three bits that make up the number specify which of the three JIT operating +modes are to be compiled: +.sp + 1 compile JIT code for non-partial matching + 2 compile JIT code for soft partial matching + 4 compile JIT code for hard partial matching +.sp +The possible values for the \fBjit\fP modifier are therefore: .sp 0 disable JIT - 1 use JIT for normal match only - 2 use JIT for soft partial match only - 3 use JIT for normal match and soft partial match - 4 use JIT for hard partial match only - 6 use JIT for soft and hard partial match + 1 normal matching only + 2 soft partial matching only + 3 normal and soft partial matching + 4 hard partial matching only + 6 soft and hard partial matching only 7 all three modes .sp -If no number is given, 7 is assumed. If JIT compilation is successful, the -compiled JIT code will automatically be used when \fBpcre2_match()\fP is run -for the appropriate type of match, except when incompatible run-time options -are specified. For more details, see the +If no number is given, 7 is assumed. The phrase "partial matching" means a call +to \fBpcre2_match()\fP with either the PCRE2_PARTIAL_SOFT or the +PCRE2_PARTIAL_HARD option set. Note that such a call may return a complete +match; the options enable the possibility of a partial match, but do not +require it. 
Note also that if you request JIT compilation only for partial +matching (for example, /jit=2) but do not set the \fBpartial\fP modifier on a +subject line, that match will not use JIT code because none was compiled for +non-partial matching. +.P +If JIT compilation is successful, the compiled JIT code will automatically be +used when an appropriate type of match is run, except when incompatible +run-time options are specified. For more details, see the .\" HREF \fBpcre2jit\fP .\" @@ -622,14 +819,14 @@ code was actually used in the match. .SS "Setting a locale" .rs .sp -The \fB/locale\fP modifier must specify the name of a locale, for example: +The \fBlocale\fP modifier must specify the name of a locale, for example: .sp /pattern/locale=fr_FR .sp The given locale is set, \fBpcre2_maketables()\fP is called to build a set of character tables for the locale, and this is then passed to \fBpcre2_compile()\fP when compiling the regular expression. The same tables -are used when matching the following subject lines. The \fB/locale\fP modifier +are used when matching the following subject lines. The \fBlocale\fP modifier applies only to the pattern on which it appears, but can be given in a \fB#pattern\fP command if a default is needed. Setting a locale and alternate character tables are mutually exclusive. @@ -638,7 +835,7 @@ character tables are mutually exclusive. .SS "Showing pattern memory" .rs .sp -The \fB/memory\fP modifier causes the size in bytes of the memory used to hold +The \fBmemory\fP modifier causes the size in bytes of the memory used to hold the compiled pattern to be output. This does not include the size of the \fBpcre2_code\fP block; it is just the actual compiled data. If the pattern is subsequently passed to the JIT compiler, the size of the JIT compiled code is @@ -660,30 +857,54 @@ sets its own default of 220, which is required for running the standard test suite. . . 
+.SS "Limiting the pattern length" +.rs +.sp +The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the +length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit +causes a compilation error. The default is the largest number a PCRE2_SIZE +variable can hold (essentially unlimited). +. +. .SS "Using the POSIX wrapper API" .rs .sp -The \fB/posix\fP modifier causes \fBpcre2test\fP to call PCRE2 via the POSIX -wrapper API rather than its native API. This supports only the 8-bit library. -When the POSIX API is being used, the following pattern modifiers set options -for the \fBregcomp()\fP function: +The \fB/posix\fP and \fBposix_nosub\fP modifiers cause \fBpcre2test\fP to call +PCRE2 via the POSIX wrapper API rather than its native API. When +\fBposix_nosub\fP is used, the POSIX option REG_NOSUB is passed to +\fBregcomp()\fP. The POSIX wrapper supports only the 8-bit library. Note that +it does not imply POSIX matching semantics; for more detail see the +.\" HREF +\fBpcre2posix\fP +.\" +documentation. The following pattern modifiers set options for the +\fBregcomp()\fP function: .sp caseless REG_ICASE multiline REG_NEWLINE - no_auto_capture REG_NOSUB dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) .sp +The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that +is passed to \fBregerror()\fP in the event of a compilation error. For example: +.sp + /abc/posix,regerror_buffsize=20 +.sp +This provides a means of testing the behaviour of \fBregerror()\fP when the +buffer is too small for the error message. If this modifier has not been set, a +large buffer is used. +.P The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described -below. All other modifiers cause an error. +below. All other modifiers are either ignored, with a warning message, or cause +an error. . . 
.SS "Testing the stack guard feature" .rs .sp -The \fB/stackguard\fP modifier is used to test the use of +The \fBstackguard\fP modifier is used to test the use of \fBpcre2_set_compile_recursion_guard()\fP, a function that is provided to enable stack availability to be checked during compilation (see the .\" HREF @@ -700,7 +921,7 @@ be aborted. .SS "Using alternative character tables" .rs .sp -The value specified for the \fB/tables\fP modifier must be one of the digits 0, +The value specified for the \fBtables\fP modifier must be one of the digits 0, 1, or 2. It causes a specific set of built-in character tables to be passed to \fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with different character tables. The digit specifies the tables as follows: @@ -720,17 +941,22 @@ are mutually exclusive. .sp The following modifiers are really subject modifiers, and are described below. However, they may be included in a pattern's modifier list, in which case they -are applied to every subject line that is processed with that pattern. They do -not affect the compilation process. +are applied to every subject line that is processed with that pattern. They may +not appear in \fB#pattern\fP commands. These modifiers do not affect the +compilation process. 
.sp - aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text - /g global global matching - mark show mark values - replace= specify a replacement string - startchar show starting character when relevant + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + /g global global matching + mark show mark values + replace= specify a replacement string + startchar show starting character when relevant + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY .sp These modifiers may not appear in a \fB#pattern\fP command. If you want them as defaults, set them in a \fB#subject\fP command. @@ -746,15 +972,20 @@ facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled patterns" .\" HTML .\" -below. +below. If \fBpushcopy\fP is used instead of \fBpush\fP, a copy of the compiled +pattern is stacked, leaving the original as current, ready to match the +following input lines. This provides a way of testing the +\fBpcre2_code_copy()\fP function. .\" -The \fBpush\fP modifier is incompatible with compilation modifiers such as -\fBglobal\fP that act at match time. Any that are specified are ignored, with a -warning message, except for \fBreplace\fP, which causes an error. Note that, -\fBjitverify\fP, which is allowed, does not carry through to any subsequent -matching that uses this pattern. +The \fBpush\fP and \fBpushcopy\fP modifiers are incompatible with compilation +modifiers such as \fBglobal\fP that act at match time. Any that are specified +are ignored (for the stacked copy), with a warning message, except for +\fBreplace\fP, which causes an error. 
Note that \fBjitverify\fP, which is +allowed, does not carry through to any subsequent matching that uses a stacked +pattern. . . +.\" HTML .SH "SUBJECT MODIFIERS" .rs .sp @@ -775,6 +1006,7 @@ for a description of their effects. anchored set PCRE2_ANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST + no_jit set PCRE2_NO_JIT no_utf_check set PCRE2_NO_UTF_CHECK notbol set PCRE2_NOTBOL notempty set PCRE2_NOTEMPTY @@ -786,11 +1018,11 @@ for a description of their effects. The partial matching modifiers are provided with abbreviations because they appear frequently in tests. .P -If the \fB/posix\fP modifier was present on the pattern, causing the POSIX +If the \fBposix\fP modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. -Any other modifiers cause an error. +The other modifiers are ignored, with a warning message. . . .SS "Setting match controls" @@ -801,33 +1033,44 @@ information. Some of them may also be specified on a pattern line (see above), in which case they apply to every subject line that is matched against that pattern. 
.sp - aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text (non-JIT only) - altglobal alternative global matching - callout_capture show captures at callout time - callout_data= set a value to pass via callouts - callout_fail= [: ] control callout failure - callout_none do not supply a callout function - copy= copy captured substring - dfa use \fBpcre2_dfa_match()\fP - find_limits find match and recursion limits - get= extract captured substring - getall extract all captured substrings - /g global global matching - jitstack= set size of JIT stack - mark show mark values - match_limit=>n> set a match limit - memory show memory usage - offset= set starting offset - ovector= set size of output vector - recursion_limit= set a recursion limit - replace= specify a replacement string - startchar show startchar when relevant - zero_terminate pass the subject as zero-terminated + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text (non-JIT only) + altglobal alternative global matching + callout_capture show captures at callout time + callout_data= set a value to pass via callouts + callout_error= [: ] control callout error + callout_fail= [: ] control callout failure + callout_none do not supply a callout function + copy= copy captured substring + dfa use \fBpcre2_dfa_match()\fP + find_limits find match and recursion limits + get= extract captured substring + getall extract all captured substrings + /g global global matching + jitstack= set size of JIT stack + mark show mark values + match_limit= set a match limit + memory show memory usage + null_context match with a NULL context + offset= set starting offset + offset_limit= set offset limit + ovector= set size of output vector + recursion_limit= set a recursion limit + replace= specify a replacement string + startchar show startchar when relevant 
+ startoffset= same as offset= + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY + zero_terminate pass the subject as zero-terminated .sp -The effects of these modifiers are described in the following sections. +The effects of these modifiers are described in the following sections. When +matching via the POSIX wrapper API, the \fBaftertext\fP, \fBallaftertext\fP, +and \fBovector\fP subject modifiers work as described below. All other +modifiers are either ignored, with a warning message, or cause an error. . . .SS "Showing more text" @@ -882,7 +1125,8 @@ The \fBallcaptures\fP modifier requests that the values of all potential captured parentheses be output after a match. By default, only those up to the highest one actually used in the match are output (corresponding to the return code from \fBpcre2_match()\fP). Groups that did not take part in the match -are output as " ". +are output as " ". This modifier is not relevant for DFA matching (which +does no capturing); it is ignored, with a warning message, if present. . . .SS "Testing callouts" @@ -890,14 +1134,20 @@ are output as " ". .sp A callout function is supplied when \fBpcre2test\fP calls the library matching functions, unless \fBcallout_none\fP is specified. If \fBcallout_capture\fP is -set, the current captured groups are output when a callout occurs. +set, the current captured groups are output when a callout occurs. The default +return from the callout function is zero, which allows matching to continue. .P The \fBcallout_fail\fP modifier can be given one or two numbers. If there is -only one number, 1 is returned instead of 0 when a callout of that number is -reached. If two numbers are given, 1 is returned when callout is reached -for the th time.
Note that callouts with string arguments are always given -the number zero. See "Callouts" below for a description of the output when a -callout it taken. +only one number, 1 is returned instead of 0 (causing matching to backtrack) +when a callout of that number is reached. If two numbers (<n>:<m>) are given, 1 +is returned when callout <n> is reached and there have been at least <m> +callouts. The \fBcallout_error\fP modifier is similar, except that +PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be +aborted. If both these modifiers are set for the same callout number, +\fBcallout_error\fP takes precedence. +.P +Note that callouts with string arguments are always given the number zero. See +"Callouts" below for a description of the output when a callout is taken. .P The \fBcallout_data\fP modifier can be given an unsigned or a negative number. This is set as the "user data" that is passed to the matching function, and @@ -909,7 +1159,7 @@ used as a return from \fBpcre2test\fP's callout function. .rs .sp Searching for all possible matches within a subject can be requested by the -\fBglobal\fP or \fB/altglobal\fP modifier. After finding a match, the matching +\fBglobal\fP or \fBaltglobal\fP modifier. After finding a match, the matching function is called again to search the remainder of the subject. The difference between \fBglobal\fP and \fBaltglobal\fP is that the former uses the \fIstart_offset\fP argument to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP @@ -957,18 +1207,30 @@ by name. .rs .sp If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is -called instead of one of the matching functions. Unlike subject strings, -\fBpcre2test\fP does not process replacement strings for escape sequences. In -UTF mode, a replacement string is checked to see if it is a valid UTF-8 string. -If so, it is correctly converted to a UTF string of the appropriate code unit -width.
If it is not a valid UTF-8 string, the individual code units are copied -directly. This provides a means of passing an invalid UTF-8 string for testing -purposes. +called instead of one of the matching functions. Note that replacement strings +cannot contain commas, because a comma signifies the end of a modifier. This is +not thought to be an issue in a test program. .P -If the \fBglobal\fP modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to -\fBpcre2_substitute()\fP. After a successful substitution, the modified string -is output, preceded by the number of replacements. This may be zero if there -were no matches. Here is a simple example of a substitution test: +Unlike subject strings, \fBpcre2test\fP does not process replacement strings +for escape sequences. In UTF mode, a replacement string is checked to see if it +is a valid UTF-8 string. If so, it is correctly converted to a UTF string of +the appropriate code unit width. If it is not a valid UTF-8 string, the +individual code units are copied directly. This provides a means of passing an +invalid UTF-8 string for testing purposes. +.P +The following modifiers set options (in addition to the normal match options) +for \fBpcre2_substitute()\fP: +.sp + global PCRE2_SUBSTITUTE_GLOBAL + substitute_extended PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY +.sp +.P +After a successful substitution, the modified string is output, preceded by the +number of replacements. This may be zero if there were no matches. Here is a +simple example of a substitution test: .sp /abc/replace=xxx =abc=abc= @@ -976,12 +1238,12 @@ were no matches. Here is a simple example of a substitution test: =abc=abc=\e=global 2: =xxx=xxx= .sp -Subject and replacement strings should be kept relatively short for -substitution tests, as fixed-size buffers are used.
To make it easy to test for -buffer overflow, if the replacement string starts with a number in square -brackets, that number is passed to \fBpcre2_substitute()\fP as the size of the -output buffer, with the replacement string starting at the next character. Here -is an example that tests the edge case: +Subject and replacement strings should be kept relatively short (fewer than 256 +characters) for substitution tests, as fixed-size buffers are used. To make it +easy to test for buffer overflow, if the replacement string starts with a +number in square brackets, that number is passed to \fBpcre2_substitute()\fP as +the size of the output buffer, with the replacement string starting at the next +character. Here is an example that tests the edge case: .sp /abc/ 123abc123\e=replace=[10]XYZ @@ -989,6 +1251,19 @@ is an example that tests the edge case: 123abc123\e=replace=[9]XYZ Failed: error -47: no more memory .sp +The default action of \fBpcre2_substitute()\fP is to return +PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the +\fBsubstitute_overflow_length\fP modifier), \fBpcre2_substitute()\fP continues +to go through the motions of matching and substituting, in order to compute the +size of buffer that is required. When this happens, \fBpcre2test\fP shows the +required buffer length (which includes space for the trailing zero) as part of +the error message. For example: +.sp + /abc/substitute_overflow_length + 123abc123\e=replace=[9]XYZ + Failed: error -47: no more memory: 10 code units are needed +.sp A replacement string is ignored with POSIX and DFA matching. Specifying partial matching provokes an error return ("bad option value") from \fBpcre2_substitute()\fP. @@ -1059,6 +1334,16 @@ The \fBoffset\fP modifier sets an offset in the subject string at which matching starts. Its value is a number of code units, not characters. . . 
+.SS "Setting an offset limit" +.rs +.sp +The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match +cannot be found starting at or before this offset in the subject, a "no match" +return is given. The data value is a number of code units, not characters. When +this modifier is used, the \fBuse_offset_limit\fP modifier must have been set +for the pattern; if not, an error is generated. +. +. .SS "Setting the size of the output vector" .rs .sp @@ -1089,6 +1374,17 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of passing the replacement string as zero-terminated. . . +.SS "Passing a NULL context" +.rs +.sp +Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP, +\fBpcre2_dfa_match()\fP or \fBpcre2_jit_match()\fP. If the \fBnull_context\fP +modifier is set, however, NULL is passed. This is for testing that the matching +functions behave correctly in this case (they use default values). This +modifier cannot be used with the \fBfind_limits\fP modifier or when testing the +substitution function. +. +. .SH "THE ALTERNATIVE MATCHING FUNCTION" .rs .sp @@ -1156,7 +1452,7 @@ unset substring is shown as " ", as for the second data line. If the strings contain any non-printing characters, they are output as \exhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \ex{hh...} escapes. See below for the definition of non-printing -characters. If the \fB/aftertext\fP modifier is set, the output for substring +characters. If the \fBaftertext\fP modifier is set, the output for substring 0 is followed by the the rest of the subject string, identified by "0+" like this: .sp @@ -1286,7 +1582,9 @@ item to be tested. For example: This output indicates that callout number 0 occurred for a match attempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \ed. 
Just -one circumflex is output if the start and current positions are the same. +one circumflex is output if the start and current positions are the same, or if +the current position precedes the start position, which can happen if the +callout is in a lookbehind assertion. .P Callouts numbered 255 are assumed to be automatic callouts, inserted as a result of the \fB/auto_callout\fP pattern modifier. In this case, instead of @@ -1352,7 +1650,7 @@ therefore shown as hex escapes. .P When \fBpcre2test\fP is outputting text that is a matched part of a subject string, it behaves in the same way, unless a different locale has been set for -the pattern (using the \fB/locale\fP modifier). In this case, the +the pattern (using the \fBlocale\fP modifier). In this case, the \fBisprint()\fP function is used to distinguish printing and non-printing characters. . @@ -1382,11 +1680,15 @@ can be used to test these functions. .P When a pattern with \fBpush\fP modifier is successfully compiled, it is pushed onto a stack of compiled patterns, and \fBpcre2test\fP expects the next line to -contain a new pattern (or command) instead of a subject line. By this means, a -number of patterns can be compiled and retained. The \fBpush\fP modifier is -incompatible with \fBposix\fP, and control modifiers that act at match time are -ignored (with a message). The \fBjitverify\fP modifier applies only at compile -time. The command +contain a new pattern (or command) instead of a subject line. By contrast, +the \fBpushcopy\fP modifier causes a copy of the compiled pattern to be +stacked, leaving the original available for immediate matching. By using +\fBpush\fP and/or \fBpushcopy\fP, a number of patterns can be compiled and +retained. These modifiers are incompatible with \fBposix\fP, and control +modifiers that act at match time are ignored (with a message) for the stacked +patterns. The \fBjitverify\fP modifier applies only at compile time. 
+.P +The command .sp #save .sp @@ -1406,7 +1708,8 @@ modifier list containing only control modifiers .\" that act after a pattern has been compiled. In particular, \fBhex\fP, -\fBposix\fP, and \fBpush\fP are not allowed, nor are any +\fBposix\fP, \fBposix_nosub\fP, \fBpush\fP, and \fBpushcopy\fP are not allowed, +nor are any .\" HTML .\" option-setting modifiers. @@ -1426,6 +1729,10 @@ reloads two patterns. .sp If \fBjitverify\fP is used with #pop, it does not automatically imply \fBjit\fP, which is different behaviour from when it is used on a pattern. +.P +The #popcopy command is analogous to the \fBpushcopy\fP modifier in that it +makes current a copy of the topmost stack pattern, leaving the original still +on the stack. . . . @@ -1451,6 +1758,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 20 May 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 28 December 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2test.txt b/pcre2/doc/pcre2test.txt index c022a9ce5..52f0e18b5 100644 --- a/pcre2/doc/pcre2test.txt +++ b/pcre2/doc/pcre2test.txt @@ -26,7 +26,7 @@ SYNOPSIS As the original fairly simple PCRE library evolved, it acquired many different features, and as a result, the original pcretest program - ended up with a lot of options in a messy, arcane syntax, for testing + ended up with a lot of options in a messy, arcane syntax for testing all the features. The move to the new PCRE2 API provided an opportunity to re-implement the test program as pcre2test, with a cleaner modifier syntax. Nevertheless, there are still many obscure modifiers, some of @@ -45,7 +45,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES installed. The pcre2test program can be used to test all the libraries. However, its own input and output are always in 8-bit format.
When testing the 16-bit or 32-bit libraries, patterns and subject strings - are converted to 16- or 32-bit format before being passed to the + are converted to 16-bit or 32-bit format before being passed to the library functions. Results are converted back to 8-bit code units for output. @@ -58,49 +58,80 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES INPUT ENCODING Input to pcre2test is processed line by line, either by calling the C - library's fgets() function, or via the libreadline library (see below). + library's fgets() function, or via the libreadline library. In some + Windows environments character 26 (hex 1A) causes an immediate end of + file, and no further data is read, so this character should be avoided + unless you really want that action. + The input is processed using using C's string functions, so must not contain binary zeroes, even though in Unix-like environments, fgets() - treats any bytes other than newline as data characters. In some Windows - environments character 26 (hex 1A) causes an immediate end of file, and - no further data is read. + treats any bytes other than newline as data characters. An error is + generated if a binary zero is encountered. Subject lines are processed + for backslash escapes, which makes it possible to include any data + value in strings that are passed to the library for matching. For pat- + terns, there is a facility for specifying some or all of the 8-bit + input characters as hexadecimal pairs, which makes it possible to + include binary zeros. - For maximum portability, therefore, it is safest to avoid non-printing - characters in pcre2test input files. There is a facility for specifying - a pattern's characters as hexadecimal pairs, thus making it possible to - include binary zeroes in a pattern for testing purposes. Subject lines - are processed for backslash escapes, which makes it possible to include - any data value. 
+ Input for the 16-bit and 32-bit libraries + + When testing the 16-bit or 32-bit libraries, there is a need to be able + to generate character code points greater than 255 in the strings that + are passed to the library. For subject lines, backslash escapes can be + used. In addition, when the utf modifier (see "Setting compilation + options" below) is set, the pattern and any following subject lines are + interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as + appropriate. + + For non-UTF testing of wide characters, the utf8_input modifier can be + used. This is mutually exclusive with utf, and is allowed only in + 16-bit or 32-bit mode. It causes the pattern and following subject + lines to be treated as UTF-8 according to the original definition (RFC + 2279), which allows for character values up to 0x7fffffff. Each charac- + ter is placed in one 16-bit or 32-bit code unit (in the 16-bit case, + values greater than 0xffff cause an error to occur). + + UTF-8 is not capable of encoding values greater than 0x7fffffff, but + such values can be handled by the 32-bit library. When testing this + library in non-UTF mode with utf8_input set, if any character is pre- + ceded by the byte 0xff (which is an illegal byte in UTF-8) 0x80000000 + is added to the character's value. This is the only way of passing such + code points in a pattern string. For subject strings, using an escape + sequence is preferable. COMMAND LINE OPTIONS -8 If the 8-bit library has been built, this option causes it to - be used (this is the default). If the 8-bit library has not + be used (this is the default). If the 8-bit library has not been built, this option causes an error. - -16 If the 16-bit library has been built, this option causes it - to be used. If only the 16-bit library has been built, this - is the default. If the 16-bit library has not been built, + -16 If the 16-bit library has been built, this option causes it + to be used. 
If only the 16-bit library has been built, this + is the default. If the 16-bit library has not been built, this option causes an error. - -32 If the 32-bit library has been built, this option causes it - to be used. If only the 32-bit library has been built, this - is the default. If the 32-bit library has not been built, + -32 If the 32-bit library has been built, this option causes it + to be used. If only the 32-bit library has been built, this + is the default. If the 32-bit library has not been built, this option causes an error. - -b Behave as if each pattern has the /fullbincode modifier; the + -ac Behave as if each pattern has the auto_callout modifier, that + is, insert automatic callouts into every pattern that is com- + piled. + + -b Behave as if each pattern has the fullbincode modifier; the full internal binary form of the pattern is output after com- pilation. - -C Output the version number of the PCRE2 library, and all - available information about the optional features that are - included, and then exit with zero exit code. All other + -C Output the version number of the PCRE2 library, and all + available information about the optional features that are + included, and then exit with zero exit code. All other options are ignored. - -C option Output information about a specific build-time option, then - exit. This functionality is intended for use in scripts such - as RunTest. The following options output the value and set + -C option Output information about a specific build-time option, then + exit. This functionality is intended for use in scripts such + as RunTest. 
The following options output the value and set the exit code as indicated: ebcdic-nl the code for LF (= NL) in an EBCDIC environment: @@ -116,34 +147,42 @@ COMMAND LINE OPTIONS ANYCRLF or ANY exit code is always 0 - The following options output 1 for true or 0 for false, and + The following options output 1 for true or 0 for false, and set the exit code to the same value: - ebcdic compiled for an EBCDIC environment - jit just-in-time support is available - pcre2-16 the 16-bit library was built - pcre2-32 the 32-bit library was built - pcre2-8 the 8-bit library was built - unicode Unicode support is available + backslash-C \C is supported (not locked out) + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre2-16 the 16-bit library was built + pcre2-32 the 32-bit library was built + pcre2-8 the 8-bit library was built + unicode Unicode support is available - If an unknown option is given, an error message is output; + If an unknown option is given, an error message is output; the exit code is 0. - -d Behave as if each pattern has the debug modifier; the inter- + -d Behave as if each pattern has the debug modifier; the inter- nal form and information about the compiled pattern is output after compilation; -d is equivalent to -b -i. -dfa Behave as if each subject line has the dfa modifier; matching - is done using the pcre2_dfa_match() function instead of the + is done using the pcre2_dfa_match() function instead of the default pcre2_match(). + -error number[,number,...] + Call pcre2_get_error_message() for each of the error numbers + in the comma-separated list, display the resulting messages + on the standard output, then exit with zero exit code. The + numbers may be positive or negative. This is a convenience + facility for PCRE2 maintainers. + -help Output a brief summary these options and then exit. 
- -i Behave as if each pattern has the /info modifier; information + -i Behave as if each pattern has the info modifier; information about the compiled pattern is given after compilation. - -jit Behave as if each pattern line has the jit modifier; after - successful compilation, each pattern is passed to the just- + -jit Behave as if each pattern line has the jit modifier; after + successful compilation, each pattern is passed to the just- in-time compiler, if available. -pattern modifier-list @@ -152,25 +191,25 @@ COMMAND LINE OPTIONS -q Do not output the version number of pcre2test at the start of execution. - -S size On Unix-like systems, set the size of the run-time stack to + -S size On Unix-like systems, set the size of the run-time stack to size megabytes. -subject modifier-list Behave as if each subject line contains the given modifiers. - -t Run each compile and match many times with a timer, and out- - put the resulting times per compile or match. When JIT is - used, separate times are given for the initial compile and - the JIT compile. You can control the number of iterations - that are used for timing by following -t with a number (as a - separate item on the command line). For example, "-t 1000" + -t Run each compile and match many times with a timer, and out- + put the resulting times per compile or match. When JIT is + used, separate times are given for the initial compile and + the JIT compile. You can control the number of iterations + that are used for timing by following -t with a number (as a + separate item on the command line). For example, "-t 1000" iterates 1000 times. The default is to iterate 500,000 times. -tm This is like -t except that it times only the matching phase, not the compile phase. 
- -T -TM These behave like -t and -tm, but in addition, at the end of - a run, the total times for all compiles and matches are out- + -T -TM These behave like -t and -tm, but in addition, at the end of + a run, the total times for all compiles and matches are out- put. -version Output the PCRE2 version number and then exit. @@ -178,38 +217,39 @@ COMMAND LINE OPTIONS DESCRIPTION - If pcre2test is given two filename arguments, it reads from the first + If pcre2test is given two filename arguments, it reads from the first and writes to the second. If the first name is "-", input is taken from - the standard input. If pcre2test is given only one argument, it reads + the standard input. If pcre2test is given only one argument, it reads from that file and writes to stdout. Otherwise, it reads from stdin and writes to stdout. - When pcre2test is built, a configuration option can specify that it - should be linked with the libreadline or libedit library. When this is - done, if the input is from a terminal, it is read using the readline() + When pcre2test is built, a configuration option can specify that it + should be linked with the libreadline or libedit library. When this is + done, if the input is from a terminal, it is read using the readline() function. This provides line-editing and history facilities. The output from the -help option states whether or not readline() will be used. - The program handles any number of tests, each of which consists of a - set of input lines. Each set starts with a regular expression pattern, + The program handles any number of tests, each of which consists of a + set of input lines. Each set starts with a regular expression pattern, followed by any number of subject lines to be matched against that pat- tern. In between sets of test data, command lines that begin with # may appear. 
This file format, with some restrictions, can also be processed - by the perltest.sh script that is distributed with PCRE2 as a means of + by the perltest.sh script that is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 and Perl is the same. When the input is a terminal, pcre2test prompts for each line of input, - using "re>" to prompt for regular expression patterns, and "data>" to - prompt for subject lines. Command lines starting with # can be entered + using "re>" to prompt for regular expression patterns, and "data>" to + prompt for subject lines. Command lines starting with # can be entered only in response to the "re>" prompt. - Each subject line is matched separately and independently. If you want + Each subject line is matched separately and independently. If you want to do multi-line matches, you have to use the \n escape sequence (or \r - or \r\n, etc., depending on the newline setting) in a single line of - input to encode the newline sequences. There is no limit on the length - of subject lines; the input buffer is automatically extended if it is - too small. There is a replication feature that makes it possible to - generate long subject lines without having to supply them explicitly. + or \r\n, etc., depending on the newline setting) in a single line of + input to encode the newline sequences. There is no limit on the length + of subject lines; the input buffer is automatically extended if it is + too small. There are replication features that makes it possible to + generate long repetitive pattern or subject lines without having to + supply them explicitly. An empty line or the end of the file signals the end of the subject lines for a test, at which point a new pattern or command line is @@ -247,6 +287,36 @@ COMMAND LINES as described in the section entitled "Saving and restoring compiled patterns" below. + #newline_default [ ] + + When PCRE2 is built, a default newline convention can be specified. 
+ This determines which characters and/or character pairs are recognized + as indicating a newline in a pattern or subject string. The default can + be overridden when a pattern is compiled. The standard test files con- + tain tests of various newline conventions, but the majority of the + tests expect a single linefeed to be recognized as a newline by + default. Without special action the tests would fail when PCRE2 is com- + piled with either CR or CRLF as the default newline. + + The #newline_default command specifies a list of newline types that are + acceptable as the default. The types must be one of CR, LF, CRLF, ANY- + CRLF, or ANY (in upper or lower case), for example: + + #newline_default LF Any anyCRLF + + If the default newline is in the list, this command has no effect. Oth- + erwise, except when testing the POSIX API, a newline modifier that + specifies the first newline convention in the list (LF in the above + example) is added to any pattern that does not already have a newline + modifier. If the newline list is empty, the feature is turned off. This + command is present in a number of the standard test input files. + + When the POSIX API is being tested there is no way to override the + default newline convention, though it is possible to set the newline + convention from within the pattern. A warning is given if the posix + modifier is used when #newline_default would set a default for the non- + POSIX API. + #pattern This command sets a default modifier list that applies to all subse- @@ -264,10 +334,11 @@ COMMAND LINES wrong file. #pop [ ] + #popcopy [ ] - This command is used to manipulate the stack of compiled patterns, as - described in the section entitled "Saving and restoring compiled pat- - terns" below. + These commands are used to manipulate the stack of compiled patterns, + as described in the section entitled "Saving and restoring compiled + patterns" below. 
#save @@ -285,12 +356,14 @@ COMMAND LINES MODIFIER SYNTAX Modifier lists are used with both pattern and subject lines. Items in a - list are separated by commas and optional white space. Some modifiers - may be given for both patterns and subject lines, whereas others are - valid for one or the other only. Each modifier has a long name, for - example "anchored", and some of them must be followed by an equals sign - and a value, for example, "offset=12". Modifiers that do not take val- - ues may be preceded by a minus sign to turn off a previous setting. + list are separated by commas followed by optional white space. Trailing + whitespace in a modifier list is ignored. Some modifiers may be given + for both patterns and subject lines, whereas others are valid only for + one or the other. Each modifier has a long name, for example + "anchored", and some of them must be followed by an equals sign and a + value, for example, "offset=12". Values cannot contain comma charac- + ters, but may contain spaces. Modifiers that do not take values may be + preceded by a minus sign to turn off a previous setting. A few of the more common modifiers can also be specified as single let- ters, for example "i" for "caseless". In documentation, following the @@ -400,6 +473,13 @@ SUBJECT LINE SYNTAX abc\=notbol,notempty + If the subject string is empty and \= is followed by whitespace, the + line is treated as a comment line, and is not used for matching. For + example: + + \= This is a comment. + abc\= This is an invalid modifier list. + A backslash followed by any other non-alphanumeric character just escapes that character. A backslash followed by anything else causes an error. However, if the very last character in the line is a backslash @@ -410,20 +490,21 @@ SUBJECT LINE SYNTAX PATTERN MODIFIERS - There are three types of modifier that can appear in pattern lines, two - of which may also be used in a #pattern command. 
A pattern's modifier - list can add to or override default modifiers that were set by a previ- - ous #pattern command. + There are several types of modifier that can appear in pattern lines. + Except where noted below, they may also be used in #pattern commands. A + pattern's modifier list can add to or override default modifiers that + were set by a previous #pattern command. Setting compilation options The following modifiers set options for pcre2_compile(). The most com- - mon ones have single-letter abbreviations. See pcreapi for a descrip- + mon ones have single-letter abbreviations. See pcre2api for a descrip- tion of their effects. allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED auto_callout set PCRE2_AUTO_CALLOUT /i caseless set PCRE2_CASELESS @@ -444,12 +525,15 @@ PATTERN MODIFIERS no_utf_check set PCRE2_NO_UTF_CHECK ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY + use_offset_limit set PCRE2_USE_OFFSET_LIMIT utf set PCRE2_UTF As well as turning on the PCRE2_UTF option, the utf modifier causes all non-printing characters in output strings to be printed using the \x{hh...} notation. Otherwise, those less than 0x100 are output in hex - without the curly brackets. + without the curly brackets. Setting utf in 16-bit or 32-bit mode also + causes pattern and subject strings to be translated to UTF-16 or + UTF-32, respectively, before being passed to library functions. 
Setting compilation controls @@ -462,18 +546,24 @@ PATTERN MODIFIERS debug same as info,fullbincode fullbincode show binary code with lengths /I info show info about compiled pattern - hex pattern is coded in hexadecimal + hex unquoted characters are hexadecimal jit[= ] use JIT jitfast use JIT fast path jitverify verify JIT use locale= use this locale + max_pattern_length= set the maximum pattern length memory show memory used newline= set newline type + null_context compile with a NULL context parens_nest_limit= set maximum parentheses depth posix use the POSIX API + posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack + pushcopy push a copy onto the stack stackguard= test the stackguard feature tables=[0|1|2] select internal tables + use_length do not zero-terminate the pattern + utf8_input treat input as UTF-8 The effects of these modifiers are described in the following sections. @@ -539,39 +629,129 @@ PATTERN MODIFIERS mation that is requested. For each callout, either its number or string is given, followed by the item that follows it in the pattern. - Specifying a pattern in hex + Passing a NULL context - The hex modifier specifies that the characters of the pattern are to be - interpreted as pairs of hexadecimal digits. White space is permitted - between pairs. For example: + Normally, pcre2test passes a context block to pcre2_compile(). If the + null_context modifier is set, however, NULL is passed. This is for + testing that pcre2_compile() behaves correctly in this case (it uses + default values). + + Specifying the pattern's length + + By default, patterns are passed to the compiling functions as zero-ter- + minated strings. When using the POSIX wrapper API, there is no other + option. However, when using PCRE2's native API, patterns can be passed + by length instead of being zero-terminated. The use_length modifier + causes this to happen. 
Using a length happens automatically (whether + or not use_length is set) when hex is set, because patterns specified + in hexadecimal may contain binary zeros. + + Specifying pattern characters in hexadecimal + + The hex modifier specifies that the characters of the pattern, except + for substrings enclosed in single or double quotes, are to be inter- + preted as pairs of hexadecimal digits. This feature is provided as a + way of creating patterns that contain binary zeros and other non-print- + ing characters. White space is permitted between pairs of digits. For + example, this pattern contains three characters: /ab 32 59/hex - This feature is provided as a way of creating patterns that contain - binary zero and other non-printing characters. By default, pcre2test - passes patterns as zero-terminated strings to pcre2_compile(), giving - the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in - hexadecimal, the actual length of the pattern is passed. + Parts of such a pattern are taken literally if quoted. This pattern + contains nine characters, only two of which are specified in hexadeci- + mal: + + /ab "literal" 32/hex + + Either single or double quotes may be used. There is no way of includ- + ing the delimiter within a substring. The hex and expand modifiers are + mutually exclusive. + + The POSIX API cannot be used with patterns specified in hexadecimal + because they may contain binary zeros, which conflicts with regcomp()'s + requirement for a zero-terminated string. Such patterns are always + passed to pcre2_compile() as a string with a length, not as zero-termi- + nated. + + Specifying wide characters in 16-bit and 32-bit modes + + In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 + and translated to UTF-16 or UTF-32 when the utf modifier is set. For + testing the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input + modifier can be used. It is mutually exclusive with utf. 
Input lines + are interpreted as UTF-8 as a means of specifying wide characters. More + details are given in "Input encoding" above. + + Generating long repetitive patterns + + Some tests use long patterns that are very repetitive. Instead of cre- + ating a very long input line for such a pattern, you can use a special + repetition feature, similar to the one described for subject lines + above. If the expand modifier is present on a pattern, parts of the + pattern that have the form + + \[ ]{ } + + are expanded before the pattern is passed to pcre2_compile(). For exam- + ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction + cannot be nested. An initial "\[" sequence is recognized only if "]{" + followed by decimal digits and "}" is found later in the pattern. If + not, the characters remain in the pattern unaltered. The expand and hex + modifiers are mutually exclusive. + + If part of an expanded pattern looks like an expansion, but is really + part of the actual pattern, unwanted expansion can be avoided by giving + two values in the quantifier. For example, \[AB]{6000,6000} is not rec- + ognized as an expansion item. + + If the info modifier is set on an expanded pattern, the result of the + expansion is included in the information that is output. JIT compilation - The /jit modifier may optionally be followed by an equals sign and a - number in the range 0 to 7: + Just-in-time (JIT) compiling is a heavyweight optimization that can + greatly speed up pattern matching. See the pcre2jit documentation for + details. JIT compiling happens, optionally, after a pattern has been + successfully compiled into an internal form. The JIT compiler converts + this to optimized machine code. It needs to know whether the match-time + options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, + because different code is generated for the different cases. 
See the + partial modifier in "Subject Modifiers" below for details of how these + options are specified for each match attempt. + + JIT compilation is requested by the /jit pattern modifier, which may + optionally be followed by an equals sign and a number in the range 0 to + 7. The three bits that make up the number specify which of the three + JIT operating modes are to be compiled: + + 1 compile JIT code for non-partial matching + 2 compile JIT code for soft partial matching + 4 compile JIT code for hard partial matching + + The possible values for the jit modifier are therefore: 0 disable JIT - 1 use JIT for normal match only - 2 use JIT for soft partial match only - 3 use JIT for normal match and soft partial match - 4 use JIT for hard partial match only - 6 use JIT for soft and hard partial match + 1 normal matching only + 2 soft partial matching only + 3 normal and soft partial matching + 4 hard partial matching only + 6 soft and hard partial matching only 7 all three modes - If no number is given, 7 is assumed. If JIT compilation is successful, - the compiled JIT code will automatically be used when pcre2_match() is - run for the appropriate type of match, except when incompatible run- - time options are specified. For more details, see the pcre2jit documen- - tation. See also the jitstack modifier below for a way of setting the - size of the JIT stack. + If no number is given, 7 is assumed. The phrase "partial matching" + means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the + PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- + plete match; the options enable the possibility of a partial match, but + do not require it. Note also that if you request JIT compilation only + for partial matching (for example, /jit=2) but do not set the partial + modifier on a subject line, that match will not use JIT code because + none was compiled for non-partial matching. 
+ + If JIT compilation is successful, the compiled JIT code will automati- + cally be used when an appropriate type of match is run, except when + incompatible run-time options are specified. For more details, see the + pcre2jit documentation. See also the jitstack modifier below for a way + of setting the size of the JIT stack. If the jitfast modifier is specified, matching is done using the JIT "fast path" interface, pcre2_jit_match(), which skips some of the san- @@ -588,24 +768,24 @@ PATTERN MODIFIERS Setting a locale - The /locale modifier must specify the name of a locale, for example: + The locale modifier must specify the name of a locale, for example: /pattern/locale=fr_FR The given locale is set, pcre2_maketables() is called to build a set of character tables for the locale, and this is then passed to pcre2_com- pile() when compiling the regular expression. The same tables are used - when matching the following subject lines. The /locale modifier applies + when matching the following subject lines. The locale modifier applies only to the pattern on which it appears, but can be given in a #pattern command if a default is needed. Setting a locale and alternate charac- ter tables are mutually exclusive. Showing pattern memory - The /memory modifier causes the size in bytes of the memory used to - hold the compiled pattern to be output. This does not include the size - of the pcre2_code block; it is just the actual compiled data. If the - pattern is subsequently passed to the JIT compiler, the size of the JIT + The memory modifier causes the size in bytes of the memory used to hold + the compiled pattern to be output. This does not include the size of + the pcre2_code block; it is just the actual compiled data. If the pat- + tern is subsequently passed to the JIT compiler, the size of the JIT compiled code is also output. 
Here is an example: re> /a(b)c/jit,memory @@ -621,39 +801,59 @@ PATTERN MODIFIERS pcre2test sets its own default of 220, which is required for running the standard test suite. + Limiting the pattern length + + The max_pattern_length modifier sets a limit, in code units, to the + length of pattern that pcre2_compile() will accept. Breaching the limit + causes a compilation error. The default is the largest number a + PCRE2_SIZE variable can hold (essentially unlimited). + Using the POSIX wrapper API - The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap- - per API rather than its native API. This supports only the 8-bit - library. When the POSIX API is being used, the following pattern modi- - fiers set options for the regcomp() function: + The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via + the POSIX wrapper API rather than its native API. When posix_nosub is + used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX + wrapper supports only the 8-bit library. Note that it does not imply + POSIX matching semantics; for more detail see the pcre2posix documenta- + tion. The following pattern modifiers set options for the regcomp() + function: caseless REG_ICASE multiline REG_NEWLINE - no_auto_capture REG_NOSUB dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard utf REG_UTF8 ) + The regerror_buffsize modifier specifies a size for the error buffer + that is passed to regerror() in the event of a compilation error. For + example: + + /abc/posix,regerror_buffsize=20 + + This provides a means of testing the behaviour of regerror() when the + buffer is too small for the error message. If this modifier has not + been set, a large buffer is used. + The aftertext and allaftertext subject modifiers work as described - below. All other modifiers cause an error. + below. All other modifiers are either ignored, with a warning message, + or cause an error. 
Testing the stack guard feature - The /stackguard modifier is used to test the use of pcre2_set_com- - pile_recursion_guard(), a function that is provided to enable stack - availability to be checked during compilation (see the pcre2api docu- - mentation for details). If the number specified by the modifier is + The stackguard modifier is used to test the use of pcre2_set_com- + pile_recursion_guard(), a function that is provided to enable stack + availability to be checked during compilation (see the pcre2api docu- + mentation for details). If the number specified by the modifier is greater than zero, pcre2_set_compile_recursion_guard() is called to set - up callback from pcre2_compile() to a local function. The argument it - receives is the current nesting parenthesis depth; if this is greater + up callback from pcre2_compile() to a local function. The argument it + receives is the current nesting parenthesis depth; if this is greater than the value given by the modifier, non-zero is returned, causing the compilation to be aborted. Using alternative character tables - The value specified for the /tables modifier must be one of the digits + The value specified for the tables modifier must be one of the digits 0, 1, or 2. It causes a specific set of built-in character tables to be passed to pcre2_compile(). This is used in the PCRE2 tests to check be- haviour with different character tables. The digit specifies the tables @@ -664,25 +864,30 @@ PATTERN MODIFIERS pcre2_chartables.c.dist 2 a set of tables defining ISO 8859 characters - In table 2, some characters whose codes are greater than 128 are iden- - tified as letters, digits, spaces, etc. Setting alternate character + In table 2, some characters whose codes are greater than 128 are iden- + tified as letters, digits, spaces, etc. Setting alternate character tables and a locale are mutually exclusive. Setting certain match controls The following modifiers are really subject modifiers, and are described - below. 
However, they may be included in a pattern's modifier list, in - which case they are applied to every subject line that is processed - with that pattern. They do not affect the compilation process. + below. However, they may be included in a pattern's modifier list, in + which case they are applied to every subject line that is processed + with that pattern. They may not appear in #pattern commands. These mod- + ifiers do not affect the compilation process. - aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text - /g global global matching - mark show mark values - replace= specify a replacement string - startchar show starting character when relevant + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text + /g global global matching + mark show mark values + replace= specify a replacement string + startchar show starting character when relevant + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY These modifiers may not appear in a #pattern command. If you want them as defaults, set them in a #subject command. @@ -694,11 +899,15 @@ PATTERN MODIFIERS next line to contain a new pattern (or a command) instead of a subject line. This facility is used when saving compiled patterns to a file, as described in the section entitled "Saving and restoring compiled pat- - terns" below. The push modifier is incompatible with compilation modi- - fiers such as global that act at match time. Any that are specified are - ignored, with a warning message, except for replace, which causes an - error. Note that, jitverify, which is allowed, does not carry through - to any subsequent matching that uses this pattern. + terns" below. 
If pushcopy is used instead of push, a copy of the com- + piled pattern is stacked, leaving the original as current, ready to + match the following input lines. This provides a way of testing the + pcre2_code_copy() function. The push and pushcopy modifiers are + incompatible with compilation modifiers such as global that act at + match time. Any that are specified are ignored (for the stacked copy), + with a warning message, except for replace, which causes an error. Note + that jitverify, which is allowed, does not carry through to any subse- + quent matching that uses a stacked pattern. SUBJECT MODIFIERS @@ -714,6 +923,7 @@ SUBJECT MODIFIERS anchored set PCRE2_ANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST + no_jit set PCRE2_NO_JIT no_utf_check set PCRE2_NO_UTF_CHECK notbol set PCRE2_NOTBOL notempty set PCRE2_NOTEMPTY @@ -725,11 +935,11 @@ SUBJECT MODIFIERS The partial matching modifiers are provided with abbreviations because they appear frequently in tests. - If the /posix modifier was present on the pattern, causing the POSIX + If the posix modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are notbol, notempty, and noteol, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). - Any other modifiers cause an error. + The other modifiers are ignored, with a warning message. Setting match controls @@ -738,53 +948,64 @@ SUBJECT MODIFIERS line (see above), in which case they apply to every subject line that is matched against that pattern. 
- aftertext show text after match - allaftertext show text after captures - allcaptures show all captures - allusedtext show all consulted text (non-JIT only) - altglobal alternative global matching - callout_capture show captures at callout time - callout_data= set a value to pass via callouts - callout_fail= [: ] control callout failure - callout_none do not supply a callout function - copy= copy captured substring - dfa use pcre2_dfa_match() - find_limits find match and recursion limits - get= extract captured substring - getall extract all captured substrings - /g global global matching - jitstack= set size of JIT stack - mark show mark values - match_limit=>n> set a match limit - memory show memory usage - offset= set starting offset - ovector= set size of output vector - recursion_limit= set a recursion limit - replace= specify a replacement string - startchar show startchar when relevant - zero_terminate pass the subject as zero-terminated + aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allusedtext show all consulted text (non-JIT only) + altglobal alternative global matching + callout_capture show captures at callout time + callout_data= set a value to pass via callouts + callout_error= [: ] control callout error + callout_fail= [: ] control callout failure + callout_none do not supply a callout function + copy= copy captured substring + dfa use pcre2_dfa_match() + find_limits find match and recursion limits + get= extract captured substring + getall extract all captured substrings + /g global global matching + jitstack= set size of JIT stack + mark show mark values + match_limit= set a match limit + memory show memory usage + null_context match with a NULL context + offset= set starting offset + offset_limit= set offset limit + ovector= set size of output vector + recursion_limit= set a recursion limit + replace= specify a replacement string + startchar show startchar when relevant + startoffset= 
same as offset= + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY + zero_terminate pass the subject as zero-terminated The effects of these modifiers are described in the following sections. + When matching via the POSIX wrapper API, the aftertext, allaftertext, + and ovector subject modifiers work as described below. All other modi- + fiers are either ignored, with a warning message, or cause an error. Showing more text - The aftertext modifier requests that as well as outputting the part of + The aftertext modifier requests that as well as outputting the part of the subject string that matched the entire pattern, pcre2test should in addition output the remainder of the subject string. This is useful for tests where the subject contains multiple copies of the same substring. - The allaftertext modifier requests the same action for captured sub- + The allaftertext modifier requests the same action for captured sub- strings as well as the main matched substring. In each case the remain- der is output on the following line with a plus character following the capture number. - The allusedtext modifier requests that all the text that was consulted - during a successful pattern match by the interpreter should be shown. - This feature is not supported for JIT matching, and if requested with - JIT it is ignored (with a warning message). Setting this modifier + The allusedtext modifier requests that all the text that was consulted + during a successful pattern match by the interpreter should be shown. + This feature is not supported for JIT matching, and if requested with + JIT it is ignored (with a warning message). Setting this modifier affects the output if there is a lookbehind at the start of a match, or - a lookahead at the end, or if \K is used in the pattern. 
Characters - that precede or follow the start and end of the actual match are indi- - cated in the output by '<' or '>' characters underneath them. Here is + a lookahead at the end, or if \K is used in the pattern. Characters + that precede or follow the start and end of the actual match are indi- + cated in the output by '<' or '>' characters underneath them. Here is an example: re> /(?<=pqr)abc(?=xyz)/ @@ -792,16 +1013,16 @@ SUBJECT MODIFIERS 0: pqrabcxyz <<< >>> - This shows that the matched string is "abc", with the preceding and - following strings "pqr" and "xyz" having been consulted during the + This shows that the matched string is "abc", with the preceding and + following strings "pqr" and "xyz" having been consulted during the match (when processing the assertions). - The startchar modifier requests that the starting character for the - match be indicated, if it is different to the start of the matched + The startchar modifier requests that the starting character for the + match be indicated, if it is different to the start of the matched string. The only time when this occurs is when \K has been processed as part of the match. In this situation, the output for the matched string - is displayed from the starting character instead of from the match - point, with circumflex characters under the earlier characters. For + is displayed from the starting character instead of from the match + point, with circumflex characters under the earlier characters. For example: re> /abc\Kxyz/ @@ -809,7 +1030,7 @@ SUBJECT MODIFIERS 0: abcxyz ^^^ - Unlike allusedtext, the startchar modifier can be used with JIT. How- + Unlike allusedtext, the startchar modifier can be used with JIT. How- ever, these two modifiers are mutually exclusive. Showing the value of all capture groups @@ -817,89 +1038,110 @@ SUBJECT MODIFIERS The allcaptures modifier requests that the values of all potential cap- tured parentheses be output after a match. 
By default, only those up to the highest one actually used in the match are output (corresponding to - the return code from pcre2_match()). Groups that did not take part in - the match are output as " ". + the return code from pcre2_match()). Groups that did not take part in + the match are output as " ". This modifier is not relevant for + DFA matching (which does no capturing); it is ignored, with a warning + message, if present. Testing callouts - A callout function is supplied when pcre2test calls the library match- - ing functions, unless callout_none is specified. If callout_capture is - set, the current captured groups are output when a callout occurs. + A callout function is supplied when pcre2test calls the library match- + ing functions, unless callout_none is specified. If callout_capture is + set, the current captured groups are output when a callout occurs. The + default return from the callout function is zero, which allows matching + to continue. - The callout_fail modifier can be given one or two numbers. If there is - only one number, 1 is returned instead of 0 when a callout of that num- - ber is reached. If two numbers are given, 1 is returned when callout - is reached for the th time. Note that callouts with string argu- - ments are always given the number zero. See "Callouts" below for a - description of the output when a callout it taken. + The callout_fail modifier can be given one or two numbers. If there is + only one number, 1 is returned instead of 0 (causing matching to back- + track) when a callout of that number is reached. If two numbers + ( : ) are given, 1 is returned when callout is reached and + there have been at least callouts. The callout_error modifier is + similar, except that PCRE2_ERROR_CALLOUT is returned, causing the + entire matching process to be aborted. If both these modifiers are set + for the same callout number, callout_error takes precedence. 
- The callout_data modifier can be given an unsigned or a negative num- - ber. This is set as the "user data" that is passed to the matching - function, and passed back when the callout function is invoked. Any - value other than zero is used as a return from pcre2test's callout + Note that callouts with string arguments are always given the number + zero. See "Callouts" below for a description of the output when a call- + out is taken. + + The callout_data modifier can be given an unsigned or a negative num- + ber. This is set as the "user data" that is passed to the matching + function, and passed back when the callout function is invoked. Any + value other than zero is used as a return from pcre2test's callout function. Finding all matches in a string Searching for all possible matches within a subject can be requested by - the global or /altglobal modifier. After finding a match, the matching - function is called again to search the remainder of the subject. The - difference between global and altglobal is that the former uses the - start_offset argument to pcre2_match() or pcre2_dfa_match() to start - searching at a new point within the entire string (which is what Perl + the global or altglobal modifier. After finding a match, the matching + function is called again to search the remainder of the subject. The + difference between global and altglobal is that the former uses the + start_offset argument to pcre2_match() or pcre2_dfa_match() to start + searching at a new point within the entire string (which is what Perl does), whereas the latter passes over a shortened subject. This makes a difference to the matching process if the pattern begins with a lookbe- hind assertion (including \b or \B). - If an empty string is matched, the next match is done with the + If an empty string is matched, the next match is done with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for another, non-empty, match at the same point in the subject. 
If this - match fails, the start offset is advanced, and the normal match is - retried. This imitates the way Perl handles such cases when using the - /g modifier or the split() function. Normally, the start offset is - advanced by one character, but if the newline convention recognizes - CRLF as a newline, and the current character is CR followed by LF, an + match fails, the start offset is advanced, and the normal match is + retried. This imitates the way Perl handles such cases when using the + /g modifier or the split() function. Normally, the start offset is + advanced by one character, but if the newline convention recognizes + CRLF as a newline, and the current character is CR followed by LF, an advance of two characters occurs. Testing substring extraction functions - The copy and get modifiers can be used to test the pcre2_sub- + The copy and get modifiers can be used to test the pcre2_sub- string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be - given more than once, and each can specify a group name or number, for + given more than once, and each can specify a group name or number, for example: abcd\=copy=1,copy=3,get=G1 - If the #subject command is used to set default copy and/or get lists, - these can be unset by specifying a negative number to cancel all num- + If the #subject command is used to set default copy and/or get lists, + these can be unset by specifying a negative number to cancel all num- bered groups and an empty name to cancel all named groups. - The getall modifier tests pcre2_substring_list_get(), which extracts + The getall modifier tests pcre2_substring_list_get(), which extracts all captured substrings. - If the subject line is successfully matched, the substrings extracted - by the convenience functions are output with C, G, or L after the - string number instead of a colon. This is in addition to the normal - full list. 
The string length (that is, the return from the extraction + If the subject line is successfully matched, the substrings extracted + by the convenience functions are output with C, G, or L after the + string number instead of a colon. This is in addition to the normal + full list. The string length (that is, the return from the extraction function) is given in parentheses after each substring, followed by the name when the extraction was by name. Testing the substitution function - If the replace modifier is set, the pcre2_substitute() function is - called instead of one of the matching functions. Unlike subject - strings, pcre2test does not process replacement strings for escape - sequences. In UTF mode, a replacement string is checked to see if it is - a valid UTF-8 string. If so, it is correctly converted to a UTF string - of the appropriate code unit width. If it is not a valid UTF-8 string, - the individual code units are copied directly. This provides a means of - passing an invalid UTF-8 string for testing purposes. + If the replace modifier is set, the pcre2_substitute() function is + called instead of one of the matching functions. Note that replacement + strings cannot contain commas, because a comma signifies the end of a + modifier. This is not thought to be an issue in a test program. - If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to - pcre2_substitute(). After a successful substitution, the modified - string is output, preceded by the number of replacements. This may be - zero if there were no matches. Here is a simple example of a substitu- - tion test: + Unlike subject strings, pcre2test does not process replacement strings + for escape sequences. In UTF mode, a replacement string is checked to + see if it is a valid UTF-8 string. If so, it is correctly converted to + a UTF string of the appropriate code unit width. If it is not a valid + UTF-8 string, the individual code units are copied directly. 
This pro- + vides a means of passing an invalid UTF-8 string for testing purposes. + + The following modifiers set options (in addition to the normal match + options) for pcre2_substitute(): + + global PCRE2_SUBSTITUTE_GLOBAL + substitute_extended PCRE2_SUBSTITUTE_EXTENDED + substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY + + + After a successful substitution, the modified string is output, pre- + ceded by the number of replacements. This may be zero if there were no + matches. Here is a simple example of a substitution test: /abc/replace=xxx =abc=abc= @@ -907,12 +1149,13 @@ SUBJECT MODIFIERS =abc=abc=\=global 2: =xxx=xxx= - Subject and replacement strings should be kept relatively short for - substitution tests, as fixed-size buffers are used. To make it easy to - test for buffer overflow, if the replacement string starts with a num- - ber in square brackets, that number is passed to pcre2_substitute() as - the size of the output buffer, with the replacement string starting at - the next character. Here is an example that tests the edge case: + Subject and replacement strings should be kept relatively short (fewer + than 256 characters) for substitution tests, as fixed-size buffers are + used. To make it easy to test for buffer overflow, if the replacement + string starts with a number in square brackets, that number is passed + to pcre2_substitute() as the size of the output buffer, with the + replacement string starting at the next character. Here is an example + that tests the edge case: /abc/ 123abc123\=replace=[10]XYZ @@ -920,91 +1163,121 @@ SUBJECT MODIFIERS 123abc123\=replace=[9]XYZ Failed: error -47: no more memory + The default action of pcre2_substitute() is to return + PCRE2_ERROR_NOMEMORY when the output buffer is too small. 
However, if + the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub- + stitute_overflow_length modifier), pcre2_substitute() continues to go + through the motions of matching and substituting, in order to compute + the size of buffer that is required. When this happens, pcre2test shows + the required buffer length (which includes space for the trailing zero) + as part of the error message. For example: + + /abc/substitute_overflow_length + 123abc123\=replace=[9]XYZ + Failed: error -47: no more memory: 10 code units are needed + A replacement string is ignored with POSIX and DFA matching. Specifying - partial matching provokes an error return ("bad option value") from + partial matching provokes an error return ("bad option value") from pcre2_substitute(). Setting the JIT stack size - The jitstack modifier provides a way of setting the maximum stack size - that is used by the just-in-time optimization code. It is ignored if + The jitstack modifier provides a way of setting the maximum stack size + that is used by the just-in-time optimization code. It is ignored if JIT optimization is not being used. The value is a number of kilobytes. Providing a stack that is larger than the default 32K is necessary only for very complicated patterns. Setting match and recursion limits - The match_limit and recursion_limit modifiers set the appropriate lim- + The match_limit and recursion_limit modifiers set the appropriate lim- its in the match context. These values are ignored when the find_limits modifier is specified. 
Finding minimum limits - If the find_limits modifier is present, pcre2test calls pcre2_match() - several times, setting different values in the match context via - pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds - the minimum values for each parameter that allow pcre2_match() to com- + If the find_limits modifier is present, pcre2test calls pcre2_match() + several times, setting different values in the match context via + pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds + the minimum values for each parameter that allow pcre2_match() to com- plete without error. If JIT is being used, only the match limit is relevant. If DFA matching - is being used, neither limit is relevant, and this modifier is ignored + is being used, neither limit is relevant, and this modifier is ignored (with a warning message). - The match_limit number is a measure of the amount of backtracking that - takes place, and learning the minimum value can be instructive. For - most simple matches, the number is quite small, but for patterns with - very large numbers of matching possibilities, it can become large very - quickly with increasing length of subject string. The - match_limit_recursion number is a measure of how much stack (or, if - PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to + The match_limit number is a measure of the amount of backtracking that + takes place, and learning the minimum value can be instructive. For + most simple matches, the number is quite small, but for patterns with + very large numbers of matching possibilities, it can become large very + quickly with increasing length of subject string. The + match_limit_recursion number is a measure of how much stack (or, if + PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to complete the match attempt. 
Showing MARK names The mark modifier causes the names from backtracking control verbs that - are returned from calls to pcre2_match() to be displayed. If a mark is - returned for a match, non-match, or partial match, pcre2test shows it. - For a match, it is on a line by itself, tagged with "MK:". Otherwise, + are returned from calls to pcre2_match() to be displayed. If a mark is + returned for a match, non-match, or partial match, pcre2test shows it. + For a match, it is on a line by itself, tagged with "MK:". Otherwise, it is added to the non-match message. Showing memory usage - The memory modifier causes pcre2test to log all memory allocation and + The memory modifier causes pcre2test to log all memory allocation and freeing calls that occur during a match operation. Setting a starting offset - The offset modifier sets an offset in the subject string at which + The offset modifier sets an offset in the subject string at which matching starts. Its value is a number of code units, not characters. + Setting an offset limit + + The offset_limit modifier sets a limit for unanchored matches. If a + match cannot be found starting at or before this offset in the subject, + a "no match" return is given. The data value is a number of code units, + not characters. When this modifier is used, the use_offset_limit modi- + fier must have been set for the pattern; if not, an error is generated. + Setting the size of the output vector - The ovector modifier applies only to the subject line in which it - appears, though of course it can also be used to set a default in a - #subject command. It specifies the number of pairs of offsets that are + The ovector modifier applies only to the subject line in which it + appears, though of course it can also be used to set a default in a + #subject command. It specifies the number of pairs of offsets that are available for storing matching information. The default is 15. 
- A value of zero is useful when testing the POSIX API because it causes + A value of zero is useful when testing the POSIX API because it causes regexec() to be called with a NULL capture vector. When not testing the - POSIX API, a value of zero is used to cause pcre2_match_data_cre- - ate_from_pattern() to be called, in order to create a match block of + POSIX API, a value of zero is used to cause pcre2_match_data_cre- + ate_from_pattern() to be called, in order to create a match block of exactly the right size for the pattern. (It is not possible to create a - match block with a zero-length ovector; there is always at least one + match block with a zero-length ovector; there is always at least one pair of offsets.) Passing the subject as zero-terminated By default, the subject string is passed to a native API matching func- tion with its correct length. In order to test the facility for passing - a zero-terminated string, the zero_terminate modifier is provided. It + a zero-terminated string, the zero_terminate modifier is provided. It causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching - via the POSIX interface, this modifier has no effect, as there is no + via the POSIX interface, this modifier has no effect, as there is no facility for passing a length.) - When testing pcre2_substitute(), this modifier also has the effect of + When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated. + Passing a NULL context + + Normally, pcre2test passes a context block to pcre2_match(), + pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is + set, however, NULL is passed. This is for testing that the matching + functions behave correctly in this case (they use default values). This + modifier cannot be used with the find_limits modifier or when testing + the substitution function. 
+ THE ALTERNATIVE MATCHING FUNCTION @@ -1069,7 +1342,7 @@ DEFAULT OUTPUT FROM pcre2test If the strings contain any non-printing characters, they are output as \xhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the defi- - nition of non-printing characters. If the /aftertext modifier is set, + nition of non-printing characters. If the aftertext modifier is set, the output for substring 0 is followed by the the rest of the subject string, identified by "0+" like this: @@ -1188,10 +1461,11 @@ CALLOUTS attempt starting at the fourth character of the subject string, when the pointer was at the seventh character, and when the next pattern item was \d. Just one circumflex is output if the start and current - positions are the same. + positions are the same, or if the current position precedes the start + position, which can happen if the callout is in a lookbehind assertion. Callouts numbered 255 are assumed to be automatic callouts, inserted as - a result of the /auto_callout pattern modifier. In this case, instead + a result of the /auto_callout pattern modifier. In this case, instead of showing the callout number, the offset in the pattern, preceded by a plus, is output. For example: @@ -1205,7 +1479,7 @@ CALLOUTS 0: E* If a pattern contains (*MARK) items, an additional line is output when- - ever a change of latest mark is passed to the callout function. For + ever a change of latest mark is passed to the callout function. For example: re> /a(*MARK:X)bc/auto_callout @@ -1219,17 +1493,17 @@ CALLOUTS +12 ^ ^ 0: abc - The mark changes between matching "a" and "b", but stays the same for - the rest of the match, so nothing more is output. If, as a result of - backtracking, the mark reverts to being unset, the text " " is + The mark changes between matching "a" and "b", but stays the same for + the rest of the match, so nothing more is output. 
If, as a result of + backtracking, the mark reverts to being unset, the text " " is output. Callouts with string arguments The output for a callout with a string argument is similar, except that - instead of outputting a callout number before the position indicators, - the callout string and its offset in the pattern string are output - before the reflection of the subject string, and the subject string is + instead of outputting a callout number before the position indicators, + the callout string and its offset in the pattern string are output + before the reflection of the subject string, and the subject string is reflected for each callout. For example: re> /^ab(?C'first')cd(?C"second")ef/ @@ -1246,41 +1520,46 @@ CALLOUTS NON-PRINTING CHARACTERS When pcre2test is outputting text in the compiled version of a pattern, - bytes other than 32-126 are always treated as non-printing characters + bytes other than 32-126 are always treated as non-printing characters and are therefore shown as hex escapes. - When pcre2test is outputting text that is a matched part of a subject - string, it behaves in the same way, unless a different locale has been - set for the pattern (using the /locale modifier). In this case, the - isprint() function is used to distinguish printing and non-printing + When pcre2test is outputting text that is a matched part of a subject + string, it behaves in the same way, unless a different locale has been + set for the pattern (using the locale modifier). In this case, the + isprint() function is used to distinguish printing and non-printing characters. SAVING AND RESTORING COMPILED PATTERNS - It is possible to save compiled patterns on disc or elsewhere, and + It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. JIT data cannot - be saved. The host on which the patterns are reloaded must be running + be saved. 
The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also - have the same endianness, pointer width and PCRE2_SIZE type. Before - compiled patterns can be saved they must be serialized, that is, con- - verted to a stream of bytes. A single byte stream may contain any num- - ber of compiled patterns, but they must all use the same character + have the same endianness, pointer width and PCRE2_SIZE type. Before + compiled patterns can be saved they must be serialized, that is, con- + verted to a stream of bytes. A single byte stream may contain any num- + ber of compiled patterns, but they must all use the same character tables. A single copy of the tables is included in the byte stream (its size is 1088 bytes). - The functions whose names begin with pcre2_serialize_ are used for - serializing and de-serializing. They are described in the pcre2serial- + The functions whose names begin with pcre2_serialize_ are used for + serializing and de-serializing. They are described in the pcre2serial- ize documentation. In this section we describe the features of pcre2test that can be used to test these functions. - When a pattern with push modifier is successfully compiled, it is - pushed onto a stack of compiled patterns, and pcre2test expects the - next line to contain a new pattern (or command) instead of a subject - line. By this means, a number of patterns can be compiled and retained. - The push modifier is incompatible with posix, and control modifiers - that act at match time are ignored (with a message). The jitverify mod- - ifier applies only at compile time. The command + When a pattern with push modifier is successfully compiled, it is + pushed onto a stack of compiled patterns, and pcre2test expects the + next line to contain a new pattern (or command) instead of a subject + line. 
By contrast, the pushcopy modifier causes a copy of the compiled + pattern to be stacked, leaving the original available for immediate + matching. By using push and/or pushcopy, a number of patterns can be + compiled and retained. These modifiers are incompatible with posix, and + control modifiers that act at match time are ignored (with a message) + for the stacked patterns. The jitverify modifier applies only at com- + pile time. + + The command #save @@ -1297,9 +1576,10 @@ SAVING AND RESTORING COMPILED PATTERNS matched with the pattern, terminated as usual by an empty line or end of file. This command may be followed by a modifier list containing only control modifiers that act after a pattern has been compiled. In - particular, hex, posix, and push are not allowed, nor are any option- - setting modifiers. The JIT modifiers are, however permitted. Here is - an example that saves and reloads two patterns. + particular, hex, posix, posix_nosub, push, and pushcopy are not + allowed, nor are any option-setting modifiers. The JIT modifiers are, + however, permitted. Here is an example that saves and reloads two pat- + terns. /abc/push /xyz/push @@ -1311,9 +1591,13 @@ SAVING AND RESTORING COMPILED PATTERNS #pop jit,bincode abc - If jitverify is used with #pop, it does not automatically imply jit, + If jitverify is used with #pop, it does not automatically imply jit, which is different behaviour from when it is used on a pattern. + The #popcopy command is analogous to the pushcopy modifier in that it + makes current a copy of the topmost stack pattern, leaving the original + still on the stack. + SEE ALSO @@ -1330,5 +1614,5 @@ AUTHOR REVISION - Last updated: 20 May 2015 - Copyright (c) 1997-2015 University of Cambridge. + Last updated: 28 December 2016 + Copyright (c) 1997-2016 University of Cambridge. 
diff --git a/pcre2/doc/pcre2unicode.3 b/pcre2/doc/pcre2unicode.3 index 6c32bc046..253d4b64d 100644 --- a/pcre2/doc/pcre2unicode.3 +++ b/pcre2/doc/pcre2unicode.3 @@ -1,4 +1,4 @@ -.TH PCRE2UNICODE 3 "23 November 2014" "PCRE2 10.00" +.TH PCRE2UNICODE 3 "03 July 2016" "PCRE2 10.22" .SH NAME PCRE - Perl-compatible regular expressions (revised API) .SH "UNICODE AND UTF SUPPORT" @@ -57,17 +57,21 @@ individual code units. In UTF modes, the dot metacharacter matches one UTF character instead of a single code unit. .P -The escape sequence \eC can be used to match a single code unit, in a UTF mode, +The escape sequence \eC can be used to match a single code unit in a UTF mode, but its use can lead to some strange effects because it breaks up multi-unit characters (see the description of \eC in the .\" HREF \fBpcre2pattern\fP .\" -documentation). The use of \eC is not supported in the alternative matching -function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT -optimization. If JIT optimization is requested for a UTF pattern that contains -\eC, it will not succeed, and so the matching will be carried out by the normal -interpretive function. +documentation). +.P +The use of \eC is not supported by the alternative matching function +\fBpcre2_dfa_match()\fP when in UTF-8 or UTF-16 mode, that is, when a character +may consist of more than one code unit. The use of \eC in these modes provokes +a match-time error. Also, the JIT optimization does not support \eC in these +modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that +contains \eC, it will not succeed, and so when \fBpcre2_match()\fP is called, +the matching will be carried out by the normal interpretive function. 
.P The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test characters of any code value, but, by default, the characters that PCRE2 @@ -117,11 +121,21 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code knows as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting strings to be in host byte order. .P -The entire string is checked before any other processing takes place. In -addition to checking the format of the string, there is a check to ensure that -all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area. -The so-called "non-character" code points are not excluded because Unicode -corrigendum #9 makes it clear that they should not be. +A UTF string is checked before any other processing takes place. In the case of +\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting +offset, the check is applied only to that part of the subject that could be +inspected during matching, and there is a check that the starting offset points +to the first code unit of a character or to the end of the subject. If there +are no lookbehind assertions in the pattern, the check starts at the starting +offset. Otherwise, it starts at the length of the longest lookbehind before the +starting offset, or at the start of the subject if there are not that many +characters before the starting offset. Note that the sequences \eb and \eB are +one-character lookbehinds. +.P +In addition to checking the format of the string, there is a check to ensure +that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate +area. The so-called "non-character" code points are not excluded because +Unicode corrigendum #9 makes it clear that they should not be. .P Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, where they are used in pairs to encode code points with values greater than @@ -221,9 +235,9 @@ never occur in a valid UTF-8 string. 
.sp The following negative error codes are given for invalid UTF-16 strings: .sp - PCRE_UTF16_ERR1 Missing low surrogate at end of string - PCRE_UTF16_ERR2 Invalid low surrogate follows high surrogate - PCRE_UTF16_ERR3 Isolated low surrogate + PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string + PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate .sp . . @@ -233,8 +247,8 @@ The following negative error codes are given for invalid UTF-16 strings: .sp The following negative error codes are given for invalid UTF-32 strings: .sp - PCRE_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) - PCRE_UTF32_ERR2 Code point is greater than 0x10ffff + PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) + PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff .sp . . @@ -252,6 +266,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 03 July 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/pcre2/pcre2-config.in b/pcre2/pcre2-config.in index 932160ef5..74271c088 100644 --- a/pcre2/pcre2-config.in +++ b/pcre2/pcre2-config.in @@ -86,7 +86,7 @@ while test $# -gt 0; do ;; --libs-posix) if test @enable_pcre2_8@ = yes ; then - echo $libS$libR -lpcre2posix -lpcre2-8 + echo $libS$libR -lpcre2-posix -lpcre2-8 else echo "${usage}" 1>&2 fi diff --git a/pcre2/perltest.sh b/pcre2/perltest.sh index f011ccc99..50c70e5c2 100755 --- a/pcre2/perltest.sh +++ b/pcre2/perltest.sh @@ -1,14 +1,17 @@ #! /bin/sh # Script for testing regular expressions with perl to check that PCRE2 handles -# them the same. The Perl code has to have "use utf8" and "require Encode" at -# the start when running UTF-8 tests, but *not* for non-utf8 tests. (The -# "require" would actually be OK for non-utf8-tests, but is not always -# installed, so this way the script will always run for these tests.) +# them the same. 
If the first argument to this script is "-w", Perl is also +# called with "-w", which turns on its warning mode. +# +# The Perl code has to have "use utf8" and "require Encode" at the start when +# running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would +# actually be OK for non-utf8-tests, but is not always installed, so this way +# the script will always run for these tests.) # # The desired effect is achieved by making this a shell script that passes the -# Perl script to Perl through a pipe. If the first argument is "-utf8", a -# suitable prefix is set up. +# Perl script to Perl through a pipe. If the first argument (possibly after +# removing "-w") is "-utf8", a suitable prefix is set up. # # The remaining arguments, if any, are passed to Perl. They are an input file # and an output file. If there is one argument, the output is written to @@ -17,7 +20,14 @@ # of the contorted piping input.) perl=perl +perlarg='' prefix='' + +if [ $# -gt 0 -a "$1" = "-w" ] ; then + perlarg="-w" + shift +fi + if [ $# -gt 0 -a "$1" = "-utf8" ] ; then prefix="use utf8; require Encode;" shift @@ -204,12 +214,14 @@ for (;;) printf "data> " if $interact; last NEXT_RE if ! ($_ = <$infile>); chomp; - printf $outfile "$_\n" if ! $interact; + printf $outfile "%s", "$_\n" if ! $interact; s/\s+$//; # Remove trailing space s/^\s+//; # Remove leading space last if ($_ eq ""); + next if $_ =~ /^\\=(?:\s|$)/; # Comment line + $x = eval "\"$_\""; # To get escapes processed # Empty array for holding results, ensure $REGERROR and $REGMARK are @@ -290,6 +302,6 @@ for (;;) # printf $outfile "\n"; PERLEND -) | $perl - $@ +) | $perl $perlarg - $@ # End diff --git a/pcre2/src/config.h.generic b/pcre2/src/config.h.generic index 0f9da50ce..3315b7770 100644 --- a/pcre2/src/config.h.generic +++ b/pcre2/src/config.h.generic @@ -78,6 +78,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. 
*/ /* #undef HAVE_MEMORY_H */ +/* Define to 1 if you have the `mkostemp' function. */ +/* #undef HAVE_MKOSTEMP */ + /* Define if you have POSIX threads libraries and header files. */ /* #undef HAVE_PTHREAD */ @@ -90,6 +93,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. */ /* #undef HAVE_READLINE_READLINE_H */ +/* Define to 1 if you have the `secure_getenv' function. */ +/* #undef HAVE_SECURE_GETENV */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_STDINT_H */ @@ -111,6 +117,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. */ /* #undef HAVE_SYS_TYPES_H */ +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_WAIT_H */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_UNISTD_H */ @@ -182,6 +191,9 @@ sure both macros are undefined; an emulation function will then be used. */ #define MAX_NAME_SIZE 32 #endif +/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */ +/* #undef NEVER_BACKSLASH_C */ + /* The value of NEWLINE_DEFAULT determines the default newline character sequence. PCRE2 client programs can override this by selecting other values at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5 @@ -200,7 +212,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_NAME "PCRE2" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE2 10.20" +#define PACKAGE_STRING "PCRE2 10.23" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre2" @@ -209,7 +221,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "10.20" +#define PACKAGE_VERSION "10.23" /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested parentheses (of any kind) in a pattern. This limits the amount of system @@ -218,15 +230,24 @@ sure both macros are undefined; an emulation function will then be used. */ #define PARENS_NEST_LIMIT 250 #endif -/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by - pcre2grep to hold parts of the file it is searching. This is also the - minimum value. The actual amount of memory used by pcre2grep is three times - this number, because it allows for the buffering of "before" and "after" - lines. */ +/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing + very long lines. The actual amount of memory used by pcre2grep is three + times this number, because it allows for the buffering of "before" and + "after" lines. */ #ifndef PCRE2GREP_BUFSIZE #define PCRE2GREP_BUFSIZE 20480 #endif +/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines. */ +#ifndef PCRE2GREP_MAX_BUFSIZE +#define PCRE2GREP_MAX_BUFSIZE 1048576 +#endif + /* Define to any value to include debugging code. */ /* #undef PCRE2_DEBUG */ @@ -268,7 +289,11 @@ sure both macros are undefined; an emulation function will then be used. */ is able to handle .gz files. */ /* #undef SUPPORT_LIBZ */ -/* Define to any value to enable JIT support in pcre2grep. */ +/* Define to any value to enable callout script support in pcre2grep. */ +/* #undef SUPPORT_PCRE2GREP_CALLOUT */ + +/* Define to any value to enable JIT support in pcre2grep. 
Note that this will + have no effect unless SUPPORT_JIT is also defined. */ /* #undef SUPPORT_PCRE2GREP_JIT */ /* Define to any value to enable the 16 bit PCRE2 library. */ @@ -289,8 +314,39 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to any value for valgrind support to find invalid memory reads. */ /* #undef SUPPORT_VALGRIND */ +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif + /* Version number of package */ -#define VERSION "10.20" +#define VERSION "10.23" + +/* Define to 1 if on MINIX. */ +/* #undef _MINIX */ + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* #undef _POSIX_SOURCE */ /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ diff --git a/pcre2/src/config.h.in b/pcre2/src/config.h.in index e3ef2fddc..e04b209bf 100644 --- a/pcre2/src/config.h.in +++ b/pcre2/src/config.h.in @@ -78,6 +78,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H +/* Define to 1 if you have the `mkostemp' function. */ +#undef HAVE_MKOSTEMP + /* Define if you have POSIX threads libraries and header files. */ #undef HAVE_PTHREAD @@ -90,6 +93,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. 
*/ #undef HAVE_READLINE_READLINE_H +/* Define to 1 if you have the `secure_getenv' function. */ +#undef HAVE_SECURE_GETENV + /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H @@ -111,6 +117,9 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_WAIT_H + /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H @@ -169,6 +178,9 @@ sure both macros are undefined; an emulation function will then be used. */ overflow caused by enormously large patterns. */ #undef MAX_NAME_SIZE +/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */ +#undef NEVER_BACKSLASH_C + /* The value of NEWLINE_DEFAULT determines the default newline character sequence. PCRE2 client programs can override this by selecting other values at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5 @@ -201,13 +213,20 @@ sure both macros are undefined; an emulation function will then be used. */ stack that is used while compiling a pattern. */ #undef PARENS_NEST_LIMIT -/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by - pcre2grep to hold parts of the file it is searching. This is also the - minimum value. The actual amount of memory used by pcre2grep is three times - this number, because it allows for the buffering of "before" and "after" - lines. */ +/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by + pcre2grep to hold parts of the file it is searching. The buffer will be + expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing + very long lines. The actual amount of memory used by pcre2grep is three + times this number, because it allows for the buffering of "before" and + "after" lines. 
*/ #undef PCRE2GREP_BUFSIZE +/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer + used by pcre2grep to hold parts of the file it is searching. The actual + amount of memory used by pcre2grep is three times this number, because it + allows for the buffering of "before" and "after" lines. */ +#undef PCRE2GREP_MAX_BUFSIZE + /* to make a symbol visible */ #undef PCRE2POSIX_EXP_DECL @@ -259,7 +278,11 @@ sure both macros are undefined; an emulation function will then be used. */ is able to handle .gz files. */ #undef SUPPORT_LIBZ -/* Define to any value to enable JIT support in pcre2grep. */ +/* Define to any value to enable callout script support in pcre2grep. */ +#undef SUPPORT_PCRE2GREP_CALLOUT + +/* Define to any value to enable JIT support in pcre2grep. Note that this will + have no effect unless SUPPORT_JIT is also defined. */ #undef SUPPORT_PCRE2GREP_JIT /* Define to any value to enable the 16 bit PCRE2 library. */ @@ -280,9 +303,41 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to any value for valgrind support to find invalid memory reads. */ #undef SUPPORT_VALGRIND +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# undef _ALL_SOURCE +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# undef _GNU_SOURCE +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# undef _POSIX_PTHREAD_SEMANTICS +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# undef _TANDEM_SOURCE +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# undef __EXTENSIONS__ +#endif + + /* Version number of package */ #undef VERSION +/* Define to 1 if on MINIX. */ +#undef _MINIX + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +#undef _POSIX_1_SOURCE + +/* Define to 1 if you need to in order for `stat' and other things to work. 
*/ +#undef _POSIX_SOURCE + /* Define to empty if `const' does not conform to ANSI C. */ #undef const diff --git a/pcre2/src/dftables.c b/pcre2/src/dftables.c index b6417cc2e..dfb90b594 100644 --- a/pcre2/src/dftables.c +++ b/pcre2/src/dftables.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -102,7 +102,7 @@ fprintf(f, "/* This file was automatically written by the dftables auxiliary\n" "program. It contains character tables that are used when no external\n" "tables are passed to PCRE2 by the application that calls it. The tables\n" - "are used only for characters whose code values are less than 256.\n\n"); + "are used only for characters whose code values are less than 256. */\n\n"); /* Force config.h in z/OS */ @@ -115,7 +115,7 @@ fprintf(f, #endif fprintf(f, - "The following #includes are present because without them gcc 4.x may remove\n" + "/* The following #includes are present because without them gcc 4.x may remove\n" "the array definition from the final binary if PCRE2 is built into a static\n" "library and dead code stripping is activated. This leads to link errors.\n" "Pulling in the header ensures that the array gets flagged as \"someone\n" diff --git a/pcre2/src/pcre2.h.generic b/pcre2/src/pcre2.h.generic index 3e97fb8bf..86503208e 100644 --- a/pcre2/src/pcre2.h.generic +++ b/pcre2/src/pcre2.h.generic @@ -5,7 +5,7 @@ /* This is the public header file for the PCRE library, second API, to be #included by applications that call PCRE2 functions. 
- Copyright (c) 2015 University of Cambridge + Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -36,15 +36,15 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ -#ifndef _PCRE2_H -#define _PCRE2_H +#ifndef PCRE2_H_IDEMPOTENT_GUARD +#define PCRE2_H_IDEMPOTENT_GUARD /* The current PCRE version information. */ #define PCRE2_MAJOR 10 -#define PCRE2_MINOR 20 +#define PCRE2_MINOR 23 #define PCRE2_PRERELEASE -#define PCRE2_DATE 2015-06-30 +#define PCRE2_DATE 2017-02-14 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE2, the appropriate @@ -67,6 +67,20 @@ don't change existing definitions of PCRE2_EXP_DECL. */ # endif #endif +/* When compiling with the MSVC compiler, it is sometimes necessary to include +a "calling convention" before exported function names. (This is secondhand +information; I know nothing about MSVC myself). For example, something like + + void __cdecl function(....) + +might be needed. In order so make this easy, all the exported functions have +PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not +set, we ensure here that it has no effect. */ + +#ifndef PCRE2_CALL_CONVENTION +#define PCRE2_CALL_CONVENTION +#endif + /* Have to include limits.h, stdlib.h and stdint.h to ensure that size_t and uint8_t, UCHAR_MAX, etc are defined. */ @@ -120,6 +134,8 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_UTF 0x00080000u /* C J M D */ #define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ #define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ +#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ +#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ /* These are for pcre2_jit_compile(). */ @@ -144,9 +160,19 @@ sanity checks). 
*/ #define PCRE2_DFA_RESTART 0x00000040u #define PCRE2_DFA_SHORTEST 0x00000080u -/* This is an additional option for pcre2_substitute(). */ +/* These are additional options for pcre2_substitute(), which passes any others +through to pcre2_match(). */ -#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u +#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u +#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u +#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u +#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u +#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u + +/* A further option for pcre2_match(), not allowed for pcre2_dfa_match(), +ignored for pcre2_jit_match(). */ + +#define PCRE2_NO_JIT 0x00002000u /* Newline and \R settings, for use in compile contexts. The newline values must be kept in step with values set in config.h and both sets must all be @@ -233,6 +259,13 @@ numbers must not be changed. */ #define PCRE2_ERROR_RECURSIONLIMIT (-53) #define PCRE2_ERROR_UNAVAILABLE (-54) #define PCRE2_ERROR_UNSET (-55) +#define PCRE2_ERROR_BADOFFSETLIMIT (-56) +#define PCRE2_ERROR_BADREPESCAPE (-57) +#define PCRE2_ERROR_REPMISSINGBRACE (-58) +#define PCRE2_ERROR_BADSUBSTITUTION (-59) +#define PCRE2_ERROR_BADSUBSPATTERN (-60) +#define PCRE2_ERROR_TOOMANYREPLACE (-61) +#define PCRE2_ERROR_BADSERIALIZEDDATA (-62) /* Request types for pcre2_pattern_info() */ @@ -259,6 +292,7 @@ numbers must not be changed. */ #define PCRE2_INFO_NEWLINE 20 #define PCRE2_INFO_RECURSIONLIMIT 21 #define PCRE2_INFO_SIZE 22 +#define PCRE2_INFO_HASBACKSLASHC 23 /* Request types for pcre2_config(). */ @@ -291,6 +325,7 @@ define special values to indicate zero-terminated strings and unset offsets in the offset vector (ovector). */ #define PCRE2_SIZE size_t +#define PCRE2_SIZE_MAX SIZE_MAX #define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) #define PCRE2_UNSET (~(PCRE2_SIZE)0) @@ -365,164 +400,192 @@ expanded for each width below. Start with functions that give general information. 
*/ #define PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_config(uint32_t, void *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *); /* Functions for manipulating contexts. */ #define PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_general_context *pcre2_general_context_copy(pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_general_context *pcre2_general_context_create( \ - void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); \ -PCRE2_EXP_DECL void pcre2_general_context_free(pcre2_general_context *); +PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ + *pcre2_general_context_copy(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ + *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \ + void (*)(void *, void *), void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_general_context_free(pcre2_general_context *); #define PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_compile_context *pcre2_compile_context_copy(pcre2_compile_context *); \ -PCRE2_EXP_DECL \ - pcre2_compile_context *pcre2_compile_context_create(pcre2_general_context *);\ -PCRE2_EXP_DECL void pcre2_compile_context_free(pcre2_compile_context *); \ -PCRE2_EXP_DECL int pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_character_tables(pcre2_compile_context *, \ - const unsigned char *); \ -PCRE2_EXP_DECL int pcre2_set_newline(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_parens_nest_limit(pcre2_compile_context *, \ - uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_compile_recursion_guard(\ - pcre2_compile_context *, int (*)(uint32_t, void *), \ - void *); +PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ + *pcre2_compile_context_copy(pcre2_compile_context *); \ +PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ + *pcre2_compile_context_create(pcre2_general_context *);\ +PCRE2_EXP_DECL void 
PCRE2_CALL_CONVENTION \ + pcre2_compile_context_free(pcre2_compile_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_newline(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ + int (*)(uint32_t, void *), void *); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_match_context *pcre2_match_context_copy(pcre2_match_context *); \ -PCRE2_EXP_DECL \ - pcre2_match_context *pcre2_match_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void pcre2_match_context_free(pcre2_match_context *); \ -PCRE2_EXP_DECL int pcre2_set_callout(pcre2_match_context *, \ - int (*)(pcre2_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int pcre2_set_match_limit(pcre2_match_context *, \ - uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_recursion_limit(pcre2_match_context *, \ - uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_recursion_memory_management( \ - pcre2_match_context *, void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); +PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ + *pcre2_match_context_copy(pcre2_match_context *); \ +PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ + *pcre2_match_context_create(pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_context_free(pcre2_match_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_callout(pcre2_match_context *, \ + int (*)(pcre2_callout_block *, void *), void *); \ 
+PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_match_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_memory_management(pcre2_match_context *, \ + void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *); /* Functions concerned with compiling a pattern to PCRE internal code. */ #define PCRE2_COMPILE_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_code *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, \ - int *, PCRE2_SIZE *, pcre2_compile_context *); \ -PCRE2_EXP_DECL void pcre2_code_free(pcre2_code *); +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \ + pcre2_compile_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_code_free(pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_code_copy(const pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_code_copy_with_tables(const pcre2_code *); /* Functions that give information about a compiled pattern. */ #define PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_pattern_info(const pcre2_code *, uint32_t, \ - void *); \ -PCRE2_EXP_DECL int pcre2_callout_enumerate(const pcre2_code *, \ - int (*)(pcre2_callout_enumerate_block *, void *), \ - void *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_callout_enumerate(const pcre2_code *, \ + int (*)(pcre2_callout_enumerate_block *, void *), void *); /* Functions for running a match and inspecting the result. 
*/ #define PCRE2_MATCH_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create(uint32_t, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create_from_pattern(\ - const pcre2_code *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \ - PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *, int *, \ - PCRE2_SIZE); \ -PCRE2_EXP_DECL int pcre2_match(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void pcre2_match_data_free(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SPTR pcre2_get_mark(pcre2_match_data *); \ -PCRE2_EXP_DECL uint32_t pcre2_get_ovector_count(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *); +PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ + *pcre2_match_data_create(uint32_t, pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ + *pcre2_match_data_create_from_pattern(const pcre2_code *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_data_free(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ + pcre2_get_mark(pcre2_match_data *); \ +PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ + pcre2_get_ovector_count(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ + *pcre2_get_ovector_pointer(pcre2_match_data *); \ +PCRE2_EXP_DECL 
PCRE2_SIZE PCRE2_CALL_CONVENTION \ + pcre2_get_startchar(pcre2_match_data *); /* Convenience functions for handling matched substrings. */ #define PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_substring_copy_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_UCHAR *, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_copy_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_UCHAR *, PCRE2_SIZE *); \ -PCRE2_EXP_DECL void pcre2_substring_free(PCRE2_UCHAR *); \ -PCRE2_EXP_DECL int pcre2_substring_get_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_get_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_UCHAR **, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \ -PCRE2_EXP_DECL int pcre2_substring_number_from_name(\ - const pcre2_code *, PCRE2_SPTR); \ -PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \ -PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \ - PCRE2_UCHAR ***, PCRE2_SIZE **); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_free(PCRE2_UCHAR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + 
pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \ + PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_list_free(PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); /* Functions for serializing / deserializing compiled patterns. */ #define PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_EXP_DECL int32_t pcre2_serialize_encode(const pcre2_code **, \ - int32_t, uint8_t **, PCRE2_SIZE *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t pcre2_serialize_decode(pcre2_code **, int32_t, \ - const uint8_t *, pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t pcre2_serialize_get_number_of_codes(const uint8_t *); \ -PCRE2_EXP_DECL void pcre2_serialize_free(uint8_t *); +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \ + PCRE2_SIZE *, pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_get_number_of_codes(const uint8_t *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_serialize_free(uint8_t *); /* Convenience function for match + substitute. 
*/ #define PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_EXP_DECL int pcre2_substitute(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, \ - PCRE2_SIZE *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \ + PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *); /* Functions for JIT processing */ #define PCRE2_JIT_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL void pcre2_jit_stack_assign(pcre2_match_context *, \ - pcre2_jit_callback, void *); \ -PCRE2_EXP_DECL void pcre2_jit_stack_free(pcre2_jit_stack *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_compile(pcre2_code *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_free_unused_memory(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \ + *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_free(pcre2_jit_stack *); /* Other miscellaneous functions. 
*/ #define PCRE2_OTHER_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ -PCRE2_EXP_DECL \ - const uint8_t *pcre2_maketables(pcre2_general_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ +PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \ + *pcre2_maketables(pcre2_general_context *); \ /* Define macros that generate width-specific names from generic versions. The @@ -567,6 +630,8 @@ pcre2_compile are called by application code. */ /* Functions: the complete list in alphabetical order */ #define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_) +#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_) +#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_) #define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_) #define pcre2_compile PCRE2_SUFFIX(pcre2_compile_) #define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_) @@ -606,8 +671,10 @@ pcre2_compile are called by application code. */ #define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_) #define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_) #define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_) +#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_) #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) +#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) #define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_) #define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) @@ -699,4 +766,6 @@ PCRE2_SUFFIX a no-op. Otherwise, generate an error. 
*/ } /* extern "C" */ #endif -#endif /* End of pcre2.h */ +#endif /* PCRE2_H_IDEMPOTENT_GUARD */ + +/* End of pcre2.h */ diff --git a/pcre2/src/pcre2.h.in b/pcre2/src/pcre2.h.in index 94fbdd5b3..96c29ffd8 100644 --- a/pcre2/src/pcre2.h.in +++ b/pcre2/src/pcre2.h.in @@ -5,7 +5,7 @@ /* This is the public header file for the PCRE library, second API, to be #included by applications that call PCRE2 functions. - Copyright (c) 2015 University of Cambridge + Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -36,8 +36,8 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ -#ifndef _PCRE2_H -#define _PCRE2_H +#ifndef PCRE2_H_IDEMPOTENT_GUARD +#define PCRE2_H_IDEMPOTENT_GUARD /* The current PCRE version information. */ @@ -67,6 +67,20 @@ don't change existing definitions of PCRE2_EXP_DECL. */ # endif #endif +/* When compiling with the MSVC compiler, it is sometimes necessary to include +a "calling convention" before exported function names. (This is secondhand +information; I know nothing about MSVC myself). For example, something like + + void __cdecl function(....) + +might be needed. In order so make this easy, all the exported functions have +PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not +set, we ensure here that it has no effect. */ + +#ifndef PCRE2_CALL_CONVENTION +#define PCRE2_CALL_CONVENTION +#endif + /* Have to include limits.h, stdlib.h and stdint.h to ensure that size_t and uint8_t, UCHAR_MAX, etc are defined. 
*/ @@ -120,6 +134,8 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_UTF 0x00080000u /* C J M D */ #define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ #define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ +#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */ +#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */ /* These are for pcre2_jit_compile(). */ @@ -144,9 +160,19 @@ sanity checks). */ #define PCRE2_DFA_RESTART 0x00000040u #define PCRE2_DFA_SHORTEST 0x00000080u -/* This is an additional option for pcre2_substitute(). */ +/* These are additional options for pcre2_substitute(), which passes any others +through to pcre2_match(). */ -#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u +#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u +#define PCRE2_SUBSTITUTE_EXTENDED 0x00000200u +#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u +#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u +#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u + +/* A further option for pcre2_match(), not allowed for pcre2_dfa_match(), +ignored for pcre2_jit_match(). */ + +#define PCRE2_NO_JIT 0x00002000u /* Newline and \R settings, for use in compile contexts. The newline values must be kept in step with values set in config.h and both sets must all be @@ -233,6 +259,13 @@ numbers must not be changed. */ #define PCRE2_ERROR_RECURSIONLIMIT (-53) #define PCRE2_ERROR_UNAVAILABLE (-54) #define PCRE2_ERROR_UNSET (-55) +#define PCRE2_ERROR_BADOFFSETLIMIT (-56) +#define PCRE2_ERROR_BADREPESCAPE (-57) +#define PCRE2_ERROR_REPMISSINGBRACE (-58) +#define PCRE2_ERROR_BADSUBSTITUTION (-59) +#define PCRE2_ERROR_BADSUBSPATTERN (-60) +#define PCRE2_ERROR_TOOMANYREPLACE (-61) +#define PCRE2_ERROR_BADSERIALIZEDDATA (-62) /* Request types for pcre2_pattern_info() */ @@ -259,6 +292,7 @@ numbers must not be changed. */ #define PCRE2_INFO_NEWLINE 20 #define PCRE2_INFO_RECURSIONLIMIT 21 #define PCRE2_INFO_SIZE 22 +#define PCRE2_INFO_HASBACKSLASHC 23 /* Request types for pcre2_config(). 
*/ @@ -291,6 +325,7 @@ define special values to indicate zero-terminated strings and unset offsets in the offset vector (ovector). */ #define PCRE2_SIZE size_t +#define PCRE2_SIZE_MAX SIZE_MAX #define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) #define PCRE2_UNSET (~(PCRE2_SIZE)0) @@ -365,164 +400,192 @@ expanded for each width below. Start with functions that give general information. */ #define PCRE2_GENERAL_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_config(uint32_t, void *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *); /* Functions for manipulating contexts. */ #define PCRE2_GENERAL_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_general_context *pcre2_general_context_copy(pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_general_context *pcre2_general_context_create( \ - void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); \ -PCRE2_EXP_DECL void pcre2_general_context_free(pcre2_general_context *); +PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ + *pcre2_general_context_copy(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_general_context PCRE2_CALL_CONVENTION \ + *pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \ + void (*)(void *, void *), void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_general_context_free(pcre2_general_context *); #define PCRE2_COMPILE_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_compile_context *pcre2_compile_context_copy(pcre2_compile_context *); \ -PCRE2_EXP_DECL \ - pcre2_compile_context *pcre2_compile_context_create(pcre2_general_context *);\ -PCRE2_EXP_DECL void pcre2_compile_context_free(pcre2_compile_context *); \ -PCRE2_EXP_DECL int pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_character_tables(pcre2_compile_context *, \ - const unsigned char *); \ -PCRE2_EXP_DECL int pcre2_set_newline(pcre2_compile_context *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_parens_nest_limit(pcre2_compile_context *, \ - uint32_t); \ 
-PCRE2_EXP_DECL int pcre2_set_compile_recursion_guard(\ - pcre2_compile_context *, int (*)(uint32_t, void *), \ - void *); +PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ + *pcre2_compile_context_copy(pcre2_compile_context *); \ +PCRE2_EXP_DECL pcre2_compile_context PCRE2_CALL_CONVENTION \ + *pcre2_compile_context_create(pcre2_general_context *);\ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_compile_context_free(pcre2_compile_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_bsr(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_newline(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ + int (*)(uint32_t, void *), void *); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_match_context *pcre2_match_context_copy(pcre2_match_context *); \ -PCRE2_EXP_DECL \ - pcre2_match_context *pcre2_match_context_create(pcre2_general_context *); \ -PCRE2_EXP_DECL void pcre2_match_context_free(pcre2_match_context *); \ -PCRE2_EXP_DECL int pcre2_set_callout(pcre2_match_context *, \ - int (*)(pcre2_callout_block *, void *), void *); \ -PCRE2_EXP_DECL int pcre2_set_match_limit(pcre2_match_context *, \ - uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_recursion_limit(pcre2_match_context *, \ - uint32_t); \ -PCRE2_EXP_DECL int pcre2_set_recursion_memory_management( \ - pcre2_match_context *, void *(*)(PCRE2_SIZE, void *), \ - void (*)(void *, void *), void *); +PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ + 
*pcre2_match_context_copy(pcre2_match_context *); \ +PCRE2_EXP_DECL pcre2_match_context PCRE2_CALL_CONVENTION \ + *pcre2_match_context_create(pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_context_free(pcre2_match_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_callout(pcre2_match_context *, \ + int (*)(pcre2_callout_block *, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_match_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_offset_limit(pcre2_match_context *, PCRE2_SIZE); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_recursion_memory_management(pcre2_match_context *, \ + void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *); /* Functions concerned with compiling a pattern to PCRE internal code. */ #define PCRE2_COMPILE_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_code *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, \ - int *, PCRE2_SIZE *, pcre2_compile_context *); \ -PCRE2_EXP_DECL void pcre2_code_free(pcre2_code *); +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, \ + pcre2_compile_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_code_free(pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_code_copy(const pcre2_code *); \ +PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \ + *pcre2_code_copy_with_tables(const pcre2_code *); /* Functions that give information about a compiled pattern. 
*/ #define PCRE2_PATTERN_INFO_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_pattern_info(const pcre2_code *, uint32_t, \ - void *); \ -PCRE2_EXP_DECL int pcre2_callout_enumerate(const pcre2_code *, \ - int (*)(pcre2_callout_enumerate_block *, void *), \ - void *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_pattern_info(const pcre2_code *, uint32_t, void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_callout_enumerate(const pcre2_code *, \ + int (*)(pcre2_callout_enumerate_block *, void *), void *); /* Functions for running a match and inspecting the result. */ #define PCRE2_MATCH_FUNCTIONS \ -PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create(uint32_t, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create_from_pattern(\ - const pcre2_code *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \ - PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *, int *, \ - PCRE2_SIZE); \ -PCRE2_EXP_DECL int pcre2_match(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void pcre2_match_data_free(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SPTR pcre2_get_mark(pcre2_match_data *); \ -PCRE2_EXP_DECL uint32_t pcre2_get_ovector_count(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *); +PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ + *pcre2_match_data_create(uint32_t, pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_match_data PCRE2_CALL_CONVENTION \ + *pcre2_match_data_create_from_pattern(const pcre2_code *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ 
+PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_data_free(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ + pcre2_get_mark(pcre2_match_data *); \ +PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ + pcre2_get_ovector_count(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ + *pcre2_get_ovector_pointer(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ + pcre2_get_startchar(pcre2_match_data *); /* Convenience functions for handling matched substrings. */ #define PCRE2_SUBSTRING_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_substring_copy_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_UCHAR *, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_copy_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_UCHAR *, PCRE2_SIZE *); \ -PCRE2_EXP_DECL void pcre2_substring_free(PCRE2_UCHAR *); \ -PCRE2_EXP_DECL int pcre2_substring_get_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_get_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_UCHAR **, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \ - PCRE2_SPTR, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \ - uint32_t, PCRE2_SIZE *); \ -PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \ -PCRE2_EXP_DECL int pcre2_substring_number_from_name(\ - const pcre2_code *, PCRE2_SPTR); \ -PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \ -PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \ - PCRE2_UCHAR ***, PCRE2_SIZE **); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ 
+PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_copy_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR *, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_free(PCRE2_UCHAR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_get_bynumber(pcre2_match_data *, uint32_t, PCRE2_UCHAR **, \ + PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_length_byname(pcre2_match_data *, PCRE2_SPTR, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_length_bynumber(pcre2_match_data *, uint32_t, PCRE2_SIZE *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_nametable_scan(const pcre2_code *, PCRE2_SPTR, PCRE2_SPTR *, \ + PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_substring_list_free(PCRE2_SPTR *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **); /* Functions for serializing / deserializing compiled patterns. 
*/ #define PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_EXP_DECL int32_t pcre2_serialize_encode(const pcre2_code **, \ - int32_t, uint8_t **, PCRE2_SIZE *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t pcre2_serialize_decode(pcre2_code **, int32_t, \ - const uint8_t *, pcre2_general_context *); \ -PCRE2_EXP_DECL int32_t pcre2_serialize_get_number_of_codes(const uint8_t *); \ -PCRE2_EXP_DECL void pcre2_serialize_free(uint8_t *); +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_encode(const pcre2_code **, int32_t, uint8_t **, \ + PCRE2_SIZE *, pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_decode(pcre2_code **, int32_t, const uint8_t *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int32_t PCRE2_CALL_CONVENTION \ + pcre2_serialize_get_number_of_codes(const uint8_t *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_serialize_free(uint8_t *); /* Convenience function for match + substitute. */ #define PCRE2_SUBSTITUTE_FUNCTION \ -PCRE2_EXP_DECL int pcre2_substitute(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, \ - PCRE2_SIZE *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_substitute(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *, PCRE2_SPTR, \ + PCRE2_SIZE, PCRE2_UCHAR *, PCRE2_SIZE *); /* Functions for JIT processing */ #define PCRE2_JIT_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \ -PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \ - PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ - pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *); \ -PCRE2_EXP_DECL \ - pcre2_jit_stack *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL void 
pcre2_jit_stack_assign(pcre2_match_context *, \ - pcre2_jit_callback, void *); \ -PCRE2_EXP_DECL void pcre2_jit_stack_free(pcre2_jit_stack *); +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_compile(pcre2_code *, uint32_t); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_jit_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ + uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_free_unused_memory(pcre2_general_context *); \ +PCRE2_EXP_DECL pcre2_jit_stack PCRE2_CALL_CONVENTION \ + *pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_jit_stack_free(pcre2_jit_stack *); /* Other miscellaneous functions. */ #define PCRE2_OTHER_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ -PCRE2_EXP_DECL \ - const uint8_t *pcre2_maketables(pcre2_general_context *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_get_error_message(int, PCRE2_UCHAR *, PCRE2_SIZE); \ +PCRE2_EXP_DECL const uint8_t PCRE2_CALL_CONVENTION \ + *pcre2_maketables(pcre2_general_context *); \ /* Define macros that generate width-specific names from generic versions. The @@ -567,6 +630,8 @@ pcre2_compile are called by application code. */ /* Functions: the complete list in alphabetical order */ #define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_) +#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_) +#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_) #define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_) #define pcre2_compile PCRE2_SUFFIX(pcre2_compile_) #define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_) @@ -606,8 +671,10 @@ pcre2_compile are called by application code. 
*/ #define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_) #define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_) #define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_) +#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_) #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) +#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) #define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_) #define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) @@ -699,4 +766,6 @@ PCRE2_SUFFIX a no-op. Otherwise, generate an error. */ } /* extern "C" */ #endif -#endif /* End of pcre2.h */ +#endif /* PCRE2_H_IDEMPOTENT_GUARD */ + +/* End of pcre2.h */ diff --git a/pcre2/src/pcre2_auto_possess.c b/pcre2/src/pcre2_auto_possess.c index e99a2c44f..64ec6dfbb 100644 --- a/pcre2/src/pcre2_auto_possess.c +++ b/pcre2/src/pcre2_auto_possess.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -91,6 +91,7 @@ static const uint8_t autoposstab[APTROWS][APTCOLS] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ }; +#ifdef SUPPORT_UNICODE /* This table is used to check whether auto-possessification is possible between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). 
The left-hand (repeated) opcode is used to select the row, and the right-hand @@ -170,64 +171,7 @@ static const uint8_t posspropstab[3][4] = { { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ }; - -/* This table is used when converting repeating opcodes into possessified -versions as a result of an explicit possessive quantifier such as ++. A zero -value means there is no possessified version - in those cases the item in -question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT -because all relevant opcodes are less than that. */ - -static const uint8_t opcode_possessify[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ - - 0, /* NOTI */ - OP_POSSTAR, 0, /* STAR, MINSTAR */ - OP_POSPLUS, 0, /* PLUS, MINPLUS */ - OP_POSQUERY, 0, /* QUERY, MINQUERY */ - OP_POSUPTO, 0, /* UPTO, MINUPTO */ - 0, /* EXACT */ - 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ - - OP_POSSTARI, 0, /* STARI, MINSTARI */ - OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ - OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ - OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ - 0, /* EXACTI */ - 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ - - OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ - OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ - OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ - OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ - 0, /* NOTEXACT */ - 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ - - OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ - OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ - OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ - OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ - 0, /* NOTEXACTI */ - 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ - - OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ - OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */ - OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ - OP_TYPEPOSUPTO, 0, /* TYPEUPTO, 
TYPEMINUPTO */ - 0, /* TYPEEXACT */ - 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ - - OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ - OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ - OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ - OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ - 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ - - 0, 0, 0, /* CLASS, NCLASS, XCLASS */ - 0, 0, /* REF, REFI */ - 0, 0, /* DNREF, DNREFI */ - 0, 0 /* RECURSE, CALLOUT */ -}; +#endif /* SUPPORT_UNICODE */ @@ -645,6 +589,7 @@ for(;;) case OP_ASSERTBACK_NOT: case OP_ONCE: case OP_ONCE_NC: + /* Atomic sub-patterns and assertions can always auto-possessify their last iterator. However, if the group was entered as a result of checking a previous iterator, this is not possible. */ @@ -662,6 +607,9 @@ for(;;) next_code = code + GET(code, 1); code += PRIV(OP_lengths)[c]; + /* Check each branch. We have to recurse a level for all but the last + branch. */ + while (*next_code == OP_ALT) { if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit)) @@ -1102,8 +1050,10 @@ but some compilers complain about an unreachable statement. */ /* Replaces single character iterations with their possessive alternatives if appropriate. This function modifies the compiled opcode! Hitting a -non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a -bad UTF string was compiled with PCRE2_NO_UTF_CHECK. +non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a +bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches +overly complicated or large patterns. In these cases, the check just stops, +leaving the remainder of the pattern unpossessified. 
Arguments: code points to start of the byte code @@ -1117,11 +1067,11 @@ Returns: 0 for success int PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) { -register PCRE2_UCHAR c; +PCRE2_UCHAR c; PCRE2_SPTR end; PCRE2_UCHAR *repeat_opcode; uint32_t list[8]; -int rec_limit; +int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ for (;;) { @@ -1136,7 +1086,6 @@ for (;;) get_chr_property_list(code, utf, cb->fcc, list) : NULL; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - rec_limit = 1000; if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit)) { switch(c) @@ -1193,7 +1142,6 @@ for (;;) list[1] = (c & 1) == 0; - rec_limit = 1000; if (compare_opcodes(end, utf, cb, list, end, &rec_limit)) { switch (c) diff --git a/pcre2/src/pcre2_compile.c b/pcre2/src/pcre2_compile.c index 4a9e42e2c..6d98a68ca 100644 --- a/pcre2/src/pcre2_compile.c +++ b/pcre2/src/pcre2_compile.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -58,9 +58,14 @@ POSSIBILITY OF SUCH DAMAGE. #define PRINTABLE(c) ((c) >= 32 && (c) < 127) #endif #include "pcre2_printint.c" -#define CALL_PRINTINT +#define DEBUG_CALL_PRINTINT #endif +/* Other debugging code can be enabled by these defines. */ + +// #define DEBUG_SHOW_CAPTURES +// #define DEBUG_SHOW_PARSED + /* There are a few things that vary with different code unit sizes. Handle them by defining macros in order to minimize #if usage. */ @@ -79,16 +84,56 @@ by defining macros in order to minimize #if usage. 
*/ #endif #endif +/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which +consists of uint32_t elements. Assume that if uint32_t can't hold it, two of +them will be able to (i.e. assume a 64-bit world). */ + +#if PCRE2_SIZE_MAX <= UINT32_MAX +#define PUTOFFSET(s,p) *p++ = s +#define GETOFFSET(s,p) s = *p++ +#define GETPLUSOFFSET(s,p) s = *(++p) +#define READPLUSOFFSET(s,p) s = p[1] +#define SKIPOFFSET(p) p++ +#define SIZEOFFSET 1 +#else +#define PUTOFFSET(s,p) \ + { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); } +#define GETOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; } +#define GETPLUSOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; } +#define READPLUSOFFSET(s,p) \ + { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; } +#define SKIPOFFSET(p) p += 2 +#define SIZEOFFSET 2 +#endif + +/* Macros for manipulating elements of the parsed pattern vector. */ + +#define META_CODE(x) (x & 0xffff0000u) +#define META_DATA(x) (x & 0x0000ffffu) +#define META_DIFF(x,y) ((x-y)>>16) + /* Function definitions to allow mutual recursion */ +#ifdef SUPPORT_UNICODE +static unsigned int + add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, + compile_block *, const uint32_t *, unsigned int); +#endif + static int - add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *, - const uint32_t *, unsigned int); + compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t, + uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *, + compile_block *, PCRE2_SIZE *); + +static int + get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *, + compile_block *); static BOOL - compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL, - uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *, - branch_chain *, compile_block *, size_t *); + set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, + compile_block *); @@ 
-96,26 +141,35 @@ static BOOL * Code parameters and static tables * *************************************************/ -/* This value specifies the size of stack workspace, which is used during the -pre-compile phase when determining how much memory is required. The regex is -partly compiled into this space, but the compiled parts are discarded as soon -as they can be, so that hopefully there will never be an overrun. The code -does, however, check for an overrun. The largest amount I've seen used is 218, -so this number is very generous. +#define MAX_GROUP_NUMBER 65535u +#define MAX_REPEAT_COUNT 65535u +#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1) -The same workspace is used during the second, actual compile phase for -remembering forward references to groups so that they can be filled in at the -end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE -is 4 there is plenty of room for most patterns. However, the memory can get -filled up by repetitions of forward references, for example patterns like -/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so -that the workspace is expanded in this situation. The value below is therefore -a minimum, and we put a maximum on it for safety. The minimum is now also -defined in terms of LINK_SIZE so that the size increase kicks in at the same -number of forward references in all cases. */ +/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in +different ways in the different pattern scans. The parsing and group- +identifying pre-scan uses it to handle nesting, and needs it to be 16-bit +aligned for this. Having defined the size in code units, we set up +C16_WORK_SIZE as the number of elements in the 16-bit vector. 
-#define COMPILE_WORK_SIZE (2048*LINK_SIZE) -#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) +During the first compiling phase, when determining how much memory is required, +the regex is partly compiled into this space, but the compiled parts are +discarded as soon as they can be, so that hopefully there will never be an +overrun. The code does, however, check for an overrun, which can occur for +pathological patterns. The size of the workspace depends on LINK_SIZE because +the length of compiled items varies with this. + +In the real compile phase, this workspace is not currently used. */ + +#define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Size in code units */ + +#define C16_WORK_SIZE \ + ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)) + +/* A uint32_t vector is used for caching information about the size of +capturing groups, to improve performance. A default is created on the stack of +this size. */ + +#define GROUPINFO_DEFAULT_SIZE 256 /* The overrun tests check for a slightly smaller size so that they detect the overrun before it actually does run off the end of the data block. */ @@ -129,28 +183,186 @@ value is the number of slots in the list. */ #define NAMED_GROUP_LIST_SIZE 20 -/* The original PCRE required patterns to be zero-terminated, and it simplifies -the compiling code if it is guaranteed that there is a zero code unit at the -end of the pattern, because this means that tests for coding sequences such as -(*SKIP) or even just (?<= can check a sequence of code units without having to -keep checking for the end of the pattern. The new PCRE2 API allows zero code -units within patterns if a positive length is given, but in order to keep most -of the compiling code as it was, we copy such patterns and add a zero on the -end. This value determines the size of space on the stack that is used if the -pattern fits; if not, heap memory is used. */ +/* The pre-compiling pass over the pattern creates a parsed pattern in a vector +of uint32_t. 
For short patterns this lives on the stack, with this size. Heap +memory is used for longer patterns. */ -#define COPIED_PATTERN_SIZE 1024 +#define PARSED_PATTERN_DEFAULT_SIZE 1024 /* Maximum length value to check against when making sure that the variable that holds the compiled pattern length does not overflow. We make it a bit less -than INT_MAX to allow for adding in group terminating bytes, so that we don't -have to check them every time. */ +than INT_MAX to allow for adding in group terminating code units, so that we +don't have to check them every time. */ #define OFLOW_MAX (INT_MAX - 20) -/* Macro for setting individual bits in class bitmaps. */ +/* Code values for parsed patterns, which are stored in a vector of 32-bit +unsigned ints. Values less than META_END are literal data values. The coding +for identifying the item is in the top 16-bits, leaving 16 bits for the +additional data that some of them need. The META_CODE, META_DATA, and META_DIFF +macros are used to manipulate parsed pattern elements. -#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7)) +NOTE: When these definitions are changed, the table of extra lengths for each +code (meta_extra_lengths, just below) must be updated to remain in step. 
*/ + +#define META_END 0x80000000u /* End of pattern */ + +#define META_ALT 0x80010000u /* alternation */ +#define META_ATOMIC 0x80020000u /* atomic group */ +#define META_BACKREF 0x80030000u /* Back ref */ +#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */ +#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */ +#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */ +#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */ +#define META_CAPTURE 0x80080000u /* Capturing parenthesis */ +#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */ +#define META_CLASS 0x800a0000u /* start non-empty class */ +#define META_CLASS_EMPTY 0x800b0000u /* empty class */ +#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */ +#define META_CLASS_END 0x800d0000u /* end of non-empty class */ +#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */ +#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */ +#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */ +#define META_COND_NAME 0x80110000u /* (?( )... */ +#define META_COND_NUMBER 0x80120000u /* (?(digits)... */ +#define META_COND_RNAME 0x80130000u /* (?(R&name)... */ +#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ +#define META_COND_VERSION 0x80150000u /* (?(VERSION x.y)... */ +#define META_DOLLAR 0x80160000u /* $ metacharacter */ +#define META_DOT 0x80170000u /* . 
metacharacter */ +#define META_ESCAPE 0x80180000u /* \d and friends */ +#define META_KET 0x80190000u /* closing parenthesis */ +#define META_NOCAPTURE 0x801a0000u /* no capture parens */ +#define META_OPTIONS 0x801b0000u /* (?i) and friends */ +#define META_POSIX 0x801c0000u /* POSIX class item */ +#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ +#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ +#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ +#define META_RECURSE 0x80200000u /* Recursion */ +#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ + +/* These must be kept together to make it easy to check that an assertion +is present where expected in a conditional group. */ + +#define META_LOOKAHEAD 0x80220000u /* (?= */ +#define META_LOOKAHEADNOT 0x80230000u /* (?! */ +#define META_LOOKBEHIND 0x80240000u /* (?<= */ +#define META_LOOKBEHINDNOT 0x80250000u /* (?= 10 */ + 1+SIZEOFFSET, /* META_BACKREF_BYNAME */ + 1, /* META_BIGVALUE */ + 3, /* META_CALLOUT_NUMBER */ + 3+SIZEOFFSET, /* META_CALLOUT_STRING */ + 0, /* META_CAPTURE */ + 0, /* META_CIRCUMFLEX */ + 0, /* META_CLASS */ + 0, /* META_CLASS_EMPTY */ + 0, /* META_CLASS_EMPTY_NOT */ + 0, /* META_CLASS_END */ + 0, /* META_CLASS_NOT */ + 0, /* META_COND_ASSERT */ + SIZEOFFSET, /* META_COND_DEFINE */ + 1+SIZEOFFSET, /* META_COND_NAME */ + 1+SIZEOFFSET, /* META_COND_NUMBER */ + 1+SIZEOFFSET, /* META_COND_RNAME */ + 1+SIZEOFFSET, /* META_COND_RNUMBER */ + 3, /* META_COND_VERSION */ + 0, /* META_DOLLAR */ + 0, /* META_DOT */ + 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */ + 0, /* META_KET */ + 0, /* META_NOCAPTURE */ + 1, /* META_OPTIONS */ + 1, /* META_POSIX */ + 1, /* META_POSIX_NEG */ + 0, /* META_RANGE_ESCAPED */ + 0, /* META_RANGE_LITERAL */ + SIZEOFFSET, /* META_RECURSE */ + 1+SIZEOFFSET, /* META_RECURSE_BYNAME */ + 0, /* META_LOOKAHEAD */ + 0, /* META_LOOKAHEADNOT */ + SIZEOFFSET, /* META_LOOKBEHIND */ + SIZEOFFSET, /* 
META_LOOKBEHINDNOT */ + 1, /* META_MARK - plus the string length */ + 0, /* META_ACCEPT */ + 0, /* META_COMMIT */ + 0, /* META_FAIL */ + 0, /* META_PRUNE */ + 1, /* META_PRUNE_ARG - plus the string length */ + 0, /* META_SKIP */ + 1, /* META_SKIP_ARG - plus the string length */ + 0, /* META_THEN */ + 1, /* META_THEN_ARG - plus the string length */ + 0, /* META_ASTERISK */ + 0, /* META_ASTERISK_PLUS */ + 0, /* META_ASTERISK_QUERY */ + 0, /* META_PLUS */ + 0, /* META_PLUS_PLUS */ + 0, /* META_PLUS_QUERY */ + 0, /* META_QUERY */ + 0, /* META_QUERY_PLUS */ + 0, /* META_QUERY_QUERY */ + 2, /* META_MINMAX */ + 2, /* META_MINMAX_PLUS */ + 2 /* META_MINMAX_QUERY */ +}; + +/* Types for skipping parts of a parsed pattern. */ + +enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET }; + +/* Macro for setting individual bits in class bitmaps. It took some +experimenting to figure out how to stop gcc 5.3.0 from warning with +-Wconversion. This version gets a warning: + + #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7)) + +Let's hope the apparently less efficient version isn't actually so bad if the +compiler is clever with identical subexpressions. */ + +#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7))) /* Private flags added to firstcu and reqcu. */ @@ -160,10 +372,11 @@ have to check them every time. */ #define REQ_UNSET (-2) /* Not yet found anything */ #define REQ_NONE (-1) /* Found not fixed char */ -/* This bit (which is greater than any UTF value) is used to indicate that a -variable contains a number of code units instead of an actual code point. */ +/* These flags are used in the groupinfo vector. */ -#define UTF_LENGTH 0x10000000l +#define GI_SET_FIXED_LENGTH 0x80000000u +#define GI_NOT_FIXED_LENGTH 0x40000000u +#define GI_FIXED_LENGTH_MASK 0x0000ffffu /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC and is fast (a good compiler can turn it into a subtraction and unsigned @@ -175,8 +388,8 @@ comparison). 
*/ locale, and may mark arbitrary characters as digits. We want to recognize only 0-9, a-z, and A-Z as hex digits, which is why we have a private table here. It costs 256 bytes, but it is a lot faster than doing character value tests (at -least in some simple cases I timed), and in some applications one wants PCRE to -compile efficiently as well as match efficiently. The value in the table is +least in some simple cases I timed), and in some applications one wants PCRE2 +to compile efficiently as well as match efficiently. The value in the table is the binary hex digit value, or 0xff for non-hex digits. */ /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in @@ -270,7 +483,7 @@ in UTF-8 mode. It runs from '0' to 'z'. */ #ifndef EBCDIC #define ESCAPES_FIRST CHAR_0 #define ESCAPES_LAST CHAR_z -#define ESCAPES_UPPER_CASE (-32) /* Add this to upper case a letter */ +#define UPPER_CASE(c) (c-32) static const short int escapes[] = { 0, 0, @@ -323,11 +536,11 @@ because it is defined as 'a', which of course picks up the ASCII value. */ #if 'a' == 0x81 /* Check for a real EBCDIC environment */ #define ESCAPES_FIRST CHAR_a #define ESCAPES_LAST CHAR_9 -#define ESCAPES_UPPER_CASE (+64) /* Add this to upper case a letter */ +#define UPPER_CASE(c) (c+64) #else /* Testing in an ASCII environment */ #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ -#define ESCAPES_UPPER_CASE (-32) /* Add this to upper case a letter */ +#define UPPER_CASE(c) (c-32) #endif static const short int escapes[] = { @@ -364,9 +577,9 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC platforms. 
*/ typedef struct verbitem { - int len; /* Length of verb name */ - int op; /* Op when no arg, or -1 if arg mandatory */ - int op_arg; /* Op when arg present, or -1 if not allowed */ + unsigned int len; /* Length of verb name */ + uint32_t meta; /* Base META_ code */ + int has_arg; /* Argument requirement */ } verbitem; static const char verbnames[] = @@ -381,32 +594,30 @@ static const char verbnames[] = STRING_THEN; static const verbitem verbs[] = { - { 0, -1, OP_MARK }, - { 4, -1, OP_MARK }, - { 6, OP_ACCEPT, -1 }, - { 6, OP_COMMIT, -1 }, - { 1, OP_FAIL, -1 }, - { 4, OP_FAIL, -1 }, - { 5, OP_PRUNE, OP_PRUNE_ARG }, - { 4, OP_SKIP, OP_SKIP_ARG }, - { 4, OP_THEN, OP_THEN_ARG } + { 0, META_MARK, +1 }, /* > 0 => must have an argument */ + { 4, META_MARK, +1 }, + { 6, META_ACCEPT, -1 }, /* < 0 => must not have an argument */ + { 6, META_COMMIT, -1 }, + { 1, META_FAIL, -1 }, + { 4, META_FAIL, -1 }, + { 5, META_PRUNE, 0 }, /* Argument is optional; bump META code if found */ + { 4, META_SKIP, 0 }, + { 4, META_THEN, 0 } }; static const int verbcount = sizeof(verbs)/sizeof(verbitem); +/* Verb opcodes, indexed by their META code offset from META_MARK. */ -/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in -another regex library. */ +static const uint32_t verbops[] = { + OP_MARK, OP_ACCEPT, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_PRUNE_ARG, OP_SKIP, + OP_SKIP_ARG, OP_THEN, OP_THEN_ARG }; -static const PCRE2_UCHAR sub_start_of_word[] = { - CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, - CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; - -static const PCRE2_UCHAR sub_end_of_word[] = { - CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, - CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, - CHAR_RIGHT_PARENTHESIS, '\0' }; +/* Offsets from OP_STAR for case-independent and negative repeat opcodes. 
*/ +static uint32_t chartypeoffset[] = { + OP_STAR - OP_STAR, OP_STARI - OP_STAR, + OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR }; /* Tables of names of POSIX character classes and their lengths. The names are now all in a single string, to reduce the number of relocations when a shared @@ -428,7 +639,6 @@ static const uint8_t posix_name_lengths[] = { #define PC_PRINT 9 #define PC_PUNCT 10 - /* Table of class bit maps for each POSIX class. Each class is formed from a base map, with an optional addition or removal of another map. Then, for some classes, there is some additional tweaking: for [:blank:] the vertical space @@ -456,134 +666,46 @@ static const int posix_class_maps[] = { cbit_xdigit,-1, 0 /* xdigit */ }; -/* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by -Unicode property escapes. */ - #ifdef SUPPORT_UNICODE -static const PCRE2_UCHAR string_PNd[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pNd[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PXsp[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pXsp[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PXwd[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pXwd[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static PCRE2_SPTR substitutes[] = { - string_PNd, /* \D */ - string_pNd, /* \d */ - string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */ - string_pXsp, /* \s */ /* space and POSIX space are the same. 
*/ - string_PXwd, /* \W */ - string_pXwd /* \w */ +/* The POSIX class Unicode property substitutes that are used in UCP mode must +be in the order of the POSIX class names, defined above. */ + +static int posix_substitutes[] = { + PT_GC, ucp_L, /* alpha */ + PT_PC, ucp_Ll, /* lower */ + PT_PC, ucp_Lu, /* upper */ + PT_ALNUM, 0, /* alnum */ + -1, 0, /* ascii, treat as non-UCP */ + -1, 1, /* blank, treat as \h */ + PT_PC, ucp_Cc, /* cntrl */ + PT_PC, ucp_Nd, /* digit */ + PT_PXGRAPH, 0, /* graph */ + PT_PXPRINT, 0, /* print */ + PT_PXPUNCT, 0, /* punct */ + PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */ + PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */ + -1, 0 /* xdigit, treat as non-UCP */ }; - -/* The POSIX class substitutes must be in the order of the POSIX class names, -defined above, and there are both positive and negative cases. NULL means no -general substitute of a Unicode property escape (\p or \P). However, for some -POSIX classes (e.g. graph, print, punct) a special property code is compiled -directly. 
*/ - -static const PCRE2_UCHAR string_pCc[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pL[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pLl[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pLu[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_pXan[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_h[] = { - CHAR_BACKSLASH, CHAR_h, '\0' }; -static const PCRE2_UCHAR string_pXps[] = { - CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PCc[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_C, CHAR_c, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PL[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PLl[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PLu[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_PXan[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; -static const PCRE2_UCHAR string_H[] = { - CHAR_BACKSLASH, CHAR_H, '\0' }; -static const PCRE2_UCHAR string_PXps[] = { - CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, - CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; - -static PCRE2_SPTR posix_substitutes[] = { - string_pL, /* alpha */ - 
string_pLl, /* lower */ - string_pLu, /* upper */ - string_pXan, /* alnum */ - NULL, /* ascii */ - string_h, /* blank */ - string_pCc, /* cntrl */ - string_pNd, /* digit */ - NULL, /* graph */ - NULL, /* print */ - NULL, /* punct */ - string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */ - string_pXwd, /* word */ /* Perl and POSIX space are the same */ - NULL, /* xdigit */ - /* Negated cases */ - string_PL, /* ^alpha */ - string_PLl, /* ^lower */ - string_PLu, /* ^upper */ - string_PXan, /* ^alnum */ - NULL, /* ^ascii */ - string_H, /* ^blank */ - string_PCc, /* ^cntrl */ - string_PNd, /* ^digit */ - NULL, /* ^graph */ - NULL, /* ^print */ - NULL, /* ^punct */ - string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */ - string_PXwd, /* ^word */ /* Perl and POSIX space are the same */ - NULL /* ^xdigit */ -}; -#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *)) +#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t))) #endif /* SUPPORT_UNICODE */ /* Masks for checking option settings. 
*/ #define PUBLIC_COMPILE_OPTIONS \ (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ - PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \ - PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \ - PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ - PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ - PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \ - PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF) + PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \ + PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \ + PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \ + PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ + PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ + PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \ + PCRE2_UTF) /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and -eint2 in pcre2posix.c must be updated, and a new error text must be added to -compile_error_texts in pcre2_error.c. */ +eint2 in pcre2posix.c may need to be updated, and a new error text must be +added to compile_error_texts in pcre2_error.c. */ enum { ERR0 = COMPILE_ERROR_BASE, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, @@ -594,7 +716,7 @@ enum { ERR0 = COMPILE_ERROR_BASE, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, - ERR81, ERR82, ERR83, ERR84 }; + ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90 }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). 
For completeness and backward @@ -697,6 +819,337 @@ static const uint8_t opcode_possessify[] = { }; +#ifdef DEBUG_SHOW_PARSED +/************************************************* +* Show the parsed pattern for debugging * +*************************************************/ + +/* For debugging the pre-scan, this code, which outputs the parsed data vector, +can be enabled. */ + +static void show_parsed(compile_block *cb) +{ +uint32_t *pptr = cb->parsed_pattern; + +for (;;) + { + int max, min; + PCRE2_SIZE offset; + uint32_t i; + uint32_t length; + uint32_t meta_arg = META_DATA(*pptr); + + fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr); + + if (*pptr < META_END) + { + if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr); + pptr++; + } + + else switch (META_CODE(*pptr++)) + { + default: + fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n"); + return; + + case META_END: + fprintf(stderr, "META_END\n"); + return; + + case META_CAPTURE: + fprintf(stderr, "META_CAPTURE %d", meta_arg); + break; + + case META_RECURSE: + GETOFFSET(offset, pptr); + fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); + break; + + case META_BACKREF: + if (meta_arg < 10) + offset = cb->small_ref_offset[meta_arg]; + else + GETOFFSET(offset, pptr); + fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); + break; + + case META_ESCAPE: + if (meta_arg == ESC_P || meta_arg == ESC_p) + { + uint32_t ptype = *pptr >> 16; + uint32_t pvalue = *pptr++ & 0xffff; + fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p', + ptype, pvalue); + } + else + { + uint32_t cc; + /* There's just one escape we might have here that isn't negated in the + escapes table. 
*/ + if (meta_arg == ESC_g) cc = CHAR_g; + else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++) + { + if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break; + } + if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK; + fprintf(stderr, "META \\%c", cc); + } + break; + + case META_MINMAX: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}", min, max); + else + fprintf(stderr, "META {%d,}", min); + break; + + case META_MINMAX_QUERY: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}?", min, max); + else + fprintf(stderr, "META {%d,}?", min); + break; + + case META_MINMAX_PLUS: + min = *pptr++; + max = *pptr++; + if (max != REPEAT_UNLIMITED) + fprintf(stderr, "META {%d,%d}+", min, max); + else + fprintf(stderr, "META {%d,}+", min); + break; + + case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break; + case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break; + case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break; + case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break; + case META_DOT: fprintf(stderr, "META_DOT"); break; + case META_ASTERISK: fprintf(stderr, "META *"); break; + case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break; + case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break; + case META_PLUS: fprintf(stderr, "META +"); break; + case META_PLUS_QUERY: fprintf(stderr, "META +?"); break; + case META_PLUS_PLUS: fprintf(stderr, "META ++"); break; + case META_QUERY: fprintf(stderr, "META ?"); break; + case META_QUERY_QUERY: fprintf(stderr, "META ??"); break; + case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break; + + case META_ATOMIC: fprintf(stderr, "META (?>"); break; + case META_NOCAPTURE: fprintf(stderr, "META (?:"); break; + case META_LOOKAHEAD: fprintf(stderr, "META (?="); break; + case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break; + case META_KET: fprintf(stderr, "META )"); break; + case 
META_ALT: fprintf(stderr, "META | %d", meta_arg); break; + + case META_CLASS: fprintf(stderr, "META ["); break; + case META_CLASS_NOT: fprintf(stderr, "META [^"); break; + case META_CLASS_END: fprintf(stderr, "META ]"); break; + case META_CLASS_EMPTY: fprintf(stderr, "META []"); break; + case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break; + + case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; + case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; + + case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; + case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; + + case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; + case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break; + case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; + case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break; + case META_SKIP: fprintf(stderr, "META (*SKIP)"); break; + case META_THEN: fprintf(stderr, "META (*THEN)"); break; + + case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break; + + case META_LOOKBEHIND: + fprintf(stderr, "META (?<= %d offset=", meta_arg); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_LOOKBEHINDNOT: + fprintf(stderr, "META (?="); + fprintf(stderr, "%d.", *pptr++); + fprintf(stderr, "%d)", *pptr++); + break; + + case META_COND_NAME: + fprintf(stderr, "META (?( ) length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_COND_RNAME: + fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + /* This is kept as a name, because it might be. 
*/ + + case META_COND_RNUMBER: + fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_MARK: + fprintf(stderr, "META (*MARK:"); + goto SHOWARG; + + case META_PRUNE_ARG: + fprintf(stderr, "META (*PRUNE:"); + goto SHOWARG; + + case META_SKIP_ARG: + fprintf(stderr, "META (*SKIP:"); + goto SHOWARG; + + case META_THEN_ARG: + fprintf(stderr, "META (*THEN:"); + SHOWARG: + length = *pptr++; + for (i = 0; i < length; i++) + { + uint32_t cc = *pptr++; + if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc); + else fprintf(stderr, "\\x{%x}", cc); + } + fprintf(stderr, ") length=%u", length); + break; + } + fprintf(stderr, "\n"); + } +return; +} +#endif /* DEBUG_SHOW_PARSED */ + + + +/************************************************* +* Copy compiled code * +*************************************************/ + +/* Compiled JIT code cannot be copied, so the new compiled block has no +associated JIT data. */ + +PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION +pcre2_code_copy(const pcre2_code *code) +{ +PCRE2_SIZE* ref_count; +pcre2_code *newcode; + +if (code == NULL) return NULL; +newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); +if (newcode == NULL) return NULL; +memcpy(newcode, code, code->blocksize); +newcode->executable_jit = NULL; + +/* If the code is one that has been deserialized, increment the reference count +in the decoded tables. */ + +if ((code->flags & PCRE2_DEREF_TABLES) != 0) + { + ref_count = (PCRE2_SIZE *)(code->tables + tables_length); + (*ref_count)++; + } + +return newcode; +} + + + +/************************************************* +* Copy compiled code and character tables * +*************************************************/ + +/* Compiled JIT code cannot be copied, so the new compiled block has no +associated JIT data. This version of code_copy also makes a separate copy of +the character tables. 
*/ + +PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION +pcre2_code_copy_with_tables(const pcre2_code *code) +{ +PCRE2_SIZE* ref_count; +pcre2_code *newcode; +uint8_t *newtables; + +if (code == NULL) return NULL; +newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data); +if (newcode == NULL) return NULL; +memcpy(newcode, code, code->blocksize); +newcode->executable_jit = NULL; + +newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE), + code->memctl.memory_data); +if (newtables == NULL) + { + code->memctl.free((void *)newcode, code->memctl.memory_data); + return NULL; + } +memcpy(newtables, code->tables, tables_length); +ref_count = (PCRE2_SIZE *)(newtables + tables_length); +*ref_count = 1; + +newcode->tables = newtables; +newcode->flags |= PCRE2_DEREF_TABLES; +return newcode; +} + + /************************************************* * Free compiled code * @@ -734,418 +1187,2928 @@ if (code != NULL) /************************************************* -* Insert an automatic callout point * +* Read a number, possibly signed * *************************************************/ -/* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert -callout points before each pattern item. +/* This function is used to read numbers in the pattern. The initial pointer +must be the sign or first digit of the number. When relative values (introduced +by + or -) are allowed, they are relative group numbers, and the result must be +greater than zero. 
Arguments: - code current code pointer - ptr current pattern pointer - cb general compile-time data + ptrptr points to the character pointer variable + ptrend points to the end of the input string + allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this + max_value the largest number allowed + max_error the error to give for an over-large number + intptr where to put the result + errcodeptr where to put an error code -Returns: new code pointer +Returns: TRUE - a number was read + FALSE - errorcode == 0 => no number was found + errorcode != 0 => an error occurred */ -static PCRE2_UCHAR * -auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb) +static BOOL +read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign, + uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr) { -code[0] = OP_CALLOUT; -PUT(code, 1, ptr - cb->start_pattern); /* Pattern offset */ -PUT(code, 1 + LINK_SIZE, 0); /* Default length */ -code[1 + 2*LINK_SIZE] = 255; -return code + PRIV(OP_lengths)[OP_CALLOUT]; +int sign = 0; +uint32_t n = 0; +PCRE2_SPTR ptr = *ptrptr; +BOOL yield = FALSE; + +*errorcodeptr = 0; + +if (allow_sign >= 0 && ptr < ptrend) + { + if (*ptr == CHAR_PLUS) + { + sign = +1; + max_value -= allow_sign; + ptr++; + } + else if (*ptr == CHAR_MINUS) + { + sign = -1; + ptr++; + } + } + +if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE; +while (ptr < ptrend && IS_DIGIT(*ptr)) + { + n = n * 10 + *ptr++ - CHAR_0; + if (n > max_value) + { + *errorcodeptr = max_error; + goto EXIT; + } + } + +if (allow_sign >= 0 && sign != 0) + { + if (n == 0) + { + *errorcodeptr = ERR26; /* +0 and -0 are not allowed */ + goto EXIT; + } + + if (sign > 0) n += allow_sign; + else if ((int)n > allow_sign) + { + *errorcodeptr = ERR15; /* Non-existent subpattern */ + goto EXIT; + } + else n = allow_sign + 1 - n; + } + +yield = TRUE; + +EXIT: +*intptr = n; +*ptrptr = ptr; +return yield; } /************************************************* -* Complete a 
callout item * +* Read repeat counts * *************************************************/ -/* A callout item contains the length of the next item in the pattern, which -we can't fill in till after we have reached the relevant point. This is used -for both automatic and manual callouts. +/* Read an item of the form {n,m} and return the values if non-NULL pointers +are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a +larger value is used for "unlimited". We have to use signed arguments for +read_number() because it is capable of returning a signed value. Arguments: - previous_callout points to previous callout item - ptr current pattern pointer - cb general compile-time data + ptrptr points to pointer to character after'{' + ptrend pointer to end of input + minp if not NULL, pointer to int for min + maxp if not NULL, pointer to int for max (-1 if no max) + returned as -1 if no max + errorcodeptr points to error code variable -Returns: nothing +Returns: FALSE if not a repeat quantifier, errorcode set zero + FALSE on error, with errorcode set non-zero + TRUE on success, with pointer updated to point after '}' */ -static void -complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr, - compile_block *cb) +static BOOL +read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp, + uint32_t *maxp, int *errorcodeptr) { -size_t length = ptr - cb->start_pattern - GET(previous_callout, 1); -PUT(previous_callout, 1 + LINK_SIZE, length); +PCRE2_SPTR p = *ptrptr; +BOOL yield = FALSE; +int32_t min = 0; +int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */ + +/* NB read_number() initializes the error code to zero. The only error is for a +number that is too big. 
*/ + +if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr)) + goto EXIT; + +if (p >= ptrend) goto EXIT; + +if (*p == CHAR_RIGHT_CURLY_BRACKET) + { + p++; + max = min; + } + +else + { + if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT; + if (*p != CHAR_RIGHT_CURLY_BRACKET) + { + if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, + errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) + goto EXIT; + if (max < min) + { + *errorcodeptr = ERR4; + goto EXIT; + } + } + p++; + } + +yield = TRUE; +if (minp != NULL) *minp = (uint32_t)min; +if (maxp != NULL) *maxp = (uint32_t)max; + +/* Update the pattern pointer on success, or after an error, but not when +the result is "not a repeat quantifier". */ + +EXIT: +if (yield || *errorcodeptr != 0) *ptrptr = p; +return yield; + + + } /************************************************* -* Find the fixed length of a branch * +* Handle escapes * *************************************************/ -/* Scan a branch and compute the fixed length of subject that will match it, if -the length is fixed. This is needed for dealing with backward assertions. In -UTF mode, the result is in code units rather than bytes. The branch is -temporarily terminated with OP_END when this function is called. +/* This function is called when a \ has been encountered. It either returns a +positive value for a simple escape such as \d, or 0 for a data character, which +is placed in chptr. A backreference to group n is returned as negative n. On +entry, ptr is pointing at the character after \. On exit, it points after the +final code unit of the escape sequence. -This function is called when a backward assertion is encountered, so that if it -fails, the error message can point to the correct place in the pattern. -However, we cannot do this when the assertion contains subroutine calls, -because they can be forward references. 
We solve this by remembering this case -and doing the check at the end; a flag specifies which mode we are running in. +This function is also called from pcre2_substitute() to handle escape sequences +in replacement strings. In this case, the cb argument is NULL, and in the case +of escapes that have further processing, only sequences that define a data +character are recognised. The isclass argument is not relevant; the options +argument is the final value of the compiled pattern's options. Arguments: - code points to the start of the pattern (the bracket) - utf TRUE in UTF mode - atend TRUE if called when the pattern is complete - cb the "compile data" structure - recurses chain of recurse_check to catch mutual recursion + ptrptr points to the input position pointer + ptrend points to the end of the input + chptr points to a returned data character + errorcodeptr points to the errorcode variable (containing zero) + options the current options bits + isclass TRUE if inside a character class + cb compile data block -Returns: the fixed length, - or -1 if there is no fixed length, - or -2 if \C was encountered (in UTF-8 mode only) - or -3 if an OP_RECURSE item was encountered and atend is FALSE - or -4 if an unknown opcode was encountered (internal error) +Returns: zero => a data character + positive => a special escape sequence + negative => a numerical back reference + on error, errorcodeptr is set non-zero +*/ + +int +PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr, + int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb) +{ +BOOL utf = (options & PCRE2_UTF) != 0; +PCRE2_SPTR ptr = *ptrptr; +uint32_t c, cc; +int escape = 0; +int i; + +/* If backslash is at the end of the string, it's an error. 
*/ + +if (ptr >= ptrend) + { + *errorcodeptr = ERR1; + return 0; + } + +GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ +*errorcodeptr = 0; /* Be optimistic */ + +/* Non-alphanumerics are literals, so we just leave the value in c. An initial +value test saves a memory lookup for code points outside the alphanumeric +range. Otherwise, do a table lookup. A non-zero result is something that can be +returned immediately. Otherwise further processing is required. */ + +if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ + +else if ((i = escapes[c - ESCAPES_FIRST]) != 0) + { + if (i > 0) c = (uint32_t)i; else /* Positive is a data character */ + { + escape = -i; /* Else return a special escape */ + if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X)) + cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ + } + } + +/* Escapes that need further processing, including those that are unknown. +When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u +when BSUX is set). */ + +else + { + PCRE2_SPTR oldptr; + BOOL overflow; + int s; + + /* Filter calls from pcre2_substitute(). */ + + if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x && + (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0)) + { + *errorcodeptr = ERR3; + return 0; + } + + switch (c) + { + /* A number of Perl escapes are not handled by PCRE. We give an explicit + error. */ + + case CHAR_l: + case CHAR_L: + *errorcodeptr = ERR37; + break; + + /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated + specially, \u must be followed by four hex digits. Otherwise it is a + lowercase u letter. 
*/ + + case CHAR_u: + if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else + { + uint32_t xc; + if (ptrend - ptr < 4) break; /* Less than 4 chars */ + if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ + if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ + cc = (cc << 4) | xc; + if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ + cc = (cc << 4) | xc; + if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ + c = (cc << 4) | xc; + ptr += 4; + if (utf) + { + if (c > 0x10ffffU) *errorcodeptr = ERR77; + else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; + } + else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; + } + break; + + /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an + upper case letter. */ + + case CHAR_U: + if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; + break; + + /* In a character class, \g is just a literal "g". Outside a character + class, \g must be followed by one of a number of specific things: + + (1) A number, either plain or braced. If positive, it is an absolute + backreference. If negative, it is a relative backreference. This is a Perl + 5.10 feature. + + (2) Perl 5.10 also supports \g{name} as a reference to a named group. This + is part of Perl's movement towards a unified syntax for back references. As + this is synonymous with \k{name}, we fudge it up by pretending it really + was \k{name}. + + (3) For Oniguruma compatibility we also support \g followed by a name or a + number either in angle brackets or in single quotes. However, these are + (possibly recursive) subroutine calls, _not_ backreferences. We return + the ESC_g code. + + Summary: Return a negative number for a numerical back reference, ESC_k for + a named back reference, and ESC_g for a named or numbered subroutine call. 
+ */ + + case CHAR_g: + if (isclass) break; + + if (ptr >= ptrend) + { + *errorcodeptr = ERR57; + break; + } + + if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE) + { + escape = ESC_g; + break; + } + + /* If there is a brace delimiter, try to read a numerical reference. If + there isn't one, assume we have a name and treat it as \k. */ + + if (*ptr == CHAR_LEFT_CURLY_BRACKET) + { + PCRE2_SPTR p = ptr + 1; + if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, + errorcodeptr)) + { + if (*errorcodeptr == 0) escape = ESC_k; /* No number found */ + break; + } + if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR57; + break; + } + ptr = p + 1; + } + + /* Read an undelimited number */ + + else + { + if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s, + errorcodeptr)) + { + if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */ + break; + } + } + + if (s <= 0) + { + *errorcodeptr = ERR15; + break; + } + + escape = -s; + break; + + /* The handling of escape sequences consisting of a string of digits + starting with one that is not zero is not straightforward. Perl has changed + over the years. Nowadays \g{} for backreferences and \o{} for octal are + recommended to avoid the ambiguities in the old syntax. + + Outside a character class, the digits are read as a decimal number. If the + number is less than 10, or if there are that many previous extracting left + brackets, it is a back reference. Otherwise, up to three octal digits are + read to form an escaped character code. Thus \123 is likely to be octal 123 + (cf \0123, which is octal 012 followed by the literal 3). + + Inside a character class, \ followed by a digit is always either a literal + 8 or 9 or an octal number. 
*/ + + case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: + case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: + + if (!isclass) + { + oldptr = ptr; + ptr--; /* Back to the digit */ + if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s, + errorcodeptr)) + break; + + /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x + are octal escapes if there are not that many previous captures. */ + + if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount) + { + if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61; + else escape = -s; /* Indicates a back reference */ + break; + } + ptr = oldptr; /* Put the pointer back and fall through */ + } + + /* Handle a digit following \ when the number is not a back reference, or + we are within a character class. If the first digit is 8 or 9, Perl used to + generate a binary zero and then treat the digit as a following literal. At + least by Perl 5.18 this changed so as not to insert the binary zero. */ + + if (c >= CHAR_8) break; + + /* Fall through with a digit less than 8 */ + + /* \0 always starts an octal number, but we may drop through to here with a + larger first octal digit. The original code used just to take the least + significant 8 bits of octal numbers (I think this is what early Perls used + to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, + but no more than 3 octal digits. */ + + case CHAR_0: + c -= CHAR_0; + while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) + c = c * 8 + *ptr++ - CHAR_0; +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (!utf && c > 0xff) *errorcodeptr = ERR51; +#endif + break; + + /* \o is a relatively new Perl feature, supporting a more general way of + specifying character codes in octal. The only supported form is \o{ddd}. 
*/ + + case CHAR_o: + if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET) + { + ptr--; + *errorcodeptr = ERR55; + } + else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) + *errorcodeptr = ERR78; + else + { + c = 0; + overflow = FALSE; + while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) + { + cc = *ptr++; + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x20000000l) { overflow = TRUE; break; } +#endif + c = (c << 3) + (cc - CHAR_0); +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } +#elif PCRE2_CODE_UNIT_WIDTH == 16 + if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } +#elif PCRE2_CODE_UNIT_WIDTH == 32 + if (utf && c > 0x10ffffU) { overflow = TRUE; break; } +#endif + } + if (overflow) + { + while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; + *errorcodeptr = ERR34; + } + else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff) + { + ptr--; + *errorcodeptr = ERR73; + } + } + else + { + ptr--; + *errorcodeptr = ERR64; + } + } + break; + + /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by + two hexadecimal digits. Otherwise it is a lowercase x letter. */ + + case CHAR_x: + if ((options & PCRE2_ALT_BSUX) != 0) + { + uint32_t xc; + if (ptrend - ptr < 2) break; /* Less than 2 characters */ + if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */ + if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ + c = (cc << 4) | xc; + ptr += 2; + } /* End PCRE2_ALT_BSUX handling */ + + /* Handle \x in Perl's style. \x{ddd} is a character number which can be + greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex + digits. If not, { used to be treated as a data character. However, Perl + seems to read hex digits up to the first non-such, and ignore the rest, so + that, for example \x{zz} matches a binary zero. 
This seems crazy, so PCRE + now gives an error. */ + + else + { + if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET) + { + if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET) + { + *errorcodeptr = ERR78; + break; + } + c = 0; + overflow = FALSE; + + while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff) + { + ptr++; + if (c == 0 && cc == 0) continue; /* Leading zeroes */ +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x10000000l) { overflow = TRUE; break; } +#endif + c = (c << 4) | cc; + if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) + { + overflow = TRUE; + break; + } + } + + if (overflow) + { + while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++; + *errorcodeptr = ERR34; + } + else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) + { + if (utf && c >= 0xd800 && c <= 0xdfff) + { + ptr--; + *errorcodeptr = ERR73; + } + } + + /* If the sequence of hex digits does not end with '}', give an error. + We used just to recognize this construct and fall through to the normal + \x handling, but nowadays Perl gives an error, which seems much more + sensible, so we do too. */ + + else + { + ptr--; + *errorcodeptr = ERR67; + } + } /* End of \x{} processing */ + + /* Read a up to two hex digits after \x */ + + else + { + c = 0; + if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ + ptr++; + c = cc; + if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */ + ptr++; + c = (c << 4) | cc; + } /* End of \xdd handling */ + } /* End of Perl-style \x handling */ + break; + + /* The handling of \c is different in ASCII and EBCDIC environments. In an + ASCII (or Unicode) environment, an error is given if the character + following \c is not a printable ASCII character. Otherwise, the following + character is upper-cased if it is a letter, and after that the 0x40 bit is + flipped. The result is the value of the escape. 
+ + In an EBCDIC environment the handling of \c is compatible with the + specification in the perlebcdic document. The following character must be + a letter or one of small number of special characters. These provide a + means of defining the character values 0-31. + + For testing the EBCDIC handling of \c in an ASCII environment, recognize + the EBCDIC value of 'c' explicitly. */ + +#if defined EBCDIC && 'a' != 0x81 + case 0x83: +#else + case CHAR_c: +#endif + if (ptr >= ptrend) + { + *errorcodeptr = ERR2; + break; + } + c = *ptr; + if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); + + /* Handle \c in an ASCII/Unicode environment. */ + +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ + { + *errorcodeptr = ERR68; + break; + } + c ^= 0x40; + + /* Handle \c in an EBCDIC environment. The special case \c? is converted to + 255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC + encoding. (This is the way Perl indicates that it handles \c?.) The other + valid sequences correspond to a list of specific characters. */ + +#else + if (c == CHAR_QUESTION_MARK) + c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; + else + { + for (i = 0; i < 32; i++) + { + if (c == ebcdic_escape_c[i]) break; + } + if (i < 32) c = i; else *errorcodeptr = ERR68; + } +#endif /* EBCDIC */ + + ptr++; + break; + + /* Any other alphanumeric following \ is an error. Perl gives an error only + if in warning mode, but PCRE doesn't have a warning mode. */ + + default: + *errorcodeptr = ERR3; + *ptrptr = ptr - 1; /* Point to the character at fault */ + return 0; + } + } + +/* Perl supports \N{name} for character names, as well as plain \N for "not +newline". PCRE does not support \N{name}. However, it does support +quantification such as \N{2,3}. 
*/ + +if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET && + ptrend - ptr > 2) + { + PCRE2_SPTR p = ptr + 1; + if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) && + *errorcodeptr == 0) + *errorcodeptr = ERR37; + } + +/* Set the pointer to the next character before returning. */ + +*ptrptr = ptr; +*chptr = c; +return escape; +} + + + +#ifdef SUPPORT_UNICODE +/************************************************* +* Handle \P and \p * +*************************************************/ + +/* This function is called after \P or \p has been encountered, provided that +PCRE2 is compiled with support for UTF and Unicode properties. On entry, the +contents of ptrptr are pointing after the P or p. On exit, it is left pointing +after the final code unit of the escape sequence. + +Arguments: + ptrptr the pattern position pointer + negptr a boolean that is set TRUE for negation else FALSE + ptypeptr an unsigned int that is set to the type value + pdataptr an unsigned int that is set to the detailed property value + errorcodeptr the error code variable + cb the compile data + +Returns: TRUE if the type value was found, or FALSE for an invalid type +*/ + +static BOOL +get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr, + uint16_t *pdataptr, int *errorcodeptr, compile_block *cb) +{ +PCRE2_UCHAR c; +PCRE2_SIZE i, bot, top; +PCRE2_SPTR ptr = *ptrptr; +PCRE2_UCHAR name[32]; + +if (ptr >= cb->end_pattern) goto ERROR_RETURN; +c = *ptr++; +*negptr = FALSE; + +/* \P or \p can be followed by a name in {}, optionally preceded by ^ for +negation. 
*/ + +if (c == CHAR_LEFT_CURLY_BRACKET) + { + if (ptr >= cb->end_pattern) goto ERROR_RETURN; + if (*ptr == CHAR_CIRCUMFLEX_ACCENT) + { + *negptr = TRUE; + ptr++; + } + for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) + { + if (ptr >= cb->end_pattern) goto ERROR_RETURN; + c = *ptr++; + if (c == CHAR_NULL) goto ERROR_RETURN; + if (c == CHAR_RIGHT_CURLY_BRACKET) break; + name[i] = c; + } + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; + name[i] = 0; + } + +/* Otherwise there is just one following character, which must be an ASCII +letter. */ + +else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) + { + name[0] = c; + name[1] = 0; + } +else goto ERROR_RETURN; + +*ptrptr = ptr; + +/* Search for a recognized property name using binary chop. */ + +bot = 0; +top = PRIV(utt_size); + +while (bot < top) + { + int r; + i = (bot + top) >> 1; + r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); + if (r == 0) + { + *ptypeptr = PRIV(utt)[i].type; + *pdataptr = PRIV(utt)[i].value; + return TRUE; + } + if (r > 0) bot = i + 1; else top = i; + } +*errorcodeptr = ERR47; /* Unrecognized name */ +return FALSE; + +ERROR_RETURN: /* Malformed \P or \p */ +*errorcodeptr = ERR46; +*ptrptr = ptr; +return FALSE; +} +#endif + + + +/************************************************* +* Check for POSIX class syntax * +*************************************************/ + +/* This function is called when the sequence "[:" or "[." or "[=" is +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. + +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. 
Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special cases \\ and \], but does not try to do any other +escape processing. This makes it different from Perl for cases such as +[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does +not recognize "l\ower". This is a lesser evil than not diagnosing bad classes +when Perl does, I think. + +A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. +It seems that the appearance of a nested POSIX class supersedes an apparent +external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or +a digit. This is handled by returning FALSE if the start of a new group with +the same terminator is encountered, since the next closing sequence must close +the nested group, not the outer one. + +In Perl, unescaped square brackets may also appear as part of class names. For +example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for +[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not +seem right at all. PCRE does not allow closing square brackets in POSIX class +names. + +Arguments: + ptr pointer to the character after the initial [ (colon, dot, equals) + ptrend pointer to the end of the pattern + endptr where to return a pointer to the terminating ':', '.', or '=' + +Returns: TRUE or FALSE +*/ + +static BOOL +check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr) +{ +PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ +terminator = *ptr++; /* compiler warns about "non-constant" initializer. 
*/ + +for (; ptrend - ptr >= 2; ptr++) + { + if (*ptr == CHAR_BACKSLASH && + (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH)) + ptr++; + + else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || + *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; + + else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + { + *endptr = ptr; + return TRUE; + } + } + +return FALSE; +} + + + +/************************************************* +* Check POSIX class name * +*************************************************/ + +/* This function is called to check the name given in a POSIX-style class entry +such as [:alnum:]. + +Arguments: + ptr points to the first letter + len the length of the name + +Returns: a value representing the name, or -1 if unknown */ static int -find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb, - recurse_check *recurses) +check_posix_name(PCRE2_SPTR ptr, int len) { -int length = -1; -recurse_check this_recurse; -register int branchlength = 0; -register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE; - -/* Scan along the opcodes for this branch. If we get to the end of the -branch, check the length against that of the other branches. */ - -for (;;) +const char *pn = posix_names; +int yield = 0; +while (posix_name_lengths[yield] != 0) { - int d; - PCRE2_UCHAR *ce, *cs; - register PCRE2_UCHAR op = *cc; + if (len == posix_name_lengths[yield] && + PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; + pn += posix_name_lengths[yield] + 1; + yield++; + } +return -1; +} - switch (op) + + +/************************************************* +* Read a subpattern or VERB name * +*************************************************/ + +/* This function is called from parse_regex() below whenever it needs to read +the name of a subpattern or a (*VERB). The initial pointer must be to the +character before the name. If that character is '*' we are reading a verb name. 
+The pointer is updated to point after the name, for a VERB, or after tha name's +terminator for a subpattern name. Returning both the offset and the name +pointer is redundant information, but some callers use one and some the other, +so it is simplest just to return both. + +Arguments: + ptrptr points to the character pointer variable + ptrend points to the end of the input string + terminator the terminator of a subpattern name must be this + offsetptr where to put the offset from the start of the pattern + nameptr where to put a pointer to the name in the input + namelenptr where to put the length of the name + errcodeptr where to put an error code + cb pointer to the compile data block + +Returns: TRUE if a name was read + FALSE otherwise, with error code set +*/ + +static BOOL +read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t terminator, + PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr, + int *errorcodeptr, compile_block *cb) +{ +PCRE2_SPTR ptr = *ptrptr; +BOOL is_verb = (*ptr == CHAR_ASTERISK); +uint32_t namelen = 0; +uint32_t ctype = is_verb? ctype_letter : ctype_word; + +if (++ptr >= ptrend) + { + *errorcodeptr = is_verb? ERR60: /* Verb not recognized or malformed */ + ERR62; /* Subpattern name expected */ + goto FAILED; + } + +*nameptr = ptr; +*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern); + +if (IS_DIGIT(*ptr)) + { + *errorcodeptr = ERR44; /* Group name must not start with digit */ + goto FAILED; + } + +while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0) + { + ptr++; + namelen++; + if (namelen > MAX_NAME_SIZE) { - /* We only need to continue for OP_CBRA (normal capturing bracket) and - OP_BRA (normal non-capturing bracket) because the other variants of these - opcodes are all concerned with unlimited repeated groups, which of course - are not of fixed length. 
*/ + *errorcodeptr = ERR48; + goto FAILED; + } + } - case OP_CBRA: - case OP_BRA: - case OP_ONCE: - case OP_ONCE_NC: - case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cb, - recurses); - if (d < 0) return d; - branchlength += d; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; +/* Subpattern names must not be empty, and their terminator is checked here. +(What follows a verb name is checked separately.) */ - /* Reached end of a branch; if it's a ket it is the end of a nested call. - If it's ALT it is an alternation in a nested call. An ACCEPT is effectively - an ALT. If it is END it's the end of the outer call. All can be handled by - the same code. Note that we must not include the OP_KETRxxx opcodes here, - because they all imply an unlimited repeat. */ +if (!is_verb) + { + if (namelen == 0) + { + *errorcodeptr = ERR62; /* Subpattern name expected */ + goto FAILED; + } + if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator) + { + *errorcodeptr = ERR42; + goto FAILED; + } + ptr++; + } - case OP_ALT: - case OP_KET: - case OP_END: - case OP_ACCEPT: - case OP_ASSERT_ACCEPT: - if (length < 0) length = branchlength; - else if (length != branchlength) return -1; - if (*cc != OP_ALT) return length; - cc += 1 + LINK_SIZE; - branchlength = 0; - break; +*namelenptr = namelen; +*ptrptr = ptr; +return TRUE; - /* A true recursion implies not fixed length, but a subroutine call may - be OK. If the subroutine is a forward reference, we can't deal with - it until the end of the pattern, so return -3. 
*/ +FAILED: +*ptrptr = ptr; +return FALSE; +} - case OP_RECURSE: - if (!atend) return -3; - cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */ - do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ - if (cc > cs && cc < ce) return -1; /* Recursion */ - else /* Check for mutual recursion */ - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) return -1; /* Mutual recursion */ - } - this_recurse.prev = recurses; - this_recurse.group = cs; - d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cb, &this_recurse); - if (d < 0) return d; - branchlength += d; - cc += 1 + LINK_SIZE; - break; - /* Skip over assertive subpatterns. Note that we must increment cc by - 1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive - situation this assertion may be the one that is ultimately being checked - for having a fixed length, in which case its terminating OP_KET will have - been temporarily replaced by OP_END. */ - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; +/************************************************* +* Manage callouts at start of cycle * +*************************************************/ - /* Skip over things that don't match chars */ +/* At the start of a new item in parse_regex() we are able to record the +details of the previous item in a prior callout, and also to set up an +automatic callout if enabled. Avoid having two adjacent automatic callouts, +which would otherwise happen for items such as \Q that contribute nothing to +the parsed pattern. 
- case OP_MARK: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - cc += cc[1] + PRIV(OP_lengths)[*cc]; - break; +Arguments: + ptr current pattern pointer + pcalloutptr points to a pointer to previous callout, or NULL + options the compiling options + parsed_pattern the parsed pattern pointer + cb compile block - case OP_CALLOUT: - case OP_CIRC: - case OP_CIRCM: - case OP_CLOSE: - case OP_COMMIT: - case OP_CREF: - case OP_FALSE: - case OP_TRUE: - case OP_DNCREF: - case OP_DNRREF: - case OP_DOLL: - case OP_DOLLM: - case OP_EOD: - case OP_EODN: - case OP_FAIL: - case OP_NOT_WORD_BOUNDARY: - case OP_PRUNE: - case OP_REVERSE: - case OP_RREF: - case OP_SET_SOM: - case OP_SKIP: - case OP_SOD: - case OP_SOM: - case OP_THEN: - case OP_WORD_BOUNDARY: - cc += PRIV(OP_lengths)[*cc]; - break; +Returns: possibly updated parsed_pattern pointer. +*/ - case OP_CALLOUT_STR: - cc += GET(cc, 1 + 2*LINK_SIZE); - break; +static uint32_t * +manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, uint32_t options, + uint32_t *parsed_pattern, compile_block *cb) +{ +uint32_t *previous_callout = *pcalloutptr; - /* Handle literal characters */ +if (previous_callout != NULL) previous_callout[2] = ptr - cb->start_pattern - + (PCRE2_SIZE)previous_callout[1]; - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - branchlength++; - cc += 2; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; +if ((options & PCRE2_AUTO_CALLOUT) == 0) previous_callout = NULL; else + { + if (previous_callout == NULL || + previous_callout != parsed_pattern - 4 || + previous_callout[3] != 255) + { + previous_callout = parsed_pattern; /* Set up new automatic callout */ + parsed_pattern += 4; + previous_callout[0] = META_CALLOUT_NUMBER; + previous_callout[2] = 0; + previous_callout[3] = 255; + } + previous_callout[1] = (uint32_t)(ptr - cb->start_pattern); + } - /* Handle exact repetitions. 
The count is already in characters, but we - need to skip over a multibyte character in UTF8 mode. */ +*pcalloutptr = previous_callout; +return parsed_pattern; +} - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - branchlength += (int)GET2(cc,1); - cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - break; - case OP_TYPEEXACT: - branchlength += GET2(cc,1); - if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) - cc += 2; - cc += 1 + IMM2_SIZE + 1; - break; - /* Handle single-char matchers */ +/************************************************* +* Parse regex and identify named groups * +*************************************************/ - case OP_PROP: - case OP_NOTPROP: - cc += 2; - /* Fall through */ +/* This function is called first of all. It scans the pattern and does two +things: (1) It identifies capturing groups and makes a table of named capturing +groups so that information about them is fully available to both the compiling +scans. (2) It writes a parsed version of the pattern with comments omitted and +escapes processed into the parsed_pattern vector. - case OP_HSPACE: - case OP_VSPACE: - case OP_NOT_HSPACE: - case OP_NOT_VSPACE: - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - branchlength++; - cc++; - break; +Arguments: + ptr points to the start of the pattern + options compiling dynamic options (may change during the scan) + has_lookbehind points to a boolean, set TRUE if a lookbehind is found + cb pointer to the compile data block - /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; - otherwise \C is coded as OP_ALLANY. 
*/ +Returns: zero on success or a non-zero error code, with the + error offset placed in the cb field +*/ - case OP_ANYBYTE: - return -2; +/* A structure and some flags for dealing with nested groups. */ - /* Check a class for variable quantification */ +typedef struct nest_save { + uint16_t nest_depth; + uint16_t reset_group; + uint16_t max_group; + uint16_t flags; +} nest_save; - case OP_CLASS: - case OP_NCLASS: -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - /* The original code caused an unsigned overflow in 64 bit systems, - so now we use a conditional statement. */ - if (op == OP_XCLASS) - cc += GET(cc, 1); - else - cc += PRIV(OP_lengths)[OP_CLASS]; +#define NSF_RESET 0x0001u +#define NSF_EXTENDED 0x0002u +#define NSF_DUPNAMES 0x0004u +#define NSF_CONDASSERT 0x0008u + +/* States used for analyzing ranges in character classes. The two OK values +must be last. */ + +enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; + +/* Only in 32-bit mode can there be literals > META_END. A macros encapsulates +the storing of literal values in the parsed pattern. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define PARSED_LITERAL(c, p) \ + { \ + if (c >= META_END) *p++ = META_BIGVALUE; \ + *p++ = c; \ + okquantifier = TRUE; \ + } #else - cc += PRIV(OP_lengths)[OP_CLASS]; +#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE; #endif - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSPLUS: - case OP_CRPOSQUERY: - return -1; +/* Here's the actual function. 
*/ - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; - branchlength += (int)GET2(cc,1); - cc += 1 + 2 * IMM2_SIZE; +static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind, + compile_block *cb) +{ +uint32_t c; +uint32_t delimiter; +uint32_t namelen; +uint32_t class_range_state; +uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */ +uint32_t *previous_callout = NULL; +uint32_t *parsed_pattern = cb->parsed_pattern; +uint32_t *parsed_pattern_end = cb->parsed_pattern_end; +uint32_t meta_quantifier = 0; +uint16_t nest_depth = 0; +int after_manual_callout = 0; +int expect_cond_assert = 0; +int errorcode = 0; +int escape; +int i; +BOOL inescq = FALSE; +BOOL inverbname = FALSE; +BOOL utf = (options & PCRE2_UTF) != 0; +BOOL isdupname; +BOOL negate_class; +BOOL okquantifier = FALSE; +PCRE2_SPTR name; +PCRE2_SPTR ptrend = cb->end_pattern; +PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ +named_group *ng; +nest_save *top_nest = NULL; +nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); + +/* The size of the nest_save structure might not be a factor of the size of the +workspace. Therefore we must round down end_nests so as to correctly avoid +creating a nest_save that spans the end of the workspace. 
*/ + +end_nests = (nest_save *)((char *)end_nests - + ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save))); + +/* Now scan the pattern */ + +*has_lookbehind = FALSE; + +while (ptr < ptrend) + { + int prev_expect_cond_assert; + uint32_t min_repeat, max_repeat; + uint32_t set, unset, *optset; + uint32_t terminator; + uint32_t prev_meta_quantifier; + BOOL prev_okquantifier; + PCRE2_SPTR tempptr; + PCRE2_SPTR thisptr; + PCRE2_SIZE offset; + + if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } + + if (nest_depth > cb->cx->parens_nest_limit) + { + errorcode = ERR19; + goto FAILED; + } + + /* Get next input character, save its position for callout handling. */ + + thisptr = ptr; + GETCHARINCTEST(c, ptr); + + /* Copy quoted literals until \E, allowing for the possibility of automatic + callouts, except when processing a (*VERB) "name". */ + + if (inescq) + { + if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) + { + inescq = FALSE; + ptr++; /* Skip E */ + } + else + { + if (expect_cond_assert > 0) /* A literal is not allowed if we are */ + { /* expecting a conditional assertion, */ + ptr--; /* but an empty \Q\E sequence is OK. */ + errorcode = ERR28; + goto FAILED; + } + if (!inverbname && after_manual_callout-- <= 0) + parsed_pattern = manage_callouts(thisptr, &previous_callout, options, + parsed_pattern, cb); + PARSED_LITERAL(c, parsed_pattern); + meta_quantifier = 0; + } + continue; /* Next character */ + } + + /* If we are processing the "name" part of a (*VERB:NAME) item, all + characters up to the closing parenthesis are literals except when + PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q + and \E and escaped characters are allowed (no character types such as \d). If + PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do + this by not entering the special (*VERB:NAME) processing - they are then + picked up below. 
Note that c is a character, not a code unit, so we must not + use MAX_255 to test its size because MAX_255 tests code units and is assumed + TRUE in 8-bit mode. */ + + if (inverbname && + ( + /* EITHER: not both options set */ + ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) != + (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) || + /* OR: character > 255 */ + c > 255 || + /* OR: not a # comment or white space */ + (c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0) + )) + { + PCRE2_SIZE verbnamelength; + + switch(c) + { + default: + PARSED_LITERAL(c, parsed_pattern); + break; + + case CHAR_RIGHT_PARENTHESIS: + inverbname = FALSE; + okquantifier = FALSE; /* Was probably set by literals */ + /* This is the length in characters */ + verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); + /* But the limit on the length is in code units */ + if (ptr - verbnamestart - 1 > (int)MAX_MARK) + { + ptr--; + errorcode = ERR76; + goto FAILED; + } + *verblengthptr = (uint32_t)verbnamelength; + break; + + case CHAR_BACKSLASH: + if ((options & PCRE2_ALT_VERBNAMES) != 0) + { + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + FALSE, cb); + if (errorcode != 0) goto FAILED; + } + else escape = 0; /* Treat all as literal */ + + switch(escape) + { + case 0: + PARSED_LITERAL(c, parsed_pattern); + break; + + case ESC_Q: + inescq = TRUE; + break; + + case ESC_E: /* Ignore */ + break; + + default: + errorcode = ERR40; /* Invalid in verb name */ + goto FAILED; + } + } + continue; /* Next character in pattern */ + } + + /* Not a verb name character. At this point we must process everything that + must not change the quantification state. This is mainly comments, but we + handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as + A+, as in Perl. An isolated \E is ignored. 
*/ + + if (c == CHAR_BACKSLASH && ptr < ptrend) + { + if (*ptr == CHAR_Q || *ptr == CHAR_E) + { + inescq = *ptr == CHAR_Q; + ptr++; + continue; + } + } + + /* Skip over whitespace and # comments in extended mode. Note that c is a + character, not a code unit, so we must not use MAX_255 to test its size + because MAX_255 tests code units and is assumed TRUE in 8-bit mode. */ + + if ((options & PCRE2_EXTENDED) != 0) + { + if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue; + if (c == CHAR_NUMBER_SIGN) + { + while (ptr < ptrend) + { + if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ + { /* IS_NEWLINE sets cb->nllen. */ + ptr += cb->nllen; + break; + } + ptr++; +#ifdef SUPPORT_UNICODE + if (utf) FORWARDCHARTEST(ptr, ptrend); +#endif + } + continue; /* Next character in pattern */ + } + } + + /* Skip over bracketed comments */ + + if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 && + ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) + { + while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS); + if (ptr >= ptrend) + { + errorcode = ERR18; /* A special error for missing ) in a comment */ + goto FAILED; /* to make it easier to debug. */ + } + ptr++; + continue; /* Next character in pattern */ + } + + /* If the next item is not a quantifier, fill in length of any previous + callout and create an auto callout if required. */ + + if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK && + (c != CHAR_LEFT_CURLY_BRACKET || + (tempptr = ptr, + !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode)))) + { + if (after_manual_callout-- <= 0) + parsed_pattern = manage_callouts(thisptr, &previous_callout, options, + parsed_pattern, cb); + } + + /* If expect_cond_assert is 2, we have just passed (?( and are expecting an + assertion, possibly preceded by a callout. If the value is 1, we have just + had the callout and expect an assertion. There must be at least 3 more + characters in all cases. 
When expect_cond_assert is 2, we know that the + current character is an opening parenthesis, as otherwise we wouldn't be + here. However, when it is 1, we need to check, and it's easiest just to check + always. Note that expect_cond_assert may be negative, since all callouts just + decrement it. */ + + if (expect_cond_assert > 0) + { + BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 && + ptr[0] == CHAR_QUESTION_MARK; + if (ok) switch(ptr[1]) + { + case CHAR_C: + ok = expect_cond_assert == 2; + break; + + case CHAR_EQUALS_SIGN: + case CHAR_EXCLAMATION_MARK: + break; + + case CHAR_LESS_THAN_SIGN: + ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK; break; default: - branchlength++; + ok = FALSE; + } + + if (!ok) + { + ptr--; /* Adjust error offset */ + errorcode = ERR28; + goto FAILED; + } + } + + /* Remember whether we are expecting a conditional assertion, and set the + default for this item. */ + + prev_expect_cond_assert = expect_cond_assert; + expect_cond_assert = 0; + + /* Remember quantification status for the previous significant item, then set + default for this item. */ + + prev_okquantifier = okquantifier; + prev_meta_quantifier = meta_quantifier; + okquantifier = FALSE; + meta_quantifier = 0; + + /* If the previous significant item was a quantifier, adjust the parsed code + if there is a following modifier. The base meta value is always followed by + the PLUS and QUERY values, in that order. We do this here rather than after + reading a quantifier so that intervening comments and /x whitespace can be + ignored without having to replicate code. */ + + if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS)) + { + parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] = + prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)? + 0x00020000u : 0x00010000u); + continue; /* Next character in pattern */ + } + + + /* Process the next item in the main part of a pattern. 
*/ + + switch(c) + { + default: /* Non-special character */ + PARSED_LITERAL(c, parsed_pattern); + break; + + + /* ---- Escape sequence ---- */ + + case CHAR_BACKSLASH: + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, + FALSE, cb); + if (errorcode != 0) goto FAILED; + + /* The escape was a data character. */ + + if (escape == 0) + { + PARSED_LITERAL(c, parsed_pattern); + } + + /* The escape was a back (or forward) reference. We keep the offset in + order to give a more useful diagnostic for a bad forward reference. For + references to groups numbered less than 10 we can't use more than two items + in parsed_pattern because they may be just two characters in the input (and + in a 64-bit world an offset may need two elements). So for them, the offset + of the first occurrent is held in a special vector. */ + + else if (escape < 0) + { + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1); + escape = -escape; + *parsed_pattern++ = META_BACKREF | (uint32_t)escape; + if (escape < 10) + { + if (cb->small_ref_offset[escape] == PCRE2_UNSET) + cb->small_ref_offset[escape] = offset; + } + else + { + PUTOFFSET(offset, parsed_pattern); + } + okquantifier = TRUE; + } + + /* The escape was a character class such as \d etc. or other special + escape indicator such as \A or \X. Most of them generate just a single + parsed item, but \P and \p are followed by a 16-bit type and a 16-bit + value. They are supported only when Unicode is available. The type and + value are packed into a single 32-bit value so that the whole sequences + uses only two elements in the parsed_vector. This is because the same + coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is + set. + + There are also some cases where the escape sequence is followed by a name: + \k{name}, \k , and \k'name' are backreferences by name, and \g + and \g'name' are subroutine calls by name; \g{name} is a synonym for + \k{name}. 
Note that \g and \g'number' are handled by check_escape() + and returned as a negative value (handled above). A name is coded as an + offset into the pattern and a length. */ + + else switch (escape) + { + case ESC_C: +#ifdef NEVER_BACKSLASH_C + errorcode = ERR85; + goto FAILED; +#else + if ((options & PCRE2_NEVER_BACKSLASH_C) != 0) + { + errorcode = ERR83; + goto FAILED; + } +#endif + okquantifier = TRUE; + *parsed_pattern++ = META_ESCAPE + escape; + break; + + case ESC_X: +#ifndef SUPPORT_UNICODE + errorcode = ERR45; /* Supported only with Unicode support */ + goto FAILED; +#endif + case ESC_H: + case ESC_h: + case ESC_N: + case ESC_R: + case ESC_V: + case ESC_v: + okquantifier = TRUE; + *parsed_pattern++ = META_ESCAPE + escape; + break; + + default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */ + *parsed_pattern++ = META_ESCAPE + escape; + break; + + /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set + without Unicode support because it is checked when pcre2_compile() is + called. */ + + case ESC_d: + case ESC_D: + case ESC_s: + case ESC_S: + case ESC_w: + case ESC_W: + okquantifier = TRUE; + if ((options & PCRE2_UCP) == 0) + { + *parsed_pattern++ = META_ESCAPE + escape; + } + else + { + *parsed_pattern++ = META_ESCAPE + + ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? + ESC_p : ESC_P); + switch(escape) + { + case ESC_d: + case ESC_D: + *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; + break; + + case ESC_s: + case ESC_S: + *parsed_pattern++ = PT_SPACE << 16; + break; + + case ESC_w: + case ESC_W: + *parsed_pattern++ = PT_WORD << 16; + break; + } + } + break; + + /* Unicode property matching */ + + case ESC_P: + case ESC_p: +#ifdef SUPPORT_UNICODE + { + BOOL negated; + uint16_t ptype = 0, pdata = 0; + if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) + goto FAILED; + if (negated) escape = (escape == ESC_P)? 
ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; + okquantifier = TRUE; + } +#else + errorcode = ERR45; + goto FAILED; +#endif + break; /* End \P and \p */ + + /* When \g is used with quotes or angle brackets as delimiters, it is a + numerical or named subroutine call, and control comes here. When used + with brace delimiters it is a numberical back reference and does not come + here because check_escape() returns it directly as a reference. \k is + always a named back reference. */ + + case ESC_g: + case ESC_k: + if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET && + *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE)) + { + errorcode = (escape == ESC_g)? ERR57 : ERR69; + goto FAILED; + } + terminator = (*ptr == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? + CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; + + /* For a non-braced \g, check for a numerical recursion. */ + + if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET) + { + PCRE2_SPTR p = ptr + 1; + + if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, + &errorcode)) + { + if (p >= ptrend || *p != terminator) + { + errorcode = ERR57; + goto FAILED; + } + ptr = p; + goto SET_RECURSION; + } + if (errorcode != 0) goto FAILED; + } + + /* Not a numerical recursion */ + + if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + + /* \k and \g when used with braces are back references, whereas \g used + with quotes or angle brackets is a recursion */ + + *parsed_pattern++ = + (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)? 
+ META_BACKREF_BYNAME : META_RECURSE_BYNAME; + *parsed_pattern++ = namelen; + + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; + } + break; /* End escape sequence processing */ + + + /* ---- Single-character special items ---- */ + + case CHAR_CIRCUMFLEX_ACCENT: + *parsed_pattern++ = META_CIRCUMFLEX; + break; + + case CHAR_DOLLAR_SIGN: + *parsed_pattern++ = META_DOLLAR; + break; + + case CHAR_DOT: + *parsed_pattern++ = META_DOT; + okquantifier = TRUE; + break; + + + /* ---- Single-character quantifiers ---- */ + + case CHAR_ASTERISK: + meta_quantifier = META_ASTERISK; + goto CHECK_QUANTIFIER; + + case CHAR_PLUS: + meta_quantifier = META_PLUS; + goto CHECK_QUANTIFIER; + + case CHAR_QUESTION_MARK: + meta_quantifier = META_QUERY; + goto CHECK_QUANTIFIER; + + + /* ---- Potential {n,m} quantifier ---- */ + + case CHAR_LEFT_CURLY_BRACKET: + if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat, + &errorcode)) + { + if (errorcode != 0) goto FAILED; /* Error in quantifier. */ + PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */ + break; /* No more quantifier processing */ + } + meta_quantifier = META_MINMAX; + /* Fall through */ + + + /* ---- Quantifier post-processing ---- */ + + /* Check that a quantifier is allowed after the previous item. */ + + CHECK_QUANTIFIER: + if (!prev_okquantifier) + { + errorcode = ERR9; + goto FAILED_BACK; + } + + /* Now we can put the quantifier into the parsed pattern vector. At this + stage, we have only the basic quantifier. The check for a following + or ? + modifier happens at the top of the loop, after any intervening comments + have been removed. 
*/ + + *parsed_pattern++ = meta_quantifier; + if (c == CHAR_LEFT_CURLY_BRACKET) + { + *parsed_pattern++ = min_repeat; + *parsed_pattern++ = max_repeat; } break; - /* Anything else is variable length */ - case OP_ANYNL: - case OP_BRAMINZERO: - case OP_BRAPOS: - case OP_BRAPOSZERO: - case OP_BRAZERO: - case OP_CBRAPOS: - case OP_EXTUNI: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_PLUS: - case OP_PLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_POSQUERY: - case OP_POSQUERYI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_QUERY: - case OP_QUERYI: - case OP_REF: - case OP_REFI: - case OP_DNREF: - case OP_DNREFI: - case OP_SBRA: - case OP_SBRAPOS: - case OP_SCBRA: - case OP_SCBRAPOS: - case OP_SCOND: - case OP_SKIPZERO: - case OP_STAR: - case OP_STARI: - case OP_TYPEMINPLUS: - case OP_TYPEMINQUERY: - case OP_TYPEMINSTAR: - case OP_TYPEMINUPTO: - case OP_TYPEPLUS: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSUPTO: - case OP_TYPEQUERY: - case OP_TYPESTAR: - case OP_TYPEUPTO: - case OP_UPTO: - case OP_UPTOI: - return -1; + /* ---- Character class ---- */ - /* Catch unrecognized opcodes so that when new ones are added they - are not forgotten, as has happened in the past. 
*/ + case CHAR_LEFT_SQUARE_BRACKET: + okquantifier = TRUE; - default: - return -4; - } + /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is + used for "start of word" and "end of word". As these are otherwise illegal + sequences, we don't break anything by recognizing them. They are replaced + by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are + erroneous and are handled by the normal code below. */ + + if (ptrend - ptr >= 6 && + (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 || + PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0)) + { + *parsed_pattern++ = META_ESCAPE + ESC_b; + + if (ptr[2] == CHAR_LESS_THAN_SIGN) + { + *parsed_pattern++ = META_LOOKAHEAD; + } + else + { + *parsed_pattern++ = META_LOOKBEHIND; + *has_lookbehind = TRUE; + + /* The offset is used only for the "non-fixed length" error; this won't + occur here, so just store zero. */ + + PUTOFFSET((PCRE2_SIZE)0, parsed_pattern); + } + + if ((options & PCRE2_UCP) == 0) + *parsed_pattern++ = META_ESCAPE + ESC_w; + else + { + *parsed_pattern++ = META_ESCAPE + ESC_p; + *parsed_pattern++ = PT_WORD << 16; + } + *parsed_pattern++ = META_KET; + ptr += 6; + break; + } + + /* PCRE supports POSIX class stuff inside a class. Perl gives an error if + they are encountered at the top level, so we'll do that too. */ + + if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT || + *ptr == CHAR_EQUALS_SIGN) && + check_posix_syntax(ptr, ptrend, &tempptr)) + { + errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13; + goto FAILED; + } + + /* Process a regular character class. If the first character is '^', set + the negation flag. If the first few characters (either before or after ^) + are \Q\E or \E we skip them too. This makes for compatibility with Perl. 
*/ + + negate_class = FALSE; + while (ptr < ptrend) + { + GETCHARINCTEST(c, ptr); + if (c == CHAR_BACKSLASH) + { + if (ptr < ptrend && *ptr == CHAR_E) ptr++; + else if (ptrend - ptr >= 3 && + PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; + } + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) + negate_class = TRUE; + else break; + } + + /* Now the real contents of the class; c has the first "real" character. + Empty classes are permitted only if the option is set. */ + + if (c == CHAR_RIGHT_SQUARE_BRACKET && + (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) + { + *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY; + break; /* End of class processing */ + } + + /* Process a non-empty class. */ + + *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS; + class_range_state = RANGE_NO; + + /* In an EBCDIC environment, Perl treats alphabetic ranges specially + because there are holes in the encoding, and simply using the range A-Z + (for example) would include the characters in the holes. This applies only + to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z] + in this respect. In order to accommodate this, we keep track of whether + character values are literal or not, and a state variable for handling + ranges. */ + + /* Loop for the contents of the class */ + + for (;;) + { + BOOL char_is_literal = TRUE; + + /* Inside \Q...\E everything is literal except \E */ + + if (inescq) + { + if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E) + { + inescq = FALSE; /* Reset literal state */ + ptr++; /* Skip the 'E' */ + goto CLASS_CONTINUE; + } + goto CLASS_LITERAL; + } + + /* Handle POSIX class names. Perl allows a negation extension of the + form [:^name:]. A square bracket that doesn't match the syntax is + treated as a literal. We also recognize the POSIX constructions + [.ch.] and [=ch=] ("collating elements") and fault them, as Perl + 5.6 and 5.8 do. 
*/ + + if (c == CHAR_LEFT_SQUARE_BRACKET && + ptrend - ptr >= 3 && + (*ptr == CHAR_COLON || *ptr == CHAR_DOT || + *ptr == CHAR_EQUALS_SIGN) && + check_posix_syntax(ptr, ptrend, &tempptr)) + { + BOOL posix_negate = FALSE; + int posix_class; + + /* Perl treats a hyphen before a POSIX class as a literal, not the + start of a range. However, it gives a warning in its warning mode. PCRE + does not have a warning mode, so we give an error, because this is + likely an error on the user's part. */ + + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; + goto FAILED; + } + + if (*ptr != CHAR_COLON) + { + errorcode = ERR13; + goto FAILED_BACK; + } + + if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT) + { + posix_negate = TRUE; + ptr++; + } + + posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); + if (posix_class < 0) + { + errorcode = ERR30; + goto FAILED; + } + ptr = tempptr + 2; + + /* Perl treats a hyphen after a POSIX class as a literal, not the + start of a range. However, it gives a warning in its warning mode. PCRE + does not have a warning mode, so we give an error, because this is + likely an error on the user's part. */ + + if (ptr < ptrend && *ptr == CHAR_MINUS) + { + errorcode = ERR50; + goto FAILED; + } + + /* Set "a hyphen is not the start of a range" just in case the POSIX + class is followed by \E or \Q\E (possibly repeated - fuzzers do that + kind of thing) and *then* a hyphen. This causes that hyphen to be + treated as a literal. I don't think it's worth setting up special + apparatus to do otherwise. */ + + class_range_state = RANGE_NO; + + /* When PCRE2_UCP is set, some of the POSIX classes are converted to + use Unicode properties \p or \P or, in one case, \h or \H. The + substitutes table has two values per class, containing the type and + value of a \p or \P item. The special cases are specified with a + negative type: a non-zero value causes \h or \H to be used, and a zero + value falls through to behave like a non-UCP POSIX class. 
*/ + +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UCP) != 0) + { + int ptype = posix_substitutes[2*posix_class]; + int pvalue = posix_substitutes[2*posix_class + 1]; + if (ptype >= 0) + { + *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p); + *parsed_pattern++ = (ptype << 16) | pvalue; + goto CLASS_CONTINUE; + } + + if (pvalue != 0) + { + *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h); + goto CLASS_CONTINUE; + } + + /* Fall through */ + } +#endif /* SUPPORT_UNICODE */ + + /* Non-UCP POSIX class */ + + *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX; + *parsed_pattern++ = posix_class; + } + + /* Handle potential start of range */ + + else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED) + { + *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)? + META_RANGE_LITERAL : META_RANGE_ESCAPED; + class_range_state = RANGE_STARTED; + } + + /* Handle a literal character */ + + else if (c != CHAR_BACKSLASH) + { + CLASS_LITERAL: + if (class_range_state == RANGE_STARTED) + { + if (c == parsed_pattern[-2]) /* Optimize one-char range */ + parsed_pattern--; + else if (parsed_pattern[-2] > c) /* Check range is in order */ + { + errorcode = ERR8; + goto FAILED_BACK; + } + else + { + if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL) + parsed_pattern[-1] = META_RANGE_ESCAPED; + PARSED_LITERAL(c, parsed_pattern); + } + class_range_state = RANGE_NO; + } + else /* Potential start of range */ + { + class_range_state = char_is_literal? + RANGE_OK_LITERAL : RANGE_OK_ESCAPED; + PARSED_LITERAL(c, parsed_pattern); + } + } + + /* Handle escapes in a class */ + + else + { + escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, + options, TRUE, cb); + + if (errorcode != 0) goto FAILED; + if (escape == 0) /* Escaped character code point is in c */ + { + char_is_literal = FALSE; + goto CLASS_LITERAL; + } + + /* These three escapes do not alter the class range state. 
*/ + + if (escape == ESC_b) + { + c = CHAR_BS; /* \b is backspace in a class */ + char_is_literal = FALSE; + goto CLASS_LITERAL; + } + + else if (escape == ESC_Q) + { + inescq = TRUE; /* Enter literal mode */ + goto CLASS_CONTINUE; + } + + else if (escape == ESC_E) /* Ignore orphan \E */ + goto CLASS_CONTINUE; + + /* The second part of a range can be a single-character escape + sequence (detected above), but not any of the other escapes. Perl + treats a hyphen as a literal in such circumstances. However, in Perl's + warning mode, a warning is given, so PCRE now faults it, as it is + almost certainly a mistake on the user's part. */ + + if (class_range_state == RANGE_STARTED) + { + errorcode = ERR50; + goto FAILED; + } + + /* Of the remaining escapes, only those that define characters are + allowed in a class. None may start a range. */ + + class_range_state = RANGE_NO; + switch(escape) + { + case ESC_N: + errorcode = ERR71; /* Not supported in a class */ + goto FAILED; + + case ESC_H: + case ESC_h: + case ESC_V: + case ESC_v: + *parsed_pattern++ = META_ESCAPE + escape; + break; + + /* These escapes are converted to Unicode property tests when + PCRE2_UCP is set. */ + + case ESC_d: + case ESC_D: + case ESC_s: + case ESC_S: + case ESC_w: + case ESC_W: + if ((options & PCRE2_UCP) == 0) + { + *parsed_pattern++ = META_ESCAPE + escape; + } + else + { + *parsed_pattern++ = META_ESCAPE + + ((escape == ESC_d || escape == ESC_s || escape == ESC_w)? 
+ ESC_p : ESC_P); + switch(escape) + { + case ESC_d: + case ESC_D: + *parsed_pattern++ = (PT_PC << 16) | ucp_Nd; + break; + + case ESC_s: + case ESC_S: + *parsed_pattern++ = PT_SPACE << 16; + break; + + case ESC_w: + case ESC_W: + *parsed_pattern++ = PT_WORD << 16; + break; + } + } + break; + + /* Explicit Unicode property matching */ + + case ESC_P: + case ESC_p: +#ifdef SUPPORT_UNICODE + { + BOOL negated; + uint16_t ptype = 0, pdata = 0; + if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb)) + goto FAILED; + if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P; + *parsed_pattern++ = META_ESCAPE + escape; + *parsed_pattern++ = (ptype << 16) | pdata; + } +#else + errorcode = ERR45; + goto FAILED; +#endif + break; /* End \P and \p */ + + default: /* All others are not allowed in a class */ + errorcode = ERR7; + goto FAILED_BACK; + } + } + + /* Proceed to next thing in the class. */ + + CLASS_CONTINUE: + if (ptr >= ptrend) + { + errorcode = ERR6; /* Missing terminating ']' */ + goto FAILED; + } + GETCHARINCTEST(c, ptr); + if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; + } /* End of class-processing loop */ + + if (class_range_state == RANGE_STARTED) + { + parsed_pattern[-1] = CHAR_MINUS; + class_range_state = RANGE_NO; + } + + *parsed_pattern++ = META_CLASS_END; + break; /* End of character class */ + + + /* ---- Opening parenthesis ---- */ + + case CHAR_LEFT_PARENTHESIS: + if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* If ( is not followed by ? it is either a capture or a special verb. */ + + if (*ptr != CHAR_QUESTION_MARK) + { + const char *vn; + + /* Handle capturing brackets (or non-capturing if auto-capture is turned + off). 
*/ + + if (*ptr != CHAR_ASTERISK) + { + nest_depth++; + if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) + { + cb->bracount++; + *parsed_pattern++ = META_CAPTURE | cb->bracount; + } + else *parsed_pattern++ = META_NOCAPTURE; + } + + + /* ---- Handle (*VERB) and (*VERB:NAME) ---- */ + + /* Do nothing for (*) so it gives a "bad quantifier" error rather than + "(*MARK) must have an argument". */ + + else if (ptrend - ptr > 1 && ptr[1] != CHAR_RIGHT_PARENTHESIS) + { + vn = verbnames; + if (!read_name(&ptr, ptrend, 0, &offset, &name, &namelen, &errorcode, + cb)) goto FAILED; + if (ptr >= ptrend || (*ptr != CHAR_COLON && + *ptr != CHAR_RIGHT_PARENTHESIS)) + { + errorcode = ERR60; /* Malformed */ + goto FAILED; + } + + /* Scan the table of verb names */ + + for (i = 0; i < verbcount; i++) + { + if (namelen == verbs[i].len && + PRIV(strncmp_c8)(name, vn, namelen) == 0) + break; + vn += verbs[i].len + 1; + } + + if (i >= verbcount) + { + errorcode = ERR60; /* Verb not recognized */ + goto FAILED; + } + + /* An empty argument is treated as no argument. */ + + if (*ptr == CHAR_COLON && ptr + 1 < ptrend && + ptr[1] == CHAR_RIGHT_PARENTHESIS) + ptr++; /* Advance to the closing parens */ + + /* Check for mandatory non-empty argument; this is (*MARK) */ + + if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON) + { + errorcode = ERR66; + goto FAILED; + } + + /* It appears that Perl allows any characters whatsoever, other than a + closing parenthesis, to appear in arguments ("names"), so we no longer + insist on letters, digits, and underscores. Perl does not, however, do + any interpretation within arguments, and has no means of including a + closing parenthesis. PCRE supports escape processing but only when it + is requested by an option. We set inverbname TRUE here, and let the + main loop take care of this so that escape and \x processing is done by + the main code above. 
*/ + + if (*ptr++ == CHAR_COLON) /* Skip past : or ) */ + { + if (verbs[i].has_arg < 0) /* Argument is forbidden */ + { + errorcode = ERR59; + goto FAILED; + } + *parsed_pattern++ = verbs[i].meta + + ((verbs[i].meta != META_MARK)? 0x00010000u:0); + verblengthptr = parsed_pattern++; + verbnamestart = ptr; + inverbname = TRUE; + } + else /* No verb "name" argument */ + { + *parsed_pattern++ = verbs[i].meta; + } + } /* End of (*VERB) handling */ + break; /* Done with this parenthesis */ + } /* End of groups that don't start with (? */ + + + /* ---- Items starting (? ---- */ + + /* The type of item is determined by what follows (?. Handle (?| and option + changes under "default" because both need a new block on the nest stack. + Comments starting with (?# are handled above. Note that there is some + ambiguity about the sequence (?- because if a digit follows it's a relative + recursion or subroutine call whereas otherwise it's an option unsetting. */ + + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + switch(*ptr) + { + default: + if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1])) + goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */ + + /* We now have either (?| or a (possibly empty) option setting, + optionally followed by a non-capturing group. */ + + nest_depth++; + if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); + else if (++top_nest >= end_nests) + { + errorcode = ERR84; + goto FAILED; + } + top_nest->nest_depth = nest_depth; + top_nest->flags = 0; + if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED; + if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES; + + /* Start of non-capturing group that resets the capture count for each + branch. 
*/ + + if (*ptr == CHAR_VERTICAL_LINE) + { + top_nest->reset_group = (uint16_t)cb->bracount; + top_nest->max_group = (uint16_t)cb->bracount; + top_nest->flags |= NSF_RESET; + cb->external_flags |= PCRE2_DUPCAPUSED; + *parsed_pattern++ = META_NOCAPTURE; + ptr++; + } + + /* Scan for options imsxJU. We need to keep track of (?x) and (?J) for + use while scanning. The other options are used during the compiling + phases. */ + + else + { + top_nest->reset_group = 0; + top_nest->max_group = 0; + set = unset = 0; + optset = &set; + + while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS && + *ptr != CHAR_COLON) + { + switch (*ptr++) + { + case CHAR_MINUS: optset = &unset; break; + + case CHAR_J: /* Record that it changed in the external options */ + *optset |= PCRE2_DUPNAMES; + cb->external_flags |= PCRE2_JCHANGED; + break; + + case CHAR_i: *optset |= PCRE2_CASELESS; break; + case CHAR_m: *optset |= PCRE2_MULTILINE; break; + case CHAR_s: *optset |= PCRE2_DOTALL; break; + case CHAR_x: *optset |= PCRE2_EXTENDED; break; + case CHAR_U: *optset |= PCRE2_UNGREEDY; break; + + default: + errorcode = ERR11; + ptr--; /* Correct the offset */ + goto FAILED; + } + } + options = (options | set) & (~unset); + + /* If the options ended with ')' this is not the start of a nested + group with option changes, so the options change at this level. + In this case, if the previous level set up a nest block, discard the + one we have just created. Otherwise adjust it for the previous level. + If the options ended with ':' we are starting a non-capturing group, + possibly with an options setting. */ + + if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + if (*ptr++ == CHAR_RIGHT_PARENTHESIS) + { + nest_depth--; /* This is not a nested group after all. */ + if (top_nest > (nest_save *)(cb->start_workspace) && + (top_nest-1)->nest_depth == nest_depth) top_nest--; + else top_nest->nest_depth = nest_depth; + } + else *parsed_pattern++ = META_NOCAPTURE; + + /* If nothing changed, no need to record. 
*/ + + if (set != 0 || unset != 0) + { + *parsed_pattern++ = META_OPTIONS; + *parsed_pattern++ = options; + } + } /* End options processing */ + break; /* End default case after (? */ + + + /* ---- Python syntax support ---- */ + + case CHAR_P: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* (?P is the same as (? , which defines a named group. */ + + if (*ptr == CHAR_LESS_THAN_SIGN) + { + terminator = CHAR_GREATER_THAN_SIGN; + goto DEFINE_NAME; + } + + /* (?P>name) is the same as (?&name), which is a recursion or subroutine + call. */ + + if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME; + + /* (?P=name) is the same as \k , a back reference by name. Anything + else after (?P is an error. */ + + if (*ptr != CHAR_EQUALS_SIGN) + { + errorcode = ERR41; + goto FAILED; + } + if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, + &namelen, &errorcode, cb)) goto FAILED; + *parsed_pattern++ = META_BACKREF_BYNAME; + *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; /* End of (?P processing */ + + + /* ---- Recursion/subroutine calls by number ---- */ + + case CHAR_R: + i = 0; /* (?R) == (?R0) */ + ptr++; + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR58; + goto FAILED; + } + goto SET_RECURSION; + + /* An item starting (?- followed by a digit comes here via the "default" + case because (?- followed by a non-digit is an options setting. */ + + case CHAR_PLUS: + if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1])) + { + errorcode = ERR29; /* Missing number */ + goto FAILED; + } + /* Fall through */ + + case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: + case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: + RECURSION_BYNUMBER: + if (!read_number(&ptr, ptrend, + (IS_DIGIT(*ptr))? 
-1:(int)(cb->bracount), /* + and - are relative */ + MAX_GROUP_NUMBER, ERR61, + &i, &errorcode)) goto FAILED; + if (i < 0) /* NB (?0) is permitted */ + { + errorcode = ERR15; /* Unknown group */ + goto FAILED_BACK; + } + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + goto UNCLOSED_PARENTHESIS; + + SET_RECURSION: + *parsed_pattern++ = META_RECURSE | (uint32_t)i; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern); + ptr++; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; /* End of recursive call by number handling */ + + + /* ---- Recursion/subroutine calls by name ---- */ + + case CHAR_AMPERSAND: + RECURSE_BY_NAME: + if (!read_name(&ptr, ptrend, CHAR_RIGHT_PARENTHESIS, &offset, &name, + &namelen, &errorcode, cb)) goto FAILED; + *parsed_pattern++ = META_RECURSE_BYNAME; + *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + okquantifier = TRUE; + break; + + /* ---- Callout with numerical or string argument ---- */ + + case CHAR_C: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* If the previous item was a condition starting (?(? an assertion, + optionally preceded by a callout, is expected. This is checked later on, + during actual compilation. However we need to identify this kind of + assertion in this pass because it must not be qualified. The value of + expect_cond_assert is set to 2 after (?(? is processed. We decrement it + for a callout - still leaving a positive value that identifies the + assertion. Multiple callouts or any other items will make it zero or + less, which doesn't matter because they will cause an error later. */ + + expect_cond_assert = prev_expect_cond_assert - 1; + + /* If previous_callout is not NULL, it means this follows a previous + callout. If it was a manual callout, do nothing; this means its "length + of next pattern item" field will remain zero. If it was an automatic + callout, abolish it. 
*/ + + if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 && + previous_callout == parsed_pattern - 4 && + parsed_pattern[-1] == 255) + parsed_pattern = previous_callout; + + /* Save for updating next pattern item length, and skip one item before + completing. */ + + previous_callout = parsed_pattern; + after_manual_callout = 1; + + /* Handle a string argument; specific delimiter is required. */ + + if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) + { + PCRE2_SIZE calloutlength; + PCRE2_SPTR startptr = ptr; + + delimiter = 0; + for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) + { + if (*ptr == PRIV(callout_start_delims)[i]) + { + delimiter = PRIV(callout_end_delims)[i]; + break; + } + } + if (delimiter == 0) + { + errorcode = ERR82; + goto FAILED; + } + + *parsed_pattern = META_CALLOUT_STRING; + parsed_pattern += 3; /* Skip pattern info */ + + for (;;) + { + if (++ptr >= ptrend) + { + errorcode = ERR81; + ptr = startptr; /* To give a more useful message */ + goto FAILED; + } + if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter)) + break; + } + + calloutlength = (PCRE2_SIZE)(ptr - startptr); + if (calloutlength > UINT32_MAX) + { + errorcode = ERR72; + goto FAILED; + } + *parsed_pattern++ = (uint32_t)calloutlength; + offset = (PCRE2_SIZE)(startptr - cb->start_pattern); + PUTOFFSET(offset, parsed_pattern); + } + + /* Handle a callout with an optional numerical argument, which must be + less than or equal to 255. A missing argument gives 0. 
*/ + + else + { + int n = 0; + *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */ + parsed_pattern += 3; /* Skip pattern info */ + while (ptr < ptrend && IS_DIGIT(*ptr)) + { + n = n * 10 + *ptr++ - CHAR_0; + if (n > 255) + { + errorcode = ERR38; + goto FAILED; + } + } + *parsed_pattern++ = n; + } + + /* Both formats must have a closing parenthesis */ + + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR39; + goto FAILED; + } + ptr++; + + /* Remember the offset to the next item in the pattern, and set a default + length. This should get updated after the next item is read. */ + + previous_callout[1] = ptr - cb->start_pattern; + previous_callout[2] = 0; + break; /* End callout */ + + + /* ---- Conditional group ---- */ + + /* A condition can be an assertion, a number (referring to a numbered + group's having been set), a name (referring to a named group), or 'R', + referring to overall recursion. R and R&name are also permitted + for recursion state tests. Numbers may be preceded by + or - to specify a + relative group number. + + There are several syntaxes for testing a named group: (?(name)) is used + by Python; Perl 5.10 onwards uses (?( ) or (?('name')). + + There are two unfortunate ambiguities. 'R' can be the recursive thing or + the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be + the Perl DEFINE feature or the Python named test. We look for a name + first; if not found, we try the other case. + + For compatibility with auto-callouts, we allow a callout to be specified + before a condition that is an assertion. */ + + case CHAR_LEFT_PARENTHESIS: + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + nest_depth++; + + /* If the next character is ? there must be an assertion next (optionally + preceded by a callout). We do not check this here, but instead we set + expect_cond_assert to 2. 
If this is still greater than zero (callouts + decrement it) when the next assertion is read, it will be marked as a + condition that must not be repeated. A value greater than zero also + causes checking that an assertion (possibly with callout) follows. */ + + if (*ptr == CHAR_QUESTION_MARK) + { + *parsed_pattern++ = META_COND_ASSERT; + ptr--; /* Pull pointer back to the opening parenthesis. */ + expect_cond_assert = 2; + break; /* End of conditional */ + } + + /* Handle (?([+-]number)... */ + + if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i, + &errorcode)) + { + if (i <= 0) + { + errorcode = ERR15; + goto FAILED; + } + *parsed_pattern++ = META_COND_NUMBER; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); + PUTOFFSET(offset, parsed_pattern); + *parsed_pattern++ = i; + } + else if (errorcode != 0) goto FAILED; /* Number too big */ + + /* No number found. Handle the special case (?(VERSION[>]=n.m)... */ + + else if (ptrend - ptr >= 10 && + PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && + ptr[7] != CHAR_RIGHT_PARENTHESIS) + { + uint32_t ge = 0; + int major = 0; + int minor = 0; + + ptr += 7; + if (*ptr == CHAR_GREATER_THAN_SIGN) + { + ge = 1; + ptr++; + } + + /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT + references its argument twice. 
*/ + + if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) + goto BAD_VERSION_CONDITION; + + if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode)) + goto FAILED; + + if (ptr >= ptrend) goto BAD_VERSION_CONDITION; + if (*ptr == CHAR_DOT) + { + if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION; + if (!read_number(&ptr, ptrend, -1, 99 , ERR79, &minor, &errorcode)) + goto FAILED; + if (minor < 10) minor *= 10; + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + goto BAD_VERSION_CONDITION; + } + + *parsed_pattern++ = META_COND_VERSION; + *parsed_pattern++ = ge; + *parsed_pattern++ = major; + *parsed_pattern++ = minor; + } + + /* All the remaining cases now require us to read a name. We cannot at + this stage distinguish ambiguous cases such as (?(R12) which might be a + recursion test by number or a name, because the named groups have not yet + all been identified. Those cases are treated as names, but given a + different META code. */ + + else + { + BOOL was_r_ampersand = FALSE; + + if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND) + { + terminator = CHAR_RIGHT_PARENTHESIS; + was_r_ampersand = TRUE; + ptr++; + } + else if (*ptr == CHAR_LESS_THAN_SIGN) + terminator = CHAR_GREATER_THAN_SIGN; + else if (*ptr == CHAR_APOSTROPHE) + terminator = CHAR_APOSTROPHE; + else + { + terminator = CHAR_RIGHT_PARENTHESIS; + ptr--; /* Point to char before name */ + } + if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + + /* Handle (?(R&name) */ + + if (was_r_ampersand) + { + *parsed_pattern = META_COND_RNAME; + ptr--; /* Back to closing parens */ + } + + /* Handle (?(name). If the name is "DEFINE" we identify it with a + special code. Likewise if the name consists of R followed only by + digits. Otherwise, handle it like a quoted name. 
*/ + + else if (terminator == CHAR_RIGHT_PARENTHESIS) + { + if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) + *parsed_pattern = META_COND_DEFINE; + else + { + for (i = 1; i < (int)namelen; i++) + if (!IS_DIGIT(name[i])) break; + *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)? + META_COND_RNUMBER : META_COND_NAME; + } + ptr--; /* Back to closing parens */ + } + + /* Handle (?('name') or (?(<name>) */ + + else *parsed_pattern = META_COND_NAME; + + /* All these cases except DEFINE end with the name length and offset; + DEFINE just has an offset (for the "too many branches" error). */ + + if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + } /* End cases that read a name */ + + /* Check the closing parenthesis of the condition */ + + if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR24; + goto FAILED; + } + ptr++; + break; /* End of condition processing */ + + + /* ---- Atomic group ---- */ + + case CHAR_GREATER_THAN_SIGN: + *parsed_pattern++ = META_ATOMIC; + nest_depth++; + ptr++; + break; + + + /* ---- Lookahead assertions ---- */ + + case CHAR_EQUALS_SIGN: + *parsed_pattern++ = META_LOOKAHEAD; + ptr++; + goto POST_ASSERTION; + + case CHAR_EXCLAMATION_MARK: + *parsed_pattern++ = META_LOOKAHEADNOT; + ptr++; + goto POST_ASSERTION; + + + /* ---- Lookbehind assertions ---- */ + + /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the + start of the name of a capturing group. */ + + case CHAR_LESS_THAN_SIGN: + if (ptrend - ptr <= 1 || + (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) + { + terminator = CHAR_GREATER_THAN_SIGN; + goto DEFINE_NAME; + } + *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? 
+ META_LOOKBEHIND : META_LOOKBEHINDNOT; + *has_lookbehind = TRUE; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); + PUTOFFSET(offset, parsed_pattern); + ptr += 2; + /* Fall through */ + + /* If the previous item was a condition starting (?(? an assertion, + optionally preceded by a callout, is expected. This is checked later on, + during actual compilation. However we need to identify this kind of + assertion in this pass because it must not be qualified. The value of + expect_cond_assert is set to 2 after (?(? is processed. We decrement it + for a callout - still leaving a positive value that identifies the + assertion. Multiple callouts or any other items will make it zero or + less, which doesn't matter because they will cause an error later. */ + + POST_ASSERTION: + nest_depth++; + if (prev_expect_cond_assert > 0) + { + if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); + else if (++top_nest >= end_nests) + { + errorcode = ERR84; + goto FAILED; + } + top_nest->nest_depth = nest_depth; + top_nest->flags = NSF_CONDASSERT; + if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED; + if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES; + } + break; + + + /* ---- Define a named group ---- */ + + /* A named group may be defined as (?'name') or (?<name>). In the latter + case we jump to DEFINE_NAME from the disambiguation of (?< above with the + terminator set to '>'. */ + + case CHAR_APOSTROPHE: + terminator = CHAR_APOSTROPHE; /* Terminator */ + + DEFINE_NAME: + if (!read_name(&ptr, ptrend, terminator, &offset, &name, &namelen, + &errorcode, cb)) goto FAILED; + + /* We have a name for this capturing group. It is also assigned a number, + which is its primary means of identification. 
*/ + + cb->bracount++; + *parsed_pattern++ = META_CAPTURE | cb->bracount; + nest_depth++; + + /* Check not too many names */ + + if (cb->names_found >= MAX_NAME_COUNT) + { + errorcode = ERR49; + goto FAILED; + } + + /* Adjust the entry size to accommodate the longest name found. */ + + if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) + cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1); + + /* Scan the list to check for duplicates. For duplicate names, if the + number is the same, break the loop, which causes the name to be + discarded; otherwise, if DUPNAMES is not set, give an error. + If it is set, allow the name with a different number, but continue + scanning in case this is a duplicate with the same number. For + non-duplicate names, give an error if the number is duplicated. */ + + isdupname = FALSE; + ng = cb->named_groups; + for (i = 0; i < cb->names_found; i++, ng++) + { + if (namelen == ng->length && + PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0) + { + if (ng->number == cb->bracount) break; + if ((options & PCRE2_DUPNAMES) == 0) + { + errorcode = ERR43; + goto FAILED; + } + isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ + cb->dupnames = TRUE; /* Duplicate names exist */ + } + else if (ng->number == cb->bracount) + { + errorcode = ERR65; + goto FAILED; + } + } + + if (i < cb->names_found) break; /* Ignore duplicate with same number */ + + /* Increase the list size if necessary */ + + if (cb->names_found >= cb->named_group_list_size) + { + uint32_t newsize = cb->named_group_list_size * 2; + named_group *newspace = + cb->cx->memctl.malloc(newsize * sizeof(named_group), + cb->cx->memctl.memory_data); + if (newspace == NULL) + { + errorcode = ERR21; + goto FAILED; + } + + memcpy(newspace, cb->named_groups, + cb->named_group_list_size * sizeof(named_group)); + if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) + cb->cx->memctl.free((void *)cb->named_groups, + cb->cx->memctl.memory_data); + cb->named_groups = newspace; + 
cb->named_group_list_size = newsize; + } + + /* Add this name to the list */ + + cb->named_groups[cb->names_found].name = name; + cb->named_groups[cb->names_found].length = (uint16_t)namelen; + cb->named_groups[cb->names_found].number = cb->bracount; + cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname; + cb->names_found++; + break; + } /* End of (? switch */ + break; /* End of ( handling */ + + + /* ---- Branch terminators ---- */ + + /* Alternation: reset the capture count if we are in a (?| group. */ + + case CHAR_VERTICAL_LINE: + if (top_nest != NULL && top_nest->nest_depth == nest_depth && + (top_nest->flags & NSF_RESET) != 0) + { + if (cb->bracount > top_nest->max_group) + top_nest->max_group = (uint16_t)cb->bracount; + cb->bracount = top_nest->reset_group; + } + *parsed_pattern++ = META_ALT; + break; + + /* End of group; reset the capture count to the maximum if we are in a (?| + group and/or reset the extended and dupnames options. Disallow quantifier + for a condition that is an assertion. */ + + case CHAR_RIGHT_PARENTHESIS: + okquantifier = TRUE; + if (top_nest != NULL && top_nest->nest_depth == nest_depth) + { + if ((top_nest->flags & NSF_RESET) != 0 && + top_nest->max_group > cb->bracount) + cb->bracount = top_nest->max_group; + if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED; + else options &= ~PCRE2_EXTENDED; + if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES; + else options &= ~PCRE2_DUPNAMES; + if ((top_nest->flags & NSF_CONDASSERT) != 0) + okquantifier = FALSE; + if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; + else top_nest--; + } + if (nest_depth == 0) /* Unmatched closing parenthesis */ + { + errorcode = ERR22; + goto FAILED_BACK; + } + nest_depth--; + *parsed_pattern++ = META_KET; + break; + } /* End of switch on pattern character */ + } /* End of main character scan loop */ + +/* End of pattern reached. Check for missing ) at the end of a verb name. 
*/ + +if (inverbname && ptr >= ptrend) + { + errorcode = ERR60; + goto FAILED; } -/* Control never gets here */ + +/* Manage callout for the final item */ + +parsed_pattern = manage_callouts(ptr, &previous_callout, options, + parsed_pattern, cb); + +/* Terminate the parsed pattern, then return success if all groups are closed. +Otherwise we have unclosed parentheses. */ + +if (parsed_pattern >= parsed_pattern_end) + { + errorcode = ERR63; /* Internal error (parsed pattern overflow) */ + goto FAILED; + } +*parsed_pattern = META_END; +if (nest_depth == 0) return 0; + +UNCLOSED_PARENTHESIS: +errorcode = ERR14; + +/* Come here for all failures. */ + +FAILED: +cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern); +return errorcode; + +/* Some errors need to indicate the previous character. */ + +FAILED_BACK: +ptr--; +goto FAILED; + +/* This failure happens several times. */ + +BAD_VERSION_CONDITION: +errorcode = ERR79; +goto FAILED; } @@ -1210,1294 +4173,3565 @@ for (;;) +#ifdef SUPPORT_UNICODE /************************************************* -* Scan compiled branch for non-emptiness * +* Get othercase range * *************************************************/ -/* This function scans through a branch of a compiled pattern to see whether it -can match the empty string. It is called from could_be_empty() below and from -compile_branch() when checking for an unlimited repeat of a group that can -match nothing. Note that first_significant_code() skips over backward and -negative forward assertions when its final argument is TRUE. If we hit an -unclosed bracket, we return "empty" - this means we've struck an inner bracket -whose current branch will already have been scanned. +/* This function is passed the start and end of a class range in UCP mode. It +searches up the characters, looking for ranges of characters in the "other" +case. Each call returns the next one, updating the start address. 
A character +with multiple other cases is returned on its own with a special return value. Arguments: - code points to start of search - endcode points to where to stop - utf TRUE if in UTF mode - cb compile data - recurses chain of recurse_check to catch mutual recursion - -Returns: TRUE if what is matched could be empty -*/ - -static BOOL -could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, - compile_block *cb, recurse_check *recurses) -{ -register PCRE2_UCHAR c; -recurse_check this_recurse; - -for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); - code < endcode; - code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) - { - PCRE2_SPTR ccode; - - c = *code; - - /* Skip over forward assertions; the other assertions are skipped by - first_significant_code() with a TRUE final argument. */ - - if (c == OP_ASSERT) - { - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - continue; - } - - /* For a recursion/subroutine call, if its end has been reached, which - implies a backward reference subroutine call, we can scan it. If it's a - forward reference subroutine call, we can't. To detect forward reference - we have to scan up the list that is kept in the workspace. This function is - called only when doing the real compile, not during the pre-compile that - measures the size of the compiled pattern. */ - - if (c == OP_RECURSE) - { - PCRE2_SPTR scode = cb->start_code + GET(code, 1); - PCRE2_SPTR endgroup = scode; - BOOL empty_branch; - - /* Test for forward reference or uncompleted reference. This is disabled - when called to scan a completed pattern by setting cb->start_workspace to - NULL. 
*/ - - if (cb->start_workspace != NULL) - { - PCRE2_SPTR tcode; - for (tcode = cb->start_workspace; tcode < cb->hwm; tcode += LINK_SIZE) - if ((int)GET(tcode, 0) == (int)(code + 1 - cb->start_code)) return TRUE; - if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ - } - - /* If the reference is to a completed group, we need to detect whether this - is a recursive call, as otherwise there will be an infinite loop. If it is - a recursion, just skip over it. Simple recursions are easily detected. For - mutual recursions we keep a chain on the stack. */ - - do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); - if (code >= scode && code <= endgroup) continue; /* Simple recursion */ - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) - if (r->group == scode) break; - if (r != NULL) continue; /* Mutual recursion */ - } - - /* Completed reference; scan the referenced group, remembering it on the - stack chain to detect mutual recursions. */ - - empty_branch = FALSE; - this_recurse.prev = recurses; - this_recurse.group = scode; - - do - { - if (could_be_empty_branch(scode, endcode, utf, cb, &this_recurse)) - { - empty_branch = TRUE; - break; - } - scode += GET(scode, 1); - } - while (*scode == OP_ALT); - - if (!empty_branch) return FALSE; /* All branches are non-empty */ - continue; - } - - /* Groups with zero repeats can of course be empty; skip them. */ - - if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO || - c == OP_BRAPOSZERO) - { - code += PRIV(OP_lengths)[c]; - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - continue; - } - - /* A nested group that is already marked as "could be empty" can just be - skipped. */ - - if (c == OP_SBRA || c == OP_SBRAPOS || - c == OP_SCBRA || c == OP_SCBRAPOS) - { - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - continue; - } - - /* For other groups, scan the branches. 
*/ - - if (c == OP_BRA || c == OP_BRAPOS || - c == OP_CBRA || c == OP_CBRAPOS || - c == OP_ONCE || c == OP_ONCE_NC || - c == OP_COND || c == OP_SCOND) - { - BOOL empty_branch; - if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ - - /* If a conditional group has only one branch, there is a second, implied, - empty branch, so just skip over the conditional, because it could be empty. - Otherwise, scan the individual branches of the group. */ - - if (c == OP_COND && code[GET(code, 1)] != OP_ALT) - code += GET(code, 1); - else - { - empty_branch = FALSE; - do - { - if (!empty_branch && could_be_empty_branch(code, endcode, utf, cb, - recurses)) empty_branch = TRUE; - code += GET(code, 1); - } - while (*code == OP_ALT); - if (!empty_branch) return FALSE; /* All branches are non-empty */ - } - - c = *code; - continue; - } - - /* Handle the other opcodes */ - - switch (c) - { - /* Check for quantifiers after a class. XCLASS is used for classes that - cannot be represented just by a bit map. This includes negated single - high-valued characters. The length in PRIV(OP_lengths)[] is zero; the - actual length is stored in the compiled code, so we must update "code" - here. 
*/ - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - case OP_XCLASS: - ccode = code += GET(code, 1); - goto CHECK_CLASS_REPEAT; -#endif - - case OP_CLASS: - case OP_NCLASS: - ccode = code + PRIV(OP_lengths)[OP_CLASS]; - -#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 - CHECK_CLASS_REPEAT: -#endif - - switch (*ccode) - { - case OP_CRSTAR: /* These could be empty; continue */ - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - case OP_CRPOSSTAR: - case OP_CRPOSQUERY: - break; - - default: /* Non-repeat => class must match */ - case OP_CRPLUS: /* These repeats aren't empty */ - case OP_CRMINPLUS: - case OP_CRPOSPLUS: - return FALSE; - - case OP_CRRANGE: - case OP_CRMINRANGE: - case OP_CRPOSRANGE: - if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ - break; - } - break; - - /* Opcodes that must match a character */ - - case OP_ANY: - case OP_ALLANY: - case OP_ANYBYTE: - - case OP_PROP: - case OP_NOTPROP: - case OP_ANYNL: - - case OP_NOT_HSPACE: - case OP_HSPACE: - case OP_NOT_VSPACE: - case OP_VSPACE: - case OP_EXTUNI: - - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - - case OP_PLUS: - case OP_PLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - case OP_TYPEEXACT: - - return FALSE; - - /* These are going to continue, as they may be empty, but we have to - fudge the length for the \p and \P cases. 
*/ - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPOSSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; - - /* Same for these */ - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; - break; - - /* End of branch */ - - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_KETRPOS: - case OP_ALT: - return TRUE; - - /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, - POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative - versions may be followed by a multibyte character. */ - -#ifdef MAYBE_UTF_MULTI - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - - case OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); - break; - - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); - break; -#endif /* MAYBE_UTF_MULTI */ - - /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument - string. */ - - case OP_MARK: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; - break; - - /* None of the remaining opcodes are required to match a character. 
*/ - - default: - break; - } - } - -return TRUE; -} - - - -/************************************************* -* Scan compiled regex for non-emptiness * -*************************************************/ - -/* This function is called to check for left recursive calls. We want to check -the current branch of the current pattern to see if it could match the empty -string. If it could, we must look outwards for branches at other levels, -stopping when we pass beyond the bracket which is the subject of the recursion. -This function is called only during the real compile, not during the -pre-compile. - -Arguments: - code points to start of the recursion - endcode points to where to stop (current RECURSE item) - bcptr points to the chain of current (unclosed) branch starts - utf TRUE if in UTF mode - cb compile data - -Returns: TRUE if what is matched could be empty -*/ - -static BOOL -could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr, - BOOL utf, compile_block *cb) -{ -while (bcptr != NULL && bcptr->current_branch >= code) - { - if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cb, NULL)) - return FALSE; - bcptr = bcptr->outer; - } -return TRUE; -} - - - -/************************************************* -* Expand the workspace * -*************************************************/ - -/* This function is called during the second compiling phase, if the number of -forward references fills the existing workspace, which is originally a block on -the stack. A larger block is obtained from the heap unless the ultimate limit -has been reached or the increase will be rather small. 
- -Argument: pointer to the compile data block -Returns: 0 if all went well, else an error number + cptr points to starting character value; updated + d end value + ocptr where to put start of othercase range + odptr where to put end of othercase range + +Yield: -1 when no more + 0 when a range is returned + >0 the CASESET offset for char with multiple other cases + in this case, ocptr contains the original */ static int -expand_workspace(compile_block *cb) +get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, + uint32_t *odptr) { -PCRE2_UCHAR *newspace; -int newsize = cb->workspace_size * 2; -if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX; -if (cb->workspace_size >= COMPILE_WORK_SIZE_MAX || - newsize - cb->workspace_size < WORK_SIZE_SAFETY_MARGIN) - return ERR72; -newspace = cb->cx->memctl.malloc(CU2BYTES(newsize), cb->cx->memctl.memory_data); -if (newspace == NULL) return ERR21; -memcpy(newspace, cb->start_workspace, cb->workspace_size * sizeof(PCRE2_UCHAR)); -cb->hwm = (PCRE2_UCHAR *)newspace + (cb->hwm - cb->start_workspace); -if (cb->workspace_size > COMPILE_WORK_SIZE) - cb->cx->memctl.free((void *)cb->start_workspace, cb->cx->memctl.memory_data); -cb->start_workspace = newspace; -cb->workspace_size = newsize; +uint32_t c, othercase, next; +unsigned int co; + +/* Find the first character that has an other case. If it has multiple other +cases, return its case offset value. */ + +for (c = *cptr; c <= d; c++) + { + if ((co = UCD_CASESET(c)) != 0) + { + *ocptr = c++; /* Character that has the set */ + *cptr = c; /* Rest of input range */ + return (int)co; + } + if ((othercase = UCD_OTHERCASE(c)) != c) break; + } + +if (c > d) return -1; /* Reached end of range */ + +/* Found a character that has a single other case. Search for the end of the +range, which is either the end of the input range, or a character that has zero +or more than one other cases. 
*/ + +*ocptr = othercase; +next = othercase + 1; + +for (++c; c <= d; c++) + { + if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; + next++; + } + +*odptr = next - 1; /* End of othercase range */ +*cptr = c; /* Rest of input range */ return 0; } +#endif /* SUPPORT_UNICODE */ /************************************************* -* Check for counted repeat * +* Add a character or range to a class (internal) * *************************************************/ -/* This function is called when a '{' is encountered in a place where it might -start a quantifier. It looks ahead to see if it really is a quantifier, that -is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits. - -Argument: pointer to the first char after '{' -Returns: TRUE or FALSE -*/ - -static BOOL -is_counted_repeat(PCRE2_SPTR p) -{ -if (!IS_DIGIT(*p)) return FALSE; -p++; -while (IS_DIGIT(*p)) p++; -if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; - -if (*p++ != CHAR_COMMA) return FALSE; -if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; - -if (!IS_DIGIT(*p)) return FALSE; -p++; -while (IS_DIGIT(*p)) p++; - -return (*p == CHAR_RIGHT_CURLY_BRACKET); -} - - - -/************************************************* -* Handle escapes * -*************************************************/ - -/* This function is called when a \ has been encountered. It either returns a -positive value for a simple escape such as \d, or 0 for a data character, which -is placed in chptr. A backreference to group n is returned as negative n. On -entry, ptr is pointing at the \. On exit, it points the final code unit of the -escape sequence. +/* This function packages up the logic of adding a character or range of +characters to a class. The character values in the arguments will be within the +valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is +called only from within the "add to class" group of functions, some of which +are recursive and mutually recursive. 
The external entry point is +add_to_class(). Arguments: - ptrptr points to the pattern position pointer - chptr points to a returned data character - errorcodeptr points to the errorcode variable (containing zero) - options the current options bits - isclass TRUE if inside a character class - cb compile data block + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cb compile data + start start of range character + end end of range character -Returns: zero => a data character - positive => a special escape sequence - negative => a back reference - on error, errorcodeptr is set non-zero +Returns: the number of < 256 characters added + the pointer to extra data is updated */ -static int -check_escape(PCRE2_SPTR *ptrptr, uint32_t *chptr, int *errorcodeptr, - uint32_t options, BOOL isclass, compile_block *cb) +static unsigned int +add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, compile_block *cb, uint32_t start, uint32_t end) { -BOOL utf = (options & PCRE2_UTF) != 0; -PCRE2_SPTR ptr = *ptrptr + 1; -register uint32_t c, cc; -int escape = 0; -int i; +uint32_t c; +uint32_t classbits_end = (end <= 0xff ? end : 0xff); +unsigned int n8 = 0; -GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ -ptr--; /* Set pointer back to the last code unit */ +/* If caseless matching is required, scan the range and process alternate +cases. In Unicode, there are 8-bit characters that have alternate cases that +are greater than 255 and vice-versa. Sometimes we can just extend the original +range. */ -/* If backslash is at the end of the pattern, it's an error. */ - -if (c == CHAR_NULL && ptr >= cb->end_pattern) *errorcodeptr = ERR1; - -/* Non-alphanumerics are literals, so we just leave the value in c. An initial -value test saves a memory lookup for code points outside the alphanumeric -range. Otherwise, do a table lookup. 
A non-zero result is something that can be -returned immediately. Otherwise further processing is required. */ - -else if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ - -else if ((i = escapes[c - ESCAPES_FIRST]) != 0) +if ((options & PCRE2_CASELESS) != 0) { - if (i > 0) c = (uint32_t)i; else /* Positive is a data character */ +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UTF) != 0) { - escape = -i; /* Else return a special escape */ - if (escape == ESC_P || escape == ESC_p || escape == ESC_X) - cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */ + int rc; + uint32_t oc, od; + + options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ + c = start; + + while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) + { + /* Handle a single character that has more than one other case. */ + + if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb, + PRIV(ucd_caseless_sets) + rc, oc); + + /* Do nothing if the other case range is within the original range. */ + + else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue; + + /* Extend the original range if there is overlap, noting that if oc < c, we + can't have od > end because a subrange is always shorter than the basic + range. Otherwise, use a recursive call to add the additional range. */ + + else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ + else if (od > end && oc <= end + 1) + { + end = od; /* Extend upwards */ + if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); + } + else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od); + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF mode */ + + for (c = start; c <= classbits_end; c++) + { + SETBIT(classbits, cb->fcc[c]); + n8++; } } -/* Escapes that need further processing, including those that are unknown. */ +/* Now handle the originally supplied range. 
Adjust the final value according +to the bit length - this means that the same lists of (e.g.) horizontal spaces +can be used in all cases. */ -else +if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) + end = MAX_NON_UTF_CHAR; + +if (start > cb->class_range_start && end < cb->class_range_end) return n8; + +/* Use the bitmap for characters < 256. Otherwise use extra data.*/ + +for (c = start; c <= classbits_end; c++) { - PCRE2_SPTR oldptr; - BOOL braced, negated, overflow; - unsigned int s; - - switch (c) - { - /* A number of Perl escapes are not handled by PCRE. We give an explicit - error. */ - - case CHAR_l: - case CHAR_L: - *errorcodeptr = ERR37; - break; - - /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated - specially, \u must be followed by four hex digits. Otherwise it is a - lowercase u letter. */ - - case CHAR_u: - if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else - { - uint32_t xc; - if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; - if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; - ptr += 4; - if (utf) - { - if (c > 0x10ffffU) *errorcodeptr = ERR77; - else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; - } - else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; - } - break; - - case CHAR_U: - /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an - upper case letter. */ - if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; - break; - - /* In a character class, \g is just a literal "g". Outside a character - class, \g must be followed by one of a number of specific things: - - (1) A number, either plain or braced. If positive, it is an absolute - backreference. If negative, it is a relative backreference. This is a Perl - 5.10 feature. 
- - (2) Perl 5.10 also supports \g{name} as a reference to a named group. This - is part of Perl's movement towards a unified syntax for back references. As - this is synonymous with \k{name}, we fudge it up by pretending it really - was \k. - - (3) For Oniguruma compatibility we also support \g followed by a name or a - number either in angle brackets or in single quotes. However, these are - (possibly recursive) subroutine calls, _not_ backreferences. Just return - the ESC_g code (cf \k). */ - - case CHAR_g: - if (isclass) break; - if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) - { - escape = ESC_g; - break; - } - - /* Handle the Perl-compatible cases */ - - if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) - { - PCRE2_SPTR p; - for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++) - if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; - if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) - { - escape = ESC_k; - break; - } - braced = TRUE; - ptr++; - } - else braced = FALSE; - - if (ptr[1] == CHAR_MINUS) - { - negated = TRUE; - ptr++; - } - else negated = FALSE; - - /* The integer range is limited by the machine's int representation. */ - s = 0; - overflow = FALSE; - while (IS_DIGIT(ptr[1])) - { - if (s > INT_MAX / 10 - 1) /* Integer overflow */ - { - overflow = TRUE; - break; - } - s = s * 10 + (int)(*(++ptr) - CHAR_0); - } - if (overflow) /* Integer overflow */ - { - while (IS_DIGIT(ptr[1])) ptr++; - *errorcodeptr = ERR61; - break; - } - - if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) - { - *errorcodeptr = ERR57; - break; - } - - if (s == 0) - { - *errorcodeptr = ERR58; - break; - } - - if (negated) - { - if (s > cb->bracount) - { - *errorcodeptr = ERR15; - break; - } - s = cb->bracount - (s - 1); - } - - escape = -s; - break; - - /* The handling of escape sequences consisting of a string of digits - starting with one that is not zero is not straightforward. Perl has changed - over the years. 
Nowadays \g{} for backreferences and \o{} for octal are - recommended to avoid the ambiguities in the old syntax. - - Outside a character class, the digits are read as a decimal number. If the - number is less than 10, or if there are that many previous extracting left - brackets, it is a back reference. Otherwise, up to three octal digits are - read to form an escaped character code. Thus \123 is likely to be octal 123 - (cf \0123, which is octal 012 followed by the literal 3). If the octal - value is greater than 377, the least significant 8 bits are taken. - - Inside a character class, \ followed by a digit is always either a literal - 8 or 9 or an octal number. */ - - case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: - case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: - - if (!isclass) - { - oldptr = ptr; - /* The integer range is limited by the machine's int representation. */ - s = (int)(c - CHAR_0); - overflow = FALSE; - while (IS_DIGIT(ptr[1])) - { - if (s > INT_MAX / 10 - 1) /* Integer overflow */ - { - overflow = TRUE; - break; - } - s = s * 10 + (int)(*(++ptr) - CHAR_0); - } - if (overflow) /* Integer overflow */ - { - while (IS_DIGIT(ptr[1])) ptr++; - *errorcodeptr = ERR61; - break; - } - - /* \1 to \9 are always back references. \8x and \9x are too, unless there - are an awful lot of previous captures; \1x to \7x are octal escapes if - there are not that many previous captures. */ - - if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount) - { - escape = -s; /* Indicates a back reference */ - break; - } - ptr = oldptr; /* Put the pointer back and fall through */ - } - - /* Handle a digit following \ when the number is not a back reference, or - we are within a character class. If the first digit is 8 or 9, Perl used to - generate a binary zero byte and then treat the digit as a following - literal. At least by Perl 5.18 this changed so as not to insert the binary - zero. 
*/ - - if ((c = *ptr) >= CHAR_8) break; - - /* Fall through with a digit less than 8 */ - - /* \0 always starts an octal number, but we may drop through to here with a - larger first octal digit. The original code used just to take the least - significant 8 bits of octal numbers (I think this is what early Perls used - to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, - but no more than 3 octal digits. */ - - case CHAR_0: - c -= CHAR_0; - while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) - c = c * 8 + *(++ptr) - CHAR_0; -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (!utf && c > 0xff) *errorcodeptr = ERR51; -#endif - break; - - /* \o is a relatively new Perl feature, supporting a more general way of - specifying character codes in octal. The only supported form is \o{ddd}. */ - - case CHAR_o: - if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else - if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else - { - ptr += 2; - c = 0; - overflow = FALSE; - while (*ptr >= CHAR_0 && *ptr <= CHAR_7) - { - cc = *ptr++; - if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x20000000l) { overflow = TRUE; break; } -#endif - c = (c << 3) + cc - CHAR_0 ; -#if PCRE2_CODE_UNIT_WIDTH == 8 - if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } -#elif PCRE2_CODE_UNIT_WIDTH == 16 - if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } -#elif PCRE2_CODE_UNIT_WIDTH == 32 - if (utf && c > 0x10ffffU) { overflow = TRUE; break; } -#endif - } - if (overflow) - { - while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; - *errorcodeptr = ERR34; - } - else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) - { - if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; - } - else *errorcodeptr = ERR64; - } - break; - - /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by - two hexadecimal digits. Otherwise it is a lowercase x letter. 
*/ - - case CHAR_x: - if ((options & PCRE2_ALT_BSUX) != 0) - { - uint32_t xc; - if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; - ptr += 2; - } /* End PCRE2_ALT_BSUX handling */ - - /* Handle \x in Perl's style. \x{ddd} is a character number which can be - greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex - digits. If not, { used to be treated as a data character. However, Perl - seems to read hex digits up to the first non-such, and ignore the rest, so - that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE - now gives an error. */ - - else - { - if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) - { - ptr += 2; - if (*ptr == CHAR_RIGHT_CURLY_BRACKET) - { - *errorcodeptr = ERR78; - break; - } - c = 0; - overflow = FALSE; - - while ((cc = XDIGIT(*ptr)) != 0xff) - { - ptr++; - if (c == 0 && cc == 0) continue; /* Leading zeroes */ -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (c >= 0x10000000l) { overflow = TRUE; break; } -#endif - c = (c << 4) | cc; - if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR)) - { - overflow = TRUE; - break; - } - } - - if (overflow) - { - while (XDIGIT(*ptr) != 0xff) ptr++; - *errorcodeptr = ERR34; - } - else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) - { - if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; - } - - /* If the sequence of hex digits does not end with '}', give an error. - We used just to recognize this construct and fall through to the normal - \x handling, but nowadays Perl gives an error, which seems much more - sensible, so we do too. 
*/ - - else *errorcodeptr = ERR67; - } /* End of \x{} processing */ - - /* Read a single-byte hex-defined char (up to two hex digits after \x) */ - - else - { - c = 0; - if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - ptr++; - c = cc; - if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ - ptr++; - c = (c << 4) | cc; - } /* End of \xdd handling */ - } /* End of Perl-style \x handling */ - break; - - /* The handling of \c is different in ASCII and EBCDIC environments. In an - ASCII (or Unicode) environment, an error is given if the character - following \c is not a printable ASCII character. Otherwise, the following - character is upper-cased if it is a letter, and after that the 0x40 bit is - flipped. The result is the value of the escape. - - In an EBCDIC environment the handling of \c is compatible with the - specification in the perlebcdic document. The following character must be - a letter or one of small number of special characters. These provide a - means of defining the character values 0-31. - - For testing the EBCDIC handling of \c in an ASCII environment, recognize - the EBCDIC value of 'c' explicitly. */ - -#if defined EBCDIC && 'a' != 0x81 - case 0x83: -#else - case CHAR_c: -#endif - - c = *(++ptr); - if (c >= CHAR_a && c <= CHAR_z) c += ESCAPES_UPPER_CASE; - if (c == CHAR_NULL && ptr >= cb->end_pattern) - { - *errorcodeptr = ERR2; - break; - } - - /* Handle \c in an ASCII/Unicode environment. */ - -#ifndef EBCDIC /* ASCII/UTF-8 coding */ - if (c < 32 || c > 126) /* Excludes all non-printable ASCII */ - { - *errorcodeptr = ERR68; - break; - } - c ^= 0x40; - - /* Handle \c in an EBCDIC environment. The special case \c? is converted to - 255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC - encoding. (This is the way Perl indicates that it handles \c?.) The other - valid sequences correspond to a list of specific characters. */ - -#else - if (c == CHAR_QUESTION_MARK) - c = ('\\' == 188 && '`' == 74)? 
0x5f : 0xff; - else - { - for (i = 0; i < 32; i++) - { - if (c == ebcdic_escape_c[i]) break; - } - if (i < 32) c = i; else *errorcodeptr = ERR68; - } -#endif /* EBCDIC */ - - break; - - /* Any other alphanumeric following \ is an error. Perl gives an error only - if in warning mode, but PCRE doesn't have a warning mode. */ - - default: - *errorcodeptr = ERR3; - break; - } + /* Regardless of start, c will always be <= 255. */ + SETBIT(classbits, c); + n8++; } -/* Perl supports \N{name} for character names, as well as plain \N for "not -newline". PCRE does not support \N{name}. However, it does support -quantification such as \N{2,3}. */ +#ifdef SUPPORT_WIDE_CHARS +if (start <= 0xff) start = 0xff + 1; -if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && - !is_counted_repeat(ptr+2)) - *errorcodeptr = ERR37; +if (end >= start) + { + PCRE2_UCHAR *uchardata = *uchardptr; -/* If PCRE2_UCP is set, we change the values for \d etc. */ +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UTF) != 0) + { + if (start < end) + { + *uchardata++ = XCL_RANGE; + uchardata += PRIV(ord2utf)(start, uchardata); + uchardata += PRIV(ord2utf)(end, uchardata); + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + uchardata += PRIV(ord2utf)(start, uchardata); + } + } + else +#endif /* SUPPORT_UNICODE */ -if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w) - escape += (ESC_DU - ESC_D); + /* Without UTF support, character values are constrained by the bit length, + and can only be > 256 for 16-bit and 32-bit libraries. */ -/* Set the pointer to the final character before returning. 
*/ +#if PCRE2_CODE_UNIT_WIDTH == 8 + {} +#else + if (start < end) + { + *uchardata++ = XCL_RANGE; + *uchardata++ = start; + *uchardata++ = end; + } + else if (start == end) + { + *uchardata++ = XCL_SINGLE; + *uchardata++ = start; + } +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + *uchardptr = uchardata; /* Updata extra data pointer */ + } +#else /* SUPPORT_WIDE_CHARS */ + (void)uchardptr; /* Avoid compiler warning */ +#endif /* SUPPORT_WIDE_CHARS */ -*ptrptr = ptr; -*chptr = c; -return escape; +return n8; /* Number of 8-bit characters */ } #ifdef SUPPORT_UNICODE /************************************************* -* Handle \P and \p * +* Add a list of characters to a class (internal) * *************************************************/ -/* This function is called after \P or \p has been encountered, provided that -PCRE2 is compiled with support for UTF and Unicode properties. On entry, the -contents of ptrptr are pointing at the P or p. On exit, it is left pointing at -the final code unit of the escape sequence. +/* This function is used for adding a list of case-equivalent characters to a +class when in UTF mode. This function is called only from within +add_to_class_internal(), with which it is mutually recursive. Arguments: - ptrptr the pattern position pointer - negptr a boolean that is set TRUE for negation else FALSE - ptypeptr an unsigned int that is set to the type value - pdataptr an unsigned int that is set to the detailed property value - errorcodeptr the error code variable - cb the compile data + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cb contains pointers to tables etc. 
+ p points to row of 32-bit values, terminated by NOTACHAR + except character to omit; this is used when adding lists of + case-equivalent characters to avoid including the one we + already know about -Returns: TRUE if the type value was found, or FALSE for an invalid type +Returns: the number of < 256 characters added + the pointer to extra data is updated */ -static BOOL -get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr, - unsigned int *pdataptr, int *errorcodeptr, compile_block *cb) +static unsigned int +add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) { -register PCRE2_UCHAR c; -int i, bot, top; -PCRE2_SPTR ptr = *ptrptr; -PCRE2_UCHAR name[32]; - -*negptr = FALSE; -c = *(++ptr); - -/* \P or \p can be followed by a name in {}, optionally preceded by ^ for -negation. */ - -if (c == CHAR_LEFT_CURLY_BRACKET) +unsigned int n8 = 0; +while (p[0] < NOTACHAR) { - if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) + unsigned int n = 0; + if (p[0] != except) { - *negptr = TRUE; - ptr++; + while(p[n+1] == p[0] + n + 1) n++; + n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); } - for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++) - { - c = *(++ptr); - if (c == CHAR_NULL) goto ERROR_RETURN; - if (c == CHAR_RIGHT_CURLY_BRACKET) break; - name[i] = c; - } - if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; - name[i] = 0; + p += n + 1; } - -/* Otherwise there is just one following character, which must be an ASCII -letter. */ - -else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) - { - name[0] = c; - name[1] = 0; - } -else goto ERROR_RETURN; - -*ptrptr = ptr; - -/* Search for a recognized property name using binary chop. 
*/ - -bot = 0; -top = PRIV(utt_size); - -while (bot < top) - { - int r; - i = (bot + top) >> 1; - r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); - if (r == 0) - { - *ptypeptr = PRIV(utt)[i].type; - *pdataptr = PRIV(utt)[i].value; - return TRUE; - } - if (r > 0) bot = i + 1; else top = i; - } -*errorcodeptr = ERR47; /* Unrecognized name */ -return FALSE; - -ERROR_RETURN: /* Malformed \P or \p */ -*errorcodeptr = ERR46; -*ptrptr = ptr; -return FALSE; +return n8; } #endif /************************************************* -* Read repeat counts * +* External entry point for add range to class * *************************************************/ -/* Read an item of the form {n,m} and return the values. This is called only -after is_counted_repeat() has confirmed that a repeat-count quantifier exists, -so the syntax is guaranteed to be correct, but we need to check the values. +/* This function sets the overall range so that the internal functions can try +to avoid duplication when handling case-independence. 
Arguments: - p pointer to first char after '{' - minp pointer to int for min - maxp pointer to int for max - returned as -1 if no max - errorcodeptr points to error code variable + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cb compile data + start start of range character + end end of range character -Returns: pointer to '}' on success; - current ptr on error, with errorcodeptr set non-zero +Returns: the number of < 256 characters added + the pointer to extra data is updated */ -static PCRE2_SPTR -read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr) +static unsigned int +add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, + compile_block *cb, uint32_t start, uint32_t end) { -int min = 0; -int max = -1; +cb->class_range_start = start; +cb->class_range_end = end; +return add_to_class_internal(classbits, uchardptr, options, cb, start, end); +} -while (IS_DIGIT(*p)) + +/************************************************* +* External entry point for add list to class * +*************************************************/ + +/* This function is used for adding a list of horizontal or vertical whitespace +characters to a class. The list must be in order so that ranges of characters +can be detected and handled appropriately. This function sets the overall range +so that the internal functions can try to avoid duplication when handling +case-independence. + +Arguments: + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cb contains pointers to tables etc. 
+ p points to row of 32-bit values, terminated by NOTACHAR + except character to omit; this is used when adding lists of + case-equivalent characters to avoid including the one we + already know about + +Returns: the number of < 256 characters added + the pointer to extra data is updated +*/ + +static unsigned int +add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, + compile_block *cb, const uint32_t *p, unsigned int except) +{ +unsigned int n8 = 0; +while (p[0] < NOTACHAR) { - min = min * 10 + (int)(*p++ - CHAR_0); - if (min > 65535) + unsigned int n = 0; + if (p[0] != except) { - *errorcodeptr = ERR5; - return p; + while(p[n+1] == p[0] + n + 1) n++; + cb->class_range_start = p[0]; + cb->class_range_end = p[n]; + n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); } + p += n + 1; } - -if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else - { - if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) - { - max = 0; - while(IS_DIGIT(*p)) - { - max = max * 10 + (int)(*p++ - CHAR_0); - if (max > 65535) - { - *errorcodeptr = ERR5; - return p; - } - } - if (max < min) - { - *errorcodeptr = ERR4; - return p; - } - } - } - -*minp = min; -*maxp = max; -return p; +return n8; } /************************************************* -* Scan compiled regex for specific bracket * +* Add characters not in a list to a class * *************************************************/ -/* This function scans through a compiled pattern until it finds a -capturing bracket with the given number, or, if the number is negative, an -instance of OP_REVERSE for a lookbehind. The function is global in the C sense -so that it can be called from pcre2_study() when finding the minimum matching -length. +/* This function is used for adding the complement of a list of horizontal or +vertical whitespace to a class. The list must be in order. 
Arguments: - code points to start of expression - utf TRUE in UTF mode - number the required bracket number or negative to find a lookbehind + classbits the bit map for characters < 256 + uchardptr points to the pointer for extra data + options the options word + cb contains pointers to tables etc. + p points to row of 32-bit values, terminated by NOTACHAR -Returns: pointer to the opcode for the bracket, or NULL if not found +Returns: the number of < 256 characters added + the pointer to extra data is updated */ -PCRE2_SPTR -PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) +static unsigned int +add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, + uint32_t options, compile_block *cb, const uint32_t *p) { +BOOL utf = (options & PCRE2_UTF) != 0; +unsigned int n8 = 0; +if (p[0] > 0) + n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); +while (p[0] < NOTACHAR) + { + while (p[1] == p[0] + 1) p++; + n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, + (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1); + p++; + } +return n8; +} + + + +/************************************************* +* Find details of duplicate group names * +*************************************************/ + +/* This is called from compile_branch() when it needs to know the index and +count of duplicates in the names table when processing named backreferences, +either directly, or as conditions. 
+ +Arguments: + name points to the name + length the length of the name + indexptr where to put the index + countptr where to put the count of duplicates + errorcodeptr where to put an error code + cb the compile block + +Returns: TRUE if OK, FALSE if not, error code set +*/ + +static BOOL +find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, + int *countptr, int *errorcodeptr, compile_block *cb) +{ +uint32_t i, groupnumber; +int count; +PCRE2_UCHAR *slot = cb->name_table; + +/* Find the first entry in the table */ + +for (i = 0; i < cb->names_found; i++) + { + if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 && + slot[IMM2_SIZE+length] == 0) break; + slot += cb->name_entry_size; + } + +/* This should not occur, because this function is called only when we know we +have duplicate names. Give an internal error. */ + +if (i >= cb->names_found) + { + *errorcodeptr = ERR53; + cb->erroroffset = name - cb->start_pattern; + return FALSE; + } + +/* Record the index and then see how many duplicates there are, updating the +backref map and maximum back reference as we do. */ + +*indexptr = i; +count = 0; + for (;;) { - register PCRE2_UCHAR c = *code; + count++; + groupnumber = GET2(slot,0); + cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; + if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; + if (++i >= cb->names_found) break; + slot += cb->name_entry_size; + if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 || + (slot+IMM2_SIZE)[length] != 0) break; + } - if (c == OP_END) return NULL; +*countptr = count; +return TRUE; +} - /* XCLASS is used for classes that cannot be represented just by a bit map. - This includes negated single high-valued characters. CALLOUT_STR is used for - callouts with string arguments. In both cases the length in the table is - zero; the actual length is stored in the compiled code. 
*/ - if (c == OP_XCLASS) code += GET(code, 1); - else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); - /* Handle recursion */ +/************************************************* +* Compile one branch * +*************************************************/ - else if (c == OP_REVERSE) +/* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. If +the options are changed during the branch, the pointer is used to change the +external options bits. This function is used during the pre-compile phase when +we are trying to find out the amount of memory needed, as well as during the +real compile phase. The value of lengthptr distinguishes the two phases. + +Arguments: + optionsptr pointer to the option bits + codeptr points to the pointer to the current code point + pptrptr points to the current parsed pattern pointer + errorcodeptr points to error code variable + firstcuptr place to put the first required code unit + firstcuflagsptr place to put the first code unit flags, or a negative number + reqcuptr place to put the last required code unit + reqcuflagsptr place to put the last required code unit flags, or a negative number + bcptr points to current branch chain + cb contains pointers to tables etc. 
+ lengthptr NULL during the real compile phase + points to length accumulator during pre-compile phase + +Returns: 0 There's been an error, *errorcodeptr is non-zero + +1 Success, this branch must match at least one character + -1 Success, this branch may match an empty string +*/ + +static int +compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, + int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr, + uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, + compile_block *cb, PCRE2_SIZE *lengthptr) +{ +int bravalue = 0; +int okreturn = -1; +int group_return = 0; +uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */ +uint32_t greedy_default, greedy_non_default; +uint32_t repeat_type, op_type; +uint32_t options = *optionsptr; /* May change dynamically */ +uint32_t firstcu, reqcu; +uint32_t zeroreqcu, zerofirstcu; +uint32_t escape; +uint32_t *pptr = *pptrptr; +uint32_t meta, meta_arg; +int32_t firstcuflags, reqcuflags; +int32_t zeroreqcuflags, zerofirstcuflags; +int32_t req_caseopt, reqvary, tempreqvary; +PCRE2_SIZE offset = 0; +PCRE2_SIZE length_prevgroup = 0; +PCRE2_UCHAR *code = *codeptr; +PCRE2_UCHAR *last_code = code; +PCRE2_UCHAR *orig_code = code; +PCRE2_UCHAR *tempcode; +PCRE2_UCHAR *previous = NULL; +PCRE2_UCHAR op_previous; +BOOL groupsetfirstcu = FALSE; +BOOL matched_char = FALSE; +BOOL previous_matched_char = FALSE; +const uint8_t *cbits = cb->cbits; +uint8_t classbits[32]; + +/* We can fish out the UTF setting once and for all into a BOOL, but we must +not do this for other options (e.g. PCRE2_EXTENDED) because they may change +dynamically as we process the pattern. */ + +#ifdef SUPPORT_UNICODE +BOOL utf = (options & PCRE2_UTF) != 0; +#else /* No UTF support */ +BOOL utf = FALSE; +#endif + +/* Helper variables for OP_XCLASS opcode (for characters > 255). 
We define +class_uchardata always so that it can be passed to add_to_class() always, +though it will not be used in non-UTF 8-bit cases. This avoids having to supply +alternative calls for the different cases. */ + +PCRE2_UCHAR *class_uchardata; +#ifdef SUPPORT_WIDE_CHARS +BOOL xclass; +PCRE2_UCHAR *class_uchardata_base; +#endif + +/* Set up the default and non-default settings for greediness */ + +greedy_default = ((options & PCRE2_UNGREEDY) != 0); +greedy_non_default = greedy_default ^ 1; + +/* Initialize no first unit, no required unit. REQ_UNSET means "no char +matching encountered yet". It gets changed to REQ_NONE if we hit something that +matches a non-fixed first unit; reqcu just remains unset if we never find one. + +When we hit a repeat whose minimum is zero, we may have to adjust these values +to take the zero repeat into account. This is implemented by setting them to +zerofirstcu and zeroreqcu when such a repeat is encountered. The individual +item types that can be repeated set these backoff variables appropriately. */ + +firstcu = reqcu = zerofirstcu = zeroreqcu = 0; +firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; + +/* The variable req_caseopt contains either the REQ_CASELESS value or zero, +according to the current setting of the caseless flag. The REQ_CASELESS value +leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables +to record the case status of the value. This is used only for ASCII characters. +*/ + +req_caseopt = ((options & PCRE2_CASELESS) != 0)? 
REQ_CASELESS:0; + +/* Switch on next META item until the end of the branch */ + +for (;; pptr++) + { +#ifdef SUPPORT_WIDE_CHARS + BOOL xclass_has_prop; +#endif + BOOL negate_class; + BOOL should_flip_negation; + BOOL match_all_or_no_wide_chars; + BOOL possessive_quantifier; + BOOL note_group_empty; + int class_has_8bitchar; + int i; + uint32_t mclength; + uint32_t templastcapture; + uint32_t skipunits; + uint32_t subreqcu, subfirstcu; + uint32_t groupnumber; + uint32_t verbarglen, verbculen; + int32_t subreqcuflags, subfirstcuflags; /* Must be signed */ + open_capitem *oc; + PCRE2_UCHAR mcbuffer[8]; + + /* Get next META item in the pattern and its potential argument. */ + + meta = META_CODE(*pptr); + meta_arg = META_DATA(*pptr); + + /* If we are in the pre-compile phase, accumulate the length used for the + previous cycle of this loop, unless the next item is a quantifier. */ + + if (lengthptr != NULL) { - if (number < 0) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Handle capturing bracket */ - - else if (c == OP_CBRA || c == OP_SCBRA || - c == OP_CBRAPOS || c == OP_SCBRAPOS) - { - int n = (int)GET2(code, 1+LINK_SIZE); - if (n == number) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we - must add in its length. */ - - else - { - switch(c) + if (code > cb->start_workspace + cb->workspace_size - + WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)? 
+ ERR52 : ERR86; + return 0; + } + + /* There is at least one situation where code goes backwards: this is the + case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier + is processed, the whole class is eliminated. However, it is created first, + so we have to allow memory for it. Therefore, don't ever reduce the length + at this point. */ + + if (code < last_code) code = last_code; + + /* If the next thing is not a quantifier, we add the length of the previous + item into the total, and reset the code pointer to the start of the + workspace. Otherwise leave the previous item available to be quantified. */ + + if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) + { + if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code)) + { + *errorcodeptr = ERR20; /* Integer overflow */ + return 0; + } + *lengthptr += (PCRE2_SIZE)(code - orig_code); + if (*lengthptr > MAX_PATTERN_SIZE) + { + *errorcodeptr = ERR20; /* Pattern is too large */ + return 0; + } + code = orig_code; + } + + /* Remember where this code item starts so we can catch the "backwards" + case above next time round. */ + + last_code = code; + } + + /* Process the next parsed pattern item. If it is not a quantifier, remember + where it starts so that it can be quantified when a quantifier follows. + Checking for the legality of quantifiers happens in parse_regex(), except for + a quantifier after an assertion that is a condition. 
*/ + + if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) + { + previous = code; + if (matched_char) okreturn = 1; + } + + previous_matched_char = matched_char; + matched_char = FALSE; + note_group_empty = FALSE; + skipunits = 0; /* Default value for most subgroups */ + + switch(meta) + { + /* ===================================================================*/ + /* The branch terminates at pattern end or | or ) */ + + case META_END: + case META_ALT: + case META_KET: + *firstcuptr = firstcu; + *firstcuflagsptr = firstcuflags; + *reqcuptr = reqcu; + *reqcuflagsptr = reqcuflags; + *codeptr = code; + *pptrptr = pptr; + return okreturn; + + + /* ===================================================================*/ + /* Handle single-character metacharacters. In multiline mode, ^ disables + the setting of any following char as a first character. */ + + case META_CIRCUMFLEX: + if ((options & PCRE2_MULTILINE) != 0) + { + if (firstcuflags == REQ_UNSET) + zerofirstcuflags = firstcuflags = REQ_NONE; + *code++ = OP_CIRCM; + } + else *code++ = OP_CIRC; + break; + + case META_DOLLAR: + *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; + break; + + /* There can never be a first char if '.' is first, whatever happens about + repeats. The value of reqcu doesn't change either. */ + + case META_DOT: + matched_char = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; + break; + + + /* ===================================================================*/ + /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. + Otherwise, an initial ']' is taken as a data character. When empty classes + are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must + match any character, so generate OP_ALLANY. 
*/ + + case META_CLASS_EMPTY: + case META_CLASS_EMPTY_NOT: + matched_char = TRUE; + *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + break; + + + /* ===================================================================*/ + /* Non-empty character class. If the included characters are all < 256, we + build a 32-byte bitmap of the permitted characters, except in the special + case where there is only one such character. For negated classes, we build + the map as usual, then invert it at the end. However, we use a different + opcode so that data characters > 255 can be handled correctly. + + If the class contains characters outside the 0-255 range, a different + opcode is compiled. It may optionally have a bit map for characters < 256, + but those above are are explicitly listed afterwards. A flag code unit + tells whether the bitmap is present, and whether this is a negated class or + not. */ + + case META_CLASS_NOT: + case META_CLASS: + matched_char = TRUE; + negate_class = meta == META_CLASS_NOT; + + /* We can optimize the case of a single character in a class by generating + OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's + negative. In the negative case there can be no first char if this item is + first, whatever repeat count may follow. In the case of reqcu, save the + previous value for reinstating. */ + + /* NOTE: at present this optimization is not effective if the only + character in a class in 32-bit, non-UCP mode has its top bit set. */ + + if (pptr[1] < META_END && pptr[2] == META_CLASS_END) + { +#ifdef SUPPORT_UNICODE + uint32_t d; +#endif + uint32_t c = pptr[1]; + + pptr += 2; /* Move on to class end */ + if (meta == META_CLASS) /* A positive one-char class can be */ + { /* handled as a normal literal character. 
*/ + meta = c; /* Set up the character */ + goto NORMAL_CHAR_SET; + } + + /* Handle a negative one-character class */ + + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + + /* For caseless UTF mode, check whether this character has more than + one other case. If so, generate a special OP_NOTPROP item instead of + OP_NOTI. */ + +#ifdef SUPPORT_UNICODE + if (utf && (options & PCRE2_CASELESS) != 0 && + (d = UCD_CASESET(c)) != 0) + { + *code++ = OP_NOTPROP; + *code++ = PT_CLIST; + *code++ = d; + break; /* We are finished with this class */ + } +#endif + /* Char has only one other case, or UCP not available */ + + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT; + code += PUTCHAR(c, code); + break; /* We are finished with this class */ + } /* End of 1-char optimization */ + + /* Handle character classes that contain more than just one literal + character. */ + + /* If a non-extended class contains a negative special such as \S, we need + to flip the negation flag at the end, so that support for characters > 255 + works correctly (they are all included in the class). An extended class may + need to insert specific matching or non-matching code for wide characters. + */ + + should_flip_negation = match_all_or_no_wide_chars = FALSE; + + /* Extended class (xclass) will be used when characters > 255 + might match. */ + +#ifdef SUPPORT_WIDE_CHARS + xclass = FALSE; + class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ + class_uchardata_base = class_uchardata; /* Save the start */ +#endif + + /* For optimization purposes, we track some properties of the class: + class_has_8bitchar will be non-zero if the class contains at least one + character with a code point less than 256; xclass_has_prop will be TRUE if + Unicode property checks are present in the class. 
*/ + + class_has_8bitchar = 0; +#ifdef SUPPORT_WIDE_CHARS + xclass_has_prop = FALSE; +#endif + + /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map + in a temporary bit of memory, in case the class contains fewer than two + 8-bit characters because in that case the compiled code doesn't use the bit + map. */ + + memset(classbits, 0, 32 * sizeof(uint8_t)); + + /* Process items until META_CLASS_END is reached. */ + + while ((meta = *(++pptr)) != META_CLASS_END) + { + /* Handle POSIX classes such as [:alpha:] etc. */ + + if (meta == META_POSIX || meta == META_POSIX_NEG) + { + BOOL local_negate = (meta == META_POSIX_NEG); + int posix_class = *(++pptr); + int taboffset, tabopt; + uint8_t pbits[32]; + + should_flip_negation = local_negate; /* Note negative special */ + + /* If matching is caseless, upper and lower are converted to alpha. + This relies on the fact that the class table starts with alpha, + lower, upper as the first 3 entries. */ + + if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) + posix_class = 0; + + /* When PCRE2_UCP is set, some of the POSIX classes are converted to + different escape sequences that use Unicode properties \p or \P. + Others that are not available via \p or \P have to generate + XCL_PROP/XCL_NOTPROP directly, which is done here. */ + +#ifdef SUPPORT_UNICODE + if ((options & PCRE2_UCP) != 0) switch(posix_class) + { + case PC_GRAPH: + case PC_PRINT: + case PC_PUNCT: + *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; + *class_uchardata++ = (PCRE2_UCHAR) + ((posix_class == PC_GRAPH)? PT_PXGRAPH : + (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT); + *class_uchardata++ = 0; + xclass_has_prop = TRUE; + goto CONTINUE_CLASS; + + /* For the other POSIX classes (ascii, xdigit) we are going to + fall through to the non-UCP case and build a bit map for + characters with code points less than 256. 
However, if we are in + a negated POSIX class, characters with code points greater than + 255 must either all match or all not match, depending on whether + the whole class is not or is negated. For example, for + [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]... + they must not. + + In the special case where there are no xclass items, this is + automatically handled by the use of OP_CLASS or OP_NCLASS, but an + explicit range is needed for OP_XCLASS. Setting a flag here + causes the range to be generated later when it is known that + OP_XCLASS is required. In the 8-bit library this is relevant only in + utf mode, since no wide characters can exist otherwise. */ + + default: +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (utf) +#endif + match_all_or_no_wide_chars |= local_negate; + break; + } +#endif /* SUPPORT_UNICODE */ + + /* In the non-UCP case, or when UCP makes no difference, we build the + bit map for the POSIX class in a chunk of local store because we may + be adding and subtracting from it, and we don't want to subtract bits + that may be in the main map already. At the end we or the result into + the bit map that is being built. */ + + posix_class *= 3; + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uint8_t)); + + /* If there is a second table, add or remove it as required. */ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) + { + if (tabopt >= 0) + for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset]; + else + for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset]; + } + + /* Now see if we need to remove any special characters. An option + value of 1 removes vertical space and 2 removes underscore. 
*/ + + if (tabopt < 0) tabopt = -tabopt; + if (tabopt == 1) pbits[1] &= ~0x3c; + else if (tabopt == 2) pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the main table that is + being built and we are done. */ + + if (local_negate) + for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i]; + else + for (i = 0; i < 32; i++) classbits[i] |= pbits[i]; + + /* Every class contains at least one < 256 character. */ + + class_has_8bitchar = 1; + goto CONTINUE_CLASS; /* End of POSIX handling */ + } + + /* Other than POSIX classes, the only items we should encounter are + \d-type escapes and literal characters (possibly as ranges). */ + + if (meta == META_BIGVALUE) + { + meta = *(++pptr); + goto CLASS_LITERAL; + } + + /* Any other non-literal must be an escape */ + + if (meta >= META_END) + { + if (META_CODE(meta) != META_ESCAPE) + { +#ifdef DEBUG_SHOW_PARSED + fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x " + "in character class\n", meta); +#endif + *errorcodeptr = ERR89; /* Internal error - unrecognized. */ + return 0; + } + escape = META_DATA(meta); + + /* Every class contains at least one < 256 character. */ + + class_has_8bitchar++; + + switch(escape) + { + case ESC_d: + for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; + break; + + case ESC_D: + should_flip_negation = TRUE; + for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit]; + break; + + case ESC_w: + for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; + break; + + case ESC_W: + should_flip_negation = TRUE; + for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word]; + break; + + /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl + 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was + previously set by something earlier in the character class. + Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so + we could just adjust the appropriate bit. From PCRE 8.34 we no + longer treat \s and \S specially. 
*/ + + case ESC_s: + for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; + break; + + case ESC_S: + should_flip_negation = TRUE; + for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space]; + break; + + /* When adding the horizontal or vertical space lists to a class, or + their complements, disable PCRE2_CASELESS, because it justs wastes + time, and in the "not-x" UTF cases can create unwanted duplicates in + the XCLASS list (provoked by characters that have more than one other + case and by both cases being in the same "not-x" sublist). */ + + case ESC_h: + (void)add_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); + break; + + case ESC_H: + (void)add_not_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); + break; + + case ESC_v: + (void)add_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); + break; + + case ESC_V: + (void)add_not_list_to_class(classbits, &class_uchardata, + options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); + break; + + case ESC_p: + case ESC_P: + { + uint32_t ptype = *(++pptr) >> 16; + uint32_t pdata = *pptr & 0xffff; + *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; + *class_uchardata++ = ptype; + *class_uchardata++ = pdata; +#ifdef SUPPORT_WIDE_CHARS + xclass_has_prop = TRUE; +#endif + class_has_8bitchar--; /* Undo! */ + } + break; + } + + goto CONTINUE_CLASS; + } /* End handling \d-type escapes */ + + /* A literal character may be followed by a range meta. At parse time + there are checks for out-of-order characters, for ranges where the two + characters are equal, and for hyphens that cannot indicate a range. At + this point, therefore, no checking is needed. 
*/ + + else + { + uint32_t c, d; + + CLASS_LITERAL: + c = d = meta; + + /* Remember if \r or \n were explicitly used */ + + if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; + + /* Process a character range */ + + if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED) + { +#ifdef EBCDIC + BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL); +#endif + pptr += 2; + d = *pptr; + if (d == META_BIGVALUE) d = *(++pptr); + + /* Remember an explicit \r or \n, and add the range to the class. */ + + if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; + + /* In an EBCDIC environment, Perl treats alphabetic ranges specially + because there are holes in the encoding, and simply using the range + A-Z (for example) would include the characters in the holes. This + applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ + +#ifdef EBCDIC + if (range_is_literal && + (cb->ctypes[c] & ctype_letter) != 0 && + (cb->ctypes[d] & ctype_letter) != 0 && + (d <= CHAR_z) == (d <= CHAR_z)) + { + uint32_t uc = (d <= CHAR_z)? 0 : 64; + uint32_t C = d - uc; + uint32_t D = d - uc; + + if (C <= CHAR_i) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + ((D < CHAR_i)? D : CHAR_i) + uc); + C = CHAR_j; + } + + if (C <= D && C <= CHAR_r) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + ((D < CHAR_r)? D : CHAR_r) + uc); + C = CHAR_s; + } + + if (C <= D) + { + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, C + uc, + D + uc); + } + } + else +#endif + /* Not an EBCDIC special range */ + + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, c, d); + goto CONTINUE_CLASS; /* Go get the next char in the class */ + } /* End of range handling */ + + + /* Handle a single character. 
*/ + + class_has_8bitchar += + add_to_class(classbits, &class_uchardata, options, cb, meta, meta); + } + + /* Continue to the next item in the class. */ + + CONTINUE_CLASS: + +#ifdef SUPPORT_WIDE_CHARS + /* If any wide characters or Unicode properties have been encountered, + set xclass = TRUE. Then, in the pre-compile phase, accumulate the length + of the extra data and reset the pointer. This is so that very large + classes that contain a zillion wide characters or Unicode property tests + do not overwrite the work space (which is on the stack). */ + + if (class_uchardata > class_uchardata_base) + { + xclass = TRUE; + if (lengthptr != NULL) + { + *lengthptr += class_uchardata - class_uchardata_base; + class_uchardata = class_uchardata_base; + } + } +#endif + + continue; /* Needed to avoid error when not supporting wide chars */ + } /* End of main class-processing loop */ + + /* If this class is the first thing in the branch, there can be no first + char setting, whatever the repeat count. Any reqcu setting must remain + unchanged after any kind of repeat. */ + + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If there are characters with values > 255, or Unicode property settings + (\p or \P), we have to compile an extended class, with its own opcode, + unless there were no property settings and there was a negated special such + as \S in the class, and PCRE2_UCP is not set, because in that case all + characters > 255 are in or not in the class, so any that were explicitly + given as well can be ignored. + + In the UCP case, if certain negated POSIX classes ([:^ascii:] or + [^:xdigit:]) were present in a class, we either have to match or not match + all wide characters (depending on whether the whole class is or is not + negated). This requirement is indicated by match_all_or_no_wide_chars being + true. 
We do this by including an explicit range, which works in both cases. + This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there + cannot be any wide characters in 8-bit non-UTF mode. + + When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit + class where \S etc is present without PCRE2_UCP, causing an extended class + to be compiled, we make sure that all characters > 255 are included by + forcing match_all_or_no_wide_chars to be true. + + If, when generating an xclass, there are no characters < 256, we can omit + the bitmap in the actual compiled code. */ + +#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ + if (xclass && ( +#ifdef SUPPORT_UNICODE + (options & PCRE2_UCP) != 0 || +#endif + xclass_has_prop || !should_flip_negation)) + { + if (match_all_or_no_wide_chars || ( +#if PCRE2_CODE_UNIT_WIDTH == 8 + utf && +#endif + should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) + { + *class_uchardata++ = XCL_RANGE; + if (utf) /* Will always be utf in the 8-bit library */ + { + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); + } + else /* Can only happen for the 16-bit & 32-bit libraries */ + { +#if PCRE2_CODE_UNIT_WIDTH == 16 + *class_uchardata++ = 0x100; + *class_uchardata++ = 0xffffu; +#elif PCRE2_CODE_UNIT_WIDTH == 32 + *class_uchardata++ = 0x100; + *class_uchardata++ = 0xffffffffu; +#endif + } + } + *class_uchardata++ = XCL_END; /* Marks the end of extra data */ + *code++ = OP_XCLASS; + code += LINK_SIZE; + *code = negate_class? XCL_NOT:0; + if (xclass_has_prop) *code |= XCL_HASPROP; + + /* If the map is required, move up the extra data to make room for it; + otherwise just move the code pointer to the end of the extra data. 
*/ + + if (class_has_8bitchar > 0) + { + *code++ |= XCL_MAP; + memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, + CU2BYTES(class_uchardata - code)); + if (negate_class && !xclass_has_prop) + for (i = 0; i < 32; i++) classbits[i] = ~classbits[i]; + memcpy(code, classbits, 32); + code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); + } + else code = class_uchardata; + + /* Now fill in the complete length of the item */ + + PUT(previous, 1, (int)(code - previous)); + break; /* End of class handling */ + } +#endif /* SUPPORT_WIDE_CHARS */ + + /* If there are no characters > 255, or they are all to be included or + excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the + whole class was negated and whether there were negative specials such as \S + (non-UCP) in the class. Then copy the 32-byte map into the code vector, + negating it if necessary. */ + + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; + if (lengthptr == NULL) /* Save time in the pre-compile phase */ + { + if (negate_class) + for (i = 0; i < 32; i++) classbits[i] = ~classbits[i]; + memcpy(code, classbits, 32); + } + code += 32 / sizeof(PCRE2_UCHAR); + break; /* End of class processing */ + + + /* ===================================================================*/ + /* Deal with (*VERB)s. */ + + /* Check for open captures before ACCEPT and convert it to ASSERT_ACCEPT if + in an assertion. In the first pass, just accumulate the length required; + otherwise hitting (*ACCEPT) inside many nested parentheses can cause + workspace overflow. Do not set firstcu after *ACCEPT. */ + + case META_ACCEPT: + cb->had_accept = TRUE; + for (oc = cb->open_caps; oc != NULL; oc = oc->next) + { + if (lengthptr != NULL) + { + *lengthptr += CU2BYTES(1) + IMM2_SIZE; + } + else + { + *code++ = OP_CLOSE; + PUT2INC(code, 0, oc->number); + } + } + *code++ = (cb->assert_depth > 0)? 
OP_ASSERT_ACCEPT : OP_ACCEPT; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + break; + + case META_PRUNE: + case META_SKIP: + cb->had_pruneorskip = TRUE; + /* Fall through */ + case META_COMMIT: + case META_FAIL: + *code++ = verbops[(meta - META_MARK) >> 16]; + break; + + case META_THEN: + cb->external_flags |= PCRE2_HASTHEN; + *code++ = OP_THEN; + break; + + /* Handle verbs with arguments. Arguments can be very long, especially in + 16- and 32-bit modes, and can overflow the workspace in the first pass. + However, the argument length is constrained to be small enough to fit in + one code unit. This check happens in parse_regex(). In the first pass, + instead of putting the argument into memory, we just update the length + counter and set up an empty argument. */ + + case META_THEN_ARG: + cb->external_flags |= PCRE2_HASTHEN; + goto VERB_ARG; + + case META_PRUNE_ARG: + case META_SKIP_ARG: + cb->had_pruneorskip = TRUE; + /* Fall through */ + case META_MARK: + VERB_ARG: + *code++ = verbops[(meta - META_MARK) >> 16]; + /* The length is in characters. */ + verbarglen = *(++pptr); + verbculen = 0; + tempcode = code++; + for (i = 0; i < (int)verbarglen; i++) + { + meta = *(++pptr); +#ifdef SUPPORT_UNICODE + if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else +#endif + { + mclength = 1; + mcbuffer[0] = meta; + } + if (lengthptr != NULL) *lengthptr += mclength; else + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + verbculen += mclength; + } + } + + *tempcode = verbculen; /* Fill in the code unit length */ + *code++ = 0; /* Terminating zero */ + break; + + + /* ===================================================================*/ + /* Handle options change. The new setting must be passed back for use in + subsequent branches. Reset the greedy defaults and the case value for + firstcu and reqcu. 
*/ + + case META_OPTIONS: + *optionsptr = options = *(++pptr); + greedy_default = ((options & PCRE2_UNGREEDY) != 0); + greedy_non_default = greedy_default ^ 1; + req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0; + break; + + + /* ===================================================================*/ + /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous + because it could be a numerical check on recursion, or a name check on a + group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that + we can handle it either way. We first try for a name; if not found, process + the number. */ + + case META_COND_RNUMBER: /* (?(Rdigits) */ + case META_COND_NAME: /* (?(name) or (?'name') or ?( ) */ + case META_COND_RNAME: /* (?(R&name) - test for recursion */ + bravalue = OP_COND; + { + int count, index; + PCRE2_SPTR name; + named_group *ng = cb->named_groups; + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + + /* In the first pass, the names generated in the pre-pass are available, + but the main name table has not yet been created. Scan the list of names + generated in the pre-pass in order to get a number and whether or not + this name is duplicated. If it is not duplicated, we can handle it as a + numerical group. */ + + for (i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && + PRIV(strncmp)(name, ng->name, length) == 0) + { + if (!ng->isdup) + { + code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; + PUT2(code, 2+LINK_SIZE, ng->number); + if (ng->number > cb->top_backref) cb->top_backref = ng->number; + skipunits = 1+IMM2_SIZE; + goto GROUP_PROCESS_NOTE_EMPTY; + } + break; /* Found a duplicated name */ + } + } + + /* If the name was not found we have a bad reference, unless we are + dealing with R , which is treated as a recursion test by number. 
+ */ + + if (i >= cb->names_found) + { + groupnumber = 0; + if (meta == META_COND_RNUMBER) + { + for (i = 1; i < (int)length; i++) + { + groupnumber = groupnumber * 10 + name[i] - CHAR_0; + if (groupnumber > MAX_GROUP_NUMBER) + { + *errorcodeptr = ERR61; + cb->erroroffset = offset + i; + return 0; + } + } + } + + if (meta != META_COND_RNUMBER || groupnumber > cb->bracount) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + + /* (?Rdigits) treated as a recursion reference by number. A value of + zero (which is the result of both (?R) and (?R0)) means "any", and is + translated into RREF_ANY (which is 0xffff). */ + + if (groupnumber == 0) groupnumber = RREF_ANY; + code[1+LINK_SIZE] = OP_RREF; + PUT2(code, 2+LINK_SIZE, groupnumber); + skipunits = 1+IMM2_SIZE; + goto GROUP_PROCESS_NOTE_EMPTY; + } + + /* A duplicated name was found. Note that if an R name is found + (META_COND_RNUMBER), it is a reference test, not a recursion test. */ + + code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF; + + /* We have a duplicated name. In the compile pass we have to search the + main table in order to get the index and count values. */ + + count = 0; /* Values for first pass (avoids compiler warning) */ + index = 0; + if (lengthptr == NULL && !find_dupname_details(name, length, &index, + &count, errorcodeptr, cb)) return 0; + + /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and + insert appropriate data values. */ + + code[1+LINK_SIZE]++; + skipunits = 1+2*IMM2_SIZE; + PUT2(code, 2+LINK_SIZE, index); + PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); + } + goto GROUP_PROCESS_NOTE_EMPTY; + + /* The DEFINE condition is always false. It's internal groups may never + be called, so matched_char must remain false, hence the jump to + GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. 
*/ + + case META_COND_DEFINE: + bravalue = OP_COND; + GETPLUSOFFSET(offset, pptr); + code[1+LINK_SIZE] = OP_DEFINE; + skipunits = 1; + goto GROUP_PROCESS; + + /* Conditional test of a group's being set. */ + + case META_COND_NUMBER: + bravalue = OP_COND; + GETPLUSOFFSET(offset, pptr); + groupnumber = *(++pptr); + if (groupnumber > cb->bracount) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; + offset -= 2; /* Point at initial ( for too many branches error */ + code[1+LINK_SIZE] = OP_CREF; + skipunits = 1+IMM2_SIZE; + PUT2(code, 2+LINK_SIZE, groupnumber); + goto GROUP_PROCESS_NOTE_EMPTY; + + /* Test for the PCRE2 version. */ + + case META_COND_VERSION: + bravalue = OP_COND; + if (pptr[1] > 0) + code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) || + (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))? + OP_TRUE : OP_FALSE; + else + code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])? + OP_TRUE : OP_FALSE; + skipunits = 1; + pptr += 3; + goto GROUP_PROCESS_NOTE_EMPTY; + + /* The condition is an assertion, possibly preceded by a callout. */ + + case META_COND_ASSERT: + bravalue = OP_COND; + goto GROUP_PROCESS_NOTE_EMPTY; + + + /* ===================================================================*/ + /* Handle all kinds of nested bracketed groups. The non-capturing, + non-conditional cases are here; others come to GROUP_PROCESS via goto. */ + + case META_LOOKAHEAD: + bravalue = OP_ASSERT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird + thing to do, but Perl allows all assertions to be quantified, and when + they contain capturing parentheses there may be a potential use for + this feature. Not that that applies to a quantified (?!) but we allow + it for uniformity. 
*/ + + case META_LOOKAHEADNOT: + if (pptr[1] == META_KET && + (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY)) + { + *code++ = OP_FAIL; + pptr++; + } + else + { + bravalue = OP_ASSERT_NOT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + } + break; + + case META_LOOKBEHIND: + bravalue = OP_ASSERTBACK; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_LOOKBEHINDNOT: + bravalue = OP_ASSERTBACK_NOT; + cb->assert_depth += 1; + goto GROUP_PROCESS; + + case META_ATOMIC: + bravalue = OP_ONCE; + goto GROUP_PROCESS_NOTE_EMPTY; + + case META_NOCAPTURE: + bravalue = OP_BRA; + /* Fall through */ + + /* Process nested bracketed regex. The nesting depth is maintained for the + benefit of the stackguard function. The test for too deep nesting is now + done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; + others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take + note of whether or not they may match an empty string. */ + + GROUP_PROCESS_NOTE_EMPTY: + note_group_empty = TRUE; + + GROUP_PROCESS: + cb->parens_depth += 1; + *code = bravalue; + pptr++; + tempcode = code; + tempreqvary = cb->req_varyopt; /* Save value before group */ + templastcapture = cb->lastcapture; /* Save value before group */ + length_prevgroup = 0; /* Initialize for pre-compile phase */ + + if ((group_return = + compile_regex( + options, /* The option state */ + &tempcode, /* Where to put code (updated) */ + &pptr, /* Input pointer (updated) */ + errorcodeptr, /* Where to put an error message */ + skipunits, /* Skip over bracket number */ + &subfirstcu, /* For possible first char */ + &subfirstcuflags, + &subreqcu, /* For possible last char */ + &subreqcuflags, + bcptr, /* Current branch chain */ + cb, /* Compile data block */ + (lengthptr == NULL)? 
NULL : /* Actual compile phase */ + &length_prevgroup /* Pre-compile phase */ + )) == 0) + return 0; /* Error */ + + cb->parens_depth -= 1; + + /* If that was a non-conditional significant group (not an assertion, not a + DEFINE) that matches at least one character, then the current item matches + a character. Conditionals are handled below. */ + + if (note_group_empty && bravalue != OP_COND && group_return > 0) + matched_char = TRUE; + + /* If that was an atomic group and there are no capturing groups within it, + generate OP_ONCE_NC instead of OP_ONCE. */ + + if (bravalue == OP_ONCE && cb->lastcapture <= templastcapture) + *code = OP_ONCE_NC; + + /* If we've just compiled an assertion, pop the assert depth. */ + + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) + cb->assert_depth -= 1; + + /* At the end of compiling, code is still pointing to the start of the + group, while tempcode has been updated to point past the end of the group. + The parsed pattern pointer (pptr) is on the closing META_KET. + + If this is a conditional bracket, check that there are no more than + two branches in the group, or just one if it's a DEFINE group. We do this + in the real compile phase, not in the pre-pass, where the whole group may + not be available. */ + + if (bravalue == OP_COND && lengthptr == NULL) + { + PCRE2_UCHAR *tc = code; + int condcount = 0; + + do { + condcount++; + tc += GET(tc,1); + } + while (*tc != OP_KET); + + /* A DEFINE group is never obeyed inline (the "condition" is always + false). It must have only one branch. Having checked this, change the + opcode to OP_FALSE. */ + + if (code[LINK_SIZE+1] == OP_DEFINE) + { + if (condcount > 1) + { + cb->erroroffset = offset; + *errorcodeptr = ERR54; + return 0; + } + code[LINK_SIZE+1] = OP_FALSE; + bravalue = OP_DEFINE; /* A flag to suppress char handling below */ + } + + /* A "normal" conditional group. 
If there is just one branch, we must not + make use of its firstcu or reqcu, because this is equivalent to an + empty second branch. Also, it may match an empty string. If there are two + branches, this item must match a character if the group must. */ + + else + { + if (condcount > 2) + { + cb->erroroffset = offset; + *errorcodeptr = ERR27; + return 0; + } + if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; + else if (group_return > 0) matched_char = TRUE; + } + } + + /* In the pre-compile phase, update the length by the length of the group, + less the brackets at either end. Then reduce the compiled code to just a + set of non-capturing brackets so that it doesn't use much memory if it is + duplicated by a quantifier.*/ + + if (lengthptr != NULL) + { + if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; + code++; /* This already contains bravalue */ + PUTINC(code, 0, 1 + LINK_SIZE); + *code++ = OP_KET; + PUTINC(code, 0, 1 + LINK_SIZE); + break; /* No need to waste time with special character handling */ + } + + /* Otherwise update the main code pointer to the end of the group. */ + + code = tempcode; + + /* For a DEFINE group, required and first character settings are not + relevant. */ + + if (bravalue == OP_DEFINE) break; + + /* Handle updating of the required and first code units for other types of + group. Update for normal brackets of all kinds, and conditions with two + branches (see code above). If the bracket is followed by a quantifier with + zero repeat, we have to back off. Hence the definition of zeroreqcu and + zerofirstcu outside the main loop so that they can be accessed for the back + off. 
*/ + + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + groupsetfirstcu = FALSE; + + if (bravalue >= OP_ONCE) /* Not an assertion */ + { + /* If we have not yet set a firstcu in this branch, take it from the + subpattern, remembering that it was set here so that a repeat of more + than one can replicate it as reqcu if necessary. If the subpattern has + no firstcu, set "none" for the whole branch. In both cases, a zero + repeat forces firstcu to "none". */ + + if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) + { + if (subfirstcuflags >= 0) + { + firstcu = subfirstcu; + firstcuflags = subfirstcuflags; + groupsetfirstcu = TRUE; + } + else firstcuflags = REQ_NONE; + zerofirstcuflags = REQ_NONE; + } + + /* If firstcu was previously set, convert the subpattern's firstcu + into reqcu if there wasn't one, using the vary flag that was in + existence beforehand. */ + + else if (subfirstcuflags >= 0 && subreqcuflags < 0) + { + subreqcu = subfirstcu; + subreqcuflags = subfirstcuflags | tempreqvary; + } + + /* If the subpattern set a required code unit (or set a first code unit + that isn't really the first code unit - see above), set it. */ + + if (subreqcuflags >= 0) + { + reqcu = subreqcu; + reqcuflags = subreqcuflags; + } + } + + /* For a forward assertion, we take the reqcu, if set, provided that the + group has also set a firstcu. This can be helpful if the pattern that + follows the assertion doesn't set a different char. For example, it's + useful for /(?=abcde).+/. We can't set firstcu for an assertion, however + because it leads to incorrect effect for patterns such as /(?=a)a.+/ when + the "real" "a" would then become a reqcu instead of a firstcu. This is + overcome by a scan at the end if there's no firstcu, looking for an + asserted first char. A similar effect for patterns like /(?=.*X)X$/ means + we must only take the reqcu when the group also set a firstcu. 
Otherwise, + in that example, 'X' ends up set for both. */ + + else if (bravalue == OP_ASSERT && subreqcuflags >= 0 && + subfirstcuflags >= 0) + { + reqcu = subreqcu; + reqcuflags = subreqcuflags; + } + + break; /* End of nested group handling */ + + + /* ===================================================================*/ + /* Handle named backreferences and recursions. */ + + case META_BACKREF_BYNAME: + case META_RECURSE_BYNAME: + { + int count, index; + PCRE2_SPTR name; + BOOL is_dupname = FALSE; + named_group *ng = cb->named_groups; + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + + /* In the first pass, the names generated in the pre-pass are available, + but the main name table has not yet been created. Scan the list of names + generated in the pre-pass in order to get a number and whether or not + this name is duplicated. */ + + groupnumber = 0; + for (i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && + PRIV(strncmp)(name, ng->name, length) == 0) + { + is_dupname = ng->isdup; + groupnumber = ng->number; + + /* For a recursion, that's all that is needed. We can now go to + the code above that handles numerical recursion, applying it to + the first group with the given name. */ + + if (meta == META_RECURSE_BYNAME) + { + meta_arg = groupnumber; + goto HANDLE_NUMERICAL_RECURSION; + } + + /* For a back reference, update the back reference map and the + maximum back reference. Then, for each group, we must check to + see if it is recursive, that is, it is inside the group that it + references. A flag is set so that the group can be made atomic. + */ + + cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; + if (groupnumber > cb->top_backref) + cb->top_backref = groupnumber; + + for (oc = cb->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == groupnumber) + { + oc->flag = TRUE; + break; + } + } + } + } + + /* If the name was not found we have a bad reference. 
*/ + + if (groupnumber == 0) + { + *errorcodeptr = ERR15; + cb->erroroffset = offset; + return 0; + } + + /* If a back reference name is not duplicated, we can handle it as + a numerical reference. */ + + if (!is_dupname) + { + meta_arg = groupnumber; + goto HANDLE_SINGLE_REFERENCE; + } + + /* If a back reference name is duplicated, we generate a different + opcode to a numerical back reference. In the second pass we must + search for the index and count in the final name table. */ + + count = 0; /* Values for first pass (avoids compiler warning) */ + index = 0; + if (lengthptr == NULL && !find_dupname_details(name, length, &index, + &count, errorcodeptr, cb)) return 0; + + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; + PUT2INC(code, 0, index); + PUT2INC(code, 0, count); + } + break; + + + /* ===================================================================*/ + /* Handle a numerical callout. */ + + case META_CALLOUT_NUMBER: + code[0] = OP_CALLOUT; + PUT(code, 1, pptr[1]); /* Offset to next pattern item */ + PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ + code[1 + 2*LINK_SIZE] = pptr[3]; + pptr += 3; + code += PRIV(OP_lengths)[OP_CALLOUT]; + break; + + + /* ===================================================================*/ + /* Handle a callout with a string argument. In the pre-pass we just compute + the length without generating anything. The length in pptr[3] includes both + delimiters; in the actual compile only the first one is copied, but a + terminating zero is added. Any doubled delimiters within the string make + this an overestimate, but it is not worth bothering about. */ + + case META_CALLOUT_STRING: + if (lengthptr != NULL) + { + *lengthptr += pptr[3] + (1 + 4*LINK_SIZE); + pptr += 3; + SKIPOFFSET(pptr); + } + + /* In the real compile we can copy the string. The starting delimiter is + included so that the client can discover it if they want. 
We also pass the + start offset to help a script language give better error messages. */ + + else + { + PCRE2_SPTR pp; + uint32_t delimiter; + uint32_t length = pptr[3]; + PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); + + code[0] = OP_CALLOUT_STR; + PUT(code, 1, pptr[1]); /* Offset to next pattern item */ + PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */ + + pptr += 3; + GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */ + pp = cb->start_pattern + offset; + delimiter = *callout_string++ = *pp++; + if (delimiter == CHAR_LEFT_CURLY_BRACKET) + delimiter = CHAR_RIGHT_CURLY_BRACKET; + PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */ + + /* The syntax of the pattern was checked in the parsing scan. The length + includes both delimiters, but we have passed the opening one just above, + so we reduce length before testing it. The test is for > 1 because we do + not want to copy the final delimiter. This also ensures that pp[1] is + accessible. */ + + while (--length > 1) + { + if (*pp == delimiter && pp[1] == delimiter) + { + *callout_string++ = delimiter; + pp += 2; + length--; + } + else *callout_string++ = *pp++; + } + *callout_string++ = CHAR_NULL; + + /* Set the length of the entire item, the advance to its end. */ + + PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code)); + code = callout_string; + } + break; + + + /* ===================================================================*/ + /* Handle repetition. The different types are all sorted out in the parsing + pass. 
*/ + + case META_MINMAX_PLUS: + case META_MINMAX_QUERY: + case META_MINMAX: + repeat_min = *(++pptr); + repeat_max = *(++pptr); + goto REPEAT; + + case META_ASTERISK: + case META_ASTERISK_PLUS: + case META_ASTERISK_QUERY: + repeat_min = 0; + repeat_max = REPEAT_UNLIMITED; + goto REPEAT; + + case META_PLUS: + case META_PLUS_PLUS: + case META_PLUS_QUERY: + repeat_min = 1; + repeat_max = REPEAT_UNLIMITED; + goto REPEAT; + + case META_QUERY: + case META_QUERY_PLUS: + case META_QUERY_QUERY: + repeat_min = 0; + repeat_max = 1; + + REPEAT: + if (previous_matched_char && repeat_min > 0) matched_char = TRUE; + + /* Remember whether this is a variable length repeat, and default to + single-char opcodes. */ + + reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; + op_type = 0; + + /* If the repeat is {1} we can ignore it. */ + + if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; + + /* Adjust first and required code units for a zero repeat. */ + + if (repeat_min == 0) + { + firstcu = zerofirstcu; + firstcuflags = zerofirstcuflags; + reqcu = zeroreqcu; + reqcuflags = zeroreqcuflags; + } + + /* Note the greediness and possessiveness. 
*/ + + switch (meta) + { + case META_MINMAX_PLUS: + case META_ASTERISK_PLUS: + case META_PLUS_PLUS: + case META_QUERY_PLUS: + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; break; - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; + case META_MINMAX_QUERY: + case META_ASTERISK_QUERY: + case META_PLUS_QUERY: + case META_QUERY_QUERY: + repeat_type = greedy_non_default; + possessive_quantifier = FALSE; break; - case OP_MARK: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; + default: + repeat_type = greedy_default; + possessive_quantifier = FALSE; break; } - /* Add in the fixed length from the table */ + /* Save start of previous item, in case we have to move it up in order to + insert something before it, and remember what it was. */ - code += PRIV(OP_lengths)[c]; + tempcode = previous; + op_previous = *previous; - /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be - followed by a multi-byte character. The length in the table is a minimum, so - we have to arrange to skip the extra bytes. */ + /* If previous was a recursion call, wrap it in atomic brackets so that + previous becomes the atomic group. All recursions were so wrapped in the + past, but it no longer happens for non-repeated recursions. In fact, the + repeated ones could be re-implemented independently so as not to need this, + but for the moment we rely on the code for repeating groups. 
*/ -#ifdef MAYBE_UTF_MULTI - if (utf) switch(c) + if (op_previous == OP_RECURSE) { + memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); + op_previous = *previous = OP_ONCE; + PUT(previous, 1, 2 + 2*LINK_SIZE); + previous[2 + 2*LINK_SIZE] = OP_KET; + PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); + code += 2 + 2 * LINK_SIZE; + length_prevgroup = 3 + 3*LINK_SIZE; + group_return = -1; /* Set "may match empty string" */ + } + + /* Now handle repetition for the different types of item. */ + + switch (op_previous) + { + /* If previous was a character or negated character match, abolish the + item and generate a repeat item instead. If a char item has a minimum of + more than one, ensure that it is set in reqcu - it might not be if a + sequence such as x{3} is the first thing in a branch because the x will + have gone into firstcu instead. */ + case OP_CHAR: case OP_CHARI: case OP_NOT: case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_PLUS: - case OP_PLUSI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - if 
(HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); - break; - } -#else - (void)(utf); /* Keep compiler happy by referencing function argument */ + op_type = chartypeoffset[op_previous - OP_CHAR]; + + /* Deal with UTF characters that take up more than one code unit. */ + +#ifdef MAYBE_UTF_MULTI + if (utf && NOT_FIRSTCU(code[-1])) + { + PCRE2_UCHAR *lastchar = code - 1; + BACKCHAR(lastchar); + mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ + memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ + } + else #endif /* MAYBE_UTF_MULTI */ - } + + /* Handle the case of a single code unit - either with no UTF support, or + with UTF disabled, or for a single-code-unit UTF character. */ + { + mcbuffer[0] = code[-1]; + mclength = 1; + if (op_previous <= OP_CHARI && repeat_min > 1) + { + reqcu = mcbuffer[0]; + reqcuflags = req_caseopt | cb->req_varyopt; + } + } + goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ + + /* If previous was a character class or a back reference, we put the + repeat stuff after it, but just skip the item if the repeat was {0,0}. */ + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: +#endif + case OP_CLASS: + case OP_NCLASS: + case OP_REF: + case OP_REFI: + case OP_DNREF: + case OP_DNREFI: + + if (repeat_max == 0) + { + code = previous; + goto END_REPEAT; + } + + if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED) + *code++ = OP_CRSTAR + repeat_type; + else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED) + *code++ = OP_CRPLUS + repeat_type; + else if (repeat_min == 0 && repeat_max == 1) + *code++ = OP_CRQUERY + repeat_type; + else + { + *code++ = OP_CRRANGE + repeat_type; + PUT2INC(code, 0, repeat_min); + if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */ + PUT2INC(code, 0, repeat_max); + } + break; + + /* If previous is OP_FAIL, it was generated by an empty class [] + (PCRE2_ALLOW_EMPTY_CLASS is set). 
The other ways in which OP_FAIL can be + generated, that is by (*FAIL) or (?!), disallow a quantifier at parse + time. We can just ignore this repeat. */ + + case OP_FAIL: + goto END_REPEAT; + + /* If previous was a bracket group, we may have to replicate it in + certain cases. Note that at this point we can encounter only the "basic" + bracket opcodes such as BRA and CBRA, as this is the place where they get + converted into the more special varieties such as BRAPOS and SBRA. + Originally, PCRE did not allow repetition of assertions, but now it does, + for Perl compatibility. */ + + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + case OP_ONCE: + case OP_ONCE_NC: + case OP_BRA: + case OP_CBRA: + case OP_COND: + { + int len = (int)(code - previous); + PCRE2_UCHAR *bralink = NULL; + PCRE2_UCHAR *brazeroptr = NULL; + + /* Repeating a DEFINE group (or any group where the condition is always + FALSE and there is only one branch) is pointless, but Perl allows the + syntax, so we just ignore the repeat. */ + + if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && + previous[GET(previous, 1)] != OP_ALT) + goto END_REPEAT; + + /* There is no sense in actually repeating assertions. The only potential + use of repetition is in cases when the assertion is optional. Therefore, + if the minimum is greater than zero, just ignore the repeat. If the + maximum is not zero or one, set it to 1. */ + + if (op_previous < OP_ONCE) /* Assertion */ + { + if (repeat_min > 0) goto END_REPEAT; + if (repeat_max > 1) repeat_max = 1; + } + + /* The case of a zero minimum is special because of the need to stick + OP_BRAZERO in front of it, and because the group appears once in the + data, whereas in other cases it appears the minimum number of times. For + this reason, it is simplest to treat this case separately, as otherwise + the code gets far too messy. There are several special subcases when the + minimum is zero. 
*/ + + if (repeat_min == 0) + { + /* If the maximum is also zero, we used to just omit the group from + the output altogether, like this: + + ** if (repeat_max == 0) + ** { + ** code = previous; + ** goto END_REPEAT; + ** } + + However, that fails when a group or a subgroup within it is + referenced as a subroutine from elsewhere in the pattern, so now we + stick in OP_SKIPZERO in front of it so that it is skipped on + execution. As we don't have a list of which groups are referenced, we + cannot do this selectively. + + If the maximum is 1 or unlimited, we just have to stick in the + BRAZERO and do no more at this point. */ + + if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED) + { + memmove(previous + 1, previous, CU2BYTES(len)); + code++; + if (repeat_max == 0) + { + *previous++ = OP_SKIPZERO; + goto END_REPEAT; + } + brazeroptr = previous; /* Save for possessive optimizing */ + *previous++ = OP_BRAZERO + repeat_type; + } + + /* If the maximum is greater than 1 and limited, we have to replicate + in a nested fashion, sticking OP_BRAZERO before each set of brackets. + The first one has to be handled carefully because it's the original + copy, which has to be moved up. The remainder can be handled by code + that is common with the non-zero minimum case below. We have to + adjust the value or repeat_max, since one less copy is required. */ + + else + { + int linkoffset; + memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; + *previous++ = OP_BRA; + + /* We chain together the bracket link offset fields that have to be + filled in later when the ends of the brackets are reached. */ + + linkoffset = (bralink == NULL)? 
0 : (int)(previous - bralink); + bralink = previous; + PUTINC(previous, 0, linkoffset); + } + + if (repeat_max != REPEAT_UNLIMITED) repeat_max--; + } + + /* If the minimum is greater than zero, replicate the group as many + times as necessary, and adjust the maximum to the number of subsequent + copies that we need. */ + + else + { + if (repeat_min > 1) + { + /* In the pre-compile phase, we don't actually do the replication. + We just adjust the length as if we had. Do some paranoid checks for + potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit + integer type when available, otherwise double. */ + + if (lengthptr != NULL) + { + PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup; + if ((INT64_OR_DOUBLE)(repeat_min - 1)* + (INT64_OR_DOUBLE)length_prevgroup > + (INT64_OR_DOUBLE)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += delta; + } + + /* This is compiling for real. If there is a set first code unit + for the group, and we have not yet set a "required code unit", set + it. */ + + else + { + if (groupsetfirstcu && reqcuflags < 0) + { + reqcu = firstcu; + reqcuflags = firstcuflags; + } + for (i = 1; (uint32_t)i < repeat_min; i++) + { + memcpy(code, previous, CU2BYTES(len)); + code += len; + } + } + } + + if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min; + } + + /* This code is common to both the zero and non-zero minimum cases. If + the maximum is limited, it replicates the group in a nested fashion, + remembering the bracket starts on a stack. In the case of a zero + minimum, the first one was set up above. In all cases the repeat_max + now specifies the number of additional copies needed. Again, we must + remember to replicate entries on the forward reference list. */ + + if (repeat_max != REPEAT_UNLIMITED) + { + /* In the pre-compile phase, we don't actually do the replication. We + just adjust the length as if we had. 
For each repetition we must add + 1 to the length for BRAZERO and for all but the last repetition we + must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some + paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type + is a 64-bit integer type when available, otherwise double. */ + + if (lengthptr != NULL && repeat_max > 0) + { + PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - + 2 - 2*LINK_SIZE; /* Last one doesn't nest */ + if ((INT64_OR_DOUBLE)repeat_max * + (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) + > (INT64_OR_DOUBLE)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += delta; + } + + /* This is compiling for real */ + + else for (i = repeat_max - 1; i >= 0; i--) + { + *code++ = OP_BRAZERO + repeat_type; + + /* All but the final copy start a new nesting, maintaining the + chain of brackets outstanding. */ + + if (i != 0) + { + int linkoffset; + *code++ = OP_BRA; + linkoffset = (bralink == NULL)? 0 : (int)(code - bralink); + bralink = code; + PUTINC(code, 0, linkoffset); + } + + memcpy(code, previous, CU2BYTES(len)); + code += len; + } + + /* Now chain through the pending brackets, and fill in their length + fields (which are holding the chain links pro tem). */ + + while (bralink != NULL) + { + int oldlinkoffset; + int linkoffset = (int)(code - bralink + 1); + PCRE2_UCHAR *bra = code - linkoffset; + oldlinkoffset = GET(bra, 1); + bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; + *code++ = OP_KET; + PUTINC(code, 0, linkoffset); + PUT(bra, 1, linkoffset); + } + } + + /* If the maximum is unlimited, set a repeater in the final copy. For + ONCE brackets, that's all we need to do. However, possessively repeated + ONCE brackets can be converted into non-capturing brackets, as the + behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to + deal with possessive ONCEs specially. 
+ + Otherwise, when we are doing the actual compile phase, check to see + whether this group is one that could match an empty string. If so, + convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so + that runtime checking can be done. [This check is also applied to ONCE + groups at runtime, but in a different way.] + + Then, if the quantifier was possessive and the bracket is not a + conditional, we convert the BRA code to the POS form, and the KET code to + KETRPOS. (It turns out to be convenient at runtime to detect this kind of + subpattern at both the start and at the end.) The use of special opcodes + makes it possible to reduce greatly the stack usage in pcre2_match(). If + the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. + + Then, if the minimum number of matches is 1 or 0, cancel the possessive + flag so that the default action below, of wrapping everything inside + atomic brackets, does not happen. When the minimum is greater than 1, + there will be earlier copies of the group, and so we still have to wrap + the whole thing. */ + + else + { + PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; + PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); + + /* Convert possessive ONCE brackets to non-capturing */ + + if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && + possessive_quantifier) *bracode = OP_BRA; + + /* For non-possessive ONCE brackets, all we need to do is to + set the KET. */ + + if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC) + *ketcode = OP_KETRMAX + repeat_type; + + /* Handle non-ONCE brackets and possessive ONCEs (which have been + converted to non-capturing above). */ + + else + { + /* In the compile phase, adjust the opcode if the group can match + an empty string. For a conditional group with only one branch, the + value of group_return will not show "could be empty", so we must + check that separately. 
*/ + + if (lengthptr == NULL) + { + if (group_return < 0) *bracode += OP_SBRA - OP_BRA; + if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) + *bracode = OP_SCOND; + } + + /* Handle possessive quantifiers. */ + + if (possessive_quantifier) + { + /* For COND brackets, we wrap the whole thing in a possessively + repeated non-capturing bracket, because we have not invented POS + versions of the COND opcodes. */ + + if (*bracode == OP_COND || *bracode == OP_SCOND) + { + int nlen = (int)(code - bracode); + memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); + code += 1 + LINK_SIZE; + nlen += 1 + LINK_SIZE; + *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS; + *code++ = OP_KETRPOS; + PUTINC(code, 0, nlen); + PUT(bracode, 1, nlen); + } + + /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ + + else + { + *bracode += 1; /* Switch to xxxPOS opcodes */ + *ketcode = OP_KETRPOS; + } + + /* If the minimum is zero, mark it as possessive, then unset the + possessive flag when the minimum is 0 or 1. */ + + if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; + if (repeat_min < 2) possessive_quantifier = FALSE; + } + + /* Non-possessive quantifier */ + + else *ketcode = OP_KETRMAX + repeat_type; + } + } + } + break; + + /* If previous was a character type match (\d or similar), abolish it and + create a suitable repeat item. The code is shared with single-character + repeats by setting op_type to add a suitable offset into repeat_type. + Note the the Unicode property types will be present only when + SUPPORT_UNICODE is defined, but we don't wrap the little bits of code + here because it just makes it horribly messy. 
*/ + + default: + if (op_previous >= OP_EODN) /* Not a character type - internal error */ + { + *errorcodeptr = ERR10; + return 0; + } + else + { + int prop_type, prop_value; + PCRE2_UCHAR *oldcode; + + op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ + mclength = 0; /* Not a character */ + + if (op_previous == OP_PROP || op_previous == OP_NOTPROP) + { + prop_type = previous[1]; + prop_value = previous[2]; + } + else + { + /* Come here from just above with a character in mcbuffer/mclength. */ + OUTPUT_SINGLE_REPEAT: + prop_type = prop_value = -1; + } + + /* At this point, if prop_type == prop_value == -1 we either have a + character in mcbuffer when mclength is greater than zero, or we have + mclength zero, in which case there is a non-property character type in + op_previous. If prop_type/value are not negative, we have a property + character type in op_previous. */ + + oldcode = code; /* Save where we were */ + code = previous; /* Usually overwrite previous item */ + + /* If the maximum is zero then the minimum must also be zero; Perl allows + this case, so we do too - by simply omitting the item altogether. */ + + if (repeat_max == 0) goto END_REPEAT; + + /* Combine the op_type with the repeat_type */ + + repeat_type += op_type; + + /* A minimum of zero is handled either as the special case * or ?, or as + an UPTO, with the maximum given. */ + + if (repeat_min == 0) + { + if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type; + else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + + /* A repeat minimum of 1 is optimized into some special cases. If the + maximum is unlimited, we use OP_PLUS. Otherwise, the original item is + left in place and, if the maximum is greater than 1, we use OP_UPTO with + one less than the maximum. 
*/ + + else if (repeat_min == 1) + { + if (repeat_max == REPEAT_UNLIMITED) + *code++ = OP_PLUS + repeat_type; + else + { + code = oldcode; /* Leave previous item in place */ + if (repeat_max == 1) goto END_REPEAT; + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max - 1); + } + } + + /* The case {n,n} is just an EXACT, while the general case {n,m} is + handled as an EXACT followed by an UPTO or STAR or QUERY. */ + + else + { + *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ + PUT2INC(code, 0, repeat_min); + + /* Unless repeat_max equals repeat_min, fill in the data for EXACT, + and then generate the second opcode. For a repeated Unicode property + match, there are two extra values that define the required property, + and mclength is set zero to indicate this. */ + + if (repeat_max != repeat_min) + { + if (mclength > 0) + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + } + else + { + *code++ = op_previous; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + } + + /* Now set up the following opcode */ + + if (repeat_max == REPEAT_UNLIMITED) + *code++ = OP_STAR + repeat_type; + else + { + repeat_max -= repeat_min; + if (repeat_max == 1) + { + *code++ = OP_QUERY + repeat_type; + } + else + { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + } + } + + /* Fill in the character or character type for the final opcode. */ + + if (mclength > 0) + { + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + } + else + { + *code++ = op_previous; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } + } + } + break; + } /* End of switch on different op_previous values */ + + + /* If the character following a repeat is '+', possessive_quantifier is + TRUE. For some opcodes, there are special alternative opcodes for this + case. For anything else, we wrap the entire repeated item inside OP_ONCE + brackets. 
Logically, the '+' notation is just syntactic sugar, taken from + Sun's Java package, but the special opcodes can optimize it. + + Some (but not all) possessively repeated subpatterns have already been + completely handled in the code just above. For them, possessive_quantifier + is always FALSE at this stage. Note that the repeated item starts at + tempcode, not at previous, which might be the first part of a string whose + (former) last char we repeated. */ + + if (possessive_quantifier) + { + int len; + + /* Possessifying an EXACT quantifier has no effect, so we can ignore it. + However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, + {5,}, or {5,10}). We skip over an EXACT item; if the length of what + remains is greater than zero, there's a further opcode that can be + handled. If not, do nothing, leaving the EXACT alone. */ + + switch(*tempcode) + { + case OP_TYPEEXACT: + tempcode += PRIV(OP_lengths)[*tempcode] + + ((tempcode[1 + IMM2_SIZE] == OP_PROP + || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); + break; + + /* CHAR opcodes are used for exacts whose count is 1. */ + + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + tempcode += PRIV(OP_lengths)[*tempcode]; +#ifdef SUPPORT_UNICODE + if (utf && HAS_EXTRALEN(tempcode[-1])) + tempcode += GET_EXTRALEN(tempcode[-1]); +#endif + break; + + /* For the class opcodes, the repeat operator appears at the end; + adjust tempcode to point to it. */ + + case OP_CLASS: + case OP_NCLASS: + tempcode += 1 + 32/sizeof(PCRE2_UCHAR); + break; + +#ifdef SUPPORT_WIDE_CHARS + case OP_XCLASS: + tempcode += GET(tempcode, 1); + break; +#endif + } + + /* If tempcode is equal to code (which points to the end of the repeated + item), it means we have skipped an EXACT item but there is no following + QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. 
In + all other cases, tempcode will be pointing to the repeat opcode, and will + be less than code, so the value of len will be greater than 0. */ + + len = (int)(code - tempcode); + if (len > 0) + { + unsigned int repcode = *tempcode; + + /* There is a table for possessifying opcodes, all of which are less + than OP_CALLOUT. A zero entry means there is no possessified version. + */ + + if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) + *tempcode = opcode_possessify[repcode]; + + /* For opcode without a special possessified version, wrap the item in + ONCE brackets. */ + + else + { + memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; + tempcode[0] = OP_ONCE; + *code++ = OP_KET; + PUTINC(code, 0, len); + PUT(tempcode, 1, len); + } + } + } + + /* We set the "follows varying string" flag for subsequently encountered + reqcus if it isn't already set and we have just passed a varying length + item. */ + + END_REPEAT: + cb->req_varyopt |= reqvary; + break; + + + /* ===================================================================*/ + /* Handle a 32-bit data character with a value greater than META_END. */ + + case META_BIGVALUE: + pptr++; + goto NORMAL_CHAR; + + + /* ===============================================================*/ + /* Handle a back reference by number, which is the meta argument. The + pattern offsets for back references to group numbers less than 10 are held + in a special vector, to avoid using more than two parsed pattern elements + in 64-bit environments. We only need the offset to the first occurrence, + because if that doesn't fail, subsequent ones will also be OK. 
*/ + + case META_BACKREF: + if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg]; + else GETPLUSOFFSET(offset, pptr); + + if (meta_arg > cb->bracount) + { + cb->erroroffset = offset; + *errorcodeptr = ERR15; /* Non-existent subpattern */ + return 0; + } + + /* Come here from named backref handling when the reference is to a + single group (that is, not to a duplicated name). The back reference + data will have already been updated. We must disable firstcu if not + set, to cope with cases like (?=(\w+))\1: which would otherwise set ':' + later. */ + + HANDLE_SINGLE_REFERENCE: + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; + PUT2INC(code, 0, meta_arg); + + /* Update the map of back references, and keep the highest one. We + could do this in parse_regex() for numerical back references, but not + for named back references, because we don't know the numbers to which + named back references refer. So we do it all in this function. */ + + cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1; + if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; + + /* Check to see if this back reference is recursive, that it, it + is inside the group that it references. A flag is set so that the + group can be made atomic. */ + + for (oc = cb->open_caps; oc != NULL; oc = oc->next) + { + if (oc->number == meta_arg) + { + oc->flag = TRUE; + break; + } + } + break; + + + /* ===============================================================*/ + /* Handle recursion by inserting the number of the called group (which is + the meta argument) after OP_RECURSE. At the end of compiling the pattern is + scanned and these numbers are replaced by offsets within the pattern. It is + done like this to avoid problems with forward references and adjusting + offsets when groups are duplicated and moved (as discovered in previous + implementations). 
Note that a recursion does not have a set first character + (relevant if it is repeated, because it will then be wrapped with ONCE + brackets). */ + + case META_RECURSE: + GETPLUSOFFSET(offset, pptr); + if (meta_arg > cb->bracount) + { + cb->erroroffset = offset; + *errorcodeptr = ERR15; /* Non-existent subpattern */ + return 0; + } + HANDLE_NUMERICAL_RECURSION: + *code = OP_RECURSE; + PUT(code, 1, meta_arg); + code += 1 + LINK_SIZE; + groupsetfirstcu = FALSE; + cb->had_recurse = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + break; + + + /* ===============================================================*/ + /* Handle capturing parentheses; the number is the meta argument. */ + + case META_CAPTURE: + bravalue = OP_CBRA; + skipunits = IMM2_SIZE; + PUT2(code, 1+LINK_SIZE, meta_arg); + cb->lastcapture = meta_arg; + goto GROUP_PROCESS_NOTE_EMPTY; + + + /* ===============================================================*/ + /* Handle escape sequence items. For ones like \d, the ESC_values are + arranged to be the same as the corresponding OP_values in the default case + when PCRE2_UCP is not set (which is the only case in which they will appear + here). + + Note: \Q and \E are never seen here, as they were dealt with in + parse_pattern(). Neither are numerical back references or recursions, which + were turned into META_BACKREF or META_RECURSE items, respectively. \k and + \g, when followed by names, are turned into META_BACKREF_BYNAME or + META_RECURSE_BYNAME. */ + + case META_ESCAPE: + + /* We can test for escape sequences that consume a character because their + values lie between ESC_b and ESC_Z; this may have to change if any new ones + are ever created. For these sequences, we disable the setting of a first + character if it hasn't already been set. 
*/ + + if (meta_arg > ESC_b && meta_arg < ESC_Z) + { + matched_char = TRUE; + if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; + } + + /* Set values to reset to if this is followed by a zero repeat. */ + + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If Unicode is not supported, \P and \p are not allowed and are + faulted at parse time, so will never appear here. */ + +#ifdef SUPPORT_UNICODE + if (meta_arg == ESC_P || meta_arg == ESC_p) + { + uint32_t ptype = *(++pptr) >> 16; + uint32_t pdata = *pptr & 0xffff; + *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + break; /* End META_ESCAPE */ + } +#endif + + /* For the rest (including \X when Unicode is supported - if not it's + faulted at parse time), the OP value is the escape value when PCRE2_UCP is + not set; if it is set, these escapes do not show up here because they are + converted into Unicode property tests in parse_regex(). Note that \b and \B + do a one-character lookbehind, and \A also behaves as if it does. */ + + if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */ + if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) && + cb->max_lookbehind == 0) + cb->max_lookbehind = 1; + + /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY + instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 + *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg; +#else + *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg; +#endif + break; /* End META_ESCAPE */ + + + /* ===================================================================*/ + /* Handle an unrecognized meta value. A parsed pattern value less than + META_END is a literal. Otherwise we have a problem. 
*/ + + default: + if (meta >= META_END) + { +#ifdef DEBUG_SHOW_PARSED + fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr); +#endif + *errorcodeptr = ERR89; /* Internal error - unrecognized. */ + return 0; + } + + /* Handle a literal character. We come here by goto in the case of a + 32-bit, non-UTF character whose value is greater than META_END. */ + + NORMAL_CHAR: + meta = *pptr; /* Get the full 32 bits */ + NORMAL_CHAR_SET: /* Character is already in meta */ + matched_char = TRUE; + + /* For caseless UTF mode, check whether this character has more than one + other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ + +#ifdef SUPPORT_UNICODE + if (utf && (options & PCRE2_CASELESS) != 0) + { + uint32_t caseset = UCD_CASESET(meta); + if (caseset != 0) + { + *code++ = OP_PROP; + *code++ = PT_CLIST; + *code++ = caseset; + if (firstcuflags == REQ_UNSET) + firstcuflags = zerofirstcuflags = REQ_NONE; + break; /* End handling this meta item */ + } + } +#endif + + /* Caseful matches, or not one of the multicase characters. Get the + character's code units into mcbuffer, with the length in mclength. When not + in UTF mode, the length is always 1. */ + +#ifdef SUPPORT_UNICODE + if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else +#endif + { + mclength = 1; + mcbuffer[0] = meta; + } + + /* Generate the appropriate code */ + + *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; + memcpy(code, mcbuffer, CU2BYTES(mclength)); + code += mclength; + + /* Remember if \r or \n were seen */ + + if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) + cb->external_flags |= PCRE2_HASCRORLF; + + /* Set the first and required code units appropriately. If no previous + first code unit, set it from this character, but revert to none on a zero + repeat. Otherwise, leave the firstcu value alone, and don't change it on + a zero repeat. 
*/ + + if (firstcuflags == REQ_UNSET) + { + zerofirstcuflags = REQ_NONE; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + + /* If the character is more than one code unit long, we can set firstcu + only if it is not to be matched caselessly. */ + + if (mclength == 1 || req_caseopt == 0) + { + firstcu = mcbuffer[0] | req_caseopt; + firstcu = mcbuffer[0]; + firstcuflags = req_caseopt; + if (mclength != 1) + { + reqcu = code[-1]; + reqcuflags = cb->req_varyopt; + } + } + else firstcuflags = reqcuflags = REQ_NONE; + } + + /* firstcu was previously set; we can set reqcu only if the length is + 1 or the matching is caseful. */ + + else + { + zerofirstcu = firstcu; + zerofirstcuflags = firstcuflags; + zeroreqcu = reqcu; + zeroreqcuflags = reqcuflags; + if (mclength == 1 || req_caseopt == 0) + { + reqcu = code[-1]; + reqcuflags = req_caseopt | cb->req_varyopt; + } + } + break; /* End default meta handling */ + } /* End of big switch */ + } /* End of big loop */ + +/* Control never reaches here. */ +} + + + +/************************************************* +* Compile regex: a sequence of alternatives * +*************************************************/ + +/* On entry, pptr is pointing past the bracket meta, but on return it points to +the closing bracket or META_END. The code variable is pointing at the code unit +into which the BRA operator has been stored. This function is used during the +pre-compile phase when we are trying to find out the amount of memory needed, +as well as during the real compile phase. The value of lengthptr distinguishes +the two phases. 
+ +Arguments: + options option bits, including any changes for this subpattern + codeptr -> the address of the current code pointer + pptrptr -> the address of the current parsed pattern pointer + errorcodeptr -> pointer to error code variable + skipunits skip this many code units at start (for brackets and OP_COND) + firstcuptr place to put the first required code unit + firstcuflagsptr place to put the first code unit flags, or a negative number + reqcuptr place to put the last required code unit + reqcuflagsptr place to put the last required code unit flags, or a negative number + bcptr pointer to the chain of currently open branches + cb points to the data block with tables pointers etc. + lengthptr NULL during the real compile phase + points to length accumulator during pre-compile phase + +Returns: 0 There has been an error + +1 Success, this group must match at least one character + -1 Success, this group may match an empty string +*/ + +static int +compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr, + int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr, + int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr, + branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr) +{ +PCRE2_UCHAR *code = *codeptr; +PCRE2_UCHAR *last_branch = code; +PCRE2_UCHAR *start_bracket = code; +BOOL lookbehind; +open_capitem capitem; +int capnumber = 0; +int okreturn = 1; +uint32_t *pptr = *pptrptr; +uint32_t firstcu, reqcu; +uint32_t lookbehindlength; +int32_t firstcuflags, reqcuflags; +uint32_t branchfirstcu, branchreqcu; +int32_t branchfirstcuflags, branchreqcuflags; +PCRE2_SIZE length; +branch_chain bc; + +/* If set, call the external function that checks for stack availability. 
*/ + +if (cb->cx->stack_guard != NULL && + cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) + { + *errorcodeptr= ERR33; + return 0; } + +/* Miscellaneous initialization */ + +bc.outer = bcptr; +bc.current_branch = code; + +firstcu = reqcu = 0; +firstcuflags = reqcuflags = REQ_UNSET; + +/* Accumulate the length for use in the pre-compile phase. Start with the +length of the BRA and KET and any extra code units that are required at the +beginning. We accumulate in a local variable to save frequent testing of +lengthptr for NULL. We cannot do this by looking at the value of 'code' at the +start and end of each alternative, because compiled items are discarded during +the pre-compile phase so that the work space is not exceeded. */ + +length = 2 + 2*LINK_SIZE + skipunits; + +/* Remember if this is a lookbehind assertion, and if it is, save its length +and skip over the pattern offset. */ + +lookbehind = *code == OP_ASSERTBACK || *code == OP_ASSERTBACK_NOT; +if (lookbehind) + { + lookbehindlength = META_DATA(pptr[-1]); + pptr += SIZEOFFSET; + } +else lookbehindlength = 0; + +/* If this is a capturing subpattern, add to the chain of open capturing items +so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA +need be tested here; changing this opcode to one of its variants, e.g. +OP_SCBRAPOS, happens later, after the group has been compiled. */ + +if (*code == OP_CBRA) + { + capnumber = GET2(code, 1 + LINK_SIZE); + capitem.number = capnumber; + capitem.next = cb->open_caps; + capitem.flag = FALSE; + cb->open_caps = &capitem; + } + +/* Offset is set zero to mark that this bracket is still open */ + +PUT(code, 1, 0); +code += 1 + LINK_SIZE + skipunits; + +/* Loop for each alternative branch */ + +for (;;) + { + int branch_return; + + /* Insert OP_REVERSE if this is as lookbehind assertion. 
*/ + + if (lookbehind && lookbehindlength > 0) + { + *code++ = OP_REVERSE; + PUTINC(code, 0, lookbehindlength); + length += 1 + LINK_SIZE; + } + + /* Now compile the branch; in the pre-compile phase its length gets added + into the length. */ + + if ((branch_return = + compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu, + &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, + cb, (lengthptr == NULL)? NULL : &length)) == 0) + return 0; + + /* If a branch can match an empty string, so can the whole group. */ + + if (branch_return < 0) okreturn = -1; + + /* In the real compile phase, there is some post-processing to be done. */ + + if (lengthptr == NULL) + { + /* If this is the first branch, the firstcu and reqcu values for the + branch become the values for the regex. */ + + if (*last_branch != OP_ALT) + { + firstcu = branchfirstcu; + firstcuflags = branchfirstcuflags; + reqcu = branchreqcu; + reqcuflags = branchreqcuflags; + } + + /* If this is not the first branch, the first char and reqcu have to + match the values from all the previous branches, except that if the + previous value for reqcu didn't have REQ_VARY set, it can still match, + and we set REQ_VARY for the regex. */ + + else + { + /* If we previously had a firstcu, but it doesn't match the new branch, + we have to abandon the firstcu for the regex, but if there was + previously no reqcu, it takes on the value of the old firstcu. */ + + if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) + { + if (firstcuflags >= 0) + { + if (reqcuflags < 0) + { + reqcu = firstcu; + reqcuflags = firstcuflags; + } + } + firstcuflags = REQ_NONE; + } + + /* If we (now or from before) have no firstcu, a firstcu from the + branch becomes a reqcu if there isn't a branch reqcu. 
*/ + + if (firstcuflags < 0 && branchfirstcuflags >= 0 && + branchreqcuflags < 0) + { + branchreqcu = branchfirstcu; + branchreqcuflags = branchfirstcuflags; + } + + /* Now ensure that the reqcus match */ + + if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || + reqcu != branchreqcu) + reqcuflags = REQ_NONE; + else + { + reqcu = branchreqcu; + reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */ + } + } + } + + /* Handle reaching the end of the expression, either ')' or end of pattern. + In the real compile phase, go back through the alternative branches and + reverse the chain of offsets, with the field in the BRA item now becoming an + offset to the first alternative. If there are no alternatives, it points to + the end of the group. The length in the terminating ket is always the length + of the whole bracketed item. Return leaving the pointer at the terminating + char. */ + + if (META_CODE(*pptr) != META_ALT) + { + if (lengthptr == NULL) + { + PCRE2_SIZE branch_length = code - last_branch; + do + { + PCRE2_SIZE prev_length = GET(last_branch, 1); + PUT(last_branch, 1, branch_length); + branch_length = prev_length; + last_branch -= branch_length; + } + while (branch_length > 0); + } + + /* Fill in the ket */ + + *code = OP_KET; + PUT(code, 1, (int)(code - start_bracket)); + code += 1 + LINK_SIZE; + + /* If it was a capturing subpattern, check to see if it contained any + recursive back references. If so, we must wrap it in atomic brackets. In + any event, remove the block from the chain. 
*/ + + if (capnumber > 0) + { + if (cb->open_caps->flag) + { + memmove(start_bracket + 1 + LINK_SIZE, start_bracket, + CU2BYTES(code - start_bracket)); + *start_bracket = OP_ONCE; + code += 1 + LINK_SIZE; + PUT(start_bracket, 1, (int)(code - start_bracket)); + *code = OP_KET; + PUT(code, 1, (int)(code - start_bracket)); + code += 1 + LINK_SIZE; + length += 2 + 2*LINK_SIZE; + } + cb->open_caps = cb->open_caps->next; + } + + /* Set values to pass back */ + + *codeptr = code; + *pptrptr = pptr; + *firstcuptr = firstcu; + *firstcuflagsptr = firstcuflags; + *reqcuptr = reqcu; + *reqcuflagsptr = reqcuflags; + if (lengthptr != NULL) + { + if (OFLOW_MAX - *lengthptr < length) + { + *errorcodeptr = ERR20; + return 0; + } + *lengthptr += length; + } + return okreturn; + } + + /* Another branch follows. In the pre-compile phase, we can move the code + pointer back to where it was for the start of the first branch. (That is, + pretend that each branch is the only one.) + + In the real compile phase, insert an ALT node. Its length field points back + to the previous branch while the bracket remains open. At the end the chain + is reversed. It's done like this so that the start of the bracket has a + zero offset until it is closed, making it possible to detect recursion. */ + + if (lengthptr != NULL) + { + code = *codeptr + 1 + LINK_SIZE + skipunits; + length += 1 + LINK_SIZE; + } + else + { + *code = OP_ALT; + PUT(code, 1, (int)(code - last_branch)); + bc.current_branch = last_branch = code; + code += 1 + LINK_SIZE; + } + + /* Set the lookbehind length (if not in a lookbehind the value will be zero) + and then advance past the vertical bar. */ + + lookbehindlength = META_DATA(*pptr); + pptr++; + } +/* Control never reaches here */ +} + + + +/************************************************* +* Check for anchored pattern * +*************************************************/ + +/* Try to find out if this is an anchored regular expression. Consider each +alternative branch. 
If they all start with OP_SOD or OP_CIRC, or with a bracket +all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then +it's anchored. However, if this is a multiline pattern, then only OP_SOD will +be found, because ^ generates OP_CIRCM in that mode. + +We can also consider a regex to be anchored if OP_SOM starts all its branches. +This is the code for \G, which means "match at start of match position, taking +into account the match offset". + +A branch is also implicitly anchored if it starts with .* and DOTALL is set, +because that will try the rest of the pattern at all possible matching points, +so there is no point trying again.... er .... + +.... except when the .* appears inside capturing parentheses, and there is a +subsequent back reference to those parentheses. We haven't enough information +to catch that case precisely. + +At first, the best we could do was to detect when .* was in capturing brackets +and the highest back reference was greater than or equal to that level. +However, by keeping a bitmap of the first 31 back references, we can catch some +of the more common cases more precisely. + +... A second exception is when the .* appears inside an atomic group, because +this prevents the number of characters it matches from being adjusted. 
+ +Arguments: + code points to start of the compiled pattern + bracket_map a bitmap of which brackets we are inside while testing; this + handles up to substring 31; after that we just have to take + the less precise approach + cb points to the compile data block + atomcount atomic group level + inassert TRUE if in an assertion + +Returns: TRUE or FALSE +*/ + +static BOOL +is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, + int atomcount, BOOL inassert) +{ +do { + PCRE2_SPTR scode = first_significant_code( + code + PRIV(OP_lengths)[*code], FALSE); + int op = *scode; + + /* Non-capturing brackets */ + + if (op == OP_BRA || op == OP_BRAPOS || + op == OP_SBRA || op == OP_SBRAPOS) + { + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; + } + + /* Capturing brackets */ + + else if (op == OP_CBRA || op == OP_CBRAPOS || + op == OP_SCBRA || op == OP_SCBRAPOS) + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)? (1u << n) : 1); + if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; + } + + /* Positive forward assertion */ + + else if (op == OP_ASSERT) + { + if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + } + + /* Condition */ + + else if (op == OP_COND) + { + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; + } + + /* Atomic groups */ + + else if (op == OP_ONCE || op == OP_ONCE_NC) + { + if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) + return FALSE; + } + + /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and + it isn't in brackets that are or may be referenced or inside an atomic + group or an assertion. Also the pattern must not contain *PRUNE or *SKIP, + because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/ + with the subject "aab", which matches "b", i.e. not at the start of a line. + There is also an option that disables auto-anchoring. 
*/ + + else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || + op == OP_TYPEPOSSTAR)) + { + if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || + atomcount > 0 || cb->had_pruneorskip || inassert || + (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) + return FALSE; + } + + /* Check for explicit anchoring */ + + else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; + + code += GET(code, 1); + } +while (*code == OP_ALT); /* Loop for each alternative */ +return TRUE; +} + + + +/************************************************* +* Check for starting with ^ or .* * +*************************************************/ + +/* This is called to find out if every branch starts with ^ or .* so that +"first char" processing can be done to speed things up in multiline +matching and for non-DOTALL patterns that start with .* (which must start at +the beginning or after \n). As in the case of is_anchored() (see above), we +have to take account of back references to capturing brackets that contain .* +because in that case we can't make the assumption. Also, the appearance of .* +inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE +or *SKIP does not count, because once again the assumption no longer holds. 
+ +Arguments: + code points to start of the compiled pattern or a group + bracket_map a bitmap of which brackets we are inside while testing; this + handles up to substring 31; after that we just have to take + the less precise approach + cb points to the compile data + atomcount atomic group level + inassert TRUE if in an assertion + +Returns: TRUE or FALSE +*/ + +static BOOL +is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, + int atomcount, BOOL inassert) +{ +do { + PCRE2_SPTR scode = first_significant_code( + code + PRIV(OP_lengths)[*code], FALSE); + int op = *scode; + + /* If we are at the start of a conditional assertion group, *both* the + conditional assertion *and* what follows the condition must satisfy the test + for start of line. Other kinds of condition fail. Note that there may be an + auto-callout at the start of a condition. */ + + if (op == OP_COND) + { + scode += 1 + LINK_SIZE; + + if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; + else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); + + switch (*scode) + { + case OP_CREF: + case OP_DNCREF: + case OP_RREF: + case OP_DNRREF: + case OP_FAIL: + case OP_FALSE: + case OP_TRUE: + return FALSE; + + default: /* Assertion */ + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + do scode += GET(scode, 1); while (*scode == OP_ALT); + scode += 1 + LINK_SIZE; + break; + } + scode = first_significant_code(scode, FALSE); + op = *scode; + } + + /* Non-capturing brackets */ + + if (op == OP_BRA || op == OP_BRAPOS || + op == OP_SBRA || op == OP_SBRAPOS) + { + if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) + return FALSE; + } + + /* Capturing brackets */ + + else if (op == OP_CBRA || op == OP_CBRAPOS || + op == OP_SCBRA || op == OP_SCBRAPOS) + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)? 
(1u << n) : 1); + if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; + } + + /* Positive forward assertions */ + + else if (op == OP_ASSERT) + { + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) + return FALSE; + } + + /* Atomic brackets */ + + else if (op == OP_ONCE || op == OP_ONCE_NC) + { + if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) + return FALSE; + } + + /* .* means "start at start or after \n" if it isn't in atomic brackets or + brackets that may be referenced or an assertion, and as long as the pattern + does not contain *PRUNE or *SKIP, because these break the feature. Consider, + for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", + i.e. not at the start of a line. There is also an option that disables this + optimization. */ + + else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) + { + if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || + atomcount > 0 || cb->had_pruneorskip || inassert || + (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) + return FALSE; + } + + /* Check for explicit circumflex; anything else gives a FALSE result. Note + in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC + because the number of characters matched by .* cannot be adjusted inside + them. 
*/ + + else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; + + /* Move on to the next alternative */ + + code += GET(code, 1); + } +while (*code == OP_ALT); /* Loop for each alternative */ +return TRUE; } @@ -2521,7 +7755,7 @@ find_recurse(PCRE2_SPTR code, BOOL utf) { for (;;) { - register PCRE2_UCHAR c = *code; + PCRE2_UCHAR c = *code; if (c == OP_END) return NULL; if (c == OP_RECURSE) return code; @@ -2535,8 +7769,8 @@ for (;;) /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we - must add in its length. */ + two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, + we must add in its length. */ else { @@ -2649,5243 +7883,6 @@ for (;;) -/************************************************* -* Adjust OP_RECURSE items in repeated group * -*************************************************/ - -/* OP_RECURSE items contain an offset from the start of the regex to the group -that is referenced. This means that groups can be replicated for fixed -repetition simply by copying (because the recursion is allowed to refer to -earlier groups that are outside the current group). However, when a group is -optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is -inserted before it, after it has been compiled. This means that any OP_RECURSE -items within it that refer to the group itself or any contained groups have to -have their offsets adjusted. That is one of the jobs of this function. Before -it is called, the partially compiled regex must be temporarily terminated with -OP_END. - -This function has been extended to cope with forward references for recursions -and subroutine calls. It must check the list of such references for the -group we are dealing with. 
If it finds that one of the recursions in the -current group is on this list, it does not adjust the value in the reference -(which is a group number). After the group has been scanned, all the offsets in -the forward reference list for the group are adjusted. - -Arguments: - group points to the start of the group - adjust the amount by which the group is to be moved - utf TRUE in UTF mode - cb compile data - save_hwm_offset the hwm forward reference offset at the start of the group - -Returns: nothing -*/ - -static void -adjust_recurse(PCRE2_UCHAR *group, int adjust, BOOL utf, compile_block *cb, - size_t save_hwm_offset) -{ -uint32_t offset; -PCRE2_UCHAR *hc; -PCRE2_UCHAR *ptr = group; - -/* Scan the group for recursions. For each one found, check the forward -reference list. */ - -while ((ptr = (PCRE2_UCHAR *)find_recurse(ptr, utf)) != NULL) - { - for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm; - hc += LINK_SIZE) - { - offset = (int)GET(hc, 0); - if (cb->start_code + offset == ptr + 1) break; - } - - /* If we have not found this recursion on the forward reference list, adjust - the recursion's offset if it's after the start of this group. */ - - if (hc >= cb->hwm) - { - offset = (int)GET(ptr, 1); - if (cb->start_code + offset >= group) PUT(ptr, 1, offset + adjust); - } - - ptr += 1 + LINK_SIZE; - } - -/* Now adjust all forward reference offsets for the group. */ - -for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm; - hc += LINK_SIZE) - { - offset = (int)GET(hc, 0); - PUT(hc, 0, offset + adjust); - } -} - - - -/************************************************* -* Check for POSIX class syntax * -*************************************************/ - -/* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by a -sequence of characters terminated by a matching ":]" or ".]" or "=]". 
If we -reach an unescaped ']' without the special preceding character, return FALSE. - -Originally, this function only recognized a sequence of letters between the -terminators, but it seems that Perl recognizes any sequence of characters, -though of course unknown POSIX names are subsequently rejected. Perl gives an -"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE -didn't consider this to be a POSIX class. Likewise for [:1234:]. - -The problem in trying to be exactly like Perl is in the handling of escapes. We -have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX -class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code -below handles the special case of \], but does not try to do any other escape -processing. This makes it different from Perl for cases such as [:l\ower:] -where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize -"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, -I think. - -A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. -It seems that the appearance of a nested POSIX class supersedes an apparent -external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or -a digit. - -In Perl, unescaped square brackets may also appear as part of class names. For -example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for -[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not -seem right at all. PCRE does not allow closing square brackets in POSIX class -names. - -Arguments: - ptr pointer to the initial [ - endptr where to return a pointer to the terminating ':', '.', or '=' - -Returns: TRUE or FALSE -*/ - -static BOOL -check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr) -{ -PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */ -terminator = *(++ptr); /* compiler warns about "non-constant" initializer. 
*/ - -for (++ptr; *ptr != CHAR_NULL; ptr++) - { - if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; - else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; - else - { - if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) - { - *endptr = ptr; - return TRUE; - } - if (*ptr == CHAR_LEFT_SQUARE_BRACKET && - (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, endptr)) - return FALSE; - } - } -return FALSE; -} - - - -/************************************************* -* Check POSIX class name * -*************************************************/ - -/* This function is called to check the name given in a POSIX-style class entry -such as [:alnum:]. - -Arguments: - ptr points to the first letter - len the length of the name - -Returns: a value representing the name, or -1 if unknown -*/ - -static int -check_posix_name(PCRE2_SPTR ptr, int len) -{ -const char *pn = posix_names; -register int yield = 0; -while (posix_name_lengths[yield] != 0) - { - if (len == posix_name_lengths[yield] && - PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield; - pn += posix_name_lengths[yield] + 1; - yield++; - } -return -1; -} - - - -#ifdef SUPPORT_UNICODE -/************************************************* -* Get othercase range * -*************************************************/ - -/* This function is passed the start and end of a class range in UCT mode. It -searches up the characters, looking for ranges of characters in the "other" -case. Each call returns the next one, updating the start address. A character -with multiple other cases is returned on its own with a special return value. 
- -Arguments: - cptr points to starting character value; updated - d end value - ocptr where to put start of othercase range - odptr where to put end of othercase range - -Yield: -1 when no more - 0 when a range is returned - >0 the CASESET offset for char with multiple other cases - in this case, ocptr contains the original -*/ - -static int -get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr, - uint32_t *odptr) -{ -uint32_t c, othercase, next; -unsigned int co; - -/* Find the first character that has an other case. If it has multiple other -cases, return its case offset value. */ - -for (c = *cptr; c <= d; c++) - { - if ((co = UCD_CASESET(c)) != 0) - { - *ocptr = c++; /* Character that has the set */ - *cptr = c; /* Rest of input range */ - return (int)co; - } - if ((othercase = UCD_OTHERCASE(c)) != c) break; - } - -if (c > d) return -1; /* Reached end of range */ - -/* Found a character that has a single other case. Search for the end of the -range, which is either the end of the input range, or a character that has zero -or more than one other cases. */ - -*ocptr = othercase; -next = othercase + 1; - -for (++c; c <= d; c++) - { - if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; - next++; - } - -*odptr = next - 1; /* End of othercase range */ -*cptr = c; /* Rest of input range */ -return 0; -} -#endif /* SUPPORT_UNICODE */ - - - -/************************************************* -* Add a character or range to a class * -*************************************************/ - -/* This function packages up the logic of adding a character or range of -characters to a class. The character values in the arguments will be within the -valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is -mutually recursive with the function immediately below. 
- -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb compile data - start start of range character - end end of range character - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static int -add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - compile_block *cb, uint32_t start, uint32_t end) -{ -uint32_t c; -uint32_t classbits_end = (end <= 0xff ? end : 0xff); -int n8 = 0; - -/* If caseless matching is required, scan the range and process alternate -cases. In Unicode, there are 8-bit characters that have alternate cases that -are greater than 255 and vice-versa. Sometimes we can just extend the original -range. */ - -if ((options & PCRE2_CASELESS) != 0) - { -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UTF) != 0) - { - int rc; - uint32_t oc, od; - - options &= ~PCRE2_CASELESS; /* Remove for recursive calls */ - c = start; - - while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) - { - /* Handle a single character that has more than one other case. */ - - if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb, - PRIV(ucd_caseless_sets) + rc, oc); - - /* Do nothing if the other case range is within the original range. */ - - else if (oc >= start && od <= end) continue; - - /* Extend the original range if there is overlap, noting that if oc < c, we - can't have od > end because a subrange is always shorter than the basic - range. Otherwise, use a recursive call to add the additional range. */ - - else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ - else if (od > end && oc <= end + 1) - { - end = od; /* Extend upwards */ - if (end > classbits_end) classbits_end = (end <= 0xff ? 
end : 0xff); - } - else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Not UTF mode */ - - for (c = start; c <= classbits_end; c++) - { - SETBIT(classbits, cb->fcc[c]); - n8++; - } - } - -/* Now handle the original range. Adjust the final value according to the bit -length - this means that the same lists of (e.g.) horizontal spaces can be used -in all cases. */ - -if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) - end = MAX_NON_UTF_CHAR; - -/* Use the bitmap for characters < 256. Otherwise use extra data.*/ - -for (c = start; c <= classbits_end; c++) - { - /* Regardless of start, c will always be <= 255. */ - SETBIT(classbits, c); - n8++; - } - -#ifdef SUPPORT_WIDE_CHARS -if (start <= 0xff) start = 0xff + 1; - -if (end >= start) - { - PCRE2_UCHAR *uchardata = *uchardptr; - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UTF) != 0) - { - if (start < end) - { - *uchardata++ = XCL_RANGE; - uchardata += PRIV(ord2utf)(start, uchardata); - uchardata += PRIV(ord2utf)(end, uchardata); - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - uchardata += PRIV(ord2utf)(start, uchardata); - } - } - else -#endif /* SUPPORT_UNICODE */ - - /* Without UTF support, character values are constrained by the bit length, - and can only be > 256 for 16-bit and 32-bit libraries. 
*/ - -#if PCRE2_CODE_UNIT_WIDTH == 8 - {} -#else - if (start < end) - { - *uchardata++ = XCL_RANGE; - *uchardata++ = start; - *uchardata++ = end; - } - else if (start == end) - { - *uchardata++ = XCL_SINGLE; - *uchardata++ = start; - } -#endif - *uchardptr = uchardata; /* Updata extra data pointer */ - } -#else - (void)uchardptr; /* Avoid compiler warning */ -#endif /* SUPPORT_WIDE_CHARS */ - -return n8; /* Number of 8-bit characters */ -} - - - -/************************************************* -* Add a list of characters to a class * -*************************************************/ - -/* This function is used for adding a list of case-equivalent characters to a -class, and also for adding a list of horizontal or vertical whitespace. If the -list is in order (which it should be), ranges of characters are detected and -handled appropriately. This function is mutually recursive with the function -above. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb contains pointers to tables etc. 
- p points to row of 32-bit values, terminated by NOTACHAR - except character to omit; this is used when adding lists of - case-equivalent characters to avoid including the one we - already know about - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static int -add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, - compile_block *cb, const uint32_t *p, unsigned int except) -{ -int n8 = 0; -while (p[0] < NOTACHAR) - { - int n = 0; - if (p[0] != except) - { - while(p[n+1] == p[0] + n + 1) n++; - n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]); - } - p += n + 1; - } -return n8; -} - - - -/************************************************* -* Add characters not in a list to a class * -*************************************************/ - -/* This function is used for adding the complement of a list of horizontal or -vertical whitespace to a class. The list must be in order. - -Arguments: - classbits the bit map for characters < 256 - uchardptr points to the pointer for extra data - options the options word - cb contains pointers to tables etc. - p points to row of 32-bit values, terminated by NOTACHAR - -Returns: the number of < 256 characters added - the pointer to extra data is updated -*/ - -static int -add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, - uint32_t options, compile_block *cb, const uint32_t *p) -{ -BOOL utf = (options & PCRE2_UTF) != 0; -int n8 = 0; -if (p[0] > 0) - n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1); -while (p[0] < NOTACHAR) - { - while (p[1] == p[0] + 1) p++; - n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1, - (p[1] == NOTACHAR) ? (utf ? 
0x10ffffu : 0xffffffffu) : p[1] - 1); - p++; - } -return n8; -} - - - -/************************************************* -* Scan regex to identify named groups * -*************************************************/ - -/* This function is called first of all, to scan for named capturing groups so -that information about them is fully available to both the compiling scans. -It skips over everything except parenthesized items. - -Arguments: - ptrptr points to pointer to the start of the pattern - options compiling dynamic options - cb pointer to the compile data block - -Returns: zero on success or a non-zero error code, with pointer updated -*/ - -typedef struct nest_save { - uint16_t nest_depth; - uint16_t reset_group; - uint16_t max_group; - uint16_t flags; -} nest_save; - -#define NSF_RESET 0x0001u -#define NSF_EXTENDED 0x0002u -#define NSF_DUPNAMES 0x0004u - -static uint32_t scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options, - compile_block *cb) -{ -uint32_t c; -uint32_t nest_depth = 0; -uint32_t set, unset, *optset; -int errorcode = 0; -int escape; -int namelen; -int i; -BOOL inescq = FALSE; -BOOL isdupname; -BOOL utf = (options & PCRE2_UTF) != 0; -BOOL negate_class; -PCRE2_SPTR name; -PCRE2_SPTR ptr = *ptrptr; -named_group *ng; -nest_save *top_nest = NULL; -nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); - -for (; ptr < cb->end_pattern; ptr++) - { - c = *ptr; - - /* Skip over literals */ - - if (inescq) - { - if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) - { - inescq = FALSE; - ptr++; - } - continue; - } - - /* Skip over comments and whitespace in extended mode. Need a loop to handle - whitespace after a comment. */ - - if ((options & PCRE2_EXTENDED) != 0) - { - for (;;) - { - while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr); - if (c != CHAR_NUMBER_SIGN) break; - ptr++; - while (*ptr != CHAR_NULL) - { - if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ - { /* IS_NEWLINE sets cb->nllen. 
*/ - ptr += cb->nllen; - break; - } - ptr++; -#ifdef SUPPORT_UNICODE - if (utf) FORWARDCHAR(ptr); -#endif - } - c = *ptr; /* Either NULL or the char after a newline */ - } - } - - /* Process the next pattern item. */ - - switch(c) - { - default: /* Most characters are just skipped */ - break; - - /* Skip escapes except for \Q */ - - case CHAR_BACKSLASH: - errorcode = 0; - escape = check_escape(&ptr, &c, &errorcode, options, FALSE, cb); - if (errorcode != 0) goto FAILED; - if (escape == ESC_Q) inescq = TRUE; - break; - - /* Skip a character class. The syntax is complicated so we have to - replicate some of what happens when a class is processed for real. */ - - case CHAR_LEFT_SQUARE_BRACKET: - if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 || - PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) - { - ptr += 7; - break; - } - - /* If the first character is '^', set the negation flag (not actually used - here, except to recognize only one ^) and skip it. If the first few - characters (either before or after ^) are \Q\E or \E we skip them too. This - makes for compatibility with Perl. 
*/ - - negate_class = FALSE; - for (;;) - { - c = *(++ptr); /* First character in class */ - if (c == CHAR_BACKSLASH) - { - if (ptr[1] == CHAR_E) - ptr++; - else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) - ptr += 3; - else - break; - } - else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) - negate_class = TRUE; - else break; - } - - if (c == CHAR_RIGHT_SQUARE_BRACKET && - (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) - break; - - /* Loop for the contents of the class */ - - for (;;) - { - if (c == CHAR_NULL && ptr >= cb->end_pattern) - { - errorcode = ERR6; /* Missing terminating ']' */ - goto FAILED; - } - -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(c)) - { /* Braces are required because the */ - GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ - } -#endif - - /* Inside \Q...\E everything is literal except \E */ - - if (inescq) - { - if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ - { - inescq = FALSE; /* Reset literal state */ - ptr++; /* Skip the 'E' */ - } - goto CONTINUE_CLASS; - } - - /* Skip POSIX class names. */ - - if (c == CHAR_LEFT_SQUARE_BRACKET && - (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &ptr)) - ptr ++; - - else if (c == CHAR_BACKSLASH) - { - errorcode = 0; - escape = check_escape(&ptr, &c, &errorcode, options, TRUE, cb); - if (errorcode != 0) goto FAILED; - if (escape == ESC_Q) inescq = TRUE; - } - - CONTINUE_CLASS: - c = *(++ptr); - if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; - } /* End of class-processing loop */ - break; - - /* This is the real work of this function - handling parentheses. 
*/ - - case CHAR_LEFT_PARENTHESIS: - nest_depth++; - - if (ptr[1] != CHAR_QUESTION_MARK) - { - if (ptr[1] != CHAR_ASTERISK && - (options & PCRE2_NO_AUTO_CAPTURE) == 0) - cb->bracount++; /* Capturing group */ - else /* (*something) - just skip to closing ket */ - { - ptr += 2; - while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - } - } - - /* Handle (?...) groups */ - - else switch(ptr[2]) - { - default: - ptr += 2; - if (ptr[0] == CHAR_R || /* (?R) */ - ptr[0] == CHAR_C || /* (?C) */ - IS_DIGIT(ptr[0]) || /* (?n) */ - (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1]))) break; /* (?-n) */ - - /* Handle (?| and (?imsxJU: which are the only other valid forms. Both - need a new block on the nest stack. */ - - if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace); - else if (++top_nest >= end_nests) - { - errorcode = ERR84; - goto FAILED; - } - top_nest->nest_depth = nest_depth; - top_nest->flags = 0; - if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED; - if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES; - - if (*ptr == CHAR_VERTICAL_LINE) - { - top_nest->reset_group = cb->bracount; - top_nest->max_group = cb->bracount; - top_nest->flags |= NSF_RESET; - break; - } - - /* Scan options */ - - top_nest->reset_group = 0; - top_nest->max_group = 0; - - set = unset = 0; - optset = &set; - - /* Need only track (?x: and (?J: at this stage */ - - while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) - { - switch (*ptr++) - { - case CHAR_MINUS: optset = &unset; break; - - case CHAR_x: *optset |= PCRE2_EXTENDED; break; - - case CHAR_J: - *optset |= PCRE2_DUPNAMES; - cb->external_flags |= PCRE2_JCHANGED; - break; - - case CHAR_i: - case CHAR_m: - case CHAR_s: - case CHAR_U: - break; - - default: errorcode = ERR11; - ptr--; /* Correct the offset */ - goto FAILED; - } - } - - options = (options | set) & (~unset); - - /* If the options ended with ')' this is not the start of a nested - group with option changes, 
so the options change at this level. If the - previous level set up a nest block, discard the one we have just created. - Otherwise adjust it for the previous level. */ - - if (*ptr == CHAR_RIGHT_PARENTHESIS) - { - nest_depth--; - if (top_nest > (nest_save *)(cb->start_workspace) && - (top_nest-1)->nest_depth == nest_depth) top_nest --; - else top_nest->nest_depth = nest_depth; - } - break; - - case CHAR_NUMBER_SIGN: - ptr += 3; - while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR18; - goto FAILED; - } - break; - - case CHAR_LEFT_PARENTHESIS: - nest_depth++; - /* Fall through */ - - case CHAR_COLON: - case CHAR_GREATER_THAN_SIGN: - case CHAR_EQUALS_SIGN: - case CHAR_EXCLAMATION_MARK: - case CHAR_AMPERSAND: - case CHAR_PLUS: - ptr += 2; - break; - - case CHAR_P: - if (ptr[3] != CHAR_LESS_THAN_SIGN) - { - ptr += 3; - break; - } - ptr++; - c = CHAR_GREATER_THAN_SIGN; /* Terminator */ - goto DEFINE_NAME; - - case CHAR_LESS_THAN_SIGN: - if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK) - { - ptr += 3; - break; - } - c = CHAR_GREATER_THAN_SIGN; /* Terminator */ - goto DEFINE_NAME; - - case CHAR_APOSTROPHE: - c = CHAR_APOSTROPHE; /* Terminator */ - - DEFINE_NAME: - name = ptr = ptr + 3; - - if (*ptr == c) /* Empty name */ - { - errorcode = ERR62; - goto FAILED; - } - - if (IS_DIGIT(*ptr)) - { - errorcode = ERR44; /* Group name must start with non-digit */ - goto FAILED; - } - - if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0) - { - errorcode = ERR24; - goto FAILED; - } - - while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = (int)(ptr - name); - - if (*ptr != c) - { - errorcode = ERR42; - goto FAILED; - } - - if (cb->names_found >= MAX_NAME_COUNT) - { - errorcode = ERR49; - goto FAILED; - } - - if (namelen + IMM2_SIZE + 1 > cb->name_entry_size) - { - cb->name_entry_size = namelen + IMM2_SIZE + 1; - if (namelen > MAX_NAME_SIZE) - { - 
errorcode = ERR48; - goto FAILED; - } - } - - /* We have a valid name for this capturing group. */ - - cb->bracount++; - - /* Scan the list to check for duplicates. For duplicate names, if the - number is the same, break the loop, which causes the name to be - discarded; otherwise, if DUPNAMES is not set, give an error. - If it is set, allow the name with a different number, but continue - scanning in case this is a duplicate with the same number. For - non-duplicate names, give an error if the number is duplicated. */ - - isdupname = FALSE; - ng = cb->named_groups; - for (i = 0; i < cb->names_found; i++, ng++) - { - if (namelen == ng->length && - PRIV(strncmp)(name, ng->name, namelen) == 0) - { - if (ng->number == cb->bracount) break; - if ((options & PCRE2_DUPNAMES) == 0) - { - errorcode = ERR43; - goto FAILED; - } - isdupname = ng->isdup = TRUE; /* Mark as a duplicate */ - cb->dupnames = TRUE; /* Duplicate names exist */ - } - else if (ng->number == cb->bracount) - { - errorcode = ERR65; - goto FAILED; - } - } - - if (i < cb->names_found) break; /* Ignore duplicate with same number */ - - /* Increase the list size if necessary */ - - if (cb->names_found >= cb->named_group_list_size) - { - int newsize = cb->named_group_list_size * 2; - named_group *newspace = - cb->cx->memctl.malloc(newsize * sizeof(named_group), - cb->cx->memctl.memory_data); - if (newspace == NULL) - { - errorcode = ERR21; - goto FAILED; - } - - memcpy(newspace, cb->named_groups, - cb->named_group_list_size * sizeof(named_group)); - if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE) - cb->cx->memctl.free((void *)cb->named_groups, - cb->cx->memctl.memory_data); - cb->named_groups = newspace; - cb->named_group_list_size = newsize; - } - - /* Add this name to the list */ - - cb->named_groups[cb->names_found].name = name; - cb->named_groups[cb->names_found].length = namelen; - cb->named_groups[cb->names_found].number = cb->bracount; - cb->named_groups[cb->names_found].isdup = isdupname; - 
cb->names_found++; - break; - } /* End of (? switch */ - break; /* End of ( handling */ - - /* At an alternation, reset the capture count if we are in a (?| group. */ - - case CHAR_VERTICAL_LINE: - if (top_nest != NULL && top_nest->nest_depth == nest_depth && - (top_nest->flags & NSF_RESET) != 0) - { - if (cb->bracount > top_nest->max_group) - top_nest->max_group = cb->bracount; - cb->bracount = top_nest->reset_group; - } - break; - - /* At a right parenthesis, reset the capture count to the maximum if we - are in a (?| group and/or reset the extended option. */ - - case CHAR_RIGHT_PARENTHESIS: - if (top_nest != NULL && top_nest->nest_depth == nest_depth) - { - if ((top_nest->flags & NSF_RESET) != 0 && - top_nest->max_group > cb->bracount) - cb->bracount = top_nest->max_group; - if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED; - else options &= ~PCRE2_EXTENDED; - if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES; - else options &= ~PCRE2_DUPNAMES; - if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; - else top_nest--; - } - nest_depth--; - break; - } - } - -cb->final_bracount = cb->bracount; -return 0; - -FAILED: -*ptrptr = ptr; -return errorcode; -} - - - -/************************************************* -* Compile one branch * -*************************************************/ - -/* Scan the pattern, compiling it into the a vector. If the options are -changed during the branch, the pointer is used to change the external options -bits. This function is used during the pre-compile phase when we are trying -to find out the amount of memory needed, as well as during the real compile -phase. The value of lengthptr distinguishes the two phases. 
- -Arguments: - optionsptr pointer to the option bits - codeptr points to the pointer to the current code point - ptrptr points to the current pattern pointer - errorcodeptr points to error code variable - firstcuptr place to put the first required code unit - firstcuflagsptr place to put the first code unit flags, or a negative number - reqcuptr place to put the last required code unit - reqcuflagsptr place to put the last required code unit flags, or a negative number - bcptr points to current branch chain - cond_depth conditional nesting depth - cb contains pointers to tables etc. - lengthptr NULL during the real compile phase - points to length accumulator during pre-compile phase - -Returns: TRUE on success - FALSE, with *errorcodeptr set non-zero on error -*/ - -static BOOL -compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, - PCRE2_SPTR *ptrptr, int *errorcodeptr, - uint32_t *firstcuptr, int32_t *firstcuflagsptr, - uint32_t *reqcuptr, int32_t *reqcuflagsptr, - branch_chain *bcptr, int cond_depth, - compile_block *cb, size_t *lengthptr) -{ -int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ -int bravalue = 0; -uint32_t greedy_default, greedy_non_default; -uint32_t repeat_type, op_type; -uint32_t options = *optionsptr; /* May change dynamically */ -uint32_t firstcu, reqcu; -int32_t firstcuflags, reqcuflags; -uint32_t zeroreqcu, zerofirstcu; -int32_t zeroreqcuflags, zerofirstcuflags; -int32_t req_caseopt, reqvary, tempreqvary; -int after_manual_callout = 0; -int escape; -size_t length_prevgroup = 0; -size_t item_hwm_offset = 0; -register uint32_t c; -register PCRE2_UCHAR *code = *codeptr; -PCRE2_UCHAR *last_code = code; -PCRE2_UCHAR *orig_code = code; -PCRE2_UCHAR *tempcode; -BOOL inescq = FALSE; -BOOL groupsetfirstcu = FALSE; -PCRE2_SPTR ptr = *ptrptr; -PCRE2_SPTR tempptr; -PCRE2_SPTR nestptr = NULL; -PCRE2_UCHAR *previous = NULL; -PCRE2_UCHAR *previous_callout = NULL; -uint8_t classbits[32]; - -/* We can fish out the UTF setting once 
and for all into a BOOL, but we must -not do this for other options (e.g. PCRE2_EXTENDED) because they may change -dynamically as we process the pattern. */ - -#ifdef SUPPORT_UNICODE -BOOL utf = (options & PCRE2_UTF) != 0; -#if PCRE2_CODE_UNIT_WIDTH != 32 -PCRE2_UCHAR utf_units[6]; /* For setting up multi-cu chars */ -#endif - -#else /* No UTF support */ -BOOL utf = FALSE; -#endif - -/* Helper variables for OP_XCLASS opcode (for characters > 255). We define -class_uchardata always so that it can be passed to add_to_class() always, -though it will not be used in non-UTF 8-bit cases. This avoids having to supply -alternative calls for the different cases. */ - -PCRE2_UCHAR *class_uchardata; -#ifdef SUPPORT_WIDE_CHARS -BOOL xclass; -PCRE2_UCHAR *class_uchardata_base; -#endif - -/* Set up the default and non-default settings for greediness */ - -greedy_default = ((options & PCRE2_UNGREEDY) != 0); -greedy_non_default = greedy_default ^ 1; - -/* Initialize no first unit, no required unit. REQ_UNSET means "no char -matching encountered yet". It gets changed to REQ_NONE if we hit something that -matches a non-fixed first unit; reqcu just remains unset if we never find one. - -When we hit a repeat whose minimum is zero, we may have to adjust these values -to take the zero repeat into account. This is implemented by setting them to -zerofirstcu and zeroreqcu when such a repeat is encountered. The individual -item types that can be repeated set these backoff variables appropriately. */ - -firstcu = reqcu = zerofirstcu = zeroreqcu = 0; -firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET; - -/* The variable req_caseopt contains either the REQ_CASELESS value or zero, -according to the current setting of the caseless flag. The REQ_CASELESS value -leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables -to record the case status of the value. This is used only for ASCII characters. 
-*/ - -req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0; - -/* Switch on next character until the end of the branch */ - -for (;; ptr++) - { - BOOL negate_class; - BOOL should_flip_negation; - BOOL possessive_quantifier; - BOOL is_quantifier; - BOOL is_recurse; - BOOL is_dupname; - BOOL reset_bracount; - int class_has_8bitchar; - int class_one_char; -#ifdef SUPPORT_WIDE_CHARS - BOOL xclass_has_prop; -#endif - int recno; /* Must be signed */ - int refsign; /* Must be signed */ - int terminator; /* Must be signed */ - unsigned int mclength; - unsigned int tempbracount; - uint32_t ec; - uint32_t newoptions; - uint32_t skipunits; - uint32_t subreqcu, subfirstcu; - int32_t subreqcuflags, subfirstcuflags; /* Must be signed */ - PCRE2_UCHAR mcbuffer[8]; - - /* Get next character in the pattern */ - - c = *ptr; - - /* If we are at the end of a nested substitution, revert to the outer level - string. Nesting only happens one level deep. */ - - if (c == CHAR_NULL && nestptr != NULL) - { - ptr = nestptr; - nestptr = NULL; - c = *ptr; - } - - /* If we are in the pre-compile phase, accumulate the length used for the - previous cycle of this loop. */ - - if (lengthptr != NULL) - { - if (code > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ - { - *errorcodeptr = ERR52; - goto FAILED; - } - - /* There is at least one situation where code goes backwards: this is the - case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, - the class is simply eliminated. However, it is created first, so we have to - allow memory for it. Therefore, don't ever reduce the length at this point. 
- */ - - if (code < last_code) code = last_code; - - /* Paranoid check for integer overflow */ - - if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code)) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += code - last_code; - - /* If "previous" is set and it is not at the start of the work space, move - it back to there, in order to avoid filling up the work space. Otherwise, - if "previous" is NULL, reset the current code pointer to the start. */ - - if (previous != NULL) - { - if (previous > orig_code) - { - memmove(orig_code, previous, CU2BYTES(code - previous)); - code -= previous - orig_code; - previous = orig_code; - } - } - else code = orig_code; - - /* Remember where this code item starts so we can pick up the length - next time round. */ - - last_code = code; - } - - /* In the real compile phase, just check the workspace used by the forward - reference list. */ - - else if (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) - { - *errorcodeptr = ERR52; - goto FAILED; - } - - /* If in \Q...\E, check for the end; if not, we have a literal */ - - if (inescq && (c != CHAR_NULL || ptr < cb->end_pattern)) - { - if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) - { - inescq = FALSE; - ptr++; - continue; - } - else - { - if (previous_callout != NULL) - { - if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ - complete_callout(previous_callout, ptr, cb); - previous_callout = NULL; - } - if ((options & PCRE2_AUTO_CALLOUT) != 0) - { - previous_callout = code; - code = auto_callout(code, ptr, cb); - } - goto NORMAL_CHAR; - } - /* Control does not reach here. */ - } - - /* In extended mode, skip white space and comments. We need a loop in order - to check for more white space and more comments after a comment. 
*/ - - if ((options & PCRE2_EXTENDED) != 0) - { - for (;;) - { - while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr); - if (c != CHAR_NUMBER_SIGN) break; - ptr++; - while (*ptr != CHAR_NULL) - { - if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ - { /* IS_NEWLINE sets cb->nllen. */ - ptr += cb->nllen; - break; - } - ptr++; -#ifdef SUPPORT_UNICODE - if (utf) FORWARDCHAR(ptr); -#endif - } - c = *ptr; /* Either NULL or the char after a newline */ - } - } - - /* See if the next thing is a quantifier. */ - - is_quantifier = - c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || - (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); - - /* Fill in length of a previous callout, except when the next thing is a - quantifier or when processing a property substitution string in UCP mode. */ - - if (!is_quantifier && previous_callout != NULL && nestptr == NULL && - after_manual_callout-- <= 0) - { - if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ - complete_callout(previous_callout, ptr, cb); - previous_callout = NULL; - } - - /* Create auto callout, except for quantifiers, or while processing property - strings that are substituted for \w etc in UCP mode. */ - - if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL) - { - previous_callout = code; - code = auto_callout(code, ptr, cb); - } - - /* Process the next pattern item. 
*/ - - switch(c) - { - /* ===================================================================*/ - /* The branch terminates at string end or | or ) */ - - case CHAR_NULL: - if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */ - /* Fall through */ - - case CHAR_VERTICAL_LINE: - case CHAR_RIGHT_PARENTHESIS: - *firstcuptr = firstcu; - *firstcuflagsptr = firstcuflags; - *reqcuptr = reqcu; - *reqcuflagsptr = reqcuflags; - *codeptr = code; - *ptrptr = ptr; - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code)) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += code - last_code; /* To include callout length */ - } - return TRUE; - - - /* ===================================================================*/ - /* Handle single-character metacharacters. In multiline mode, ^ disables - the setting of any following char as a first character. */ - - case CHAR_CIRCUMFLEX_ACCENT: - previous = NULL; - if ((options & PCRE2_MULTILINE) != 0) - { - if (firstcuflags == REQ_UNSET) - zerofirstcuflags = firstcuflags = REQ_NONE; - *code++ = OP_CIRCM; - } - else *code++ = OP_CIRC; - break; - - case CHAR_DOLLAR_SIGN: - previous = NULL; - *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; - break; - - /* There can never be a first char if '.' is first, whatever happens about - repeats. The value of reqcu doesn't change either. */ - - case CHAR_DOT: - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; - break; - - - /* ===================================================================*/ - /* Character classes. 
If the included characters are all < 256, we build a - 32-byte bitmap of the permitted characters, except in the special case - where there is only one such character. For negated classes, we build the - map as usual, then invert it at the end. However, we use a different opcode - so that data characters > 255 can be handled correctly. - - If the class contains characters outside the 0-255 range, a different - opcode is compiled. It may optionally have a bit map for characters < 256, - but those above are are explicitly listed afterwards. A flag byte tells - whether the bitmap is present, and whether this is a negated class or not. - - An isolated ']' character is not treated specially, so is just another data - character. In earlier versions of PCRE that used the original API there was - a "JavaScript compatibility mode" in which it gave an error. However, - JavaScript itself has changed in this respect so there is no longer any - need for this special handling. - - In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is - used for "start of word" and "end of word". As these are otherwise illegal - sequences, we don't break anything by recognizing them. They are replaced - by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are - erroneous and are handled by the normal code below. */ - - case CHAR_LEFT_SQUARE_BRACKET: - if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) - { - nestptr = ptr + 7; - ptr = sub_start_of_word - 1; - continue; - } - - if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) - { - nestptr = ptr + 7; - ptr = sub_end_of_word - 1; - continue; - } - - /* Handle a real character class. */ - - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - - /* PCRE supports POSIX class stuff inside a class. Perl gives an error if - they are encountered at the top level, so we'll do that too. 
*/ - - if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, &tempptr)) - { - *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13; - goto FAILED; - } - - /* If the first character is '^', set the negation flag and skip it. Also, - if the first few characters (either before or after ^) are \Q\E or \E we - skip them too. This makes for compatibility with Perl. */ - - negate_class = FALSE; - for (;;) - { - c = *(++ptr); - if (c == CHAR_BACKSLASH) - { - if (ptr[1] == CHAR_E) - ptr++; - else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) - ptr += 3; - else - break; - } - else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) - negate_class = TRUE; - else break; - } - - /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise, - an initial ']' is taken as a data character -- the code below handles - that. When empty classes are allowed, [] must always fail, so generate - OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */ - - if (c == CHAR_RIGHT_SQUARE_BRACKET && - (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0) - { - *code++ = negate_class? OP_ALLANY : OP_FAIL; - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - break; - } - - /* If a class contains a negative special such as \S, we need to flip the - negation flag at the end, so that support for characters > 255 works - correctly (they are all included in the class). */ - - should_flip_negation = FALSE; - - /* Extended class (xclass) will be used when characters > 255 - might match. 
*/ - -#ifdef SUPPORT_WIDE_CHARS - xclass = FALSE; - class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ - class_uchardata_base = class_uchardata; /* Save the start */ -#endif - - /* For optimization purposes, we track some properties of the class: - class_has_8bitchar will be non-zero if the class contains at least one 256 - character with a code point less than 256; class_one_char will be 1 if the - class contains just one character; xclass_has_prop will be TRUE if Unicode - property checks are present in the class. */ - - class_has_8bitchar = 0; - class_one_char = 0; -#ifdef SUPPORT_WIDE_CHARS - xclass_has_prop = FALSE; -#endif - - /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map - in a temporary bit of memory, in case the class contains fewer than two - 8-bit characters because in that case the compiled code doesn't use the bit - map. */ - - memset(classbits, 0, 32 * sizeof(uint8_t)); - - /* Process characters until ] is reached. As the test is at the end of the - loop, an initial ] is taken as a data character. At the start of the loop, - c contains the first code unit of the character. If it is zero, check for - the end of the pattern, to allow binary zero as data. */ - - for(;;) - { - PCRE2_SPTR oldptr; - - if (c == CHAR_NULL && ptr >= cb->end_pattern) - { - *errorcodeptr = ERR6; /* Missing terminating ']' */ - goto FAILED; - } - -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(c)) - { /* Braces are required because the */ - GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ - } -#endif - - /* Inside \Q...\E everything is literal except \E */ - - if (inescq) - { - if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ - { - inescq = FALSE; /* Reset literal state */ - ptr++; /* Skip the 'E' */ - goto CONTINUE_CLASS; /* Carry on with next char */ - } - goto CHECK_RANGE; /* Could be range if \E follows */ - } - - /* Handle POSIX class names. 
Perl allows a negation extension of the - form [:^name:]. A square bracket that doesn't match the syntax is - treated as a literal. We also recognize the POSIX constructions - [.ch.] and [=ch=] ("collating elements") and fault them, as Perl - 5.6 and 5.8 do. */ - - if (c == CHAR_LEFT_SQUARE_BRACKET && - (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) - { - BOOL local_negate = FALSE; - int posix_class, taboffset, tabopt; - register const uint8_t *cbits = cb->cbits; - uint8_t pbits[32]; - - if (ptr[1] != CHAR_COLON) - { - *errorcodeptr = ERR13; - goto FAILED; - } - - ptr += 2; - if (*ptr == CHAR_CIRCUMFLEX_ACCENT) - { - local_negate = TRUE; - should_flip_negation = TRUE; /* Note negative special */ - ptr++; - } - - posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); - if (posix_class < 0) - { - *errorcodeptr = ERR30; - goto FAILED; - } - - /* If matching is caseless, upper and lower are converted to - alpha. This relies on the fact that the class table starts with - alpha, lower, upper as the first 3 entries. */ - - if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) - posix_class = 0; - - /* When PCRE2_UCP is set, some of the POSIX classes are converted to - different escape sequences that use Unicode properties \p or \P. Others - that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP - directly. UCP support is not available unless UTF support is.*/ - -#ifdef SUPPORT_UNICODE - if ((options & PCRE2_UCP) != 0) - { - unsigned int ptype = 0; - int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); - - /* The posix_substitutes table specifies which POSIX classes can be - converted to \p or \P items. */ - - if (posix_substitutes[pc] != NULL) - { - nestptr = tempptr + 1; - ptr = posix_substitutes[pc] - 1; - goto CONTINUE_CLASS; - } - - /* There are three other classes that generate special property calls - that are recognized only in an XCLASS. 
*/ - - else switch(posix_class) - { - case PC_GRAPH: - ptype = PT_PXGRAPH; - /* Fall through */ - case PC_PRINT: - if (ptype == 0) ptype = PT_PXPRINT; - /* Fall through */ - case PC_PUNCT: - if (ptype == 0) ptype = PT_PXPUNCT; - *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; - *class_uchardata++ = ptype; - *class_uchardata++ = 0; - xclass_has_prop = TRUE; - ptr = tempptr + 1; - goto CONTINUE_CLASS; - - /* For all other POSIX classes, no special action is taken in UCP - mode. Fall through to the non_UCP case. */ - - default: - break; - } - } -#endif /* SUPPORT_UNICODE */ - - /* In the non-UCP case, or when UCP makes no difference, we build the - bit map for the POSIX class in a chunk of local store because we may be - adding and subtracting from it, and we don't want to subtract bits that - may be in the main map already. At the end we or the result into the - bit map that is being built. */ - - posix_class *= 3; - - /* Copy in the first table (always present) */ - - memcpy(pbits, cbits + posix_class_maps[posix_class], - 32 * sizeof(uint8_t)); - - /* If there is a second table, add or remove it as required. */ - - taboffset = posix_class_maps[posix_class + 1]; - tabopt = posix_class_maps[posix_class + 2]; - - if (taboffset >= 0) - { - if (tabopt >= 0) - for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; - else - for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; - } - - /* Now see if we need to remove any special characters. An option - value of 1 removes vertical space and 2 removes underscore. */ - - if (tabopt < 0) tabopt = -tabopt; - if (tabopt == 1) pbits[1] &= ~0x3c; - else if (tabopt == 2) pbits[11] &= 0x7f; - - /* Add the POSIX table or its complement into the main table that is - being built and we are done. */ - - if (local_negate) - for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; - else - for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; - - ptr = tempptr + 1; - /* Every class contains at least one < 256 character. 
*/ - class_has_8bitchar = 1; - /* Every class contains at least two characters. */ - class_one_char = 2; - goto CONTINUE_CLASS; /* End of POSIX syntax handling */ - } - - /* Backslash may introduce a single character, or it may introduce one - of the specials, which just set a flag. The sequence \b is a special - case. Inside a class (and only there) it is treated as backspace. We - assume that other escapes have more than one character in them, so - speculatively set both class_has_8bitchar and class_one_char bigger - than one. Unrecognized escapes fall through and are faulted. */ - - if (c == CHAR_BACKSLASH) - { - escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb); - if (*errorcodeptr != 0) goto FAILED; - if (escape == 0) c = ec; /* Escaped single char */ - else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ - else if (escape == ESC_N) /* \N is not supported in a class */ - { - *errorcodeptr = ERR71; - goto FAILED; - } - else if (escape == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) - { - ptr += 2; /* avoid empty string */ - } - else inescq = TRUE; - goto CONTINUE_CLASS; - } - else if (escape == ESC_E) goto CONTINUE_CLASS; /* Ignore orphan \E */ - - else /* Handle \d-type escapes */ - { - register const uint8_t *cbits = cb->cbits; - /* Every class contains at least two < 256 characters. */ - class_has_8bitchar++; - /* Every class contains at least two characters. */ - class_one_char += 2; - - switch (escape) - { -#ifdef SUPPORT_UNICODE - case ESC_du: /* These are the values given for \d etc */ - case ESC_DU: /* when PCRE2_UCP is set. We replace the */ - case ESC_wu: /* escape sequence with an appropriate \p */ - case ESC_WU: /* or \P to test Unicode properties instead */ - case ESC_su: /* of the default ASCII testing. */ - case ESC_SU: - nestptr = ptr; - ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ - class_has_8bitchar--; /* Undo! 
*/ - break; -#endif - case ESC_d: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; - break; - - case ESC_D: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; - break; - - case ESC_w: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; - break; - - case ESC_W: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; - break; - - /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl - 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was - previously set by something earlier in the character class. - Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so - we could just adjust the appropriate bit. From PCRE 8.34 we no - longer treat \s and \S specially. */ - - case ESC_s: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; - break; - - case ESC_S: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; - break; - - /* The rest apply in both UCP and non-UCP cases. */ - - case ESC_h: - (void)add_list_to_class(classbits, &class_uchardata, options, cb, - PRIV(hspace_list), NOTACHAR); - break; - - case ESC_H: - (void)add_not_list_to_class(classbits, &class_uchardata, options, - cb, PRIV(hspace_list)); - break; - - case ESC_v: - (void)add_list_to_class(classbits, &class_uchardata, options, cb, - PRIV(vspace_list), NOTACHAR); - break; - - case ESC_V: - (void)add_not_list_to_class(classbits, &class_uchardata, options, - cb, PRIV(vspace_list)); - break; - - case ESC_p: - case ESC_P: -#ifdef SUPPORT_UNICODE - { - BOOL negated; - unsigned int ptype = 0, pdata = 0; - if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb)) - goto FAILED; - *class_uchardata++ = ((escape == ESC_p) != negated)? - XCL_PROP : XCL_NOTPROP; - *class_uchardata++ = ptype; - *class_uchardata++ = pdata; - xclass_has_prop = TRUE; - class_has_8bitchar--; /* Undo! 
*/ - } - break; -#else - *errorcodeptr = ERR45; - goto FAILED; -#endif - /* Unrecognized escapes are faulted. */ - - default: - *errorcodeptr = ERR7; - goto FAILED; - } - - /* Handled \d-type escape */ - - goto CONTINUE_CLASS; - } - - /* Control gets here if the escape just defined a single character. - This is in c and may be greater than 256. */ - - escape = 0; - } /* End of backslash handling */ - - /* A character may be followed by '-' to form a range. However, Perl does - not permit ']' to be the end of the range. A '-' character at the end is - treated as a literal. Perl ignores orphaned \E sequences entirely. The - code for handling \Q and \E is messy. */ - - CHECK_RANGE: - while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) - { - inescq = FALSE; - ptr += 2; - } - oldptr = ptr; - - /* Remember if \r or \n were explicitly used */ - - if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; - - /* Check for range */ - - if (!inescq && ptr[1] == CHAR_MINUS) - { - uint32_t d; - ptr += 2; - while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; - - /* If we hit \Q (not followed by \E) at this point, go into escaped - mode. */ - - while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) - { - ptr += 2; - if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) - { ptr += 2; continue; } - inescq = TRUE; - break; - } - - /* Minus (hyphen) at the end of a class is treated as a literal, so put - back the pointer and jump to handle the character that preceded it. 
*/ - - if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) - { - ptr = oldptr; - goto CLASS_SINGLE_CHARACTER; - } - - /* Otherwise, we have a potential range; pick up the next character */ - -#ifdef SUPPORT_UNICODE - if (utf) - { /* Braces are required because the */ - GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ - } - else -#endif - d = *ptr; /* Not UTF mode */ - - /* The second part of a range can be a single-character escape - sequence, but not any of the other escapes. Perl treats a hyphen as a - literal in such circumstances. However, in Perl's warning mode, a - warning is given, so PCRE now faults it as it is almost certainly a - mistake on the user's part. */ - - if (!inescq) - { - if (d == CHAR_BACKSLASH) - { - int descape; - descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb); - if (*errorcodeptr != 0) goto FAILED; - - /* 0 means a character was put into d; \b is backspace; any other - special causes an error. */ - - if (descape != 0) - { - if (descape == ESC_b) d = CHAR_BS; else - { - *errorcodeptr = ERR50; - goto FAILED; - } - } - } - - /* A hyphen followed by a POSIX class is treated in the same way. */ - - else if (d == CHAR_LEFT_SQUARE_BRACKET && - (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || - ptr[1] == CHAR_EQUALS_SIGN) && - check_posix_syntax(ptr, &tempptr)) - { - *errorcodeptr = ERR50; - goto FAILED; - } - } - - /* Check that the two values are in the correct order. Optimize - one-character ranges. */ - - if (d < c) - { - *errorcodeptr = ERR8; - goto FAILED; - } - if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */ - - /* We have found a character range, so single character optimizations - cannot be done anymore. Any value greater than 1 indicates that there - is more than one character. */ - - class_one_char = 2; - - /* Remember an explicit \r or \n, and add the range to the class. 
*/ - - if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; - - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, c, d); - - goto CONTINUE_CLASS; /* Go get the next char in the class */ - } - - /* Handle a single character - we can get here for a normal non-escape - char, or after \ that introduces a single character or for an apparent - range that isn't. Only the value 1 matters for class_one_char, so don't - increase it if it is already 2 or more ... just in case there's a class - with a zillion characters in it. */ - - CLASS_SINGLE_CHARACTER: - if (class_one_char < 2) class_one_char++; - - /* If class_one_char is 1, we have the first single character in the - class, and there have been no prior ranges, or XCLASS items generated by - escapes. If this is the final character in the class, we can optimize by - turning the item into a 1-character OP_CHAR[I] if it's positive, or - OP_NOT[I] if it's negative. In the positive case, it can cause firstcu - to be set. Otherwise, there can be no first char if this item is first, - whatever repeat count may follow. In the case of reqcu, save the - previous value for reinstating. */ - - if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) - { - ptr++; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - if (negate_class) - { -#ifdef SUPPORT_UNICODE - int d; -#endif - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - - /* For caseless UTF mode, check whether this character has more than - one other case. If so, generate a special OP_NOTPROP item instead of - OP_NOTI. */ - -#ifdef SUPPORT_UNICODE - if (utf && (options & PCRE2_CASELESS) != 0 && - (d = UCD_CASESET(c)) != 0) - { - *code++ = OP_NOTPROP; - *code++ = PT_CLIST; - *code++ = d; - } - else -#endif - /* Char has only one other case, or UCP not available */ - - { - *code++ = ((options & PCRE2_CASELESS) != 0)? 
OP_NOTI: OP_NOT; - code += PUTCHAR(c, code); - } - - /* We are finished with this character class */ - - goto END_CLASS; - } - - /* For a single, positive character, get the value into mcbuffer, and - then we can handle this with the normal one-character code. */ - - mclength = PUTCHAR(c, mcbuffer); - goto ONE_CHAR; - } /* End of 1-char optimization */ - - /* There is more than one character in the class, or an XCLASS item - has been generated. Add this character to the class. */ - - class_has_8bitchar += - add_to_class(classbits, &class_uchardata, options, cb, c, c); - - /* Continue to the next character in the class. Closing square bracket - not within \Q..\E ends the class. A NULL character terminates a - nested substitution string, but may be a data character in the main - pattern (tested at the start of this loop). */ - - CONTINUE_CLASS: - c = *(++ptr); - if (c == 0 && nestptr != NULL) - { - ptr = nestptr; - nestptr = NULL; - c = *(++ptr); - } - -#ifdef SUPPORT_WIDE_CHARS - /* If any wide characters have been encountered, set xclass = TRUE. Then, - in the pre-compile phase, accumulate the length of the wide characters - and reset the pointer. This is so that very large classes that contain a - zillion wide characters do not overwrite the work space (which is on the - stack). */ - - if (class_uchardata > class_uchardata_base) - { - xclass = TRUE; - if (lengthptr != NULL) - { - *lengthptr += class_uchardata - class_uchardata_base; - class_uchardata = class_uchardata_base; - } - } -#endif - /* An unescaped ] ends the class */ - - if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; - } /* End of main class-processing loop */ - - /* If this is the first thing in the branch, there can be no first char - setting, whatever the repeat count. Any reqcu setting must remain - unchanged after any kind of repeat. 
*/ - - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode, unless there was a negated special - such as \S in the class, and PCRE2_UCP is not set, because in that case all - characters > 255 are in the class, so any that were explicitly given as - well can be ignored. If (when there are explicit characters > 255 that must - be listed) there are no characters < 256, we can omit the bitmap in the - actual compiled code. */ - -#ifdef SUPPORT_WIDE_CHARS -#ifdef SUPPORT_UNICODE - if (xclass && (!should_flip_negation || (options & PCRE2_UCP) != 0)) -#elif PCRE2_CODE_UNIT_WIDTH != 8 - if (xclass && !should_flip_negation) -#endif - { - *class_uchardata++ = XCL_END; /* Marks the end of extra data */ - *code++ = OP_XCLASS; - code += LINK_SIZE; - *code = negate_class? XCL_NOT:0; - if (xclass_has_prop) *code |= XCL_HASPROP; - - /* If the map is required, move up the extra data to make room for it; - otherwise just move the code pointer to the end of the extra data. */ - - if (class_has_8bitchar > 0) - { - *code++ |= XCL_MAP; - memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, - CU2BYTES(class_uchardata - code)); - if (negate_class && !xclass_has_prop) - for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; - memcpy(code, classbits, 32); - code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); - } - else code = class_uchardata; - - /* Now fill in the complete length of the item */ - - PUT(previous, 1, (int)(code - previous)); - break; /* End of class handling */ - } -#endif - - /* If there are no characters > 255, or they are all to be included or - excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the - whole class was negated and whether there were negative specials such as \S - (non-UCP) in the class. 
Then copy the 32-byte map into the code vector, - negating it if necessary. */ - - *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; - if (lengthptr == NULL) /* Save time in the pre-compile phase */ - { - if (negate_class) - for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; - memcpy(code, classbits, 32); - } - code += 32 / sizeof(PCRE2_UCHAR); - - END_CLASS: - break; - - - /* ===================================================================*/ - /* Various kinds of repeat; '{' is not necessarily a quantifier, but this - has been tested above. */ - - case CHAR_LEFT_CURLY_BRACKET: - if (!is_quantifier) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); - if (*errorcodeptr != 0) goto FAILED; - goto REPEAT; - - case CHAR_ASTERISK: - repeat_min = 0; - repeat_max = -1; - goto REPEAT; - - case CHAR_PLUS: - repeat_min = 1; - repeat_max = -1; - goto REPEAT; - - case CHAR_QUESTION_MARK: - repeat_min = 0; - repeat_max = 1; - - REPEAT: - if (previous == NULL) - { - *errorcodeptr = ERR9; - goto FAILED; - } - - if (repeat_min == 0) - { - firstcu = zerofirstcu; /* Adjust for zero repeat */ - firstcuflags = zerofirstcuflags; - reqcu = zeroreqcu; /* Ditto */ - reqcuflags = zeroreqcuflags; - } - - /* Remember whether this is a variable length repeat */ - - reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; - - op_type = 0; /* Default single-char op codes */ - possessive_quantifier = FALSE; /* Default not possessive quantifier */ - - /* Save start of previous item, in case we have to move it up in order to - insert something before it. */ - - tempcode = previous; - - /* Before checking for a possessive quantifier, we must skip over - whitespace and comments in extended mode because Perl allows white space at - this point. 
*/ - - if ((options & PCRE2_EXTENDED) != 0) - { - PCRE2_SPTR p = ptr + 1; - for (;;) - { - while (MAX_255(*p) && (cb->ctypes[*p] & ctype_space) != 0) p++; - if (*p != CHAR_NUMBER_SIGN) break; - p++; - while (*p != CHAR_NULL) - { - if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */ - { /* IS_NEWLINE sets cb->nllen. */ - p += cb->nllen; - break; - } - p++; -#ifdef SUPPORT_UNICODE - if (utf) FORWARDCHAR(p); -#endif - } /* Loop for comment characters */ - } /* Loop for multiple comments */ - ptr = p - 1; /* Character before the next significant one. */ - } - - /* If the next character is '+', we have a possessive quantifier. This - implies greediness, whatever the setting of the PCRE2_UNGREEDY option. - If the next character is '?' this is a minimizing repeat, by default, - but if PCRE2_UNGREEDY is set, it works the other way round. We change the - repeat type to the non-default. */ - - if (ptr[1] == CHAR_PLUS) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - ptr++; - } - else if (ptr[1] == CHAR_QUESTION_MARK) - { - repeat_type = greedy_non_default; - ptr++; - } - else repeat_type = greedy_default; - - /* If previous was a recursion call, wrap it in atomic brackets so that - previous becomes the atomic group. All recursions were so wrapped in the - past, but it no longer happens for non-repeated recursions. In fact, the - repeated ones could be re-implemented independently so as not to need this, - but for the moment we rely on the code for repeating groups. */ - - if (*previous == OP_RECURSE) - { - memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE)); - *previous = OP_ONCE; - PUT(previous, 1, 2 + 2*LINK_SIZE); - previous[2 + 2*LINK_SIZE] = OP_KET; - PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); - code += 2 + 2 * LINK_SIZE; - length_prevgroup = 3 + 3*LINK_SIZE; - - /* When actually compiling, we need to check whether this was a forward - reference, and if so, adjust the offset. 
*/ - - if (lengthptr == NULL && cb->hwm >= cb->start_workspace + LINK_SIZE) - { - int offset = GET(cb->hwm, -LINK_SIZE); - if (offset == previous + 1 - cb->start_code) - PUT(cb->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE); - } - } - - /* Now handle repetition for the different types of item. */ - - /* If previous was a character or negated character match, abolish the item - and generate a repeat item instead. If a char item has a minimum of more - than one, ensure that it is set in reqcu - it might not be if a sequence - such as x{3} is the first thing in a branch because the x will have gone - into firstcu instead. */ - - if (*previous == OP_CHAR || *previous == OP_CHARI - || *previous == OP_NOT || *previous == OP_NOTI) - { - switch (*previous) - { - default: /* Make compiler happy. */ - case OP_CHAR: op_type = OP_STAR - OP_STAR; break; - case OP_CHARI: op_type = OP_STARI - OP_STAR; break; - case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; - case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; - } - - /* Deal with UTF characters that take up more than one code unit. It's - easier to write this out separately than try to macrify it. Use c to - hold the length of the character in code units, plus UTF_LENGTH to flag - that it's a length rather than a small character. */ - -#ifdef MAYBE_UTF_MULTI - if (utf && NOT_FIRSTCHAR(code[-1])) - { - PCRE2_UCHAR *lastchar = code - 1; - BACKCHAR(lastchar); - c = (int)(code - lastchar); /* Length of UTF character */ - memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */ - c |= UTF_LENGTH; /* Flag c as a length */ - } - else -#endif /* MAYBE_UTF_MULTI */ - - /* Handle the case of a single charater - either with no UTF support, or - with UTF disabled, or for a single-code-unit UTF character. 
*/ - { - c = code[-1]; - if (*previous <= OP_CHARI && repeat_min > 1) - { - reqcu = c; - reqcuflags = req_caseopt | cb->req_varyopt; - } - } - - goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ - } - - /* If previous was a character type match (\d or similar), abolish it and - create a suitable repeat item. The code is shared with single-character - repeats by setting op_type to add a suitable offset into repeat_type. Note - the the Unicode property types will be present only when SUPPORT_UNICODE is - defined, but we don't wrap the little bits of code here because it just - makes it horribly messy. */ - - else if (*previous < OP_EODN) - { - PCRE2_UCHAR *oldcode; - int prop_type, prop_value; - op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ - c = *previous; /* Save previous opcode */ - if (c == OP_PROP || c == OP_NOTPROP) - { - prop_type = previous[1]; - prop_value = previous[2]; - } - else - { - /* Come here from just above with a character in c */ - OUTPUT_SINGLE_REPEAT: - prop_type = prop_value = -1; - } - - /* At this point we either have prop_type == prop_value == -1 and either - a code point or a character type that is not OP_[NOT]PROP in c, or we - have OP_[NOT]PROP in c and prop_type/prop_value not negative. */ - - oldcode = code; /* Save where we were */ - code = previous; /* Usually overwrite previous item */ - - /* If the maximum is zero then the minimum must also be zero; Perl allows - this case, so we do too - by simply omitting the item altogether. */ - - if (repeat_max == 0) goto END_REPEAT; - - /* Combine the op_type with the repeat_type */ - - repeat_type += op_type; - - /* A minimum of zero is handled either as the special case * or ?, or as - an UPTO, with the maximum given. 
*/ - - if (repeat_min == 0) - { - if (repeat_max == -1) *code++ = OP_STAR + repeat_type; - else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - - /* A repeat minimum of 1 is optimized into some special cases. If the - maximum is unlimited, we use OP_PLUS. Otherwise, the original item is - left in place and, if the maximum is greater than 1, we use OP_UPTO with - one less than the maximum. */ - - else if (repeat_min == 1) - { - if (repeat_max == -1) - *code++ = OP_PLUS + repeat_type; - else - { - code = oldcode; /* Leave previous item in place */ - if (repeat_max == 1) goto END_REPEAT; - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max - 1); - } - } - - /* The case {n,n} is just an EXACT, while the general case {n,m} is - handled as an EXACT followed by an UPTO or STAR or QUERY. */ - - else - { - *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ - PUT2INC(code, 0, repeat_min); - - /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and - then generate the second opcode. In UTF mode, multi-code-unit - characters have their length in c, with the UTF_LENGTH bit as a flag, - and the code units in utf_units. For a repeated Unicode property match, - there are two extra values that define the required property, and c - never has the UTF_LENGTH bit set. 
*/ - - if (repeat_max != repeat_min) - { -#ifdef MAYBE_UTF_MULTI - if (utf && (c & UTF_LENGTH) != 0) - { - memcpy(code, utf_units, CU2BYTES(c & 7)); - code += c & 7; - } - else -#endif /* MAYBE_UTF_MULTI */ - { - *code++ = c; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - } - - /* Now set up the following opcode */ - - if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else - { - repeat_max -= repeat_min; - if (repeat_max == 1) - { - *code++ = OP_QUERY + repeat_type; - } - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - } - } - - /* Fill in the character or character type for the final opcode. */ - -#ifdef MAYBE_UTF_MULTI - if (utf && (c & UTF_LENGTH) != 0) - { - memcpy(code, utf_units, CU2BYTES(c & 7)); - code += c & 7; - } - else -#endif /* MAYBEW_UTF_MULTI */ - { - *code++ = c; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - } - } - - /* If previous was a character class or a back reference, we put the repeat - stuff after it, but just skip the item if the repeat was {0,0}. */ - - else if (*previous == OP_CLASS || *previous == OP_NCLASS || -#ifdef SUPPORT_WIDE_CHARS - *previous == OP_XCLASS || -#endif - *previous == OP_REF || *previous == OP_REFI || - *previous == OP_DNREF || *previous == OP_DNREFI) - { - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } - - if (repeat_min == 0 && repeat_max == -1) - *code++ = OP_CRSTAR + repeat_type; - else if (repeat_min == 1 && repeat_max == -1) - *code++ = OP_CRPLUS + repeat_type; - else if (repeat_min == 0 && repeat_max == 1) - *code++ = OP_CRQUERY + repeat_type; - else - { - *code++ = OP_CRRANGE + repeat_type; - PUT2INC(code, 0, repeat_min); - if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ - PUT2INC(code, 0, repeat_max); - } - } - - /* If previous was a bracket group, we may have to replicate it in certain - cases. 
Note that at this point we can encounter only the "basic" bracket - opcodes such as BRA and CBRA, as this is the place where they get converted - into the more special varieties such as BRAPOS and SBRA. A test for >= - OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK, - ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND. - Originally, PCRE did not allow repetition of assertions, but now it does, - for Perl compatibility. */ - - else if (*previous >= OP_ASSERT && *previous <= OP_COND) - { - register int i; - int len = (int)(code - previous); - size_t base_hwm_offset = item_hwm_offset; - PCRE2_UCHAR *bralink = NULL; - PCRE2_UCHAR *brazeroptr = NULL; - - /* Repeating a DEFINE group (or any group where the condition is always - FALSE and there is only one branch) is pointless, but Perl allows the - syntax, so we just ignore the repeat. */ - - if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && - previous[GET(previous, 1)] != OP_ALT) - goto END_REPEAT; - - /* There is no sense in actually repeating assertions. The only potential - use of repetition is in cases when the assertion is optional. Therefore, - if the minimum is greater than zero, just ignore the repeat. If the - maximum is not zero or one, set it to 1. */ - - if (*previous < OP_ONCE) /* Assertion */ - { - if (repeat_min > 0) goto END_REPEAT; - if (repeat_max < 0 || repeat_max > 1) repeat_max = 1; - } - - /* The case of a zero minimum is special because of the need to stick - OP_BRAZERO in front of it, and because the group appears once in the - data, whereas in other cases it appears the minimum number of times. For - this reason, it is simplest to treat this case separately, as otherwise - the code gets far too messy. There are several special subcases when the - minimum is zero. 
*/ - - if (repeat_min == 0) - { - /* If the maximum is also zero, we used to just omit the group from the - output altogether, like this: - - ** if (repeat_max == 0) - ** { - ** code = previous; - ** goto END_REPEAT; - ** } - - However, that fails when a group or a subgroup within it is referenced - as a subroutine from elsewhere in the pattern, so now we stick in - OP_SKIPZERO in front of it so that it is skipped on execution. As we - don't have a list of which groups are referenced, we cannot do this - selectively. - - If the maximum is 1 or unlimited, we just have to stick in the BRAZERO - and do no more at this point. However, we do need to adjust any - OP_RECURSE calls inside the group that refer to the group itself or any - internal or forward referenced group, because the offset is from the - start of the whole regex. Temporarily terminate the pattern while doing - this. */ - - if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ - { - *code = OP_END; - adjust_recurse(previous, 1, utf, cb, item_hwm_offset); - memmove(previous + 1, previous, CU2BYTES(len)); - code++; - if (repeat_max == 0) - { - *previous++ = OP_SKIPZERO; - goto END_REPEAT; - } - brazeroptr = previous; /* Save for possessive optimizing */ - *previous++ = OP_BRAZERO + repeat_type; - } - - /* If the maximum is greater than 1 and limited, we have to replicate - in a nested fashion, sticking OP_BRAZERO before each set of brackets. - The first one has to be handled carefully because it's the original - copy, which has to be moved up. The remainder can be handled by code - that is common with the non-zero minimum case below. We have to - adjust the value or repeat_max, since one less copy is required. Once - again, we may have to adjust any OP_RECURSE calls inside the group. 
*/ - - else - { - int offset; - *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, item_hwm_offset); - memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); - code += 2 + LINK_SIZE; - *previous++ = OP_BRAZERO + repeat_type; - *previous++ = OP_BRA; - - /* We chain together the bracket offset fields that have to be - filled in later when the ends of the brackets are reached. */ - - offset = (bralink == NULL)? 0 : (int)(previous - bralink); - bralink = previous; - PUTINC(previous, 0, offset); - } - - repeat_max--; - } - - /* If the minimum is greater than zero, replicate the group as many - times as necessary, and adjust the maximum to the number of subsequent - copies that we need. If we set a first char from the group, and didn't - set a required char, copy the latter from the former. If there are any - forward reference subroutine calls in the group, there will be entries on - the workspace list; replicate these with an appropriate increment. */ - - else - { - if (repeat_min > 1) - { - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. Do some paranoid checks for - potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit - integer type when available, otherwise double. */ - - if (lengthptr != NULL) - { - size_t delta = (repeat_min - 1)*length_prevgroup; - if ((INT64_OR_DOUBLE)(repeat_min - 1)* - (INT64_OR_DOUBLE)length_prevgroup > - (INT64_OR_DOUBLE)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += delta; - } - - /* This is compiling for real. If there is a set first byte for - the group, and we have not yet set a "required byte", set it. Make - sure there is enough workspace for copying forward references before - doing the copy. 
*/ - - else - { - if (groupsetfirstcu && reqcuflags < 0) - { - reqcu = firstcu; - reqcuflags = firstcuflags; - } - - for (i = 1; i < repeat_min; i++) - { - PCRE2_UCHAR *hc; - size_t this_hwm_offset = cb->hwm - cb->start_workspace; - memcpy(code, previous, CU2BYTES(len)); - - while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - - (this_hwm_offset - base_hwm_offset)) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - - for (hc = (PCRE2_UCHAR *)cb->start_workspace + base_hwm_offset; - hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; - hc += LINK_SIZE) - { - PUT(cb->hwm, 0, GET(hc, 0) + len); - cb->hwm += LINK_SIZE; - } - base_hwm_offset = this_hwm_offset; - code += len; - } - } - } - - if (repeat_max > 0) repeat_max -= repeat_min; - } - - /* This code is common to both the zero and non-zero minimum cases. If - the maximum is limited, it replicates the group in a nested fashion, - remembering the bracket starts on a stack. In the case of a zero minimum, - the first one was set up above. In all cases the repeat_max now specifies - the number of additional copies needed. Again, we must remember to - replicate entries on the forward reference list. */ - - if (repeat_max >= 0) - { - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. For each repetition we must add 1 - to the length for BRAZERO and for all but the last repetition we must - add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some - paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is - a 64-bit integer type when available, otherwise double. 
*/ - - if (lengthptr != NULL && repeat_max > 0) - { - size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ - if ((INT64_OR_DOUBLE)repeat_max * - (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - > (INT64_OR_DOUBLE)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += delta; - } - - /* This is compiling for real */ - - else for (i = repeat_max - 1; i >= 0; i--) - { - PCRE2_UCHAR *hc; - size_t this_hwm_offset = cb->hwm - cb->start_workspace; - - *code++ = OP_BRAZERO + repeat_type; - - /* All but the final copy start a new nesting, maintaining the - chain of brackets outstanding. */ - - if (i != 0) - { - int offset; - *code++ = OP_BRA; - offset = (bralink == NULL)? 0 : (int)(code - bralink); - bralink = code; - PUTINC(code, 0, offset); - } - - memcpy(code, previous, CU2BYTES(len)); - - /* Ensure there is enough workspace for forward references before - copying them. */ - - while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - - (this_hwm_offset - base_hwm_offset)) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - - for (hc = (PCRE2_UCHAR *)cb->start_workspace + base_hwm_offset; - hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; - hc += LINK_SIZE) - { - PUT(cb->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); - cb->hwm += LINK_SIZE; - } - base_hwm_offset = this_hwm_offset; - code += len; - } - - /* Now chain through the pending brackets, and fill in their length - fields (which are holding the chain links pro tem). */ - - while (bralink != NULL) - { - int oldlinkoffset; - int offset = (int)(code - bralink + 1); - PCRE2_UCHAR *bra = code - offset; - oldlinkoffset = GET(bra, 1); - bralink = (oldlinkoffset == 0)? 
NULL : bralink - oldlinkoffset; - *code++ = OP_KET; - PUTINC(code, 0, offset); - PUT(bra, 1, offset); - } - } - - /* If the maximum is unlimited, set a repeater in the final copy. For - ONCE brackets, that's all we need to do. However, possessively repeated - ONCE brackets can be converted into non-capturing brackets, as the - behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to - deal with possessive ONCEs specially. - - Otherwise, when we are doing the actual compile phase, check to see - whether this group is one that could match an empty string. If so, - convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so - that runtime checking can be done. [This check is also applied to ONCE - groups at runtime, but in a different way.] - - Then, if the quantifier was possessive and the bracket is not a - conditional, we convert the BRA code to the POS form, and the KET code to - KETRPOS. (It turns out to be convenient at runtime to detect this kind of - subpattern at both the start and at the end.) The use of special opcodes - makes it possible to reduce greatly the stack usage in pcre_exec(). If - the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. - - Then, if the minimum number of matches is 1 or 0, cancel the possessive - flag so that the default action below, of wrapping everything inside - atomic brackets, does not happen. When the minimum is greater than 1, - there will be earlier copies of the group, and so we still have to wrap - the whole thing. */ - - else - { - PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE; - PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1); - - /* Convert possessive ONCE brackets to non-capturing */ - - if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && - possessive_quantifier) *bracode = OP_BRA; - - /* For non-possessive ONCE brackets, all we need to do is to - set the KET. 
*/ - - if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC) - *ketcode = OP_KETRMAX + repeat_type; - - /* Handle non-ONCE brackets and possessive ONCEs (which have been - converted to non-capturing above). */ - - else - { - /* In the compile phase, check for empty string matching. */ - - if (lengthptr == NULL) - { - PCRE2_UCHAR *scode = bracode; - do - { - if (could_be_empty_branch(scode, ketcode, utf, cb, NULL)) - { - *bracode += OP_SBRA - OP_BRA; - break; - } - scode += GET(scode, 1); - } - while (*scode == OP_ALT); - } - - /* Handle possessive quantifiers. */ - - if (possessive_quantifier) - { - /* For COND brackets, we wrap the whole thing in a possessively - repeated non-capturing bracket, because we have not invented POS - versions of the COND opcodes. Because we are moving code along, we - must ensure that any pending recursive references are updated. */ - - if (*bracode == OP_COND || *bracode == OP_SCOND) - { - int nlen = (int)(code - bracode); - *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, item_hwm_offset); - memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); - code += 1 + LINK_SIZE; - nlen += 1 + LINK_SIZE; - *bracode = OP_BRAPOS; - *code++ = OP_KETRPOS; - PUTINC(code, 0, nlen); - PUT(bracode, 1, nlen); - } - - /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ - - else - { - *bracode += 1; /* Switch to xxxPOS opcodes */ - *ketcode = OP_KETRPOS; - } - - /* If the minimum is zero, mark it as possessive, then unset the - possessive flag when the minimum is 0 or 1. */ - - if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; - if (repeat_min < 2) possessive_quantifier = FALSE; - } - - /* Non-possessive quantifier */ - - else *ketcode = OP_KETRMAX + repeat_type; - } - } - } - - /* If previous is OP_FAIL, it was generated by an empty class [] - (PCRE2_ALLOW_EMPTY_CLASS is set). 
The other ways in which OP_FAIL can be - generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a - "nothing to repeat" error above. We can just ignore the repeat in empty - class case. */ - - else if (*previous == OP_FAIL) goto END_REPEAT; - - /* Else there's some kind of shambles */ - - else - { - *errorcodeptr = ERR10; - goto FAILED; - } - - /* If the character following a repeat is '+', possessive_quantifier is - TRUE. For some opcodes, there are special alternative opcodes for this - case. For anything else, we wrap the entire repeated item inside OP_ONCE - brackets. Logically, the '+' notation is just syntactic sugar, taken from - Sun's Java package, but the special opcodes can optimize it. - - Some (but not all) possessively repeated subpatterns have already been - completely handled in the code just above. For them, possessive_quantifier - is always FALSE at this stage. Note that the repeated item starts at - tempcode, not at previous, which might be the first part of a string whose - (former) last char we repeated. */ - - if (possessive_quantifier) - { - int len; - - /* Possessifying an EXACT quantifier has no effect, so we can ignore it. - However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, - {5,}, or {5,10}). We skip over an EXACT item; if the length of what - remains is greater than zero, there's a further opcode that can be - handled. If not, do nothing, leaving the EXACT alone. */ - - switch(*tempcode) - { - case OP_TYPEEXACT: - tempcode += PRIV(OP_lengths)[*tempcode] + - ((tempcode[1 + IMM2_SIZE] == OP_PROP - || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); - break; - - /* CHAR opcodes are used for exacts whose count is 1. 
*/ - - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - tempcode += PRIV(OP_lengths)[*tempcode]; -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(tempcode[-1])) - tempcode += GET_EXTRALEN(tempcode[-1]); -#endif - break; - - /* For the class opcodes, the repeat operator appears at the end; - adjust tempcode to point to it. */ - - case OP_CLASS: - case OP_NCLASS: - tempcode += 1 + 32/sizeof(PCRE2_UCHAR); - break; - -#ifdef SUPPORT_WIDE_CHARS - case OP_XCLASS: - tempcode += GET(tempcode, 1); - break; -#endif - } - - /* If tempcode is equal to code (which points to the end of the repeated - item), it means we have skipped an EXACT item but there is no following - QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In - all other cases, tempcode will be pointing to the repeat opcode, and will - be less than code, so the value of len will be greater than 0. */ - - len = (int)(code - tempcode); - if (len > 0) - { - unsigned int repcode = *tempcode; - - /* There is a table for possessifying opcodes, all of which are less - than OP_CALLOUT. A zero entry means there is no possessified version. - */ - - if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) - *tempcode = opcode_possessify[repcode]; - - /* For opcode without a special possessified version, wrap the item in - ONCE brackets. Because we are moving code along, we must ensure that - any pending recursive references are updated. */ - - else - { - *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, item_hwm_offset); - memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); - code += 1 + LINK_SIZE; - len += 1 + LINK_SIZE; - tempcode[0] = OP_ONCE; - *code++ = OP_KET; - PUTINC(code, 0, len); - PUT(tempcode, 1, len); - } - } - } - - /* In all case we no longer have a previous item. 
We also set the - "follows varying string" flag for subsequently encountered reqcus if - it isn't already set and we have just passed a varying length item. */ - - END_REPEAT: - previous = NULL; - cb->req_varyopt |= reqvary; - break; - - - /* ===================================================================*/ - /* Start of nested parenthesized sub-expression, or comment or lookahead or - lookbehind or option setting or condition or all the other extended - parenthesis forms. We must save the current high-water-mark for the - forward reference list so that we know where they start for this group. - However, because the list may be extended when there are very many forward - references (usually the result of a replicated inner group), we must use - an offset rather than an absolute address. */ - - case CHAR_LEFT_PARENTHESIS: - ptr++; - - /* First deal with comments. Putting this code right at the start ensures - that comments have no bad side effects. */ - - if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN) - { - ptr += 2; - while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - *errorcodeptr = ERR18; - goto FAILED; - } - continue; - } - - /* Now deal with various "verbs" that can be introduced by '*'. */ - - if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' - || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0)))) - { - int i, namelen; - int arglen = 0; - const char *vn = verbnames; - PCRE2_SPTR name = ptr + 1; - PCRE2_SPTR arg = NULL; - previous = NULL; - ptr++; - while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_letter) != 0) ptr++; - namelen = (int)(ptr - name); - - /* It appears that Perl allows any characters whatsoever, other than - a closing parenthesis, to appear in arguments, so we no longer insist on - letters, digits, and underscores. 
*/ - - if (*ptr == CHAR_COLON) - { - arg = ++ptr; - while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - arglen = (int)(ptr - arg); - if ((unsigned int)arglen > MAX_MARK) - { - *errorcodeptr = ERR76; - goto FAILED; - } - } - - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - *errorcodeptr = ERR60; - goto FAILED; - } - - /* Scan the table of verb names */ - - for (i = 0; i < verbcount; i++) - { - if (namelen == verbs[i].len && - PRIV(strncmp_c8)(name, vn, namelen) == 0) - { - int setverb; - - /* Check for open captures before ACCEPT and convert it to - ASSERT_ACCEPT if in an assertion. */ - - if (verbs[i].op == OP_ACCEPT) - { - open_capitem *oc; - if (arglen != 0) - { - *errorcodeptr = ERR59; - goto FAILED; - } - cb->had_accept = TRUE; - for (oc = cb->open_caps; oc != NULL; oc = oc->next) - { - *code++ = OP_CLOSE; - PUT2INC(code, 0, oc->number); - } - setverb = *code++ = - (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; - - /* Do not set firstcu after *ACCEPT */ - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - } - - /* Handle other cases with/without an argument */ - - else if (arglen == 0) - { - if (verbs[i].op < 0) /* Argument is mandatory */ - { - *errorcodeptr = ERR66; - goto FAILED; - } - setverb = *code++ = verbs[i].op; - } - - else - { - if (verbs[i].op_arg < 0) /* Argument is forbidden */ - { - *errorcodeptr = ERR59; - goto FAILED; - } - setverb = *code++ = verbs[i].op_arg; - *code++ = arglen; - memcpy(code, arg, CU2BYTES(arglen)); - code += arglen; - *code++ = 0; - } - - switch (setverb) - { - case OP_THEN: - case OP_THEN_ARG: - cb->external_flags |= PCRE2_HASTHEN; - break; - - case OP_PRUNE: - case OP_PRUNE_ARG: - case OP_SKIP: - case OP_SKIP_ARG: - cb->had_pruneorskip = TRUE; - break; - } - - break; /* Found verb, exit loop */ - } - - vn += verbs[i].len + 1; - } - - if (i < verbcount) continue; /* Successfully handled a verb */ - *errorcodeptr = ERR60; /* Verb not recognized */ - goto FAILED; - } - - /* Initialization for "real" 
parentheses */ - - newoptions = options; - skipunits = 0; - bravalue = OP_CBRA; - reset_bracount = FALSE; - - /* Deal with the extended parentheses; all are introduced by '?', and the - appearance of any of them means that this is not a capturing group. */ - - if (*ptr == CHAR_QUESTION_MARK) - { - int i, count; - int namelen; /* Must be signed */ - uint32_t index; - uint32_t set, unset, *optset; - named_group *ng; - PCRE2_SPTR name; - PCRE2_UCHAR *slot; - - switch (*(++ptr)) - { - /* ------------------------------------------------------------ */ - case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ - reset_bracount = TRUE; - /* Fall through */ - - /* ------------------------------------------------------------ */ - case CHAR_COLON: /* Non-capturing bracket */ - bravalue = OP_BRA; - ptr++; - break; - - /* ------------------------------------------------------------ */ - case CHAR_LEFT_PARENTHESIS: - bravalue = OP_COND; /* Conditional group */ - tempptr = ptr; - - /* A condition can be an assertion, a number (referring to a numbered - group's having been set), a name (referring to a named group), or 'R', - referring to recursion. R and R&name are also permitted for - recursion tests. - - There are ways of testing a named group: (?(name)) is used by Python; - Perl 5.10 onwards uses (?( ) or (?('name')). - - There is one unfortunate ambiguity, caused by history. 'R' can be the - recursive thing or the name 'R' (and similarly for 'R' followed by - digits). We look for a name first; if not found, we try the other case. - - For compatibility with auto-callouts, we allow a callout to be - specified before a condition that is an assertion. First, check for the - syntax of a callout; if found, adjust the temporary pointer that is - used to check for an assertion condition. That's all that is needed! 
*/ - - if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C) - { - if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS) - { - for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break; - if (ptr[i] == CHAR_RIGHT_PARENTHESIS) - tempptr += i + 1; - } - else - { - uint32_t delimiter = 0; - for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) - { - if (ptr[3] == PRIV(callout_start_delims)[i]) - { - delimiter = PRIV(callout_end_delims)[i]; - break; - } - } - if (delimiter != 0) - { - for (i = 4; ptr + i < cb->end_pattern; i++) - { - if (ptr[i] == delimiter) - { - if (ptr[i+1] == delimiter) i++; - else - { - if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2; - break; - } - } - } - } - } - - /* tempptr should now be pointing to the opening parenthesis of the - assertion condition. */ - - if (*tempptr != CHAR_LEFT_PARENTHESIS) - { - *errorcodeptr = ERR28; - goto FAILED; - } - } - - /* For conditions that are assertions, check the syntax, and then exit - the switch. This will take control down to where bracketed groups - are processed. The assertion will be handled as part of the group, - but we need to identify this case because the conditional assertion may - not be quantifier. */ - - if (tempptr[1] == CHAR_QUESTION_MARK && - (tempptr[2] == CHAR_EQUALS_SIGN || - tempptr[2] == CHAR_EXCLAMATION_MARK || - (tempptr[2] == CHAR_LESS_THAN_SIGN && - (tempptr[3] == CHAR_EQUALS_SIGN || - tempptr[3] == CHAR_EXCLAMATION_MARK)))) - { - cb->iscondassert = TRUE; - break; - } - - /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all - need to skip at least 1+IMM2_SIZE bytes at the start of the group. 
*/ - - code[1+LINK_SIZE] = OP_CREF; - skipunits = 1+IMM2_SIZE; - refsign = -1; /* => not a number */ - namelen = -1; /* => not a name; must set to avoid warning */ - name = NULL; /* Always set to avoid warning */ - recno = 0; /* Always set to avoid warning */ - - /* Point at character after (?( */ - - ptr++; - - /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect - users of PCRE2 via an application can discover which release of PCRE2 - is being used. */ - - if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && - ptr[7] != CHAR_RIGHT_PARENTHESIS) - { - BOOL ge = FALSE; - int major = 0; - int minor = 0; - - ptr += 7; - if (*ptr == CHAR_GREATER_THAN_SIGN) - { - ge = TRUE; - ptr++; - } - - /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT - references its argument twice. */ - - if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) - { - *errorcodeptr = ERR79; - goto FAILED; - } - - while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0'; - if (*ptr == CHAR_DOT) - { - ptr++; - while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0'; - } - - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - *errorcodeptr = ERR79; - goto FAILED; - } - - if (ge) - code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) || - (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))? - OP_TRUE : OP_FALSE; - else - code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)? - OP_TRUE : OP_FALSE; - - ptr++; - skipunits = 1; - break; /* End of condition processing */ - } - - /* Check for a test for recursion in a named group. */ - - if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND) - { - terminator = -1; - ptr += 2; - code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ - } - - /* Check for a test for a named group's having been set, using the Perl - syntax (?( ) or (?('name'), and also allow for the original PCRE - syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). 
*/ - - else if (*ptr == CHAR_LESS_THAN_SIGN) - { - terminator = CHAR_GREATER_THAN_SIGN; - ptr++; - } - else if (*ptr == CHAR_APOSTROPHE) - { - terminator = CHAR_APOSTROPHE; - ptr++; - } - else - { - terminator = CHAR_NULL; - if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++; - else if (IS_DIGIT(*ptr)) refsign = 0; - } - - /* Handle a number */ - - if (refsign >= 0) - { - while (IS_DIGIT(*ptr)) - { - recno = recno * 10 + (int)(*ptr - CHAR_0); - ptr++; - } - } - - /* Otherwise we expect to read a name; anything else is an error. When - the referenced name is one of a number of duplicates, a different - opcode is used and it needs more memory. Unfortunately we cannot tell - whether this is the case in the first pass, so we have to allow for - more memory always. In the second pass, the additional to skipunits - happens later. */ - - else - { - if (IS_DIGIT(*ptr)) - { - *errorcodeptr = ERR44; /* Group name must start with non-digit */ - goto FAILED; - } - if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0) - { - *errorcodeptr = ERR28; /* Assertion expected */ - goto FAILED; - } - name = ptr++; - while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) - { - ptr++; - } - namelen = (int)(ptr - name); - if (lengthptr != NULL) skipunits += IMM2_SIZE; - } - - /* Check the terminator */ - - if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) || - *ptr++ != CHAR_RIGHT_PARENTHESIS) - { - ptr--; /* Error offset */ - *errorcodeptr = ERR26; /* Malformed number or name */ - goto FAILED; - } - - /* Do no further checking in the pre-compile phase. */ - - if (lengthptr != NULL) break; - - /* In the real compile we do the work of looking for the actual - reference. If refsign is not negative, it means we have a number in - recno. */ - - if (refsign >= 0) - { - if (recno <= 0) - { - *errorcodeptr = ERR35; - goto FAILED; - } - if (refsign != 0) recno = (refsign == CHAR_MINUS)? 
- cb->bracount - recno + 1 : recno + cb->bracount; - if (recno <= 0 || (uint32_t)recno > cb->final_bracount) - { - *errorcodeptr = ERR15; - goto FAILED; - } - PUT2(code, 2+LINK_SIZE, recno); - if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; - break; - } - - /* Otherwise look for the name. */ - - slot = cb->name_table; - for (i = 0; i < cb->names_found; i++) - { - if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break; - slot += cb->name_entry_size; - } - - /* Found the named subpattern. If the name is duplicated, add one to - the opcode to change CREF/RREF into DNCREF/DNRREF and insert - appropriate data values. Otherwise, just insert the unique subpattern - number. */ - - if (i < cb->names_found) - { - int offset = i; /* Offset of first name found */ - - count = 0; - for (;;) - { - recno = GET2(slot, 0); /* Number for last found */ - if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; - count++; - if (++i >= cb->names_found) break; - slot += cb->name_entry_size; - if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 || - (slot+IMM2_SIZE)[namelen] != 0) break; - } - - if (count > 1) - { - PUT2(code, 2+LINK_SIZE, offset); - PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); - skipunits += IMM2_SIZE; - code[1+LINK_SIZE]++; - } - else /* Not a duplicated name */ - { - PUT2(code, 2+LINK_SIZE, recno); - } - } - - /* If terminator == CHAR_NULL it means that the name followed directly - after the opening parenthesis [e.g. (?(abc)...] and in this case there - are some further alternatives to try. For the cases where terminator != - CHAR_NULL [things like (?( ... or (?('name')... or (?(R&name)... ] - we have now checked all the possibilities, so give an error. */ - - else if (terminator != CHAR_NULL) - { - *errorcodeptr = ERR15; - goto FAILED; - } - - /* Check for (?(R) for recursion. Allow digits after R to specify a - specific group number. 
*/ - - else if (*name == CHAR_R) - { - recno = 0; - for (i = 1; i < namelen; i++) - { - if (!IS_DIGIT(name[i])) - { - *errorcodeptr = ERR15; - goto FAILED; - } - recno = recno * 10 + name[i] - CHAR_0; - } - if (recno == 0) recno = RREF_ANY; - code[1+LINK_SIZE] = OP_RREF; /* Change test type */ - PUT2(code, 2+LINK_SIZE, recno); - } - - /* Similarly, check for the (?(DEFINE) "condition", which is always - false. During compilation we set OP_DEFINE to distinguish this from - other OP_FALSE conditions so that it can be checked for having only one - branch, but after that the opcode is changed to OP_FALSE. */ - - else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) - { - code[1+LINK_SIZE] = OP_DEFINE; - skipunits = 1; - } - - /* Reference to an unidentified subpattern. */ - - else - { - *errorcodeptr = ERR15; - goto FAILED; - } - break; - - - /* ------------------------------------------------------------ */ - case CHAR_EQUALS_SIGN: /* Positive lookahead */ - bravalue = OP_ASSERT; - cb->assert_depth += 1; - ptr++; - break; - - /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird - thing to do, but Perl allows all assertions to be quantified, and when - they contain capturing parentheses there may be a potential use for - this feature. Not that that applies to a quantified (?!) but we allow - it for uniformity. 
*/ - - /* ------------------------------------------------------------ */ - case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ - ptr++; - if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK && - ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK && - (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2))) - { - *code++ = OP_FAIL; - previous = NULL; - continue; - } - bravalue = OP_ASSERT_NOT; - cb->assert_depth += 1; - break; - - - /* ------------------------------------------------------------ */ - case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ - switch (ptr[1]) - { - case CHAR_EQUALS_SIGN: /* Positive lookbehind */ - bravalue = OP_ASSERTBACK; - cb->assert_depth += 1; - ptr += 2; - break; - - case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ - bravalue = OP_ASSERTBACK_NOT; - cb->assert_depth += 1; - ptr += 2; - break; - - /* Must be a name definition - as the syntax was checked in the - pre-pass, we can assume here that it is valid. Skip over the name - and go to handle the numbered group. */ - - default: - while (*(++ptr) != CHAR_GREATER_THAN_SIGN); - ptr++; - goto NUMBERED_GROUP; - } - break; - - - /* ------------------------------------------------------------ */ - case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ - bravalue = OP_ONCE; - ptr++; - break; - - - /* ------------------------------------------------------------ */ - case CHAR_C: /* Callout */ - previous_callout = code; /* Save for later completion */ - after_manual_callout = 1; /* Skip one item before completing */ - ptr++; /* Character after (?C */ - - /* A callout may have a string argument, delimited by one of a fixed - number of characters, or an undelimited numerical argument, or no - argument, which is the same as (?C0). Different opcodes are used for - the two cases. 
*/ - - if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr)) - { - uint32_t delimiter = 0; - - for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) - { - if (*ptr == PRIV(callout_start_delims)[i]) - { - delimiter = PRIV(callout_end_delims)[i]; - break; - } - } - - if (delimiter == 0) - { - *errorcodeptr = ERR82; - goto FAILED; - } - - /* During the pre-compile phase, we parse the string and update the - length. There is no need to generate any code. */ - - if (lengthptr != NULL) /* Only check the string */ - { - PCRE2_SPTR start = ptr; - do - { - if (++ptr >= cb->end_pattern) - { - *errorcodeptr = ERR81; - ptr = start; /* To give a more useful message */ - goto FAILED; - } - if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2; - } - while (ptr[0] != delimiter); - - /* Start points to the opening delimiter, ptr points to the - closing delimiter. We must allow for including the delimiter and - for the terminating zero. Any doubled delimiters within the string - make this an overestimate, but it is not worth bothering about. */ - - (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE); - } - - /* In the real compile we can copy the string, knowing that it is - syntactically OK. The starting delimiter is included so that the - client can discover it if they want. We also pass the start offset to - help a script language give better error messages. 
*/ - - else - { - PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); - *callout_string++ = *ptr++; - PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */ - for(;;) - { - if (*ptr == delimiter) - { - if (ptr[1] == delimiter) ptr++; else break; - } - *callout_string++ = *ptr++; - } - *callout_string++ = CHAR_NULL; - code[0] = OP_CALLOUT_STR; - PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */ - PUT(code, 1 + LINK_SIZE, 0); /* Default length */ - PUT(code, 1 + 2*LINK_SIZE, /* Compute size */ - (int)(callout_string - code)); - code = callout_string; - } - - /* Advance to what should be the closing parenthesis, which is - checked below. */ - - ptr++; - } - - /* Handle a callout with an optional numerical argument, which must be - less than or equal to 255. A missing argument gives 0. */ - - else - { - int n = 0; - code[0] = OP_CALLOUT; /* Numerical callout */ - while (IS_DIGIT(*ptr)) - { - n = n * 10 + *ptr++ - CHAR_0; - if (n > 255) - { - *errorcodeptr = ERR38; - goto FAILED; - } - } - PUT(code, 1, (int)(ptr - cb->start_pattern + 1)); /* Next offset */ - PUT(code, 1 + LINK_SIZE, 0); /* Default length */ - code[1 + 2*LINK_SIZE] = n; /* Callout number */ - code += PRIV(OP_lengths)[OP_CALLOUT]; - } - - /* Both formats must have a closing parenthesis */ - - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - *errorcodeptr = ERR39; - goto FAILED; - } - - /* Callouts cannot be quantified. 
*/ - - previous = NULL; - continue; - - - /* ------------------------------------------------------------ */ - case CHAR_P: /* Python-style named subpattern handling */ - if (*(++ptr) == CHAR_EQUALS_SIGN || - *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */ - { - is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; - terminator = CHAR_RIGHT_PARENTHESIS; - goto NAMED_REF_OR_RECURSE; - } - else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ - { - *errorcodeptr = ERR41; - goto FAILED; - } - /* Fall through to handle (?P< as (?< is handled */ - - - /* ------------------------------------------------------------ */ - case CHAR_APOSTROPHE: /* Define a name - note fall through above */ - - /* The syntax was checked and the list of names was set up in the - pre-pass, so there is nothing to be done now except to skip over the - name. */ - - terminator = (*ptr == CHAR_LESS_THAN_SIGN)? - CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; - while (*(++ptr) != (unsigned int)terminator); - ptr++; - goto NUMBERED_GROUP; /* Set up numbered group */ - - - /* ------------------------------------------------------------ */ - case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ - terminator = CHAR_RIGHT_PARENTHESIS; - is_recurse = TRUE; - /* Fall through */ - - /* We come here from the Python syntax above that handles both - references (?P=name) and recursion (?P>name), as well as falling - through from the Perl recursion syntax (?&name). We also come here from - the Perl \k or \k'name' back reference syntax and the \k{name} - .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ - - NAMED_REF_OR_RECURSE: - name = ++ptr; - if (IS_DIGIT(*ptr)) - { - *errorcodeptr = ERR44; /* Group name must start with non-digit */ - goto FAILED; - } - while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = (int)(ptr - name); - - /* In the pre-compile phase, do a syntax check. 
*/ - - if (lengthptr != NULL) - { - if (namelen == 0) - { - *errorcodeptr = ERR62; - goto FAILED; - } - if (*ptr != (PCRE2_UCHAR)terminator) - { - *errorcodeptr = ERR42; - goto FAILED; - } - if (namelen > MAX_NAME_SIZE) - { - *errorcodeptr = ERR48; - goto FAILED; - } - } - - /* Scan the list of names generated in the pre-pass in order to get - a number and whether or not this name is duplicated. */ - - recno = 0; - is_dupname = FALSE; - ng = cb->named_groups; - - for (i = 0; i < cb->names_found; i++, ng++) - { - if (namelen == ng->length && - PRIV(strncmp)(name, ng->name, namelen) == 0) - { - open_capitem *oc; - is_dupname = ng->isdup; - recno = ng->number; - - /* For a recursion, that's all that is needed. We can now go to the - code that handles numerical recursion. */ - - if (is_recurse) goto HANDLE_RECURSION; - - /* For a back reference, update the back reference map and the - maximum back reference. Then for each group we must check to see if - it is recursive, that is, it is inside the group that it - references. A flag is set so that the group can be made atomic. */ - - cb->backref_map |= (recno < 32)? (1u << recno) : 1; - if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; - - for (oc = cb->open_caps; oc != NULL; oc = oc->next) - { - if (oc->number == recno) - { - oc->flag = TRUE; - break; - } - } - } - } - - /* If the name was not found we have a bad reference. */ - - if (recno == 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - - /* If a back reference name is not duplicated, we can handle it as a - numerical reference. */ - - if (!is_dupname) goto HANDLE_REFERENCE; - - /* If a back reference name is duplicated, we generate a different - opcode to a numerical back reference. In the second pass we must search - for the index and count in the final name table. 
*/ - - count = 0; - index = 0; - - if (lengthptr == NULL) - { - slot = cb->name_table; - for (i = 0; i < cb->names_found; i++) - { - if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 && - slot[IMM2_SIZE+namelen] == 0) - { - if (count == 0) index = i; - count++; - } - slot += cb->name_entry_size; - } - - if (count == 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - } - - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; - PUT2INC(code, 0, index); - PUT2INC(code, 0, count); - continue; /* End of back ref handling */ - - - /* ------------------------------------------------------------ */ - case CHAR_R: /* Recursion */ - ptr++; /* Same as (?0) */ - /* Fall through */ - - - /* ------------------------------------------------------------ */ - case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ - case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: - case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: - { - PCRE2_SPTR called; - terminator = CHAR_RIGHT_PARENTHESIS; - - /* Come here from the \g<...> and \g'...' code (Oniguruma - compatibility). However, the syntax has been checked to ensure that - the ... are a (signed) number, so that neither ERR63 nor ERR29 will - be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY - ever be taken. 
*/ - - HANDLE_NUMERICAL_RECURSION: - - if ((refsign = *ptr) == CHAR_PLUS) - { - ptr++; - if (!IS_DIGIT(*ptr)) - { - *errorcodeptr = ERR63; - goto FAILED; - } - } - else if (refsign == CHAR_MINUS) - { - if (!IS_DIGIT(ptr[1])) - goto OTHER_CHAR_AFTER_QUERY; - ptr++; - } - - recno = 0; - while (IS_DIGIT(*ptr)) - { - if (recno > INT_MAX / 10 - 1) /* Integer overflow */ - { - while (IS_DIGIT(*ptr)) ptr++; - *errorcodeptr = ERR61; - goto FAILED; - } - recno = recno * 10 + *ptr++ - CHAR_0; - } - - if (*ptr != (PCRE2_UCHAR)terminator) - { - *errorcodeptr = ERR29; - goto FAILED; - } - - if (refsign == CHAR_MINUS) - { - if (recno == 0) - { - *errorcodeptr = ERR58; - goto FAILED; - } - recno = cb->bracount - recno + 1; - if (recno <= 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - } - else if (refsign == CHAR_PLUS) - { - if (recno == 0) - { - *errorcodeptr = ERR58; - goto FAILED; - } - recno += cb->bracount; - } - - /* Come here from code above that handles a named recursion */ - - HANDLE_RECURSION: - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - called = cb->start_code; - - /* When we are actually compiling, find the bracket that is being - referenced. Temporarily end the regex in case it doesn't exist before - this point. If we end up with a forward reference, first check that - the bracket does occur later so we can give the error (and position) - now. Then remember this forward reference in the workspace so it can - be filled in at the end. */ - - if (lengthptr == NULL) - { - *code = OP_END; - if (recno != 0) - called = PRIV(find_bracket)(cb->start_code, utf, recno); - - /* Forward reference */ - - if (called == NULL) - { - if ((uint32_t)recno > cb->final_bracount) - { - *errorcodeptr = ERR15; - goto FAILED; - } - - /* Fudge the value of "called" so that when it is inserted as an - offset below, what it actually inserted is the reference number - of the group. 
Then remember the forward reference, expanding the - working space where the list is kept if necessary. */ - - called = cb->start_code + recno; - if (cb->hwm >= cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - PUTINC(cb->hwm, 0, (int)(code + 1 - cb->start_code)); - } - - /* If not a forward reference, and the subpattern is still open, - this is a recursive call. We check to see if this is a left - recursion that could loop for ever, and diagnose that case. We - must not, however, do this check if we are in a conditional - subpattern because the condition might be testing for recursion in - a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid. - Forever loops are also detected at runtime, so those that occur in - conditional subpatterns will be picked up then. */ - - else if (GET(called, 1) == 0 && cond_depth <= 0 && - could_be_empty(called, code, bcptr, utf, cb)) - { - *errorcodeptr = ERR40; - goto FAILED; - } - } - - /* Insert the recursion/subroutine item. It does not have a set first - character (relevant if it is repeated, because it will then be - wrapped with ONCE brackets). 
*/ - - *code = OP_RECURSE; - PUT(code, 1, (int)(called - cb->start_code)); - code += 1 + LINK_SIZE; - groupsetfirstcu = FALSE; - } - - /* Can't determine a first byte now */ - - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - continue; - - - /* ------------------------------------------------------------ */ - default: /* Other characters: check option setting */ - OTHER_CHAR_AFTER_QUERY: - set = unset = 0; - optset = &set; - - while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) - { - switch (*ptr++) - { - case CHAR_MINUS: optset = &unset; break; - - case CHAR_J: /* Record that it changed in the external options */ - *optset |= PCRE2_DUPNAMES; - cb->external_flags |= PCRE2_JCHANGED; - break; - - case CHAR_i: *optset |= PCRE2_CASELESS; break; - case CHAR_m: *optset |= PCRE2_MULTILINE; break; - case CHAR_s: *optset |= PCRE2_DOTALL; break; - case CHAR_x: *optset |= PCRE2_EXTENDED; break; - case CHAR_U: *optset |= PCRE2_UNGREEDY; break; - - default: *errorcodeptr = ERR11; - ptr--; /* Correct the offset */ - goto FAILED; - } - } - - /* Set up the changed option bits, but don't change anything yet. */ - - newoptions = (options | set) & (~unset); - - /* If the options ended with ')' this is not the start of a nested - group with option changes, so the options change at this level. If this - item is right at the start of the pattern, the options can be - abstracted and made external in the pre-compile phase, and ignored in - the compile phase. This can be helpful when matching -- for instance in - caseless checking of required bytes. - - If the code pointer is not (cb->start_code + 1 + LINK_SIZE), we are - definitely *not* at the start of the pattern because something has been - compiled. In the pre-compile phase, however, the code pointer can have - that value after the start, because it gets reset as code is discarded - during the pre-compile. 
However, this can happen only at top level - if - we are within parentheses, the starting BRA will still be present. At - any parenthesis level, the length value can be used to test if anything - has been compiled at that level. Thus, a test for both these conditions - is necessary to ensure we correctly detect the start of the pattern in - both phases. - - If we are not at the pattern start, reset the greedy defaults and the - case value for firstcu and reqcu. */ - - if (*ptr == CHAR_RIGHT_PARENTHESIS) - { - if (code == cb->start_code + 1 + LINK_SIZE && - (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) - { - cb->external_options = newoptions; - } - else - { - greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0); - greedy_non_default = greedy_default ^ 1; - req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0; - } - - /* Change options at this level, and pass them back for use - in subsequent branches. */ - - *optionsptr = options = newoptions; - previous = NULL; /* This item can't be repeated */ - continue; /* It is complete */ - } - - /* If the options ended with ':' we are heading into a nested group - with possible change of options. Such groups are non-capturing and are - not assertions of any kind. All we need to do is skip over the ':'; - the newoptions value is handled below. */ - - bravalue = OP_BRA; - ptr++; - } /* End of switch for character following (? */ - } /* End of (? handling */ - - /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE - is set, all unadorned brackets become non-capturing and behave like (?:...) - brackets. */ - - else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0) - { - bravalue = OP_BRA; - } - - /* Else we have a capturing group. */ - - else - { - NUMBERED_GROUP: - cb->bracount += 1; - PUT2(code, 1+LINK_SIZE, cb->bracount); - skipunits = IMM2_SIZE; - } - - /* Process nested bracketed regex. First check for parentheses nested too - deeply. 
*/ - - if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit)) - { - *errorcodeptr = ERR19; - goto FAILED; - } - - /* All assertions used not to be repeatable, but this was changed for Perl - compatibility. All kinds can now be repeated except for assertions that are - conditions (Perl also forbids these to be repeated). We copy code into a - non-register variable (tempcode) in order to be able to pass its address - because some compilers complain otherwise. At the start of a conditional - group whose condition is an assertion, cb->iscondassert is set. We unset it - here so as to allow assertions later in the group to be quantified. */ - - if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && - cb->iscondassert) - { - previous = NULL; - cb->iscondassert = FALSE; - } - else - { - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - } - - *code = bravalue; - tempcode = code; - tempreqvary = cb->req_varyopt; /* Save value before bracket */ - tempbracount = cb->bracount; /* Save value before bracket */ - length_prevgroup = 0; /* Initialize for pre-compile phase */ - - if (!compile_regex( - newoptions, /* The complete new option state */ - &tempcode, /* Where to put code (updated) */ - &ptr, /* Input pointer (updated) */ - errorcodeptr, /* Where to put an error message */ - (bravalue == OP_ASSERTBACK || - bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ - reset_bracount, /* True if (?| group */ - skipunits, /* Skip over bracket number */ - cond_depth + - ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ - &subfirstcu, /* For possible first char */ - &subfirstcuflags, - &subreqcu, /* For possible last char */ - &subreqcuflags, - bcptr, /* Current branch chain */ - cb, /* Compile data block */ - (lengthptr == NULL)? 
NULL : /* Actual compile phase */ - &length_prevgroup /* Pre-compile phase */ - )) - goto FAILED; - - cb->parens_depth -= 1; - - /* If this was an atomic group and there are no capturing groups within it, - generate OP_ONCE_NC instead of OP_ONCE. */ - - if (bravalue == OP_ONCE && cb->bracount <= tempbracount) - *code = OP_ONCE_NC; - - if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) - cb->assert_depth -= 1; - - /* At the end of compiling, code is still pointing to the start of the - group, while tempcode has been updated to point past the end of the group. - The pattern pointer (ptr) is on the bracket. - - If this is a conditional bracket, check that there are no more than - two branches in the group, or just one if it's a DEFINE group. We do this - in the real compile phase, not in the pre-pass, where the whole group may - not be available. */ - - if (bravalue == OP_COND && lengthptr == NULL) - { - PCRE2_UCHAR *tc = code; - int condcount = 0; - - do { - condcount++; - tc += GET(tc,1); - } - while (*tc != OP_KET); - - /* A DEFINE group is never obeyed inline (the "condition" is always - false). It must have only one branch. Having checked this, change the - opcode to OP_FALSE. */ - - if (code[LINK_SIZE+1] == OP_DEFINE) - { - if (condcount > 1) - { - *errorcodeptr = ERR54; - goto FAILED; - } - code[LINK_SIZE+1] = OP_FALSE; - bravalue = OP_DEFINE; /* Just a flag to suppress char handling below */ - } - - /* A "normal" conditional group. If there is just one branch, we must not - make use of its firstcu or reqcu, because this is equivalent to an - empty second branch. 
*/ - - else - { - if (condcount > 2) - { - *errorcodeptr = ERR27; - goto FAILED; - } - if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE; - } - } - - /* Error if hit end of pattern */ - - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - *errorcodeptr = ERR14; - goto FAILED; - } - - /* In the pre-compile phase, update the length by the length of the group, - less the brackets at either end. Then reduce the compiled code to just a - set of non-capturing brackets so that it doesn't use much memory if it is - duplicated by a quantifier.*/ - - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - code++; /* This already contains bravalue */ - PUTINC(code, 0, 1 + LINK_SIZE); - *code++ = OP_KET; - PUTINC(code, 0, 1 + LINK_SIZE); - break; /* No need to waste time with special character handling */ - } - - /* Otherwise update the main code pointer to the end of the group. */ - - code = tempcode; - - /* For a DEFINE group, required and first character settings are not - relevant. */ - - if (bravalue == OP_DEFINE) break; - - /* Handle updating of the required and first characters for other types of - group. Update for normal brackets of all kinds, and conditions with two - branches (see code above). If the bracket is followed by a quantifier with - zero repeat, we have to back off. Hence the definition of zeroreqcu and - zerofirstcu outside the main loop so that they can be accessed for the - back off. */ - - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - groupsetfirstcu = FALSE; - - if (bravalue >= OP_ONCE) - { - /* If we have not yet set a firstcu in this branch, take it from the - subpattern, remembering that it was set here so that a repeat of more - than one can replicate it as reqcu if necessary. 
If the subpattern has - no firstcu, set "none" for the whole branch. In both cases, a zero - repeat forces firstcu to "none". */ - - if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET) - { - if (subfirstcuflags >= 0) - { - firstcu = subfirstcu; - firstcuflags = subfirstcuflags; - groupsetfirstcu = TRUE; - } - else firstcuflags = REQ_NONE; - zerofirstcuflags = REQ_NONE; - } - - /* If firstcu was previously set, convert the subpattern's firstcu - into reqcu if there wasn't one, using the vary flag that was in - existence beforehand. */ - - else if (subfirstcuflags >= 0 && subreqcuflags < 0) - { - subreqcu = subfirstcu; - subreqcuflags = subfirstcuflags | tempreqvary; - } - - /* If the subpattern set a required byte (or set a first byte that isn't - really the first byte - see above), set it. */ - - if (subreqcuflags >= 0) - { - reqcu = subreqcu; - reqcuflags = subreqcuflags; - } - } - - /* For a forward assertion, we take the reqcu, if set. This can be - helpful if the pattern that follows the assertion doesn't set a different - char. For example, it's useful for /(?=abcde).+/. We can't set firstcu - for an assertion, however because it leads to incorrect effect for patterns - such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead - of a firstcu. This is overcome by a scan at the end if there's no - firstcu, looking for an asserted first char. */ - - else if (bravalue == OP_ASSERT && subreqcuflags >= 0) - { - reqcu = subreqcu; - reqcuflags = subreqcuflags; - } - break; /* End of processing '(' */ - - - /* ===================================================================*/ - /* Handle metasequences introduced by \. For ones like \d, the ESC_ values - are arranged to be the negation of the corresponding OP_values in the - default case when PCRE2_UCP is not set. For the back references, the values - are negative the reference number. Only back references and those types - that consume a character may be repeated. 
We can test for values between - ESC_b and ESC_Z for the latter; this may have to change if any new ones are - ever created. */ - - case CHAR_BACKSLASH: - tempptr = ptr; - escape = check_escape(&ptr, &ec, errorcodeptr, options, FALSE, cb); - if (*errorcodeptr != 0) goto FAILED; - - if (escape == 0) /* The escape coded a single character */ - c = ec; - else - { - if (escape == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) - ptr += 2; /* avoid empty string */ - else inescq = TRUE; - continue; - } - - if (escape == ESC_E) continue; /* Perl ignores an orphan \E */ - - /* For metasequences that actually match a character, we disable the - setting of a first character if it hasn't already been set. */ - - if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z) - firstcuflags = REQ_NONE; - - /* Set values to reset to if this is followed by a zero repeat. */ - - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* \g or \g'name' is a subroutine call by name and \g or \g'n' - is a subroutine call by number (Oniguruma syntax). In fact, the value - ESC_g is returned only for these cases. So we don't need to check for < - or ' if the value is ESC_g. For the Perl syntax \g{n} the value is - -n, and for the Perl syntax \g{name} the result is ESC_k (as - that is a synonym for a named back reference). */ - - if (escape == ESC_g) - { - PCRE2_SPTR p; - uint32_t cf; - - terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? - CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; - - /* These two statements stop the compiler for warning about possibly - unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In - fact, because we do the check for a number below, the paths that - would actually be in error are never taken. */ - - skipunits = 0; - reset_bracount = FALSE; - - /* If it's not a signed or unsigned number, treat it as a name. 
*/ - - cf = ptr[1]; - if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf)) - { - is_recurse = TRUE; - goto NAMED_REF_OR_RECURSE; - } - - /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus - or a digit. */ - - p = ptr + 2; - while (IS_DIGIT(*p)) p++; - if (*p != (PCRE2_UCHAR)terminator) - { - *errorcodeptr = ERR57; - break; - } - ptr++; - goto HANDLE_NUMERICAL_RECURSION; - } - - /* \k or \k'name' is a back reference by name (Perl syntax). - We also support \k{name} (.NET syntax). */ - - if (escape == ESC_k) - { - if ((ptr[1] != CHAR_LESS_THAN_SIGN && - ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) - { - *errorcodeptr = ERR69; - break; - } - is_recurse = FALSE; - terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? - CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? - CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; - goto NAMED_REF_OR_RECURSE; - } - - /* Back references are handled specially; must disable firstcu if - not set to cope with cases like (?=(\w+))\1: which would otherwise set - ':' later. */ - - if (escape < 0) - { - open_capitem *oc; - recno = -escape; - - /* Come here from named backref handling when the reference is to a - single group (i.e. not to a duplicated name). */ - - HANDLE_REFERENCE: - if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; - PUT2INC(code, 0, recno); - cb->backref_map |= (recno < 32)? (1u << recno) : 1; - if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno; - - /* Check to see if this back reference is recursive, that it, it - is inside the group that it references. A flag is set so that the - group can be made atomic. */ - - for (oc = cb->open_caps; oc != NULL; oc = oc->next) - { - if (oc->number == recno) - { - oc->flag = TRUE; - break; - } - } - } - - /* So are Unicode property matches, if supported. 
*/ - -#ifdef SUPPORT_UNICODE - else if (escape == ESC_P || escape == ESC_p) - { - BOOL negated; - unsigned int ptype = 0, pdata = 0; - if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb)) - goto FAILED; - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; - *code++ = ptype; - *code++ = pdata; - } -#else - - /* If Unicode properties are not supported, \X, \P, and \p are not - allowed. */ - - else if (escape == ESC_X || escape == ESC_P || escape == ESC_p) - { - *errorcodeptr = ERR45; - goto FAILED; - } -#endif - - /* The use of \C can be locked out. */ - - else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0) - { - *errorcodeptr = ERR83; - goto FAILED; - } - - /* For the rest (including \X when Unicode properties are supported), we - can obtain the OP value by negating the escape value in the default - situation when PCRE2_UCP is not set. When it *is* set, we substitute - Unicode property tests. Note that \b and \B do a one-character - lookbehind, and \A also behaves as if it does. */ - - else - { - if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) && - cb->max_lookbehind == 0) - cb->max_lookbehind = 1; -#ifdef SUPPORT_UNICODE - if (escape >= ESC_DU && escape <= ESC_wu) - { - nestptr = ptr + 1; /* Where to resume */ - ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ - } - else -#endif - /* In non-UTF mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE - so that it works in DFA mode and in lookbehinds. */ - - { - previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; - item_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; - } - } - continue; - } - - /* We have a data character whose value is in c. In UTF-8 mode it may have - a value > 127. We set its representation in the length/buffer, and then - handle it as a data character. 
*/ - - mclength = PUTCHAR(c, mcbuffer); - goto ONE_CHAR; - - - /* ===================================================================*/ - /* Handle a literal character. It is guaranteed not to be whitespace or # - when the extended flag is set. If we are in a UTF mode, it may be a - multi-unit literal character. */ - - default: - NORMAL_CHAR: - mclength = 1; - mcbuffer[0] = c; - -#ifdef SUPPORT_UNICODE - if (utf && HAS_EXTRALEN(c)) - ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); -#endif - - /* At this point we have the character's bytes in mcbuffer, and the length - in mclength. When not in UTF mode, the length is always 1. */ - - ONE_CHAR: - previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - - /* For caseless UTF mode, check whether this character has more than one - other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ - -#ifdef SUPPORT_UNICODE - if (utf && (options & PCRE2_CASELESS) != 0) - { - GETCHAR(c, mcbuffer); - if ((c = UCD_CASESET(c)) != 0) - { - *code++ = OP_PROP; - *code++ = PT_CLIST; - *code++ = c; - if (firstcuflags == REQ_UNSET) - firstcuflags = zerofirstcuflags = REQ_NONE; - break; - } - } -#endif - - /* Caseful matches, or not one of the multicase characters. */ - - *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR; - for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; - - /* Remember if \r or \n were seen */ - - if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) - cb->external_flags |= PCRE2_HASCRORLF; - - /* Set the first and required bytes appropriately. If no previous first - byte, set it from this character, but revert to none on a zero repeat. - Otherwise, leave the firstcu value alone, and don't change it on a zero - repeat. */ - - if (firstcuflags == REQ_UNSET) - { - zerofirstcuflags = REQ_NONE; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - - /* If the character is more than one byte long, we can set firstcu - only if it is not to be matched caselessly. 
*/ - - if (mclength == 1 || req_caseopt == 0) - { - firstcu = mcbuffer[0] | req_caseopt; - firstcu = mcbuffer[0]; - firstcuflags = req_caseopt; - - if (mclength != 1) - { - reqcu = code[-1]; - reqcuflags = cb->req_varyopt; - } - } - else firstcuflags = reqcuflags = REQ_NONE; - } - - /* firstcu was previously set; we can set reqcu only if the length is - 1 or the matching is caseful. */ - - else - { - zerofirstcu = firstcu; - zerofirstcuflags = firstcuflags; - zeroreqcu = reqcu; - zeroreqcuflags = reqcuflags; - if (mclength == 1 || req_caseopt == 0) - { - reqcu = code[-1]; - reqcuflags = req_caseopt | cb->req_varyopt; - } - } - - break; /* End of literal character handling */ - } - } /* end of big loop */ - -/* Control never reaches here by falling through, only by a goto for all the -error states. Pass back the position in the pattern so that it can be displayed -to the user for diagnosing the error. */ - -FAILED: -*ptrptr = ptr; -return FALSE; -} - - - -/************************************************* -* Compile regex: a sequence of alternatives * -*************************************************/ - -/* On entry, ptr is pointing past the bracket character, but on return it -points to the closing bracket, or vertical bar, or end of string. The code -variable is pointing at the byte into which the BRA operator has been stored. -This function is used during the pre-compile phase when we are trying to find -out the amount of memory needed, as well as during the real compile phase. The -value of lengthptr distinguishes the two phases. 
- -Arguments: - options option bits, including any changes for this subpattern - codeptr -> the address of the current code pointer - ptrptr -> the address of the current pattern pointer - errorcodeptr -> pointer to error code variable - lookbehind TRUE if this is a lookbehind assertion - reset_bracount TRUE to reset the count for each branch - skipunits skip this many code units at start (for brackets and OP_COND) - cond_depth depth of nesting for conditional subpatterns - firstcuptr place to put the first required code unit - firstcuflagsptr place to put the first code unit flags, or a negative number - reqcuptr place to put the last required code unit - reqcuflagsptr place to put the last required code unit flags, or a negative number - bcptr pointer to the chain of currently open branches - cb points to the data block with tables pointers etc. - lengthptr NULL during the real compile phase - points to length accumulator during pre-compile phase - -Returns: TRUE on success -*/ - -static BOOL -compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr, - int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits, - int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr, - uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, - compile_block *cb, size_t *lengthptr) -{ -PCRE2_SPTR ptr = *ptrptr; -PCRE2_UCHAR *code = *codeptr; -PCRE2_UCHAR *last_branch = code; -PCRE2_UCHAR *start_bracket = code; -PCRE2_UCHAR *reverse_count = NULL; -open_capitem capitem; -int capnumber = 0; -uint32_t firstcu, reqcu; -int32_t firstcuflags, reqcuflags; -uint32_t branchfirstcu, branchreqcu; -int32_t branchfirstcuflags, branchreqcuflags; -size_t length; -size_t save_hwm_offset; -unsigned int orig_bracount; -unsigned int max_bracount; -branch_chain bc; - -/* If set, call the external function that checks for stack availability. 
*/ - -if (cb->cx->stack_guard != NULL && - cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) - { - *errorcodeptr= ERR33; - return FALSE; - } - -/* Miscellaneous initialization */ - -bc.outer = bcptr; -bc.current_branch = code; - -firstcu = reqcu = 0; -firstcuflags = reqcuflags = REQ_UNSET; - -save_hwm_offset = cb->hwm - cb->start_workspace; /* hwm at start of group */ - -/* Accumulate the length for use in the pre-compile phase. Start with the -length of the BRA and KET and any extra code units that are required at the -beginning. We accumulate in a local variable to save frequent testing of -lengthptr for NULL. We cannot do this by looking at the value of 'code' at the -start and end of each alternative, because compiled items are discarded during -the pre-compile phase so that the work space is not exceeded. */ - -length = 2 + 2*LINK_SIZE + skipunits; - -/* WARNING: If the above line is changed for any reason, you must also change -the code that abstracts option settings at the start of the pattern and makes -them global. It tests the value of length for (2 + 2*LINK_SIZE) in the -pre-compile phase to find out whether or not anything has yet been compiled. - -If this is a capturing subpattern, add to the chain of open capturing items -so that we can detect them if (*ACCEPT) is encountered. This is also used to -detect groups that contain recursive back references to themselves. Note that -only OP_CBRA need be tested here; changing this opcode to one of its variants, -e.g. OP_SCBRAPOS, happens later, after the group has been compiled. 
*/ - -if (*code == OP_CBRA) - { - capnumber = GET2(code, 1 + LINK_SIZE); - capitem.number = capnumber; - capitem.next = cb->open_caps; - capitem.flag = FALSE; - cb->open_caps = &capitem; - } - -/* Offset is set zero to mark that this bracket is still open */ - -PUT(code, 1, 0); -code += 1 + LINK_SIZE + skipunits; - -/* Loop for each alternative branch */ - -orig_bracount = max_bracount = cb->bracount; - -for (;;) - { - /* For a (?| group, reset the capturing bracket count so that each branch - uses the same numbers. */ - - if (reset_bracount) cb->bracount = orig_bracount; - - /* Set up dummy OP_REVERSE if lookbehind assertion */ - - if (lookbehind) - { - *code++ = OP_REVERSE; - reverse_count = code; - PUTINC(code, 0, 0); - length += 1 + LINK_SIZE; - } - - /* Now compile the branch; in the pre-compile phase its length gets added - into the length. */ - - if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu, - &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc, - cond_depth, cb, (lengthptr == NULL)? NULL : &length)) - { - *ptrptr = ptr; - return FALSE; - } - - /* Keep the highest bracket count in case (?| was used and some branch - has fewer than the rest. */ - - if (cb->bracount > max_bracount) max_bracount = cb->bracount; - - /* In the real compile phase, there is some post-processing to be done. */ - - if (lengthptr == NULL) - { - /* If this is the first branch, the firstcu and reqcu values for the - branch become the values for the regex. */ - - if (*last_branch != OP_ALT) - { - firstcu = branchfirstcu; - firstcuflags = branchfirstcuflags; - reqcu = branchreqcu; - reqcuflags = branchreqcuflags; - } - - /* If this is not the first branch, the first char and reqcu have to - match the values from all the previous branches, except that if the - previous value for reqcu didn't have REQ_VARY set, it can still match, - and we set REQ_VARY for the regex. 
*/ - - else - { - /* If we previously had a firstcu, but it doesn't match the new branch, - we have to abandon the firstcu for the regex, but if there was - previously no reqcu, it takes on the value of the old firstcu. */ - - if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu) - { - if (firstcuflags >= 0) - { - if (reqcuflags < 0) - { - reqcu = firstcu; - reqcuflags = firstcuflags; - } - } - firstcuflags = REQ_NONE; - } - - /* If we (now or from before) have no firstcu, a firstcu from the - branch becomes a reqcu if there isn't a branch reqcu. */ - - if (firstcuflags < 0 && branchfirstcuflags >= 0 && - branchreqcuflags < 0) - { - branchreqcu = branchfirstcu; - branchreqcuflags = branchfirstcuflags; - } - - /* Now ensure that the reqcus match */ - - if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) || - reqcu != branchreqcu) - reqcuflags = REQ_NONE; - else - { - reqcu = branchreqcu; - reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */ - } - } - - /* If lookbehind, check that this branch matches a fixed-length string, and - put the length into the OP_REVERSE item. Temporarily mark the end of the - branch with OP_END. If the branch contains OP_RECURSE, the result is -3 - because there may be forward references that we can't check here. Set a - flag to cause another lookbehind check at the end. Why not do it all at the - end? Because common, erroneous checks are picked up here and the offset of - the problem can be shown. */ - - if (lookbehind) - { - int fixed_length; - *code = OP_END; - fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0, - FALSE, cb, NULL); - if (fixed_length == -3) - { - cb->check_lookbehind = TRUE; - } - else if (fixed_length < 0) - { - *errorcodeptr = (fixed_length == -2)? ERR36 : - (fixed_length == -4)? 
ERR70: ERR25; - *ptrptr = ptr; - return FALSE; - } - else - { - if (fixed_length > cb->max_lookbehind) - cb->max_lookbehind = fixed_length; - PUT(reverse_count, 0, fixed_length); - } - } - } - - /* Reached end of expression, either ')' or end of pattern. In the real - compile phase, go back through the alternative branches and reverse the chain - of offsets, with the field in the BRA item now becoming an offset to the - first alternative. If there are no alternatives, it points to the end of the - group. The length in the terminating ket is always the length of the whole - bracketed item. Return leaving the pointer at the terminating char. */ - - if (*ptr != CHAR_VERTICAL_LINE) - { - if (lengthptr == NULL) - { - size_t branch_length = code - last_branch; - do - { - size_t prev_length = GET(last_branch, 1); - PUT(last_branch, 1, branch_length); - branch_length = prev_length; - last_branch -= branch_length; - } - while (branch_length > 0); - } - - /* Fill in the ket */ - - *code = OP_KET; - PUT(code, 1, (int)(code - start_bracket)); - code += 1 + LINK_SIZE; - - /* If it was a capturing subpattern, check to see if it contained any - recursive back references. If so, we must wrap it in atomic brackets. - Because we are moving code along, we must ensure that any pending recursive - or forward subroutine references are updated. In any event, remove the - block from the chain. 
*/ - - if (capnumber > 0) - { - if (cb->open_caps->flag) - { - *code = OP_END; - adjust_recurse(start_bracket, 1 + LINK_SIZE, - (options & PCRE2_UTF) != 0, cb, save_hwm_offset); - memmove(start_bracket + 1 + LINK_SIZE, start_bracket, - CU2BYTES(code - start_bracket)); - *start_bracket = OP_ONCE; - code += 1 + LINK_SIZE; - PUT(start_bracket, 1, (int)(code - start_bracket)); - *code = OP_KET; - PUT(code, 1, (int)(code - start_bracket)); - code += 1 + LINK_SIZE; - length += 2 + 2*LINK_SIZE; - } - cb->open_caps = cb->open_caps->next; - } - - /* Retain the highest bracket number, in case resetting was used. */ - - cb->bracount = max_bracount; - - /* Set values to pass back */ - - *codeptr = code; - *ptrptr = ptr; - *firstcuptr = firstcu; - *firstcuflagsptr = firstcuflags; - *reqcuptr = reqcu; - *reqcuflagsptr = reqcuflags; - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length) - { - *errorcodeptr = ERR20; - return FALSE; - } - *lengthptr += length; - } - return TRUE; - } - - /* Another branch follows. In the pre-compile phase, we can move the code - pointer back to where it was for the start of the first branch. (That is, - pretend that each branch is the only one.) - - In the real compile phase, insert an ALT node. Its length field points back - to the previous branch while the bracket remains open. At the end the chain - is reversed. It's done like this so that the start of the bracket has a - zero offset until it is closed, making it possible to detect recursion. 
*/ - - if (lengthptr != NULL) - { - code = *codeptr + 1 + LINK_SIZE + skipunits; - length += 1 + LINK_SIZE; - } - else - { - *code = OP_ALT; - PUT(code, 1, (int)(code - last_branch)); - bc.current_branch = last_branch = code; - code += 1 + LINK_SIZE; - } - - /* Advance past the vertical bar */ - - ptr++; - } -/* Control never reaches here */ -} - - - -/************************************************* -* Check for anchored pattern * -*************************************************/ - -/* Try to find out if this is an anchored regular expression. Consider each -alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket -all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then -it's anchored. However, if this is a multiline pattern, then only OP_SOD will -be found, because ^ generates OP_CIRCM in that mode. - -We can also consider a regex to be anchored if OP_SOM starts all its branches. -This is the code for \G, which means "match at start of match position, taking -into account the match offset". - -A branch is also implicitly anchored if it starts with .* and DOTALL is set, -because that will try the rest of the pattern at all possible matching points, -so there is no point trying again.... er .... - -.... except when the .* appears inside capturing parentheses, and there is a -subsequent back reference to those parentheses. We haven't enough information -to catch that case precisely. - -At first, the best we could do was to detect when .* was in capturing brackets -and the highest back reference was greater than or equal to that level. -However, by keeping a bitmap of the first 31 back references, we can catch some -of the more common cases more precisely. - -... A second exception is when the .* appears inside an atomic group, because -this prevents the number of characters it matches from being adjusted. 
- -Arguments: - code points to start of the compiled pattern - bracket_map a bitmap of which brackets we are inside while testing; this - handles up to substring 31; after that we just have to take - the less precise approach - cb points to the compile data block - atomcount atomic group level - -Returns: TRUE or FALSE -*/ - -static BOOL -is_anchored(register PCRE2_SPTR code, unsigned int bracket_map, - compile_block *cb, int atomcount) -{ -do { - PCRE2_SPTR scode = first_significant_code( - code + PRIV(OP_lengths)[*code], FALSE); - register int op = *scode; - - /* Non-capturing brackets */ - - if (op == OP_BRA || op == OP_BRAPOS || - op == OP_SBRA || op == OP_SBRAPOS) - { - if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; - } - - /* Capturing brackets */ - - else if (op == OP_CBRA || op == OP_CBRAPOS || - op == OP_SCBRA || op == OP_SCBRAPOS) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE; - } - - /* Positive forward assertions and conditions */ - - else if (op == OP_ASSERT || op == OP_COND) - { - if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE; - } - - /* Atomic groups */ - - else if (op == OP_ONCE || op == OP_ONCE_NC) - { - if (!is_anchored(scode, bracket_map, cb, atomcount + 1)) - return FALSE; - } - - /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and - it isn't in brackets that are or may be referenced or inside an atomic - group. There is also an option that disables auto-anchoring. 
*/ - - else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR)) - { - if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) - return FALSE; - } - - /* Check for explicit anchoring */ - - else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; - - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; -} - - - -/************************************************* -* Check for starting with ^ or .* * -*************************************************/ - -/* This is called to find out if every branch starts with ^ or .* so that -"first char" processing can be done to speed things up in multiline -matching and for non-DOTALL patterns that start with .* (which must start at -the beginning or after \n). As in the case of is_anchored() (see above), we -have to take account of back references to capturing brackets that contain .* -because in that case we can't make the assumption. Also, the appearance of .* -inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not -count, because once again the assumption no longer holds. - -Arguments: - code points to start of the compiled pattern or a group - bracket_map a bitmap of which brackets we are inside while testing; this - handles up to substring 31; after that we just have to take - the less precise approach - cb points to the compile data - atomcount atomic group level - -Returns: TRUE or FALSE -*/ - -static BOOL -is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount) -{ -do { - PCRE2_SPTR scode = first_significant_code( - code + PRIV(OP_lengths)[*code], FALSE); - register int op = *scode; - - /* If we are at the start of a conditional assertion group, *both* the - conditional assertion *and* what follows the condition must satisfy the test - for start of line. 
Other kinds of condition fail. Note that there may be an - auto-callout at the start of a condition. */ - - if (op == OP_COND) - { - scode += 1 + LINK_SIZE; - - if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; - else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); - - switch (*scode) - { - case OP_CREF: - case OP_DNCREF: - case OP_RREF: - case OP_DNRREF: - case OP_FAIL: - case OP_FALSE: - case OP_TRUE: - return FALSE; - - default: /* Assertion */ - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; - do scode += GET(scode, 1); while (*scode == OP_ALT); - scode += 1 + LINK_SIZE; - break; - } - scode = first_significant_code(scode, FALSE); - op = *scode; - } - - /* Non-capturing brackets */ - - if (op == OP_BRA || op == OP_BRAPOS || - op == OP_SBRA || op == OP_SBRAPOS) - { - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; - } - - /* Capturing brackets */ - - else if (op == OP_CBRA || op == OP_CBRAPOS || - op == OP_SCBRA || op == OP_SCBRAPOS) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_startline(scode, new_map, cb, atomcount)) return FALSE; - } - - /* Positive forward assertions */ - - else if (op == OP_ASSERT) - { - if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; - } - - /* Atomic brackets */ - - else if (op == OP_ONCE || op == OP_ONCE_NC) - { - if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE; - } - - /* .* means "start at start or after \n" if it isn't in atomic brackets or - brackets that may be referenced, as long as the pattern does not contain - *PRUNE or *SKIP, because these break the feature. Consider, for example, - /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the - start of a line. There is also an option that disables this optimization. 
*/ - - else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) - { - if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) - return FALSE; - } - - /* Check for explicit circumflex; anything else gives a FALSE result. Note - in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC - because the number of characters matched by .* cannot be adjusted inside - them. */ - - else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; - - /* Move on to the next alternative */ - - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; -} - - - /************************************************* * Check for asserted fixed first code unit * *************************************************/ @@ -7911,7 +7908,7 @@ Returns: the fixed first code unit, or 0 with REQ_NONE in flags static uint32_t find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert) { -register uint32_t c = 0; +uint32_t c = 0; int cflags = REQ_NONE; *flags = REQ_NONE; @@ -7921,7 +7918,7 @@ do { int xl = (*code == OP_CBRA || *code == OP_SCBRA || *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 
IMM2_SIZE:0; PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE); - register PCRE2_UCHAR op = *scode; + PCRE2_UCHAR op = *scode; switch(op) { @@ -7994,18 +7991,19 @@ Arguments: name the name to add length the length of the name groupno the group number + tablecount the count of names in the table so far Returns: nothing */ static void add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length, - unsigned int groupno) + unsigned int groupno, uint32_t tablecount) { -int i; +uint32_t i; PCRE2_UCHAR *slot = cb->name_table; -for (i = 0; i < cb->names_found; i++) +for (i = 0; i < tablecount; i++) { int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length)); if (crc == 0 && slot[IMM2_SIZE+length] != 0) @@ -8019,7 +8017,7 @@ for (i = 0; i < cb->names_found; i++) if (crc < 0) { memmove(slot + cb->name_entry_size, slot, - CU2BYTES((cb->names_found - i) * cb->name_entry_size)); + CU2BYTES((tablecount - i) * cb->name_entry_size)); break; } @@ -8030,7 +8028,6 @@ for (i = 0; i < cb->names_found; i++) PUT2(slot, 0, groupno); memcpy(slot + IMM2_SIZE, name, CU2BYTES(length)); -cb->names_found++; /* Add a terminating zero and fill the rest of the slot with zeroes so that the memory is all initialized. Otherwise valgrind moans about uninitialized @@ -8042,6 +8039,703 @@ memset(slot + IMM2_SIZE + length, 0, +/************************************************* +* Skip in parsed pattern * +*************************************************/ + +/* This function is called to skip parts of the parsed pattern when finding the +length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find +the end of the branch, it is called to skip over an internal lookaround, and it +is also called to skip to the end of a class, during which it will never +encounter nested groups (but there's no need to have special code for that). 
+ +Arguments: + pptr current pointer to skip from + skiptype PSKIP_CLASS when skipping to end of class + PSKIP_ALT when META_ALT ends the skip + PSKIP_KET when only META_KET ends the skip + +Returns: new value of pptr + NULL if META_END is reached - should never occur + or for an unknown meta value - likewise +*/ + +static uint32_t * +parsed_skip(uint32_t *pptr, uint32_t skiptype) +{ +uint32_t nestlevel = 0; + +for (pptr += 1;; pptr++) + { + uint32_t meta = META_CODE(*pptr); + + switch(meta) + { + default: /* Just skip over most items */ + if (meta < META_END) continue; /* Literal */ + break; + + /* This should never occur. */ + + case META_END: + return NULL; + + /* The data for these items is variable in length. */ + + case META_BACKREF: /* Offset is present only if group >= 10 */ + if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET; + break; + + case META_ESCAPE: /* A few escapes are followed by data items. */ + switch (META_DATA(*pptr)) + { + case ESC_P: + case ESC_p: + pptr += 1; + break; + + case ESC_g: + case ESC_k: + pptr += 1 + SIZEOFFSET; + break; + } + break; + + case META_MARK: /* Add the length of the name. */ + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + pptr += pptr[1]; + break; + + /* These are the "active" items in this loop. */ + + case META_CLASS_END: + if (skiptype == PSKIP_CLASS) return pptr; + break; + + case META_ATOMIC: + case META_CAPTURE: + case META_COND_ASSERT: + case META_COND_DEFINE: + case META_COND_NAME: + case META_COND_NUMBER: + case META_COND_RNAME: + case META_COND_RNUMBER: + case META_COND_VERSION: + case META_LOOKAHEAD: + case META_LOOKAHEADNOT: + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + case META_NOCAPTURE: + nestlevel++; + break; + + case META_ALT: + if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr; + break; + + case META_KET: + if (nestlevel == 0) return pptr; + nestlevel--; + break; + } + + /* The extra data item length for each meta is in a table. 
*/ + + meta = (meta >> 16) & 0x7fff; + if (meta >= sizeof(meta_extra_lengths)) return NULL; + pptr += meta_extra_lengths[meta]; + } +/* Control never reaches here */ +return pptr; +} + + + +/************************************************* +* Find length of a parsed group * +*************************************************/ + +/* This is called for nested groups within a branch of a lookbehind whose +length is being computed. If all the branches in the nested group have the same +length, that is OK. On entry, the pointer must be at the first element after +the group initializing code. Caching is used to improve processing speed when +the same capturing group occurs many times. + +Arguments: + pptrptr pointer to pointer in the parsed pattern + errcodeptr pointer to the errorcode + lcptr pointer to the loop counter + group number of captured group or -1 for a non-capturing group + recurses chain of recurse_check to catch mutual recursion + cb pointer to the compile data + +Returns: the group length or a negative number +*/ + +static int +get_grouplength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, + int group, parsed_recurse_check *recurses, compile_block *cb) +{ +int branchlength; +int grouplength = -1; + +/* The cache can be used only if there is no possibility of there being two +groups with the same number. 
*/ + +if (group > 0) + { + uint32_t groupinfo = cb->groupinfo[group]; + if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0) + { + if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1; + if ((groupinfo & GI_SET_FIXED_LENGTH) != 0) + return groupinfo & GI_FIXED_LENGTH_MASK; + } + } + +/* Scan the group */ + +for(;;) + { + branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); + if (branchlength < 0) goto ISNOTFIXED; + if (grouplength == -1) grouplength = branchlength; + else if (grouplength != branchlength) goto ISNOTFIXED; + if (**pptrptr == META_KET) break; + *pptrptr += 1; /* Skip META_ALT */ + } + +if (group > 0) + cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength); +return grouplength; + +ISNOTFIXED: +if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH; +return -1; +} + + + +/************************************************* +* Find length of a parsed branch * +*************************************************/ + +/* Return a fixed length for a branch in a lookbehind, giving an error if the +length is not fixed. If any lookbehinds are encountered on the way, they get +their length set. On entry, *pptrptr points to the first element inside the +branch. On exit it is set to point to the ALT or KET. + +Arguments: + pptrptr pointer to pointer in the parsed pattern + errcodeptr pointer to error code + lcptr pointer to loop counter + recurses chain of recurse_check to catch mutual recursion + cb pointer to compile block + +Returns: the length, or a negative value on error +*/ + +static int +get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr, + parsed_recurse_check *recurses, compile_block *cb) +{ +int branchlength = 0; +int grouplength; +uint32_t lastitemlength = 0; +uint32_t *pptr = *pptrptr; +PCRE2_SIZE offset; +parsed_recurse_check this_recurse; + +/* A large and/or complex regex can take too long to process. 
This can happen +more often when (?| groups are present in the pattern because their length +cannot be cached. */ + +if ((*lcptr)++ > 2000) + { + *errcodeptr = ERR35; /* Lookbehind is too complicated */ + return -1; + } + +/* Scan the branch, accumulating the length. */ + +for (;; pptr++) + { + parsed_recurse_check *r; + uint32_t *gptr, *gptrend; + uint32_t escape; + uint32_t group = 0; + uint32_t itemlength = 0; + + if (*pptr < META_END) + { + itemlength = 1; + } + + else switch (META_CODE(*pptr)) + { + case META_KET: + case META_ALT: + goto EXIT; + + /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the + actual termination. */ + + case META_ACCEPT: + case META_FAIL: + pptr = parsed_skip(pptr, PSKIP_ALT); + if (pptr == NULL) goto PARSED_SKIP_FAILED; + goto EXIT; + + case META_MARK: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + pptr += pptr[1] + 1; + break; + + case META_CIRCUMFLEX: + case META_COMMIT: + case META_DOLLAR: + case META_PRUNE: + case META_SKIP: + case META_THEN: + break; + + case META_OPTIONS: + pptr += 1; + break; + + case META_BIGVALUE: + itemlength = 1; + pptr += 1; + break; + + case META_CLASS: + case META_CLASS_NOT: + itemlength = 1; + pptr = parsed_skip(pptr, PSKIP_CLASS); + if (pptr == NULL) goto PARSED_SKIP_FAILED; + break; + + case META_CLASS_EMPTY_NOT: + case META_DOT: + itemlength = 1; + break; + + case META_CALLOUT_NUMBER: + pptr += 3; + break; + + case META_CALLOUT_STRING: + pptr += 3 + SIZEOFFSET; + break; + + /* Only some escapes consume a character. Of those, \R and \X are never + allowed because they might match more than character. \C is allowed only in + 32-bit and non-UTF 8/16-bit modes. 
*/ + + case META_ESCAPE: + escape = META_DATA(*pptr); + if (escape == ESC_R || escape == ESC_X) return -1; + if (escape > ESC_b && escape < ESC_Z) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C) + { + *errcodeptr = ERR36; + return -1; + } +#endif + itemlength = 1; + if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */ + } + break; + + /* Lookaheads can be ignored. */ + + case META_LOOKAHEAD: + case META_LOOKAHEADNOT: + pptr = parsed_skip(pptr, PSKIP_KET); + if (pptr == NULL) goto PARSED_SKIP_FAILED; + break; + + /* Lookbehinds can be ignored, but must themselves be checked. */ + + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb)) + return -1; + break; + + /* Back references and recursions are handled by very similar code. At this + stage, the names generated in the parsing pass are available, but the main + name table has not yet been created. So for the named varieties, scan the + list of names in order to get the number of the first one in the pattern, + and whether or not this name is duplicated. */ + + case META_BACKREF_BYNAME: + if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0) + goto ISNOTFIXED; + + case META_RECURSE_BYNAME: + { + int i; + PCRE2_SPTR name; + BOOL is_dupname = FALSE; + named_group *ng = cb->named_groups; + uint32_t meta_code = META_CODE(*pptr); + uint32_t length = *(++pptr); + + GETPLUSOFFSET(offset, pptr); + name = cb->start_pattern + offset; + for (i = 0; i < cb->names_found; i++, ng++) + { + if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0) + { + group = ng->number; + is_dupname = ng->isdup; + break; + } + } + + if (group == 0) + { + *errcodeptr = ERR15; /* Non-existent subpattern */ + cb->erroroffset = offset; + return -1; + } + + /* A numerical back reference can be fixed length if duplicate capturing + groups are not being used. 
A non-duplicate named back reference can also + be handled. */ + + if (meta_code == META_RECURSE_BYNAME || + (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)) + goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */ + } + goto ISNOTFIXED; /* Duplicate name or number */ + + /* The offset values for back references < 10 are in a separate vector + because otherwise they would use more than two parsed pattern elements on + 64-bit systems. */ + + case META_BACKREF: + if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 || + (cb->external_flags & PCRE2_DUPCAPUSED) != 0) + goto ISNOTFIXED; + group = META_DATA(*pptr); + if (group < 10) + { + offset = cb->small_ref_offset[group]; + goto RECURSE_OR_BACKREF_LENGTH; + } + + /* Fall through for groups >= 10 - picking up group twice does no harm. */ + + /* A true recursion implies not fixed length, but a subroutine call may + be OK. Back reference "recursions" are also failed. */ + + case META_RECURSE: + group = META_DATA(*pptr); + GETPLUSOFFSET(offset, pptr); + + RECURSE_OR_BACKREF_LENGTH: + if (group > cb->bracount) + { + cb->erroroffset = offset; + *errcodeptr = ERR15; /* Non-existent subpattern */ + return -1; + } + if (group == 0) goto ISNOTFIXED; /* Local recursion */ + for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++) + { + if (META_CODE(*gptr) == META_BIGVALUE) gptr++; + else if (*gptr == (META_CAPTURE | group)) break; + } + + gptrend = parsed_skip(gptr, PSKIP_KET); + if (gptrend == NULL) goto PARSED_SKIP_FAILED; + if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ + for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; + if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */ + this_recurse.prev = recurses; + this_recurse.groupptr = gptr; + gptr++; + grouplength = get_grouplength(&gptr, errcodeptr, lcptr, group, + &this_recurse, cb); + if (grouplength < 0) + { + if (*errcodeptr == 0) goto ISNOTFIXED; + return -1; /* Error already set */ 
+ } + itemlength = grouplength; + break; + + /* Check nested groups - advance past the initial data for each type and + then seek a fixed length with get_grouplength(). */ + + case META_COND_NAME: + case META_COND_NUMBER: + case META_COND_RNAME: + case META_COND_RNUMBER: + case META_COND_DEFINE: + pptr += 2 + SIZEOFFSET; + goto CHECK_GROUP; + + case META_COND_ASSERT: + pptr += 1; + goto CHECK_GROUP; + + case META_COND_VERSION: + pptr += 4; + goto CHECK_GROUP; + + case META_CAPTURE: + group = META_DATA(*pptr); + /* Fall through */ + + case META_ATOMIC: + case META_NOCAPTURE: + pptr++; + CHECK_GROUP: + grouplength = get_grouplength(&pptr, errcodeptr, lcptr, group, recurses, cb); + if (grouplength < 0) return -1; + itemlength = grouplength; + break; + + /* Exact repetition is OK; variable repetition is not. A repetition of zero + must subtract the length that has already been added. */ + + case META_MINMAX: + case META_MINMAX_PLUS: + case META_MINMAX_QUERY: + if (pptr[1] == pptr[2]) + { + if (pptr[1] == 0) branchlength -= lastitemlength; + else itemlength = (pptr[1] - 1) * lastitemlength; + pptr += 2; + break; + } + /* Fall through */ + + /* Any other item means this branch does not have a fixed length. */ + + default: + ISNOTFIXED: + *errcodeptr = ERR25; /* Not fixed length */ + return -1; + } + + /* Add the item length to the branchlength, and save it for use if the next + thing is a quantifier. */ + + branchlength += itemlength; + lastitemlength = itemlength; + + /* Ensure that the length does not overflow the limit. 
*/ + + if (branchlength > LOOKBEHIND_MAX) + { + *errcodeptr = ERR87; + return -1; + } + } + +EXIT: +*pptrptr = pptr; +if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength; +return branchlength; + +PARSED_SKIP_FAILED: +*errcodeptr = ERR90; +return -1; +} + + + +/************************************************* +* Set lengths in a lookbehind * +*************************************************/ + +/* This function is called for each lookbehind, to set the lengths in its +branches. An error occurs if any branch does not have a fixed length that is +less than the maximum (65535). On exit, the pointer must be left on the final +ket. + +Arguments: + pptrptr pointer to pointer in the parsed pattern + errcodeptr pointer to error code + lcptr pointer to loop counter + recurses chain of recurse_check to catch mutual recursion + cb pointer to compile block + +Returns: TRUE if all is well + FALSE otherwise, with error code and offset set +*/ + +static BOOL +set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, + parsed_recurse_check *recurses, compile_block *cb) +{ +PCRE2_SIZE offset; +int branchlength; +uint32_t *bptr = *pptrptr; + +READPLUSOFFSET(offset, bptr); /* Offset for error messages */ +*pptrptr += SIZEOFFSET; + +do + { + *pptrptr += 1; + branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb); + if (branchlength < 0) + { + /* The errorcode and offset may already be set from a nested lookbehind. */ + if (*errcodeptr == 0) *errcodeptr = ERR25; + if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset; + return FALSE; + } + *bptr |= branchlength; /* branchlength never more than 65535 */ + bptr = *pptrptr; + } +while (*bptr == META_ALT); + +return TRUE; +} + + + +/************************************************* +* Check parsed pattern lookbehinds * +*************************************************/ + +/* This function is called at the end of parsing a pattern if any lookbehinds +were encountered. 
It scans the parsed pattern for them, calling +set_lookbehind_lengths() for each one. At the start, the errorcode is zero and +the error offset is marked unset. The enables the functions above not to +override settings from deeper nestings. + +Arguments cb points to the compile block +Returns: 0 on success, or an errorcode (cb->erroroffset will be set) +*/ + +static int +check_lookbehinds(compile_block *cb) +{ +uint32_t *pptr; +int errorcode = 0; +int loopcount = 0; + +cb->erroroffset = PCRE2_UNSET; + +for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++) + { + if (*pptr < META_END) continue; /* Literal */ + + switch (META_CODE(*pptr)) + { + default: + return ERR70; /* Unrecognized meta code */ + + case META_ESCAPE: + if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p) + pptr += 1; + break; + + case META_ACCEPT: + case META_ALT: + case META_ASTERISK: + case META_ASTERISK_PLUS: + case META_ASTERISK_QUERY: + case META_ATOMIC: + case META_BACKREF: + case META_CAPTURE: + case META_CIRCUMFLEX: + case META_CLASS: + case META_CLASS_EMPTY: + case META_CLASS_EMPTY_NOT: + case META_CLASS_END: + case META_CLASS_NOT: + case META_COMMIT: + case META_COND_ASSERT: + case META_DOLLAR: + case META_DOT: + case META_FAIL: + case META_KET: + case META_LOOKAHEAD: + case META_LOOKAHEADNOT: + case META_NOCAPTURE: + case META_PLUS: + case META_PLUS_PLUS: + case META_PLUS_QUERY: + case META_PRUNE: + case META_QUERY: + case META_QUERY_PLUS: + case META_QUERY_QUERY: + case META_RANGE_ESCAPED: + case META_RANGE_LITERAL: + case META_SKIP: + case META_THEN: + break; + + case META_RECURSE: + pptr += SIZEOFFSET; + break; + + case META_BACKREF_BYNAME: + case META_COND_DEFINE: + case META_COND_NAME: + case META_COND_NUMBER: + case META_COND_RNAME: + case META_COND_RNUMBER: + case META_RECURSE_BYNAME: + pptr += 1 + SIZEOFFSET; + break; + + case META_CALLOUT_STRING: + pptr += 3 + SIZEOFFSET; + break; + + case META_BIGVALUE: + case META_OPTIONS: + case META_POSIX: + case 
META_POSIX_NEG: + pptr += 1; + break; + + case META_MINMAX: + case META_MINMAX_QUERY: + case META_MINMAX_PLUS: + pptr += 2; + break; + + case META_CALLOUT_NUMBER: + case META_COND_VERSION: + pptr += 3; + break; + + case META_MARK: + case META_PRUNE_ARG: + case META_SKIP_ARG: + case META_THEN_ARG: + pptr += 1 + pptr[1]; + break; + + case META_LOOKBEHIND: + case META_LOOKBEHINDNOT: + if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, NULL, cb)) + return errorcode; + break; + } + } + +return 0; +} + + + /************************************************* * External function to compile a pattern * *************************************************/ @@ -8051,7 +8745,7 @@ a pointer to a block of store holding a compiled version of the expression. Arguments: pattern the regular expression - patlen the length of the pattern, or < 0 for zero-terminated + patlen the length of the pattern, or PCRE2_ZERO_TERMINATED options option bits errorptr pointer to errorcode erroroffset pointer to error offset @@ -8065,41 +8759,49 @@ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { -BOOL utf; /* Set TRUE for UTF mode */ -pcre2_real_code *re = NULL; /* What we will return */ -compile_block cb; /* "Static" compile-time data */ -const uint8_t *tables; /* Char tables base pointer */ +BOOL utf; /* Set TRUE for UTF mode */ +BOOL has_lookbehind; /* Set TRUE if a lookbehind is found */ +BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */ +pcre2_real_code *re = NULL; /* What we will return */ +compile_block cb; /* "Static" compile-time data */ +const uint8_t *tables; /* Char tables base pointer */ -PCRE2_UCHAR *code; /* Current pointer in compiled code */ -PCRE2_SPTR codestart; /* Start of compiled code */ -PCRE2_SPTR ptr; /* Current pointer in pattern */ +PCRE2_UCHAR *code; /* Current pointer in compiled code */ +PCRE2_SPTR 
codestart; /* Start of compiled code */ +PCRE2_SPTR ptr; /* Current pointer in pattern */ +uint32_t *pptr; /* Current pointer in parsed pattern */ -size_t length = 1; /* Allow or final END opcode */ -size_t usedlength; /* Actual length used */ -size_t re_blocksize; /* Size of memory block */ +PCRE2_SIZE length = 1; /* Allow for final END opcode */ +PCRE2_SIZE usedlength; /* Actual length used */ +PCRE2_SIZE re_blocksize; /* Size of memory block */ +PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */ +PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */ -int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ -uint32_t firstcu, reqcu; /* Value of first/req code unit */ -uint32_t setflags = 0; /* NL and BSR set flags */ +int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */ +uint32_t firstcu, reqcu; /* Value of first/req code unit */ +uint32_t setflags = 0; /* NL and BSR set flags */ -uint32_t skipatstart; /* When checking (*UTF) etc */ -uint32_t limit_match = UINT32_MAX; /* Unset match limits */ +uint32_t skipatstart; /* When checking (*UTF) etc */ +uint32_t limit_match = UINT32_MAX; /* Unset match limits */ uint32_t limit_recursion = UINT32_MAX; -int newline = 0; /* Unset; can be set by the pattern */ -int bsr = 0; /* Unset; can be set by the pattern */ -int errorcode = 0; /* Initialize to avoid compiler warn */ +int newline = 0; /* Unset; can be set by the pattern */ +int bsr = 0; /* Unset; can be set by the pattern */ +int errorcode = 0; /* Initialize to avoid compiler warn */ +int regexrc; /* Return from compile */ + +uint32_t i; /* Local loop counter */ /* Comments at the head of this file explain about these variables. 
*/ -PCRE2_UCHAR *copied_pattern = NULL; -PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE]; +uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; +uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE]; named_group named_groups[NAMED_GROUP_LIST_SIZE]; /* The workspace is used in different ways in the different compiling phases. -Ensure that it is 16-bit aligned for the preliminary group scan. */ +It needs to be 16-bit aligned for the preliminary parsing scan. */ -uint16_t c16workspace[(COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t)]; +uint32_t c16workspace[C16_WORK_SIZE]; PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace; @@ -8133,29 +8835,21 @@ if (ccontext == NULL) ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context)); /* A zero-terminated pattern is indicated by the special length value -PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero, -to ensure that it is always possible to look one code unit beyond the end of -the pattern's characters. */ +PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */ -if (patlen == PCRE2_ZERO_TERMINATED) patlen = PRIV(strlen)(pattern); else +if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED))) + patlen = PRIV(strlen)(pattern); + +if (patlen > ccontext->max_pattern_length) { - if (patlen < COPIED_PATTERN_SIZE) - copied_pattern = stack_copied_pattern; - else - { - copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1), - ccontext->memctl.memory_data); - if (copied_pattern == NULL) - { - *errorptr = ERR21; - return NULL; - } - } - memcpy(copied_pattern, pattern, CU2BYTES(patlen)); - copied_pattern[patlen] = 0; - pattern = copied_pattern; + *errorptr = ERR88; + return NULL; } +/* From here on, all returns from this function should end up going via the +EXIT label. */ + + /* ------------ Initialize the "static" compile data -------------- */ tables = (ccontext->tables != NULL)? 
ccontext->tables : PRIV(default_tables); @@ -8166,14 +8860,16 @@ cb.cbits = tables + cbits_offset; /* tables */ cb.ctypes = tables + ctypes_offset; cb.assert_depth = 0; -cb.bracount = cb.final_bracount = 0; +cb.bracount = 0; cb.cx = ccontext; cb.dupnames = FALSE; cb.end_pattern = pattern + patlen; +cb.erroroffset = 0; cb.external_flags = 0; cb.external_options = options; -cb.hwm = cworkspace; -cb.iscondassert = FALSE; +cb.groupinfo = stack_groupinfo; +cb.had_recurse = FALSE; +cb.lastcapture = 0; cb.max_lookbehind = 0; cb.name_entry_size = 0; cb.name_table = NULL; @@ -8182,6 +8878,7 @@ cb.named_group_list_size = NAMED_GROUP_LIST_SIZE; cb.names_found = 0; cb.open_caps = NULL; cb.parens_depth = 0; +cb.parsed_pattern = stack_parsed_pattern; cb.req_varyopt = 0; cb.start_code = cworkspace; cb.start_pattern = pattern; @@ -8195,23 +8892,43 @@ references to help in deciding whether (.*) can be treated as anchored or not. cb.top_backref = 0; cb.backref_map = 0; +/* Escape sequences \1 to \9 are always back references, but as they are only +two characters long, only two elements can be used in the parsed_pattern +vector. The first contains the reference, and we'd like to use the second to +record the offset in the pattern, so that forward references to non-existent +groups can be diagnosed later with an offset. However, on 64-bit systems, +PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first +occurrence of \1 to \9, indexed by the second parsed_pattern value. All other +references have enough space for the offset to be put into the parsed pattern. +*/ + +for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET; + + /* --------------- Start looking at the pattern --------------- */ /* Check for global one-time option settings at the start of the pattern, and -remember the offset to the actual regex. */ +remember the offset to the actual regex. With valgrind support, make the +terminator of a zero-terminated pattern inaccessible. 
This catches bugs that +would otherwise only show up for non-zero-terminated patterns. */ + +#ifdef SUPPORT_VALGRIND +if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1)); +#endif ptr = pattern; skipatstart = 0; -while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && +while (patlen - skipatstart >= 2 && + ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && ptr[skipatstart+1] == CHAR_ASTERISK) { - unsigned int i; for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) { pso *p = pso_list + i; - if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0) + if (patlen - skipatstart - 2 >= p->length && + PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0) { uint32_t c, pp; @@ -8240,15 +8957,22 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && case PSO_LIMR: c = 0; pp = skipatstart; + if (!IS_DIGIT(ptr[pp])) + { + errorcode = ERR60; + ptr += pp; + goto HAD_EARLY_ERROR; + } while (IS_DIGIT(ptr[pp])) { if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ - c = c*10 + ptr[pp++] - CHAR_0; + c = c*10 + (ptr[pp++] - CHAR_0); } if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) { errorcode = ERR60; - goto HAD_ERROR; + ptr += pp; + goto HAD_EARLY_ERROR; } if (p->type == PSO_LIMM) limit_match = c; else limit_recursion = c; @@ -8271,7 +8995,7 @@ ptr += skipatstart; if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) { errorcode = ERR32; - goto HAD_ERROR; + goto HAD_EARLY_ERROR; } #endif @@ -8284,11 +9008,11 @@ if (utf) if ((options & PCRE2_NEVER_UTF) != 0) { errorcode = ERR74; - goto HAD_ERROR; + goto HAD_EARLY_ERROR; } if ((options & PCRE2_NO_UTF_CHECK) == 0 && (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) - goto HAD_ERROR; + goto HAD_ERROR; /* Offset was set by valid_utf() */ } /* Check UCP lockout. */ @@ -8297,7 +9021,7 @@ if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == (PCRE2_UCP|PCRE2_NEVER_UCP)) { errorcode = ERR75; - goto HAD_ERROR; + goto HAD_EARLY_ERROR; } /* Process the BSR setting. 
*/ @@ -8336,23 +9060,102 @@ switch(newline) default: errorcode = ERR56; - goto HAD_ERROR; + goto HAD_EARLY_ERROR; } -/* Before we do anything else, do a pre-scan of the pattern in order to -discover the named groups and their numerical equivalents, so that this -information is always available for the remaining processing. */ +/* Pre-scan the pattern to do two things: (1) Discover the named groups and +their numerical equivalents, so that this information is always available for +the remaining processing. (2) At the same time, parse the pattern and put a +processed version into the parsed_pattern vector. This has escapes interpreted +and comments removed (amongst other things). -errorcode = scan_for_captures(&ptr, cb.external_options, &cb); -if (errorcode != 0) goto HAD_ERROR; +In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned +32-bit ints in the parsed pattern is bounded by the length of the pattern plus +one (for the terminator). The exceptional case is when running in 32-bit, +non-UTF mode, when literal characters greater than META_END (0x80000000) have +to be coded as two units. In this case, therefore, we scan the pattern to check +for such values. */ -/* For obscure debugging this code can be enabled. */ - -#if 0 +#if PCRE2_CODE_UNIT_WIDTH == 32 +if (!utf) + { + PCRE2_SPTR p; + for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++; + } +#endif + +/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT +is set we have to assume a numerical callout (4 elements) for each character +plus one at the end. This is overkill, but memory is plentiful these days. For +many smaller patterns the vector on the stack (which was set up above) can be +used. 
*/ + +parsed_size_needed = patlen - skipatstart + big32count; +if ((options & PCRE2_AUTO_CALLOUT) != 0) + parsed_size_needed = (parsed_size_needed + 1) * 5; + +if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE) + { + uint32_t *heap_parsed_pattern = ccontext->memctl.malloc( + (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data); + if (heap_parsed_pattern == NULL) + { + *errorptr = ERR21; + goto EXIT; + } + cb.parsed_pattern = heap_parsed_pattern; + } +cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1; + +/* Do the parsing scan. */ + +errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb); +if (errorcode != 0) goto HAD_CB_ERROR; + +/* Workspace is needed to remember information about numbered groups: whether a +group can match an empty string and what its fixed length is. This is done to +avoid the possibility of recursive references causing very long compile times +when checking these features. Unnumbered groups do not have this exposure since +they cannot be referenced. We use an indexed vector for this purpose. If there +are sufficiently few groups, the default vector on the stack, as set up above, +can be used. Otherwise we have to get/free a special vector. The vector must be +initialized to zero. */ + +if (cb.bracount >= GROUPINFO_DEFAULT_SIZE) + { + cb.groupinfo = ccontext->memctl.malloc( + (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data); + if (cb.groupinfo == NULL) + { + errorcode = ERR21; + cb.erroroffset = 0; + goto HAD_CB_ERROR; + } + } +memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t)); + +/* If there were any lookbehinds, scan the parsed pattern to figure out their +lengths. */ + +if (has_lookbehind) + { + errorcode = check_lookbehinds(&cb); + if (errorcode != 0) goto HAD_CB_ERROR; + } + +/* For debugging, there is a function that shows the parsed data vector. 
*/ + +#ifdef DEBUG_SHOW_PARSED +fprintf(stderr, "+++ Pre-scan complete:\n"); +show_parsed(&cb); +#endif + +/* For debugging capturing information this code can be enabled. */ + +#ifdef DEBUG_SHOW_CAPTURES { - int i; named_group *ng = cb.named_groups; - fprintf(stderr, "+++Captures: %d\n", cb.final_bracount); + fprintf(stderr, "+++Captures: %d\n", cb.bracount); for (i = 0; i < cb.names_found; i++, ng++) { fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); @@ -8360,12 +9163,6 @@ if (errorcode != 0) goto HAD_ERROR; } #endif -/* Reset current bracket count to zero and current pointer to the start of the -pattern. */ - -cb.bracount = 0; -ptr = pattern + skipatstart; - /* Pretend to compile the pattern while actually just accumulating the amount of memory required in the 'length' variable. This behaviour is triggered by passing a non-NULL final argument to compile_regex(). We pass a block of @@ -8374,24 +9171,26 @@ compiled code is discarded when it is no longer needed, so hopefully this workspace will never overflow, though there is a test for its doing so. On error, errorcode will be set non-zero, so we don't need to look at the -result of the function. The initial options have been put into the cb block so -that they can be changed if an option setting is found within the regex right -at the beginning. Bringing initial option settings outside can help speed up -starting point checks. We still have to pass a separate options variable (the -first argument) because that may change as the pattern is processed. */ +result of the function. The initial options have been put into the cb block, +but we still have to pass a separate options variable (the first argument) +because the options may change as the pattern is processed. 
*/ +cb.erroroffset = patlen; /* For any subsequent errors that do not set it */ +pptr = cb.parsed_pattern; code = cworkspace; *code = OP_BRA; -(void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE, - FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, - &cb, &length); +(void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu, + &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length); + +if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */ + +/* This should be caught in compile_regex(), but just in case... */ -if (errorcode != 0) goto HAD_ERROR; if (length > MAX_PATTERN_SIZE) { errorcode = ERR20; - goto HAD_ERROR; + goto HAD_CB_ERROR; } /* Compute the size of, and then get and initialize, the data block for storing @@ -8406,7 +9205,7 @@ re = (pcre2_real_code *) if (re == NULL) { errorcode = ERR21; - goto HAD_ERROR; + goto HAD_CB_ERROR; } re->memctl = ccontext->memctl; @@ -8424,7 +9223,7 @@ re->first_codeunit = 0; re->last_codeunit = 0; re->bsr_convention = bsr; re->newline_convention = newline; -re->max_lookbehind = +re->max_lookbehind = 0; re->minlength = 0; re->top_bracket = 0; re->top_backref = 0; @@ -8440,22 +9239,16 @@ codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + /* Update the compile data block for the actual compile. The starting points of the name/number translation table and of the code are passed around in the compile data block. The start/end pattern and initial options are already set -from the pre-compile phase, as is the name_entry_size field. Reset the bracket -count and the names_found field. Also reset the hwm field; this time it's used -for remembering forward references to subpatterns. */ +from the pre-compile phase, as is the name_entry_size field. 
*/ cb.parens_depth = 0; cb.assert_depth = 0; -cb.bracount = 0; -cb.max_lookbehind = 0; +cb.lastcapture = 0; cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); cb.start_code = codestart; -cb.hwm = (PCRE2_UCHAR *)(cb.start_workspace); -cb.iscondassert = FALSE; cb.req_varyopt = 0; cb.had_accept = FALSE; cb.had_pruneorskip = FALSE; -cb.check_lookbehind = FALSE; cb.open_caps = NULL; /* If any named groups were found, create the name/number table from the list @@ -8463,23 +9256,21 @@ created in the pre-pass. */ if (cb.names_found > 0) { - int i = cb.names_found; named_group *ng = cb.named_groups; - cb.names_found = 0; - for (; i > 0; i--, ng++) - add_name_to_table(&cb, ng->name, ng->length, ng->number); + for (i = 0; i < cb.names_found; i++, ng++) + add_name_to_table(&cb, ng->name, ng->length, ng->number, i); } /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. */ -ptr = pattern + skipatstart; +pptr = cb.parsed_pattern; code = (PCRE2_UCHAR *)codestart; *code = OP_BRA; -(void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE, - 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); - +regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0, + &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL); +if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY; re->top_bracket = cb.bracount; re->top_backref = cb.top_backref; re->max_lookbehind = cb.max_lookbehind; @@ -8490,14 +9281,11 @@ if (cb.had_accept) reqcuflags = REQ_NONE; } -/* If we have not reached end of pattern after a successful compile, there's an -excess bracket. Fill in the final opcode and check for disastrous overflow. 
-If no overflow, but the estimated length exceeds the really used length, adjust -the value of re->blocksize, and if valgrind support is configured, mark the -extra allocated memory as unaddressable, so that any out-of-bound reads can be -detected. */ +/* Fill in the final opcode and check for disastrous overflow. If no overflow, +but the estimated length exceeds the really used length, adjust the value of +re->blocksize, and if valgrind support is configured, mark the extra allocated +memory as unaddressable, so that any out-of-bound reads can be detected. */ -if (errorcode == 0 && ptr < cb.end_pattern) errorcode = ERR22; *code++ = OP_END; usedlength = code - codestart; if (usedlength > length) errorcode = ERR23; else @@ -8508,119 +9296,89 @@ if (usedlength > length) errorcode = ERR23; else #endif } +/* Scan the pattern for recursion/subroutine calls and convert the group +numbers into offsets. Maintain a small cache so that repeated groups containing +recursions are efficiently handled. */ + +#define RSCAN_CACHE_SIZE 8 + +if (errorcode == 0 && cb.had_recurse) + { + PCRE2_UCHAR *rcode; + PCRE2_SPTR rgroup; + unsigned int ccount = 0; + int start = RSCAN_CACHE_SIZE; + recurse_cache rc[RSCAN_CACHE_SIZE]; + + for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); + rcode != NULL; + rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) + { + int p, groupnumber; + + groupnumber = (int)GET(rcode, 1); + if (groupnumber == 0) rgroup = codestart; else + { + PCRE2_SPTR search_from = codestart; + rgroup = NULL; + for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) + { + if (groupnumber == rc[p].groupnumber) + { + rgroup = rc[p].group; + break; + } + + /* Group n+1 must always start to the right of group n, so we can save + search time below when the new group number is greater than any of the + previously found groups. 
*/ + + if (groupnumber > rc[p].groupnumber) search_from = rc[p].group; + } + + if (rgroup == NULL) + { + rgroup = PRIV(find_bracket)(search_from, utf, groupnumber); + if (rgroup == NULL) + { + errorcode = ERR53; + break; + } + if (--start < 0) start = RSCAN_CACHE_SIZE - 1; + rc[start].groupnumber = groupnumber; + rc[start].group = rgroup; + if (ccount < RSCAN_CACHE_SIZE) ccount++; + } + } + + PUT(rcode, 1, rgroup - codestart); + } + } + /* In rare debugging situations we sometimes need to look at the compiled code at this stage. */ -#ifdef CALL_PRINTINT +#ifdef DEBUG_CALL_PRINTINT pcre2_printint(re, stderr, TRUE); fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); #endif -/* Fill in any forward references that are required. There may be repeated -references; optimize for them, as searching a large regex takes time. The -test of errorcode inside the loop means that nothing is done if it is already -non-zero. */ +/* Unless disabled, check whether any single character iterators can be +auto-possessified. The function overwrites the appropriate opcode values, so +the type of the pointer must be cast. NOTE: the intermediate variable "temp" is +used in this code because at least one compiler gives a warning about loss of +"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the +function call. 
*/ -if (cb.hwm > cb.start_workspace) +if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) { - int prev_recno = -1; - PCRE2_SPTR groupptr = NULL; - while (errorcode == 0 && cb.hwm > cb.start_workspace) - { - int offset, recno; - cb.hwm -= LINK_SIZE; - offset = GET(cb.hwm, 0); - recno = GET(codestart, offset); - if (recno != prev_recno) - { - groupptr = PRIV(find_bracket)(codestart, utf, recno); - prev_recno = recno; - } - if (groupptr == NULL) errorcode = ERR53; - else PUT(((PCRE2_UCHAR *)codestart), offset, (int)(groupptr - codestart)); - } + PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; + if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; } -/* If the workspace had to be expanded, free the new memory. */ +/* Failed to compile, or error while post-processing. */ -if (cb.workspace_size > COMPILE_WORK_SIZE) - ccontext->memctl.free((void *)cb.start_workspace, - ccontext->memctl.memory_data); - -/* After a successful compile, give an error if there's back reference to a -non-existent capturing subpattern. Then, unless disabled, check whether any -single character iterators can be auto-possessified. The function overwrites -the appropriate opcode values, so the type of the pointer must be cast. NOTE: -the intermediate variable "temp" is used in this code because at least one -compiler gives a warning about loss of "const" attribute if the cast -(PCRE2_UCHAR *)codestart is used directly in the function call. */ - -if (errorcode == 0) - { - if (re->top_backref > re->top_bracket) errorcode = ERR15; - else if ((options & PCRE2_NO_AUTO_POSSESS) == 0) - { - PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; - if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; - } - } - -/* If there were any lookbehind assertions that contained OP_RECURSE -(recursions or subroutine calls), a flag is set for them to be checked here, -because they may contain forward references. Actual recursions cannot be fixed -length, but subroutine calls can. 
It is done like this so that those without -OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The -exceptional ones forgo this. We scan the pattern to check that they are fixed -length, and set their lengths. */ - -if (errorcode == 0 && cb.check_lookbehind) - { - PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart; - - /* Loop, searching for OP_REVERSE items, and process those that do not have - their length set. (Actually, it will also re-process any that have a length - of zero, but that is a pathological case, and it does no harm.) When we find - one, we temporarily terminate the branch it is in while we scan it. Note that - calling find_bracket() with a negative group number returns a pointer to the - OP_REVERSE item, not the actual lookbehind. */ - - for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1); - cc != NULL; - cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1)) - { - if (GET(cc, 1) == 0) - { - int fixed_length; - PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); - int end_op = *be; - *be = OP_END; - fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL); - *be = end_op; - if (fixed_length < 0) - { - errorcode = (fixed_length == -2)? ERR36 : - (fixed_length == -4)? ERR70 : ERR25; - break; - } - if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length; - PUT(cc, 1, fixed_length); - } - cc += 1 + LINK_SIZE; - } - } - -/* Failed to compile, or error while post-processing. Earlier errors get here -via the dreaded goto. */ - -if (errorcode != 0) - { - HAD_ERROR: - pcre2_code_free(re); - re = NULL; - *errorptr = errorcode; - *erroroffset = (int)(ptr - pattern); - goto EXIT; - } +if (errorcode != 0) goto HAD_CB_ERROR; /* Successful compile. If the anchored option was not passed, set it if we can determine that the pattern is anchored by virtue of ^ characters or \A @@ -8629,7 +9387,7 @@ there are no occurrences of *PRUNE or *SKIP (though there is an option to disable this case). 
*/ if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_anchored(codestart, 0, &cb, 0)) + is_anchored(codestart, 0, &cb, 0, FALSE)) re->overall_options |= PCRE2_ANCHORED; /* If the pattern is still not anchored and we do not have a first code unit, @@ -8678,7 +9436,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) when *PRUNE and SKIP are not present. (There is an option that disables this case.) */ - else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; + else if (is_startline(codestart, 0, &cb, 0, FALSE)) + re->flags |= PCRE2_STARTLINE; } /* Handle the "required code unit", if one is set. In the case of an anchored @@ -8707,20 +9466,6 @@ if (reqcuflags >= 0 && } } -/* Check for a pattern than can match an empty string, so that this information -can be provided to applications. */ - -do - { - if (could_be_empty_branch(codestart, code, utf, &cb, NULL)) - { - re->flags |= PCRE2_MATCH_EMPTY; - break; - } - codestart += GET(codestart, 1); - } -while (*codestart == OP_ALT); - /* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern to set up information such as a bitmap of starting code units and a minimum matching length. */ @@ -8729,20 +9474,44 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && PRIV(study)(re) != 0) { errorcode = ERR31; - goto HAD_ERROR; + goto HAD_CB_ERROR; } -/* Control ends up here in all cases. If memory was obtained for a -zero-terminated copy of the pattern, remember to free it before returning. Also -free the list of named groups if a larger one had to be obtained. */ +/* Control ends up here in all cases. When running under valgrind, make a +pattern's terminating zero defined again. If memory was obtained for the parsed +version of the pattern, free it before returning. Also free the list of named +groups if a larger one had to be obtained, and likewise the group information +vector. 
*/ EXIT: -if (copied_pattern != stack_copied_pattern) - ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data); +#ifdef SUPPORT_VALGRIND +if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1)); +#endif +if (cb.parsed_pattern != stack_parsed_pattern) + ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data); if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE) ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data); - +if (cb.groupinfo != stack_groupinfo) + ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data); return re; /* Will be NULL after an error */ + +/* Errors discovered in parse_regex() set the offset value in the compile +block. Errors discovered before it is called must compute it from the ptr +value. After parse_regex() is called, the offset in the compile block is set to +the end of the pattern, but certain errors in compile_regex() may reset it if +an offset is available in the parsed pattern. */ + +HAD_CB_ERROR: +ptr = pattern + cb.erroroffset; + +HAD_EARLY_ERROR: +*erroroffset = ptr - pattern; + +HAD_ERROR: +*errorptr = errorcode; +pcre2_code_free(re); +re = NULL; +goto EXIT; } /* End of pcre2_compile.c */ diff --git a/pcre2/src/pcre2_config.c b/pcre2/src/pcre2_config.c index 22aa3587d..e99272f57 100644 --- a/pcre2/src/pcre2_config.c +++ b/pcre2/src/pcre2_config.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -61,15 +61,16 @@ convenient for user programs that want to test their values. 
*/ * Return info about what features are configured * *************************************************/ -/* +/* If where is NULL, the length of memory required is returned. + Arguments: what what information is required where where to put the information -Returns: 0 if data returned - >= 0 if where is NULL, giving length required +Returns: 0 if a numerical value is returned + >= 0 if a string value PCRE2_ERROR_BADOPTION if "where" not recognized - or JIT target requested when JIT not enabled + or JIT target requested when JIT not enabled */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION @@ -127,15 +128,15 @@ switch (what) #ifdef SUPPORT_JIT { const char *v = PRIV(jit_get_target)(); - return 1 + ((where == NULL)? - strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)); + return (int)(1 + ((where == NULL)? + strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); } #else return PCRE2_ERROR_BADOPTION; #endif case PCRE2_CONFIG_LINKSIZE: - *((uint32_t *)where) = configured_link_size; + *((uint32_t *)where) = (uint32_t)configured_link_size; break; case PCRE2_CONFIG_MATCHLIMIT: @@ -169,8 +170,8 @@ switch (what) #else const char *v = "Unicode not supported"; #endif - return 1 + ((where == NULL)? - strlen(v): PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)); + return (int)(1 + ((where == NULL)? + strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); } break; @@ -206,8 +207,8 @@ switch (what) const char *v = (XSTRING(Z PCRE2_PRERELEASE)[1] == 0)? XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) : XSTRING(PCRE2_MAJOR.PCRE2_MINOR) XSTRING(PCRE2_PRERELEASE PCRE2_DATE); - return 1 + ((where == NULL)? - strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v)); + return (int)(1 + ((where == NULL)? 
+ strlen(v) : PRIV(strcpy_c8)((PCRE2_UCHAR *)where, v))); } } diff --git a/pcre2/src/pcre2_context.c b/pcre2/src/pcre2_context.c index 6146999df..ae050fe92 100644 --- a/pcre2/src/pcre2_context.c +++ b/pcre2/src/pcre2_context.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -131,13 +131,14 @@ return gcontext; when no context is supplied to the compile function. */ const pcre2_compile_context PRIV(default_compile_context) = { - { default_malloc, default_free, NULL }, - NULL, - NULL, - PRIV(default_tables), - BSR_DEFAULT, - NEWLINE_DEFAULT, - PARENS_NEST_LIMIT }; + { default_malloc, default_free, NULL }, /* Default memory handling */ + NULL, /* Stack guard */ + NULL, /* Stack guard data */ + PRIV(default_tables), /* Character tables */ + PCRE2_UNSET, /* Max pattern length */ + BSR_DEFAULT, /* Backslash R default */ + NEWLINE_DEFAULT, /* Newline convention */ + PARENS_NEST_LIMIT }; /* As it says */ /* The create function copies the default into the new memory, but must override the default memory handling functions if a gcontext was provided. 
*/ @@ -169,6 +170,7 @@ const pcre2_match_context PRIV(default_match_context) = { #endif NULL, NULL, + PCRE2_UNSET, /* Offset limit */ MATCH_LIMIT, MATCH_LIMIT_RECURSION }; @@ -294,6 +296,13 @@ switch(value) } } +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, PCRE2_SIZE length) +{ +ccontext->max_pattern_length = length; +return 0; +} + PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline) { @@ -347,6 +356,13 @@ mcontext->match_limit = limit; return 0; } +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_offset_limit(pcre2_match_context *mcontext, PCRE2_SIZE limit) +{ +mcontext->offset_limit = limit; +return 0; +} + PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit) { diff --git a/pcre2/src/pcre2_dfa_match.c b/pcre2/src/pcre2_dfa_match.c index b14477def..c909d6128 100644 --- a/pcre2/src/pcre2_dfa_match.c +++ b/pcre2/src/pcre2_dfa_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -371,7 +371,7 @@ internal_dfa_match( uint32_t offsetcount, int *workspace, int wscount, - int rlevel) + uint32_t rlevel) { stateblock *active_states, *new_states, *temp_states; stateblock *next_active_state, *next_new_state; @@ -400,8 +400,8 @@ BOOL utf = FALSE; BOOL reset_could_continue = FALSE; -rlevel++; -offsetcount &= (-2); +if (rlevel++ > mb->match_limit_recursion) return PCRE2_ERROR_RECURSIONLIMIT; +offsetcount &= (uint32_t)(-2); /* Round down */ wscount -= 2; wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / @@ -433,13 +433,13 @@ move back, and set up each alternative appropriately. */ if (*first_op == OP_REVERSE) { - int max_back = 0; - int gone_back; + size_t max_back = 0; + size_t gone_back; end_code = this_start_code; do { - int back = GET(end_code, 2+LINK_SIZE); + size_t back = (size_t)GET(end_code, 2+LINK_SIZE); if (back > max_back) max_back = back; end_code += GET(end_code, 1); } @@ -466,8 +466,8 @@ if (*first_op == OP_REVERSE) /* In byte-mode we can do this quickly. */ { - gone_back = (current_subject - max_back < start_subject)? - (int)(current_subject - start_subject) : max_back; + size_t current_offset = (size_t)(current_subject - start_subject); + gone_back = (current_offset < max_back)? 
current_offset : max_back; current_subject -= gone_back; } @@ -481,11 +481,11 @@ if (*first_op == OP_REVERSE) end_code = this_start_code; do { - int back = GET(end_code, 2+LINK_SIZE); + size_t back = (size_t)GET(end_code, 2+LINK_SIZE); if (back <= gone_back) { int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); - ADD_NEW_DATA(-bstate, 0, gone_back - back); + ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back)); } end_code += GET(end_code, 1); } @@ -509,7 +509,7 @@ else do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); new_count = workspace[1]; if (!workspace[0]) - memcpy(new_states, active_states, new_count * sizeof(stateblock)); + memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock)); } /* Not restarting */ @@ -593,8 +593,9 @@ for (;;) stateblock *current_state = active_states + i; BOOL caseless = FALSE; PCRE2_SPTR code; + uint32_t codevalue; int state_offset = current_state->offset; - int codevalue, rrc; + int rrc; int count; /* A negative offset is a special case meaning "hold off going to this @@ -719,7 +720,7 @@ for (;;) ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); if (codevalue != OP_KET) { - ADD_ACTIVE(state_offset - GET(code, 1), 0); + ADD_ACTIVE(state_offset - (int)GET(code, 1), 0); } } else @@ -733,11 +734,12 @@ for (;;) else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) match_count = 0; count = ((match_count == 0)? 
(int)offsetcount : match_count * 2) - 2; - if (count > 0) memmove(offsets + 2, offsets, count * sizeof(PCRE2_SIZE)); + if (count > 0) memmove(offsets + 2, offsets, + (size_t)count * sizeof(PCRE2_SIZE)); if (offsetcount >= 2) { - offsets[0] = (int)(current_subject - start_subject); - offsets[1] = (int)(ptr - start_subject); + offsets[0] = (PCRE2_SIZE)(current_subject - start_subject); + offsets[1] = (PCRE2_SIZE)(ptr - start_subject); } if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count; } @@ -959,7 +961,7 @@ for (;;) { if (d == '_') left_word = TRUE; else { - int cat = UCD_CATEGORY(d); + uint32_t cat = UCD_CATEGORY(d); left_word = (cat == ucp_L || cat == ucp_N); } } @@ -984,7 +986,7 @@ for (;;) { if (c == '_') right_word = TRUE; else { - int cat = UCD_CATEGORY(c); + uint32_t cat = UCD_CATEGORY(c); right_word = (cat == ucp_L || cat == ucp_N); } } @@ -1369,7 +1371,7 @@ for (;;) if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { - int lgb, rgb; + uint32_t lgb, rgb; PCRE2_SPTR nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) @@ -1383,7 +1385,7 @@ for (;;) dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; ncount++; lgb = rgb; nptr += dlen; @@ -1630,7 +1632,7 @@ for (;;) ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { - int lgb, rgb; + uint32_t lgb, rgb; PCRE2_SPTR nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || @@ -1645,7 +1647,7 @@ for (;;) dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; ncount++; lgb = rgb; nptr += dlen; @@ -1902,7 +1904,7 @@ for (;;) count = current_state->count; /* Number already matched */ if (clen > 0) { - int lgb, 
rgb; + uint32_t lgb, rgb; PCRE2_SPTR nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) @@ -1916,7 +1918,7 @@ for (;;) dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; ncount++; lgb = rgb; nptr += dlen; @@ -2097,7 +2099,7 @@ for (;;) case OP_EXTUNI: if (clen > 0) { - int lgb, rgb; + uint32_t lgb, rgb; PCRE2_SPTR nptr = ptr + clen; int ncount = 0; lgb = UCD_GRAPHBREAK(c); @@ -2106,7 +2108,7 @@ for (;;) dlen = 1; if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); } rgb = UCD_GRAPHBREAK(d); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; ncount++; lgb = rgb; nptr += dlen; @@ -2582,14 +2584,14 @@ for (;;) mb, /* static match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - (int)(ptr - start_subject), /* start offset */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ - if (rc == PCRE2_ERROR_DFA_UITEM) return rc; + if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } } @@ -2601,8 +2603,8 @@ for (;;) { PCRE2_SIZE local_offsets[1000]; int local_workspace[1000]; - int codelink = GET(code, 1); - int condcode; + int codelink = (int)GET(code, 1); + PCRE2_UCHAR condcode; /* Because of the way auto-callout works during compile, a callout item is inserted between OP_COND and an assertion condition. 
This does not @@ -2611,8 +2613,10 @@ for (;;) if (code[LINK_SIZE + 1] == OP_CALLOUT || code[LINK_SIZE + 1] == OP_CALLOUT_STR) { - unsigned int callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT) - ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 2 + 3*LINK_SIZE); + PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)? + (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] : + (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE); + rrc = 0; if (mb->callout != NULL) { @@ -2678,7 +2682,7 @@ for (;;) else if (condcode == OP_RREF) { - int value = GET2(code, LINK_SIZE + 2); + unsigned int value = GET2(code, LINK_SIZE + 2); if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND; if (mb->recursive != NULL) { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } @@ -2699,14 +2703,14 @@ for (;;) mb, /* fixed match data */ asscode, /* this subexpression's code */ ptr, /* where we currently are */ - (int)(ptr - start_subject), /* start offset */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ - if (rc == PCRE2_ERROR_DFA_UITEM) return rc; + if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc; if ((rc >= 0) == (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } @@ -2747,7 +2751,7 @@ for (;;) mb, /* fixed match data */ callpat, /* this subexpression's code */ ptr, /* where we currently are */ - (int)(ptr - start_subject), /* start offset */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ @@ -2768,18 +2772,19 @@ for (;;) { for (rc = rc*2 - 2; rc >= 0; rc -= 2) { - int charcount = local_offsets[rc+1] - local_offsets[rc]; + PCRE2_SIZE charcount = 
local_offsets[rc+1] - local_offsets[rc]; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (utf) { PCRE2_SPTR p = start_subject + local_offsets[rc]; PCRE2_SPTR pp = start_subject + local_offsets[rc+1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; } #endif if (charcount > 0) { - ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); + ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, + (int)(charcount - 1)); } else { @@ -2798,7 +2803,7 @@ for (;;) case OP_SCBRAPOS: case OP_BRAPOSZERO: { - int charcount, matched_count; + PCRE2_SIZE charcount, matched_count; PCRE2_SPTR local_ptr = ptr; BOOL allow_zero; @@ -2821,7 +2826,7 @@ for (;;) mb, /* fixed match data */ code, /* this subexpression's code */ local_ptr, /* where we currently are */ - (int)(ptr - start_subject), /* start offset */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ @@ -2872,11 +2877,11 @@ for (;;) { PCRE2_SPTR p = ptr; PCRE2_SPTR pp = local_ptr; - charcount = (int)(pp - p); + charcount = (PCRE2_SIZE)(pp - p); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; #endif - ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); + ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); } } } @@ -2893,7 +2898,7 @@ for (;;) mb, /* fixed match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ - (int)(ptr - start_subject), /* start offset */ + (PCRE2_SIZE)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ @@ -2903,7 +2908,7 @@ for (;;) if (rc >= 0) { PCRE2_SPTR end_subpattern = code; - int charcount 
= local_offsets[1] - local_offsets[0]; + PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0]; int next_state_offset, repeat_state_offset; do { end_subpattern += GET(end_subpattern, 1); } @@ -2960,12 +2965,12 @@ for (;;) { PCRE2_SPTR p = start_subject + local_offsets[0]; PCRE2_SPTR pp = start_subject + local_offsets[1]; - while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; + while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--; } #endif - ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); + ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1)); if (repeat_state_offset >= 0) - { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } + { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); } } } else if (rc != PCRE2_ERROR_NOMATCH) return rc; @@ -3018,7 +3023,7 @@ for (;;) return rrc; /* Abandon */ } if (rrc == 0) - { ADD_ACTIVE(state_offset + callout_length, 0); } + { ADD_ACTIVE(state_offset + (int)callout_length, 0); } } break; @@ -3110,12 +3115,13 @@ Returns: > 0 => number of match offset pairs placed in offsets PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, - pcre2_match_context *mcontext, int *workspace, size_t wscount) + pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount) { const pcre2_real_code *re = (const pcre2_real_code *)code; PCRE2_SPTR start_match; PCRE2_SPTR end_subject; +PCRE2_SPTR bumpalong_limit; PCRE2_SPTR req_cu_ptr; BOOL utf, anchored, startline, firstline; @@ -3172,15 +3178,10 @@ occur. 
*/ #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); #undef FF #undef OO -/* A NULL match context means "use a default context" */ - -if (mcontext == NULL) - mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); - /* If restarting after a partial match, do some sanity checks on the contents of the workspace. */ @@ -3205,20 +3206,33 @@ where to start. */ startline = (re->flags & PCRE2_STARTLINE) != 0; firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; +bumpalong_limit = end_subject; -/* Fill in the fields in the match block. */ +/* Get data from the match context, if present, and fill in the fields in the +match block. It is an error to set an offset limit without setting the flag at +compile time. */ if (mcontext == NULL) { mb->callout = NULL; mb->memctl = re->memctl; + mb->match_limit_recursion = PRIV(default_match_context).recursion_limit; } else { + if (mcontext->offset_limit != PCRE2_UNSET) + { + if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) + return PCRE2_ERROR_BADOFFSETLIMIT; + bumpalong_limit = subject + mcontext->offset_limit; + } mb->callout = mcontext->callout; mb->callout_data = mcontext->callout_data; mb->memctl = mcontext->memctl; + mb->match_limit_recursion = mcontext->recursion_limit; } +if (mb->match_limit_recursion > re->limit_recursion) + mb->match_limit_recursion = re->limit_recursion; mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + re->name_count * re->name_entry_size; @@ -3264,18 +3278,50 @@ switch(re->newline_convention) /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, we must also check that a starting offset does not point into the middle of a -multiunit character. */ +multiunit character. 
We check only the portion of the subject that is going to +be inspected during matching - from the offset minus the maximum lookbehind +to the given length. This saves time when a small part of a large subject is +being matched by the use of a starting offset. Note that the maximum lookbehind +is a number of characters, not code units. */ #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar)); - if (match_data->rc != 0) return match_data->rc; + PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + + if (start_offset > 0) + { #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && start_offset < length && - NOT_FIRSTCHAR(subject[start_offset])) - return PCRE2_ERROR_BADUTFOFFSET; + unsigned int i; + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + return PCRE2_ERROR_BADUTFOFFSET; + for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) + { + check_subject--; + while (check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*check_subject & 0xfc00) == 0xdc00) +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + check_subject--; + } +#else /* In the 32-bit library, one code unit equals one character. */ + check_subject -= re->max_lookbehind; + if (check_subject < subject) check_subject = subject; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + } + + /* Validate the relevant portion of the subject. After an error, adjust the + offset to be an absolute offset in the whole string. 
*/ + + match_data->rc = PRIV(valid_utf)(check_subject, + length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += (PCRE2_SIZE)(check_subject - subject); + return match_data->rc; + } } #endif /* SUPPORT_UNICODE */ @@ -3295,7 +3341,8 @@ if (!anchored) { first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); + if (utf && first_cu > 127) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); #endif } } @@ -3315,7 +3362,7 @@ if ((re->flags & PCRE2_LASTSET) != 0) { req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); + if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); #endif } } @@ -3427,7 +3474,7 @@ for (;;) { while (start_match < end_subject) { - register uint32_t c = UCHAR21TEST(start_match); + uint32_t c = UCHAR21TEST(start_match); #if PCRE2_CODE_UNIT_WIDTH != 8 if (c > 255) c = 255; #endif @@ -3467,7 +3514,7 @@ for (;;) if (has_req_cu && end_subject - start_match < REQ_CU_MAX) { - register PCRE2_SPTR p = start_match + (has_first_cu? 1:0); + PCRE2_SPTR p = start_match + (has_first_cu? 1:0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ @@ -3478,7 +3525,7 @@ for (;;) { while (p < end_subject) { - register uint32_t pp = UCHAR21INCTEST(p); + uint32_t pp = UCHAR21INCTEST(p); if (pp == req_cu || pp == req_cu2) { p--; break; } } } @@ -3507,6 +3554,10 @@ for (;;) /* ------------ End of start of match optimizations ------------ */ + /* Give no match if we have passed the bumpalong limit. 
*/ + + if (start_match > bumpalong_limit) break; + /* OK, now we can do the business */ mb->start_used_ptr = start_match; @@ -3519,9 +3570,9 @@ for (;;) start_match, /* where we currently are */ start_offset, /* start offset in subject */ match_data->ovector, /* offset vector */ - match_data->oveccount * 2, /* actual size of same */ + (uint32_t)match_data->oveccount * 2, /* actual size of same */ workspace, /* workspace vector */ - wscount, /* size of same */ + (int)wscount, /* size of same */ 0); /* function recurse level */ /* Anything other than "no match" means we are done, always; otherwise, carry @@ -3535,7 +3586,7 @@ for (;;) match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); } match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); - match_data->rightchar = mb->last_used_ptr - subject; + match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject); match_data->startchar = (PCRE2_SIZE)(start_match - subject); match_data->rc = rc; return rc; diff --git a/pcre2/src/pcre2_error.c b/pcre2/src/pcre2_error.c index c539bd23e..437bdfd20 100644 --- a/pcre2/src/pcre2_error.c +++ b/pcre2/src/pcre2_error.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -51,11 +51,10 @@ POSSIBILITY OF SUCH DAMAGE. /* The texts of compile-time error messages. Compile-time error numbers start at COMPILE_ERROR_BASE (100). -Do not ever re-use any error number, because they are documented. Always add a -new error instead. 
This used to be a table of strings, but in order to reduce -the number of relocations needed when a shared library is loaded dynamically, -it is now one long string. We cannot use a table of offsets, because the -lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, +This used to be a table of strings, but in order to reduce the number of +relocations needed when a shared library is loaded dynamically, it is now one +long string. We cannot use a table of offsets, because the lengths of inserts +such as XSTRING(MAX_NAME_SIZE) are not known. Instead, pcre2_get_error_message() counts through to the one it wants - this isn't a performance issue because these strings are used only when there is an error. @@ -63,7 +62,7 @@ Each substring ends with \0 to insert a null character. This includes the final substring, so that the whole string ends with \0\0, which can be detected when counting through. */ -static const char compile_error_texts[] = +static const unsigned char compile_error_texts[] = "no error\0" "\\ at end of pattern\0" "\\c at end of pattern\0" @@ -92,13 +91,13 @@ static const char compile_error_texts[] = "failed to allocate heap memory\0" "unmatched closing parenthesis\0" "internal error: code overflow\0" - "unrecognized character after (?<\0" + "missing closing parenthesis for condition\0" /* 25 */ "lookbehind assertion is not fixed length\0" - "malformed number or name after (?(\0" + "a relative value of zero is not allowed\0" "conditional group contains more than two branches\0" "assertion expected after (?( or (?(?C)\0" - "(?R or (?[+-]digits must be followed by )\0" + "digit expected after (?+ or (?-\0" /* 30 */ "unknown POSIX class name\0" "internal error in pcre2_study(): should not occur\0" @@ -106,13 +105,13 @@ static const char compile_error_texts[] = "parentheses are too deeply nested (stack check)\0" "character code point value in \\x{} or \\o{} is too large\0" /* 35 */ - "invalid condition (?(0)\0" - "\\C is not allowed in a 
lookbehind assertion\0" + "lookbehind is too complicated\0" + "\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0" "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0" "number after (?C is greater than 255\0" "closing parenthesis for (?C expected\0" /* 40 */ - "recursion could loop indefinitely\0" + "invalid escape sequence in (*VERB) name\0" "unrecognized character after (?P\0" "syntax error in subpattern name (missing terminator)\0" "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0" @@ -133,13 +132,13 @@ static const char compile_error_texts[] = "missing opening brace after \\o\0" "internal error: unknown newline setting\0" "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" - "a numbered reference must not be zero\0" + "(?R (recursive pattern call) must be followed by a closing parenthesis\0" "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" /* 60 */ "(*VERB) not recognized or malformed\0" - "number is too big\0" + "group number is too big\0" "subpattern name expected\0" - "digit expected after (?+\0" + "internal error: parsed pattern overflow\0" "non-octal character in \\o{} (closing brace missing?)\0" /* 65 */ "different names for subpatterns of the same number are not allowed\0" @@ -152,9 +151,9 @@ static const char compile_error_texts[] = #endif "\\k is not followed by a braced, angle-bracketed, or quoted name\0" /* 70 */ - "internal error: unknown opcode in find_fixedlength()\0" + "internal error: unknown meta code in check_lookbehinds()\0" "\\N is not supported in a class\0" - "too many forward references\0" + "callout string is too long\0" "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" "using UTF is disabled by the application\0" /* 75 */ @@ -162,18 +161,26 @@ static const char compile_error_texts[] = "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" "character code point value in \\u.... 
sequence is too large\0" "digits missing in \\x{} or \\o{}\0" - "syntax error in (?(VERSION condition\0" + "syntax error or number too big in (?(VERSION condition\0" /* 80 */ "internal error: unknown opcode in auto_possessify()\0" "missing terminating delimiter for callout with string argument\0" "unrecognized string delimiter follows (?C\0" "using \\C is disabled by the application\0" "(?| and/or (?J: or (?x: parentheses are too deeply nested\0" + /* 85 */ + "using \\C is disabled in this PCRE2 library\0" + "regular expression is too complicated\0" + "lookbehind assertion is too long\0" + "pattern string is longer than the limit set by the application\0" + "internal error: unknown code in parsed pattern\0" + /* 90 */ + "internal error: bad code value in parsed_skip()\0" ; /* Match-time and UTF error texts are in the same format. */ -static const char match_error_texts[] = +static const unsigned char match_error_texts[] = "no error\0" "no match\0" "partial match\0" @@ -200,7 +207,7 @@ static const char match_error_texts[] = /* 20 */ "UTF-8 error: overlong 5-byte sequence\0" "UTF-8 error: overlong 6-byte sequence\0" - "UTF-8 error: isolated 0x80 byte\0" + "UTF-8 error: isolated byte with 0x80 bit set\0" "UTF-8 error: illegal byte (0xfe or 0xff)\0" "UTF-16 error: missing low surrogate at end\0" /* 25 */ @@ -239,7 +246,16 @@ static const char match_error_texts[] = "nested recursion at the same subject position\0" "recursion limit exceeded\0" "requested value is not available\0" + /* 55 */ "requested value is not set\0" + "offset limit set without PCRE2_USE_OFFSET_LIMIT\0" + "bad escape sequence in replacement string\0" + "expected closing curly bracket in replacement string\0" + "bad substitution in replacement string\0" + /* 60 */ + "match with end before start is not supported\0" + "too many replacements (more than INT_MAX)\0" + "bad serialized data\0" ; @@ -262,34 +278,34 @@ Returns: length of message if all is well */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 
-pcre2_get_error_message(int enumber, PCRE2_UCHAR *buffer, size_t size) +pcre2_get_error_message(int enumber, PCRE2_UCHAR *buffer, PCRE2_SIZE size) { -char xbuff[128]; -const char *message; -size_t i; -uint32_t n; +const unsigned char *message; +PCRE2_SIZE i; +int n; if (size == 0) return PCRE2_ERROR_NOMEMORY; -if (enumber > COMPILE_ERROR_BASE) /* Compile error */ +if (enumber >= COMPILE_ERROR_BASE) /* Compile error */ { message = compile_error_texts; n = enumber - COMPILE_ERROR_BASE; } -else /* Match or UTF error */ +else if (enumber < 0) /* Match or UTF error */ { message = match_error_texts; n = -enumber; } +else /* Invalid error number */ + { + message = (unsigned char *)"\0"; /* Empty message list */ + n = 1; + } for (; n > 0; n--) { while (*message++ != CHAR_NULL) {}; - if (*message == CHAR_NULL) - { - sprintf(xbuff, "No text for error %d", enumber); - break; - } + if (*message == CHAR_NULL) return PCRE2_ERROR_BADDATA; } for (i = 0; *message != 0; i++) @@ -303,7 +319,7 @@ for (i = 0; *message != 0; i++) } buffer[i] = 0; -return i; +return (int)i; } /* End of pcre2_error.c */ diff --git a/pcre2/src/pcre2_find_bracket.c b/pcre2/src/pcre2_find_bracket.c new file mode 100644 index 000000000..357385a11 --- /dev/null +++ b/pcre2/src/pcre2_find_bracket.c @@ -0,0 +1,218 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+----------------------------------------------------------------------------- +*/ + + +/* This module contains a single function that scans through a compiled pattern +until it finds a capturing bracket with the given number, or, if the number is +negative, an instance of OP_REVERSE for a lookbehind. The function is called +from pcre2_compile.c and also from pcre2_study.c when finding the minimum +matching length. */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre2_internal.h" + + +/************************************************* +* Scan compiled regex for specific bracket * +*************************************************/ + +/* +Arguments: + code points to start of expression + utf TRUE in UTF mode + number the required bracket number or negative to find a lookbehind + +Returns: pointer to the opcode for the bracket, or NULL if not found +*/ + +PCRE2_SPTR +PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) +{ +for (;;) + { + PCRE2_UCHAR c = *code; + + if (c == OP_END) return NULL; + + /* XCLASS is used for classes that cannot be represented just by a bit map. + This includes negated single high-valued characters. CALLOUT_STR is used for + callouts with string arguments. In both cases the length in the table is + zero; the actual length is stored in the compiled code. 
*/ + + if (c == OP_XCLASS) code += GET(code, 1); + else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); + + /* Handle lookbehind */ + + else if (c == OP_REVERSE) + { + if (number < 0) return (PCRE2_UCHAR *)code; + code += PRIV(OP_lengths)[c]; + } + + /* Handle capturing bracket */ + + else if (c == OP_CBRA || c == OP_SCBRA || + c == OP_CBRAPOS || c == OP_SCBRAPOS) + { + int n = (int)GET2(code, 1+LINK_SIZE); + if (n == number) return (PCRE2_UCHAR *)code; + code += PRIV(OP_lengths)[c]; + } + + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ + + else + { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) + code += 2; + break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; + break; + } + + /* Add in the fixed length from the table */ + + code += PRIV(OP_lengths)[c]; + + /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a minimum, so + we have to arrange to skip the extra bytes. 
*/ + +#ifdef MAYBE_UTF_MULTI + if (utf) switch(c) + { + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + case OP_UPTO: + case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_MINUPTO: + case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_POSUPTO: + case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + case OP_STAR: + case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_MINSTAR: + case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_POSSTAR: + case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_PLUS: + case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_MINPLUS: + case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_POSPLUS: + case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + case OP_QUERY: + case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_MINQUERY: + case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_POSQUERY: + case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); + break; + } +#else + (void)(utf); /* Keep compiler happy by referencing function argument */ +#endif /* MAYBE_UTF_MULTI */ + } + } +} + +/* End of pcre2_find_bracket.c */
diff --git a/pcre2/src/pcre2_fuzzsupport.c b/pcre2/src/pcre2_fuzzsupport.c
new file mode 100644
index 000000000..462b48a93
--- /dev/null
+++ b/pcre2/src/pcre2_fuzzsupport.c
@@ -0,0 +1,333 @@
+/***************************************************************************
+Fuzzer driver for PCRE2. Given an arbitrary string of bytes and a length, it
+tries to compile and match it, deriving options from the string itself. If
+STANDALONE is defined, a main program that calls the driver with the contents
+of specified files is compiled, and commentary on what is happening is output.
+If an argument starts with '=' the rest of it is taken as a literal string
+rather than a file name. This allows easy testing of short strings.
+
+Written by Philip Hazel, October 2016
+***************************************************************************/
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include "pcre2.h"
+
+#define MAX_MATCH_SIZE 1000
+
+#define ALLOWED_COMPILE_OPTIONS \
+  (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
+   PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
+   PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
+   PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
+   PCRE2_NO_AUTO_CAPTURE| \
+   PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
+   PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
+   PCRE2_UTF)
+
+#define ALLOWED_MATCH_OPTIONS \
+  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
+   PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_HARD| \
+   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT)
+
+/* This is the callout function. Its only purpose is to halt matching if there
+are more than 100 callouts, as one way of stopping too much time being spent on
+fruitless matches. The callout data is a pointer to the counter. */
+
+static int callout_function(pcre2_callout_block *cb, void *callout_data)
+{
+(void)cb; /* Avoid unused parameter warning */
+*((uint32_t *)callout_data) += 1;
+return (*((uint32_t *)callout_data) > 100)? PCRE2_ERROR_CALLOUT : 0;
+}
+
+/* Putting in this apparently unnecessary prototype prevents gcc from giving a
+"no previous prototype" warning when compiling at high warning level. */
+
+int LLVMFuzzerTestOneInput(const unsigned char *, size_t);
+
+/* Here's the driving function. The input is compiled as a pattern and, when
+that succeeds, matched against at most the first MAX_MATCH_SIZE bytes of the
+same input. Compiling is tried with options derived from the input and again
+with only PCRE2_NEVER_BACKSLASH_C; each successful compile is matched with
+derived match options and again with none.
+
+Arguments:
+  data        the bytes used as both pattern and subject
+  size        the number of bytes in data
+
+Returns:      always 0; when STANDALONE is not defined, abort() is called if
+              the message for a compile error contains "internal error"
+*/
+
+int LLVMFuzzerTestOneInput(const unsigned char *data, size_t size)
+{
+uint32_t compile_options;
+uint32_t match_options;
+pcre2_match_data *match_data = NULL;
+pcre2_match_context *match_context = NULL;
+size_t match_size;
+int r1, r2;
+int i;
+
+if (size < 1) return 0;
+
+/* Limiting the length of the subject for matching stops fruitless searches
+in large trees taking too much time. */
+
+match_size = (size > MAX_MATCH_SIZE)? MAX_MATCH_SIZE : size;
+
+/* Figure out some options to use. Initialize the random number to ensure
+repeatability. Ensure that we get a 32-bit unsigned random number for testing
+options. (RAND_MAX is required to be at least 32767, but is commonly
+2147483647, which excludes the top bit.) */
+
+srand((unsigned int)(data[size/2]));
+r1 = rand();
+r2 = rand();
+
+/* Ensure that all undefined option bits are zero (waste of time trying them)
+and also that PCRE2_NO_UTF_CHECK is unset, as there is no guarantee that the
+input is UTF-8. Also unset PCRE2_NEVER_UTF and PCRE2_NEVER_UCP as there is no
+reason to disallow UTF and UCP. Force PCRE2_NEVER_BACKSLASH_C to be set because
+\C in random patterns is highly likely to cause a crash. */
+
+compile_options =
+  ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_COMPILE_OPTIONS) |
+  PCRE2_NEVER_BACKSLASH_C;
+
+match_options =
+  ((((uint32_t)r1 << 16) | ((uint32_t)r2 & 0xffff)) & ALLOWED_MATCH_OPTIONS);
+
+/* Do the compile with and without the options, and after a successful compile,
+likewise do the match with and without the options. */
+
+for (i = 0; i < 2; i++)
+  {
+  uint32_t callout_count;
+  int errorcode;
+  PCRE2_SIZE erroroffset;
+  pcre2_code *code;
+
+#ifdef STANDALONE
+  printf("Compile options %.8x never_backslash_c", compile_options);
+  printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+    ((compile_options & PCRE2_ALT_BSUX) != 0)? ",alt_bsux" : "",
+    ((compile_options & PCRE2_ALT_CIRCUMFLEX) != 0)? ",alt_circumflex" : "",
+    ((compile_options & PCRE2_ALT_VERBNAMES) != 0)? ",alt_verbnames" : "",
+    ((compile_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? ",allow_empty_class" : "",
+    ((compile_options & PCRE2_ANCHORED) != 0)? ",anchored" : "",
+    ((compile_options & PCRE2_AUTO_CALLOUT) != 0)? ",auto_callout" : "",
+    ((compile_options & PCRE2_CASELESS) != 0)? ",caseless" : "",
+    ((compile_options & PCRE2_DOLLAR_ENDONLY) != 0)? ",dollar_endonly" : "",
+    ((compile_options & PCRE2_DOTALL) != 0)? ",dotall" : "",
+    ((compile_options & PCRE2_DUPNAMES) != 0)? ",dupnames" : "",
+    ((compile_options & PCRE2_EXTENDED) != 0)? ",extended" : "",
+    ((compile_options & PCRE2_FIRSTLINE) != 0)? ",firstline" : "",
+    ((compile_options & PCRE2_MATCH_UNSET_BACKREF) != 0)? ",match_unset_backref" : "",
+    ((compile_options & PCRE2_MULTILINE) != 0)? ",multiline" : "",
+    ((compile_options & PCRE2_NEVER_UCP) != 0)? ",never_ucp" : "",
+    ((compile_options & PCRE2_NEVER_UTF) != 0)? ",never_utf" : "",
+    ((compile_options & PCRE2_NO_AUTO_CAPTURE) != 0)? ",no_auto_capture" : "",
+    ((compile_options & PCRE2_NO_AUTO_POSSESS) != 0)? ",no_auto_possess" : "",
+    ((compile_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? ",no_dotstar_anchor" : "",
+    ((compile_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "",
+    ((compile_options & PCRE2_NO_START_OPTIMIZE) != 0)? ",no_start_optimize" : "",
+    ((compile_options & PCRE2_UCP) != 0)? ",ucp" : "",
+    ((compile_options & PCRE2_UNGREEDY) != 0)? ",ungreedy" : "",
+    ((compile_options & PCRE2_USE_OFFSET_LIMIT) != 0)? ",use_offset_limit" : "",
+    ((compile_options & PCRE2_UTF) != 0)? ",utf" : "");
+#endif
+
+  code = pcre2_compile((PCRE2_SPTR)data, (PCRE2_SIZE)size, compile_options,
+    &errorcode, &erroroffset, NULL);
+
+  /* Compilation succeeded */
+
+  if (code != NULL)
+    {
+    int j;
+    uint32_t save_match_options = match_options;
+
+    /* Create match data and context blocks only when we first need them. Set
+    low match and recursion limits to avoid wasting too much searching large
+    pattern trees. Almost all matches are going to fail. If either block cannot
+    be created, free the compiled pattern and leave the loop so that the common
+    cleanup at the end of the function releases whatever was allocated. */
+
+    if (match_data == NULL)
+      {
+      match_data = pcre2_match_data_create(32, NULL);
+      if (match_data == NULL)
+        {
+#ifdef STANDALONE
+        printf("** Failed to create match data block\n");
+#endif
+        pcre2_code_free(code);
+        break;
+        }
+      }
+
+    if (match_context == NULL)
+      {
+      match_context = pcre2_match_context_create(NULL);
+      if (match_context == NULL)
+        {
+#ifdef STANDALONE
+        printf("** Failed to create match context block\n");
+#endif
+        pcre2_code_free(code);
+        break;
+        }
+      (void)pcre2_set_match_limit(match_context, 100);
+      (void)pcre2_set_recursion_limit(match_context, 100);
+      (void)pcre2_set_callout(match_context, callout_function, &callout_count);
+      }
+
+    /* Match twice, with and without options */
+
+    for (j = 0; j < 2; j++)
+      {
+#ifdef STANDALONE
+      printf("Match options %.8x", match_options);
+      printf("%s%s%s%s%s%s%s%s\n",
+        ((match_options & PCRE2_ANCHORED) != 0)? ",anchored" : "",
+        ((match_options & PCRE2_NO_UTF_CHECK) != 0)? ",no_utf_check" : "",
+        ((match_options & PCRE2_NOTBOL) != 0)? ",notbol" : "",
+        ((match_options & PCRE2_NOTEMPTY) != 0)? ",notempty" : "",
+        ((match_options & PCRE2_NOTEMPTY_ATSTART) != 0)? ",notempty_atstart" : "",
+        ((match_options & PCRE2_NOTEOL) != 0)? ",noteol" : "",
+        ((match_options & PCRE2_PARTIAL_HARD) != 0)? ",partial_hard" : "",
+        ((match_options & PCRE2_PARTIAL_SOFT) != 0)? ",partial_soft" : "");
+#endif
+
+      callout_count = 0;
+      errorcode = pcre2_match(code, (PCRE2_SPTR)data, (PCRE2_SIZE)match_size, 0,
+        match_options, match_data, match_context);
+
+#ifdef STANDALONE
+      if (errorcode >= 0) printf("Match returned %d\n", errorcode); else
+        {
+        unsigned char buffer[256];
+        pcre2_get_error_message(errorcode, buffer, 256);
+        printf("Match failed: error %d: %s\n", errorcode, buffer);
+        }
+#endif
+
+      match_options = 0; /* For second time */
+      }
+
+    match_options = save_match_options; /* Reset for the second compile */
+    pcre2_code_free(code);
+    }
+
+  /* Compilation failed */
+
+  else
+    {
+    unsigned char buffer[256];
+    pcre2_get_error_message(errorcode, buffer, 256);
+#ifdef STANDALONE
+    printf("Error %d at offset %lu: %s\n", errorcode, (unsigned long)erroroffset, buffer);
+#else
+    if (strstr((const char *)buffer, "internal error") != NULL) abort();
+#endif
+    }
+
+  compile_options = PCRE2_NEVER_BACKSLASH_C; /* For second time */
+  }
+
+if (match_data != NULL) pcre2_match_data_free(match_data);
+if (match_context != NULL) pcre2_match_context_free(match_context);
+
+return 0;
+}
+
+
+/* Optional main program. */
+
+#ifdef STANDALONE
+int main(int argc, char **argv)
+{
+int i;
+
+if (argc < 2)
+  {
+  printf("** No arguments given\n");
+  return 0;
+  }
+
+for (i = 1; i < argc; i++)
+  {
+  size_t filelen;
+  size_t readsize;
+  unsigned char *buffer;
+  FILE *f;
+
+  /* Handle a literal string. Copy to an exact size buffer so that checks for
+  overrunning work. */
+
+  if (argv[i][0] == '=')
+    {
+    readsize = strlen(argv[i]) - 1;
+    printf("------ ------\n");
+    printf("Length = %lu\n", (unsigned long)readsize);
+    printf("%.*s\n", (int)readsize, argv[i]+1);
+    buffer = (unsigned char *)malloc(readsize);
+    if (buffer == NULL)
+      printf("** Failed to allocate %lu bytes of memory\n", (unsigned long)readsize);
+    else
+      {
+      memcpy(buffer, argv[i]+1, readsize);
+      LLVMFuzzerTestOneInput(buffer, readsize);
+      free(buffer);
+      }
+    continue;
+    }
+
+  /* Handle a string given in a file */
+
+  f = fopen(argv[i], "rb");
+  if (f == NULL)
+    {
+    printf("** Failed to open %s: %s\n", argv[i], strerror(errno));
+    continue;
+    }
+
+  printf("------ %s ------\n", argv[i]);
+
+  fseek(f, 0, SEEK_END);
+  filelen = ftell(f);
+  fseek(f, 0, SEEK_SET);
+
+  buffer = (unsigned char *)malloc(filelen);
+  if (buffer == NULL)
+    {
+    printf("** Failed to allocate %lu bytes of memory\n", (unsigned long)filelen);
+    fclose(f);
+    continue;
+    }
+
+  readsize = fread(buffer, 1, filelen, f);
+  fclose(f);
+
+  if (readsize != filelen)
+    printf("** File size is %lu but fread() returned %lu\n",
+      (unsigned long)filelen, (unsigned long)readsize);
+  else
+    {
+    printf("Length = %lu\n", (unsigned long)filelen);
+    LLVMFuzzerTestOneInput(buffer, filelen);
+    }
+  free(buffer);
+  }
+
+return 0;
+}
+#endif /* STANDALONE */
+
+/* End */
diff --git a/pcre2/src/pcre2_internal.h b/pcre2/src/pcre2_internal.h index a4cf1e08c..6a8774ce8 100644 --- a/pcre2/src/pcre2_internal.h +++ b/pcre2/src/pcre2_internal.h @@ -2,12 +2,12 @@ * Perl-Compatible Regular Expressions * *************************************************/ -/* PCRE is a library of functions to support regular expressions whose syntax +/* PCRE2 is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -39,7 +39,10 @@ POSSIBILITY OF SUCH DAMAGE. */ /* We do not support both EBCDIC and Unicode at the same time. The "configure" -script prevents both being selected, but not everybody uses "configure". */ +script prevents both being selected, but not everybody uses "configure". EBCDIC +is only supported for the 8-bit library, but the check for this has to be later +in this file, because the first part is not width-dependent, and is included by +pcre2test.c with CODE_UNIT_WIDTH == 0. */ #if defined EBCDIC && defined SUPPORT_UNICODE #error The use of both EBCDIC and SUPPORT_UNICODE is not supported. @@ -70,6 +73,14 @@ typedef int BOOL; #include #endif +/* Older versions of MSVC lack snprintf(). This define allows for +warning/error-free compilation and testing with MSVC compilers back to at least +MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + /* When compiling a DLL for Windows, the exported symbols have to be declared using some MS magic. I found some useful information on this web page: http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the @@ -131,20 +142,6 @@ pcre2_match() because of the way it backtracks. */ #define PCRE2_SPTR CUSTOM_SUBJECT_PTR #endif -/* When compiling with the MSVC compiler, it is sometimes necessary to include -a "calling convention" before exported function names. (This is secondhand -information; I know nothing about MSVC myself). For example, something like - - void __cdecl function(....) - -might be needed. 
In order so make this easy, all the exported functions have -PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not -set, we ensure here that it has no effect. */ - -#ifndef PCRE2_CALL_CONVENTION -#define PCRE2_CALL_CONVENTION -#endif - /* When checking for integer overflow in pcre2_compile(), we need to handle large integers. If a 64-bit integer type is available, we can use that. Otherwise we have to cast to double, which of course requires floating point @@ -166,7 +163,7 @@ by "configure". */ #endif /* When compiling for use with the Virtual Pascal compiler, these functions -need to have their names changed. PCRE must be compiled with the -DVPCOMPAT +need to have their names changed. PCRE2 must be compiled with the -DVPCOMPAT option on the command line. */ #ifdef VPCOMPAT @@ -189,7 +186,7 @@ neither (there some non-Unix environments where this is the case). */ #define memmove(a, b, c) bcopy(b, a, c) #else /* HAVE_BCOPY */ static void * -pcre_memmove(void *d, const void *s, size_t n) +pcre2_memmove(void *d, const void *s, size_t n) { size_t i; unsigned char *dest = (unsigned char *)d; @@ -207,7 +204,7 @@ else return (void *)(dest - n); } } -#define memmove(a, b, c) pcre_memmove(a, b, c) +#define memmove(a, b, c) pcre2_memmove(a, b, c) #endif /* not HAVE_BCOPY */ #endif /* not HAVE_MEMMOVE */ #endif /* not VPCOMPAT */ @@ -231,8 +228,15 @@ Unicode doesn't go beyond 0x0010ffff. */ #define MAX_UTF_CODE_POINT 0x10ffff -/* Compile-time errors are added to this value. As they are documented, it -should probably never be changed. */ +/* Compile-time positive error numbers (all except UTF errors, which are +negative) start at this value. It should probably never be changed, in case +some application is checking for specific numbers. There is a copy of this +#define in pcre2posix.c (which now no longer includes this file). 
Ideally, a +way of having a single definition should be found, but as the number is +unlikely to change, this is not a pressing issue. The original reason for +having a base other than 0 was to keep the absolute values of compile-time and +run-time error numbers numerically different, but in the event the code does +not rely on this. */ #define COMPILE_ERROR_BASE 100 @@ -266,21 +270,21 @@ advancing the pointer. */ #define GETUTF8(c, eptr) \ { \ - if ((c & 0x20) == 0) \ - c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ - else if ((c & 0x10) == 0) \ - c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ - else if ((c & 0x08) == 0) \ - c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ - ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ - else if ((c & 0x04) == 0) \ - c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ - ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ - (eptr[4] & 0x3f); \ + if ((c & 0x20u) == 0) \ + c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ + else if ((c & 0x10u) == 0) \ + c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ + else if ((c & 0x08u) == 0) \ + c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ + ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ + else if ((c & 0x04u) == 0) \ + c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ + ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ + (eptr[4] & 0x3fu); \ else \ - c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ - ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ - ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ + c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ + ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ + ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ } /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing @@ -288,31 +292,31 @@ the pointer. 
*/ #define GETUTF8INC(c, eptr) \ { \ - if ((c & 0x20) == 0) \ - c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \ - else if ((c & 0x10) == 0) \ + if ((c & 0x20u) == 0) \ + c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \ + else if ((c & 0x10u) == 0) \ { \ - c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \ + c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \ eptr += 2; \ } \ - else if ((c & 0x08) == 0) \ + else if ((c & 0x08u) == 0) \ { \ - c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \ - ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ + c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \ + ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ eptr += 3; \ } \ - else if ((c & 0x04) == 0) \ + else if ((c & 0x04u) == 0) \ { \ - c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \ - ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \ - (eptr[3] & 0x3f); \ + c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \ + ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \ + (eptr[3] & 0x3fu); \ eptr += 4; \ } \ else \ { \ - c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \ - ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \ - ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \ + c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \ + ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \ + ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \ eptr += 5; \ } \ } @@ -322,34 +326,34 @@ advancing the pointer, incrementing the length. 
*/ #define GETUTF8LEN(c, eptr, len) \ { \ - if ((c & 0x20) == 0) \ + if ((c & 0x20u) == 0) \ { \ - c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ + c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ len++; \ } \ - else if ((c & 0x10) == 0) \ + else if ((c & 0x10u) == 0) \ { \ - c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ + c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ len += 2; \ } \ - else if ((c & 0x08) == 0) \ + else if ((c & 0x08u) == 0) \ {\ - c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ - ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ + c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ + ((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ len += 3; \ } \ - else if ((c & 0x04) == 0) \ + else if ((c & 0x04u) == 0) \ { \ - c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ - ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ - (eptr[4] & 0x3f); \ + c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ + ((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ + (eptr[4] & 0x3fu); \ len += 4; \ } \ else \ {\ - c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ - ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ - ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ + c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ + ((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ + ((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ len += 5; \ } \ } @@ -379,7 +383,7 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */ /* Character U+180E (Mongolian Vowel Separator) is not included in the list of spaces in the Unicode file PropList.txt, and Perl does not recognize it as a space. However, in many other sources it is listed as a space and has been in -PCRE for a long time. */ +PCRE (both APIs) for a long time. */ #define HSPACE_LIST \ CHAR_HT, CHAR_SPACE, CHAR_NBSP, \ @@ -524,9 +528,11 @@ bytes in a code unit in that mode. 
*/ #define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ #define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */ #define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */ -#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */ +#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */ #define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */ #define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */ +#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */ +#define PCRE2_HASBKC 0x00400000 /* contains \C */ #define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) @@ -545,17 +551,9 @@ req_unit match. */ #define REQ_CU_MAX 1000 -/* Bit definitions for entries in the pcre_ctypes table. */ - -#define ctype_space 0x01 -#define ctype_letter 0x02 -#define ctype_digit 0x04 -#define ctype_xdigit 0x08 -#define ctype_word 0x10 /* alphanumeric or '_' */ -#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ - -/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set -of bits for a class map. Some classes are built by combining these tables. */ +/* Offsets for the bitmap tables in the cbits set of tables. Each table +contains a set of bits for a class map. Some classes are built by combining +these tables. */ #define cbit_space 0 /* [:space:] or \s */ #define cbit_xdigit 32 /* [:xdigit:] */ @@ -569,19 +567,28 @@ of bits for a class map. Some classes are built by combining these tables. */ #define cbit_cntrl 288 /* [:cntrl:] */ #define cbit_length 320 /* Length of the cbits table */ -/* Offsets of the various tables from the base tables pointer, and -total length. */ +/* Bit definitions for entries in the ctypes table. 
*/ -#define lcc_offset 0 -#define fcc_offset 256 -#define cbits_offset 512 -#define ctypes_offset (cbits_offset + cbit_length) +#define ctype_space 0x01 +#define ctype_letter 0x02 +#define ctype_digit 0x04 +#define ctype_xdigit 0x08 +#define ctype_word 0x10 /* alphanumeric or '_' */ +#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ + +/* Offsets of the various tables from the base tables pointer, and +total length of the tables. */ + +#define lcc_offset 0 /* Lower case */ +#define fcc_offset 256 /* Flip case */ +#define cbits_offset 512 /* Character classes */ +#define ctypes_offset (cbits_offset + cbit_length) /* Character types */ #define tables_length (ctypes_offset + 256) /* -------------------- Character and string names ------------------------ */ -/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal +/* If PCRE2 is to support UTF-8 on EBCDIC platforms, we cannot use normal character constants like '*' because the compiler would emit their EBCDIC code, which is different from their ASCII/UTF-8 code. Instead we define macros for the characters so that they always use the ASCII/UTF-8 code when UTF-8 support @@ -589,7 +596,7 @@ is enabled. When UTF-8 support is not enabled, the definitions use character literals. Both character and string versions of each character are needed, and there are some longer strings as well. -This means that, on EBCDIC platforms, the PCRE library can handle either +This means that, on EBCDIC platforms, the PCRE2 library can handle either EBCDIC, or UTF-8, but not both. To support both in the same compiled library would need different lookups depending on whether PCRE2_UTF was set or not. This would make it impossible to use characters in switch/case statements, @@ -601,7 +608,7 @@ macros to give the functions distinct names. 
*/ #ifndef SUPPORT_UNICODE /* UTF-8 support is not enabled; use the platform-dependent character literals -so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF +so that PCRE2 works in both ASCII and EBCDIC environments, but only in non-UTF mode. Newline characters are problematic in EBCDIC. Though it has CR and LF characters, a common practice has been to use its NL (0x15) character as the line terminator in C-like processing environments. However, sometimes the LF @@ -609,7 +616,7 @@ line terminator in C-like processing environments. However, sometimes the LF http://unicode.org/standard/reports/tr13/tr13-5.html -PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 +PCRE2 defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 instead. Whichever is *not* chosen is defined as NEL. In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the @@ -917,6 +924,7 @@ a positive value. */ #define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" #define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" #define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" +#define STRING_MARK "MARK" #else /* SUPPORT_UNICODE */ @@ -1189,6 +1197,7 @@ only. */ #define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS #define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN #define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN +#define STRING_MARK STR_M STR_A STR_R STR_K #endif /* SUPPORT_UNICODE */ @@ -1212,7 +1221,7 @@ only. 
*/ #define PT_TABSIZE 11 /* Size of square table for autopossessify tests */ /* The following special properties are used only in XCLASS items, when POSIX -classes are specified and PCRE_UCP is set - in other words, for Unicode +classes are specified and PCRE2_UCP is set - in other words, for Unicode handling of these classes. They are not available via the \p or \P escapes like those in the above list, and so they do not take part in the autopossessifying table. */ @@ -1275,23 +1284,16 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves like \N. -The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. -when PCRE_UCP is set and replacement of \d etc by \p sequences is required. -They must be contiguous, and remain in order so that the replacements can be -looked up from a table. - Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in -check_escape(). There are two tests in the code for an escape -greater than ESC_b and less than ESC_Z to detect the types that may be -repeated. These are the types that consume characters. If any new escapes are -put in between that don't consume a character, that code will have to change. -*/ +check_escape(). There are tests in the code for an escape greater than ESC_b +and less than ESC_Z to detect the types that may be repeated. These are the +types that consume characters. If any new escapes are put in between that don't +consume a character, that code will have to change. 
*/ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, - ESC_E, ESC_Q, ESC_g, ESC_k, - ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu }; + ESC_E, ESC_Q, ESC_g, ESC_k }; /********************** Opcode definitions ******************/ @@ -1301,12 +1303,12 @@ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in order to the list of escapes immediately above. Furthermore, values up to OP_DOLLM must not be changed without adjusting the table called autoposstab in -pcre_compile.c +pcre2_auto_possess.c Whenever this list is updated, the two macro definitions that follow must be updated to match. The possessification table called "opcode_possessify" in -pcre_compile.c must also be updated, and also the tables called "coptable" -and "poptable" in pcre_dfa_exec.c. +pcre2_compile.c must also be updated, and also the tables called "coptable" +and "poptable" in pcre2_dfa_match.c. ****** NOTE NOTE NOTE ******/ @@ -1357,7 +1359,8 @@ enum { OP_CIRC, /* 27 Start of line - not multiline */ OP_CIRCM, /* 28 Start of line - multiline */ - /* Single characters; caseful must precede the caseless ones */ + /* Single characters; caseful must precede the caseless ones, and these + must remain in this order, and adjacent. */ OP_CHAR, /* 29 Match one character, casefully */ OP_CHARI, /* 30 Match one character, caselessly */ @@ -1800,11 +1803,16 @@ typedef struct pcre2_serialized_data { #if defined PCRE2_CODE_UNIT_WIDTH && PCRE2_CODE_UNIT_WIDTH != 0 +/* EBCDIC is supported only for the 8-bit library. */ + +#if defined EBCDIC && PCRE2_CODE_UNIT_WIDTH != 8 +#error EBCDIC is not supported for the 16-bit or 32-bit libraries +#endif + /* This is the largest non-UTF code point. 
*/ #define MAX_NON_UTF_CHAR (0xffffffffU >> (32 - PCRE2_CODE_UNIT_WIDTH)) - /* Internal shared data tables and variables. These are used by more than one of the exported public functions. They have to be "external" in the C sense, but are not part of the PCRE2 public API. Although the data for some of them is @@ -1878,11 +1886,12 @@ private structures. */ /* Private "external" functions. These are internal functions that are called from modules other than the one in which they are defined. They have to be -"external" in the C sense, but are not part of the PCRE public API. They are +"external" in the C sense, but are not part of the PCRE2 public API. They are not referenced from pcre2test, and must not be defined when no code unit width is available. */ #define _pcre2_auto_possessify PCRE2_SUFFIX(_pcre2_auto_possessify_) +#define _pcre2_check_escape PCRE2_SUFFIX(_pcre2_check_escape_) #define _pcre2_find_bracket PCRE2_SUFFIX(_pcre2_find_bracket_) #define _pcre2_is_newline PCRE2_SUFFIX(_pcre2_is_newline_) #define _pcre2_jit_free_rodata PCRE2_SUFFIX(_pcre2_jit_free_rodata_) @@ -1904,6 +1913,8 @@ is available. */ extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); +extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *, + int *, uint32_t, BOOL, compile_block *); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); diff --git a/pcre2/src/pcre2_intmodedep.h b/pcre2/src/pcre2_intmodedep.h index f20f71e1e..ebff7e306 100644 --- a/pcre2/src/pcre2_intmodedep.h +++ b/pcre2/src/pcre2_intmodedep.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -72,7 +72,7 @@ just to undefine them all. */ #undef MAX_MARK #undef MAX_PATTERN_SIZE #undef MAX_UTF_SINGLE_CU -#undef NOT_FIRSTCHAR +#undef NOT_FIRSTCU #undef PUT #undef PUT2 #undef PUT2INC @@ -94,7 +94,7 @@ easier to maintain, the storing and loading of offsets from the compiled code unit string is now handled by the macros that are defined here. The macros are controlled by the value of LINK_SIZE. This defaults to 2, but -values of 2 or 4 are also supported. */ +values of 3 or 4 are also supported. */ /* ------------------- 8-bit support ------------------ */ @@ -102,29 +102,29 @@ values of 2 or 4 are also supported. */ #if LINK_SIZE == 2 #define PUT(a,n,d) \ - (a[n] = (d) >> 8), \ - (a[(n)+1] = (d) & 255) + (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) #define GET(a,n) \ - (((a)[n] << 8) | (a)[(n)+1]) + (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) #define MAX_PATTERN_SIZE (1 << 16) #elif LINK_SIZE == 3 #define PUT(a,n,d) \ - (a[n] = (d) >> 16), \ - (a[(n)+1] = (d) >> 8), \ - (a[(n)+2] = (d) & 255) + (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) #define GET(a,n) \ - (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) + (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) #define MAX_PATTERN_SIZE (1 << 24) #elif LINK_SIZE == 4 #define PUT(a,n,d) \ - (a[n] = (d) >> 24), \ - (a[(n)+1] = (d) >> 16), \ - (a[(n)+2] = (d) >> 8), \ - (a[(n)+3] = (d) & 255) + (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ + (a[(n)+3] = (PCRE2_UCHAR)((d) & 
255)) #define GET(a,n) \ - (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) + (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ #else @@ -140,7 +140,7 @@ values of 2 or 4 are also supported. */ #undef LINK_SIZE #define LINK_SIZE 1 #define PUT(a,n,d) \ - (a[n] = (d)) + (a[n] = (PCRE2_UCHAR)(d)) #define GET(a,n) \ (a[n]) #define MAX_PATTERN_SIZE (1 << 16) @@ -149,10 +149,10 @@ values of 2 or 4 are also supported. */ #undef LINK_SIZE #define LINK_SIZE 2 #define PUT(a,n,d) \ - (a[n] = (d) >> 16), \ - (a[(n)+1] = (d) & 65535) + (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ + (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) #define GET(a,n) \ - (((a)[n] << 16) | (a)[(n)+1]) + (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ #else @@ -200,11 +200,11 @@ arithmetic results in a signed value. Hence the cast. */ #endif /* Other macros that are different for 8-bit mode. The MAX_255 macro checks -whether its argument is less than 256. The maximum length of a MARK name must -fit in one code unit; currently it is set to 255 or 65535. The TABLE_GET macro -is used to access elements of tables containing exactly 256 items. When code -points can be greater than 255, a check is needed before accessing these -tables. */ +whether its argument, which is assumed to be one code unit, is less than 256. +The maximum length of a MARK name must fit in one code unit; currently it is +set to 255 or 65535. The TABLE_GET macro is used to access elements of tables +containing exactly 256 items. When code points can be greater than 255, a check +is needed before accessing these tables. */ #if PCRE2_CODE_UNIT_WIDTH == 8 #define MAX_255(c) TRUE @@ -252,7 +252,7 @@ UTF support is omitted, we don't even define them. 
*/ /* #define MAX_UTF_SINGLE_CU */ /* #define HAS_EXTRALEN(c) */ /* #define GET_EXTRALEN(c) */ -/* #define NOT_FIRSTCHAR(c) */ +/* #define NOT_FIRSTCU(c) */ #define GETCHAR(c, eptr) c = *eptr; #define GETCHARTEST(c, eptr) c = *eptr; #define GETCHARINC(c, eptr) c = *eptr++; @@ -283,47 +283,47 @@ UTF support is omitted, we don't even define them. */ /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. Otherwise it has an undefined behaviour. */ -#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) +#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu]) -/* Returns TRUE, if the given character is not the first character -of a UTF sequence. */ +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ -#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80) +#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u) /* Get the next UTF-8 character, not advancing the pointer. This is called when we know we are in UTF-8 mode. */ #define GETCHAR(c, eptr) \ c = *eptr; \ - if (c >= 0xc0) GETUTF8(c, eptr); + if (c >= 0xc0u) GETUTF8(c, eptr); /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the pointer. */ #define GETCHARTEST(c, eptr) \ c = *eptr; \ - if (utf && c >= 0xc0) GETUTF8(c, eptr); + if (utf && c >= 0xc0u) GETUTF8(c, eptr); /* Get the next UTF-8 character, advancing the pointer. This is called when we know we are in UTF-8 mode. */ #define GETCHARINC(c, eptr) \ c = *eptr++; \ - if (c >= 0xc0) GETUTF8INC(c, eptr); + if (c >= 0xc0u) GETUTF8INC(c, eptr); /* Get the next character, testing for UTF-8 mode, and advancing the pointer. This is called when we don't know if we are in UTF-8 mode. */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ - if (utf && c >= 0xc0) GETUTF8INC(c, eptr); + if (utf && c >= 0xc0u) GETUTF8INC(c, eptr); /* Get the next UTF-8 character, not advancing the pointer, incrementing length if there are extra bytes. This is called when we know we are in UTF-8 mode. 
*/ #define GETCHARLEN(c, eptr, len) \ c = *eptr; \ - if (c >= 0xc0) GETUTF8LEN(c, eptr, len); + if (c >= 0xc0u) GETUTF8LEN(c, eptr, len); /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the pointer, incrementing length if there are extra bytes. This is called when we @@ -331,21 +331,21 @@ do not know if we are in UTF-8 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ - if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len); + if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len); /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-8 mode - we don't put a test within the macro because almost all calls are already within a block of UTF-8 only code. */ -#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- +#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr-- /* Same as above, just in the other direction. */ -#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++ -#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0) == 0x80) eptr++ +#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++ +#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++ /* Same as above, but it allows a fully customizable form. */ #define ACROSSCHAR(condition, eptr, action) \ - while((condition) && ((eptr) & 0xc0) == 0x80) action + while((condition) && ((eptr) & 0xc0u) == 0x80u) action /* Deposit a character into memory, returning the number of code units. */ @@ -364,63 +364,63 @@ because almost all calls are already within a block of UTF-8 only code. */ /* Tests whether the code point needs extra characters to decode. */ -#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800) +#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u) /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. Otherwise it has an undefined behaviour. 
*/ #define GET_EXTRALEN(c) 1 -/* Returns TRUE, if the given character is not the first character -of a UTF sequence. */ +/* Returns TRUE, if the given value is not the first code unit of a UTF +sequence. */ -#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00) +#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u) /* Base macro to pick up the low surrogate of a UTF-16 character, not advancing the pointer. */ #define GETUTF16(c, eptr) \ - { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; } + { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; } /* Get the next UTF-16 character, not advancing the pointer. This is called when we know we are in UTF-16 mode. */ #define GETCHAR(c, eptr) \ c = *eptr; \ - if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the pointer. */ #define GETCHARTEST(c, eptr) \ c = *eptr; \ - if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); /* Base macro to pick up the low surrogate of a UTF-16 character, advancing the pointer. */ #define GETUTF16INC(c, eptr) \ - { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; } + { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; } /* Get the next UTF-16 character, advancing the pointer. This is called when we know we are in UTF-16 mode. */ #define GETCHARINC(c, eptr) \ c = *eptr++; \ - if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); /* Get the next character, testing for UTF-16 mode, and advancing the pointer. This is called when we don't know if we are in UTF-16 mode. 
*/ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ - if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); /* Base macro to pick up the low surrogate of a UTF-16 character, not advancing the pointer, incrementing the length. */ #define GETUTF16LEN(c, eptr, len) \ - { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; } + { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; } /* Get the next UTF-16 character, not advancing the pointer, incrementing length if there is a low surrogate. This is called when we know we are in @@ -428,7 +428,7 @@ UTF-16 mode. */ #define GETCHARLEN(c, eptr, len) \ c = *eptr; \ - if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); /* Get the next UTF-816character, testing for UTF-16 mode, not advancing the pointer, incrementing length if there is a low surrogate. This is called when @@ -436,22 +436,22 @@ we do not know if we are in UTF-16 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ - if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-16 mode - we don't put a test within the macro because almost all calls are already within a block of UTF-16 only code. */ -#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr-- +#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr-- /* Same as above, just in the other direction. 
*/ -#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++ -#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00) == 0xdc00) eptr++ +#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++ +#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++ /* Same as above, but it allows a fully customizable form. */ #define ACROSSCHAR(condition, eptr, action) \ - if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action + if ((condition) && ((eptr) & 0xfc00u) == 0xdc00u) action /* Deposit a character into memory, returning the number of code units. */ @@ -469,7 +469,7 @@ into one PCRE2_UCHAR unit. */ #define MAX_UTF_SINGLE_CU (0x10ffffu) #define HAS_EXTRALEN(c) (0) #define GET_EXTRALEN(c) (0) -#define NOT_FIRSTCHAR(c) (0) +#define NOT_FIRSTCU(c) (0) /* Get the next UTF-32 character, not advancing the pointer. This is called when we know we are in UTF-32 mode. */ @@ -562,6 +562,7 @@ typedef struct pcre2_real_compile_context { int (*stack_guard)(uint32_t, void *); void *stack_guard_data; const uint8_t *tables; + PCRE2_SIZE max_pattern_length; uint16_t bsr_convention; uint16_t newline_convention; uint32_t parens_nest_limit; @@ -580,6 +581,7 @@ typedef struct pcre2_real_match_context { #endif int (*callout)(pcre2_callout_block *, void *); void *callout_data; + PCRE2_SIZE offset_limit; uint32_t match_limit; uint32_t recursion_limit; } pcre2_real_match_context; @@ -588,11 +590,17 @@ typedef struct pcre2_real_match_context { defined specially because it is required in pcre2_serialize_decode() when copying the size from possibly unaligned memory into a variable of the same type. Use a macro rather than a typedef to avoid compiler warnings when this -file is included multiple times by pcre2test. */ +file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the +largest lookbehind that is supported. 
(OP_REVERSE in a pattern has a 16-bit +argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field +here.) */ #undef CODE_BLOCKSIZE_TYPE #define CODE_BLOCKSIZE_TYPE size_t +#undef LOOKBEHIND_MAX +#define LOOKBEHIND_MAX UINT16_MAX + typedef struct pcre2_real_code { pcre2_memctl memctl; /* Memory control fields */ const uint8_t *tables; /* The character tables */ @@ -640,13 +648,26 @@ typedef struct pcre2_real_match_data { #ifndef PCRE2_PCRE2TEST -/* Structure for checking for mutual recursion when scanning compiled code. */ +/* Structures for checking for mutual recursion when scanning compiled or +parsed code. */ typedef struct recurse_check { struct recurse_check *prev; PCRE2_SPTR group; } recurse_check; +typedef struct parsed_recurse_check { + struct parsed_recurse_check *prev; + uint32_t *groupptr; +} parsed_recurse_check; + +/* Structure for building a cache when filling in recursion offsets. */ + +typedef struct recurse_cache { + PCRE2_SPTR group; + int groupnumber; +} recurse_cache; + /* Structure for maintaining a chain of pointers to the currently incomplete branches, for testing for left recursion while compiling. 
*/ @@ -678,9 +699,10 @@ typedef struct compile_block { PCRE2_SPTR start_code; /* The start of the compiled code */ PCRE2_SPTR start_pattern; /* The start of the pattern */ PCRE2_SPTR end_pattern; /* The end of the pattern */ - PCRE2_UCHAR *hwm; /* High watermark of workspace */ PCRE2_UCHAR *name_table; /* The name/number table */ - size_t workspace_size; /* Size of workspace */ + PCRE2_SIZE workspace_size; /* Size of workspace */ + PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ + PCRE2_SIZE erroroffset; /* Offset of error in pattern */ uint16_t names_found; /* Number of entries so far */ uint16_t name_entry_size; /* Size of each entry */ open_capitem *open_caps; /* Chain of open capture items */ @@ -688,12 +710,17 @@ typedef struct compile_block { uint32_t named_group_list_size; /* Number of entries in the list */ uint32_t external_options; /* External (initial) options */ uint32_t external_flags; /* External flag bits to be set */ - uint32_t bracount; /* Count of capturing parens as we compile */ - uint32_t final_bracount; /* Saved value after first pass */ + uint32_t bracount; /* Count of capturing parentheses */ + uint32_t lastcapture; /* Last capture encountered */ + uint32_t *parsed_pattern; /* Parsed pattern buffer */ + uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ + uint32_t *groupinfo; /* Group info vector */ uint32_t top_backref; /* Maximum back reference */ uint32_t backref_map; /* Bitmap of low back refs */ uint32_t nltype; /* Newline type */ uint32_t nllen; /* Newline string length */ + uint32_t class_range_start; /* Overall class range start */ + uint32_t class_range_end; /* Overall class range end */ PCRE2_UCHAR nl[4]; /* Newline string when fixed length */ int max_lookbehind; /* Maximum lookbehind (characters) */ int parens_depth; /* Depth of nested parentheses */ @@ -701,9 +728,8 @@ typedef struct compile_block { int req_varyopt; /* "After variable item" flag for reqbyte */ BOOL had_accept; /* (*ACCEPT) 
encountered */ BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ - BOOL check_lookbehind; /* Lookbehinds need later checking */ + BOOL had_recurse; /* Had a recursion or subroutine call */ BOOL dupnames; /* Duplicate names exist */ - BOOL iscondassert; /* Next assert is a condition */ } compile_block; /* Structure for keeping the properties of the in-memory stack used @@ -819,6 +845,7 @@ typedef struct dfa_match_block { PCRE2_SPTR last_used_ptr; /* Latest consulted character */ const uint8_t *tables; /* Character tables */ PCRE2_SIZE start_offset; /* The start offset value */ + uint32_t match_limit_recursion; /* As it says */ uint32_t moptions; /* Match options */ uint32_t poptions; /* Pattern options */ uint32_t nltype; /* Newline type */ diff --git a/pcre2/src/pcre2_jit_compile.c b/pcre2/src/pcre2_jit_compile.c index 272ab2857..8dea90a1c 100644 --- a/pcre2/src/pcre2_jit_compile.c +++ b/pcre2/src/pcre2_jit_compile.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -38,7 +38,6 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ - #ifdef HAVE_CONFIG_H #include "config.h" #endif @@ -186,9 +185,10 @@ typedef struct jit_arguments { int (*callout)(pcre2_callout_block *, void *); void *callout_data; /* Everything else after. 
*/ - sljit_ui limit_match; - uint32_t oveccount; - uint32_t options; + sljit_uw offset_limit; + sljit_u32 limit_match; + sljit_u32 oveccount; + sljit_u32 options; } jit_arguments; #define JIT_NUMBER_OF_COMPILE_MODES 3 @@ -197,8 +197,8 @@ typedef struct executable_functions { void *executable_funcs[JIT_NUMBER_OF_COMPILE_MODES]; void *read_only_data_heads[JIT_NUMBER_OF_COMPILE_MODES]; sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_MODES]; - sljit_ui top_bracket; - sljit_ui limit_match; + sljit_u32 top_bracket; + sljit_u32 limit_match; } executable_functions; typedef struct jump_list { @@ -349,43 +349,48 @@ typedef struct compiler_common { /* First byte code. */ PCRE2_SPTR start; /* Maps private data offset to each opcode. */ - sljit_si *private_data_ptrs; + sljit_s32 *private_data_ptrs; /* Chain list of read-only data ptrs. */ void *read_only_data_head; /* Tells whether the capturing bracket is optimized. */ - sljit_ub *optimized_cbracket; + sljit_u8 *optimized_cbracket; /* Tells whether the starting offset is a target of then. */ - sljit_ub *then_offsets; + sljit_u8 *then_offsets; /* Current position where a THEN must jump. */ then_trap_backtrack *then_trap; /* Starting offset of private data for capturing brackets. */ - int cbra_ptr; + sljit_s32 cbra_ptr; /* Output vector starting point. Must be divisible by 2. */ - int ovector_start; + sljit_s32 ovector_start; /* Points to the starting character of the current match. */ - int start_ptr; + sljit_s32 start_ptr; /* Last known position of the requested byte. */ - int req_char_ptr; + sljit_s32 req_char_ptr; /* Head of the last recursion. */ - int recursive_head_ptr; + sljit_s32 recursive_head_ptr; /* First inspected character for partial matching. (Needed for avoiding zero length partial matches.) */ - int start_used_ptr; + sljit_s32 start_used_ptr; /* Starting pointer for partial soft matches. */ - int hit_start; - /* End pointer of the first line. 
*/ - int first_line_end; + sljit_s32 hit_start; + /* Pointer of the match end position. */ + sljit_s32 match_end_ptr; /* Points to the marked string. */ - int mark_ptr; + sljit_s32 mark_ptr; /* Recursive control verb management chain. */ - int control_head_ptr; + sljit_s32 control_head_ptr; /* Points to the last matched capture block index. */ - int capture_last_ptr; + sljit_s32 capture_last_ptr; + /* Fast forward skipping byte code pointer. */ + PCRE2_SPTR fast_forward_bc_ptr; + /* Locals used by fast fail optimization. */ + sljit_s32 fast_fail_start_ptr; + sljit_s32 fast_fail_end_ptr; /* Flipped and lower case tables. */ - const sljit_ub *fcc; + const sljit_u8 *fcc; sljit_sw lcc; - /* Mode can be PCRE_STUDY_JIT_COMPILE and others. */ + /* Mode can be PCRE2_JIT_COMPLETE and others. */ int mode; /* TRUE, when minlength is greater than 0. */ BOOL might_be_empty; @@ -395,18 +400,20 @@ typedef struct compiler_common { BOOL has_skip_arg; /* (*THEN) is found in the pattern. */ BOOL has_then; + /* (*SKIP) or (*SKIP:arg) is found in lookbehind assertion. */ + BOOL has_skip_in_assert_back; /* Currently in recurse or negative assert. */ BOOL local_exit; /* Currently in a positive assert. */ BOOL positive_assert; /* Newline control. */ int nltype; - sljit_ui nlmax; - sljit_ui nlmin; + sljit_u32 nlmax; + sljit_u32 nlmin; int newline; int bsr_nltype; - sljit_ui bsr_nlmax; - sljit_ui bsr_nlmin; + sljit_u32 bsr_nlmax; + sljit_u32 bsr_nlmin; /* Dollar endonly. */ int endonly; /* Tables. 
*/ @@ -463,27 +470,27 @@ typedef struct compare_context { #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED int ucharptr; union { - sljit_si asint; - sljit_uh asushort; + sljit_s32 asint; + sljit_u16 asushort; #if PCRE2_CODE_UNIT_WIDTH == 8 - sljit_ub asbyte; - sljit_ub asuchars[4]; + sljit_u8 asbyte; + sljit_u8 asuchars[4]; #elif PCRE2_CODE_UNIT_WIDTH == 16 - sljit_uh asuchars[2]; + sljit_u16 asuchars[2]; #elif PCRE2_CODE_UNIT_WIDTH == 32 - sljit_ui asuchars[1]; + sljit_u32 asuchars[1]; #endif } c; union { - sljit_si asint; - sljit_uh asushort; + sljit_s32 asint; + sljit_u16 asushort; #if PCRE2_CODE_UNIT_WIDTH == 8 - sljit_ub asbyte; - sljit_ub asuchars[4]; + sljit_u8 asbyte; + sljit_u8 asuchars[4]; #elif PCRE2_CODE_UNIT_WIDTH == 16 - sljit_uh asuchars[2]; + sljit_u16 asuchars[2]; #elif PCRE2_CODE_UNIT_WIDTH == 32 - sljit_ui asuchars[1]; + sljit_u32 asuchars[1]; #endif } oc; #endif @@ -525,19 +532,19 @@ the start pointers when the end of the capturing group has not yet reached. */ #define PRIVATE_DATA(cc) (common->private_data_ptrs[(cc) - common->start]) #if PCRE2_CODE_UNIT_WIDTH == 8 -#define MOV_UCHAR SLJIT_MOV_UB -#define MOVU_UCHAR SLJIT_MOVU_UB +#define MOV_UCHAR SLJIT_MOV_U8 +#define MOVU_UCHAR SLJIT_MOVU_U8 #define IN_UCHARS(x) (x) #elif PCRE2_CODE_UNIT_WIDTH == 16 -#define MOV_UCHAR SLJIT_MOV_UH -#define MOVU_UCHAR SLJIT_MOVU_UH +#define MOV_UCHAR SLJIT_MOV_U16 +#define MOVU_UCHAR SLJIT_MOVU_U16 #define UCHAR_SHIFT (1) -#define IN_UCHARS(x) ((x) << UCHAR_SHIFT) +#define IN_UCHARS(x) ((x) * 2) #elif PCRE2_CODE_UNIT_WIDTH == 32 -#define MOV_UCHAR SLJIT_MOV_UI -#define MOVU_UCHAR SLJIT_MOVU_UI +#define MOV_UCHAR SLJIT_MOV_U32 +#define MOVU_UCHAR SLJIT_MOVU_U32 #define UCHAR_SHIFT (2) -#define IN_UCHARS(x) ((x) << UCHAR_SHIFT) +#define IN_UCHARS(x) ((x) * 4) #else #error Unsupported compiling mode #endif @@ -593,11 +600,6 @@ SLJIT_ASSERT(*cc >= OP_KET && *cc <= OP_KETRPOS); return count; } -static int ones_in_half_byte[16] = { - /* 0 */ 0, 1, 1, 2, /* 4 */ 1, 2, 
2, 3, - /* 8 */ 1, 2, 2, 3, /* 12 */ 2, 3, 3, 4 -}; - /* Functions whose might need modification for all new supported opcodes: next_opcode check_opcode_types @@ -813,6 +815,7 @@ static BOOL check_opcode_types(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPT { int count; PCRE2_SPTR slot; +PCRE2_SPTR assert_back_end = cc - 1; /* Calculate important variables (like stack size) and checks whether all opcodes are supported. */ while (cc < ccend) @@ -884,6 +887,13 @@ while (cc < ccend) cc += (*cc == OP_CALLOUT) ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2*LINK_SIZE); break; + case OP_ASSERTBACK: + slot = bracketend(cc); + if (slot > assert_back_end) + assert_back_end = slot; + cc += 1 + LINK_SIZE; + break; + case OP_THEN_ARG: common->has_then = TRUE; common->control_head_ptr = 1; @@ -905,9 +915,17 @@ while (cc < ccend) cc += 1; break; + case OP_SKIP: + if (cc < assert_back_end) + common->has_skip_in_assert_back = TRUE; + cc += 1; + break; + case OP_SKIP_ARG: common->control_head_ptr = 1; common->has_skip_arg = TRUE; + if (cc < assert_back_end) + common->has_skip_in_assert_back = TRUE; cc += 1 + 2 + cc[1]; break; @@ -921,10 +939,189 @@ while (cc < ccend) return TRUE; } +static BOOL is_accelerated_repeat(PCRE2_SPTR cc) +{ +switch(*cc) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + return (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI); + + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSSTAR: + case OP_POSPLUS: + + case OP_STARI: + case OP_MINSTARI: + case OP_PLUSI: + case OP_MINPLUSI: + case OP_POSSTARI: + case OP_POSPLUSI: + + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSSTAR: + case OP_NOTPOSPLUS: + + case OP_NOTSTARI: + case OP_NOTMINSTARI: + case OP_NOTPLUSI: + case OP_NOTMINPLUSI: + case OP_NOTPOSSTARI: + case OP_NOTPOSPLUSI: + return TRUE; + + case OP_CLASS: + case OP_NCLASS: +#if defined 
SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 + case OP_XCLASS: + cc += (*cc == OP_XCLASS) ? GET(cc, 1) : (int)(1 + (32 / sizeof(PCRE2_UCHAR))); +#else + cc += (1 + (32 / sizeof(PCRE2_UCHAR))); +#endif + + switch(*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + return TRUE; + } + break; + } +return FALSE; +} + +static SLJIT_INLINE BOOL detect_fast_forward_skip(compiler_common *common, int *private_data_start) +{ +PCRE2_SPTR cc = common->start; +PCRE2_SPTR end; + +/* Skip not repeated brackets. */ +while (TRUE) + { + switch(*cc) + { + case OP_SOD: + case OP_SOM: + case OP_SET_SOM: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + case OP_EODN: + case OP_EOD: + case OP_CIRC: + case OP_CIRCM: + case OP_DOLL: + case OP_DOLLM: + /* Zero width assertions. */ + cc++; + continue; + } + + if (*cc != OP_BRA && *cc != OP_CBRA) + break; + + end = cc + GET(cc, 1); + if (*end != OP_KET || PRIVATE_DATA(end) != 0) + return FALSE; + if (*cc == OP_CBRA) + { + if (common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) + return FALSE; + cc += IMM2_SIZE; + } + cc += 1 + LINK_SIZE; + } + +if (is_accelerated_repeat(cc)) + { + common->fast_forward_bc_ptr = cc; + common->private_data_ptrs[(cc + 1) - common->start] = *private_data_start; + *private_data_start += sizeof(sljit_sw); + return TRUE; + } +return FALSE; +} + +static SLJIT_INLINE void detect_fast_fail(compiler_common *common, PCRE2_SPTR cc, int *private_data_start, sljit_s32 depth) +{ + PCRE2_SPTR next_alt; + + SLJIT_ASSERT(*cc == OP_BRA || *cc == OP_CBRA); + + if (*cc == OP_CBRA && common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] == 0) + return; + + next_alt = bracketend(cc) - (1 + LINK_SIZE); + if (*next_alt != OP_KET || PRIVATE_DATA(next_alt) != 0) + return; + + do + { + next_alt = cc + GET(cc, 1); + + cc += 1 + LINK_SIZE + ((*cc == OP_CBRA) ? 
IMM2_SIZE : 0); + + while (TRUE) + { + switch(*cc) + { + case OP_SOD: + case OP_SOM: + case OP_SET_SOM: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + case OP_EODN: + case OP_EOD: + case OP_CIRC: + case OP_CIRCM: + case OP_DOLL: + case OP_DOLLM: + /* Zero width assertions. */ + cc++; + continue; + } + break; + } + + if (depth > 0 && (*cc == OP_BRA || *cc == OP_CBRA)) + detect_fast_fail(common, cc, private_data_start, depth - 1); + + if (is_accelerated_repeat(cc)) + { + common->private_data_ptrs[(cc + 1) - common->start] = *private_data_start; + + if (common->fast_fail_start_ptr == 0) + common->fast_fail_start_ptr = *private_data_start; + + *private_data_start += sizeof(sljit_sw); + common->fast_fail_end_ptr = *private_data_start; + + if (*private_data_start > SLJIT_MAX_LOCAL_SIZE) + return; + } + + cc = next_alt; + } + while (*cc == OP_ALT); +} + static int get_class_iterator_size(PCRE2_SPTR cc) { -sljit_ui min; -sljit_ui max; +sljit_u32 min; +sljit_u32 max; switch(*cc) { case OP_CRSTAR: @@ -961,7 +1158,7 @@ PCRE2_SPTR next_end; PCRE2_SPTR max_end; PCRE2_UCHAR type; sljit_sw length = end - begin; -sljit_si min, max, i; +sljit_s32 min, max, i; /* Detect fixed iterations first. 
*/ if (end[-(1 + LINK_SIZE)] != OP_KET) @@ -1097,6 +1294,7 @@ PCRE2_SPTR alternative; PCRE2_SPTR end = NULL; int private_data_ptr = *private_data_start; int space, size, bracketlen; +BOOL repeat_check = TRUE; while (cc < ccend) { @@ -1106,7 +1304,8 @@ while (cc < ccend) if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE) break; - if (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND) + if (repeat_check && (*cc == OP_ONCE || *cc == OP_ONCE_NC || *cc == OP_BRA || *cc == OP_CBRA || *cc == OP_COND)) + { if (detect_repeat(common, cc)) { /* These brackets are converted to repeats, so no global @@ -1114,6 +1313,8 @@ while (cc < ccend) if (cc >= end) end = bracketend(cc); } + } + repeat_check = TRUE; switch(*cc) { @@ -1169,6 +1370,13 @@ while (cc < ccend) bracketlen = 1 + LINK_SIZE + IMM2_SIZE; break; + case OP_BRAZERO: + case OP_BRAMINZERO: + case OP_BRAPOSZERO: + repeat_check = FALSE; + size = 1; + break; + CASE_ITERATOR_PRIVATE_DATA_1 space = 1; size = -2; @@ -1208,14 +1416,14 @@ while (cc < ccend) case OP_CLASS: case OP_NCLASS: - size = 1 + 32 / sizeof(PCRE2_UCHAR); space = get_class_iterator_size(cc + size); + size = 1 + 32 / sizeof(PCRE2_UCHAR); break; #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 case OP_XCLASS: - size = GET(cc, 1); space = get_class_iterator_size(cc + size); + size = GET(cc, 1); break; #endif @@ -1354,6 +1562,13 @@ while (cc < ccend) cc += 1 + LINK_SIZE + IMM2_SIZE; break; + case OP_THEN: + stack_restore = TRUE; + if (common->control_head_ptr != 0) + *needs_control_head = TRUE; + cc ++; + break; + default: stack_restore = TRUE; /* Fall through. 
*/ @@ -2008,7 +2223,7 @@ if (save) SLJIT_ASSERT(cc == ccend && stackptr == stacktop && (save || (tmp1empty && tmp2empty))); } -static SLJIT_INLINE PCRE2_SPTR set_then_offsets(compiler_common *common, PCRE2_SPTR cc, sljit_ub *current_offset) +static SLJIT_INLINE PCRE2_SPTR set_then_offsets(compiler_common *common, PCRE2_SPTR cc, sljit_u8 *current_offset) { PCRE2_SPTR end = bracketend(cc); BOOL has_alternatives = cc[GET(cc, 1)] == OP_ALT; @@ -2146,6 +2361,7 @@ add_stub(common, CMP(SLJIT_GREATER, STACK_TOP, 0, STACK_LIMIT, 0)); static SLJIT_INLINE void free_stack(compiler_common *common, int size) { DEFINE_COMPILER; + SLJIT_ASSERT(size > 0); OP2(SLJIT_SUB, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, size * sizeof(sljit_sw)); } @@ -2174,7 +2390,7 @@ static SLJIT_INLINE void reset_ovector(compiler_common *common, int length) { DEFINE_COMPILER; struct sljit_label *loop; -int i; +sljit_s32 i; /* At this point we can freely use all temporary registers. */ SLJIT_ASSERT(length > 1); @@ -2196,6 +2412,18 @@ else } } +static SLJIT_INLINE void reset_fast_fail(compiler_common *common) +{ +DEFINE_COMPILER; +sljit_s32 i; + +SLJIT_ASSERT(common->fast_fail_start_ptr < common->fast_fail_end_ptr); + +OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +for (i = common->fast_fail_start_ptr; i < common->fast_fail_end_ptr; i += sizeof(sljit_sw)) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), i, TMP1, 0); +} + static SLJIT_INLINE void do_reset_match(compiler_common *common, int length) { DEFINE_COMPILER; @@ -2249,6 +2477,7 @@ while (current != NULL) SLJIT_ASSERT_STOP(); break; } + SLJIT_ASSERT(current > (sljit_sw*)current[-1]); current = (sljit_sw*)current[-1]; } return -1; @@ -2267,7 +2496,7 @@ OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0); OP1(SLJIT_MOV, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); if (common->mark_ptr != 0) OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), common->mark_ptr); -OP1(SLJIT_MOV_UI, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, 
oveccount)); +OP1(SLJIT_MOV_U32, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, oveccount)); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_S0, 0); if (common->mark_ptr != 0) OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R0), SLJIT_OFFSETOF(jit_arguments, mark_ptr), SLJIT_R2, 0); @@ -2286,7 +2515,7 @@ OP2(SLJIT_ASHR, SLJIT_S1, 0, SLJIT_S1, 0, SLJIT_IMM, UCHAR_SHIFT); #endif SLJIT_ASSERT(sizeof(PCRE2_SIZE) == 4 || sizeof(PCRE2_SIZE) == 8); if (sizeof(PCRE2_SIZE) == 4) - OP1(SLJIT_MOVU_UI, SLJIT_MEM1(SLJIT_R2), sizeof(PCRE2_SIZE), SLJIT_S1, 0); + OP1(SLJIT_MOVU_U32, SLJIT_MEM1(SLJIT_R2), sizeof(PCRE2_SIZE), SLJIT_S1, 0); else OP1(SLJIT_MOVU, SLJIT_MEM1(SLJIT_R2), sizeof(PCRE2_SIZE), SLJIT_S1, 0); OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 1); @@ -2312,7 +2541,7 @@ else static SLJIT_INLINE void return_with_partial_match(compiler_common *common, struct sljit_label *quit) { DEFINE_COMPILER; -sljit_si mov_opcode; +sljit_s32 mov_opcode; SLJIT_COMPILE_ASSERT(STR_END == SLJIT_S1, str_end_must_be_saved_reg2); SLJIT_ASSERT(common->start_used_ptr != 0 && common->start_ptr != 0 @@ -2328,7 +2557,7 @@ OP1(SLJIT_MOV, SLJIT_S0, 0, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, startchar_ptr), SLJIT_R2, 0); OP1(SLJIT_MOV, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_R1), SLJIT_OFFSETOF(jit_arguments, match_data)); -mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_UI : SLJIT_MOV; +mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV; OP2(SLJIT_SUB, SLJIT_R2, 0, SLJIT_R2, 0, SLJIT_S0, 0); #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 @@ -2565,7 +2794,7 @@ else JUMPHERE(jump); } -static void peek_char(compiler_common *common, sljit_ui max) +static void peek_char(compiler_common *common, sljit_u32 max) { /* Reads the character into TMP1, keeps STR_PTR. Does not check STR_END. TMP2 Destroyed. 
*/ @@ -2610,12 +2839,12 @@ if (common->utf) #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 -static BOOL is_char7_bitset(const sljit_ub *bitset, BOOL nclass) +static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass) { /* Tells whether the character codes below 128 are enough to determine a match. */ -const sljit_ub value = nclass ? 0xff : 0; -const sljit_ub *end = bitset + 32; +const sljit_u8 value = nclass ? 0xff : 0; +const sljit_u8 *end = bitset + 32; bitset += 16; do @@ -2640,12 +2869,12 @@ SLJIT_ASSERT(common->utf); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); if (full_read) { jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); JUMPHERE(jump); } @@ -2653,7 +2882,7 @@ if (full_read) #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ -static void read_char_range(compiler_common *common, sljit_ui min, sljit_ui max, BOOL update_str_ptr) +static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, BOOL update_str_ptr) { /* Reads the precise value of a character into TMP1, if the character is between min and max (c >= min && c <= max). 
Otherwise it returns with a value @@ -2684,7 +2913,7 @@ if (common->utf) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0); if (update_str_ptr) - OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); @@ -2708,7 +2937,7 @@ if (common->utf) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0); if (update_str_ptr) - OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); @@ -2728,7 +2957,7 @@ if (common->utf) add_jump(compiler, (max < 0x10000) ? &common->utfreadchar16 : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); else if (max < 128) { - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); } else @@ -2737,7 +2966,7 @@ if (common->utf) if (!update_str_ptr) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); else - OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); @@ -2807,7 +3036,7 @@ if (common->utf) { /* This can be an extra read in some situations, but hopefully it is needed in most cases. 
*/ - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0); if (!update_str_ptr) { @@ -2819,7 +3048,7 @@ if (common->utf) OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); JUMPHERE(jump2); } else @@ -2834,7 +3063,7 @@ if (common->utf) OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); #endif -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); #if PCRE2_CODE_UNIT_WIDTH != 8 JUMPHERE(jump); #endif @@ -3026,7 +3255,7 @@ compare = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(compare); @@ -3035,7 +3264,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); /* We only have types for characters less than 256. 
*/ JUMPHERE(jump); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); +OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); @@ -3057,27 +3286,27 @@ SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8); sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); +OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); -OP1(SLJIT_MOV_UH, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); +OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } #endif /* SUPPORT_UNICODE */ -static SLJIT_INLINE struct sljit_label *mainloop_entry(compiler_common *common, BOOL hascrorlf, BOOL firstline) +static SLJIT_INLINE struct sljit_label *mainloop_entry(compiler_common *common, BOOL hascrorlf, sljit_u32 overall_options) { DEFINE_COMPILER; struct sljit_label *mainloop; struct sljit_label *newlinelabel = NULL; struct sljit_jump *start; struct sljit_jump *end = NULL; -struct sljit_jump *nl = NULL; +struct sljit_jump *end2 = NULL; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 struct sljit_jump *singlechar; #endif @@ -3085,14 +3314,16 @@ jump_list *newline = NULL; BOOL newlinecheck = FALSE; BOOL readuchar = FALSE; -if (!(hascrorlf || firstline) && 
(common->nltype == NLTYPE_ANY || - common->nltype == NLTYPE_ANYCRLF || common->newline > 255)) +if (!(hascrorlf || (overall_options & PCRE2_FIRSTLINE) != 0) + && (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF || common->newline > 255)) newlinecheck = TRUE; -if (firstline) +SLJIT_ASSERT(common->forced_quit_label == NULL); + +if ((overall_options & PCRE2_FIRSTLINE) != 0) { /* Search for the end of the first line. */ - SLJIT_ASSERT(common->first_line_end != 0); + SLJIT_ASSERT(common->match_end_ptr != 0); OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); if (common->nltype == NLTYPE_FIXED && common->newline > 255) @@ -3105,24 +3336,49 @@ if (firstline) CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, mainloop); CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, mainloop); JUMPHERE(end); - OP2(SLJIT_SUB, SLJIT_MEM1(SLJIT_SP), common->first_line_end, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + OP2(SLJIT_SUB, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } else { end = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); mainloop = LABEL(); /* Continual stores does not cause data dependency. */ - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->first_line_end, STR_PTR, 0); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); read_char_range(common, common->nlmin, common->nlmax, TRUE); check_newlinechar(common, common->nltype, &newline, TRUE); CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop); JUMPHERE(end); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->first_line_end, STR_PTR, 0); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); set_jumps(newline, LABEL()); } OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); } +else if ((overall_options & PCRE2_USE_OFFSET_LIMIT) != 0) + { + /* Check whether offset limit is set and valid. 
*/ + SLJIT_ASSERT(common->match_end_ptr != 0); + + OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, offset_limit)); + OP1(SLJIT_MOV, TMP2, 0, STR_END, 0); + end = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw) PCRE2_UNSET); + OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); +#if PCRE2_CODE_UNIT_WIDTH == 16 + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); +#elif PCRE2_CODE_UNIT_WIDTH == 32 + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); +#endif + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); + OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); + end2 = CMP(SLJIT_LESS_EQUAL, TMP2, 0, STR_END, 0); + OP1(SLJIT_MOV, TMP2, 0, STR_END, 0); + JUMPHERE(end2); + OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); + add_jump(compiler, &common->forced_quit, CMP(SLJIT_LESS, TMP2, 0, STR_PTR, 0)); + JUMPHERE(end); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, TMP2, 0); + } start = JUMP(SLJIT_JUMP); @@ -3138,7 +3394,7 @@ if (newlinecheck) OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); #endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - nl = JUMP(SLJIT_JUMP); + end2 = JUMP(SLJIT_JUMP); } mainloop = LABEL(); @@ -3161,7 +3417,7 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); if (common->utf) { singlechar = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); JUMPHERE(singlechar); } @@ -3183,51 +3439,52 @@ JUMPHERE(start); if (newlinecheck) { JUMPHERE(end); - JUMPHERE(nl); + JUMPHERE(end2); } return mainloop; } #define MAX_N_CHARS 16 -#define MAX_N_BYTES 8 +#define MAX_DIFF_CHARS 6 -static SLJIT_INLINE void add_prefix_byte(sljit_ub byte, sljit_ub *bytes) +static SLJIT_INLINE void add_prefix_char(PCRE2_UCHAR chr, PCRE2_UCHAR 
*chars) { -sljit_ub len = bytes[0]; -int i; +PCRE2_UCHAR i, len; +len = chars[0]; if (len == 255) return; if (len == 0) { - bytes[0] = 1; - bytes[1] = byte; + chars[0] = 1; + chars[1] = chr; return; } for (i = len; i > 0; i--) - if (bytes[i] == byte) + if (chars[i] == chr) return; -if (len >= MAX_N_BYTES - 1) +if (len >= MAX_DIFF_CHARS - 1) { - bytes[0] = 255; + chars[0] = 255; return; } len++; -bytes[len] = byte; -bytes[0] = len; +chars[len] = chr; +chars[0] = len; } -static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, sljit_ui *chars, sljit_ub *bytes, int max_chars) +static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, PCRE2_UCHAR *chars, int max_chars, sljit_u32 *rec_count) { /* Recursive function, which scans prefix literals. */ -BOOL last, any, caseless; +BOOL last, any, class, caseless; int len, repeat, len_save, consumed = 0; -sljit_ui chr, mask; +sljit_u32 chr; /* Any unicode character. */ +sljit_u8 *bytes, *bytes_end, byte; PCRE2_SPTR alternative, cc_save, oc; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 PCRE2_UCHAR othercase[8]; @@ -3240,9 +3497,15 @@ PCRE2_UCHAR othercase[1]; repeat = 1; while (TRUE) { + if (*rec_count == 0) + return 0; + (*rec_count)--; + last = TRUE; any = FALSE; + class = FALSE; caseless = FALSE; + switch (*cc) { case OP_CHARI: @@ -3304,7 +3567,7 @@ while (TRUE) #ifdef SUPPORT_UNICODE if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); #endif - max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars); + max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count); if (max_chars == 0) return consumed; last = FALSE; @@ -3327,7 +3590,7 @@ while (TRUE) alternative = cc + GET(cc, 1); while (*alternative == OP_ALT) { - max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars); + max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count); if (max_chars == 0) return consumed; alternative += GET(alternative, 1); @@ 
-3340,18 +3603,17 @@ while (TRUE) case OP_CLASS: #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_ub *)(cc + 1), FALSE)) return consumed; + if (common->utf && !is_char7_bitset((const sljit_u8 *)(cc + 1), FALSE)) + return consumed; #endif - any = TRUE; - cc += 1 + 32 / sizeof(PCRE2_UCHAR); + class = TRUE; break; case OP_NCLASS: #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf) return consumed; #endif - any = TRUE; - cc += 1 + 32 / sizeof(PCRE2_UCHAR); + class = TRUE; break; #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 @@ -3366,7 +3628,7 @@ while (TRUE) case OP_DIGIT: #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_ub *)common->ctypes - cbit_length + cbit_digit, FALSE)) + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_digit, FALSE)) return consumed; #endif any = TRUE; @@ -3375,7 +3637,7 @@ while (TRUE) case OP_WHITESPACE: #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_ub *)common->ctypes - cbit_length + cbit_space, FALSE)) + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_space, FALSE)) return consumed; #endif any = TRUE; @@ -3384,7 +3646,7 @@ while (TRUE) case OP_WORDCHAR: #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && !is_char7_bitset((const sljit_ub *)common->ctypes - cbit_length + cbit_word, FALSE)) + if (common->utf && !is_char7_bitset((const sljit_u8 *)common->ctypes - cbit_length + cbit_word, FALSE)) return consumed; #endif any = TRUE; @@ -3439,27 +3701,14 @@ while (TRUE) if (any) { -#if PCRE2_CODE_UNIT_WIDTH == 8 - mask = 0xff; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - mask = 0xffff; -#elif PCRE2_CODE_UNIT_WIDTH == 32 - mask = 0xffffffff; -#else - SLJIT_ASSERT_STOP(); -#endif - do { - chars[0] = mask; - chars[1] = mask; - bytes[0] 
= 255; + chars[0] = 255; consumed++; if (--max_chars == 0) return consumed; - chars += 2; - bytes += MAX_N_BYTES; + chars += MAX_DIFF_CHARS; } while (--repeat > 0); @@ -3467,6 +3716,103 @@ while (TRUE) continue; } + if (class) + { + bytes = (sljit_u8*) (cc + 1); + cc += 1 + 32 / sizeof(PCRE2_UCHAR); + + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPOSSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSQUERY: + max_chars = scan_prefix(common, cc + 1, chars, max_chars, rec_count); + if (max_chars == 0) + return consumed; + break; + + default: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + repeat = GET2(cc, 1); + if (repeat <= 0) + return consumed; + break; + } + + do + { + if (bytes[31] & 0x80) + chars[0] = 255; + else if (chars[0] != 255) + { + bytes_end = bytes + 32; + chr = 0; + do + { + byte = *bytes++; + SLJIT_ASSERT((chr & 0x7) == 0); + if (byte == 0) + chr += 8; + else + { + do + { + if ((byte & 0x1) != 0) + add_prefix_char(chr, chars); + byte >>= 1; + chr++; + } + while (byte != 0); + chr = (chr + 7) & ~7; + } + } + while (chars[0] != 255 && bytes < bytes_end); + bytes = bytes_end - 32; + } + + consumed++; + if (--max_chars == 0) + return consumed; + chars += MAX_DIFF_CHARS; + } + while (--repeat > 0); + + switch (*cc) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPOSSTAR: + return consumed; + + case OP_CRQUERY: + case OP_CRMINQUERY: + case OP_CRPOSQUERY: + cc++; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + case OP_CRPOSRANGE: + if (GET2(cc, 1) != GET2(cc, 1 + IMM2_SIZE)) + return consumed; + cc += 1 + 2 * IMM2_SIZE; + break; + } + + repeat = 1; + continue; + } + len = 1; #ifdef SUPPORT_UNICODE if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc); @@ -3502,43 +3848,16 @@ while (TRUE) do { chr = *cc; -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (SLJIT_UNLIKELY(chr == NOTACHAR)) - return consumed; -#endif - 
add_prefix_byte((sljit_ub)chr, bytes); + add_prefix_char(*cc, chars); - mask = 0; if (caseless) - { - add_prefix_byte((sljit_ub)*oc, bytes); - mask = *cc ^ *oc; - chr |= mask; - } - -#if PCRE2_CODE_UNIT_WIDTH == 32 - if (chars[0] == NOTACHAR && chars[1] == 0) -#else - if (chars[0] == NOTACHAR) -#endif - { - chars[0] = chr; - chars[1] = mask; - } - else - { - mask |= chars[0] ^ chr; - chr |= mask; - chars[0] = chr; - chars[1] |= mask; - } + add_prefix_char(*oc, chars); len--; consumed++; if (--max_chars == 0) return consumed; - chars += 2; - bytes += MAX_N_BYTES; + chars += MAX_DIFF_CHARS; cc++; oc++; } @@ -3557,161 +3876,576 @@ while (TRUE) } } -static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common, BOOL firstline) +#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) + +static sljit_s32 character_to_int32(PCRE2_UCHAR chr) +{ +sljit_s32 value = (sljit_s32)chr; +#if PCRE2_CODE_UNIT_WIDTH == 8 +#define SSE2_COMPARE_TYPE_INDEX 0 +return (value << 24) | (value << 16) | (value << 8) | value; +#elif PCRE2_CODE_UNIT_WIDTH == 16 +#define SSE2_COMPARE_TYPE_INDEX 1 +return (value << 16) | value; +#elif PCRE2_CODE_UNIT_WIDTH == 32 +#define SSE2_COMPARE_TYPE_INDEX 2 +return value; +#else +#error "Unsupported unit width" +#endif +} + +static SLJIT_INLINE void fast_forward_first_char2_sse2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2) +{ +DEFINE_COMPILER; +struct sljit_label *start; +struct sljit_jump *quit[3]; +struct sljit_jump *nomatch; +sljit_u8 instruction[8]; +sljit_s32 tmp1_ind = sljit_get_register_index(TMP1); +sljit_s32 tmp2_ind = sljit_get_register_index(TMP2); +sljit_s32 str_ptr_ind = sljit_get_register_index(STR_PTR); +BOOL load_twice = FALSE; +PCRE2_UCHAR bit; + +bit = char1 ^ char2; +if (!is_powerof2(bit)) + bit = 0; + +if ((char1 != char2) && bit == 0) + load_twice = TRUE; + +quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +/* First part (unaligned start) */ + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 
character_to_int32(char1 | bit)); + +SLJIT_ASSERT(tmp1_ind < 8 && tmp2_ind == 1); + +/* MOVD xmm, r/m32 */ +instruction[0] = 0x66; +instruction[1] = 0x0f; +instruction[2] = 0x6e; +instruction[3] = 0xc0 | (2 << 3) | tmp1_ind; +sljit_emit_op_custom(compiler, instruction, 4); + +if (char1 != char2) + { + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2)); + + /* MOVD xmm, r/m32 */ + instruction[3] = 0xc0 | (3 << 3) | tmp1_ind; + sljit_emit_op_custom(compiler, instruction, 4); + } + +/* PSHUFD xmm1, xmm2/m128, imm8 */ +instruction[2] = 0x70; +instruction[3] = 0xc0 | (2 << 3) | 2; +instruction[4] = 0; +sljit_emit_op_custom(compiler, instruction, 5); + +if (char1 != char2) + { + /* PSHUFD xmm1, xmm2/m128, imm8 */ + instruction[3] = 0xc0 | (3 << 3) | 3; + instruction[4] = 0; + sljit_emit_op_custom(compiler, instruction, 5); + } + +OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 0xf); +OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf); + +/* MOVDQA xmm1, xmm2/m128 */ +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + +if (str_ptr_ind < 8) + { + instruction[2] = 0x6f; + instruction[3] = (0 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + + if (load_twice) + { + instruction[3] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + } + } +else + { + instruction[1] = 0x41; + instruction[2] = 0x0f; + instruction[3] = 0x6f; + instruction[4] = (0 << 3) | (str_ptr_ind & 0x7); + sljit_emit_op_custom(compiler, instruction, 5); + + if (load_twice) + { + instruction[4] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 5); + } + instruction[1] = 0x0f; + } + +#else + +instruction[2] = 0x6f; +instruction[3] = (0 << 3) | str_ptr_ind; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + instruction[3] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + } + +#endif + +if (bit != 0) + { + /* POR xmm1, xmm2/m128 */ + instruction[2] = 
0xeb; + instruction[3] = 0xc0 | (0 << 3) | 3; + sljit_emit_op_custom(compiler, instruction, 4); + } + +/* PCMPEQB/W/D xmm1, xmm2/m128 */ +instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX; +instruction[3] = 0xc0 | (0 << 3) | 2; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + instruction[3] = 0xc0 | (1 << 3) | 3; + sljit_emit_op_custom(compiler, instruction, 4); + } + +/* PMOVMSKB reg, xmm */ +instruction[2] = 0xd7; +instruction[3] = 0xc0 | (tmp1_ind << 3) | 0; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0); + instruction[3] = 0xc0 | (tmp2_ind << 3) | 1; + sljit_emit_op_custom(compiler, instruction, 4); + + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0); + } + +OP2(SLJIT_ASHR, TMP1, 0, TMP1, 0, TMP2, 0); + +/* BSF r32, r/m32 */ +instruction[0] = 0x0f; +instruction[1] = 0xbc; +instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind; +sljit_emit_op_custom(compiler, instruction, 3); + +nomatch = JUMP(SLJIT_ZERO); + +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +quit[1] = JUMP(SLJIT_JUMP); + +JUMPHERE(nomatch); + +start = LABEL(); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16); +quit[2] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +/* Second part (aligned) */ + +instruction[0] = 0x66; +instruction[1] = 0x0f; + +/* MOVDQA xmm1, xmm2/m128 */ +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + +if (str_ptr_ind < 8) + { + instruction[2] = 0x6f; + instruction[3] = (0 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + + if (load_twice) + { + instruction[3] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + } + } +else + { + instruction[1] = 0x41; + instruction[2] = 0x0f; + instruction[3] = 0x6f; + instruction[4] = (0 << 3) | (str_ptr_ind & 0x7); + sljit_emit_op_custom(compiler, instruction, 5); + + if (load_twice) + { + 
instruction[4] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 5); + } + instruction[1] = 0x0f; + } + +#else + +instruction[2] = 0x6f; +instruction[3] = (0 << 3) | str_ptr_ind; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + instruction[3] = (1 << 3) | str_ptr_ind; + sljit_emit_op_custom(compiler, instruction, 4); + } + +#endif + +if (bit != 0) + { + /* POR xmm1, xmm2/m128 */ + instruction[2] = 0xeb; + instruction[3] = 0xc0 | (0 << 3) | 3; + sljit_emit_op_custom(compiler, instruction, 4); + } + +/* PCMPEQB/W/D xmm1, xmm2/m128 */ +instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX; +instruction[3] = 0xc0 | (0 << 3) | 2; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + instruction[3] = 0xc0 | (1 << 3) | 3; + sljit_emit_op_custom(compiler, instruction, 4); + } + +/* PMOVMSKB reg, xmm */ +instruction[2] = 0xd7; +instruction[3] = 0xc0 | (tmp1_ind << 3) | 0; +sljit_emit_op_custom(compiler, instruction, 4); + +if (load_twice) + { + instruction[3] = 0xc0 | (tmp2_ind << 3) | 1; + sljit_emit_op_custom(compiler, instruction, 4); + + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + } + +/* BSF r32, r/m32 */ +instruction[0] = 0x0f; +instruction[1] = 0xbc; +instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind; +sljit_emit_op_custom(compiler, instruction, 3); + +JUMPTO(SLJIT_ZERO, start); + +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + +start = LABEL(); +SET_LABEL(quit[0], start); +SET_LABEL(quit[1], start); +SET_LABEL(quit[2], start); +} + +#undef SSE2_COMPARE_TYPE_INDEX + +#endif + +static void fast_forward_first_char2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) { DEFINE_COMPILER; struct sljit_label *start; struct sljit_jump *quit; -sljit_ui chars[MAX_N_CHARS * 2]; -sljit_ub bytes[MAX_N_CHARS * MAX_N_BYTES]; -sljit_ub ones[MAX_N_CHARS]; -int offsets[3]; -sljit_ui mask; -sljit_ub *byte_set, *byte_set_end; -int i, max, from; -int range_right = -1, range_len = 3 - 1; 
-sljit_ub *update_table = NULL; -BOOL in_range; +struct sljit_jump *found; +PCRE2_UCHAR mask; +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +struct sljit_label *utf_start = NULL; +struct sljit_jump *utf_quit = NULL; +#endif +BOOL has_match_end = (common->match_end_ptr != 0); -for (i = 0; i < MAX_N_CHARS; i++) +if (offset > 0) + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); + +if (has_match_end) { - chars[i << 1] = NOTACHAR; - chars[(i << 1) + 1] = 0; - bytes[i * MAX_N_BYTES] = 0; - } + OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); -max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS); - -if (max <= 1) - return FALSE; - -for (i = 0; i < max; i++) - { - mask = chars[(i << 1) + 1]; - ones[i] = ones_in_half_byte[mask & 0xf]; - mask >>= 4; - while (mask != 0) + OP2(SLJIT_ADD, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, SLJIT_IMM, IN_UCHARS(offset + 1)); +#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) + if (sljit_x86_is_cmov_available()) { - ones[i] += ones_in_half_byte[mask & 0xf]; - mask >>= 4; + OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_END, 0, TMP3, 0); + sljit_x86_emit_cmov(compiler, SLJIT_GREATER, STR_END, TMP3, 0); + } +#endif + { + quit = CMP(SLJIT_LESS_EQUAL, STR_END, 0, TMP3, 0); + OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); + JUMPHERE(quit); } } +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +if (common->utf && offset > 0) + utf_start = LABEL(); +#endif + +#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) + +/* SSE2 accelerated first character search. */ + +if (sljit_x86_is_sse2_available()) + { + fast_forward_first_char2_sse2(common, char1, char2); + + SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE || offset == 0); + if (common->mode == PCRE2_JIT_COMPLETE) + { + /* In complete mode, we don't need to run a match when STR_PTR == STR_END. 
*/ + SLJIT_ASSERT(common->forced_quit_label == NULL); + OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); + add_jump(compiler, &common->forced_quit, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); + +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (common->utf && offset > 0) + { + SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE); + + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset)); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#if PCRE2_CODE_UNIT_WIDTH == 8 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, utf_start); +#elif PCRE2_CODE_UNIT_WIDTH == 16 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00, utf_start); +#else +#error "Unknown code width" +#endif + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + } +#endif + + if (offset > 0) + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); + } + else if (sljit_x86_is_cmov_available()) + { + OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0); + sljit_x86_emit_cmov(compiler, SLJIT_GREATER_EQUAL, STR_PTR, has_match_end ? SLJIT_MEM1(SLJIT_SP) : STR_END, has_match_end ? common->match_end_ptr : 0); + } + else + { + quit = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); + OP1(SLJIT_MOV, STR_PTR, 0, has_match_end ? SLJIT_MEM1(SLJIT_SP) : STR_END, has_match_end ? 
common->match_end_ptr : 0); + JUMPHERE(quit); + } + + if (has_match_end) + OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); + return; + } + +#endif + +quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +start = LABEL(); +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + +if (char1 == char2) + found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1); +else + { + mask = char1 ^ char2; + if (is_powerof2(mask)) + { + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask); + found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1 | mask); + } + else + { + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char1); + OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char2); + OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_EQUAL); + found = JUMP(SLJIT_NOT_ZERO); + } + } + +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, start); + +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +if (common->utf && offset > 0) + utf_quit = JUMP(SLJIT_JUMP); +#endif + +JUMPHERE(found); + +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +if (common->utf && offset > 0) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset)); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#if PCRE2_CODE_UNIT_WIDTH == 8 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, utf_start); +#elif PCRE2_CODE_UNIT_WIDTH == 16 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00, utf_start); +#else +#error "Unknown code width" +#endif + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + JUMPHERE(utf_quit); + } +#endif + +JUMPHERE(quit); + +if (has_match_end) + { + quit = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); + OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); + if (offset > 0) + 
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); + JUMPHERE(quit); + OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); + } + +if (offset > 0) + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset)); +} + +static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common) +{ +DEFINE_COMPILER; +struct sljit_label *start; +struct sljit_jump *quit; +struct sljit_jump *match; +/* bytes[0] represent the number of characters between 0 +and MAX_N_BYTES - 1, 255 represents any character. */ +PCRE2_UCHAR chars[MAX_N_CHARS * MAX_DIFF_CHARS]; +sljit_s32 offset; +PCRE2_UCHAR mask; +PCRE2_UCHAR *char_set, *char_set_end; +int i, max, from; +int range_right = -1, range_len; +sljit_u8 *update_table = NULL; +BOOL in_range; +sljit_u32 rec_count; + +for (i = 0; i < MAX_N_CHARS; i++) + chars[i * MAX_DIFF_CHARS] = 0; + +rec_count = 10000; +max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count); + +if (max < 1) + return FALSE; + in_range = FALSE; -from = 0; /* Prevent compiler "uninitialized" warning */ +/* Prevent compiler "uninitialized" warning */ +from = 0; +range_len = 4 /* minimum length */ - 1; for (i = 0; i <= max; i++) { - if (in_range && (i - from) > range_len && (bytes[(i - 1) * MAX_N_BYTES] <= 4)) + if (in_range && (i - from) > range_len && (chars[(i - 1) * MAX_DIFF_CHARS] < 255)) { range_len = i - from; range_right = i - 1; } - if (i < max && bytes[i * MAX_N_BYTES] < 255) + if (i < max && chars[i * MAX_DIFF_CHARS] < 255) { + SLJIT_ASSERT(chars[i * MAX_DIFF_CHARS] > 0); if (!in_range) { in_range = TRUE; from = i; } } - else if (in_range) + else in_range = FALSE; } if (range_right >= 0) { - update_table = (sljit_ub *)allocate_read_only_data(common, 256); + update_table = (sljit_u8 *)allocate_read_only_data(common, 256); if (update_table == NULL) return TRUE; memset(update_table, IN_UCHARS(range_len), 256); for (i = 0; i < range_len; i++) { - byte_set = bytes + ((range_right - i) * MAX_N_BYTES); - SLJIT_ASSERT(byte_set[0] > 0 
&& byte_set[0] < 255); - byte_set_end = byte_set + byte_set[0]; - byte_set++; - while (byte_set <= byte_set_end) + char_set = chars + ((range_right - i) * MAX_DIFF_CHARS); + SLJIT_ASSERT(char_set[0] > 0 && char_set[0] < 255); + char_set_end = char_set + char_set[0]; + char_set++; + while (char_set <= char_set_end) { - if (update_table[*byte_set] > IN_UCHARS(i)) - update_table[*byte_set] = IN_UCHARS(i); - byte_set++; + if (update_table[(*char_set) & 0xff] > IN_UCHARS(i)) + update_table[(*char_set) & 0xff] = IN_UCHARS(i); + char_set++; } } } -offsets[0] = -1; -offsets[1] = -1; -offsets[2] = -1; +offset = -1; /* Scan forward. */ for (i = 0; i < max; i++) - if (ones[i] <= 2) { - offsets[0] = i; - break; - } - -if (offsets[0] < 0 && range_right < 0) - return FALSE; - -if (offsets[0] >= 0) { - /* Scan backward. */ - for (i = max - 1; i > offsets[0]; i--) - if (ones[i] <= 2 && i != range_right) - { - offsets[1] = i; - break; - } - - /* This case is handled better by fast_forward_first_char. */ - if (offsets[1] == -1 && offsets[0] == 0 && range_right < 0) - return FALSE; - - /* We only search for a middle character if there is no range check. */ - if (offsets[1] >= 0 && range_right == -1) + if (offset == -1) { - /* Scan from middle. 
*/ - for (i = (offsets[0] + offsets[1]) / 2 + 1; i < offsets[1]; i++) - if (ones[i] <= 2) + if (chars[i * MAX_DIFF_CHARS] <= 2) + offset = i; + } + else if (chars[offset * MAX_DIFF_CHARS] == 2 && chars[i * MAX_DIFF_CHARS] <= 2) + { + if (chars[i * MAX_DIFF_CHARS] == 1) + offset = i; + else + { + mask = chars[offset * MAX_DIFF_CHARS + 1] ^ chars[offset * MAX_DIFF_CHARS + 2]; + if (!is_powerof2(mask)) { - offsets[2] = i; - break; + mask = chars[i * MAX_DIFF_CHARS + 1] ^ chars[i * MAX_DIFF_CHARS + 2]; + if (is_powerof2(mask)) + offset = i; } - - if (offsets[2] == -1) - { - for (i = (offsets[0] + offsets[1]) / 2; i > offsets[0]; i--) - if (ones[i] <= 2) - { - offsets[2] = i; - break; - } } } - - SLJIT_ASSERT(offsets[1] == -1 || (offsets[0] < offsets[1])); - SLJIT_ASSERT(offsets[2] == -1 || (offsets[0] < offsets[2] && offsets[1] > offsets[2])); - - chars[0] = chars[offsets[0] << 1]; - chars[1] = chars[(offsets[0] << 1) + 1]; - if (offsets[2] >= 0) - { - chars[2] = chars[offsets[2] << 1]; - chars[3] = chars[(offsets[2] << 1) + 1]; - } - if (offsets[1] >= 0) - { - chars[4] = chars[offsets[1] << 1]; - chars[5] = chars[(offsets[1] << 1) + 1]; - } } +if (range_right < 0) + { + if (offset < 0) + return FALSE; + SLJIT_ASSERT(chars[offset * MAX_DIFF_CHARS] >= 1 && chars[offset * MAX_DIFF_CHARS] <= 2); + /* Works regardless the value is 1 or 2. 
*/ + mask = chars[offset * MAX_DIFF_CHARS + chars[offset * MAX_DIFF_CHARS]]; + fast_forward_first_char2(common, chars[offset * MAX_DIFF_CHARS + 1], mask, offset); + return TRUE; + } + +if (range_right == offset) + offset = -1; + +SLJIT_ASSERT(offset == -1 || (chars[offset * MAX_DIFF_CHARS] >= 1 && chars[offset * MAX_DIFF_CHARS] <= 2)); + max -= 1; -if (firstline) +SLJIT_ASSERT(max > 0); +if (common->match_end_ptr != 0) { - SLJIT_ASSERT(common->first_line_end != 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max)); quit = CMP(SLJIT_LESS_EQUAL, STR_END, 0, TMP1, 0); @@ -3721,68 +4455,86 @@ if (firstline) else OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max)); +SLJIT_ASSERT(range_right >= 0); + #if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) -if (range_right >= 0) - OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table); +OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table); #endif start = LABEL(); quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -SLJIT_ASSERT(range_right >= 0 || offsets[0] >= 0); - -if (range_right >= 0) - { #if PCRE2_CODE_UNIT_WIDTH == 8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN) - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right)); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right)); #else - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1); #endif #if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0); +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0); #else - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table); +OP1(SLJIT_MOV_U8, TMP1, 
0, SLJIT_MEM1(TMP1), (sljit_sw)update_table); #endif - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start); - } +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start); -if (offsets[0] >= 0) +if (offset >= 0) { - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[0])); - if (offsets[1] >= 0) - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[1])); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offset)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - if (chars[1] != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[1]); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[0], start); - if (offsets[2] >= 0) - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[2] - 1)); - - if (offsets[1] >= 0) + if (chars[offset * MAX_DIFF_CHARS] == 1) + CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1], start); + else { - if (chars[5] != 0) - OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, chars[5]); - CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, chars[4], start); + mask = chars[offset * MAX_DIFF_CHARS + 1] ^ chars[offset * MAX_DIFF_CHARS + 2]; + if (is_powerof2(mask)) + { + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask); + CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1] | mask, start); + } + else + { + match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1]); + CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 2], start); + JUMPHERE(match); + } } - - if (offsets[2] >= 0) - { - if (chars[3] != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[3]); - CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[2], start); - } - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +if (common->utf && offset != 0) + { + if (offset < 0) + { + 
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + } + else + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +#if PCRE2_CODE_UNIT_WIDTH == 8 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, start); +#elif PCRE2_CODE_UNIT_WIDTH == 16 + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00, start); +#else +#error "Unknown code width" +#endif + if (offset < 0) + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + } +#endif + +if (offset >= 0) + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + JUMPHERE(quit); -if (firstline) +if (common->match_end_ptr != 0) { if (range_right >= 0) - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); if (range_right >= 0) { @@ -3797,26 +4549,10 @@ return TRUE; } #undef MAX_N_CHARS -#undef MAX_N_BYTES -static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, PCRE2_UCHAR first_char, BOOL caseless, BOOL firstline) +static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, PCRE2_UCHAR first_char, BOOL caseless) { -DEFINE_COMPILER; -struct sljit_label *start; -struct sljit_jump *quit; -struct sljit_jump *found; -PCRE2_UCHAR oc, bit; - -if (firstline) - { - SLJIT_ASSERT(common->first_line_end != 0); - OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); - } - -start = LABEL(); -quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); +PCRE2_UCHAR oc; oc = first_char; if (caseless) @@ -3827,36 +4563,11 @@ if (caseless) oc = UCD_OTHERCASE(first_char); #endif } -if (first_char == oc) - found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, first_char); -else - { - bit = first_char 
^ oc; - if (is_powerof2(bit)) - { - OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit); - found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit); - } - else - { - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc); - OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_EQUAL); - found = JUMP(SLJIT_NOT_ZERO); - } - } -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -JUMPTO(SLJIT_JUMP, start); -JUMPHERE(found); -JUMPHERE(quit); - -if (firstline) - OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); +fast_forward_first_char2(common, first_char, oc, 0); } -static SLJIT_INLINE void fast_forward_newline(compiler_common *common, BOOL firstline) +static SLJIT_INLINE void fast_forward_newline(compiler_common *common) { DEFINE_COMPILER; struct sljit_label *loop; @@ -3867,11 +4578,10 @@ struct sljit_jump *foundcr = NULL; struct sljit_jump *notfoundnl; jump_list *newline = NULL; -if (firstline) +if (common->match_end_ptr != 0) { - SLJIT_ASSERT(common->first_line_end != 0); OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); - OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); + OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); } if (common->nltype == NLTYPE_FIXED && common->newline > 255) @@ -3902,7 +4612,7 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255) JUMPHERE(firstchar); JUMPHERE(lastchar); - if (firstline) + if (common->match_end_ptr != 0) OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); return; } @@ -3940,13 +4650,13 @@ if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) JUMPHERE(lastchar); JUMPHERE(firstchar); -if (firstline) +if (common->match_end_ptr != 0) OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); } -static BOOL check_class_ranges(compiler_common *common, const sljit_ub *bits, BOOL nclass, BOOL invert, jump_list **backtracks); +static BOOL 
check_class_ranges(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks); -static SLJIT_INLINE void fast_forward_start_bits(compiler_common *common, const sljit_ub *start_bits, BOOL firstline) +static SLJIT_INLINE void fast_forward_start_bits(compiler_common *common, const sljit_u8 *start_bits) { DEFINE_COMPILER; struct sljit_label *start; @@ -3957,11 +4667,10 @@ jump_list *matches = NULL; struct sljit_jump *jump; #endif -if (firstline) +if (common->match_end_ptr != 0) { - SLJIT_ASSERT(common->first_line_end != 0); OP1(SLJIT_MOV, RETURN_ADDR, 0, STR_END, 0); - OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); + OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); } start = LABEL(); @@ -3981,7 +4690,7 @@ if (!check_class_ranges(common, start_bits, (start_bits[31] & 0x80) != 0, TRUE, #endif OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)start_bits); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)start_bits); OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); found = JUMP(SLJIT_NOT_ZERO); @@ -3997,7 +4706,7 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); if (common->utf) { CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } #elif PCRE2_CODE_UNIT_WIDTH == 16 @@ -4019,7 +4728,7 @@ if (matches != NULL) set_jumps(matches, LABEL()); JUMPHERE(quit); -if (firstline) +if (common->match_end_ptr != 0) OP1(SLJIT_MOV, STR_END, 0, RETURN_ADDR, 0); } @@ -4032,7 +4741,7 @@ struct sljit_jump *alreadyfound; struct sljit_jump *found; struct sljit_jump *foundoc = NULL; struct sljit_jump 
*notfound; -sljit_ui oc, bit; +sljit_u32 oc, bit; SLJIT_ASSERT(common->req_char_ptr != 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->req_char_ptr); @@ -4169,7 +4878,7 @@ else if (common->utf) jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP1, 0); @@ -4214,7 +4923,7 @@ else if (common->utf) jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes); + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes); OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); #if PCRE2_CODE_UNIT_WIDTH != 8 @@ -4230,12 +4939,12 @@ OP2(SLJIT_XOR | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOC sljit_emit_fast_return(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); } -static BOOL check_class_ranges(compiler_common *common, const sljit_ub *bits, BOOL nclass, BOOL invert, jump_list **backtracks) +static BOOL check_class_ranges(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks) { /* May destroy TMP1. 
*/ DEFINE_COMPILER; int ranges[MAX_RANGE_SIZE]; -sljit_ub bit, cbit, all; +sljit_u8 bit, cbit, all; int i, byte, length = 0; bit = bits[0] & 0x1; @@ -4328,8 +5037,10 @@ switch(length) case 4: if ((ranges[1] - ranges[0]) == (ranges[3] - ranges[2]) && (ranges[0] | (ranges[2] - ranges[0])) == ranges[2] + && (ranges[1] & (ranges[2] - ranges[0])) == 0 && is_powerof2(ranges[2] - ranges[0])) { + SLJIT_ASSERT((ranges[0] & (ranges[2] - ranges[0])) == 0 && (ranges[2] & ranges[3] & (ranges[2] - ranges[0])) != 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2] - ranges[0]); if (ranges[2] + 1 != ranges[3]) { @@ -4528,12 +5239,12 @@ OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); #if PCRE2_CODE_UNIT_WIDTH != 8 jump = CMP(SLJIT_GREATER, CHAR1, 0, SLJIT_IMM, 255); #endif -OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0); +OP1(SLJIT_MOV_U8, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0); #if PCRE2_CODE_UNIT_WIDTH != 8 JUMPHERE(jump); jump = CMP(SLJIT_GREATER, CHAR2, 0, SLJIT_IMM, 255); #endif -OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0); +OP1(SLJIT_MOV_U8, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0); #if PCRE2_CODE_UNIT_WIDTH != 8 JUMPHERE(jump); #endif @@ -4558,11 +5269,11 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); static PCRE2_SPTR SLJIT_CALL do_utf_caselesscmp(PCRE2_SPTR src1, jit_arguments *args, PCRE2_SPTR end1) { /* This function would be ineffective to do in JIT level. 
*/ -sljit_ui c1, c2; +sljit_u32 c1, c2; PCRE2_SPTR src2 = args->startchar_ptr; PCRE2_SPTR end2 = args->end; const ucd_record *ur; -const sljit_ui *pp; +const sljit_u32 *pp; while (src1 < end1) { @@ -4586,8 +5297,6 @@ return src2; #endif /* SUPPORT_UNICODE */ -static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); - static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc, compare_context *context, jump_list **backtracks) { @@ -4624,16 +5333,16 @@ if (context->sourcereg == -1) #if PCRE2_CODE_UNIT_WIDTH == 8 #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED if (context->length >= 4) - OP1(SLJIT_MOV_SI, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); else if (context->length >= 2) - OP1(SLJIT_MOV_UH, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); else #endif - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); #elif PCRE2_CODE_UNIT_WIDTH == 16 #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED if (context->length >= 4) - OP1(SLJIT_MOV_SI, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); else #endif OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); @@ -4675,12 +5384,12 @@ do #endif { if (context->length >= 4) - OP1(SLJIT_MOV_SI, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); else if (context->length >= 2) - OP1(SLJIT_MOV_UH, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); #if PCRE2_CODE_UNIT_WIDTH == 8 else if (context->length >= 1) - 
OP1(SLJIT_MOV_UB, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); + OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1; @@ -4763,6 +5472,8 @@ return cc; } \ charoffset = (value); +static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); + static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) { DEFINE_COMPILER; @@ -4780,7 +5491,7 @@ BOOL utf = common->utf; BOOL needstype = FALSE, needsscript = FALSE, needschar = FALSE; BOOL charsaved = FALSE; int typereg = TMP1; -const sljit_ui *other_cases; +const sljit_u32 *other_cases; sljit_uw typeoffset; #endif @@ -4788,6 +5499,7 @@ sljit_uw typeoffset; cc++; ccbegin = cc; compares = 0; + if (cc[-1] & XCL_MAP) { min = 0; @@ -4845,9 +5557,8 @@ while (*cc != XCL_END) /* Any either accepts everything or ignored. 
*/ if (cc[-1] == XCL_PROP) { - if (list != backtracks) - compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE); - else + compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE); + if (list == backtracks) add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); return; } @@ -4898,11 +5609,11 @@ if ((cc[-1] & XCL_HASPROP) == 0) if ((cc[-1] & XCL_MAP) != 0) { jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); - if (!check_class_ranges(common, (const sljit_ub *)cc, (((const sljit_ub *)cc)[31] & 0x80) != 0, TRUE, &found)) + if (!check_class_ranges(common, (const sljit_u8 *)cc, (((const sljit_u8 *)cc)[31] & 0x80) != 0, TRUE, &found)) { OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); add_jump(compiler, &found, JUMP(SLJIT_NOT_ZERO)); @@ -4925,21 +5636,25 @@ else if ((cc[-1] & XCL_MAP) != 0) #ifdef SUPPORT_UNICODE charsaved = TRUE; #endif - if (!check_class_ranges(common, (const sljit_ub *)cc, FALSE, TRUE, list)) + if (!check_class_ranges(common, (const sljit_u8 *)cc, FALSE, TRUE, list)) { #if PCRE2_CODE_UNIT_WIDTH == 8 - SLJIT_ASSERT(common->utf); + jump = NULL; + if (common->utf) #endif - jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); + jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); add_jump(compiler, list, JUMP(SLJIT_NOT_ZERO)); - JUMPHERE(jump); +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (common->utf) +#endif + JUMPHERE(jump); } 
OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); @@ -4953,18 +5668,18 @@ if (needstype || needsscript) OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); - OP1(SLJIT_MOV_UH, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); + OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); /* Before anything else, we deal with scripts. */ if (needsscript) { OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); ccbegin = cc; @@ -5011,12 +5726,12 @@ if (needstype || needsscript) if (!needschar) { OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); } else { OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); + OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); typereg = RETURN_ADDR; } } @@ -5280,7 +5995,7 @@ while (*cc != XCL_END) OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_LESS_EQUAL); SET_CHAR_OFFSET(0); - OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xff); + OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x7f); OP_FLAGS(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_LESS_EQUAL); SET_TYPE_OFFSET(ucp_Pc); @@ -5412,47 +6127,9 @@ 
switch(type) check_partial(common, FALSE); return cc; - case OP_CIRC: - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0)); - OP2(SLJIT_IAND | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); - return cc; - - case OP_CIRCM: - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); - jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0); - OP2(SLJIT_IAND | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); - add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); - jump[0] = JUMP(SLJIT_JUMP); - JUMPHERE(jump[1]); - - if (!common->alt_circumflex) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); - - if (common->nltype == NLTYPE_FIXED && common->newline > 255) - { - OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, TMP1, 0)); - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); - add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); - } - else - { - skip_char_back(common); - read_char_range(common, common->nlmin, common->nlmax, TRUE); - check_newlinechar(common, common->nltype, backtracks, FALSE); - } - JUMPHERE(jump[0]); - return cc; - case OP_DOLL: OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP2(SLJIT_IAND | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); + OP2(SLJIT_AND32 | SLJIT_SET_E, SLJIT_UNUSED, 
0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); if (!common->endonly) @@ -5467,7 +6144,7 @@ switch(type) case OP_DOLLM: jump[1] = CMP(SLJIT_LESS, STR_PTR, 0, STR_END, 0); OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP2(SLJIT_IAND | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); + OP2(SLJIT_AND32 | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTEOL); add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); check_partial(common, FALSE); jump[0] = JUMP(SLJIT_JUMP); @@ -5501,6 +6178,44 @@ switch(type) JUMPHERE(jump[0]); return cc; + case OP_CIRC: + OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0)); + OP2(SLJIT_AND32 | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); + add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); + return cc; + + case OP_CIRCM: + OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); + jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0); + OP2(SLJIT_AND32 | SLJIT_SET_E, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); + add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO)); + jump[0] = JUMP(SLJIT_JUMP); + JUMPHERE(jump[1]); + + if (!common->alt_circumflex) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); + + if (common->nltype == NLTYPE_FIXED && common->newline > 255) + { + OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, TMP1, 0)); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); + 
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); + add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); + add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff)); + } + else + { + skip_char_back(common); + read_char_range(common, common->nlmin, common->nlmax, TRUE); + check_newlinechar(common, common->nltype, backtracks, FALSE); + } + JUMPHERE(jump[0]); + return cc; + case OP_REVERSE: length = GET(cc, 0); if (length == 0) @@ -5552,7 +6267,7 @@ switch(type) if (check_str_ptr) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_ub*)common->ctypes - cbit_length + cbit_digit, FALSE)) + if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE)) read_char7_type(common, type == OP_NOT_DIGIT); else #endif @@ -5567,7 +6282,7 @@ switch(type) if (check_str_ptr) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_ub*)common->ctypes - cbit_length + cbit_space, FALSE)) + if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE)) read_char7_type(common, type == OP_NOT_WHITESPACE); else #endif @@ -5581,7 +6296,7 @@ switch(type) if (check_str_ptr) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - if (common->utf && is_char7_bitset((const sljit_ub*)common->ctypes - cbit_length + cbit_word, FALSE)) + if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE)) read_char7_type(common, type == OP_NOT_WORDCHAR); else #endif @@ -5623,7 +6338,7 @@ switch(type) #if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 #if PCRE2_CODE_UNIT_WIDTH == 8 jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - 
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); #elif PCRE2_CODE_UNIT_WIDTH == 16 jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800); @@ -5710,7 +6425,7 @@ switch(type) OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, gbprop)); /* Optimize register allocation: use a real register. */ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STACK_TOP, 0); - OP1(SLJIT_MOV_UB, STACK_TOP, 0, SLJIT_MEM2(TMP1, TMP2), 3); + OP1(SLJIT_MOV_U8, STACK_TOP, 0, SLJIT_MEM2(TMP1, TMP2), 3); label = LABEL(); jump[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); @@ -5718,10 +6433,10 @@ switch(type) read_char(common); add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, gbprop)); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM2(TMP1, TMP2), 3); + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM2(TMP1, TMP2), 3); OP2(SLJIT_SHL, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, 2); - OP1(SLJIT_MOV_UI, TMP1, 0, SLJIT_MEM1(STACK_TOP), (sljit_sw)PRIV(ucp_gbtable)); + OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(STACK_TOP), (sljit_sw)PRIV(ucp_gbtable)); OP1(SLJIT_MOV, STACK_TOP, 0, TMP2, 0); OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); @@ -5805,7 +6520,7 @@ switch(type) c = *cc; if (c < 128) { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); if (type == OP_NOT || !char_has_othercase(common, cc)) add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); else @@ -5861,13 +6576,13 @@ switch(type) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - bit = (common->utf && is_char7_bitset((const sljit_ub *)cc, type == OP_NCLASS)) ? 
127 : 255; + bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255; read_char_range(common, 0, bit, type == OP_NCLASS); #else read_char_range(common, 0, 255, type == OP_NCLASS); #endif - if (check_class_ranges(common, (const sljit_ub *)cc, type == OP_NCLASS, FALSE, backtracks)) + if (check_class_ranges(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) return cc + 32 / sizeof(PCRE2_UCHAR); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 @@ -5892,7 +6607,7 @@ switch(type) OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, TMP2, 0); add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); @@ -6419,8 +7134,8 @@ static int SLJIT_CALL do_callout(struct jit_arguments *arguments, pcre2_callout_ { PCRE2_SPTR begin = arguments->begin; PCRE2_SIZE *ovector = arguments->match_data->ovector; -uint32_t oveccount = arguments->oveccount; -uint32_t i; +sljit_u32 oveccount = arguments->oveccount; +sljit_u32 i; if (arguments->callout == NULL) return 0; @@ -6461,7 +7176,7 @@ static SLJIT_INLINE PCRE2_SPTR compile_callout_matchingpath(compiler_common *com { DEFINE_COMPILER; backtrack_common *backtrack; -sljit_si mov_opcode; +sljit_s32 mov_opcode; unsigned int callout_length = (*cc == OP_CALLOUT) ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2 * LINK_SIZE); sljit_sw value1; @@ -6476,8 +7191,8 @@ SLJIT_ASSERT(common->capture_last_ptr != 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr); OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); value1 = (*cc == OP_CALLOUT) ? 
cc[1 + 2 * LINK_SIZE] : 0; -OP1(SLJIT_MOV_UI, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_number), SLJIT_IMM, value1); -OP1(SLJIT_MOV_UI, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(capture_last), TMP2, 0); +OP1(SLJIT_MOV_U32, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_number), SLJIT_IMM, value1); +OP1(SLJIT_MOV_U32, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(capture_last), TMP2, 0); /* These pointer sized fields temporarly stores internal variables. */ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0)); @@ -6486,7 +7201,7 @@ OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(subject), TMP2, 0); if (common->mark_ptr != 0) OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, mark_ptr)); -mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_UI : SLJIT_MOV; +mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_U32 : SLJIT_MOV; OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(pattern_position), SLJIT_IMM, GET(cc, 1)); OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(next_item_length), SLJIT_IMM, GET(cc, 1 + LINK_SIZE)); @@ -6513,7 +7228,7 @@ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STACK_TOP, 0); OP2(SLJIT_SUB, SLJIT_R1, 0, STACK_TOP, 0, SLJIT_IMM, CALLOUT_ARG_SIZE); GET_LOCAL_BASE(SLJIT_R2, 0, OVECTOR_START); sljit_emit_ijump(compiler, SLJIT_CALL3, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_callout)); -OP1(SLJIT_MOV_SI, SLJIT_RETURN_REG, 0, SLJIT_RETURN_REG, 0); +OP1(SLJIT_MOV_S32, SLJIT_RETURN_REG, 0, SLJIT_RETURN_REG, 0); OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); free_stack(common, CALLOUT_ARG_SIZE / sizeof(sljit_sw)); @@ -6542,6 +7257,10 @@ while (TRUE) case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: + case OP_CIRC: + case OP_CIRCM: + case OP_DOLL: + case OP_DOLLM: case OP_CALLOUT: case OP_ALT: cc += PRIV(OP_lengths)[*cc]; @@ -7835,6 +8554,10 @@ while (*cc != OP_KETRPOS) OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); } + /* Even if the match is empty, we need to reset the control 
head. */ + if (needs_control_head) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); + if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); @@ -7862,6 +8585,10 @@ while (*cc != OP_KETRPOS) OP1(SLJIT_MOV, SLJIT_MEM1(TMP2), (framesize + 1) * sizeof(sljit_sw), STR_PTR, 0); } + /* Even if the match is empty, we need to reset the control head. */ + if (needs_control_head) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); + if (opcode == OP_SBRAPOS || opcode == OP_SCBRAPOS) add_jump(compiler, &emptymatch, CMP(SLJIT_EQUAL, TMP1, 0, STR_PTR, 0)); @@ -7874,9 +8601,6 @@ while (*cc != OP_KETRPOS) } } - if (needs_control_head) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->control_head_ptr, SLJIT_MEM1(STACK_TOP), STACK(stack)); - JUMPTO(SLJIT_JUMP, loop); flush_stubs(common); @@ -7930,7 +8654,7 @@ count_match(common); return cc + 1 + LINK_SIZE; } -static SLJIT_INLINE PCRE2_SPTR get_iterator_parameters(compiler_common *common, PCRE2_SPTR cc, PCRE2_UCHAR *opcode, PCRE2_UCHAR *type, sljit_ui *max, sljit_ui *exact, PCRE2_SPTR *end) +static SLJIT_INLINE PCRE2_SPTR get_iterator_parameters(compiler_common *common, PCRE2_SPTR cc, PCRE2_UCHAR *opcode, PCRE2_UCHAR *type, sljit_u32 *max, sljit_u32 *exact, PCRE2_SPTR *end) { int class_len; @@ -8081,7 +8805,9 @@ DEFINE_COMPILER; backtrack_common *backtrack; PCRE2_UCHAR opcode; PCRE2_UCHAR type; -sljit_ui max = 0, exact; +sljit_u32 max = 0, exact; +BOOL fast_fail; +sljit_s32 fast_str_ptr; BOOL charpos_enabled; PCRE2_UCHAR charpos_char; unsigned int charpos_othercasebit; @@ -8098,6 +8824,19 @@ int tmp_base, tmp_offset; PUSH_BACKTRACK(sizeof(char_iterator_backtrack), cc, NULL); +fast_str_ptr = PRIVATE_DATA(cc + 1); +fast_fail = TRUE; + +SLJIT_ASSERT(common->fast_forward_bc_ptr == NULL || fast_str_ptr == 0 || cc == common->fast_forward_bc_ptr); + +if (cc == 
common->fast_forward_bc_ptr) + fast_fail = FALSE; +else if (common->fast_fail_start_ptr == 0) + fast_str_ptr = 0; + +SLJIT_ASSERT(common->fast_forward_bc_ptr != NULL || fast_str_ptr == 0 + || (fast_str_ptr >= common->fast_fail_start_ptr && fast_str_ptr <= common->fast_fail_end_ptr)); + cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &exact, &end); if (type != OP_EXTUNI) @@ -8111,9 +8850,13 @@ else tmp_offset = POSSESSIVE0; } +if (fast_fail && fast_str_ptr != 0) + add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), fast_str_ptr)); + /* Handle fixed part first. */ if (exact > 1) { + SLJIT_ASSERT(fast_str_ptr == 0); if (common->mode == PCRE2_JIT_COMPLETE #ifdef SUPPORT_UNICODE && !common->utf @@ -8144,9 +8887,12 @@ switch(opcode) { case OP_STAR: case OP_UPTO: + SLJIT_ASSERT(fast_str_ptr == 0 || opcode == OP_STAR); + if (type == OP_ANYNL || type == OP_EXTUNI) { SLJIT_ASSERT(private_data_ptr == 0); + SLJIT_ASSERT(fast_str_ptr == 0); allocate_stack(common, 2); OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), STR_PTR, 0); @@ -8201,9 +8947,7 @@ switch(opcode) #elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 SLJIT_ASSERT((charpos_othercasebit >> 9) == 0); if ((charpos_othercasebit & 0x100) != 0) - { charpos_othercasebit = (charpos_othercasebit & 0xff) << 8; - } #endif if (charpos_othercasebit != 0) charpos_char |= charpos_othercasebit; @@ -8228,6 +8972,8 @@ switch(opcode) add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_ZERO)); } compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks, FALSE); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); JUMPHERE(jump); detect_partial_match(common, &backtrack->topbacktracks); @@ -8249,6 +8995,8 @@ switch(opcode) /* Search the last instance of charpos_char. 
*/ label = LABEL(); compile_char1_matchingpath(common, type, cc, &no_match, FALSE); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); detect_partial_match(common, &no_match); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); if (charpos_othercasebit != 0) @@ -8304,6 +9052,8 @@ switch(opcode) set_jumps(no_match, LABEL()); OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); } #endif else @@ -8331,6 +9081,8 @@ switch(opcode) OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); set_jumps(no_match, LABEL()); OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); } } BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); @@ -8341,9 +9093,12 @@ switch(opcode) allocate_stack(common, 1); OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); BACKTRACK_AS(char_iterator_backtrack)->matchingpath = LABEL(); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); break; case OP_MINUPTO: + SLJIT_ASSERT(fast_str_ptr == 0); if (private_data_ptr == 0) allocate_stack(common, 2); OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); @@ -8353,6 +9108,7 @@ switch(opcode) case OP_QUERY: case OP_MINQUERY: + SLJIT_ASSERT(fast_str_ptr == 0); if (private_data_ptr == 0) allocate_stack(common, 1); OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); @@ -8375,6 +9131,8 @@ switch(opcode) JUMPTO(SLJIT_JUMP, label); set_jumps(no_match, LABEL()); OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); break; } #endif @@ -8385,9 +9143,12 @@ switch(opcode) set_jumps(no_char1_match, LABEL()); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); set_jumps(no_match, LABEL()); + if (fast_str_ptr != 0) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), fast_str_ptr, STR_PTR, 0); break; case 
OP_POSUPTO: + SLJIT_ASSERT(fast_str_ptr == 0); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf) { @@ -8416,6 +9177,7 @@ switch(opcode) break; case OP_POSQUERY: + SLJIT_ASSERT(fast_str_ptr == 0); OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); compile_char1_matchingpath(common, type, cc, &no_match, TRUE); OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); @@ -8460,7 +9222,7 @@ if (common->accept_label == NULL) else CMPTO(SLJIT_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), common->accept_label); OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); -OP1(SLJIT_MOV_UI, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); +OP1(SLJIT_MOV_U32, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY); add_jump(compiler, &backtrack->topbacktracks, JUMP(SLJIT_NOT_ZERO)); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY_ATSTART); @@ -8587,10 +9349,10 @@ while (cc < ccend) case OP_WORD_BOUNDARY: case OP_EODN: case OP_EOD: - case OP_CIRC: - case OP_CIRCM: case OP_DOLL: case OP_DOLLM: + case OP_CIRC: + case OP_CIRCM: case OP_REVERSE: cc = compile_simple_assertion_matchingpath(common, *cc, cc + 1, parent->top != NULL ? 
&parent->top->nextbacktracks : &parent->topbacktracks); break; @@ -8774,8 +9536,7 @@ while (cc < ccend) OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(1), STR_PTR, 0); } BACKTRACK_AS(braminzero_backtrack)->matchingpath = LABEL(); - if (cc[1] > OP_ASSERTBACK_NOT) - count_match(common); + count_match(common); break; case OP_ONCE: @@ -8893,7 +9654,7 @@ DEFINE_COMPILER; PCRE2_SPTR cc = current->cc; PCRE2_UCHAR opcode; PCRE2_UCHAR type; -sljit_ui max = 0, exact; +sljit_u32 max = 0, exact; struct sljit_label *label = NULL; struct sljit_jump *jump = NULL; jump_list *jumplist = NULL; @@ -9012,7 +9773,7 @@ switch(opcode) break; } - set_jumps(current->topbacktracks, LABEL()); +set_jumps(current->topbacktracks, LABEL()); } static SLJIT_INLINE void compile_ref_iterator_backtrackingpath(compiler_common *common, struct backtrack_common *current) @@ -9944,7 +10705,7 @@ static SLJIT_INLINE void compile_recurse(compiler_common *common) DEFINE_COMPILER; PCRE2_SPTR cc = common->start + common->currententry->start; PCRE2_SPTR ccbegin = cc + 1 + LINK_SIZE + (*cc == OP_BRA ? 
0 : IMM2_SIZE); -PCRE2_SPTR ccend = bracketend(cc); +PCRE2_SPTR ccend = bracketend(cc) - (1 + LINK_SIZE); BOOL needs_control_head; int framesize = get_framesize(common, cc, NULL, TRUE, &needs_control_head); int private_data_size = get_private_data_copy_length(common, ccbegin, ccend, needs_control_head); @@ -9967,6 +10728,7 @@ common->currententry->entry = LABEL(); set_jumps(common->currententry->calls, common->currententry->entry); sljit_emit_fast_enter(compiler, TMP2, 0); +count_match(common); allocate_stack(common, private_data_size + framesize + alternativesize); OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(private_data_size + framesize + alternativesize - 1), TMP2, 0); copy_private_data(common, ccbegin, ccend, TRUE, private_data_size + framesize + alternativesize, framesize + alternativesize, needs_control_head); @@ -10066,14 +10828,14 @@ sljit_emit_fast_return(compiler, SLJIT_MEM1(STACK_TOP), 0); #undef COMPILE_BACKTRACKINGPATH #undef CURRENT_AS -static int jit_compile(pcre2_code *code, uint32_t mode) +static int jit_compile(pcre2_code *code, sljit_u32 mode) { pcre2_real_code *re = (pcre2_real_code *)code; struct sljit_compiler *compiler; backtrack_common rootbacktrack; compiler_common common_data; compiler_common *common = &common_data; -const sljit_ub *tables = re->tables; +const sljit_u8 *tables = re->tables; void *allocator_data = &re->memctl; int private_data_size; PCRE2_SPTR ccend; @@ -10170,7 +10932,7 @@ ccend = bracketend(common->start); /* Calculate the local space size on the stack. 
*/ common->ovector_start = LIMIT_MATCH + sizeof(sljit_sw); -common->optimized_cbracket = (sljit_ub *)SLJIT_MALLOC(re->top_bracket + 1, allocator_data); +common->optimized_cbracket = (sljit_u8 *)SLJIT_MALLOC(re->top_bracket + 1, allocator_data); if (!common->optimized_cbracket) return PCRE2_ERROR_NOMEMORY; #if defined DEBUG_FORCE_UNOPTIMIZED_CBRAS && DEBUG_FORCE_UNOPTIMIZED_CBRAS == 1 @@ -10206,9 +10968,9 @@ if (mode != PCRE2_JIT_COMPLETE) common->ovector_start += sizeof(sljit_sw); } } -if ((re->overall_options & PCRE2_FIRSTLINE) != 0) +if ((re->overall_options & (PCRE2_FIRSTLINE | PCRE2_USE_OFFSET_LIMIT)) != 0) { - common->first_line_end = common->ovector_start; + common->match_end_ptr = common->ovector_start; common->ovector_start += sizeof(sljit_sw); } #if defined DEBUG_FORCE_CONTROL_HEAD && DEBUG_FORCE_CONTROL_HEAD @@ -10241,16 +11003,24 @@ SLJIT_ASSERT(!(common->req_char_ptr != 0 && common->start_used_ptr != 0)); common->cbra_ptr = OVECTOR_START + (re->top_bracket + 1) * 2 * sizeof(sljit_sw); total_length = ccend - common->start; -common->private_data_ptrs = (sljit_si *)SLJIT_MALLOC(total_length * (sizeof(sljit_si) + (common->has_then ? 1 : 0)), allocator_data); +common->private_data_ptrs = (sljit_s32 *)SLJIT_MALLOC(total_length * (sizeof(sljit_s32) + (common->has_then ? 
1 : 0)), allocator_data); if (!common->private_data_ptrs) { SLJIT_FREE(common->optimized_cbracket, allocator_data); return PCRE2_ERROR_NOMEMORY; } -memset(common->private_data_ptrs, 0, total_length * sizeof(sljit_si)); +memset(common->private_data_ptrs, 0, total_length * sizeof(sljit_s32)); private_data_size = common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw); set_private_data_ptrs(common, &private_data_size, ccend); +if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) + { + if (!detect_fast_forward_skip(common, &private_data_size) && !common->has_skip_in_assert_back) + detect_fast_fail(common, common->start, &private_data_size, 4); + } + +SLJIT_ASSERT(common->fast_fail_start_ptr <= common->fast_fail_end_ptr); + if (private_data_size > SLJIT_MAX_LOCAL_SIZE) { SLJIT_FREE(common->private_data_ptrs, allocator_data); @@ -10260,7 +11030,7 @@ if (private_data_size > SLJIT_MAX_LOCAL_SIZE) if (common->has_then) { - common->then_offsets = (sljit_ub *)(common->private_data_ptrs + total_length); + common->then_offsets = (sljit_u8 *)(common->private_data_ptrs + total_length); memset(common->then_offsets, 0, total_length); set_then_offsets(common, common->start, NULL); } @@ -10287,11 +11057,15 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_S0, 0); OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, end)); OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, stack)); -OP1(SLJIT_MOV_UI, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match)); +OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, limit_match)); OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, base)); OP1(SLJIT_MOV, STACK_LIMIT, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(struct sljit_stack, limit)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP1(SLJIT_MOV, 
SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH, TMP1, 0); +if (common->fast_fail_start_ptr < common->fast_fail_end_ptr) + reset_fast_fail(common); + if (mode == PCRE2_JIT_PARTIAL_SOFT) OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->hit_start, SLJIT_IMM, -1); if (common->mark_ptr != 0) @@ -10302,19 +11076,19 @@ if (common->control_head_ptr != 0) /* Main part of the matching */ if ((re->overall_options & PCRE2_ANCHORED) == 0) { - mainloop_label = mainloop_entry(common, (re->flags & PCRE2_HASCRORLF) != 0, (re->overall_options & PCRE2_FIRSTLINE) != 0); + mainloop_label = mainloop_entry(common, (re->flags & PCRE2_HASCRORLF) != 0, re->overall_options); continue_match_label = LABEL(); /* Forward search if possible. */ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) { - if (mode == PCRE2_JIT_COMPLETE && fast_forward_first_n_chars(common, (re->overall_options & PCRE2_FIRSTLINE) != 0)) + if (mode == PCRE2_JIT_COMPLETE && fast_forward_first_n_chars(common)) ; else if ((re->flags & PCRE2_FIRSTSET) != 0) - fast_forward_first_char(common, (PCRE2_UCHAR)(re->first_codeunit), (re->flags & PCRE2_FIRSTCASELESS) != 0, (re->overall_options & PCRE2_FIRSTLINE) != 0); + fast_forward_first_char(common, (PCRE2_UCHAR)(re->first_codeunit), (re->flags & PCRE2_FIRSTCASELESS) != 0); else if ((re->flags & PCRE2_STARTLINE) != 0) - fast_forward_newline(common, (re->overall_options & PCRE2_FIRSTLINE) != 0); + fast_forward_newline(common); else if ((re->flags & PCRE2_FIRSTMAPSET) != 0) - fast_forward_start_bits(common, re->start_bitmap, (re->overall_options & PCRE2_FIRSTLINE) != 0); + fast_forward_start_bits(common, re->start_bitmap); } } else @@ -10335,6 +11109,8 @@ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), OVECTOR(0), STR_PTR, 0); OP1(SLJIT_MOV, COUNT_MATCH, 0, SLJIT_MEM1(SLJIT_SP), LIMIT_MATCH); if (common->capture_last_ptr != 0) OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr, SLJIT_IMM, 0); +if (common->fast_forward_bc_ptr != NULL) + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), 
PRIVATE_DATA(common->fast_forward_bc_ptr + 1), STR_PTR, 0); if (common->start_ptr != OVECTOR(0)) OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->start_ptr, STR_PTR, 0); @@ -10413,29 +11189,34 @@ if (mode == PCRE2_JIT_PARTIAL_SOFT) } /* Check we have remaining characters. */ -if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->overall_options & PCRE2_FIRSTLINE) != 0) +if ((re->overall_options & PCRE2_ANCHORED) == 0 && common->match_end_ptr != 0) { - SLJIT_ASSERT(common->first_line_end != 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); } -OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), common->start_ptr); +OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), + (common->fast_forward_bc_ptr != NULL) ? (PRIVATE_DATA(common->fast_forward_bc_ptr + 1)) : common->start_ptr); if ((re->overall_options & PCRE2_ANCHORED) == 0) { if (common->ff_newline_shortcut != NULL) { + /* There cannot be more newlines if PCRE2_FIRSTLINE is set. */ if ((re->overall_options & PCRE2_FIRSTLINE) == 0) - CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, common->ff_newline_shortcut); - /* There cannot be more newlines here. */ + { + if (common->match_end_ptr != 0) + { + OP1(SLJIT_MOV, TMP3, 0, STR_END, 0); + OP1(SLJIT_MOV, STR_END, 0, TMP1, 0); + CMPTO(SLJIT_LESS, STR_PTR, 0, TMP1, 0, common->ff_newline_shortcut); + OP1(SLJIT_MOV, STR_END, 0, TMP3, 0); + } + else + CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, common->ff_newline_shortcut); + } } else - { - if ((re->overall_options & PCRE2_FIRSTLINE) == 0) - CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop_label); - else - CMPTO(SLJIT_LESS, STR_PTR, 0, TMP1, 0, mainloop_label); - } + CMPTO(SLJIT_LESS, STR_PTR, 0, (common->match_end_ptr == 0) ? STR_END : TMP1, 0, mainloop_label); } /* No more remaining characters. 
*/ @@ -10454,7 +11235,7 @@ if (common->might_be_empty) { JUMPHERE(empty_match); OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV_UI, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); + OP1(SLJIT_MOV_U32, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options)); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY); JUMPTO(SLJIT_NOT_ZERO, empty_match_backtrack_label); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, PCRE2_NOTEMPTY_ATSTART); @@ -10464,6 +11245,9 @@ if (common->might_be_empty) JUMPTO(SLJIT_JUMP, empty_match_backtrack_label); } +common->fast_forward_bc_ptr = NULL; +common->fast_fail_start_ptr = 0; +common->fast_fail_end_ptr = 0; common->currententry = common->entries; common->local_exit = TRUE; quit_label = common->quit_label; diff --git a/pcre2/src/pcre2_jit_match.c b/pcre2/src/pcre2_jit_match.c index d8d941e46..a323971ff 100644 --- a/pcre2/src/pcre2_jit_match.c +++ b/pcre2/src/pcre2_jit_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -46,7 +46,7 @@ POSSIBILITY OF SUCH DAMAGE. 
static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func) { -sljit_ub local_space[MACHINE_STACK_SIZE]; +sljit_u8 local_space[MACHINE_STACK_SIZE]; struct sljit_stack local_stack; local_stack.top = (sljit_sw)&local_space; @@ -129,10 +129,12 @@ arguments.match_data = match_data; arguments.startchar_ptr = subject; arguments.mark_ptr = NULL; arguments.options = options; + if (mcontext != NULL) { arguments.callout = mcontext->callout; arguments.callout_data = mcontext->callout_data; + arguments.offset_limit = mcontext->offset_limit; arguments.limit_match = (mcontext->match_limit < re->limit_match)? mcontext->match_limit : re->limit_match; if (mcontext->jit_callback != NULL) @@ -144,6 +146,7 @@ else { arguments.callout = NULL; arguments.callout_data = NULL; + arguments.offset_limit = PCRE2_UNSET; arguments.limit_match = (MATCH_LIMIT < re->limit_match)? MATCH_LIMIT : re->limit_match; jit_stack = NULL; diff --git a/pcre2/src/pcre2_jit_misc.c b/pcre2/src/pcre2_jit_misc.c index f5b51286e..efdb05580 100644 --- a/pcre2/src/pcre2_jit_misc.c +++ b/pcre2/src/pcre2_jit_misc.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/pcre2/src/pcre2_jit_test.c b/pcre2/src/pcre2_jit_test.c index b076c67d1..705ba181e 100644 --- a/pcre2/src/pcre2_jit_test.c +++ b/pcre2/src/pcre2_jit_test.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -140,7 +140,6 @@ int main(void) #define F_DIFF 0x080000 #define F_FORCECONV 0x100000 #define F_PROPERTY 0x200000 -#define F_STUDY 0x400000 struct regression_test_case { int compile_options; @@ -188,6 +187,7 @@ static struct regression_test_case regression_test_cases[] = { { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" }, { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" }, { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" }, + { M, A, 0, 0, "[3-57-9]", "5" }, /* Assertions. */ { MU, A, 0, 0, "\\b[^A]", "A_B#" }, @@ -247,13 +247,17 @@ static struct regression_test_case regression_test_cases[] = { { M, A, 0, 0, "a\\z", "aaa" }, { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" }, - /* Brackets. */ + /* Brackets and alternatives. */ { MU, A, 0, 0, "(ab|bb|cd)", "bacde" }, { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" }, { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" }, { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" }, { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" }, { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" }, + { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" }, + { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" }, + { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" }, + { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" }, /* Greedy and non-greedy ? operators. 
*/ { MU, A, 0, 0, "(?:a)?a", "laab" }, @@ -323,6 +327,14 @@ static struct regression_test_case regression_test_cases[] = { { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" }, { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" }, { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" }, + { MU, A, 0, 0, "\\d+123", "987654321,01234" }, + { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" }, + { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" }, + { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."}, + { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."}, + { MU, A, 0, 0, ".[ab]*.", "xx" }, + { MU, A, 0, 0, ".[ab]*a", "xxa" }, + { MU, A, 0, 0, ".[ab]?.", "xx" }, /* Bracket repeats with limit. */ { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" }, @@ -679,6 +691,8 @@ static struct regression_test_case regression_test_cases[] = { { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" }, { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" }, { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" }, + { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" }, + { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" }, /* Recurse. 
*/ { MU, A, 0, 0, "(a)(?1)", "aa" }, @@ -765,11 +779,11 @@ static struct regression_test_case regression_test_cases[] = { { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" }, { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" }, { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" }, - { MU, A, 0, 0 | F_NOMATCH | F_STUDY, "(a(*:aa)){0}(?:b(?1)b)+", "ba" }, + { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" }, { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" }, { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" }, - { MU, A, 0, 0 | F_NOMATCH | F_STUDY, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" }, - { MU, A, 0, 0 | F_NOMATCH | F_STUDY, "(*:mark)m", "a" }, + { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" }, + { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" }, /* (*COMMIT) verb. */ { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" }, @@ -813,6 +827,9 @@ static struct regression_test_case regression_test_cases[] = { /* (*SKIP) verb. */ { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" }, + { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," }, + { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," }, + { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" }, /* (*THEN) verb. 
*/ { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" }, @@ -1516,10 +1533,10 @@ static int regression_tests(void) is_successful = 0; } #endif -#if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_16 - if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector16_1[i] || ovector16_1[i] != ovector16_2[i]) { - printf("\n16 and 16 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n", - i, ovector16_1[i], ovector16_2[i], ovector16_1[i], ovector16_2[i], +#if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32 + if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) { + printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n", + i, ovector16_1[i], ovector16_2[i], ovector32_1[i], ovector32_2[i], total, current->pattern, current->input); is_successful = 0; } diff --git a/pcre2/src/pcre2_maketables.c b/pcre2/src/pcre2_maketables.c index ca68bca2a..2c7ae84d8 100644 --- a/pcre2/src/pcre2_maketables.c +++ b/pcre2/src/pcre2_maketables.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/pcre2/src/pcre2_match.c b/pcre2/src/pcre2_match.c index d3d5c1dfa..78a9bacbc 100644 --- a/pcre2/src/pcre2_match.c +++ b/pcre2/src/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -55,7 +55,7 @@ POSSIBILITY OF SUCH DAMAGE. #define PUBLIC_MATCH_OPTIONS \ (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ - PCRE2_PARTIAL_SOFT) + PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT) #define PUBLIC_JIT_MATCH_OPTIONS \ (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ @@ -142,14 +142,14 @@ Returns: = 0 sucessful match; number of code units matched is set */ static int -match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr, +match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, PCRE2_SPTR eptr, match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr) { #if defined SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; #endif -register PCRE2_SPTR p; +PCRE2_SPTR p; PCRE2_SIZE length; PCRE2_SPTR eptr_start = eptr; @@ -194,7 +194,7 @@ if (caseless) GETCHARINC(c, eptr); GETCHARINC(d, p); ur = GET_UCD(d); - if (c != d && c != d + ur->other_case) + if (c != d && c != (uint32_t)((int)d + ur->other_case)) { const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; for (;;) @@ -211,7 +211,7 @@ if (caseless) /* Not in UTF mode */ { - while (length-- > 0) + for (; length > 0; length--) { uint32_t cc, cp; if (eptr >= mb->end_subject) return 1; /* Partial match */ @@ -226,11 +226,11 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UT mode. */ +are in UTF mode. 
*/ else { - while (length-- > 0) + for (; length > 0; length--) { if (eptr >= mb->end_subject) return 1; /* Partial match */ if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */ @@ -296,7 +296,6 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, argument of RMATCH isn't actually used in this definition. */ #ifndef HEAP_MATCH_RECURSE -#define REGISTER register #define RMATCH(ra,rb,rc,rd,re,rw) \ rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) #define RRETURN(ra) return ra @@ -306,8 +305,6 @@ argument of RMATCH isn't actually used in this definition. */ the "rd" argument of RMATCH isn't actually used in this definition. It's the mb argument of match(), which never changes. */ -#define REGISTER - #define RMATCH(ra,rb,rc,rd,re,rw)\ {\ heapframe *newframe = frame->Xnextframe;\ @@ -425,7 +422,7 @@ to save the ovector while calling match() to process the pattern recursion. */ op_recurse_ovecsave(). */ static int -match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, +match(PCRE2_SPTR eptr, PCRE2_SPTR ecode, PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth); @@ -465,14 +462,14 @@ Returns: a match() return code */ static int -#ifdef __GNUC__ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) __attribute__ ((noinline)) #endif -op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr, PCRE2_SPTR callpat, +op_recurse_ovecsave(PCRE2_SPTR eptr, PCRE2_SPTR callpat, PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth) { -register int rrc; +int rrc; BOOL cbegroup = *callpat >= OP_SBRA; recursion_info *new_recursive = mb->recursive; PCRE2_SIZE ovecsave[OP_RECURSE_STACK_SAVE_MAX]; @@ -576,20 +573,19 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 */ static int -match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, +match(PCRE2_SPTR eptr, PCRE2_SPTR ecode, PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock 
*eptrb, uint32_t rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark some of them with "register" because they are used a lot in loops. */ -register int rrc; /* Returns from recursive calls */ -register int i; /* Used for loops not involving calls to RMATCH() */ -register uint32_t c; /* Character values not kept over RMATCH() calls */ -register BOOL utf; /* Local copy of UTF flag for speed */ +int rrc; /* Returns from recursive calls */ +int i; /* Used for loops not involving calls to RMATCH() */ +uint32_t c; /* Character values not kept over RMATCH() calls */ +BOOL utf; /* Local copy of UTF flag for speed */ BOOL minimize, possessive; /* Quantifier options */ -BOOL caseless; int condcode; /* When recursion is not being used, all "local" variables that have to be @@ -727,6 +723,7 @@ still need to be preserved over recursive calls of match(). These macros define the alternative names that are used. */ #define allow_zero cur_is_word +#define caseless cur_is_word #define cbegroup condition #define code_offset codelink #define condassert condition @@ -1319,7 +1316,7 @@ for (;;) { pcre2_callout_block cb; cb.version = 1; - cb.capture_top = offset_top/2; + cb.capture_top = (uint32_t)offset_top/2; cb.capture_last = mb->capture_last & CAPLMASK; cb.offset_vector = mb->ovector; cb.mark = mb->nomatch_mark; @@ -1503,8 +1500,8 @@ for (;;) if (offset >= offset_top) { - register PCRE2_SIZE *iptr = mb->ovector + offset_top; - register PCRE2_SIZE *iend = mb->ovector + offset; + PCRE2_SIZE *iptr = mb->ovector + offset_top; + PCRE2_SIZE *iend = mb->ovector + offset; while (iptr < iend) *iptr++ = PCRE2_UNSET; offset_top = offset + 2; } @@ -1704,14 +1701,14 @@ for (;;) back a number of characters, not bytes. 
*/ case OP_REVERSE: + i = GET(ecode, 1); #ifdef SUPPORT_UNICODE if (utf) { - i = GET(ecode, 1); while (i-- > 0) { + if (eptr <= mb->start_subject) RRETURN(MATCH_NOMATCH); eptr--; - if (eptr < mb->start_subject) RRETURN(MATCH_NOMATCH); BACKCHAR(eptr); } } @@ -1721,8 +1718,8 @@ for (;;) /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ { - eptr -= GET(ecode, 1); - if (eptr < mb->start_subject) RRETURN(MATCH_NOMATCH); + if (i > eptr - mb->start_subject) RRETURN(MATCH_NOMATCH); + eptr -= i; } /* Save the earliest consulted character, then skip to next op code */ @@ -1746,7 +1743,7 @@ for (;;) pcre2_callout_block cb; cb.version = 1; cb.callout_number = ecode[LINK_SIZE + 1]; - cb.capture_top = offset_top/2; + cb.capture_top = (uint32_t)offset_top/2; cb.capture_last = mb->capture_last & CAPLMASK; cb.offset_vector = mb->ovector; cb.mark = mb->nomatch_mark; @@ -2052,8 +2049,8 @@ for (;;) if (offset > offset_top) { - register PCRE2_SIZE *iptr = mb->ovector + offset_top; - register PCRE2_SIZE *iend = mb->ovector + offset; + PCRE2_SIZE *iptr = mb->ovector + offset_top; + PCRE2_SIZE *iend = mb->ovector + offset; while (iptr < iend) *iptr++ = PCRE2_UNSET; } @@ -2382,7 +2379,7 @@ for (;;) case OP_ANY: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); if (mb->partial != 0 && - eptr + 1 >= mb->end_subject && + eptr == mb->end_subject - 1 && NLBLOCK->nltype == NLTYPE_FIXED && NLBLOCK->nllen == 2 && UCHAR21TEST(eptr) == NLBLOCK->nl[0]) @@ -2408,8 +2405,9 @@ for (;;) ecode++; break; - /* Match a single byte, even in UTF-8 mode. This opcode really does match - any byte, even newline, independent of the setting of PCRE2_DOTALL. */ + /* Match a single code unit, even in UTF-8 mode. This opcode really does + match any code unit, even newline. (It really should be called ANYCODEUNIT, + of course - the byte name is from pre-16 bit days.) 
*/ case OP_ANYBYTE: if (eptr >= mb->end_subject) /* DO NOT merge the eptr++ here; it must */ @@ -2848,9 +2846,7 @@ for (;;) continue; } - /* First, ensure the minimum number of matches are present. We get back - the length of the reference string explicitly rather than passing the - address of eptr, so that eptr can be a register variable. */ + /* First, ensure the minimum number of matches are present. */ for (i = 1; i <= min; i++) { @@ -3342,7 +3338,10 @@ for (;;) CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ RRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); + for (; length > 0; length--) + { + if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); + } } else #endif @@ -3758,7 +3757,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t ch, och; + uint32_t ch, och; ecode++; GETCHARINC(ch, ecode); @@ -3780,7 +3779,7 @@ for (;;) else #endif /* SUPPORT_UNICODE */ { - register uint32_t ch = ecode[1]; + uint32_t ch = ecode[1]; c = *eptr++; if (ch == c || (op == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == c)) RRETURN(MATCH_NOMATCH); @@ -3886,7 +3885,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t d; + uint32_t d; for (i = 1; i <= min; i++) { if (eptr >= mb->end_subject) @@ -3921,7 +3920,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t d; + uint32_t d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, mb, eptrb, RM28); @@ -3966,7 +3965,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t d; + uint32_t d; for (i = min; i < max; i++) { int len = 1; @@ -4027,7 +4026,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t d; + uint32_t d; for (i = 1; i <= min; i++) { if (eptr >= mb->end_subject) @@ -4061,7 +4060,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - register uint32_t d; + uint32_t d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, mb, eptrb, RM32); @@ -4105,7 +4104,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - 
register uint32_t d; + uint32_t d; for (i = min; i < max; i++) { int len = 1; @@ -6459,6 +6458,7 @@ PCRE2_UCHAR first_cu2 = 0; PCRE2_UCHAR req_cu = 0; PCRE2_UCHAR req_cu2 = 0; +PCRE2_SPTR bumpalong_limit; PCRE2_SPTR end_subject; PCRE2_SPTR start_match = subject + start_offset; PCRE2_SPTR req_cu_ptr = start_match - 1; @@ -6482,6 +6482,7 @@ mb->match_frames_base = &frame_zero; subject string. */ if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); +end_subject = subject + length; /* Plausibility checks */ @@ -6513,7 +6514,7 @@ occur. */ #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); #undef FF #undef OO @@ -6533,21 +6534,66 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : /* Check a UTF string for validity if required. For 8-bit and 16-bit strings, we must also check that a starting offset does not point into the middle of a -multiunit character. */ +multiunit character. We check only the portion of the subject that is going to +be inspected during matching - from the offset minus the maximum back reference +to the given length. This saves time when a small part of a large subject is +being matched by the use of a starting offset. Note that the maximum lookbehind +is a number of characters, not code units. 
*/ #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar)); - if (match_data->rc != 0) return match_data->rc; + PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + + if (start_offset > 0) + { #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && start_offset < length && - NOT_FIRSTCHAR(subject[start_offset])) - return PCRE2_ERROR_BADUTFOFFSET; + unsigned int i; + if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + return PCRE2_ERROR_BADUTFOFFSET; + for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) + { + check_subject--; + while (check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*check_subject & 0xfc00) == 0xdc00) +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ + check_subject--; + } +#else + /* In the 32-bit library, one code unit equals one character. However, + we cannot just subtract the lookbehind and then compare pointers, because + a very large lookbehind could create an invalid pointer. */ + + if (start_offset >= re->max_lookbehind) + check_subject -= re->max_lookbehind; + else + check_subject = subject; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + } + + /* Validate the relevant portion of the subject. After an error, adjust the + offset to be an absolute offset in the whole string. */ + + match_data->rc = PRIV(valid_utf)(check_subject, + length - (check_subject - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += check_subject - subject; + return match_data->rc; + } } #endif /* SUPPORT_UNICODE */ +/* It is an error to set an offset limit without setting the flag at compile +time. 
*/ + +if (mcontext->offset_limit != PCRE2_UNSET && + (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) + return PCRE2_ERROR_BADOFFSETLIMIT; + /* If the pattern was successfully studied with JIT support, run the JIT executable instead of the rest of this function. Most options must be set at compile time for the JIT code to be usable. Fallback to the normal code path if @@ -6568,30 +6614,21 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; startline = (re->flags & PCRE2_STARTLINE) != 0; +bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? + end_subject : subject + mcontext->offset_limit; /* Fill in the fields in the match block. */ -if (mcontext == NULL) - { - mb->callout = NULL; - mb->memctl = re->memctl; +mb->callout = mcontext->callout; +mb->callout_data = mcontext->callout_data; +mb->memctl = mcontext->memctl; #ifdef HEAP_MATCH_RECURSE - mb->stack_memctl = re->memctl; +mb->stack_memctl = mcontext->stack_memctl; #endif - } -else - { - mb->callout = mcontext->callout; - mb->callout_data = mcontext->callout_data; - mb->memctl = mcontext->memctl; -#ifdef HEAP_MATCH_RECURSE - mb->stack_memctl = mcontext->stack_memctl; -#endif - } mb->start_subject = subject; mb->start_offset = start_offset; -mb->end_subject = end_subject = mb->start_subject + length; +mb->end_subject = end_subject; mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; mb->moptions = options; /* Match options */ @@ -6689,8 +6726,8 @@ in case they inspect these fields. 
*/ if (ocount > 0) { - register PCRE2_SIZE *iptr = mb->ovector + ocount; - register PCRE2_SIZE *iend = iptr - re->top_bracket; + PCRE2_SIZE *iptr = mb->ovector + ocount; + PCRE2_SIZE *iend = iptr - re->top_bracket; if (iend < mb->ovector + 2) iend = mb->ovector + 2; while (--iptr >= iend) *iptr = PCRE2_UNSET; mb->ovector[0] = mb->ovector[1] = PCRE2_UNSET; @@ -6783,7 +6820,8 @@ for(;;) end_subject = t; } - /* Advance to a unique first code unit if there is one. */ + /* Advance to a unique first code unit if there is one. In 8-bit mode, the + use of memchr() gives a big speed up. */ if (has_first_cu) { @@ -6793,8 +6831,15 @@ for(;;) (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) start_match++; else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) start_match++; +#else + start_match = memchr(start_match, first_cu, end_subject - start_match); + if (start_match == NULL) start_match = end_subject; +#endif + } } /* Or to just after a linebreak for a multiline match */ @@ -6838,7 +6883,7 @@ for(;;) { while (start_match < end_subject) { - register uint32_t c = UCHAR21TEST(start_match); + uint32_t c = UCHAR21TEST(start_match); #if PCRE2_CODE_UNIT_WIDTH != 8 if (c > 255) c = 255; #endif @@ -6882,7 +6927,7 @@ for(;;) if (has_req_cu && end_subject - start_match < REQ_CU_MAX) { - register PCRE2_SPTR p = start_match + (has_first_cu? 1:0); + PCRE2_SPTR p = start_match + (has_first_cu? 1:0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ @@ -6893,7 +6938,7 @@ for(;;) { while (p < end_subject) { - register uint32_t pp = UCHAR21INCTEST(p); + uint32_t pp = UCHAR21INCTEST(p); if (pp == req_cu || pp == req_cu2) { p--; break; } } } @@ -6926,6 +6971,14 @@ for(;;) /* ------------ End of start of match optimizations ------------ */ + /* Give no match if we have passed the bumpalong limit. 
*/ + + if (start_match > bumpalong_limit) + { + rc = MATCH_NOMATCH; + break; + } + /* OK, we can now run the match. If "hitend" is set afterwards, remember the first starting point for which a partial match was found. */ @@ -7044,7 +7097,7 @@ for(;;) (2) The pattern is anchored or the match was failed by (*COMMIT); -(3) We are past the end of the subject; +(3) We are past the end of the subject or the bumpalong limit; (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because this option requests that a match occur at or before the first newline in @@ -7104,7 +7157,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) too many to fit into the ovector. */ match_data->rc = ((mb->capture_last & OVFLBIT) != 0)? - 0 : mb->end_offset_top/2; + 0 : (int)mb->end_offset_top/2; /* If there is space in the offset vector, set any pairs that follow the highest-numbered captured string but are less than the number of capturing @@ -7118,7 +7171,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) if (mb->end_offset_top/2 <= re->top_bracket) { - register PCRE2_SIZE *iptr, *iend; + PCRE2_SIZE *iptr, *iend; int resetcount = re->top_bracket + 1; if (resetcount > match_data->oveccount) resetcount = match_data->oveccount; iptr = match_data->ovector + mb->end_offset_top; diff --git a/pcre2/src/pcre2_match_data.c b/pcre2/src/pcre2_match_data.c index 1f2fb1536..85ac99834 100644 --- a/pcre2/src/pcre2_match_data.c +++ b/pcre2/src/pcre2_match_data.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/pcre2/src/pcre2_newline.c b/pcre2/src/pcre2_newline.c index 7f482f245..6e9366db9 100644 --- a/pcre2/src/pcre2_newline.c +++ b/pcre2/src/pcre2_newline.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/pcre2/src/pcre2_ord2utf.c b/pcre2/src/pcre2_ord2utf.c index d268e94ee..140373099 100644 --- a/pcre2/src/pcre2_ord2utf.c +++ b/pcre2/src/pcre2_ord2utf.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -83,7 +83,7 @@ PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer) /* Convert to UTF-8 */ #if PCRE2_CODE_UNIT_WIDTH == 8 -register int i, j; +int i, j; for (i = 0; i < PRIV(utf8_table1_size); i++) if ((int)cvalue <= PRIV(utf8_table1)[i]) break; buffer += i; diff --git a/pcre2/src/pcre2_pattern_info.c b/pcre2/src/pcre2_pattern_info.c index a0e734c9b..5b32a905b 100644 --- a/pcre2/src/pcre2_pattern_info.c +++ b/pcre2/src/pcre2_pattern_info.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -77,6 +77,7 @@ if (where == NULL) /* Requests field length */ case PCRE2_INFO_CAPTURECOUNT: case PCRE2_INFO_FIRSTCODETYPE: case PCRE2_INFO_FIRSTCODEUNIT: + case PCRE2_INFO_HASBACKSLASHC: case PCRE2_INFO_HASCRORLF: case PCRE2_INFO_JCHANGED: case PCRE2_INFO_LASTCODETYPE: @@ -151,6 +152,10 @@ switch(what) &(re->start_bitmap[0]) : NULL; break; + case PCRE2_INFO_HASBACKSLASHC: + *((uint32_t *)where) = (re->flags & PCRE2_HASBKC) != 0; + break; + case PCRE2_INFO_HASCRORLF: *((uint32_t *)where) = (re->flags & PCRE2_HASCRORLF) != 0; break; diff --git a/pcre2/src/pcre2_printint.c b/pcre2/src/pcre2_printint.c index 2cd01ab63..620749764 100644 --- a/pcre2/src/pcre2_printint.c +++ b/pcre2/src/pcre2_printint.c @@ -7,7 +7,7 @@ and semantics are as close as possible 
to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -58,12 +58,13 @@ static const char *OP_names[] = { OP_NAME_LIST }; /* The functions and tables herein must all have mode-dependent names. */ -#define OP_lengths PCRE2_SUFFIX(OP_lengths_) -#define get_ucpname PCRE2_SUFFIX(get_ucpname_) -#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_) -#define print_char PCRE2_SUFFIX(print_char_) -#define print_custring PCRE2_SUFFIX(print_custring_) -#define print_prop PCRE2_SUFFIX(print_prop_) +#define OP_lengths PCRE2_SUFFIX(OP_lengths_) +#define get_ucpname PCRE2_SUFFIX(get_ucpname_) +#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_) +#define print_char PCRE2_SUFFIX(print_char_) +#define print_custring PCRE2_SUFFIX(print_custring_) +#define print_custring_bylen PCRE2_SUFFIX(print_custring_bylen_) +#define print_prop PCRE2_SUFFIX(print_prop_) /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that the definition is next to the definition of the opcodes in pcre2_internal.h. @@ -188,12 +189,14 @@ return 0; * Print string as a list of code units * *************************************************/ -/* This takes no account of UTF as it always prints each individual code unit. -The string is zero-terminated. +/* These take no account of UTF as they always print each individual code unit. +The string is zero-terminated for print_custring(); the length is given for +print_custring_bylen(). 
Arguments: f file to write to ptr point to the string + len length for print_custring_bylen() Returns: nothing */ @@ -203,7 +206,17 @@ print_custring(FILE *f, PCRE2_SPTR ptr) { while (*ptr != '\0') { - register uint32_t c = *ptr++; + uint32_t c = *ptr++; + if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c); + } +} + +static void +print_custring_bylen(FILE *f, PCRE2_SPTR ptr, PCRE2_UCHAR len) +{ +for (; len > 0; len--) + { + uint32_t c = *ptr++; if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c); } } @@ -603,7 +616,7 @@ for(;;) c = code[1 + 4*LINK_SIZE]; fprintf(f, " %s %c", OP_names[*code], c); extra = GET(code, 1 + 2*LINK_SIZE); - print_custring(f, code + 2 + 4*LINK_SIZE); + print_custring_bylen(f, code + 2 + 4*LINK_SIZE, extra - 3 - 4*LINK_SIZE); for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) if (c == PRIV(callout_start_delims)[i]) { @@ -791,7 +804,7 @@ for(;;) case OP_SKIP_ARG: case OP_THEN_ARG: fprintf(f, " %s ", OP_names[*code]); - print_custring(f, code + 2); + print_custring_bylen(f, code + 2, code[1]); extra += code[1]; break; diff --git a/pcre2/src/pcre2_serialize.c b/pcre2/src/pcre2_serialize.c index 828b9461e..0af26d8fc 100644 --- a/pcre2/src/pcre2_serialize.c +++ b/pcre2/src/pcre2_serialize.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -104,7 +104,7 @@ for (i = 0; i < number_of_codes; i++) return PCRE2_ERROR_MIXEDTABLES; total_size += re->blocksize; } - + /* Initialize the byte stream. 
*/ bytes = memctl->malloc(total_size + sizeof(pcre2_memctl), memctl->memory_data); if (bytes == NULL) return PCRE2_ERROR_NOMEMORY; @@ -158,6 +158,7 @@ int32_t i, j; if (data == NULL || codes == NULL) return PCRE2_ERROR_NULL; if (number_of_codes <= 0) return PCRE2_ERROR_BADDATA; +if (data->number_of_codes <= 0) return PCRE2_ERROR_BADSERIALIZEDDATA; if (data->magic != SERIALIZED_DATA_MAGIC) return PCRE2_ERROR_BADMAGIC; if (data->version != SERIALIZED_DATA_VERSION) return PCRE2_ERROR_BADMODE; if (data->config != SERIALIZED_DATA_CONFIG) return PCRE2_ERROR_BADMODE; @@ -167,7 +168,7 @@ if (number_of_codes > data->number_of_codes) src_bytes = bytes + sizeof(pcre2_serialized_data); -/* Decode tables. The reference count for the tables is stored immediately +/* Decode tables. The reference count for the tables is stored immediately following them. */ tables = memctl->malloc(tables_length + sizeof(PCRE2_SIZE), memctl->memory_data); @@ -179,8 +180,8 @@ src_bytes += tables_length; /* Decode the byte stream. We must not try to read the size from the compiled code block in the stream, because it might be unaligned, which causes errors on -hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type -of the blocksize field is given its own name to ensure that it is the same here +hardware such as Sparc-64 that doesn't like unaligned memory accesses. The type +of the blocksize field is given its own name to ensure that it is the same here as in the block. */ for (i = 0; i < number_of_codes; i++) @@ -188,10 +189,12 @@ for (i = 0; i < number_of_codes; i++) CODE_BLOCKSIZE_TYPE blocksize; memcpy(&blocksize, src_bytes + offsetof(pcre2_real_code, blocksize), sizeof(CODE_BLOCKSIZE_TYPE)); + if (blocksize <= sizeof(pcre2_real_code)) + return PCRE2_ERROR_BADSERIALIZEDDATA; /* The allocator provided by gcontext replaces the original one. 
*/ - - dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize, + + dst_re = (pcre2_real_code *)PRIV(memctl_malloc)(blocksize, (pcre2_memctl *)gcontext); if (dst_re == NULL) { @@ -205,12 +208,16 @@ for (i = 0; i < number_of_codes; i++) } /* The new allocator must be preserved. */ - + memcpy(((uint8_t *)dst_re) + sizeof(pcre2_memctl), src_bytes + sizeof(pcre2_memctl), blocksize - sizeof(pcre2_memctl)); + if (dst_re->magic_number != MAGIC_NUMBER || + dst_re->name_entry_size > MAX_NAME_SIZE + IMM2_SIZE + 1 || + dst_re->name_count > MAX_NAME_COUNT) + return PCRE2_ERROR_BADSERIALIZEDDATA; /* At the moment only one table is supported. */ - + dst_re->tables = tables; dst_re->executable_jit = NULL; dst_re->flags |= PCRE2_DEREF_TABLES; @@ -252,7 +259,7 @@ if (bytes != NULL) { pcre2_memctl *memctl = (pcre2_memctl *)(bytes - sizeof(pcre2_memctl)); memctl->free(memctl, memctl->memory_data); - } + } } /* End of pcre2_serialize.c */ diff --git a/pcre2/src/pcre2_string_utils.c b/pcre2/src/pcre2_string_utils.c index 888620e19..2a1f28262 100644 --- a/pcre2/src/pcre2_string_utils.c +++ b/pcre2/src/pcre2_string_utils.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -121,7 +121,7 @@ int PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len) { PCRE2_UCHAR c1, c2; -while (len-- > 0) +for (; len > 0; len--) { c1 = *str1++; c2 = *str2++; @@ -150,7 +150,7 @@ int PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len) { PCRE2_UCHAR c1, c2; -while (len-- > 0) +for (; len > 0; len--) { c1 = *str1++; c2 = *str2++; diff --git a/pcre2/src/pcre2_study.c b/pcre2/src/pcre2_study.c index 25d7e5140..5a4d520c0 100644 --- a/pcre2/src/pcre2_study.c +++ b/pcre2/src/pcre2_study.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -50,6 +50,10 @@ collecting data (e.g. minimum matching length). */ #include "pcre2_internal.h" +/* The maximum remembered capturing brackets minimum. */ + +#define MAX_CACHE_BACKREF 128 + /* Set a bit in the starting code unit bit map. */ #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7)) @@ -59,15 +63,23 @@ collecting data (e.g. minimum matching length). */ enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; - /************************************************* * Find the minimum subject length for a group * *************************************************/ /* Scan a parenthesized group and compute the minimum length of subject that is needed to match it. 
This is a lower bound; it does not mean there is a -string of that length that matches. In UTF8 mode, the result is in characters -rather than bytes. +string of that length that matches. In UTF mode, the result is in characters +rather than code units. The field in a compiled pattern for storing the minimum +length is 16-bits long (on the grounds that anything longer than that is +pathological), so we give up when we reach that amount. This also means that +integer overflow for really crazy patterns cannot happen. + +Backreference minimum lengths are cached to speed up multiple references. This +function is called only when the highest back reference in the pattern is less +than or equal to MAX_CACHE_BACKREF, which is one less than the size of the +caching vector. The zeroth element contains the number of the highest set +value. Arguments: re compiled pattern block @@ -75,35 +87,58 @@ Arguments: startcode pointer to start of the whole pattern's code utf UTF flag recurses chain of recurse_check to catch mutual recursion + countptr pointer to call count (to catch over complexity) + backref_cache vector for caching back references. 
Returns: the minimum length -1 \C in UTF-8 mode or (*ACCEPT) + or pattern too complicated + or back reference to duplicate name/number -2 internal error (missing capturing bracket) -3 internal error (opcode not listed) */ static int find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, - PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses) + PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr, + int *backref_cache) { int length = -1; +int prev_cap_recno = -1; +int prev_cap_d = 0; +int prev_recurse_recno = -1; +int prev_recurse_d = 0; +uint32_t once_fudge = 0; BOOL had_recurse = FALSE; +BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0; recurse_check this_recurse; -register int branchlength = 0; -register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; +int branchlength = 0; +PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; -if (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; +/* If this is a "could be empty" group, its minimum length is 0. */ -/* Scan along the opcodes for this branch. If we get to the end of the -branch, check the length against that of the other branches. */ +if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; + +/* Skip over capturing bracket number */ + +if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; + +/* A large and/or complex regex can take too long to process. */ + +if ((*countptr)++ > 1000) return -1; + +/* Scan along the opcodes for this branch. If we get to the end of the branch, +check the length against that of the other branches. If the accumulated length +passes 16-bits, stop. */ for (;;) { - int d, min; + int d, min, recno; PCRE2_UCHAR *cs, *ce; - register PCRE2_UCHAR op = *cc; + PCRE2_UCHAR op = *cc; + + if (branchlength >= UINT16_MAX) return UINT16_MAX; switch (op) { @@ -112,7 +147,8 @@ for (;;) /* If there is only one branch in a condition, the implied branch has zero length, so we don't add anything. 
This covers the DEFINE "condition" - automatically. */ + automatically. If there are two branches we can treat it the same as any + other non-capturing subpattern. */ cs = cc + GET(cc, 1); if (*cs != OP_ALT) @@ -120,23 +156,54 @@ for (;;) cc = cs + 1 + LINK_SIZE; break; } + goto PROCESS_NON_CAPTURE; - /* Otherwise we can fall through and treat it the same as any other - subpattern. */ + /* There's a special case of OP_ONCE, when it is wrapped round an + OP_RECURSE. We'd like to process the latter at this level so that + remembering the value works for repeated cases. So we do nothing, but + set a fudge value to skip over the OP_KET after the recurse. */ + + case OP_ONCE: + if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET) + { + once_fudge = 1 + LINK_SIZE; + cc += 1 + LINK_SIZE; + break; + } + /* Fall through */ + + case OP_ONCE_NC: + case OP_BRA: + case OP_SBRA: + case OP_BRAPOS: + case OP_SBRAPOS: + PROCESS_NON_CAPTURE: + d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); + if (d < 0) return d; + branchlength += d; + do cc += GET(cc, 1); while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; + + /* To save time for repeated capturing subpatterns, we remember the + length of the previous one. Unfortunately we can't do the same for + the unnumbered ones above. Nor can we do this if (?| is present in the + pattern because captures with the same number are not then identical. 
*/ case OP_CBRA: case OP_SCBRA: - case OP_BRA: - case OP_SBRA: case OP_CBRAPOS: case OP_SCBRAPOS: - case OP_BRAPOS: - case OP_SBRAPOS: - case OP_ONCE: - case OP_ONCE_NC: - d = find_minlength(re, cc, startcode, utf, recurses); - if (d < 0) return d; - branchlength += d; + recno = (int)GET2(cc, 1+LINK_SIZE); + if (dupcapused || recno != prev_cap_recno) + { + prev_cap_recno = recno; + prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); + if (prev_cap_d < 0) return prev_cap_d; + } + branchlength += prev_cap_d; do cc += GET(cc, 1); while (*cc == OP_ALT); cc += 1 + LINK_SIZE; break; @@ -388,8 +455,12 @@ for (;;) matches an empty string (by default it causes a matching failure), so in that case we must set the minimum length to zero. */ - case OP_DNREF: /* Duplicate named pattern back reference */ + /* Duplicate named pattern back reference. We cannot reliably find a length + for this if duplicate numbers are present in the pattern. */ + + case OP_DNREF: case OP_DNREFI: + if (dupcapused) return -1; if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { int count = GET2(cc, 1+IMM2_SIZE); @@ -399,18 +470,80 @@ for (;;) d = INT_MAX; - /* Scan all groups with the same name */ + /* Scan all groups with the same name; find the shortest. 
*/ while (count-- > 0) { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); + int dd, i; + recno = GET2(slot, 0); + + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + dd = backref_cache[recno]; + else + { + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ + { + dd = 0; + had_recurse = TRUE; + } + else + { + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) + if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + dd = 0; + had_recurse = TRUE; + } + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + dd = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (dd < 0) return dd; + } + } + + backref_cache[recno] = dd; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; + } + + if (dd < d) d = dd; + if (d <= 0) break; /* No point looking at any more */ + slot += re->name_entry_size; + } + } + else d = 0; + cc += 1 + 2*IMM2_SIZE; + goto REPEAT_BACK_REFERENCE; + + /* Single back reference. We cannot find a length for this if duplicate + numbers are present in the pattern. 
*/ + + case OP_REF: + case OP_REFI: + if (dupcapused) return -1; + recno = GET2(cc, 1); + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + d = backref_cache[recno]; + else + { + int i; + if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + { + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); if (cs == NULL) return -2; do ce += GET(ce, 1); while (*ce == OP_ALT); if (cc > cs && cc < ce) /* Simple recursion */ { d = 0; had_recurse = TRUE; - break; } else { @@ -420,54 +553,24 @@ for (;;) { d = 0; had_recurse = TRUE; - break; } else { - int dd; this_recurse.prev = recurses; this_recurse.group = cs; - dd = find_minlength(re, cs, startcode, utf, &this_recurse); - if (dd < d) d = dd; + d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, + backref_cache); + if (d < 0) return d; } } - slot += re->name_entry_size; } - } - else d = 0; - cc += 1 + 2*IMM2_SIZE; - goto REPEAT_BACK_REFERENCE; + else d = 0; - case OP_REF: /* Single back reference */ - case OP_REFI: - if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) - { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ - { - d = 0; - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ - { - d = 0; - had_recurse = TRUE; - } - else - { - this_recurse.prev = recurses; - this_recurse.group = cs; - d = find_minlength(re, cs, startcode, utf, &this_recurse); - } - } + backref_cache[recno] = d; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; } - else d = 0; + cc += 1 + IMM2_SIZE; /* Handle repeated back references */ @@ -504,28 +607,51 @@ for (;;) break; } - branchlength += min * d; + /* Take care not to overflow: (1) min and d are ints, so check that 
their + product is not greater than INT_MAX. (2) branchlength is limited to + UINT16_MAX (checked at the top of the loop). */ + + if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d) + branchlength = UINT16_MAX; + else branchlength += min * d; break; + /* Recursion always refers to the first occurrence of a subpattern with a + given number. Therefore, we can always make use of caching, even when the + pattern contains multiple subpatterns with the same number. */ + case OP_RECURSE: cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1); - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ - had_recurse = TRUE; + recno = GET2(cs, 1+LINK_SIZE); + if (recno == prev_recurse_recno) + { + branchlength += prev_recurse_d; + } else { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ had_recurse = TRUE; else { - this_recurse.prev = recurses; - this_recurse.group = cs; - branchlength += find_minlength(re, cs, startcode, utf, &this_recurse); + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + had_recurse = TRUE; + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (prev_recurse_d < 0) return prev_recurse_d; + prev_recurse_recno = recno; + branchlength += prev_recurse_d; + } } } - cc += 1 + LINK_SIZE; + cc += 1 + LINK_SIZE + once_fudge; + once_fudge = 0; break; /* Anything else does not or need not match a character. 
We can get the @@ -708,7 +834,7 @@ Returns: nothing static void set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) { -register uint32_t c; +uint32_t c; for (c = 0; c < table_limit; c++) re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type]; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 @@ -749,7 +875,7 @@ Returns: nothing static void set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit) { -register uint32_t c; +uint32_t c; for (c = 0; c < table_limit; c++) re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 @@ -789,7 +915,7 @@ Returns: SSB_FAIL => Failed to find any starting code units static int set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf) { -register uint32_t c; +uint32_t c; int yield = SSB_DONE; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 @@ -1368,7 +1494,7 @@ do for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c]; for (c = 128; c < 256; c++) { - if ((classmap[c/8] && (1 << (c&7))) != 0) + if ((classmap[c/8] & (1 << (c&7))) != 0) { int d = (c >> 6) | 0xc0; /* Set bit for this starter */ re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */ @@ -1441,6 +1567,7 @@ int PRIV(study)(pcre2_real_code *re) { int min; +int count = 0; PCRE2_UCHAR *code; BOOL utf = (re->overall_options & PCRE2_UTF) != 0; @@ -1461,22 +1588,35 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 && if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; } -/* Find the minimum length of subject string. */ +/* Find the minimum length of subject string. If the pattern can match an empty +string, the minimum length is already known. If there are more back references +than the size of the vector we are going to cache them in, do nothing. A +pattern that complicated will probably take a long time to analyze and may in +any case turn out to be too complicated. 
Note that back reference minima are +held as 16-bit numbers. */ -switch(min = find_minlength(re, code, code, utf, NULL)) +if ((re->flags & PCRE2_MATCH_EMPTY) == 0 && + re->top_backref <= MAX_CACHE_BACKREF) { - case -1: /* \C in UTF mode or (*ACCEPT) */ - break; /* Leave minlength unchanged (will be zero) */ + int backref_cache[MAX_CACHE_BACKREF+1]; + backref_cache[0] = 0; /* Highest one that is set */ + min = find_minlength(re, code, code, utf, NULL, &count, backref_cache); + switch(min) + { + case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ + break; /* Leave minlength unchanged (will be zero) */ - case -2: - return 2; /* missing capturing bracket */ + case -2: + return 2; /* missing capturing bracket */ - case -3: - return 3; /* unrecognized opcode */ + case -3: + return 3; /* unrecognized opcode */ - default: - re->minlength = min; - break; + default: + if (min > UINT16_MAX) min = UINT16_MAX; + re->minlength = min; + break; + } } return 0; diff --git a/pcre2/src/pcre2_substitute.c b/pcre2/src/pcre2_substitute.c index ec00ebb86..8da951fc6 100644 --- a/pcre2/src/pcre2_substitute.c +++ b/pcre2/src/pcre2_substitute.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -45,6 +45,123 @@ POSSIBILITY OF SUCH DAMAGE. 
#include "pcre2_internal.h" +#define PTR_STACK_SIZE 20 + +#define SUBSTITUTE_OPTIONS \ + (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \ + PCRE2_SUBSTITUTE_UNSET_EMPTY) + + + +/************************************************* +* Find end of substitute text * +*************************************************/ + +/* In extended mode, we recognize ${name:+set text:unset text} and similar +constructions. This requires the identification of unescaped : and } +characters. This function scans for such. It must deal with nested ${ +constructions. The pointer to the text is updated, either to the required end +character, or to where an error was detected. + +Arguments: + code points to the compiled expression (for options) + ptrptr points to the pointer to the start of the text (updated) + ptrend end of the whole string + last TRUE if the last expected string (only } recognized) + +Returns: 0 on success + negative error code on failure +*/ + +static int +find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, + BOOL last) +{ +int rc = 0; +uint32_t nestlevel = 0; +BOOL literal = FALSE; +PCRE2_SPTR ptr = *ptrptr; + +for (; ptr < ptrend; ptr++) + { + if (literal) + { + if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) + { + literal = FALSE; + ptr += 1; + } + } + + else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) + { + if (nestlevel == 0) goto EXIT; + nestlevel--; + } + + else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; + + else if (*ptr == CHAR_DOLLAR_SIGN) + { + if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) + { + nestlevel++; + ptr += 1; + } + } + + else if (*ptr == CHAR_BACKSLASH) + { + int erc; + int errorcode; + uint32_t ch; + + if (ptr < ptrend - 1) switch (ptr[1]) + { + case CHAR_L: + case CHAR_l: + case CHAR_U: + case CHAR_u: + ptr += 1; + continue; + } + + ptr += 1; /* Must point after \ */ + erc = PRIV(check_escape)(&ptr, 
ptrend, &ch, &errorcode, + code->overall_options, FALSE, NULL); + ptr -= 1; /* Back to last code unit of escape */ + if (errorcode != 0) + { + rc = errorcode; + goto EXIT; + } + + switch(erc) + { + case 0: /* Data character */ + case ESC_E: /* Isolated \E is ignored */ + break; + + case ESC_Q: + literal = TRUE; + break; + + default: + rc = PCRE2_ERROR_BADREPESCAPE; + goto EXIT; + } + } + } + +rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ + +EXIT: +*ptrptr = ptr; +return rc; +} + + /************************************************* * Match and substitute * @@ -72,6 +189,30 @@ Returns: >= 0 number of substitutions made PCRE2_ERROR_BADREPLACEMENT means invalid use of $ */ +/* This macro checks for space in the buffer before copying into it. On +overflow, either give an error immediately, or keep on, accumulating the +length. */ + +#define CHECKMEMCPY(from,length) \ + if (!overflowed && lengthleft < length) \ + { \ + if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ + overflowed = TRUE; \ + extra_needed = length - lengthleft; \ + } \ + else if (overflowed) \ + { \ + extra_needed += length; \ + } \ + else \ + { \ + memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ + buff_offset += length; \ + lengthleft -= length; \ + } + +/* Here's the function */ + PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, @@ -80,13 +221,28 @@ pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, { int rc; int subs; +int forcecase = 0; +int forcecasereset = 0; uint32_t ovector_count; uint32_t goptions = 0; +uint32_t suboptions; BOOL match_data_created = FALSE; -BOOL global = FALSE; -PCRE2_SIZE buff_offset, lengthleft, fraglength; +BOOL literal = FALSE; +BOOL overflowed = FALSE; +#ifdef SUPPORT_UNICODE +BOOL utf = (code->overall_options & PCRE2_UTF) != 0; +#endif +PCRE2_UCHAR temp[6]; 
+PCRE2_SPTR ptr; +PCRE2_SPTR repend; +PCRE2_SIZE extra_needed = 0; +PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; PCRE2_SIZE *ovector; +buff_offset = 0; +lengthleft = buff_length = *blength; +*blength = PCRE2_UNSET; + /* Partial matching is not valid. */ if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) @@ -106,11 +262,16 @@ if (match_data == NULL) ovector = pcre2_get_ovector_pointer(match_data); ovector_count = pcre2_get_ovector_count(match_data); +/* Find lengths of zero-terminated strings and the end of the replacement. */ + +if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); +if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); +repend = replacement + rlength; + /* Check UTF replacement string if necessary. */ #ifdef SUPPORT_UNICODE -if ((code->overall_options & PCRE2_UTF) != 0 && - (options & PCRE2_NO_UTF_CHECK) == 0) +if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar)); if (rc != 0) @@ -121,37 +282,36 @@ if ((code->overall_options & PCRE2_UTF) != 0 && } #endif /* SUPPORT_UNICODE */ -/* Notice the global option and remove it from the options that are passed to -pcre2_match(). */ +/* Save the substitute options and remove them from the match options. */ -if ((options & PCRE2_SUBSTITUTE_GLOBAL) != 0) - { - options &= ~PCRE2_SUBSTITUTE_GLOBAL; - global = TRUE; - } - -/* Find lengths of zero-terminated strings. 
*/ - -if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); -if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); +suboptions = options & SUBSTITUTE_OPTIONS; +options &= ~SUBSTITUTE_OPTIONS; /* Copy up to the start offset */ -if (start_offset > *blength) goto NOROOM; -memcpy(buffer, subject, start_offset * (PCRE2_CODE_UNIT_WIDTH/8)); -buff_offset = start_offset; -lengthleft = *blength - start_offset; +if (start_offset > length) + { + match_data->leftchar = 0; + rc = PCRE2_ERROR_BADOFFSET; + goto EXIT; + } +CHECKMEMCPY(subject, start_offset); /* Loop for global substituting. */ subs = 0; do { - PCRE2_SIZE i; + PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; + uint32_t ptrstackptr = 0; rc = pcre2_match(code, subject, length, start_offset, options|goptions, match_data, mcontext); +#ifdef SUPPORT_UNICODE + if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ +#endif + /* Any error other than no match returns the error code. No match when not doing the special after-empty-match global rematch, or when at the end of the subject, breaks the global loop. Otherwise, advance the starting point by one @@ -164,8 +324,22 @@ do if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; if (goptions == 0 || start_offset >= length) break; + /* Advance by one code point. Then, if CRLF is a valid newline sequence and + we have advanced into the middle of it, advance one more code point. In + other words, do not start in the middle of CRLF, even if CR and LF on their + own are valid newlines. */ + save_start = start_offset++; - if ((code->overall_options & PCRE2_UTF) != 0) + if (subject[start_offset-1] == CHAR_CR && + code->newline_convention != PCRE2_NEWLINE_CR && + code->newline_convention != PCRE2_NEWLINE_LF && + start_offset < length && + subject[start_offset] == CHAR_LF) + start_offset++; + + /* Otherwise, in UTF mode, advance past any secondary code points. 
*/ + + else if ((code->overall_options & PCRE2_UTF) != 0) { #if PCRE2_CODE_UNIT_WIDTH == 8 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) @@ -177,60 +351,138 @@ do #endif } - fraglength = start_offset - save_start; - if (lengthleft < fraglength) goto NOROOM; - memcpy(buffer + buff_offset, subject + save_start, - fraglength*(PCRE2_CODE_UNIT_WIDTH/8)); - buff_offset += fraglength; - lengthleft -= fraglength; + /* Copy what we have advanced past, reset the special global options, and + continue to the next match. */ + fraglength = start_offset - save_start; + CHECKMEMCPY(subject + save_start, fraglength); goptions = 0; continue; } - /* Handle a successful match. */ + /* Handle a successful match. Matches that use \K to end before they start + are not supported. */ + if (ovector[1] < ovector[0]) + { + rc = PCRE2_ERROR_BADSUBSPATTERN; + goto EXIT; + } + + /* Count substitutions with a paranoid check for integer overflow; surely no + real call to this function would ever hit this! */ + + if (subs == INT_MAX) + { + rc = PCRE2_ERROR_TOOMANYREPLACE; + goto EXIT; + } subs++; + + /* Copy the text leading up to the match. */ + if (rc == 0) rc = ovector_count; fraglength = ovector[0] - start_offset; - if (fraglength >= lengthleft) goto NOROOM; - memcpy(buffer + buff_offset, subject + start_offset, - fraglength*(PCRE2_CODE_UNIT_WIDTH/8)); - buff_offset += fraglength; - lengthleft -= fraglength; + CHECKMEMCPY(subject + start_offset, fraglength); - for (i = 0; i < rlength; i++) + /* Process the replacement string. Literal mode is set by \Q, but only in + extended mode when backslashes are being interpreted. In extended mode we + must handle nested substrings that are to be reprocessed. */ + + ptr = replacement; + for (;;) { - if (replacement[i] == CHAR_DOLLAR_SIGN) + uint32_t ch; + unsigned int chlen; + + /* If at the end of a nested substring, pop the stack. 
*/ + + if (ptr >= repend) + { + if (ptrstackptr <= 0) break; /* End of replacement string */ + repend = ptrstack[--ptrstackptr]; + ptr = ptrstack[--ptrstackptr]; + continue; + } + + /* Handle the next character */ + + if (literal) + { + if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) + { + literal = FALSE; + ptr += 2; + continue; + } + goto LOADLITERAL; + } + + /* Not in literal mode. */ + + if (*ptr == CHAR_DOLLAR_SIGN) { int group, n; + uint32_t special = 0; BOOL inparens; + BOOL star; PCRE2_SIZE sublength; + PCRE2_SPTR text1_start = NULL; + PCRE2_SPTR text1_end = NULL; + PCRE2_SPTR text2_start = NULL; + PCRE2_SPTR text2_end = NULL; PCRE2_UCHAR next; PCRE2_UCHAR name[33]; - if (++i == rlength) goto BAD; - if ((next = replacement[i]) == CHAR_DOLLAR_SIGN) goto LITERAL; + if (++ptr >= repend) goto BAD; + if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; group = -1; n = 0; inparens = FALSE; + star = FALSE; if (next == CHAR_LEFT_CURLY_BRACKET) { - if (++i == rlength) goto BAD; - next = replacement[i]; + if (++ptr >= repend) goto BAD; + next = *ptr; inparens = TRUE; } - if (next >= CHAR_0 && next <= CHAR_9) + if (next == CHAR_ASTERISK) + { + if (++ptr >= repend) goto BAD; + next = *ptr; + star = TRUE; + } + + if (!star && next >= CHAR_0 && next <= CHAR_9) { group = next - CHAR_0; - while (++i < rlength) + while (++ptr < repend) { - next = replacement[i]; + next = *ptr; if (next < CHAR_0 || next > CHAR_9) break; group = group * 10 + next - CHAR_0; + + /* A check for a number greater than the highest captured group + is sufficient here; no need for a separate overflow check. If unknown + groups are to be treated as unset, just skip over any remaining + digits and carry on. 
*/ + + if (group > code->top_bracket) + { + if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); + break; + } + else + { + rc = PCRE2_ERROR_NOSUBSTRING; + goto PTREXIT; + } + } } } else @@ -240,43 +492,312 @@ do { name[n++] = next; if (n > 32) goto BAD; - if (i == rlength) break; - next = replacement[++i]; + if (++ptr >= repend) break; + next = *ptr; } if (n == 0) goto BAD; name[n] = 0; } + /* In extended mode we recognize ${name:+set text:unset text} and + ${name:-default text}. */ + if (inparens) { - if (i == rlength || next != CHAR_RIGHT_CURLY_BRACKET) goto BAD; + if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && + !star && ptr < repend - 2 && next == CHAR_COLON) + { + special = *(++ptr); + if (special != CHAR_PLUS && special != CHAR_MINUS) + { + rc = PCRE2_ERROR_BADSUBSTITUTION; + goto PTREXIT; + } + + text1_start = ++ptr; + rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); + if (rc != 0) goto PTREXIT; + text1_end = ptr; + + if (special == CHAR_PLUS && *ptr == CHAR_COLON) + { + text2_start = ++ptr; + rc = find_text_end(code, &ptr, repend, TRUE); + if (rc != 0) goto PTREXIT; + text2_end = ptr; + } + } + + else + { + if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) + { + rc = PCRE2_ERROR_REPMISSINGBRACE; + goto PTREXIT; + } + } + + ptr++; } - else i--; /* Last code unit of name/number */ - /* Have found a syntactically correct group number or name. */ + /* Have found a syntactically correct group number or name, or *name. + Only *MARK is currently recognized. */ + + if (star) + { + if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) + { + PCRE2_SPTR mark = pcre2_get_mark(match_data); + if (mark != NULL) + { + PCRE2_SPTR mark_start = mark; + while (*mark != 0) mark++; + fraglength = mark - mark_start; + CHECKMEMCPY(mark_start, fraglength); + } + } + else goto BAD; + } + + /* Substitute the contents of a group. 
We don't use substring_copy + functions any more, in order to support case forcing. */ - sublength = lengthleft; - if (group < 0) - rc = pcre2_substring_copy_byname(match_data, name, - buffer + buff_offset, &sublength); else - rc = pcre2_substring_copy_bynumber(match_data, group, - buffer + buff_offset, &sublength); + { + PCRE2_SPTR subptr, subptrend; - if (rc < 0) goto EXIT; - buff_offset += sublength; - lengthleft -= sublength; + /* Find a number for a named group. In case there are duplicate names, + search for the first one that is set. If the name is not found when + PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a + non-existent group. */ + + if (group < 0) + { + PCRE2_SPTR first, last, entry; + rc = pcre2_substring_nametable_scan(code, name, &first, &last); + if (rc == PCRE2_ERROR_NOSUBSTRING && + (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + group = code->top_bracket + 1; + } + else + { + if (rc < 0) goto PTREXIT; + for (entry = first; entry <= last; entry += rc) + { + uint32_t ng = GET2(entry, 0); + if (ng < ovector_count) + { + if (group < 0) group = ng; /* First in ovector */ + if (ovector[ng*2] != PCRE2_UNSET) + { + group = ng; /* First that is set */ + break; + } + } + } + + /* If group is still negative, it means we did not find a group + that is in the ovector. Just set the first group. */ + + if (group < 0) group = GET2(first, 0); + } + } + + /* We now have a group that is identified by number. Find the length of + the captured string. If a group in a non-special substitution is unset + when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. 
*/ + + rc = pcre2_substring_length_bynumber(match_data, group, &sublength); + if (rc < 0) + { + if (rc == PCRE2_ERROR_NOSUBSTRING && + (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) + { + rc = PCRE2_ERROR_UNSET; + } + if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ + if (special == 0) /* Plain substitution */ + { + if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; + goto PTREXIT; /* Else error */ + } + } + + /* If special is '+' we have a 'set' and possibly an 'unset' text, + both of which are reprocessed when used. If special is '-' we have a + default text for when the group is unset; it must be reprocessed. */ + + if (special != 0) + { + if (special == CHAR_MINUS) + { + if (rc == 0) goto LITERAL_SUBSTITUTE; + text2_start = text1_start; + text2_end = text1_end; + } + + if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; + ptrstack[ptrstackptr++] = ptr; + ptrstack[ptrstackptr++] = repend; + + if (rc == 0) + { + ptr = text1_start; + repend = text1_end; + } + else + { + ptr = text2_start; + repend = text2_end; + } + continue; + } + + /* Otherwise we have a literal substitution of a group's contents. */ + + LITERAL_SUBSTITUTE: + subptr = subject + ovector[group*2]; + subptrend = subject + ovector[group*2 + 1]; + + /* Substitute a literal string, possibly forcing alphabetic case. */ + + while (subptr < subptrend) + { + GETCHARINCTEST(ch, subptr); + if (forcecase != 0) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t type = UCD_CHARTYPE(ch); + if (PRIV(ucp_gentype)[type] == ucp_L && + type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) + ch = UCD_OTHERCASE(ch); + } + else +#endif + { + if (((code->tables + cbits_offset + + ((forcecase > 0)? 
cbit_upper:cbit_lower) + )[ch/8] & (1 << (ch%8))) == 0) + ch = (code->tables + fcc_offset)[ch]; + } + forcecase = forcecasereset; + } + +#ifdef SUPPORT_UNICODE + if (utf) chlen = PRIV(ord2utf)(ch, temp); else +#endif + { + temp[0] = ch; + chlen = 1; + } + CHECKMEMCPY(temp, chlen); + } + } } - /* Handle a literal code unit */ + /* Handle an escape sequence in extended mode. We can use check_escape() + to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but + the case-forcing escapes are not supported in pcre2_compile() so must be + recognized here. */ - else + else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && + *ptr == CHAR_BACKSLASH) { - LITERAL: - if (lengthleft-- < 1) goto NOROOM; - buffer[buff_offset++] = replacement[i]; + int errorcode; + + if (ptr < repend - 1) switch (ptr[1]) + { + case CHAR_L: + forcecase = forcecasereset = -1; + ptr += 2; + continue; + + case CHAR_l: + forcecase = -1; + forcecasereset = 0; + ptr += 2; + continue; + + case CHAR_U: + forcecase = forcecasereset = 1; + ptr += 2; + continue; + + case CHAR_u: + forcecase = 1; + forcecasereset = 0; + ptr += 2; + continue; + + default: + break; + } + + ptr++; /* Point after \ */ + rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, + code->overall_options, FALSE, NULL); + if (errorcode != 0) goto BADESCAPE; + + switch(rc) + { + case ESC_E: + forcecase = forcecasereset = 0; + continue; + + case ESC_Q: + literal = TRUE; + continue; + + case 0: /* Data character */ + goto LITERAL; + + default: + goto BADESCAPE; + } } - } + + /* Handle a literal code unit */ + + else + { + LOADLITERAL: + GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ + + LITERAL: + if (forcecase != 0) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + uint32_t type = UCD_CHARTYPE(ch); + if (PRIV(ucp_gentype)[type] == ucp_L && + type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) + ch = UCD_OTHERCASE(ch); + } + else +#endif + { + if (((code->tables + cbits_offset + + ((forcecase > 0)? 
cbit_upper:cbit_lower) + )[ch/8] & (1 << (ch%8))) == 0) + ch = (code->tables + fcc_offset)[ch]; + } + forcecase = forcecasereset; + } + +#ifdef SUPPORT_UNICODE + if (utf) chlen = PRIV(ord2utf)(ch, temp); else +#endif + { + temp[0] = ch; + chlen = 1; + } + CHECKMEMCPY(temp, chlen); + } /* End handling a literal code unit */ + } /* End of loop for scanning the replacement. */ /* The replacement has been copied to the output. Update the start offset to point to the rest of the subject string. If we matched an empty string, @@ -285,18 +806,33 @@ do start_offset = ovector[1]; goptions = (ovector[0] != ovector[1])? 0 : PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; - } while (global); /* Repeat "do" loop */ + } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ -/* Copy the rest of the subject and return the number of substitutions. */ +/* Copy the rest of the subject. */ -rc = subs; fraglength = length - start_offset; -if (fraglength + 1 > lengthleft) goto NOROOM; -memcpy(buffer + buff_offset, subject + start_offset, - fraglength*(PCRE2_CODE_UNIT_WIDTH/8)); -buff_offset += fraglength; -buffer[buff_offset] = 0; -*blength = buff_offset; +CHECKMEMCPY(subject + start_offset, fraglength); +temp[0] = 0; +CHECKMEMCPY(temp , 1); + +/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, +and matching has carried on after a full buffer, in order to compute the length +needed. Otherwise, an overflow generates an immediate error return. */ + +if (overflowed) + { + rc = PCRE2_ERROR_NOMEMORY; + *blength = buff_length + extra_needed; + } + +/* After a successful execution, return the number of substitutions and set the +length of buffer used, excluding the trailing zero. 
*/ + +else + { + rc = subs; + *blength = buff_offset - 1; + } EXIT: if (match_data_created) pcre2_match_data_free(match_data); @@ -309,6 +845,13 @@ goto EXIT; BAD: rc = PCRE2_ERROR_BADREPLACEMENT; +goto PTREXIT; + +BADESCAPE: +rc = PCRE2_ERROR_BADREPESCAPE; + +PTREXIT: +*blength = (PCRE2_SIZE)(ptr - replacement); goto EXIT; } diff --git a/pcre2/src/pcre2_substring.c b/pcre2/src/pcre2_substring.c index eb72ad7d0..f6d7c3972 100644 --- a/pcre2/src/pcre2_substring.c +++ b/pcre2/src/pcre2_substring.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -240,8 +240,11 @@ Returns: nothing PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION pcre2_substring_free(PCRE2_UCHAR *string) { -pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl)); -memctl->free(memctl, memctl->memory_data); +if (string != NULL) + { + pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl)); + memctl->free(memctl, memctl->memory_data); + } } @@ -436,8 +439,11 @@ Returns: nothing PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION pcre2_substring_list_free(PCRE2_SPTR *list) { -pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl)); -memctl->free(memctl, memctl->memory_data); +if (list != NULL) + { + pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl)); + memctl->free(memctl, memctl->memory_data); + } } diff --git a/pcre2/src/pcre2_tables.c b/pcre2/src/pcre2_tables.c index 17e4537d4..b945ed7a7 100644 --- a/pcre2/src/pcre2_tables.c +++ b/pcre2/src/pcre2_tables.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -227,6 +227,8 @@ version. Like all other character and string literals that are compared against the regular expression pattern, we must use STR_ macros instead of literal strings to make sure that UTF-8 support works on EBCDIC platforms. */ +#define STRING_Ahom0 STR_A STR_h STR_o STR_m "\0" +#define STRING_Anatolian_Hieroglyphs0 STR_A STR_n STR_a STR_t STR_o STR_l STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0" #define STRING_Any0 STR_A STR_n STR_y "\0" #define STRING_Arabic0 STR_A STR_r STR_a STR_b STR_i STR_c "\0" #define STRING_Armenian0 STR_A STR_r STR_m STR_e STR_n STR_i STR_a STR_n "\0" @@ -274,6 +276,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Han0 STR_H STR_a STR_n "\0" #define STRING_Hangul0 STR_H STR_a STR_n STR_g STR_u STR_l "\0" #define STRING_Hanunoo0 STR_H STR_a STR_n STR_u STR_n STR_o STR_o "\0" +#define STRING_Hatran0 STR_H STR_a STR_t STR_r STR_a STR_n "\0" #define STRING_Hebrew0 STR_H STR_e STR_b STR_r STR_e STR_w "\0" #define STRING_Hiragana0 STR_H STR_i STR_r STR_a STR_g STR_a STR_n STR_a "\0" #define STRING_Imperial_Aramaic0 STR_I STR_m STR_p STR_e STR_r STR_i STR_a STR_l STR_UNDERSCORE STR_A STR_r STR_a STR_m STR_a STR_i STR_c "\0" @@ -321,6 +324,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. 
*/ #define STRING_Modi0 STR_M STR_o STR_d STR_i "\0" #define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0" #define STRING_Mro0 STR_M STR_r STR_o "\0" +#define STRING_Multani0 STR_M STR_u STR_l STR_t STR_a STR_n STR_i "\0" #define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0" #define STRING_N0 STR_N "\0" #define STRING_Nabataean0 STR_N STR_a STR_b STR_a STR_t STR_a STR_e STR_a STR_n "\0" @@ -331,6 +335,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_No0 STR_N STR_o "\0" #define STRING_Ogham0 STR_O STR_g STR_h STR_a STR_m "\0" #define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0" +#define STRING_Old_Hungarian0 STR_O STR_l STR_d STR_UNDERSCORE STR_H STR_u STR_n STR_g STR_a STR_r STR_i STR_a STR_n "\0" #define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0" #define STRING_Old_North_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_N STR_o STR_r STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0" #define STRING_Old_Permic0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_m STR_i STR_c "\0" @@ -362,6 +367,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Sharada0 STR_S STR_h STR_a STR_r STR_a STR_d STR_a "\0" #define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0" #define STRING_Siddham0 STR_S STR_i STR_d STR_d STR_h STR_a STR_m "\0" +#define STRING_SignWriting0 STR_S STR_i STR_g STR_n STR_W STR_r STR_i STR_t STR_i STR_n STR_g "\0" #define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0" #define STRING_Sk0 STR_S STR_k "\0" #define STRING_Sm0 STR_S STR_m "\0" @@ -398,6 +404,8 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. 
*/ #define STRING_Zs0 STR_Z STR_s "\0" const char PRIV(utt_names)[] = + STRING_Ahom0 + STRING_Anatolian_Hieroglyphs0 STRING_Any0 STRING_Arabic0 STRING_Armenian0 @@ -445,6 +453,7 @@ const char PRIV(utt_names)[] = STRING_Han0 STRING_Hangul0 STRING_Hanunoo0 + STRING_Hatran0 STRING_Hebrew0 STRING_Hiragana0 STRING_Imperial_Aramaic0 @@ -492,6 +501,7 @@ const char PRIV(utt_names)[] = STRING_Modi0 STRING_Mongolian0 STRING_Mro0 + STRING_Multani0 STRING_Myanmar0 STRING_N0 STRING_Nabataean0 @@ -502,6 +512,7 @@ const char PRIV(utt_names)[] = STRING_No0 STRING_Ogham0 STRING_Ol_Chiki0 + STRING_Old_Hungarian0 STRING_Old_Italic0 STRING_Old_North_Arabian0 STRING_Old_Permic0 @@ -533,6 +544,7 @@ const char PRIV(utt_names)[] = STRING_Sharada0 STRING_Shavian0 STRING_Siddham0 + STRING_SignWriting0 STRING_Sinhala0 STRING_Sk0 STRING_Sm0 @@ -569,175 +581,181 @@ const char PRIV(utt_names)[] = STRING_Zs0; const ucp_type_table PRIV(utt)[] = { - { 0, PT_ANY, 0 }, - { 4, PT_SC, ucp_Arabic }, - { 11, PT_SC, ucp_Armenian }, - { 20, PT_SC, ucp_Avestan }, - { 28, PT_SC, ucp_Balinese }, - { 37, PT_SC, ucp_Bamum }, - { 43, PT_SC, ucp_Bassa_Vah }, - { 53, PT_SC, ucp_Batak }, - { 59, PT_SC, ucp_Bengali }, - { 67, PT_SC, ucp_Bopomofo }, - { 76, PT_SC, ucp_Brahmi }, - { 83, PT_SC, ucp_Braille }, - { 91, PT_SC, ucp_Buginese }, - { 100, PT_SC, ucp_Buhid }, - { 106, PT_GC, ucp_C }, - { 108, PT_SC, ucp_Canadian_Aboriginal }, - { 128, PT_SC, ucp_Carian }, - { 135, PT_SC, ucp_Caucasian_Albanian }, - { 154, PT_PC, ucp_Cc }, - { 157, PT_PC, ucp_Cf }, - { 160, PT_SC, ucp_Chakma }, - { 167, PT_SC, ucp_Cham }, - { 172, PT_SC, ucp_Cherokee }, - { 181, PT_PC, ucp_Cn }, - { 184, PT_PC, ucp_Co }, - { 187, PT_SC, ucp_Common }, - { 194, PT_SC, ucp_Coptic }, - { 201, PT_PC, ucp_Cs }, - { 204, PT_SC, ucp_Cuneiform }, - { 214, PT_SC, ucp_Cypriot }, - { 222, PT_SC, ucp_Cyrillic }, - { 231, PT_SC, ucp_Deseret }, - { 239, PT_SC, ucp_Devanagari }, - { 250, PT_SC, ucp_Duployan }, - { 259, PT_SC, ucp_Egyptian_Hieroglyphs }, - { 
280, PT_SC, ucp_Elbasan }, - { 288, PT_SC, ucp_Ethiopic }, - { 297, PT_SC, ucp_Georgian }, - { 306, PT_SC, ucp_Glagolitic }, - { 317, PT_SC, ucp_Gothic }, - { 324, PT_SC, ucp_Grantha }, - { 332, PT_SC, ucp_Greek }, - { 338, PT_SC, ucp_Gujarati }, - { 347, PT_SC, ucp_Gurmukhi }, - { 356, PT_SC, ucp_Han }, - { 360, PT_SC, ucp_Hangul }, - { 367, PT_SC, ucp_Hanunoo }, - { 375, PT_SC, ucp_Hebrew }, - { 382, PT_SC, ucp_Hiragana }, - { 391, PT_SC, ucp_Imperial_Aramaic }, - { 408, PT_SC, ucp_Inherited }, - { 418, PT_SC, ucp_Inscriptional_Pahlavi }, - { 440, PT_SC, ucp_Inscriptional_Parthian }, - { 463, PT_SC, ucp_Javanese }, - { 472, PT_SC, ucp_Kaithi }, - { 479, PT_SC, ucp_Kannada }, - { 487, PT_SC, ucp_Katakana }, - { 496, PT_SC, ucp_Kayah_Li }, - { 505, PT_SC, ucp_Kharoshthi }, - { 516, PT_SC, ucp_Khmer }, - { 522, PT_SC, ucp_Khojki }, - { 529, PT_SC, ucp_Khudawadi }, - { 539, PT_GC, ucp_L }, - { 541, PT_LAMP, 0 }, - { 544, PT_SC, ucp_Lao }, - { 548, PT_SC, ucp_Latin }, - { 554, PT_SC, ucp_Lepcha }, - { 561, PT_SC, ucp_Limbu }, - { 567, PT_SC, ucp_Linear_A }, - { 576, PT_SC, ucp_Linear_B }, - { 585, PT_SC, ucp_Lisu }, - { 590, PT_PC, ucp_Ll }, - { 593, PT_PC, ucp_Lm }, - { 596, PT_PC, ucp_Lo }, - { 599, PT_PC, ucp_Lt }, - { 602, PT_PC, ucp_Lu }, - { 605, PT_SC, ucp_Lycian }, - { 612, PT_SC, ucp_Lydian }, - { 619, PT_GC, ucp_M }, - { 621, PT_SC, ucp_Mahajani }, - { 630, PT_SC, ucp_Malayalam }, - { 640, PT_SC, ucp_Mandaic }, - { 648, PT_SC, ucp_Manichaean }, - { 659, PT_PC, ucp_Mc }, - { 662, PT_PC, ucp_Me }, - { 665, PT_SC, ucp_Meetei_Mayek }, - { 678, PT_SC, ucp_Mende_Kikakui }, - { 692, PT_SC, ucp_Meroitic_Cursive }, - { 709, PT_SC, ucp_Meroitic_Hieroglyphs }, - { 730, PT_SC, ucp_Miao }, - { 735, PT_PC, ucp_Mn }, - { 738, PT_SC, ucp_Modi }, - { 743, PT_SC, ucp_Mongolian }, - { 753, PT_SC, ucp_Mro }, - { 757, PT_SC, ucp_Myanmar }, - { 765, PT_GC, ucp_N }, - { 767, PT_SC, ucp_Nabataean }, - { 777, PT_PC, ucp_Nd }, - { 780, PT_SC, ucp_New_Tai_Lue }, - { 792, PT_SC, 
ucp_Nko }, - { 796, PT_PC, ucp_Nl }, - { 799, PT_PC, ucp_No }, - { 802, PT_SC, ucp_Ogham }, - { 808, PT_SC, ucp_Ol_Chiki }, - { 817, PT_SC, ucp_Old_Italic }, - { 828, PT_SC, ucp_Old_North_Arabian }, - { 846, PT_SC, ucp_Old_Permic }, - { 857, PT_SC, ucp_Old_Persian }, - { 869, PT_SC, ucp_Old_South_Arabian }, - { 887, PT_SC, ucp_Old_Turkic }, - { 898, PT_SC, ucp_Oriya }, - { 904, PT_SC, ucp_Osmanya }, - { 912, PT_GC, ucp_P }, - { 914, PT_SC, ucp_Pahawh_Hmong }, - { 927, PT_SC, ucp_Palmyrene }, - { 937, PT_SC, ucp_Pau_Cin_Hau }, - { 949, PT_PC, ucp_Pc }, - { 952, PT_PC, ucp_Pd }, - { 955, PT_PC, ucp_Pe }, - { 958, PT_PC, ucp_Pf }, - { 961, PT_SC, ucp_Phags_Pa }, - { 970, PT_SC, ucp_Phoenician }, - { 981, PT_PC, ucp_Pi }, - { 984, PT_PC, ucp_Po }, - { 987, PT_PC, ucp_Ps }, - { 990, PT_SC, ucp_Psalter_Pahlavi }, - { 1006, PT_SC, ucp_Rejang }, - { 1013, PT_SC, ucp_Runic }, - { 1019, PT_GC, ucp_S }, - { 1021, PT_SC, ucp_Samaritan }, - { 1031, PT_SC, ucp_Saurashtra }, - { 1042, PT_PC, ucp_Sc }, - { 1045, PT_SC, ucp_Sharada }, - { 1053, PT_SC, ucp_Shavian }, - { 1061, PT_SC, ucp_Siddham }, - { 1069, PT_SC, ucp_Sinhala }, - { 1077, PT_PC, ucp_Sk }, - { 1080, PT_PC, ucp_Sm }, - { 1083, PT_PC, ucp_So }, - { 1086, PT_SC, ucp_Sora_Sompeng }, - { 1099, PT_SC, ucp_Sundanese }, - { 1109, PT_SC, ucp_Syloti_Nagri }, - { 1122, PT_SC, ucp_Syriac }, - { 1129, PT_SC, ucp_Tagalog }, - { 1137, PT_SC, ucp_Tagbanwa }, - { 1146, PT_SC, ucp_Tai_Le }, - { 1153, PT_SC, ucp_Tai_Tham }, - { 1162, PT_SC, ucp_Tai_Viet }, - { 1171, PT_SC, ucp_Takri }, - { 1177, PT_SC, ucp_Tamil }, - { 1183, PT_SC, ucp_Telugu }, - { 1190, PT_SC, ucp_Thaana }, - { 1197, PT_SC, ucp_Thai }, - { 1202, PT_SC, ucp_Tibetan }, - { 1210, PT_SC, ucp_Tifinagh }, - { 1219, PT_SC, ucp_Tirhuta }, - { 1227, PT_SC, ucp_Ugaritic }, - { 1236, PT_SC, ucp_Vai }, - { 1240, PT_SC, ucp_Warang_Citi }, - { 1252, PT_ALNUM, 0 }, - { 1256, PT_PXSPACE, 0 }, - { 1260, PT_SPACE, 0 }, - { 1264, PT_UCNC, 0 }, - { 1268, PT_WORD, 0 }, - { 1272, PT_SC, 
ucp_Yi }, - { 1275, PT_GC, ucp_Z }, - { 1277, PT_PC, ucp_Zl }, - { 1280, PT_PC, ucp_Zp }, - { 1283, PT_PC, ucp_Zs } + { 0, PT_SC, ucp_Ahom }, + { 5, PT_SC, ucp_Anatolian_Hieroglyphs }, + { 27, PT_ANY, 0 }, + { 31, PT_SC, ucp_Arabic }, + { 38, PT_SC, ucp_Armenian }, + { 47, PT_SC, ucp_Avestan }, + { 55, PT_SC, ucp_Balinese }, + { 64, PT_SC, ucp_Bamum }, + { 70, PT_SC, ucp_Bassa_Vah }, + { 80, PT_SC, ucp_Batak }, + { 86, PT_SC, ucp_Bengali }, + { 94, PT_SC, ucp_Bopomofo }, + { 103, PT_SC, ucp_Brahmi }, + { 110, PT_SC, ucp_Braille }, + { 118, PT_SC, ucp_Buginese }, + { 127, PT_SC, ucp_Buhid }, + { 133, PT_GC, ucp_C }, + { 135, PT_SC, ucp_Canadian_Aboriginal }, + { 155, PT_SC, ucp_Carian }, + { 162, PT_SC, ucp_Caucasian_Albanian }, + { 181, PT_PC, ucp_Cc }, + { 184, PT_PC, ucp_Cf }, + { 187, PT_SC, ucp_Chakma }, + { 194, PT_SC, ucp_Cham }, + { 199, PT_SC, ucp_Cherokee }, + { 208, PT_PC, ucp_Cn }, + { 211, PT_PC, ucp_Co }, + { 214, PT_SC, ucp_Common }, + { 221, PT_SC, ucp_Coptic }, + { 228, PT_PC, ucp_Cs }, + { 231, PT_SC, ucp_Cuneiform }, + { 241, PT_SC, ucp_Cypriot }, + { 249, PT_SC, ucp_Cyrillic }, + { 258, PT_SC, ucp_Deseret }, + { 266, PT_SC, ucp_Devanagari }, + { 277, PT_SC, ucp_Duployan }, + { 286, PT_SC, ucp_Egyptian_Hieroglyphs }, + { 307, PT_SC, ucp_Elbasan }, + { 315, PT_SC, ucp_Ethiopic }, + { 324, PT_SC, ucp_Georgian }, + { 333, PT_SC, ucp_Glagolitic }, + { 344, PT_SC, ucp_Gothic }, + { 351, PT_SC, ucp_Grantha }, + { 359, PT_SC, ucp_Greek }, + { 365, PT_SC, ucp_Gujarati }, + { 374, PT_SC, ucp_Gurmukhi }, + { 383, PT_SC, ucp_Han }, + { 387, PT_SC, ucp_Hangul }, + { 394, PT_SC, ucp_Hanunoo }, + { 402, PT_SC, ucp_Hatran }, + { 409, PT_SC, ucp_Hebrew }, + { 416, PT_SC, ucp_Hiragana }, + { 425, PT_SC, ucp_Imperial_Aramaic }, + { 442, PT_SC, ucp_Inherited }, + { 452, PT_SC, ucp_Inscriptional_Pahlavi }, + { 474, PT_SC, ucp_Inscriptional_Parthian }, + { 497, PT_SC, ucp_Javanese }, + { 506, PT_SC, ucp_Kaithi }, + { 513, PT_SC, ucp_Kannada }, + { 521, PT_SC, 
ucp_Katakana }, + { 530, PT_SC, ucp_Kayah_Li }, + { 539, PT_SC, ucp_Kharoshthi }, + { 550, PT_SC, ucp_Khmer }, + { 556, PT_SC, ucp_Khojki }, + { 563, PT_SC, ucp_Khudawadi }, + { 573, PT_GC, ucp_L }, + { 575, PT_LAMP, 0 }, + { 578, PT_SC, ucp_Lao }, + { 582, PT_SC, ucp_Latin }, + { 588, PT_SC, ucp_Lepcha }, + { 595, PT_SC, ucp_Limbu }, + { 601, PT_SC, ucp_Linear_A }, + { 610, PT_SC, ucp_Linear_B }, + { 619, PT_SC, ucp_Lisu }, + { 624, PT_PC, ucp_Ll }, + { 627, PT_PC, ucp_Lm }, + { 630, PT_PC, ucp_Lo }, + { 633, PT_PC, ucp_Lt }, + { 636, PT_PC, ucp_Lu }, + { 639, PT_SC, ucp_Lycian }, + { 646, PT_SC, ucp_Lydian }, + { 653, PT_GC, ucp_M }, + { 655, PT_SC, ucp_Mahajani }, + { 664, PT_SC, ucp_Malayalam }, + { 674, PT_SC, ucp_Mandaic }, + { 682, PT_SC, ucp_Manichaean }, + { 693, PT_PC, ucp_Mc }, + { 696, PT_PC, ucp_Me }, + { 699, PT_SC, ucp_Meetei_Mayek }, + { 712, PT_SC, ucp_Mende_Kikakui }, + { 726, PT_SC, ucp_Meroitic_Cursive }, + { 743, PT_SC, ucp_Meroitic_Hieroglyphs }, + { 764, PT_SC, ucp_Miao }, + { 769, PT_PC, ucp_Mn }, + { 772, PT_SC, ucp_Modi }, + { 777, PT_SC, ucp_Mongolian }, + { 787, PT_SC, ucp_Mro }, + { 791, PT_SC, ucp_Multani }, + { 799, PT_SC, ucp_Myanmar }, + { 807, PT_GC, ucp_N }, + { 809, PT_SC, ucp_Nabataean }, + { 819, PT_PC, ucp_Nd }, + { 822, PT_SC, ucp_New_Tai_Lue }, + { 834, PT_SC, ucp_Nko }, + { 838, PT_PC, ucp_Nl }, + { 841, PT_PC, ucp_No }, + { 844, PT_SC, ucp_Ogham }, + { 850, PT_SC, ucp_Ol_Chiki }, + { 859, PT_SC, ucp_Old_Hungarian }, + { 873, PT_SC, ucp_Old_Italic }, + { 884, PT_SC, ucp_Old_North_Arabian }, + { 902, PT_SC, ucp_Old_Permic }, + { 913, PT_SC, ucp_Old_Persian }, + { 925, PT_SC, ucp_Old_South_Arabian }, + { 943, PT_SC, ucp_Old_Turkic }, + { 954, PT_SC, ucp_Oriya }, + { 960, PT_SC, ucp_Osmanya }, + { 968, PT_GC, ucp_P }, + { 970, PT_SC, ucp_Pahawh_Hmong }, + { 983, PT_SC, ucp_Palmyrene }, + { 993, PT_SC, ucp_Pau_Cin_Hau }, + { 1005, PT_PC, ucp_Pc }, + { 1008, PT_PC, ucp_Pd }, + { 1011, PT_PC, ucp_Pe }, + { 1014, PT_PC, ucp_Pf }, 
+ { 1017, PT_SC, ucp_Phags_Pa }, + { 1026, PT_SC, ucp_Phoenician }, + { 1037, PT_PC, ucp_Pi }, + { 1040, PT_PC, ucp_Po }, + { 1043, PT_PC, ucp_Ps }, + { 1046, PT_SC, ucp_Psalter_Pahlavi }, + { 1062, PT_SC, ucp_Rejang }, + { 1069, PT_SC, ucp_Runic }, + { 1075, PT_GC, ucp_S }, + { 1077, PT_SC, ucp_Samaritan }, + { 1087, PT_SC, ucp_Saurashtra }, + { 1098, PT_PC, ucp_Sc }, + { 1101, PT_SC, ucp_Sharada }, + { 1109, PT_SC, ucp_Shavian }, + { 1117, PT_SC, ucp_Siddham }, + { 1125, PT_SC, ucp_SignWriting }, + { 1137, PT_SC, ucp_Sinhala }, + { 1145, PT_PC, ucp_Sk }, + { 1148, PT_PC, ucp_Sm }, + { 1151, PT_PC, ucp_So }, + { 1154, PT_SC, ucp_Sora_Sompeng }, + { 1167, PT_SC, ucp_Sundanese }, + { 1177, PT_SC, ucp_Syloti_Nagri }, + { 1190, PT_SC, ucp_Syriac }, + { 1197, PT_SC, ucp_Tagalog }, + { 1205, PT_SC, ucp_Tagbanwa }, + { 1214, PT_SC, ucp_Tai_Le }, + { 1221, PT_SC, ucp_Tai_Tham }, + { 1230, PT_SC, ucp_Tai_Viet }, + { 1239, PT_SC, ucp_Takri }, + { 1245, PT_SC, ucp_Tamil }, + { 1251, PT_SC, ucp_Telugu }, + { 1258, PT_SC, ucp_Thaana }, + { 1265, PT_SC, ucp_Thai }, + { 1270, PT_SC, ucp_Tibetan }, + { 1278, PT_SC, ucp_Tifinagh }, + { 1287, PT_SC, ucp_Tirhuta }, + { 1295, PT_SC, ucp_Ugaritic }, + { 1304, PT_SC, ucp_Vai }, + { 1308, PT_SC, ucp_Warang_Citi }, + { 1320, PT_ALNUM, 0 }, + { 1324, PT_PXSPACE, 0 }, + { 1328, PT_SPACE, 0 }, + { 1332, PT_UCNC, 0 }, + { 1336, PT_WORD, 0 }, + { 1340, PT_SC, ucp_Yi }, + { 1343, PT_GC, ucp_Z }, + { 1345, PT_PC, ucp_Zl }, + { 1348, PT_PC, ucp_Zp }, + { 1351, PT_PC, ucp_Zs } }; const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); diff --git a/pcre2/src/pcre2_ucd.c b/pcre2/src/pcre2_ucd.c index 7199cbda7..116f537b3 100644 --- a/pcre2/src/pcre2_ucd.c +++ b/pcre2/src/pcre2_ucd.c @@ -20,7 +20,7 @@ needed. */ /* Unicode character database. */ /* This file was autogenerated by the MultiStage2.py script. */ -/* Total size: 72576 bytes, block size: 128. */ +/* Total size: 75072 bytes, block size: 128. 
*/ /* The tables herein are needed only when UCP support is built, and in PCRE2 that happens automatically with UTF support. @@ -39,7 +39,7 @@ const uint16_t PRIV(ucd_stage2)[] = {0}; const uint32_t PRIV(ucd_caseless_sets)[] = {0}; #else -const char *PRIV(unicode_version) = "7.0.0"; +const char *PRIV(unicode_version) = "8.0.0"; /* When recompiling tables with a new Unicode version, please check the types in this structure definition from pcre2_internal.h (the actual @@ -82,7 +82,7 @@ const uint32_t PRIV(ucd_caseless_sets)[] = { #ifndef PCRE2_PCRE2TEST -const ucd_record PRIV(ucd_records)[] = { /* 5760 bytes, record size 8 */ +const ucd_record PRIV(ucd_records)[] = { /* 5952 bytes, record size 8 */ { 9, 0, 2, 0, 0, }, /* 0 */ { 9, 0, 1, 0, 0, }, /* 1 */ { 9, 0, 0, 0, 0, }, /* 2 */ @@ -188,621 +188,645 @@ const ucd_record PRIV(ucd_records)[] = { /* 5760 bytes, record size 8 */ { 33, 5, 12, 0, -217, }, /* 102 */ { 33, 5, 12, 0, -71, }, /* 103 */ { 33, 5, 12, 0, -219, }, /* 104 */ - { 33, 5, 12, 0, 42258, }, /* 105 */ - { 33, 6, 12, 0, 0, }, /* 106 */ - { 9, 6, 12, 0, 0, }, /* 107 */ - { 3, 24, 12, 0, 0, }, /* 108 */ - { 27, 12, 3, 0, 0, }, /* 109 */ - { 27, 12, 3, 21, 116, }, /* 110 */ - { 19, 9, 12, 0, 1, }, /* 111 */ - { 19, 5, 12, 0, -1, }, /* 112 */ - { 19, 24, 12, 0, 0, }, /* 113 */ - { 9, 2, 12, 0, 0, }, /* 114 */ - { 19, 6, 12, 0, 0, }, /* 115 */ - { 19, 5, 12, 0, 130, }, /* 116 */ - { 19, 9, 12, 0, 116, }, /* 117 */ - { 19, 9, 12, 0, 38, }, /* 118 */ - { 19, 9, 12, 0, 37, }, /* 119 */ - { 19, 9, 12, 0, 64, }, /* 120 */ - { 19, 9, 12, 0, 63, }, /* 121 */ - { 19, 5, 12, 0, 0, }, /* 122 */ - { 19, 9, 12, 0, 32, }, /* 123 */ - { 19, 9, 12, 34, 32, }, /* 124 */ - { 19, 9, 12, 59, 32, }, /* 125 */ - { 19, 9, 12, 38, 32, }, /* 126 */ - { 19, 9, 12, 21, 32, }, /* 127 */ - { 19, 9, 12, 51, 32, }, /* 128 */ - { 19, 9, 12, 26, 32, }, /* 129 */ - { 19, 9, 12, 47, 32, }, /* 130 */ - { 19, 9, 12, 55, 32, }, /* 131 */ - { 19, 9, 12, 30, 32, }, /* 132 */ - { 19, 9, 12, 43, 32, 
}, /* 133 */ - { 19, 9, 12, 67, 32, }, /* 134 */ - { 19, 5, 12, 0, -38, }, /* 135 */ - { 19, 5, 12, 0, -37, }, /* 136 */ - { 19, 5, 12, 0, -32, }, /* 137 */ - { 19, 5, 12, 34, -32, }, /* 138 */ - { 19, 5, 12, 59, -32, }, /* 139 */ - { 19, 5, 12, 38, -32, }, /* 140 */ - { 19, 5, 12, 21, -116, }, /* 141 */ - { 19, 5, 12, 51, -32, }, /* 142 */ - { 19, 5, 12, 26, -775, }, /* 143 */ - { 19, 5, 12, 47, -32, }, /* 144 */ - { 19, 5, 12, 55, -32, }, /* 145 */ - { 19, 5, 12, 30, 1, }, /* 146 */ - { 19, 5, 12, 30, -32, }, /* 147 */ - { 19, 5, 12, 43, -32, }, /* 148 */ - { 19, 5, 12, 67, -32, }, /* 149 */ - { 19, 5, 12, 0, -64, }, /* 150 */ - { 19, 5, 12, 0, -63, }, /* 151 */ - { 19, 9, 12, 0, 8, }, /* 152 */ - { 19, 5, 12, 34, -30, }, /* 153 */ - { 19, 5, 12, 38, -25, }, /* 154 */ - { 19, 9, 12, 0, 0, }, /* 155 */ - { 19, 5, 12, 43, -15, }, /* 156 */ - { 19, 5, 12, 47, -22, }, /* 157 */ - { 19, 5, 12, 0, -8, }, /* 158 */ - { 10, 9, 12, 0, 1, }, /* 159 */ - { 10, 5, 12, 0, -1, }, /* 160 */ - { 19, 5, 12, 51, -54, }, /* 161 */ - { 19, 5, 12, 55, -48, }, /* 162 */ - { 19, 5, 12, 0, 7, }, /* 163 */ - { 19, 5, 12, 0, -116, }, /* 164 */ - { 19, 9, 12, 38, -60, }, /* 165 */ - { 19, 5, 12, 59, -64, }, /* 166 */ - { 19, 25, 12, 0, 0, }, /* 167 */ - { 19, 9, 12, 0, -7, }, /* 168 */ - { 19, 9, 12, 0, -130, }, /* 169 */ - { 12, 9, 12, 0, 80, }, /* 170 */ - { 12, 9, 12, 0, 32, }, /* 171 */ - { 12, 5, 12, 0, -32, }, /* 172 */ - { 12, 5, 12, 0, -80, }, /* 173 */ - { 12, 9, 12, 0, 1, }, /* 174 */ - { 12, 5, 12, 0, -1, }, /* 175 */ - { 12, 26, 12, 0, 0, }, /* 176 */ - { 12, 12, 3, 0, 0, }, /* 177 */ - { 12, 11, 3, 0, 0, }, /* 178 */ - { 12, 9, 12, 0, 15, }, /* 179 */ - { 12, 5, 12, 0, -15, }, /* 180 */ - { 1, 9, 12, 0, 48, }, /* 181 */ - { 1, 6, 12, 0, 0, }, /* 182 */ - { 1, 21, 12, 0, 0, }, /* 183 */ - { 1, 5, 12, 0, -48, }, /* 184 */ - { 1, 5, 12, 0, 0, }, /* 185 */ - { 1, 17, 12, 0, 0, }, /* 186 */ - { 1, 26, 12, 0, 0, }, /* 187 */ - { 1, 23, 12, 0, 0, }, /* 188 */ - { 25, 12, 3, 0, 0, }, 
/* 189 */ - { 25, 17, 12, 0, 0, }, /* 190 */ - { 25, 21, 12, 0, 0, }, /* 191 */ - { 25, 7, 12, 0, 0, }, /* 192 */ - { 0, 1, 2, 0, 0, }, /* 193 */ - { 0, 25, 12, 0, 0, }, /* 194 */ - { 0, 21, 12, 0, 0, }, /* 195 */ - { 0, 23, 12, 0, 0, }, /* 196 */ - { 0, 26, 12, 0, 0, }, /* 197 */ - { 0, 12, 3, 0, 0, }, /* 198 */ - { 0, 7, 12, 0, 0, }, /* 199 */ - { 0, 6, 12, 0, 0, }, /* 200 */ + { 33, 5, 12, 0, 42261, }, /* 105 */ + { 33, 5, 12, 0, 42258, }, /* 106 */ + { 33, 6, 12, 0, 0, }, /* 107 */ + { 9, 6, 12, 0, 0, }, /* 108 */ + { 3, 24, 12, 0, 0, }, /* 109 */ + { 27, 12, 3, 0, 0, }, /* 110 */ + { 27, 12, 3, 21, 116, }, /* 111 */ + { 19, 9, 12, 0, 1, }, /* 112 */ + { 19, 5, 12, 0, -1, }, /* 113 */ + { 19, 24, 12, 0, 0, }, /* 114 */ + { 9, 2, 12, 0, 0, }, /* 115 */ + { 19, 6, 12, 0, 0, }, /* 116 */ + { 19, 5, 12, 0, 130, }, /* 117 */ + { 19, 9, 12, 0, 116, }, /* 118 */ + { 19, 9, 12, 0, 38, }, /* 119 */ + { 19, 9, 12, 0, 37, }, /* 120 */ + { 19, 9, 12, 0, 64, }, /* 121 */ + { 19, 9, 12, 0, 63, }, /* 122 */ + { 19, 5, 12, 0, 0, }, /* 123 */ + { 19, 9, 12, 0, 32, }, /* 124 */ + { 19, 9, 12, 34, 32, }, /* 125 */ + { 19, 9, 12, 59, 32, }, /* 126 */ + { 19, 9, 12, 38, 32, }, /* 127 */ + { 19, 9, 12, 21, 32, }, /* 128 */ + { 19, 9, 12, 51, 32, }, /* 129 */ + { 19, 9, 12, 26, 32, }, /* 130 */ + { 19, 9, 12, 47, 32, }, /* 131 */ + { 19, 9, 12, 55, 32, }, /* 132 */ + { 19, 9, 12, 30, 32, }, /* 133 */ + { 19, 9, 12, 43, 32, }, /* 134 */ + { 19, 9, 12, 67, 32, }, /* 135 */ + { 19, 5, 12, 0, -38, }, /* 136 */ + { 19, 5, 12, 0, -37, }, /* 137 */ + { 19, 5, 12, 0, -32, }, /* 138 */ + { 19, 5, 12, 34, -32, }, /* 139 */ + { 19, 5, 12, 59, -32, }, /* 140 */ + { 19, 5, 12, 38, -32, }, /* 141 */ + { 19, 5, 12, 21, -116, }, /* 142 */ + { 19, 5, 12, 51, -32, }, /* 143 */ + { 19, 5, 12, 26, -775, }, /* 144 */ + { 19, 5, 12, 47, -32, }, /* 145 */ + { 19, 5, 12, 55, -32, }, /* 146 */ + { 19, 5, 12, 30, 1, }, /* 147 */ + { 19, 5, 12, 30, -32, }, /* 148 */ + { 19, 5, 12, 43, -32, }, /* 149 */ + { 19, 
5, 12, 67, -32, }, /* 150 */ + { 19, 5, 12, 0, -64, }, /* 151 */ + { 19, 5, 12, 0, -63, }, /* 152 */ + { 19, 9, 12, 0, 8, }, /* 153 */ + { 19, 5, 12, 34, -30, }, /* 154 */ + { 19, 5, 12, 38, -25, }, /* 155 */ + { 19, 9, 12, 0, 0, }, /* 156 */ + { 19, 5, 12, 43, -15, }, /* 157 */ + { 19, 5, 12, 47, -22, }, /* 158 */ + { 19, 5, 12, 0, -8, }, /* 159 */ + { 10, 9, 12, 0, 1, }, /* 160 */ + { 10, 5, 12, 0, -1, }, /* 161 */ + { 19, 5, 12, 51, -54, }, /* 162 */ + { 19, 5, 12, 55, -48, }, /* 163 */ + { 19, 5, 12, 0, 7, }, /* 164 */ + { 19, 5, 12, 0, -116, }, /* 165 */ + { 19, 9, 12, 38, -60, }, /* 166 */ + { 19, 5, 12, 59, -64, }, /* 167 */ + { 19, 25, 12, 0, 0, }, /* 168 */ + { 19, 9, 12, 0, -7, }, /* 169 */ + { 19, 9, 12, 0, -130, }, /* 170 */ + { 12, 9, 12, 0, 80, }, /* 171 */ + { 12, 9, 12, 0, 32, }, /* 172 */ + { 12, 5, 12, 0, -32, }, /* 173 */ + { 12, 5, 12, 0, -80, }, /* 174 */ + { 12, 9, 12, 0, 1, }, /* 175 */ + { 12, 5, 12, 0, -1, }, /* 176 */ + { 12, 26, 12, 0, 0, }, /* 177 */ + { 12, 12, 3, 0, 0, }, /* 178 */ + { 12, 11, 3, 0, 0, }, /* 179 */ + { 12, 9, 12, 0, 15, }, /* 180 */ + { 12, 5, 12, 0, -15, }, /* 181 */ + { 1, 9, 12, 0, 48, }, /* 182 */ + { 1, 6, 12, 0, 0, }, /* 183 */ + { 1, 21, 12, 0, 0, }, /* 184 */ + { 1, 5, 12, 0, -48, }, /* 185 */ + { 1, 5, 12, 0, 0, }, /* 186 */ + { 1, 17, 12, 0, 0, }, /* 187 */ + { 1, 26, 12, 0, 0, }, /* 188 */ + { 1, 23, 12, 0, 0, }, /* 189 */ + { 25, 12, 3, 0, 0, }, /* 190 */ + { 25, 17, 12, 0, 0, }, /* 191 */ + { 25, 21, 12, 0, 0, }, /* 192 */ + { 25, 7, 12, 0, 0, }, /* 193 */ + { 0, 1, 2, 0, 0, }, /* 194 */ + { 0, 25, 12, 0, 0, }, /* 195 */ + { 0, 21, 12, 0, 0, }, /* 196 */ + { 0, 23, 12, 0, 0, }, /* 197 */ + { 0, 26, 12, 0, 0, }, /* 198 */ + { 0, 12, 3, 0, 0, }, /* 199 */ + { 0, 7, 12, 0, 0, }, /* 200 */ { 0, 13, 12, 0, 0, }, /* 201 */ - { 49, 21, 12, 0, 0, }, /* 202 */ - { 49, 1, 2, 0, 0, }, /* 203 */ - { 49, 7, 12, 0, 0, }, /* 204 */ - { 49, 12, 3, 0, 0, }, /* 205 */ - { 55, 7, 12, 0, 0, }, /* 206 */ - { 55, 12, 3, 0, 0, 
}, /* 207 */ - { 63, 13, 12, 0, 0, }, /* 208 */ - { 63, 7, 12, 0, 0, }, /* 209 */ - { 63, 12, 3, 0, 0, }, /* 210 */ - { 63, 6, 12, 0, 0, }, /* 211 */ - { 63, 26, 12, 0, 0, }, /* 212 */ - { 63, 21, 12, 0, 0, }, /* 213 */ - { 89, 7, 12, 0, 0, }, /* 214 */ - { 89, 12, 3, 0, 0, }, /* 215 */ - { 89, 6, 12, 0, 0, }, /* 216 */ - { 89, 21, 12, 0, 0, }, /* 217 */ - { 94, 7, 12, 0, 0, }, /* 218 */ - { 94, 12, 3, 0, 0, }, /* 219 */ - { 94, 21, 12, 0, 0, }, /* 220 */ - { 14, 12, 3, 0, 0, }, /* 221 */ - { 14, 10, 5, 0, 0, }, /* 222 */ - { 14, 7, 12, 0, 0, }, /* 223 */ - { 14, 13, 12, 0, 0, }, /* 224 */ - { 14, 21, 12, 0, 0, }, /* 225 */ - { 14, 6, 12, 0, 0, }, /* 226 */ - { 2, 7, 12, 0, 0, }, /* 227 */ - { 2, 12, 3, 0, 0, }, /* 228 */ - { 2, 10, 5, 0, 0, }, /* 229 */ - { 2, 10, 3, 0, 0, }, /* 230 */ - { 2, 13, 12, 0, 0, }, /* 231 */ - { 2, 23, 12, 0, 0, }, /* 232 */ - { 2, 15, 12, 0, 0, }, /* 233 */ - { 2, 26, 12, 0, 0, }, /* 234 */ - { 21, 12, 3, 0, 0, }, /* 235 */ - { 21, 10, 5, 0, 0, }, /* 236 */ - { 21, 7, 12, 0, 0, }, /* 237 */ - { 21, 13, 12, 0, 0, }, /* 238 */ - { 20, 12, 3, 0, 0, }, /* 239 */ - { 20, 10, 5, 0, 0, }, /* 240 */ - { 20, 7, 12, 0, 0, }, /* 241 */ - { 20, 13, 12, 0, 0, }, /* 242 */ - { 20, 21, 12, 0, 0, }, /* 243 */ - { 20, 23, 12, 0, 0, }, /* 244 */ - { 43, 12, 3, 0, 0, }, /* 245 */ - { 43, 10, 5, 0, 0, }, /* 246 */ - { 43, 7, 12, 0, 0, }, /* 247 */ - { 43, 10, 3, 0, 0, }, /* 248 */ - { 43, 13, 12, 0, 0, }, /* 249 */ - { 43, 26, 12, 0, 0, }, /* 250 */ - { 43, 15, 12, 0, 0, }, /* 251 */ - { 53, 12, 3, 0, 0, }, /* 252 */ - { 53, 7, 12, 0, 0, }, /* 253 */ - { 53, 10, 3, 0, 0, }, /* 254 */ - { 53, 10, 5, 0, 0, }, /* 255 */ - { 53, 13, 12, 0, 0, }, /* 256 */ - { 53, 15, 12, 0, 0, }, /* 257 */ - { 53, 26, 12, 0, 0, }, /* 258 */ - { 53, 23, 12, 0, 0, }, /* 259 */ - { 54, 12, 3, 0, 0, }, /* 260 */ - { 54, 10, 5, 0, 0, }, /* 261 */ - { 54, 7, 12, 0, 0, }, /* 262 */ - { 54, 13, 12, 0, 0, }, /* 263 */ - { 54, 15, 12, 0, 0, }, /* 264 */ - { 54, 26, 12, 0, 0, }, /* 265 
*/ - { 28, 12, 3, 0, 0, }, /* 266 */ - { 28, 10, 5, 0, 0, }, /* 267 */ - { 28, 7, 12, 0, 0, }, /* 268 */ - { 28, 10, 3, 0, 0, }, /* 269 */ - { 28, 13, 12, 0, 0, }, /* 270 */ - { 36, 12, 3, 0, 0, }, /* 271 */ - { 36, 10, 5, 0, 0, }, /* 272 */ - { 36, 7, 12, 0, 0, }, /* 273 */ - { 36, 10, 3, 0, 0, }, /* 274 */ - { 36, 13, 12, 0, 0, }, /* 275 */ - { 36, 15, 12, 0, 0, }, /* 276 */ - { 36, 26, 12, 0, 0, }, /* 277 */ - { 47, 10, 5, 0, 0, }, /* 278 */ - { 47, 7, 12, 0, 0, }, /* 279 */ - { 47, 12, 3, 0, 0, }, /* 280 */ - { 47, 10, 3, 0, 0, }, /* 281 */ - { 47, 13, 12, 0, 0, }, /* 282 */ - { 47, 21, 12, 0, 0, }, /* 283 */ - { 56, 7, 12, 0, 0, }, /* 284 */ - { 56, 12, 3, 0, 0, }, /* 285 */ - { 56, 7, 5, 0, 0, }, /* 286 */ - { 56, 6, 12, 0, 0, }, /* 287 */ - { 56, 21, 12, 0, 0, }, /* 288 */ - { 56, 13, 12, 0, 0, }, /* 289 */ - { 32, 7, 12, 0, 0, }, /* 290 */ - { 32, 12, 3, 0, 0, }, /* 291 */ - { 32, 7, 5, 0, 0, }, /* 292 */ - { 32, 6, 12, 0, 0, }, /* 293 */ - { 32, 13, 12, 0, 0, }, /* 294 */ - { 57, 7, 12, 0, 0, }, /* 295 */ - { 57, 26, 12, 0, 0, }, /* 296 */ - { 57, 21, 12, 0, 0, }, /* 297 */ - { 57, 12, 3, 0, 0, }, /* 298 */ - { 57, 13, 12, 0, 0, }, /* 299 */ - { 57, 15, 12, 0, 0, }, /* 300 */ - { 57, 22, 12, 0, 0, }, /* 301 */ - { 57, 18, 12, 0, 0, }, /* 302 */ - { 57, 10, 5, 0, 0, }, /* 303 */ - { 38, 7, 12, 0, 0, }, /* 304 */ - { 38, 10, 12, 0, 0, }, /* 305 */ - { 38, 12, 3, 0, 0, }, /* 306 */ - { 38, 10, 5, 0, 0, }, /* 307 */ - { 38, 13, 12, 0, 0, }, /* 308 */ - { 38, 21, 12, 0, 0, }, /* 309 */ - { 38, 26, 12, 0, 0, }, /* 310 */ - { 16, 9, 12, 0, 7264, }, /* 311 */ - { 16, 7, 12, 0, 0, }, /* 312 */ - { 16, 6, 12, 0, 0, }, /* 313 */ - { 23, 7, 6, 0, 0, }, /* 314 */ - { 23, 7, 7, 0, 0, }, /* 315 */ - { 23, 7, 8, 0, 0, }, /* 316 */ - { 15, 7, 12, 0, 0, }, /* 317 */ - { 15, 12, 3, 0, 0, }, /* 318 */ - { 15, 21, 12, 0, 0, }, /* 319 */ - { 15, 15, 12, 0, 0, }, /* 320 */ - { 15, 26, 12, 0, 0, }, /* 321 */ - { 8, 7, 12, 0, 0, }, /* 322 */ - { 7, 17, 12, 0, 0, }, /* 323 */ - { 
7, 7, 12, 0, 0, }, /* 324 */ - { 7, 21, 12, 0, 0, }, /* 325 */ - { 40, 29, 12, 0, 0, }, /* 326 */ - { 40, 7, 12, 0, 0, }, /* 327 */ - { 40, 22, 12, 0, 0, }, /* 328 */ - { 40, 18, 12, 0, 0, }, /* 329 */ - { 45, 7, 12, 0, 0, }, /* 330 */ - { 45, 14, 12, 0, 0, }, /* 331 */ - { 50, 7, 12, 0, 0, }, /* 332 */ - { 50, 12, 3, 0, 0, }, /* 333 */ - { 24, 7, 12, 0, 0, }, /* 334 */ - { 24, 12, 3, 0, 0, }, /* 335 */ - { 6, 7, 12, 0, 0, }, /* 336 */ - { 6, 12, 3, 0, 0, }, /* 337 */ - { 51, 7, 12, 0, 0, }, /* 338 */ - { 51, 12, 3, 0, 0, }, /* 339 */ - { 31, 7, 12, 0, 0, }, /* 340 */ - { 31, 12, 3, 0, 0, }, /* 341 */ - { 31, 10, 5, 0, 0, }, /* 342 */ - { 31, 21, 12, 0, 0, }, /* 343 */ - { 31, 6, 12, 0, 0, }, /* 344 */ - { 31, 23, 12, 0, 0, }, /* 345 */ - { 31, 13, 12, 0, 0, }, /* 346 */ - { 31, 15, 12, 0, 0, }, /* 347 */ - { 37, 21, 12, 0, 0, }, /* 348 */ - { 37, 17, 12, 0, 0, }, /* 349 */ - { 37, 12, 3, 0, 0, }, /* 350 */ - { 37, 1, 2, 0, 0, }, /* 351 */ - { 37, 13, 12, 0, 0, }, /* 352 */ - { 37, 7, 12, 0, 0, }, /* 353 */ - { 37, 6, 12, 0, 0, }, /* 354 */ - { 34, 7, 12, 0, 0, }, /* 355 */ - { 34, 12, 3, 0, 0, }, /* 356 */ - { 34, 10, 5, 0, 0, }, /* 357 */ - { 34, 26, 12, 0, 0, }, /* 358 */ - { 34, 21, 12, 0, 0, }, /* 359 */ - { 34, 13, 12, 0, 0, }, /* 360 */ - { 52, 7, 12, 0, 0, }, /* 361 */ - { 39, 7, 12, 0, 0, }, /* 362 */ - { 39, 10, 12, 0, 0, }, /* 363 */ - { 39, 10, 5, 0, 0, }, /* 364 */ - { 39, 13, 12, 0, 0, }, /* 365 */ - { 39, 15, 12, 0, 0, }, /* 366 */ - { 39, 26, 12, 0, 0, }, /* 367 */ - { 31, 26, 12, 0, 0, }, /* 368 */ - { 5, 7, 12, 0, 0, }, /* 369 */ - { 5, 12, 3, 0, 0, }, /* 370 */ - { 5, 10, 5, 0, 0, }, /* 371 */ - { 5, 21, 12, 0, 0, }, /* 372 */ - { 90, 7, 12, 0, 0, }, /* 373 */ - { 90, 10, 5, 0, 0, }, /* 374 */ - { 90, 12, 3, 0, 0, }, /* 375 */ - { 90, 10, 12, 0, 0, }, /* 376 */ - { 90, 13, 12, 0, 0, }, /* 377 */ - { 90, 21, 12, 0, 0, }, /* 378 */ - { 90, 6, 12, 0, 0, }, /* 379 */ - { 27, 11, 3, 0, 0, }, /* 380 */ - { 61, 12, 3, 0, 0, }, /* 381 */ - { 61, 10, 5, 
0, 0, }, /* 382 */ - { 61, 7, 12, 0, 0, }, /* 383 */ - { 61, 13, 12, 0, 0, }, /* 384 */ - { 61, 21, 12, 0, 0, }, /* 385 */ - { 61, 26, 12, 0, 0, }, /* 386 */ - { 75, 12, 3, 0, 0, }, /* 387 */ - { 75, 10, 5, 0, 0, }, /* 388 */ - { 75, 7, 12, 0, 0, }, /* 389 */ - { 75, 13, 12, 0, 0, }, /* 390 */ - { 92, 7, 12, 0, 0, }, /* 391 */ - { 92, 12, 3, 0, 0, }, /* 392 */ - { 92, 10, 5, 0, 0, }, /* 393 */ - { 92, 21, 12, 0, 0, }, /* 394 */ - { 69, 7, 12, 0, 0, }, /* 395 */ - { 69, 10, 5, 0, 0, }, /* 396 */ - { 69, 12, 3, 0, 0, }, /* 397 */ - { 69, 21, 12, 0, 0, }, /* 398 */ - { 69, 13, 12, 0, 0, }, /* 399 */ - { 72, 13, 12, 0, 0, }, /* 400 */ - { 72, 7, 12, 0, 0, }, /* 401 */ - { 72, 6, 12, 0, 0, }, /* 402 */ - { 72, 21, 12, 0, 0, }, /* 403 */ - { 75, 21, 12, 0, 0, }, /* 404 */ - { 9, 10, 5, 0, 0, }, /* 405 */ - { 9, 7, 12, 0, 0, }, /* 406 */ - { 12, 5, 12, 0, 0, }, /* 407 */ - { 12, 6, 12, 0, 0, }, /* 408 */ - { 33, 5, 12, 0, 35332, }, /* 409 */ - { 33, 5, 12, 0, 3814, }, /* 410 */ - { 33, 9, 12, 63, 1, }, /* 411 */ - { 33, 5, 12, 63, -1, }, /* 412 */ - { 33, 5, 12, 63, -58, }, /* 413 */ - { 33, 9, 12, 0, -7615, }, /* 414 */ - { 19, 5, 12, 0, 8, }, /* 415 */ - { 19, 9, 12, 0, -8, }, /* 416 */ - { 19, 5, 12, 0, 74, }, /* 417 */ - { 19, 5, 12, 0, 86, }, /* 418 */ - { 19, 5, 12, 0, 100, }, /* 419 */ - { 19, 5, 12, 0, 128, }, /* 420 */ - { 19, 5, 12, 0, 112, }, /* 421 */ - { 19, 5, 12, 0, 126, }, /* 422 */ - { 19, 8, 12, 0, -8, }, /* 423 */ - { 19, 5, 12, 0, 9, }, /* 424 */ - { 19, 9, 12, 0, -74, }, /* 425 */ - { 19, 8, 12, 0, -9, }, /* 426 */ - { 19, 5, 12, 21, -7173, }, /* 427 */ - { 19, 9, 12, 0, -86, }, /* 428 */ - { 19, 9, 12, 0, -100, }, /* 429 */ - { 19, 9, 12, 0, -112, }, /* 430 */ - { 19, 9, 12, 0, -128, }, /* 431 */ - { 19, 9, 12, 0, -126, }, /* 432 */ - { 27, 1, 3, 0, 0, }, /* 433 */ - { 9, 27, 2, 0, 0, }, /* 434 */ - { 9, 28, 2, 0, 0, }, /* 435 */ - { 9, 2, 2, 0, 0, }, /* 436 */ - { 9, 9, 12, 0, 0, }, /* 437 */ - { 9, 5, 12, 0, 0, }, /* 438 */ - { 19, 9, 12, 67, 
-7517, }, /* 439 */ - { 33, 9, 12, 71, -8383, }, /* 440 */ - { 33, 9, 12, 75, -8262, }, /* 441 */ - { 33, 9, 12, 0, 28, }, /* 442 */ - { 33, 5, 12, 0, -28, }, /* 443 */ - { 33, 14, 12, 0, 16, }, /* 444 */ - { 33, 14, 12, 0, -16, }, /* 445 */ - { 33, 14, 12, 0, 0, }, /* 446 */ - { 9, 26, 12, 0, 26, }, /* 447 */ - { 9, 26, 12, 0, -26, }, /* 448 */ - { 4, 26, 12, 0, 0, }, /* 449 */ - { 17, 9, 12, 0, 48, }, /* 450 */ - { 17, 5, 12, 0, -48, }, /* 451 */ - { 33, 9, 12, 0, -10743, }, /* 452 */ - { 33, 9, 12, 0, -3814, }, /* 453 */ - { 33, 9, 12, 0, -10727, }, /* 454 */ - { 33, 5, 12, 0, -10795, }, /* 455 */ - { 33, 5, 12, 0, -10792, }, /* 456 */ - { 33, 9, 12, 0, -10780, }, /* 457 */ - { 33, 9, 12, 0, -10749, }, /* 458 */ - { 33, 9, 12, 0, -10783, }, /* 459 */ - { 33, 9, 12, 0, -10782, }, /* 460 */ - { 33, 9, 12, 0, -10815, }, /* 461 */ - { 10, 5, 12, 0, 0, }, /* 462 */ - { 10, 26, 12, 0, 0, }, /* 463 */ - { 10, 12, 3, 0, 0, }, /* 464 */ - { 10, 21, 12, 0, 0, }, /* 465 */ - { 10, 15, 12, 0, 0, }, /* 466 */ - { 16, 5, 12, 0, -7264, }, /* 467 */ - { 58, 7, 12, 0, 0, }, /* 468 */ - { 58, 6, 12, 0, 0, }, /* 469 */ - { 58, 21, 12, 0, 0, }, /* 470 */ - { 58, 12, 3, 0, 0, }, /* 471 */ - { 22, 26, 12, 0, 0, }, /* 472 */ - { 22, 6, 12, 0, 0, }, /* 473 */ - { 22, 14, 12, 0, 0, }, /* 474 */ - { 23, 10, 3, 0, 0, }, /* 475 */ - { 26, 7, 12, 0, 0, }, /* 476 */ - { 26, 6, 12, 0, 0, }, /* 477 */ - { 29, 7, 12, 0, 0, }, /* 478 */ - { 29, 6, 12, 0, 0, }, /* 479 */ - { 3, 7, 12, 0, 0, }, /* 480 */ - { 23, 7, 12, 0, 0, }, /* 481 */ - { 23, 26, 12, 0, 0, }, /* 482 */ - { 29, 26, 12, 0, 0, }, /* 483 */ - { 22, 7, 12, 0, 0, }, /* 484 */ - { 60, 7, 12, 0, 0, }, /* 485 */ - { 60, 6, 12, 0, 0, }, /* 486 */ - { 60, 26, 12, 0, 0, }, /* 487 */ - { 85, 7, 12, 0, 0, }, /* 488 */ - { 85, 6, 12, 0, 0, }, /* 489 */ - { 85, 21, 12, 0, 0, }, /* 490 */ - { 76, 7, 12, 0, 0, }, /* 491 */ - { 76, 6, 12, 0, 0, }, /* 492 */ - { 76, 21, 12, 0, 0, }, /* 493 */ - { 76, 13, 12, 0, 0, }, /* 494 */ - { 12, 7, 12, 0, 0, 
}, /* 495 */ - { 12, 21, 12, 0, 0, }, /* 496 */ - { 78, 7, 12, 0, 0, }, /* 497 */ - { 78, 14, 12, 0, 0, }, /* 498 */ - { 78, 12, 3, 0, 0, }, /* 499 */ - { 78, 21, 12, 0, 0, }, /* 500 */ - { 33, 9, 12, 0, -35332, }, /* 501 */ - { 33, 9, 12, 0, -42280, }, /* 502 */ - { 33, 9, 12, 0, -42308, }, /* 503 */ - { 33, 9, 12, 0, -42319, }, /* 504 */ - { 33, 9, 12, 0, -42315, }, /* 505 */ - { 33, 9, 12, 0, -42305, }, /* 506 */ - { 33, 9, 12, 0, -42258, }, /* 507 */ - { 33, 9, 12, 0, -42282, }, /* 508 */ - { 48, 7, 12, 0, 0, }, /* 509 */ - { 48, 12, 3, 0, 0, }, /* 510 */ - { 48, 10, 5, 0, 0, }, /* 511 */ - { 48, 26, 12, 0, 0, }, /* 512 */ - { 64, 7, 12, 0, 0, }, /* 513 */ - { 64, 21, 12, 0, 0, }, /* 514 */ - { 74, 10, 5, 0, 0, }, /* 515 */ - { 74, 7, 12, 0, 0, }, /* 516 */ - { 74, 12, 3, 0, 0, }, /* 517 */ - { 74, 21, 12, 0, 0, }, /* 518 */ - { 74, 13, 12, 0, 0, }, /* 519 */ - { 68, 13, 12, 0, 0, }, /* 520 */ - { 68, 7, 12, 0, 0, }, /* 521 */ - { 68, 12, 3, 0, 0, }, /* 522 */ - { 68, 21, 12, 0, 0, }, /* 523 */ - { 73, 7, 12, 0, 0, }, /* 524 */ - { 73, 12, 3, 0, 0, }, /* 525 */ - { 73, 10, 5, 0, 0, }, /* 526 */ - { 73, 21, 12, 0, 0, }, /* 527 */ - { 83, 12, 3, 0, 0, }, /* 528 */ - { 83, 10, 5, 0, 0, }, /* 529 */ - { 83, 7, 12, 0, 0, }, /* 530 */ - { 83, 21, 12, 0, 0, }, /* 531 */ - { 83, 13, 12, 0, 0, }, /* 532 */ - { 38, 6, 12, 0, 0, }, /* 533 */ - { 67, 7, 12, 0, 0, }, /* 534 */ - { 67, 12, 3, 0, 0, }, /* 535 */ - { 67, 10, 5, 0, 0, }, /* 536 */ - { 67, 13, 12, 0, 0, }, /* 537 */ - { 67, 21, 12, 0, 0, }, /* 538 */ - { 91, 7, 12, 0, 0, }, /* 539 */ - { 91, 12, 3, 0, 0, }, /* 540 */ - { 91, 6, 12, 0, 0, }, /* 541 */ - { 91, 21, 12, 0, 0, }, /* 542 */ - { 86, 7, 12, 0, 0, }, /* 543 */ - { 86, 10, 5, 0, 0, }, /* 544 */ - { 86, 12, 3, 0, 0, }, /* 545 */ - { 86, 21, 12, 0, 0, }, /* 546 */ - { 86, 6, 12, 0, 0, }, /* 547 */ - { 86, 13, 12, 0, 0, }, /* 548 */ - { 23, 7, 9, 0, 0, }, /* 549 */ - { 23, 7, 10, 0, 0, }, /* 550 */ - { 9, 4, 2, 0, 0, }, /* 551 */ - { 9, 3, 12, 0, 0, }, /* 
552 */ - { 25, 25, 12, 0, 0, }, /* 553 */ - { 0, 24, 12, 0, 0, }, /* 554 */ - { 9, 6, 3, 0, 0, }, /* 555 */ - { 35, 7, 12, 0, 0, }, /* 556 */ - { 19, 14, 12, 0, 0, }, /* 557 */ - { 19, 15, 12, 0, 0, }, /* 558 */ - { 19, 26, 12, 0, 0, }, /* 559 */ - { 70, 7, 12, 0, 0, }, /* 560 */ - { 66, 7, 12, 0, 0, }, /* 561 */ - { 41, 7, 12, 0, 0, }, /* 562 */ - { 41, 15, 12, 0, 0, }, /* 563 */ - { 18, 7, 12, 0, 0, }, /* 564 */ - { 18, 14, 12, 0, 0, }, /* 565 */ - { 117, 7, 12, 0, 0, }, /* 566 */ - { 117, 12, 3, 0, 0, }, /* 567 */ - { 59, 7, 12, 0, 0, }, /* 568 */ - { 59, 21, 12, 0, 0, }, /* 569 */ - { 42, 7, 12, 0, 0, }, /* 570 */ - { 42, 21, 12, 0, 0, }, /* 571 */ - { 42, 14, 12, 0, 0, }, /* 572 */ - { 13, 9, 12, 0, 40, }, /* 573 */ - { 13, 5, 12, 0, -40, }, /* 574 */ - { 46, 7, 12, 0, 0, }, /* 575 */ - { 44, 7, 12, 0, 0, }, /* 576 */ - { 44, 13, 12, 0, 0, }, /* 577 */ - { 105, 7, 12, 0, 0, }, /* 578 */ - { 103, 7, 12, 0, 0, }, /* 579 */ - { 103, 21, 12, 0, 0, }, /* 580 */ - { 109, 7, 12, 0, 0, }, /* 581 */ - { 11, 7, 12, 0, 0, }, /* 582 */ - { 80, 7, 12, 0, 0, }, /* 583 */ - { 80, 21, 12, 0, 0, }, /* 584 */ - { 80, 15, 12, 0, 0, }, /* 585 */ - { 119, 7, 12, 0, 0, }, /* 586 */ - { 119, 26, 12, 0, 0, }, /* 587 */ - { 119, 15, 12, 0, 0, }, /* 588 */ - { 115, 7, 12, 0, 0, }, /* 589 */ - { 115, 15, 12, 0, 0, }, /* 590 */ - { 65, 7, 12, 0, 0, }, /* 591 */ - { 65, 15, 12, 0, 0, }, /* 592 */ - { 65, 21, 12, 0, 0, }, /* 593 */ - { 71, 7, 12, 0, 0, }, /* 594 */ - { 71, 21, 12, 0, 0, }, /* 595 */ - { 97, 7, 12, 0, 0, }, /* 596 */ - { 96, 7, 12, 0, 0, }, /* 597 */ - { 30, 7, 12, 0, 0, }, /* 598 */ - { 30, 12, 3, 0, 0, }, /* 599 */ - { 30, 15, 12, 0, 0, }, /* 600 */ - { 30, 21, 12, 0, 0, }, /* 601 */ - { 87, 7, 12, 0, 0, }, /* 602 */ - { 87, 15, 12, 0, 0, }, /* 603 */ - { 87, 21, 12, 0, 0, }, /* 604 */ - { 116, 7, 12, 0, 0, }, /* 605 */ - { 116, 15, 12, 0, 0, }, /* 606 */ - { 111, 7, 12, 0, 0, }, /* 607 */ - { 111, 26, 12, 0, 0, }, /* 608 */ - { 111, 12, 3, 0, 0, }, /* 609 */ - { 111, 15, 
12, 0, 0, }, /* 610 */ - { 111, 21, 12, 0, 0, }, /* 611 */ - { 77, 7, 12, 0, 0, }, /* 612 */ - { 77, 21, 12, 0, 0, }, /* 613 */ - { 82, 7, 12, 0, 0, }, /* 614 */ - { 82, 15, 12, 0, 0, }, /* 615 */ - { 81, 7, 12, 0, 0, }, /* 616 */ - { 81, 15, 12, 0, 0, }, /* 617 */ - { 120, 7, 12, 0, 0, }, /* 618 */ - { 120, 21, 12, 0, 0, }, /* 619 */ - { 120, 15, 12, 0, 0, }, /* 620 */ - { 88, 7, 12, 0, 0, }, /* 621 */ - { 0, 15, 12, 0, 0, }, /* 622 */ - { 93, 10, 5, 0, 0, }, /* 623 */ - { 93, 12, 3, 0, 0, }, /* 624 */ - { 93, 7, 12, 0, 0, }, /* 625 */ - { 93, 21, 12, 0, 0, }, /* 626 */ - { 93, 15, 12, 0, 0, }, /* 627 */ - { 93, 13, 12, 0, 0, }, /* 628 */ - { 84, 12, 3, 0, 0, }, /* 629 */ - { 84, 10, 5, 0, 0, }, /* 630 */ - { 84, 7, 12, 0, 0, }, /* 631 */ - { 84, 21, 12, 0, 0, }, /* 632 */ - { 84, 1, 2, 0, 0, }, /* 633 */ - { 100, 7, 12, 0, 0, }, /* 634 */ - { 100, 13, 12, 0, 0, }, /* 635 */ - { 95, 12, 3, 0, 0, }, /* 636 */ - { 95, 7, 12, 0, 0, }, /* 637 */ - { 95, 10, 5, 0, 0, }, /* 638 */ - { 95, 13, 12, 0, 0, }, /* 639 */ - { 95, 21, 12, 0, 0, }, /* 640 */ - { 110, 7, 12, 0, 0, }, /* 641 */ - { 110, 12, 3, 0, 0, }, /* 642 */ - { 110, 21, 12, 0, 0, }, /* 643 */ - { 99, 12, 3, 0, 0, }, /* 644 */ - { 99, 10, 5, 0, 0, }, /* 645 */ - { 99, 7, 12, 0, 0, }, /* 646 */ - { 99, 21, 12, 0, 0, }, /* 647 */ - { 99, 13, 12, 0, 0, }, /* 648 */ - { 47, 15, 12, 0, 0, }, /* 649 */ - { 107, 7, 12, 0, 0, }, /* 650 */ - { 107, 10, 5, 0, 0, }, /* 651 */ - { 107, 12, 3, 0, 0, }, /* 652 */ - { 107, 21, 12, 0, 0, }, /* 653 */ - { 108, 7, 12, 0, 0, }, /* 654 */ - { 108, 12, 3, 0, 0, }, /* 655 */ - { 108, 10, 5, 0, 0, }, /* 656 */ - { 108, 13, 12, 0, 0, }, /* 657 */ - { 106, 12, 3, 0, 0, }, /* 658 */ - { 106, 10, 5, 0, 0, }, /* 659 */ - { 106, 7, 12, 0, 0, }, /* 660 */ - { 106, 10, 3, 0, 0, }, /* 661 */ - { 123, 7, 12, 0, 0, }, /* 662 */ - { 123, 10, 3, 0, 0, }, /* 663 */ - { 123, 10, 5, 0, 0, }, /* 664 */ - { 123, 12, 3, 0, 0, }, /* 665 */ - { 123, 21, 12, 0, 0, }, /* 666 */ - { 123, 13, 12, 0, 0, }, 
/* 667 */ - { 122, 7, 12, 0, 0, }, /* 668 */ - { 122, 10, 3, 0, 0, }, /* 669 */ - { 122, 10, 5, 0, 0, }, /* 670 */ - { 122, 12, 3, 0, 0, }, /* 671 */ - { 122, 21, 12, 0, 0, }, /* 672 */ - { 113, 7, 12, 0, 0, }, /* 673 */ - { 113, 10, 5, 0, 0, }, /* 674 */ - { 113, 12, 3, 0, 0, }, /* 675 */ - { 113, 21, 12, 0, 0, }, /* 676 */ - { 113, 13, 12, 0, 0, }, /* 677 */ - { 101, 7, 12, 0, 0, }, /* 678 */ - { 101, 12, 3, 0, 0, }, /* 679 */ - { 101, 10, 5, 0, 0, }, /* 680 */ - { 101, 13, 12, 0, 0, }, /* 681 */ - { 124, 9, 12, 0, 32, }, /* 682 */ - { 124, 5, 12, 0, -32, }, /* 683 */ - { 124, 13, 12, 0, 0, }, /* 684 */ - { 124, 15, 12, 0, 0, }, /* 685 */ - { 124, 7, 12, 0, 0, }, /* 686 */ - { 121, 7, 12, 0, 0, }, /* 687 */ - { 62, 7, 12, 0, 0, }, /* 688 */ - { 62, 14, 12, 0, 0, }, /* 689 */ - { 62, 21, 12, 0, 0, }, /* 690 */ - { 79, 7, 12, 0, 0, }, /* 691 */ - { 114, 7, 12, 0, 0, }, /* 692 */ - { 114, 13, 12, 0, 0, }, /* 693 */ - { 114, 21, 12, 0, 0, }, /* 694 */ - { 102, 7, 12, 0, 0, }, /* 695 */ - { 102, 12, 3, 0, 0, }, /* 696 */ - { 102, 21, 12, 0, 0, }, /* 697 */ - { 118, 7, 12, 0, 0, }, /* 698 */ - { 118, 12, 3, 0, 0, }, /* 699 */ - { 118, 21, 12, 0, 0, }, /* 700 */ - { 118, 26, 12, 0, 0, }, /* 701 */ - { 118, 6, 12, 0, 0, }, /* 702 */ - { 118, 13, 12, 0, 0, }, /* 703 */ - { 118, 15, 12, 0, 0, }, /* 704 */ - { 98, 7, 12, 0, 0, }, /* 705 */ - { 98, 10, 5, 0, 0, }, /* 706 */ - { 98, 12, 3, 0, 0, }, /* 707 */ - { 98, 6, 12, 0, 0, }, /* 708 */ - { 104, 7, 12, 0, 0, }, /* 709 */ - { 104, 26, 12, 0, 0, }, /* 710 */ - { 104, 12, 3, 0, 0, }, /* 711 */ - { 104, 21, 12, 0, 0, }, /* 712 */ - { 9, 10, 3, 0, 0, }, /* 713 */ - { 19, 12, 3, 0, 0, }, /* 714 */ - { 112, 7, 12, 0, 0, }, /* 715 */ - { 112, 15, 12, 0, 0, }, /* 716 */ - { 112, 12, 3, 0, 0, }, /* 717 */ - { 9, 26, 11, 0, 0, }, /* 718 */ - { 26, 26, 12, 0, 0, }, /* 719 */ + { 0, 6, 12, 0, 0, }, /* 202 */ + { 49, 21, 12, 0, 0, }, /* 203 */ + { 49, 1, 2, 0, 0, }, /* 204 */ + { 49, 7, 12, 0, 0, }, /* 205 */ + { 49, 12, 3, 0, 0, }, 
/* 206 */ + { 55, 7, 12, 0, 0, }, /* 207 */ + { 55, 12, 3, 0, 0, }, /* 208 */ + { 63, 13, 12, 0, 0, }, /* 209 */ + { 63, 7, 12, 0, 0, }, /* 210 */ + { 63, 12, 3, 0, 0, }, /* 211 */ + { 63, 6, 12, 0, 0, }, /* 212 */ + { 63, 26, 12, 0, 0, }, /* 213 */ + { 63, 21, 12, 0, 0, }, /* 214 */ + { 89, 7, 12, 0, 0, }, /* 215 */ + { 89, 12, 3, 0, 0, }, /* 216 */ + { 89, 6, 12, 0, 0, }, /* 217 */ + { 89, 21, 12, 0, 0, }, /* 218 */ + { 94, 7, 12, 0, 0, }, /* 219 */ + { 94, 12, 3, 0, 0, }, /* 220 */ + { 94, 21, 12, 0, 0, }, /* 221 */ + { 14, 12, 3, 0, 0, }, /* 222 */ + { 14, 10, 5, 0, 0, }, /* 223 */ + { 14, 7, 12, 0, 0, }, /* 224 */ + { 14, 13, 12, 0, 0, }, /* 225 */ + { 14, 21, 12, 0, 0, }, /* 226 */ + { 14, 6, 12, 0, 0, }, /* 227 */ + { 2, 7, 12, 0, 0, }, /* 228 */ + { 2, 12, 3, 0, 0, }, /* 229 */ + { 2, 10, 5, 0, 0, }, /* 230 */ + { 2, 10, 3, 0, 0, }, /* 231 */ + { 2, 13, 12, 0, 0, }, /* 232 */ + { 2, 23, 12, 0, 0, }, /* 233 */ + { 2, 15, 12, 0, 0, }, /* 234 */ + { 2, 26, 12, 0, 0, }, /* 235 */ + { 21, 12, 3, 0, 0, }, /* 236 */ + { 21, 10, 5, 0, 0, }, /* 237 */ + { 21, 7, 12, 0, 0, }, /* 238 */ + { 21, 13, 12, 0, 0, }, /* 239 */ + { 20, 12, 3, 0, 0, }, /* 240 */ + { 20, 10, 5, 0, 0, }, /* 241 */ + { 20, 7, 12, 0, 0, }, /* 242 */ + { 20, 13, 12, 0, 0, }, /* 243 */ + { 20, 21, 12, 0, 0, }, /* 244 */ + { 20, 23, 12, 0, 0, }, /* 245 */ + { 43, 12, 3, 0, 0, }, /* 246 */ + { 43, 10, 5, 0, 0, }, /* 247 */ + { 43, 7, 12, 0, 0, }, /* 248 */ + { 43, 10, 3, 0, 0, }, /* 249 */ + { 43, 13, 12, 0, 0, }, /* 250 */ + { 43, 26, 12, 0, 0, }, /* 251 */ + { 43, 15, 12, 0, 0, }, /* 252 */ + { 53, 12, 3, 0, 0, }, /* 253 */ + { 53, 7, 12, 0, 0, }, /* 254 */ + { 53, 10, 3, 0, 0, }, /* 255 */ + { 53, 10, 5, 0, 0, }, /* 256 */ + { 53, 13, 12, 0, 0, }, /* 257 */ + { 53, 15, 12, 0, 0, }, /* 258 */ + { 53, 26, 12, 0, 0, }, /* 259 */ + { 53, 23, 12, 0, 0, }, /* 260 */ + { 54, 12, 3, 0, 0, }, /* 261 */ + { 54, 10, 5, 0, 0, }, /* 262 */ + { 54, 7, 12, 0, 0, }, /* 263 */ + { 54, 13, 12, 0, 0, }, /* 264 */ + 
{ 54, 15, 12, 0, 0, }, /* 265 */ + { 54, 26, 12, 0, 0, }, /* 266 */ + { 28, 12, 3, 0, 0, }, /* 267 */ + { 28, 10, 5, 0, 0, }, /* 268 */ + { 28, 7, 12, 0, 0, }, /* 269 */ + { 28, 10, 3, 0, 0, }, /* 270 */ + { 28, 13, 12, 0, 0, }, /* 271 */ + { 36, 12, 3, 0, 0, }, /* 272 */ + { 36, 10, 5, 0, 0, }, /* 273 */ + { 36, 7, 12, 0, 0, }, /* 274 */ + { 36, 10, 3, 0, 0, }, /* 275 */ + { 36, 13, 12, 0, 0, }, /* 276 */ + { 36, 15, 12, 0, 0, }, /* 277 */ + { 36, 26, 12, 0, 0, }, /* 278 */ + { 47, 10, 5, 0, 0, }, /* 279 */ + { 47, 7, 12, 0, 0, }, /* 280 */ + { 47, 12, 3, 0, 0, }, /* 281 */ + { 47, 10, 3, 0, 0, }, /* 282 */ + { 47, 13, 12, 0, 0, }, /* 283 */ + { 47, 21, 12, 0, 0, }, /* 284 */ + { 56, 7, 12, 0, 0, }, /* 285 */ + { 56, 12, 3, 0, 0, }, /* 286 */ + { 56, 7, 5, 0, 0, }, /* 287 */ + { 56, 6, 12, 0, 0, }, /* 288 */ + { 56, 21, 12, 0, 0, }, /* 289 */ + { 56, 13, 12, 0, 0, }, /* 290 */ + { 32, 7, 12, 0, 0, }, /* 291 */ + { 32, 12, 3, 0, 0, }, /* 292 */ + { 32, 7, 5, 0, 0, }, /* 293 */ + { 32, 6, 12, 0, 0, }, /* 294 */ + { 32, 13, 12, 0, 0, }, /* 295 */ + { 57, 7, 12, 0, 0, }, /* 296 */ + { 57, 26, 12, 0, 0, }, /* 297 */ + { 57, 21, 12, 0, 0, }, /* 298 */ + { 57, 12, 3, 0, 0, }, /* 299 */ + { 57, 13, 12, 0, 0, }, /* 300 */ + { 57, 15, 12, 0, 0, }, /* 301 */ + { 57, 22, 12, 0, 0, }, /* 302 */ + { 57, 18, 12, 0, 0, }, /* 303 */ + { 57, 10, 5, 0, 0, }, /* 304 */ + { 38, 7, 12, 0, 0, }, /* 305 */ + { 38, 10, 12, 0, 0, }, /* 306 */ + { 38, 12, 3, 0, 0, }, /* 307 */ + { 38, 10, 5, 0, 0, }, /* 308 */ + { 38, 13, 12, 0, 0, }, /* 309 */ + { 38, 21, 12, 0, 0, }, /* 310 */ + { 38, 26, 12, 0, 0, }, /* 311 */ + { 16, 9, 12, 0, 7264, }, /* 312 */ + { 16, 7, 12, 0, 0, }, /* 313 */ + { 16, 6, 12, 0, 0, }, /* 314 */ + { 23, 7, 6, 0, 0, }, /* 315 */ + { 23, 7, 7, 0, 0, }, /* 316 */ + { 23, 7, 8, 0, 0, }, /* 317 */ + { 15, 7, 12, 0, 0, }, /* 318 */ + { 15, 12, 3, 0, 0, }, /* 319 */ + { 15, 21, 12, 0, 0, }, /* 320 */ + { 15, 15, 12, 0, 0, }, /* 321 */ + { 15, 26, 12, 0, 0, }, /* 322 */ + { 8, 
9, 12, 0, 38864, }, /* 323 */ + { 8, 9, 12, 0, 8, }, /* 324 */ + { 8, 5, 12, 0, -8, }, /* 325 */ + { 7, 17, 12, 0, 0, }, /* 326 */ + { 7, 7, 12, 0, 0, }, /* 327 */ + { 7, 21, 12, 0, 0, }, /* 328 */ + { 40, 29, 12, 0, 0, }, /* 329 */ + { 40, 7, 12, 0, 0, }, /* 330 */ + { 40, 22, 12, 0, 0, }, /* 331 */ + { 40, 18, 12, 0, 0, }, /* 332 */ + { 45, 7, 12, 0, 0, }, /* 333 */ + { 45, 14, 12, 0, 0, }, /* 334 */ + { 50, 7, 12, 0, 0, }, /* 335 */ + { 50, 12, 3, 0, 0, }, /* 336 */ + { 24, 7, 12, 0, 0, }, /* 337 */ + { 24, 12, 3, 0, 0, }, /* 338 */ + { 6, 7, 12, 0, 0, }, /* 339 */ + { 6, 12, 3, 0, 0, }, /* 340 */ + { 51, 7, 12, 0, 0, }, /* 341 */ + { 51, 12, 3, 0, 0, }, /* 342 */ + { 31, 7, 12, 0, 0, }, /* 343 */ + { 31, 12, 3, 0, 0, }, /* 344 */ + { 31, 10, 5, 0, 0, }, /* 345 */ + { 31, 21, 12, 0, 0, }, /* 346 */ + { 31, 6, 12, 0, 0, }, /* 347 */ + { 31, 23, 12, 0, 0, }, /* 348 */ + { 31, 13, 12, 0, 0, }, /* 349 */ + { 31, 15, 12, 0, 0, }, /* 350 */ + { 37, 21, 12, 0, 0, }, /* 351 */ + { 37, 17, 12, 0, 0, }, /* 352 */ + { 37, 12, 3, 0, 0, }, /* 353 */ + { 37, 1, 2, 0, 0, }, /* 354 */ + { 37, 13, 12, 0, 0, }, /* 355 */ + { 37, 7, 12, 0, 0, }, /* 356 */ + { 37, 6, 12, 0, 0, }, /* 357 */ + { 34, 7, 12, 0, 0, }, /* 358 */ + { 34, 12, 3, 0, 0, }, /* 359 */ + { 34, 10, 5, 0, 0, }, /* 360 */ + { 34, 26, 12, 0, 0, }, /* 361 */ + { 34, 21, 12, 0, 0, }, /* 362 */ + { 34, 13, 12, 0, 0, }, /* 363 */ + { 52, 7, 12, 0, 0, }, /* 364 */ + { 39, 7, 12, 0, 0, }, /* 365 */ + { 39, 13, 12, 0, 0, }, /* 366 */ + { 39, 15, 12, 0, 0, }, /* 367 */ + { 39, 26, 12, 0, 0, }, /* 368 */ + { 31, 26, 12, 0, 0, }, /* 369 */ + { 5, 7, 12, 0, 0, }, /* 370 */ + { 5, 12, 3, 0, 0, }, /* 371 */ + { 5, 10, 5, 0, 0, }, /* 372 */ + { 5, 21, 12, 0, 0, }, /* 373 */ + { 90, 7, 12, 0, 0, }, /* 374 */ + { 90, 10, 5, 0, 0, }, /* 375 */ + { 90, 12, 3, 0, 0, }, /* 376 */ + { 90, 10, 12, 0, 0, }, /* 377 */ + { 90, 13, 12, 0, 0, }, /* 378 */ + { 90, 21, 12, 0, 0, }, /* 379 */ + { 90, 6, 12, 0, 0, }, /* 380 */ + { 27, 11, 3, 0, 
0, }, /* 381 */ + { 61, 12, 3, 0, 0, }, /* 382 */ + { 61, 10, 5, 0, 0, }, /* 383 */ + { 61, 7, 12, 0, 0, }, /* 384 */ + { 61, 13, 12, 0, 0, }, /* 385 */ + { 61, 21, 12, 0, 0, }, /* 386 */ + { 61, 26, 12, 0, 0, }, /* 387 */ + { 75, 12, 3, 0, 0, }, /* 388 */ + { 75, 10, 5, 0, 0, }, /* 389 */ + { 75, 7, 12, 0, 0, }, /* 390 */ + { 75, 13, 12, 0, 0, }, /* 391 */ + { 92, 7, 12, 0, 0, }, /* 392 */ + { 92, 12, 3, 0, 0, }, /* 393 */ + { 92, 10, 5, 0, 0, }, /* 394 */ + { 92, 21, 12, 0, 0, }, /* 395 */ + { 69, 7, 12, 0, 0, }, /* 396 */ + { 69, 10, 5, 0, 0, }, /* 397 */ + { 69, 12, 3, 0, 0, }, /* 398 */ + { 69, 21, 12, 0, 0, }, /* 399 */ + { 69, 13, 12, 0, 0, }, /* 400 */ + { 72, 13, 12, 0, 0, }, /* 401 */ + { 72, 7, 12, 0, 0, }, /* 402 */ + { 72, 6, 12, 0, 0, }, /* 403 */ + { 72, 21, 12, 0, 0, }, /* 404 */ + { 75, 21, 12, 0, 0, }, /* 405 */ + { 9, 10, 5, 0, 0, }, /* 406 */ + { 9, 7, 12, 0, 0, }, /* 407 */ + { 12, 5, 12, 0, 0, }, /* 408 */ + { 12, 6, 12, 0, 0, }, /* 409 */ + { 33, 5, 12, 0, 35332, }, /* 410 */ + { 33, 5, 12, 0, 3814, }, /* 411 */ + { 33, 9, 12, 63, 1, }, /* 412 */ + { 33, 5, 12, 63, -1, }, /* 413 */ + { 33, 5, 12, 63, -58, }, /* 414 */ + { 33, 9, 12, 0, -7615, }, /* 415 */ + { 19, 5, 12, 0, 8, }, /* 416 */ + { 19, 9, 12, 0, -8, }, /* 417 */ + { 19, 5, 12, 0, 74, }, /* 418 */ + { 19, 5, 12, 0, 86, }, /* 419 */ + { 19, 5, 12, 0, 100, }, /* 420 */ + { 19, 5, 12, 0, 128, }, /* 421 */ + { 19, 5, 12, 0, 112, }, /* 422 */ + { 19, 5, 12, 0, 126, }, /* 423 */ + { 19, 8, 12, 0, -8, }, /* 424 */ + { 19, 5, 12, 0, 9, }, /* 425 */ + { 19, 9, 12, 0, -74, }, /* 426 */ + { 19, 8, 12, 0, -9, }, /* 427 */ + { 19, 5, 12, 21, -7173, }, /* 428 */ + { 19, 9, 12, 0, -86, }, /* 429 */ + { 19, 9, 12, 0, -100, }, /* 430 */ + { 19, 9, 12, 0, -112, }, /* 431 */ + { 19, 9, 12, 0, -128, }, /* 432 */ + { 19, 9, 12, 0, -126, }, /* 433 */ + { 27, 1, 3, 0, 0, }, /* 434 */ + { 9, 27, 2, 0, 0, }, /* 435 */ + { 9, 28, 2, 0, 0, }, /* 436 */ + { 9, 2, 2, 0, 0, }, /* 437 */ + { 9, 9, 12, 0, 0, }, /* 
438 */ + { 9, 5, 12, 0, 0, }, /* 439 */ + { 19, 9, 12, 67, -7517, }, /* 440 */ + { 33, 9, 12, 71, -8383, }, /* 441 */ + { 33, 9, 12, 75, -8262, }, /* 442 */ + { 33, 9, 12, 0, 28, }, /* 443 */ + { 33, 5, 12, 0, -28, }, /* 444 */ + { 33, 14, 12, 0, 16, }, /* 445 */ + { 33, 14, 12, 0, -16, }, /* 446 */ + { 33, 14, 12, 0, 0, }, /* 447 */ + { 9, 26, 12, 0, 26, }, /* 448 */ + { 9, 26, 12, 0, -26, }, /* 449 */ + { 4, 26, 12, 0, 0, }, /* 450 */ + { 17, 9, 12, 0, 48, }, /* 451 */ + { 17, 5, 12, 0, -48, }, /* 452 */ + { 33, 9, 12, 0, -10743, }, /* 453 */ + { 33, 9, 12, 0, -3814, }, /* 454 */ + { 33, 9, 12, 0, -10727, }, /* 455 */ + { 33, 5, 12, 0, -10795, }, /* 456 */ + { 33, 5, 12, 0, -10792, }, /* 457 */ + { 33, 9, 12, 0, -10780, }, /* 458 */ + { 33, 9, 12, 0, -10749, }, /* 459 */ + { 33, 9, 12, 0, -10783, }, /* 460 */ + { 33, 9, 12, 0, -10782, }, /* 461 */ + { 33, 9, 12, 0, -10815, }, /* 462 */ + { 10, 5, 12, 0, 0, }, /* 463 */ + { 10, 26, 12, 0, 0, }, /* 464 */ + { 10, 12, 3, 0, 0, }, /* 465 */ + { 10, 21, 12, 0, 0, }, /* 466 */ + { 10, 15, 12, 0, 0, }, /* 467 */ + { 16, 5, 12, 0, -7264, }, /* 468 */ + { 58, 7, 12, 0, 0, }, /* 469 */ + { 58, 6, 12, 0, 0, }, /* 470 */ + { 58, 21, 12, 0, 0, }, /* 471 */ + { 58, 12, 3, 0, 0, }, /* 472 */ + { 22, 26, 12, 0, 0, }, /* 473 */ + { 22, 6, 12, 0, 0, }, /* 474 */ + { 22, 14, 12, 0, 0, }, /* 475 */ + { 23, 10, 3, 0, 0, }, /* 476 */ + { 26, 7, 12, 0, 0, }, /* 477 */ + { 26, 6, 12, 0, 0, }, /* 478 */ + { 29, 7, 12, 0, 0, }, /* 479 */ + { 29, 6, 12, 0, 0, }, /* 480 */ + { 3, 7, 12, 0, 0, }, /* 481 */ + { 23, 7, 12, 0, 0, }, /* 482 */ + { 23, 26, 12, 0, 0, }, /* 483 */ + { 29, 26, 12, 0, 0, }, /* 484 */ + { 22, 7, 12, 0, 0, }, /* 485 */ + { 60, 7, 12, 0, 0, }, /* 486 */ + { 60, 6, 12, 0, 0, }, /* 487 */ + { 60, 26, 12, 0, 0, }, /* 488 */ + { 85, 7, 12, 0, 0, }, /* 489 */ + { 85, 6, 12, 0, 0, }, /* 490 */ + { 85, 21, 12, 0, 0, }, /* 491 */ + { 76, 7, 12, 0, 0, }, /* 492 */ + { 76, 6, 12, 0, 0, }, /* 493 */ + { 76, 21, 12, 0, 0, }, /* 494 
*/ + { 76, 13, 12, 0, 0, }, /* 495 */ + { 12, 7, 12, 0, 0, }, /* 496 */ + { 12, 21, 12, 0, 0, }, /* 497 */ + { 78, 7, 12, 0, 0, }, /* 498 */ + { 78, 14, 12, 0, 0, }, /* 499 */ + { 78, 12, 3, 0, 0, }, /* 500 */ + { 78, 21, 12, 0, 0, }, /* 501 */ + { 33, 9, 12, 0, -35332, }, /* 502 */ + { 33, 9, 12, 0, -42280, }, /* 503 */ + { 33, 9, 12, 0, -42308, }, /* 504 */ + { 33, 9, 12, 0, -42319, }, /* 505 */ + { 33, 9, 12, 0, -42315, }, /* 506 */ + { 33, 9, 12, 0, -42305, }, /* 507 */ + { 33, 9, 12, 0, -42258, }, /* 508 */ + { 33, 9, 12, 0, -42282, }, /* 509 */ + { 33, 9, 12, 0, -42261, }, /* 510 */ + { 33, 9, 12, 0, 928, }, /* 511 */ + { 48, 7, 12, 0, 0, }, /* 512 */ + { 48, 12, 3, 0, 0, }, /* 513 */ + { 48, 10, 5, 0, 0, }, /* 514 */ + { 48, 26, 12, 0, 0, }, /* 515 */ + { 64, 7, 12, 0, 0, }, /* 516 */ + { 64, 21, 12, 0, 0, }, /* 517 */ + { 74, 10, 5, 0, 0, }, /* 518 */ + { 74, 7, 12, 0, 0, }, /* 519 */ + { 74, 12, 3, 0, 0, }, /* 520 */ + { 74, 21, 12, 0, 0, }, /* 521 */ + { 74, 13, 12, 0, 0, }, /* 522 */ + { 68, 13, 12, 0, 0, }, /* 523 */ + { 68, 7, 12, 0, 0, }, /* 524 */ + { 68, 12, 3, 0, 0, }, /* 525 */ + { 68, 21, 12, 0, 0, }, /* 526 */ + { 73, 7, 12, 0, 0, }, /* 527 */ + { 73, 12, 3, 0, 0, }, /* 528 */ + { 73, 10, 5, 0, 0, }, /* 529 */ + { 73, 21, 12, 0, 0, }, /* 530 */ + { 83, 12, 3, 0, 0, }, /* 531 */ + { 83, 10, 5, 0, 0, }, /* 532 */ + { 83, 7, 12, 0, 0, }, /* 533 */ + { 83, 21, 12, 0, 0, }, /* 534 */ + { 83, 13, 12, 0, 0, }, /* 535 */ + { 38, 6, 12, 0, 0, }, /* 536 */ + { 67, 7, 12, 0, 0, }, /* 537 */ + { 67, 12, 3, 0, 0, }, /* 538 */ + { 67, 10, 5, 0, 0, }, /* 539 */ + { 67, 13, 12, 0, 0, }, /* 540 */ + { 67, 21, 12, 0, 0, }, /* 541 */ + { 91, 7, 12, 0, 0, }, /* 542 */ + { 91, 12, 3, 0, 0, }, /* 543 */ + { 91, 6, 12, 0, 0, }, /* 544 */ + { 91, 21, 12, 0, 0, }, /* 545 */ + { 86, 7, 12, 0, 0, }, /* 546 */ + { 86, 10, 5, 0, 0, }, /* 547 */ + { 86, 12, 3, 0, 0, }, /* 548 */ + { 86, 21, 12, 0, 0, }, /* 549 */ + { 86, 6, 12, 0, 0, }, /* 550 */ + { 33, 5, 12, 0, -928, }, 
/* 551 */ + { 8, 5, 12, 0, -38864, }, /* 552 */ + { 86, 13, 12, 0, 0, }, /* 553 */ + { 23, 7, 9, 0, 0, }, /* 554 */ + { 23, 7, 10, 0, 0, }, /* 555 */ + { 9, 4, 2, 0, 0, }, /* 556 */ + { 9, 3, 12, 0, 0, }, /* 557 */ + { 25, 25, 12, 0, 0, }, /* 558 */ + { 0, 24, 12, 0, 0, }, /* 559 */ + { 9, 6, 3, 0, 0, }, /* 560 */ + { 35, 7, 12, 0, 0, }, /* 561 */ + { 19, 14, 12, 0, 0, }, /* 562 */ + { 19, 15, 12, 0, 0, }, /* 563 */ + { 19, 26, 12, 0, 0, }, /* 564 */ + { 70, 7, 12, 0, 0, }, /* 565 */ + { 66, 7, 12, 0, 0, }, /* 566 */ + { 41, 7, 12, 0, 0, }, /* 567 */ + { 41, 15, 12, 0, 0, }, /* 568 */ + { 18, 7, 12, 0, 0, }, /* 569 */ + { 18, 14, 12, 0, 0, }, /* 570 */ + { 117, 7, 12, 0, 0, }, /* 571 */ + { 117, 12, 3, 0, 0, }, /* 572 */ + { 59, 7, 12, 0, 0, }, /* 573 */ + { 59, 21, 12, 0, 0, }, /* 574 */ + { 42, 7, 12, 0, 0, }, /* 575 */ + { 42, 21, 12, 0, 0, }, /* 576 */ + { 42, 14, 12, 0, 0, }, /* 577 */ + { 13, 9, 12, 0, 40, }, /* 578 */ + { 13, 5, 12, 0, -40, }, /* 579 */ + { 46, 7, 12, 0, 0, }, /* 580 */ + { 44, 7, 12, 0, 0, }, /* 581 */ + { 44, 13, 12, 0, 0, }, /* 582 */ + { 105, 7, 12, 0, 0, }, /* 583 */ + { 103, 7, 12, 0, 0, }, /* 584 */ + { 103, 21, 12, 0, 0, }, /* 585 */ + { 109, 7, 12, 0, 0, }, /* 586 */ + { 11, 7, 12, 0, 0, }, /* 587 */ + { 80, 7, 12, 0, 0, }, /* 588 */ + { 80, 21, 12, 0, 0, }, /* 589 */ + { 80, 15, 12, 0, 0, }, /* 590 */ + { 119, 7, 12, 0, 0, }, /* 591 */ + { 119, 26, 12, 0, 0, }, /* 592 */ + { 119, 15, 12, 0, 0, }, /* 593 */ + { 115, 7, 12, 0, 0, }, /* 594 */ + { 115, 15, 12, 0, 0, }, /* 595 */ + { 127, 7, 12, 0, 0, }, /* 596 */ + { 127, 15, 12, 0, 0, }, /* 597 */ + { 65, 7, 12, 0, 0, }, /* 598 */ + { 65, 15, 12, 0, 0, }, /* 599 */ + { 65, 21, 12, 0, 0, }, /* 600 */ + { 71, 7, 12, 0, 0, }, /* 601 */ + { 71, 21, 12, 0, 0, }, /* 602 */ + { 97, 7, 12, 0, 0, }, /* 603 */ + { 96, 7, 12, 0, 0, }, /* 604 */ + { 96, 15, 12, 0, 0, }, /* 605 */ + { 30, 7, 12, 0, 0, }, /* 606 */ + { 30, 12, 3, 0, 0, }, /* 607 */ + { 30, 15, 12, 0, 0, }, /* 608 */ + { 30, 21, 
12, 0, 0, }, /* 609 */ + { 87, 7, 12, 0, 0, }, /* 610 */ + { 87, 15, 12, 0, 0, }, /* 611 */ + { 87, 21, 12, 0, 0, }, /* 612 */ + { 116, 7, 12, 0, 0, }, /* 613 */ + { 116, 15, 12, 0, 0, }, /* 614 */ + { 111, 7, 12, 0, 0, }, /* 615 */ + { 111, 26, 12, 0, 0, }, /* 616 */ + { 111, 12, 3, 0, 0, }, /* 617 */ + { 111, 15, 12, 0, 0, }, /* 618 */ + { 111, 21, 12, 0, 0, }, /* 619 */ + { 77, 7, 12, 0, 0, }, /* 620 */ + { 77, 21, 12, 0, 0, }, /* 621 */ + { 82, 7, 12, 0, 0, }, /* 622 */ + { 82, 15, 12, 0, 0, }, /* 623 */ + { 81, 7, 12, 0, 0, }, /* 624 */ + { 81, 15, 12, 0, 0, }, /* 625 */ + { 120, 7, 12, 0, 0, }, /* 626 */ + { 120, 21, 12, 0, 0, }, /* 627 */ + { 120, 15, 12, 0, 0, }, /* 628 */ + { 88, 7, 12, 0, 0, }, /* 629 */ + { 129, 9, 12, 0, 64, }, /* 630 */ + { 129, 5, 12, 0, -64, }, /* 631 */ + { 129, 15, 12, 0, 0, }, /* 632 */ + { 0, 15, 12, 0, 0, }, /* 633 */ + { 93, 10, 5, 0, 0, }, /* 634 */ + { 93, 12, 3, 0, 0, }, /* 635 */ + { 93, 7, 12, 0, 0, }, /* 636 */ + { 93, 21, 12, 0, 0, }, /* 637 */ + { 93, 15, 12, 0, 0, }, /* 638 */ + { 93, 13, 12, 0, 0, }, /* 639 */ + { 84, 12, 3, 0, 0, }, /* 640 */ + { 84, 10, 5, 0, 0, }, /* 641 */ + { 84, 7, 12, 0, 0, }, /* 642 */ + { 84, 21, 12, 0, 0, }, /* 643 */ + { 84, 1, 2, 0, 0, }, /* 644 */ + { 100, 7, 12, 0, 0, }, /* 645 */ + { 100, 13, 12, 0, 0, }, /* 646 */ + { 95, 12, 3, 0, 0, }, /* 647 */ + { 95, 7, 12, 0, 0, }, /* 648 */ + { 95, 10, 5, 0, 0, }, /* 649 */ + { 95, 13, 12, 0, 0, }, /* 650 */ + { 95, 21, 12, 0, 0, }, /* 651 */ + { 110, 7, 12, 0, 0, }, /* 652 */ + { 110, 12, 3, 0, 0, }, /* 653 */ + { 110, 21, 12, 0, 0, }, /* 654 */ + { 99, 12, 3, 0, 0, }, /* 655 */ + { 99, 10, 5, 0, 0, }, /* 656 */ + { 99, 7, 12, 0, 0, }, /* 657 */ + { 99, 21, 12, 0, 0, }, /* 658 */ + { 99, 13, 12, 0, 0, }, /* 659 */ + { 47, 15, 12, 0, 0, }, /* 660 */ + { 107, 7, 12, 0, 0, }, /* 661 */ + { 107, 10, 5, 0, 0, }, /* 662 */ + { 107, 12, 3, 0, 0, }, /* 663 */ + { 107, 21, 12, 0, 0, }, /* 664 */ + { 128, 7, 12, 0, 0, }, /* 665 */ + { 128, 21, 12, 0, 0, 
}, /* 666 */ + { 108, 7, 12, 0, 0, }, /* 667 */ + { 108, 12, 3, 0, 0, }, /* 668 */ + { 108, 10, 5, 0, 0, }, /* 669 */ + { 108, 13, 12, 0, 0, }, /* 670 */ + { 106, 12, 3, 0, 0, }, /* 671 */ + { 106, 10, 5, 0, 0, }, /* 672 */ + { 106, 7, 12, 0, 0, }, /* 673 */ + { 106, 10, 3, 0, 0, }, /* 674 */ + { 123, 7, 12, 0, 0, }, /* 675 */ + { 123, 10, 3, 0, 0, }, /* 676 */ + { 123, 10, 5, 0, 0, }, /* 677 */ + { 123, 12, 3, 0, 0, }, /* 678 */ + { 123, 21, 12, 0, 0, }, /* 679 */ + { 123, 13, 12, 0, 0, }, /* 680 */ + { 122, 7, 12, 0, 0, }, /* 681 */ + { 122, 10, 3, 0, 0, }, /* 682 */ + { 122, 10, 5, 0, 0, }, /* 683 */ + { 122, 12, 3, 0, 0, }, /* 684 */ + { 122, 21, 12, 0, 0, }, /* 685 */ + { 113, 7, 12, 0, 0, }, /* 686 */ + { 113, 10, 5, 0, 0, }, /* 687 */ + { 113, 12, 3, 0, 0, }, /* 688 */ + { 113, 21, 12, 0, 0, }, /* 689 */ + { 113, 13, 12, 0, 0, }, /* 690 */ + { 101, 7, 12, 0, 0, }, /* 691 */ + { 101, 12, 3, 0, 0, }, /* 692 */ + { 101, 10, 5, 0, 0, }, /* 693 */ + { 101, 13, 12, 0, 0, }, /* 694 */ + { 125, 7, 12, 0, 0, }, /* 695 */ + { 125, 12, 3, 0, 0, }, /* 696 */ + { 125, 10, 5, 0, 0, }, /* 697 */ + { 125, 13, 12, 0, 0, }, /* 698 */ + { 125, 15, 12, 0, 0, }, /* 699 */ + { 125, 21, 12, 0, 0, }, /* 700 */ + { 125, 26, 12, 0, 0, }, /* 701 */ + { 124, 9, 12, 0, 32, }, /* 702 */ + { 124, 5, 12, 0, -32, }, /* 703 */ + { 124, 13, 12, 0, 0, }, /* 704 */ + { 124, 15, 12, 0, 0, }, /* 705 */ + { 124, 7, 12, 0, 0, }, /* 706 */ + { 121, 7, 12, 0, 0, }, /* 707 */ + { 62, 7, 12, 0, 0, }, /* 708 */ + { 62, 14, 12, 0, 0, }, /* 709 */ + { 62, 21, 12, 0, 0, }, /* 710 */ + { 79, 7, 12, 0, 0, }, /* 711 */ + { 126, 7, 12, 0, 0, }, /* 712 */ + { 114, 7, 12, 0, 0, }, /* 713 */ + { 114, 13, 12, 0, 0, }, /* 714 */ + { 114, 21, 12, 0, 0, }, /* 715 */ + { 102, 7, 12, 0, 0, }, /* 716 */ + { 102, 12, 3, 0, 0, }, /* 717 */ + { 102, 21, 12, 0, 0, }, /* 718 */ + { 118, 7, 12, 0, 0, }, /* 719 */ + { 118, 12, 3, 0, 0, }, /* 720 */ + { 118, 21, 12, 0, 0, }, /* 721 */ + { 118, 26, 12, 0, 0, }, /* 722 */ + { 
118, 6, 12, 0, 0, }, /* 723 */ + { 118, 13, 12, 0, 0, }, /* 724 */ + { 118, 15, 12, 0, 0, }, /* 725 */ + { 98, 7, 12, 0, 0, }, /* 726 */ + { 98, 10, 5, 0, 0, }, /* 727 */ + { 98, 12, 3, 0, 0, }, /* 728 */ + { 98, 6, 12, 0, 0, }, /* 729 */ + { 104, 7, 12, 0, 0, }, /* 730 */ + { 104, 26, 12, 0, 0, }, /* 731 */ + { 104, 12, 3, 0, 0, }, /* 732 */ + { 104, 21, 12, 0, 0, }, /* 733 */ + { 9, 10, 3, 0, 0, }, /* 734 */ + { 19, 12, 3, 0, 0, }, /* 735 */ + { 130, 26, 12, 0, 0, }, /* 736 */ + { 130, 12, 3, 0, 0, }, /* 737 */ + { 130, 21, 12, 0, 0, }, /* 738 */ + { 112, 7, 12, 0, 0, }, /* 739 */ + { 112, 15, 12, 0, 0, }, /* 740 */ + { 112, 12, 3, 0, 0, }, /* 741 */ + { 9, 26, 11, 0, 0, }, /* 742 */ + { 26, 26, 12, 0, 0, }, /* 743 */ }; const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ @@ -839,19 +863,19 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+F000 */ 123,123, 95, 95,124,125,126,127,128,128,129,130,131,132,133,134, /* U+F800 */ 135,136,137,138,139,140,141,142,143,144,145,139,146,146,147,139, /* U+10000 */ -148,149,150,151,152,153,154,155,156,139,139,139,157,139,139,139, /* U+10800 */ -158,159,160,161,162,163,164,139,139,165,139,166,167,168,139,139, /* U+11000 */ -139,169,139,139,139,170,139,139,139,139,139,139,139,139,139,139, /* U+11800 */ -171,171,171,171,171,171,171,172,173,139,139,139,139,139,139,139, /* U+12000 */ +148,149,150,151,152,153,154,155,156,157,139,139,158,139,139,139, /* U+10800 */ +159,160,161,162,163,164,165,139,139,166,139,167,168,169,170,139, /* U+11000 */ +139,171,139,139,139,172,139,139,139,139,139,139,139,139,139,139, /* U+11800 */ +173,173,173,173,173,173,173,174,175,173,176,139,139,139,139,139, /* U+12000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+12800 */ -174,174,174,174,174,174,174,174,175,139,139,139,139,139,139,139, /* U+13000 */ +177,177,177,177,177,177,177,177,178,139,139,139,139,139,139,139, /* U+13000 */ 
139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+13800 */ -139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+14000 */ +139,139,139,139,139,139,139,139,179,179,179,179,180,139,139,139, /* U+14000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+14800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+15000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+15800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+16000 */ -176,176,176,176,177,178,179,180,139,139,139,139,139,139,181,182, /* U+16800 */ +181,181,181,181,182,183,184,185,139,139,139,139,139,139,186,187, /* U+16800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+17000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+17800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+18000 */ @@ -860,16 +884,16 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+19800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1A000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1A800 */ -183,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1B000 */ -139,139,139,139,139,139,139,139,184,185,139,139,139,139,139,139, /* U+1B800 */ +188,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1B000 */ +139,139,139,139,139,139,139,139,189,190,139,139,139,139,139,139, /* U+1B800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1C000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1C800 */ - 71,186,187,188,189,139,190,139,191,192,193,194,195,196,197,198, /* U+1D000 */ -139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1D800 */ + 71,191,192,193,194,139,195,139,196,197,198,199,200,201,202,203, /* U+1D000 */ 
+204,204,204,204,205,206,139,139,139,139,139,139,139,139,139,139, /* U+1D800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1E000 */ -199,200,139,139,139,139,139,139,139,139,139,139,201,202,139,139, /* U+1E800 */ -203,204,205,206,207,139,208,209, 71,210,211,212,213,214,215,216, /* U+1F000 */ -217,218,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1F800 */ +207,208,139,139,139,139,139,139,139,139,139,139,209,210,139,139, /* U+1E800 */ +211,212,213,214,215,139, 71,216, 71, 71,217,218, 71,219,220,221, /* U+1F000 */ +222,223,224,225,139,139,139,139,139,139,139,139,139,139,139,139, /* U+1F800 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+20000 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+20800 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+21000 */ @@ -890,18 +914,18 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+28800 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+29000 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+29800 */ - 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,219, 95, 95, /* U+2A000 */ + 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,226, 95, 95, /* U+2A000 */ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+2A800 */ - 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,220, 95, /* U+2B000 */ -221,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2B800 */ -139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2C000 */ -139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2C800 */ + 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,227, 95, /* U+2B000 */ +228, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+2B800 */ + 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, /* U+2C000 */ + 95, 95, 95, 95, 
95, 95, 95, 95, 95, 95, 95, 95, 95,229,139,139, /* U+2C800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2D000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2D800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2E000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2E800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+2F000 */ - 95, 95, 95, 95,221,139,139,139,139,139,139,139,139,139,139,139, /* U+2F800 */ + 95, 95, 95, 95,230,139,139,139,139,139,139,139,139,139,139,139, /* U+2F800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+30000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+30800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+31000 */ @@ -1254,8 +1278,8 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+DE800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+DF000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+DF800 */ -222,223,224,225,223,223,223,223,223,223,223,223,223,223,223,223, /* U+E0000 */ -223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,223, /* U+E0800 */ +231,232,233,234,232,232,232,232,232,232,232,232,232,232,232,232, /* U+E0000 */ +232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232, /* U+E0800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+E1000 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+E1800 */ 139,139,139,139,139,139,139,139,139,139,139,139,139,139,139,139, /* U+E2000 */ @@ -1317,7 +1341,7 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+FE000 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+FE800 */ 
123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+FF000 */ -123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,226, /* U+FF800 */ +123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,235, /* U+FF800 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+100000 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+100800 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+101000 */ @@ -1349,10 +1373,10 @@ const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+10E000 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+10E800 */ 123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123, /* U+10F000 */ -123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,226, /* U+10F800 */ +123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,235, /* U+10F800 */ }; -const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ +const uint16_t PRIV(ucd_stage2)[] = { /* 60416 bytes, block = 128 */ /* block 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1405,533 +1429,533 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ /* block 5 */ 99, 33, 33, 99, 33, 33, 33,100, 99,101,102,102,103, 33, 33, 33, - 33, 33,104, 33, 20, 33, 33, 33, 33, 33, 33, 33, 33, 33,105, 33, + 33, 33,104, 33, 20, 33, 33, 33, 33, 33, 33, 33, 33,105,106, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, -106,106,106,106,106,106,106,106,106,107,107,107,107,107,107,107, -107,107, 14, 14, 14, 14,107,107,107,107,107,107,107,107,107,107, -107,107, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, -106,106,106,106,106, 14, 14, 14, 14, 14,108,108,107, 14,107, 14, +107,107,107,107,107,107,107,107,107,108,108,108,108,108,108,108, +108,108, 14, 14, 14, 14,108,108,108,108,108,108,108,108,108,108, +108,108, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, +107,107,107,107,107, 14, 14, 14, 14, 14,109,109,108, 14,108, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, /* block 6 */ -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,110,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -111,112,111,112,107,113,111,112,114,114,115,116,116,116, 4,117, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,111,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +112,113,112,113,108,114,112,113,115,115,116,117,117,117, 4,118, /* block 7 */ -114,114,114,114,113, 14,118, 4,119,119,119,114,120,114,121,121, -122,123,124,123,123,125,123,123,126,127,128,123,129,123,123,123, -130,131,114,132,123,123,133,123,123,134,123,123,135,136,136,136, -122,137,138,137,137,139,137,137,140,141,142,137,143,137,137,137, -144,145,146,147,137,137,148,137,137,149,137,137,150,151,151,152, -153,154,155,155,155,156,157,158,111,112,111,112,111,112,111,112, -111,112,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -161,162,163,164,165,166,167,111,112,168,111,112,122,169,169,169, +115,115,115,115,114, 14,119, 4,120,120,120,115,121,115,122,122, +123,124,125,124,124,126,124,124,127,128,129,124,130,124,124,124, +131,132,115,133,124,124,134,124,124,135,124,124,136,137,137,137, 
+123,138,139,138,138,140,138,138,141,142,143,138,144,138,138,138, +145,146,147,148,138,138,149,138,138,150,138,138,151,152,152,153, +154,155,156,156,156,157,158,159,112,113,112,113,112,113,112,113, +112,113,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +162,163,164,165,166,167,168,112,113,169,112,113,123,170,170,170, /* block 8 */ -170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170, -171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171, 171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171, 172,172,172,172,172,172,172,172,172,172,172,172,172,172,172,172, 172,172,172,172,172,172,172,172,172,172,172,172,172,172,172,172, 173,173,173,173,173,173,173,173,173,173,173,173,173,173,173,173, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, +173,173,173,173,173,173,173,173,173,173,173,173,173,173,173,173, +174,174,174,174,174,174,174,174,174,174,174,174,174,174,174,174, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, /* block 9 */ -174,175,176,177,177,109,109,177,178,178,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -179,174,175,174,175,174,175,174,175,174,175,174,175,174,175,180, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, +175,176,177,178,178,110,110,178,179,179,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, 
+180,175,176,175,176,175,176,175,176,175,176,175,176,175,176,181, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, /* block 10 */ -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -114,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181, -181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181, -181,181,181,181,181,181,181,114,114,182,183,183,183,183,183,183, -114,184,184,184,184,184,184,184,184,184,184,184,184,184,184,184, -184,184,184,184,184,184,184,184,184,184,184,184,184,184,184,184, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +115,182,182,182,182,182,182,182,182,182,182,182,182,182,182,182, +182,182,182,182,182,182,182,182,182,182,182,182,182,182,182,182, +182,182,182,182,182,182,182,115,115,183,184,184,184,184,184,184, +115,185,185,185,185,185,185,185,185,185,185,185,185,185,185,185, +185,185,185,185,185,185,185,185,185,185,185,185,185,185,185,185, /* block 11 */ -184,184,184,184,184,184,184,185,114, 4,186,114,114,187,187,188, -114,189,189,189,189,189,189,189,189,189,189,189,189,189,189,189, -189,189,189,189,189,189,189,189,189,189,189,189,189,189,189,189, -189,189,189,189,189,189,189,189,189,189,189,189,189,189,190,189, -191,189,189,191,189,189,191,189,114,114,114,114,114,114,114,114, -192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192, -192,192,192,192,192,192,192,192,192,192,192,114,114,114,114,114, -192,192,192,191,191,114,114,114,114,114,114,114,114,114,114,114, +185,185,185,185,185,185,185,186,115, 4,187,115,115,188,188,189, 
+115,190,190,190,190,190,190,190,190,190,190,190,190,190,190,190, +190,190,190,190,190,190,190,190,190,190,190,190,190,190,190,190, +190,190,190,190,190,190,190,190,190,190,190,190,190,190,191,190, +192,190,190,192,190,190,192,190,115,115,115,115,115,115,115,115, +193,193,193,193,193,193,193,193,193,193,193,193,193,193,193,193, +193,193,193,193,193,193,193,193,193,193,193,115,115,115,115,115, +193,193,193,192,192,115,115,115,115,115,115,115,115,115,115,115, /* block 12 */ -193,193,193,193,193, 22,194,194,194,195,195,196, 4,195,197,197, -198,198,198,198,198,198,198,198,198,198,198, 4, 22,114,195, 4, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -107,199,199,199,199,199,199,199,199,199,199,109,109,109,109,109, -109,109,109,109,109,109,198,198,198,198,198,198,198,198,198,198, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,195,195,195,195,199,199, -109,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, +194,194,194,194,194, 22,195,195,195,196,196,197, 4,196,198,198, +199,199,199,199,199,199,199,199,199,199,199, 4, 22,115,196, 4, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +108,200,200,200,200,200,200,200,200,200,200,110,110,110,110,110, +110,110,110,110,110,110,199,199,199,199,199,199,199,199,199,199, +201,201,201,201,201,201,201,201,201,201,196,196,196,196,200,200, +110,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 13 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,195,199,198,198,198,198,198,198,198, 22,197,198, 
-198,198,198,198,198,200,200,198,198,197,198,198,198,198,199,199, -201,201,201,201,201,201,201,201,201,201,199,199,199,197,197,199, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,196,200,199,199,199,199,199,199,199, 22,198,199, +199,199,199,199,199,202,202,199,199,198,199,199,199,199,200,200, +201,201,201,201,201,201,201,201,201,201,200,200,200,198,198,200, /* block 14 */ -202,202,202,202,202,202,202,202,202,202,202,202,202,202,114,203, -204,205,204,204,204,204,204,204,204,204,204,204,204,204,204,204, -204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204, +203,203,203,203,203,203,203,203,203,203,203,203,203,203,115,204, +205,206,205,205,205,205,205,205,205,205,205,205,205,205,205,205, 205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205, -205,205,205,205,205,205,205,205,205,205,205,114,114,204,204,204, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, +206,206,206,206,206,206,206,206,206,206,206,206,206,206,206,206, +206,206,206,206,206,206,206,206,206,206,206,115,115,205,205,205, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 15 */ -206,206,206,206,206,206,206,206,206,206,206,206,206,206,206,206, -206,206,206,206,206,206,206,206,206,206,206,206,206,206,206,206, -206,206,206,206,206,206,207,207,207,207,207,207,207,207,207,207, -207,206,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-208,208,208,208,208,208,208,208,208,208,209,209,209,209,209,209, -209,209,209,209,209,209,209,209,209,209,209,209,209,209,209,209, -209,209,209,209,209,209,209,209,209,209,209,210,210,210,210,210, -210,210,210,210,211,211,212,213,213,213,211,114,114,114,114,114, +207,207,207,207,207,207,207,207,207,207,207,207,207,207,207,207, +207,207,207,207,207,207,207,207,207,207,207,207,207,207,207,207, +207,207,207,207,207,207,208,208,208,208,208,208,208,208,208,208, +208,207,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +209,209,209,209,209,209,209,209,209,209,210,210,210,210,210,210, +210,210,210,210,210,210,210,210,210,210,210,210,210,210,210,210, +210,210,210,210,210,210,210,210,210,210,210,211,211,211,211,211, +211,211,211,211,212,212,213,214,214,214,212,115,115,115,115,115, /* block 16 */ -214,214,214,214,214,214,214,214,214,214,214,214,214,214,214,214, -214,214,214,214,214,214,215,215,215,215,216,215,215,215,215,215, -215,215,215,215,216,215,215,215,216,215,215,215,215,215,114,114, -217,217,217,217,217,217,217,217,217,217,217,217,217,217,217,114, -218,218,218,218,218,218,218,218,218,218,218,218,218,218,218,218, -218,218,218,218,218,218,218,218,218,219,219,219,114,114,220,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +215,215,215,215,215,215,215,215,215,215,215,215,215,215,215,215, +215,215,215,215,215,215,216,216,216,216,217,216,216,216,216,216, +216,216,216,216,217,216,216,216,217,216,216,216,216,216,115,115, +218,218,218,218,218,218,218,218,218,218,218,218,218,218,218,115, +219,219,219,219,219,219,219,219,219,219,219,219,219,219,219,219, +219,219,219,219,219,219,219,219,219,220,220,220,115,115,221,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 17 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,199,199,199,199,199,199,199,199,199,199,199,199,199, 199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,198,198,198,198,198,198,198,198,198,198,198,198, -198,198,198,198,198,198,198,198,198,198,198,198,198,198,198,198, /* block 18 */ -221,221,221,222,223,223,223,223,223,223,223,223,223,223,223,223, -223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,223, -223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,223, -223,223,223,223,223,223,223,223,223,223,221,222,221,223,222,222, -222,221,221,221,221,221,221,221,221,222,222,222,222,221,222,222, -223,109,109,221,221,221,221,221,223,223,223,223,223,223,223,223, -223,223,221,221, 4, 4,224,224,224,224,224,224,224,224,224,224, -225,226,223,223,223,223,223,223,223,223,223,223,223,223,223,223, +222,222,222,223,224,224,224,224,224,224,224,224,224,224,224,224, +224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224, +224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224, +224,224,224,224,224,224,224,224,224,224,222,223,222,224,223,223, +223,222,222,222,222,222,222,222,222,223,223,223,223,222,223,223, +224,110,110,222,222,222,222,222,224,224,224,224,224,224,224,224, +224,224,222,222, 4, 4,225,225,225,225,225,225,225,225,225,225, +226,227,224,224,224,224,224,224,224,224,224,224,224,224,224,224, /* block 
19 */ -227,228,229,229,114,227,227,227,227,227,227,227,227,114,114,227, -227,114,114,227,227,227,227,227,227,227,227,227,227,227,227,227, -227,227,227,227,227,227,227,227,227,114,227,227,227,227,227,227, -227,114,227,114,114,114,227,227,227,227,114,114,228,227,230,229, -229,228,228,228,228,114,114,229,229,114,114,229,229,228,227,114, -114,114,114,114,114,114,114,230,114,114,114,114,227,227,114,227, -227,227,228,228,114,114,231,231,231,231,231,231,231,231,231,231, -227,227,232,232,233,233,233,233,233,233,234,232,114,114,114,114, +228,229,230,230,115,228,228,228,228,228,228,228,228,115,115,228, +228,115,115,228,228,228,228,228,228,228,228,228,228,228,228,228, +228,228,228,228,228,228,228,228,228,115,228,228,228,228,228,228, +228,115,228,115,115,115,228,228,228,228,115,115,229,228,231,230, +230,229,229,229,229,115,115,230,230,115,115,230,230,229,228,115, +115,115,115,115,115,115,115,231,115,115,115,115,228,228,115,228, +228,228,229,229,115,115,232,232,232,232,232,232,232,232,232,232, +228,228,233,233,234,234,234,234,234,234,235,233,115,115,115,115, /* block 20 */ -114,235,235,236,114,237,237,237,237,237,237,114,114,114,114,237, -237,114,114,237,237,237,237,237,237,237,237,237,237,237,237,237, -237,237,237,237,237,237,237,237,237,114,237,237,237,237,237,237, -237,114,237,237,114,237,237,114,237,237,114,114,235,114,236,236, -236,235,235,114,114,114,114,235,235,114,114,235,235,235,114,114, -114,235,114,114,114,114,114,114,114,237,237,237,237,114,237,114, -114,114,114,114,114,114,238,238,238,238,238,238,238,238,238,238, -235,235,237,237,237,235,114,114,114,114,114,114,114,114,114,114, +115,236,236,237,115,238,238,238,238,238,238,115,115,115,115,238, +238,115,115,238,238,238,238,238,238,238,238,238,238,238,238,238, +238,238,238,238,238,238,238,238,238,115,238,238,238,238,238,238, +238,115,238,238,115,238,238,115,238,238,115,115,236,115,237,237, +237,236,236,115,115,115,115,236,236,115,115,236,236,236,115,115, 
+115,236,115,115,115,115,115,115,115,238,238,238,238,115,238,115, +115,115,115,115,115,115,239,239,239,239,239,239,239,239,239,239, +236,236,238,238,238,236,115,115,115,115,115,115,115,115,115,115, /* block 21 */ -114,239,239,240,114,241,241,241,241,241,241,241,241,241,114,241, -241,241,114,241,241,241,241,241,241,241,241,241,241,241,241,241, -241,241,241,241,241,241,241,241,241,114,241,241,241,241,241,241, -241,114,241,241,114,241,241,241,241,241,114,114,239,241,240,240, -240,239,239,239,239,239,114,239,239,240,114,240,240,239,114,114, -241,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -241,241,239,239,114,114,242,242,242,242,242,242,242,242,242,242, -243,244,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,240,240,241,115,242,242,242,242,242,242,242,242,242,115,242, +242,242,115,242,242,242,242,242,242,242,242,242,242,242,242,242, +242,242,242,242,242,242,242,242,242,115,242,242,242,242,242,242, +242,115,242,242,115,242,242,242,242,242,115,115,240,242,241,241, +241,240,240,240,240,240,115,240,240,241,115,241,241,240,115,115, +242,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +242,242,240,240,115,115,243,243,243,243,243,243,243,243,243,243, +244,245,115,115,115,115,115,115,115,242,115,115,115,115,115,115, /* block 22 */ -114,245,246,246,114,247,247,247,247,247,247,247,247,114,114,247, -247,114,114,247,247,247,247,247,247,247,247,247,247,247,247,247, -247,247,247,247,247,247,247,247,247,114,247,247,247,247,247,247, -247,114,247,247,114,247,247,247,247,247,114,114,245,247,248,245, -246,245,245,245,245,114,114,246,246,114,114,246,246,245,114,114, -114,114,114,114,114,114,245,248,114,114,114,114,247,247,114,247, -247,247,245,245,114,114,249,249,249,249,249,249,249,249,249,249, -250,247,251,251,251,251,251,251,114,114,114,114,114,114,114,114, +115,246,247,247,115,248,248,248,248,248,248,248,248,115,115,248, +248,115,115,248,248,248,248,248,248,248,248,248,248,248,248,248, 
+248,248,248,248,248,248,248,248,248,115,248,248,248,248,248,248, +248,115,248,248,115,248,248,248,248,248,115,115,246,248,249,246, +247,246,246,246,246,115,115,247,247,115,115,247,247,246,115,115, +115,115,115,115,115,115,246,249,115,115,115,115,248,248,115,248, +248,248,246,246,115,115,250,250,250,250,250,250,250,250,250,250, +251,248,252,252,252,252,252,252,115,115,115,115,115,115,115,115, /* block 23 */ -114,114,252,253,114,253,253,253,253,253,253,114,114,114,253,253, -253,114,253,253,253,253,114,114,114,253,253,114,253,114,253,253, -114,114,114,253,253,114,114,114,253,253,253,114,114,114,253,253, -253,253,253,253,253,253,253,253,253,253,114,114,114,114,254,255, -252,255,255,114,114,114,255,255,255,114,255,255,255,252,114,114, -253,114,114,114,114,114,114,254,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,256,256,256,256,256,256,256,256,256,256, -257,257,257,258,258,258,258,258,258,259,258,114,114,114,114,114, +115,115,253,254,115,254,254,254,254,254,254,115,115,115,254,254, +254,115,254,254,254,254,115,115,115,254,254,115,254,115,254,254, +115,115,115,254,254,115,115,115,254,254,254,115,115,115,254,254, +254,254,254,254,254,254,254,254,254,254,115,115,115,115,255,256, +253,256,256,115,115,115,256,256,256,115,256,256,256,253,115,115, +254,115,115,115,115,115,115,255,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,257,257,257,257,257,257,257,257,257,257, +258,258,258,259,259,259,259,259,259,260,259,115,115,115,115,115, /* block 24 */ -260,261,261,261,114,262,262,262,262,262,262,262,262,114,262,262, -262,114,262,262,262,262,262,262,262,262,262,262,262,262,262,262, -262,262,262,262,262,262,262,262,262,114,262,262,262,262,262,262, -262,262,262,262,262,262,262,262,262,262,114,114,114,262,260,260, -260,261,261,261,261,114,260,260,260,114,260,260,260,260,114,114, -114,114,114,114,114,260,260,114,262,262,114,114,114,114,114,114, -262,262,260,260,114,114,263,263,263,263,263,263,263,263,263,263, 
-114,114,114,114,114,114,114,114,264,264,264,264,264,264,264,265, +261,262,262,262,115,263,263,263,263,263,263,263,263,115,263,263, +263,115,263,263,263,263,263,263,263,263,263,263,263,263,263,263, +263,263,263,263,263,263,263,263,263,115,263,263,263,263,263,263, +263,263,263,263,263,263,263,263,263,263,115,115,115,263,261,261, +261,262,262,262,262,115,261,261,261,115,261,261,261,261,115,115, +115,115,115,115,115,261,261,115,263,263,263,115,115,115,115,115, +263,263,261,261,115,115,264,264,264,264,264,264,264,264,264,264, +115,115,115,115,115,115,115,115,265,265,265,265,265,265,265,266, /* block 25 */ -114,266,267,267,114,268,268,268,268,268,268,268,268,114,268,268, -268,114,268,268,268,268,268,268,268,268,268,268,268,268,268,268, -268,268,268,268,268,268,268,268,268,114,268,268,268,268,268,268, -268,268,268,268,114,268,268,268,268,268,114,114,266,268,267,266, -267,267,269,267,267,114,266,267,267,114,267,267,266,266,114,114, -114,114,114,114,114,269,269,114,114,114,114,114,114,114,268,114, -268,268,266,266,114,114,270,270,270,270,270,270,270,270,270,270, -114,268,268,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,267,268,268,115,269,269,269,269,269,269,269,269,115,269,269, +269,115,269,269,269,269,269,269,269,269,269,269,269,269,269,269, +269,269,269,269,269,269,269,269,269,115,269,269,269,269,269,269, +269,269,269,269,115,269,269,269,269,269,115,115,267,269,268,267, +268,268,270,268,268,115,267,268,268,115,268,268,267,267,115,115, +115,115,115,115,115,270,270,115,115,115,115,115,115,115,269,115, +269,269,267,267,115,115,271,271,271,271,271,271,271,271,271,271, +115,269,269,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 26 */ -114,271,272,272,114,273,273,273,273,273,273,273,273,114,273,273, -273,114,273,273,273,273,273,273,273,273,273,273,273,273,273,273, -273,273,273,273,273,273,273,273,273,273,273,273,273,273,273,273, -273,273,273,273,273,273,273,273,273,273,273,114,114,273,274,272, 
-272,271,271,271,271,114,272,272,272,114,272,272,272,271,273,114, -114,114,114,114,114,114,114,274,114,114,114,114,114,114,114,114, -273,273,271,271,114,114,275,275,275,275,275,275,275,275,275,275, -276,276,276,276,276,276,114,114,114,277,273,273,273,273,273,273, +115,272,273,273,115,274,274,274,274,274,274,274,274,115,274,274, +274,115,274,274,274,274,274,274,274,274,274,274,274,274,274,274, +274,274,274,274,274,274,274,274,274,274,274,274,274,274,274,274, +274,274,274,274,274,274,274,274,274,274,274,115,115,274,275,273, +273,272,272,272,272,115,273,273,273,115,273,273,273,272,274,115, +115,115,115,115,115,115,115,275,115,115,115,115,115,115,115,274, +274,274,272,272,115,115,276,276,276,276,276,276,276,276,276,276, +277,277,277,277,277,277,115,115,115,278,274,274,274,274,274,274, /* block 27 */ -114,114,278,278,114,279,279,279,279,279,279,279,279,279,279,279, -279,279,279,279,279,279,279,114,114,114,279,279,279,279,279,279, -279,279,279,279,279,279,279,279,279,279,279,279,279,279,279,279, -279,279,114,279,279,279,279,279,279,279,279,279,114,279,114,114, -279,279,279,279,279,279,279,114,114,114,280,114,114,114,114,281, -278,278,280,280,280,114,280,114,278,278,278,278,278,278,278,281, -114,114,114,114,114,114,282,282,282,282,282,282,282,282,282,282, -114,114,278,278,283,114,114,114,114,114,114,114,114,114,114,114, +115,115,279,279,115,280,280,280,280,280,280,280,280,280,280,280, +280,280,280,280,280,280,280,115,115,115,280,280,280,280,280,280, +280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280, +280,280,115,280,280,280,280,280,280,280,280,280,115,280,115,115, +280,280,280,280,280,280,280,115,115,115,281,115,115,115,115,282, +279,279,281,281,281,115,281,115,279,279,279,279,279,279,279,282, +115,115,115,115,115,115,283,283,283,283,283,283,283,283,283,283, +115,115,279,279,284,115,115,115,115,115,115,115,115,115,115,115, /* block 28 */ -114,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284, 
-284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284, -284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284, -284,285,284,286,285,285,285,285,285,285,285,114,114,114,114, 5, -284,284,284,284,284,284,287,285,285,285,285,285,285,285,285,288, -289,289,289,289,289,289,289,289,289,289,288,288,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,285,285,285,285,285,285,285,285,285,285,285,285,285,285,285, +285,285,285,285,285,285,285,285,285,285,285,285,285,285,285,285, +285,285,285,285,285,285,285,285,285,285,285,285,285,285,285,285, +285,286,285,287,286,286,286,286,286,286,286,115,115,115,115, 5, +285,285,285,285,285,285,288,286,286,286,286,286,286,286,286,289, +290,290,290,290,290,290,290,290,290,290,289,289,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 29 */ -114,290,290,114,290,114,114,290,290,114,290,114,114,290,114,114, -114,114,114,114,290,290,290,290,114,290,290,290,290,290,290,290, -114,290,290,290,114,290,114,290,114,114,290,290,114,290,290,290, -290,291,290,292,291,291,291,291,291,291,114,291,291,290,114,114, -290,290,290,290,290,114,293,114,291,291,291,291,291,291,114,114, -294,294,294,294,294,294,294,294,294,294,114,114,290,290,290,290, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,291,291,115,291,115,115,291,291,115,291,115,115,291,115,115, +115,115,115,115,291,291,291,291,115,291,291,291,291,291,291,291, +115,291,291,291,115,291,115,291,115,115,291,291,115,291,291,291, +291,292,291,293,292,292,292,292,292,292,115,292,292,291,115,115, +291,291,291,291,291,115,294,115,292,292,292,292,292,292,115,115, +295,295,295,295,295,295,295,295,295,295,115,115,291,291,291,291, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 30 */ -295,296,296,296,297,297,297,297,297,297,297,297,297,297,297,297, -297,297,297,296,297,296,296,296,298,298,296,296,296,296,296,296, -299,299,299,299,299,299,299,299,299,299,300,300,300,300,300,300, -300,300,300,300,296,298,296,298,296,298,301,302,301,302,303,303, -295,295,295,295,295,295,295,295,114,295,295,295,295,295,295,295, -295,295,295,295,295,295,295,295,295,295,295,295,295,295,295,295, -295,295,295,295,295,295,295,295,295,295,295,295,295,114,114,114, -114,298,298,298,298,298,298,298,298,298,298,298,298,298,298,303, +296,297,297,297,298,298,298,298,298,298,298,298,298,298,298,298, +298,298,298,297,298,297,297,297,299,299,297,297,297,297,297,297, +300,300,300,300,300,300,300,300,300,300,301,301,301,301,301,301, +301,301,301,301,297,299,297,299,297,299,302,303,302,303,304,304, +296,296,296,296,296,296,296,296,115,296,296,296,296,296,296,296, +296,296,296,296,296,296,296,296,296,296,296,296,296,296,296,296, +296,296,296,296,296,296,296,296,296,296,296,296,296,115,115,115, +115,299,299,299,299,299,299,299,299,299,299,299,299,299,299,304, /* block 31 */ -298,298,298,298,298,297,298,298,295,295,295,295,295,298,298,298, -298,298,298,298,298,298,298,298,114,298,298,298,298,298,298,298, -298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298, -298,298,298,298,298,298,298,298,298,298,298,298,298,114,296,296, -296,296,296,296,296,296,298,296,296,296,296,296,296,114,296,296, -297,297,297,297,297, 19, 19, 19, 19,297,297,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +299,299,299,299,299,298,299,299,296,296,296,296,296,299,299,299, +299,299,299,299,299,299,299,299,115,299,299,299,299,299,299,299, +299,299,299,299,299,299,299,299,299,299,299,299,299,299,299,299, +299,299,299,299,299,299,299,299,299,299,299,299,299,115,297,297, 
+297,297,297,297,297,297,299,297,297,297,297,297,297,115,297,297, +298,298,298,298,298, 19, 19, 19, 19,298,298,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 32 */ -304,304,304,304,304,304,304,304,304,304,304,304,304,304,304,304, -304,304,304,304,304,304,304,304,304,304,304,304,304,304,304,304, -304,304,304,304,304,304,304,304,304,304,304,305,305,306,306,306, -306,307,306,306,306,306,306,306,305,306,306,307,307,306,306,304, -308,308,308,308,308,308,308,308,308,308,309,309,309,309,309,309, -304,304,304,304,304,304,307,307,306,306,304,304,304,304,306,306, -306,304,305,305,305,304,304,305,305,305,305,305,305,305,304,304, -304,306,306,306,306,304,304,304,304,304,304,304,304,304,304,304, +305,305,305,305,305,305,305,305,305,305,305,305,305,305,305,305, +305,305,305,305,305,305,305,305,305,305,305,305,305,305,305,305, +305,305,305,305,305,305,305,305,305,305,305,306,306,307,307,307, +307,308,307,307,307,307,307,307,306,307,307,308,308,307,307,305, +309,309,309,309,309,309,309,309,309,309,310,310,310,310,310,310, +305,305,305,305,305,305,308,308,307,307,305,305,305,305,307,307, +307,305,306,306,306,305,305,306,306,306,306,306,306,306,305,305, +305,307,307,307,307,305,305,305,305,305,305,305,305,305,305,305, /* block 33 */ -304,304,306,305,307,306,306,305,305,305,305,305,305,306,304,305, -308,308,308,308,308,308,308,308,308,308,305,305,305,306,310,310, -311,311,311,311,311,311,311,311,311,311,311,311,311,311,311,311, -311,311,311,311,311,311,311,311,311,311,311,311,311,311,311,311, -311,311,311,311,311,311,114,311,114,114,114,114,114,311,114,114, +305,305,307,306,308,307,307,306,306,306,306,306,306,307,305,306, +309,309,309,309,309,309,309,309,309,309,306,306,306,307,311,311, 312,312,312,312,312,312,312,312,312,312,312,312,312,312,312,312, 312,312,312,312,312,312,312,312,312,312,312,312,312,312,312,312, -312,312,312,312,312,312,312,312,312,312,312, 
4,313,312,312,312, +312,312,312,312,312,312,115,312,115,115,115,115,115,312,115,115, +313,313,313,313,313,313,313,313,313,313,313,313,313,313,313,313, +313,313,313,313,313,313,313,313,313,313,313,313,313,313,313,313, +313,313,313,313,313,313,313,313,313,313,313, 4,314,313,313,313, /* block 34 */ -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, 315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, 315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, +316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, /* block 35 */ -315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, -315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, -315,315,315,315,315,315,315,315,316,316,316,316,316,316,316,316, -316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, -316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, -316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, 316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, 316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, +316,316,316,316,316,316,316,316,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, 
+317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, /* block 36 */ -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,114,317,317,317,317,114,114, -317,317,317,317,317,317,317,114,317,114,317,317,317,317,114,114, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,115,318,318,318,318,115,115, +318,318,318,318,318,318,318,115,318,115,318,318,318,318,115,115, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, /* block 37 */ -317,317,317,317,317,317,317,317,317,114,317,317,317,317,114,114, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,114,317,317,317,317,114,114,317,317,317,317,317,317,317,114, -317,114,317,317,317,317,114,114,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +318,318,318,318,318,318,318,318,318,115,318,318,318,318,115,115, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, 
+318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,115,318,318,318,318,115,115,318,318,318,318,318,318,318,115, +318,115,318,318,318,318,115,115,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, /* block 38 */ -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,114,317,317,317,317,114,114,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,317,317,317,317,114,114,318,318,318, -319,319,319,319,319,319,319,319,319,320,320,320,320,320,320,320, -320,320,320,320,320,320,320,320,320,320,320,320,320,114,114,114, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,115,318,318,318,318,115,115,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,318,318,318,318,115,115,319,319,319, +320,320,320,320,320,320,320,320,320,321,321,321,321,321,321,321, +321,321,321,321,321,321,321,321,321,321,321,321,321,115,115,115, /* block 39 */ -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -321,321,321,321,321,321,321,321,321,321,114,114,114,114,114,114, -322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322, -322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322, -322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322, -322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322, -322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322, 
-322,322,322,322,322,114,114,114,114,114,114,114,114,114,114,114, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +322,322,322,322,322,322,322,322,322,322,115,115,115,115,115,115, +323,323,323,323,323,323,323,323,323,323,323,323,323,323,323,323, +323,323,323,323,323,323,323,323,323,323,323,323,323,323,323,323, +323,323,323,323,323,323,323,323,323,323,323,323,323,323,323,323, +323,323,323,323,323,323,323,323,323,323,323,323,323,323,323,323, +323,323,323,323,323,323,323,323,323,323,323,323,323,323,323,323, +324,324,324,324,324,324,115,115,325,325,325,325,325,325,115,115, /* block 40 */ -323,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, +326,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, /* blockblock 42 */ -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, 
-324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,325,325,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,328,328,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, /* block 43 */ -326,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, -327,327,327,327,327,327,327,327,327,327,327,328,329,114,114,114, -330,330,330,330,330,330,330,330,330,330,330,330,330,330,330,330, -330,330,330,330,330,330,330,330,330,330,330,330,330,330,330,330, -330,330,330,330,330,330,330,330,330,330,330,330,330,330,330,330, -330,330,330,330,330,330,330,330,330,330,330,330,330,330,330,330, -330,330,330,330,330,330,330,330,330,330,330, 4, 4, 4,331,331, -331,330,330,330,330,330,330,330,330,114,114,114,114,114,114,114, +329,330,330,330,330,330,330,330,330,330,330,330,330,330,330,330, +330,330,330,330,330,330,330,330,330,330,330,331,332,115,115,115, +333,333,333,333,333,333,333,333,333,333,333,333,333,333,333,333, +333,333,333,333,333,333,333,333,333,333,333,333,333,333,333,333, +333,333,333,333,333,333,333,333,333,333,333,333,333,333,333,333, +333,333,333,333,333,333,333,333,333,333,333,333,333,333,333,333, +333,333,333,333,333,333,333,333,333,333,333, 4, 4, 4,334,334, +334,333,333,333,333,333,333,333,333,115,115,115,115,115,115,115, /* block 44 */ -332,332,332,332,332,332,332,332,332,332,332,332,332,114,332,332, 
-332,332,333,333,333,114,114,114,114,114,114,114,114,114,114,114, -334,334,334,334,334,334,334,334,334,334,334,334,334,334,334,334, -334,334,335,335,335, 4, 4,114,114,114,114,114,114,114,114,114, -336,336,336,336,336,336,336,336,336,336,336,336,336,336,336,336, -336,336,337,337,114,114,114,114,114,114,114,114,114,114,114,114, -338,338,338,338,338,338,338,338,338,338,338,338,338,114,338,338, -338,114,339,339,114,114,114,114,114,114,114,114,114,114,114,114, +335,335,335,335,335,335,335,335,335,335,335,335,335,115,335,335, +335,335,336,336,336,115,115,115,115,115,115,115,115,115,115,115, +337,337,337,337,337,337,337,337,337,337,337,337,337,337,337,337, +337,337,338,338,338, 4, 4,115,115,115,115,115,115,115,115,115, +339,339,339,339,339,339,339,339,339,339,339,339,339,339,339,339, +339,339,340,340,115,115,115,115,115,115,115,115,115,115,115,115, +341,341,341,341,341,341,341,341,341,341,341,341,341,115,341,341, +341,115,342,342,115,115,115,115,115,115,115,115,115,115,115,115, /* block 45 */ -340,340,340,340,340,340,340,340,340,340,340,340,340,340,340,340, -340,340,340,340,340,340,340,340,340,340,340,340,340,340,340,340, -340,340,340,340,340,340,340,340,340,340,340,340,340,340,340,340, -340,340,340,340,341,341,342,341,341,341,341,341,341,341,342,342, -342,342,342,342,342,342,341,342,342,341,341,341,341,341,341,341, -341,341,341,341,343,343,343,344,343,343,343,345,340,341,114,114, -346,346,346,346,346,346,346,346,346,346,114,114,114,114,114,114, -347,347,347,347,347,347,347,347,347,347,114,114,114,114,114,114, +343,343,343,343,343,343,343,343,343,343,343,343,343,343,343,343, +343,343,343,343,343,343,343,343,343,343,343,343,343,343,343,343, +343,343,343,343,343,343,343,343,343,343,343,343,343,343,343,343, +343,343,343,343,344,344,345,344,344,344,344,344,344,344,345,345, +345,345,345,345,345,345,344,345,345,344,344,344,344,344,344,344, +344,344,344,344,346,346,346,347,346,346,346,348,343,344,115,115, +349,349,349,349,349,349,349,349,349,349,115,115,115,115,115,115, 
+350,350,350,350,350,350,350,350,350,350,115,115,115,115,115,115, /* block 46 */ -348,348, 4, 4,348, 4,349,348,348,348,348,350,350,350,351,114, -352,352,352,352,352,352,352,352,352,352,114,114,114,114,114,114, -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,354,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,114,114,114,114,114,114,114,114, +351,351, 4, 4,351, 4,352,351,351,351,351,353,353,353,354,115, +355,355,355,355,355,355,355,355,355,355,115,115,115,115,115,115, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,357,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,115,115,115,115,115,115,115,115, /* block 47 */ -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353, -353,353,353,353,353,353,353,353,353,350,353,114,114,114,114,114, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,324,324,324,324,324,324,324,324,324,324, -324,324,324,324,324,324,114,114,114,114,114,114,114,114,114,114, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,356,356,356,356,356,356,356,356, +356,356,356,356,356,356,356,356,356,353,356,115,115,115,115,115, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, 
+327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,327,327,327,327,327,327,327,327,327,327, +327,327,327,327,327,327,115,115,115,115,115,115,115,115,115,115, /* block 48 */ -355,355,355,355,355,355,355,355,355,355,355,355,355,355,355,355, -355,355,355,355,355,355,355,355,355,355,355,355,355,355,355,114, -356,356,356,357,357,357,357,356,356,357,357,357,114,114,114,114, -357,357,356,357,357,357,357,357,357,356,356,356,114,114,114,114, -358,114,114,114,359,359,360,360,360,360,360,360,360,360,360,360, -361,361,361,361,361,361,361,361,361,361,361,361,361,361,361,361, -361,361,361,361,361,361,361,361,361,361,361,361,361,361,114,114, -361,361,361,361,361,114,114,114,114,114,114,114,114,114,114,114, +358,358,358,358,358,358,358,358,358,358,358,358,358,358,358,358, +358,358,358,358,358,358,358,358,358,358,358,358,358,358,358,115, +359,359,359,360,360,360,360,359,359,360,360,360,115,115,115,115, +360,360,359,360,360,360,360,360,360,359,359,359,115,115,115,115, +361,115,115,115,362,362,363,363,363,363,363,363,363,363,363,363, +364,364,364,364,364,364,364,364,364,364,364,364,364,364,364,364, +364,364,364,364,364,364,364,364,364,364,364,364,364,364,115,115, +364,364,364,364,364,115,115,115,115,115,115,115,115,115,115,115, /* block 49 */ -362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362, -362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362, -362,362,362,362,362,362,362,362,362,362,362,362,114,114,114,114, -363,363,363,363,363,364,364,364,363,363,364,363,363,363,363,363, -363,362,362,362,362,362,362,362,363,363,114,114,114,114,114,114, -365,365,365,365,365,365,365,365,365,365,366,114,114,114,367,367, -368,368,368,368,368,368,368,368,368,368,368,368,368,368,368,368, -368,368,368,368,368,368,368,368,368,368,368,368,368,368,368,368, +365,365,365,365,365,365,365,365,365,365,365,365,365,365,365,365, 
+365,365,365,365,365,365,365,365,365,365,365,365,365,365,365,365, +365,365,365,365,365,365,365,365,365,365,365,365,115,115,115,115, +365,365,365,365,365,365,365,365,365,365,365,365,365,365,365,365, +365,365,365,365,365,365,365,365,365,365,115,115,115,115,115,115, +366,366,366,366,366,366,366,366,366,366,367,115,115,115,368,368, +369,369,369,369,369,369,369,369,369,369,369,369,369,369,369,369, +369,369,369,369,369,369,369,369,369,369,369,369,369,369,369,369, /* block 50 */ -369,369,369,369,369,369,369,369,369,369,369,369,369,369,369,369, -369,369,369,369,369,369,369,370,370,371,371,370,114,114,372,372, -373,373,373,373,373,373,373,373,373,373,373,373,373,373,373,373, -373,373,373,373,373,373,373,373,373,373,373,373,373,373,373,373, -373,373,373,373,373,373,373,373,373,373,373,373,373,373,373,373, -373,373,373,373,373,374,375,374,375,375,375,375,375,375,375,114, -375,376,375,376,376,375,375,375,375,375,375,375,375,374,374,374, -374,374,374,375,375,375,375,375,375,375,375,375,375,114,114,375, +370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370, +370,370,370,370,370,370,370,371,371,372,372,371,115,115,373,373, +374,374,374,374,374,374,374,374,374,374,374,374,374,374,374,374, +374,374,374,374,374,374,374,374,374,374,374,374,374,374,374,374, +374,374,374,374,374,374,374,374,374,374,374,374,374,374,374,374, +374,374,374,374,374,375,376,375,376,376,376,376,376,376,376,115, +376,377,376,377,377,376,376,376,376,376,376,376,376,375,375,375, +375,375,375,376,376,376,376,376,376,376,376,376,376,115,115,376, /* block 51 */ -377,377,377,377,377,377,377,377,377,377,114,114,114,114,114,114, -377,377,377,377,377,377,377,377,377,377,114,114,114,114,114,114, -378,378,378,378,378,378,378,379,378,378,378,378,378,378,114,114, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,380,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +378,378,378,378,378,378,378,378,378,378,115,115,115,115,115,115, +378,378,378,378,378,378,378,378,378,378,115,115,115,115,115,115, +379,379,379,379,379,379,379,380,379,379,379,379,379,379,115,115, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,381,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 52 */ -381,381,381,381,382,383,383,383,383,383,383,383,383,383,383,383, -383,383,383,383,383,383,383,383,383,383,383,383,383,383,383,383, -383,383,383,383,383,383,383,383,383,383,383,383,383,383,383,383, -383,383,383,383,381,382,381,381,381,381,381,382,381,382,382,382, -382,382,381,382,382,383,383,383,383,383,383,383,114,114,114,114, -384,384,384,384,384,384,384,384,384,384,385,385,385,385,385,385, -385,386,386,386,386,386,386,386,386,386,386,381,381,381,381,381, -381,381,381,381,386,386,386,386,386,386,386,386,386,114,114,114, +382,382,382,382,383,384,384,384,384,384,384,384,384,384,384,384, +384,384,384,384,384,384,384,384,384,384,384,384,384,384,384,384, +384,384,384,384,384,384,384,384,384,384,384,384,384,384,384,384, +384,384,384,384,382,383,382,382,382,382,382,383,382,383,383,383, +383,383,382,383,383,384,384,384,384,384,384,384,115,115,115,115, +385,385,385,385,385,385,385,385,385,385,386,386,386,386,386,386, +386,387,387,387,387,387,387,387,387,387,387,382,382,382,382,382, +382,382,382,382,387,387,387,387,387,387,387,387,387,115,115,115, /* block 53 */ -387,387,388,389,389,389,389,389,389,389,389,389,389,389,389,389, -389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389, -389,388,387,387,387,387,388,388,387,387,388,387,387,387,389,389, 
-390,390,390,390,390,390,390,390,390,390,389,389,389,389,389,389, -391,391,391,391,391,391,391,391,391,391,391,391,391,391,391,391, -391,391,391,391,391,391,391,391,391,391,391,391,391,391,391,391, -391,391,391,391,391,391,392,393,392,392,393,393,393,392,393,392, -392,392,393,393,114,114,114,114,114,114,114,114,394,394,394,394, +388,388,389,390,390,390,390,390,390,390,390,390,390,390,390,390, +390,390,390,390,390,390,390,390,390,390,390,390,390,390,390,390, +390,389,388,388,388,388,389,389,388,388,389,388,388,388,390,390, +391,391,391,391,391,391,391,391,391,391,390,390,390,390,390,390, +392,392,392,392,392,392,392,392,392,392,392,392,392,392,392,392, +392,392,392,392,392,392,392,392,392,392,392,392,392,392,392,392, +392,392,392,392,392,392,393,394,393,393,394,394,394,393,394,393, +393,393,394,394,115,115,115,115,115,115,115,115,395,395,395,395, /* block 54 */ -395,395,395,395,395,395,395,395,395,395,395,395,395,395,395,395, -395,395,395,395,395,395,395,395,395,395,395,395,395,395,395,395, -395,395,395,395,396,396,396,396,396,396,396,396,397,397,397,397, -397,397,397,397,396,396,397,397,114,114,114,398,398,398,398,398, -399,399,399,399,399,399,399,399,399,399,114,114,114,395,395,395, -400,400,400,400,400,400,400,400,400,400,401,401,401,401,401,401, -401,401,401,401,401,401,401,401,401,401,401,401,401,401,401,401, -401,401,401,401,401,401,401,401,402,402,402,402,402,402,403,403, +396,396,396,396,396,396,396,396,396,396,396,396,396,396,396,396, +396,396,396,396,396,396,396,396,396,396,396,396,396,396,396,396, +396,396,396,396,397,397,397,397,397,397,397,397,398,398,398,398, +398,398,398,398,397,397,398,398,115,115,115,399,399,399,399,399, +400,400,400,400,400,400,400,400,400,400,115,115,115,396,396,396, +401,401,401,401,401,401,401,401,401,401,402,402,402,402,402,402, +402,402,402,402,402,402,402,402,402,402,402,402,402,402,402,402, +402,402,402,402,402,402,402,402,403,403,403,403,403,403,404,404, /* block 55 */ 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -404,404,404,404,404,404,404,404,114,114,114,114,114,114,114,114, -109,109,109, 4,109,109,109,109,109,109,109,109,109,109,109,109, -109,405,109,109,109,109,109,109,109,406,406,406,406,109,406,406, -406,406,405,405,109,406,406,114,109,109,114,114,114,114,114,114, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +405,405,405,405,405,405,405,405,115,115,115,115,115,115,115,115, +110,110,110, 4,110,110,110,110,110,110,110,110,110,110,110,110, +110,406,110,110,110,110,110,110,110,407,407,407,407,110,407,407, +407,407,406,406,110,407,407,115,110,110,115,115,115,115,115,115, /* block 56 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, - 33, 33, 33, 33, 33, 33,122,122,122,122,122,407,106,106,106,106, -106,106,106,106,106,106,106,106,106,106,106,106,106,106,106,106, -106,106,106,106,106,106,106,106,106,106,106,106,106,106,106,106, -106,106,106,106,106,106,106,106,106,106,106,106,106,115,115,115, -115,115,106,106,106,106,115,115,115,115,115, 33, 33, 33, 33, 33, - 33, 33, 33, 33, 33, 33, 33, 33,408,409, 33, 33, 33,410, 33, 33, + 33, 33, 33, 33, 33, 33,123,123,123,123,123,408,107,107,107,107, +107,107,107,107,107,107,107,107,107,107,107,107,107,107,107,107, +107,107,107,107,107,107,107,107,107,107,107,107,107,107,107,107, +107,107,107,107,107,107,107,107,107,107,107,107,107,116,116,116, +116,116,107,107,107,107,116,116,116,116,116, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33,409,410, 33, 33, 33,411, 33, 33, /* block 
57 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, - 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,106,106,106,106,106, -106,106,106,106,106,106,106,106,106,106,106,106,106,106,106,106, -106,106,106,106,106,106,106,106,106,106,106,106,106,106,106,115, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,114,114,114,114,114,114,109,109,109,109, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,107,107,107,107,107, +107,107,107,107,107,107,107,107,107,107,107,107,107,107,107,107, +107,107,107,107,107,107,107,107,107,107,107,107,107,107,107,116, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,115,115,115,115,115,115,110,110,110,110, /* block 58 */ 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, @@ -1940,12 +1964,12 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, -411,412, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, +412,413, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, /* block 59 */ 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, - 30, 31, 30, 31, 30, 31, 33, 33, 33, 33, 33,413, 33, 33,414, 33, + 30, 31, 30, 31, 30, 31, 33, 33, 33, 33, 33,414, 33, 33,415, 33, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, @@ -1954,57 +1978,57 @@ const uint16_t 
PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, /* block 60 */ -415,415,415,415,415,415,415,415,416,416,416,416,416,416,416,416, -415,415,415,415,415,415,114,114,416,416,416,416,416,416,114,114, -415,415,415,415,415,415,415,415,416,416,416,416,416,416,416,416, -415,415,415,415,415,415,415,415,416,416,416,416,416,416,416,416, -415,415,415,415,415,415,114,114,416,416,416,416,416,416,114,114, -122,415,122,415,122,415,122,415,114,416,114,416,114,416,114,416, -415,415,415,415,415,415,415,415,416,416,416,416,416,416,416,416, -417,417,418,418,418,418,419,419,420,420,421,421,422,422,114,114, +416,416,416,416,416,416,416,416,417,417,417,417,417,417,417,417, +416,416,416,416,416,416,115,115,417,417,417,417,417,417,115,115, +416,416,416,416,416,416,416,416,417,417,417,417,417,417,417,417, +416,416,416,416,416,416,416,416,417,417,417,417,417,417,417,417, +416,416,416,416,416,416,115,115,417,417,417,417,417,417,115,115, +123,416,123,416,123,416,123,416,115,417,115,417,115,417,115,417, +416,416,416,416,416,416,416,416,417,417,417,417,417,417,417,417, +418,418,419,419,419,419,420,420,421,421,422,422,423,423,115,115, /* block 61 */ -415,415,415,415,415,415,415,415,423,423,423,423,423,423,423,423, -415,415,415,415,415,415,415,415,423,423,423,423,423,423,423,423, -415,415,415,415,415,415,415,415,423,423,423,423,423,423,423,423, -415,415,122,424,122,114,122,122,416,416,425,425,426,113,427,113, -113,113,122,424,122,114,122,122,428,428,428,428,426,113,113,113, -415,415,122,122,114,114,122,122,416,416,429,429,114,113,113,113, -415,415,122,122,122,163,122,122,416,416,430,430,168,113,113,113, -114,114,122,424,122,114,122,122,431,431,432,432,426,113,113,114, +416,416,416,416,416,416,416,416,424,424,424,424,424,424,424,424, +416,416,416,416,416,416,416,416,424,424,424,424,424,424,424,424, +416,416,416,416,416,416,416,416,424,424,424,424,424,424,424,424, +416,416,123,425,123,115,123,123,417,417,426,426,427,114,428,114, 
+114,114,123,425,123,115,123,123,429,429,429,429,427,114,114,114, +416,416,123,123,115,115,123,123,417,417,430,430,115,114,114,114, +416,416,123,123,123,164,123,123,417,417,431,431,169,114,114,114, +115,115,123,425,123,115,123,123,432,432,433,433,427,114,114,115, /* block 62 */ - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 22,433,433, 22, 22, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 22,434,434, 22, 22, 9, 9, 9, 9, 9, 9, 4, 4, 21, 25, 6, 21, 21, 25, 6, 21, - 4, 4, 4, 4, 4, 4, 4, 4,434,435, 22, 22, 22, 22, 22, 3, + 4, 4, 4, 4, 4, 4, 4, 4,435,436, 22, 22, 22, 22, 22, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21, 25, 4, 4, 4, 4, 15, 15, 4, 4, 4, 8, 6, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 4, 15, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, - 22, 22, 22, 22, 22,436, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 23,106,114,114, 23, 23, 23, 23, 23, 23, 8, 8, 8, 6, 7,106, + 22, 22, 22, 22, 22,437, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23,107,115,115, 23, 23, 23, 23, 23, 23, 8, 8, 8, 6, 7,107, /* block 63 */ - 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 6, 7,114, -106,106,106,106,106,106,106,106,106,106,106,106,106,114,114,114, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 6, 7,115, +107,107,107,107,107,107,107,107,107,107,107,107,107,115,115,115, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -109,109,109,109,109,109,109,109,109,109,109,109,109,380,380,380, -380,109,380,380,380,109,109,109,109,109,109,109,109,109,109,109, -109,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +110,110,110,110,110,110,110,110,110,110,110,110,110,381,381,381, +381,110,381,381,381,110,110,110,110,110,110,110,110,110,110,110, +110,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 64 */ - 19, 19,437, 19, 19, 19, 19,437, 19, 19,438,437,437,437,438,438, 
-437,437,437,438, 19,437, 19, 19, 8,437,437,437,437,437, 19, 19, - 19, 19, 19, 19,437, 19,439, 19,437, 19,440,441,437,437, 19,438, -437,437,442,437,438,406,406,406,406,438, 19, 19,438,438,437,437, - 8, 8, 8, 8, 8,437,438,438,438,438, 19, 8, 19, 19,443, 19, + 19, 19,438, 19, 19, 19, 19,438, 19, 19,439,438,438,438,439,439, +438,438,438,439, 19,438, 19, 19, 8,438,438,438,438,438, 19, 19, + 19, 19, 19, 19,438, 19,440, 19,438, 19,441,442,438,438, 19,439, +438,438,443,438,439,407,407,407,407,439, 19, 19,439,439,438,438, + 8, 8, 8, 8, 8,438,439,439,439,439, 19, 8, 19, 19,444, 19, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, -444,444,444,444,444,444,444,444,444,444,444,444,444,444,444,444, 445,445,445,445,445,445,445,445,445,445,445,445,445,445,445,445, +446,446,446,446,446,446,446,446,446,446,446,446,446,446,446,446, /* block 65 */ -446,446,446, 30, 31,446,446,446,446, 23,114,114,114,114,114,114, +447,447,447, 30, 31,447,447,447,447, 23, 19, 19,115,115,115,115, 8, 8, 8, 8, 8, 19, 19, 19, 19, 19, 8, 8, 19, 19, 19, 19, 8, 19, 19, 8, 19, 19, 8, 19, 19, 19, 19, 19, 19, 19, 8, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, @@ -2041,15 +2065,15 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 8, 8, 8, 8, 8, 8, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115, /* block 69 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 
19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, @@ -2057,10 +2081,10 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19,447,447,447,447,447,447,447,447,447,447, -447,447,447,447,447,447,447,447,447,447,447,447,447,447,447,447, + 19, 19, 19, 19, 19, 19,448,448,448,448,448,448,448,448,448,448, 448,448,448,448,448,448,448,448,448,448,448,448,448,448,448,448, -448,448,448,448,448,448,448,448,448,448, 23, 23, 23, 23, 23, 23, +449,449,449,449,449,449,449,449,449,449,449,449,449,449,449,449, +449,449,449,449,449,449,449,449,449,449, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, /* block 71 */ @@ -2114,14 +2138,14 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, /* blockblock 77 */ 8, 8, 8, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, @@ -2141,147 +2165,147 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 8, 8, 8, 8, 8, 19, 19, 8, 8, 8, 8, 8, 8, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* block 79 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19,114,114, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 
19,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19,114, 19, 19, 19, 19, 19, 19, - 19, 19,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19,115, 19, 19, 19, 19, 19, 19, + 19, 19,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115, 19, 19, 19, 19, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 80 */ -450,450,450,450,450,450,450,450,450,450,450,450,450,450,450,450, -450,450,450,450,450,450,450,450,450,450,450,450,450,450,450,450, -450,450,450,450,450,450,450,450,450,450,450,450,450,450,450,114, 451,451,451,451,451,451,451,451,451,451,451,451,451,451,451,451, 451,451,451,451,451,451,451,451,451,451,451,451,451,451,451,451, -451,451,451,451,451,451,451,451,451,451,451,451,451,451,451,114, - 30, 31,452,453,454,455,456, 30, 31, 30, 31, 30, 31,457,458,459, -460, 33, 30, 31, 33, 30, 31, 33, 33, 33, 33, 33,106,106,461,461, +451,451,451,451,451,451,451,451,451,451,451,451,451,451,451,115, +452,452,452,452,452,452,452,452,452,452,452,452,452,452,452,452, +452,452,452,452,452,452,452,452,452,452,452,452,452,452,452,452, +452,452,452,452,452,452,452,452,452,452,452,452,452,452,452,115, + 30, 31,453,454,455,456,457, 30, 31, 30, 31, 30, 31,458,459,460, +461, 33, 30, 31, 33, 30, 31, 33, 33, 33, 33, 33,107,107,462,462, /* block 81 */ -159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, 
-159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -159,160,159,160,159,160,159,160,159,160,159,160,159,160,159,160, -159,160,159,160,462,463,463,463,463,463,463,159,160,159,160,464, -464,464,159,160,114,114,114,114,114,465,465,465,465,466,465,465, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,160,161,160,161,160,161,160,161,160,161,160,161, +160,161,160,161,463,464,464,464,464,464,464,160,161,160,161,465, +465,465,160,161,115,115,115,115,115,466,466,466,466,467,466,466, /* block 82 */ -467,467,467,467,467,467,467,467,467,467,467,467,467,467,467,467, -467,467,467,467,467,467,467,467,467,467,467,467,467,467,467,467, -467,467,467,467,467,467,114,467,114,114,114,114,114,467,114,114, 468,468,468,468,468,468,468,468,468,468,468,468,468,468,468,468, 468,468,468,468,468,468,468,468,468,468,468,468,468,468,468,468, -468,468,468,468,468,468,468,468,468,468,468,468,468,468,468,468, -468,468,468,468,468,468,468,468,114,114,114,114,114,114,114,469, -470,114,114,114,114,114,114,114,114,114,114,114,114,114,114,471, +468,468,468,468,468,468,115,468,115,115,115,115,115,468,115,115, +469,469,469,469,469,469,469,469,469,469,469,469,469,469,469,469, +469,469,469,469,469,469,469,469,469,469,469,469,469,469,469,469, +469,469,469,469,469,469,469,469,469,469,469,469,469,469,469,469, +469,469,469,469,469,469,469,469,115,115,115,115,115,115,115,470, +471,115,115,115,115,115,115,115,115,115,115,115,115,115,115,472, /* block 83 */ -317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, -317,317,317,317,317,317,317,114,114,114,114,114,114,114,114,114, -317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,114, 
-317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,114, -317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,114, -317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,114, -177,177,177,177,177,177,177,177,177,177,177,177,177,177,177,177, -177,177,177,177,177,177,177,177,177,177,177,177,177,177,177,177, +318,318,318,318,318,318,318,318,318,318,318,318,318,318,318,318, +318,318,318,318,318,318,318,115,115,115,115,115,115,115,115,115, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,115, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,115, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,115, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,115, +178,178,178,178,178,178,178,178,178,178,178,178,178,178,178,178, +178,178,178,178,178,178,178,178,178,178,178,178,178,178,178,178, /* block 84 */ 4, 4, 21, 25, 21, 25, 4, 4, 4, 21, 25, 4, 21, 25, 4, 4, 4, 4, 4, 4, 4, 4, 4, 9, 4, 4, 9, 4, 21, 25, 4, 4, - 21, 25, 6, 7, 6, 7, 6, 7, 6, 7, 4, 4, 4, 4, 4,107, + 21, 25, 6, 7, 6, 7, 6, 7, 6, 7, 4, 4, 4, 4, 4,108, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 9, 9, 4, 4, 4, 4, - 9, 4, 6,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 9, 4, 6,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 85 */ -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,114,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, 
-472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,114,114,114,114,114,114,114,114,114,114,114,114, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,115,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,115,115,115,115,115,115,115,115,115,115,115,115, /* blockblock 87 */ -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,472,472,472,472,472,472,472,472,472,472, -472,472,472,472,472,472,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,473,473,473,473,473,473,473,473,473,473, +473,473,473,473,473,473,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115, /* block 88 */ - 3, 4, 4, 4, 19,473,406,474, 6, 7, 6, 7, 6, 7, 6, 7, + 3, 4, 4, 4, 19,474,407,475, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 19, 19, 6, 7, 6, 7, 6, 7, 6, 7, 9, 6, 7, 7, - 
19,474,474,474,474,474,474,474,474,474,109,109,109,109,475,475, - 9,107,107,107,107,107, 19, 19,474,474,474,473,406, 4, 19, 19, -114,476,476,476,476,476,476,476,476,476,476,476,476,476,476,476, -476,476,476,476,476,476,476,476,476,476,476,476,476,476,476,476, -476,476,476,476,476,476,476,476,476,476,476,476,476,476,476,476, -476,476,476,476,476,476,476,476,476,476,476,476,476,476,476,476, + 19,475,475,475,475,475,475,475,475,475,110,110,110,110,476,476, + 9,108,108,108,108,108, 19, 19,475,475,475,474,407, 4, 19, 19, +115,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477, +477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477, +477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477, +477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477, /* block 89 */ -476,476,476,476,476,476,476,476,476,476,476,476,476,476,476,476, -476,476,476,476,476,476,476,114,114,109,109, 14, 14,477,477,476, - 9,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478, 4,107,479,479,478, +477,477,477,477,477,477,477,477,477,477,477,477,477,477,477,477, +477,477,477,477,477,477,477,115,115,110,110, 14, 14,478,478,477, + 9,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479, 4,108,480,480,479, /* block 90 */ -114,114,114,114,114,480,480,480,480,480,480,480,480,480,480,480, -480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480, 
-480,480,480,480,480,480,480,480,480,480,480,480,480,480,114,114, -114,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, +115,115,115,115,115,481,481,481,481,481,481,481,481,481,481,481, 481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, +481,481,481,481,481,481,481,481,481,481,481,481,481,481,115,115, +115,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, /* block 91 */ -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,114, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,115, 19, 19, 23, 23, 23, 23, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, -480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480, -480,480,480,480,480,480,480,480,480,480,480,114,114,114,114,114, +481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, +481,481,481,481,481,481,481,481,481,481,481,115,115,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114,114,114, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, + 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115,115, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, /* block 92 */ -482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, -482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,114, +483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, 
+483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,115, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 23, 23, 23, 23, 23, 23, 23, 23, 19, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, -482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, -482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, 19, +483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, +483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, 19, /* block 93 */ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 19, 19, 19, 19, 19, 19, @@ -2289,1229 +2313,1229 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,114, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,115, /* block 94 */ -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483,483,483,483,483,483,483,483,483, -483,483,483,483,483,483,483,483, 19, 19, 19, 19, 19, 19, 19, 19, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, 
+484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +484,484,484,484,484,484,484,484, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* blockblock 96 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,114,114,114,114,114,114,114,114,114,114, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,115,115,115,115,115,115,115,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* block 97 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, 
+485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* blockblock 99 */ -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, -485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, +486,486,486,486,486,486,486,486,486,486,486,486,486,486,486,486, /* block 100 */ -485,485,485,485,485,485,485,485,485,485,485,485,485,114,114,114, -487,487,487,487,487,487,487,487,487,487,487,487,487,487,487,487, -487,487,487,487,487,487,487,487,487,487,487,487,487,487,487,487, -487,487,487,487,487,487,487,487,487,487,487,487,487,487,487,487, -487,487,487,487,487,487,487,114,114,114,114,114,114,114,114,114, +486,486,486,486,486,486,486,486,486,486,486,486,486,115,115,115, 488,488,488,488,488,488,488,488,488,488,488,488,488,488,488,488, 488,488,488,488,488,488,488,488,488,488,488,488,488,488,488,488, -488,488,488,488,488,488,488,488,489,489,489,489,489,489,490,490, 
+488,488,488,488,488,488,488,488,488,488,488,488,488,488,488,488, +488,488,488,488,488,488,488,115,115,115,115,115,115,115,115,115, +489,489,489,489,489,489,489,489,489,489,489,489,489,489,489,489, +489,489,489,489,489,489,489,489,489,489,489,489,489,489,489,489, +489,489,489,489,489,489,489,489,490,490,490,490,490,490,491,491, /* block 101 */ -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, /* block 102 */ -491,491,491,491,491,491,491,491,491,491,491,491,492,493,493,493, -491,491,491,491,491,491,491,491,491,491,491,491,491,491,491,491, -494,494,494,494,494,494,494,494,494,494,491,491,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,174,175,495,177, -178,178,178,496,177,177,177,177,177,177,177,177,177,177,496,408, 
+492,492,492,492,492,492,492,492,492,492,492,492,493,494,494,494, +492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492, +495,495,495,495,495,495,495,495,495,495,492,492,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,496,178, +179,179,179,497,178,178,178,178,178,178,178,178,178,178,497,409, /* block 103 */ -174,175,174,175,174,175,174,175,174,175,174,175,174,175,174,175, -174,175,174,175,174,175,174,175,174,175,174,175,408,408,114,177, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,498,498,498,498,498,498,498,498,498,498, -499,499,500,500,500,500,500,500,114,114,114,114,114,114,114,114, +175,176,175,176,175,176,175,176,175,176,175,176,175,176,175,176, +175,176,175,176,175,176,175,176,175,176,175,176,409,409,178,178, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,499,499,499,499,499,499,499,499,499,499, +500,500,501,501,501,501,501,501,115,115,115,115,115,115,115,115, /* block 104 */ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14,107,107,107,107,107,107,107,107,107, + 14, 14, 14, 14, 14, 14, 14,108,108,108,108,108,108,108,108,108, 14, 14, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 33, 33, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 
31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, -106, 33, 33, 33, 33, 33, 33, 33, 33, 30, 31, 30, 31,501, 30, 31, +107, 33, 33, 33, 33, 33, 33, 33, 33, 30, 31, 30, 31,502, 30, 31, /* block 105 */ - 30, 31, 30, 31, 30, 31, 30, 31,107, 14, 14, 30, 31,502, 33,114, + 30, 31, 30, 31, 30, 31, 30, 31,108, 14, 14, 30, 31,503, 33, 20, 30, 31, 30, 31, 33, 33, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, - 30, 31, 30, 31, 30, 31, 30, 31, 30, 31,503,504,505,506,114,114, -507,508,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114, 20,106,106, 33, 20, 20, 20, 20, 20, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31,504,505,506,507,115,115, +508,509,510,511, 30, 31, 30, 31,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115, 20,107,107, 33, 20, 20, 20, 20, 20, /* block 106 */ -509,509,510,509,509,509,510,509,509,509,509,510,509,509,509,509, -509,509,509,509,509,509,509,509,509,509,509,509,509,509,509,509, -509,509,509,511,511,510,510,511,512,512,512,512,114,114,114,114, - 23, 23, 23, 23, 23, 23, 19, 19, 5, 19,114,114,114,114,114,114, -513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513, -513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513, -513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513, -513,513,513,513,514,514,514,514,114,114,114,114,114,114,114,114, +512,512,513,512,512,512,513,512,512,512,512,513,512,512,512,512, +512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512, 
+512,512,512,514,514,513,513,514,515,515,515,515,115,115,115,115, + 23, 23, 23, 23, 23, 23, 19, 19, 5, 19,115,115,115,115,115,115, +516,516,516,516,516,516,516,516,516,516,516,516,516,516,516,516, +516,516,516,516,516,516,516,516,516,516,516,516,516,516,516,516, +516,516,516,516,516,516,516,516,516,516,516,516,516,516,516,516, +516,516,516,516,517,517,517,517,115,115,115,115,115,115,115,115, /* block 107 */ -515,515,516,516,516,516,516,516,516,516,516,516,516,516,516,516, -516,516,516,516,516,516,516,516,516,516,516,516,516,516,516,516, -516,516,516,516,516,516,516,516,516,516,516,516,516,516,516,516, -516,516,516,516,515,515,515,515,515,515,515,515,515,515,515,515, -515,515,515,515,517,114,114,114,114,114,114,114,114,114,518,518, -519,519,519,519,519,519,519,519,519,519,114,114,114,114,114,114, -221,221,221,221,221,221,221,221,221,221,221,221,221,221,221,221, -221,221,223,223,223,223,223,223,225,225,225,223,114,114,114,114, +518,518,519,519,519,519,519,519,519,519,519,519,519,519,519,519, +519,519,519,519,519,519,519,519,519,519,519,519,519,519,519,519, +519,519,519,519,519,519,519,519,519,519,519,519,519,519,519,519, +519,519,519,519,518,518,518,518,518,518,518,518,518,518,518,518, +518,518,518,518,520,115,115,115,115,115,115,115,115,115,521,521, +522,522,522,522,522,522,522,522,522,522,115,115,115,115,115,115, +222,222,222,222,222,222,222,222,222,222,222,222,222,222,222,222, +222,222,224,224,224,224,224,224,226,226,226,224,226,224,115,115, /* block 108 */ -520,520,520,520,520,520,520,520,520,520,521,521,521,521,521,521, -521,521,521,521,521,521,521,521,521,521,521,521,521,521,521,521, -521,521,521,521,521,521,522,522,522,522,522,522,522,522, 4,523, +523,523,523,523,523,523,523,523,523,523,524,524,524,524,524,524, 524,524,524,524,524,524,524,524,524,524,524,524,524,524,524,524, -524,524,524,524,524,524,524,525,525,525,525,525,525,525,525,525, -525,525,526,526,114,114,114,114,114,114,114,114,114,114,114,527, 
-314,314,314,314,314,314,314,314,314,314,314,314,314,314,314,314, -314,314,314,314,314,314,314,314,314,314,314,314,314,114,114,114, +524,524,524,524,524,524,525,525,525,525,525,525,525,525, 4,526, +527,527,527,527,527,527,527,527,527,527,527,527,527,527,527,527, +527,527,527,527,527,527,527,528,528,528,528,528,528,528,528,528, +528,528,529,529,115,115,115,115,115,115,115,115,115,115,115,530, +315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, +315,315,315,315,315,315,315,315,315,315,315,315,315,115,115,115, /* block 109 */ -528,528,528,529,530,530,530,530,530,530,530,530,530,530,530,530, -530,530,530,530,530,530,530,530,530,530,530,530,530,530,530,530, -530,530,530,530,530,530,530,530,530,530,530,530,530,530,530,530, -530,530,530,528,529,529,528,528,528,528,529,529,528,529,529,529, -529,531,531,531,531,531,531,531,531,531,531,531,531,531,114,107, -532,532,532,532,532,532,532,532,532,532,114,114,114,114,531,531, -304,304,304,304,304,306,533,304,304,304,304,304,304,304,304,304, -308,308,308,308,308,308,308,308,308,308,304,304,304,304,304,114, +531,531,531,532,533,533,533,533,533,533,533,533,533,533,533,533, +533,533,533,533,533,533,533,533,533,533,533,533,533,533,533,533, +533,533,533,533,533,533,533,533,533,533,533,533,533,533,533,533, +533,533,533,531,532,532,531,531,531,531,532,532,531,532,532,532, +532,534,534,534,534,534,534,534,534,534,534,534,534,534,115,108, +535,535,535,535,535,535,535,535,535,535,115,115,115,115,534,534, +305,305,305,305,305,307,536,305,305,305,305,305,305,305,305,305, +309,309,309,309,309,309,309,309,309,309,305,305,305,305,305,115, /* block 110 */ -534,534,534,534,534,534,534,534,534,534,534,534,534,534,534,534, -534,534,534,534,534,534,534,534,534,534,534,534,534,534,534,534, -534,534,534,534,534,534,534,534,534,535,535,535,535,535,535,536, -536,535,535,536,536,535,535,114,114,114,114,114,114,114,114,114, -534,534,534,535,534,534,534,534,534,534,534,534,535,536,114,114, 
-537,537,537,537,537,537,537,537,537,537,114,114,538,538,538,538, -304,304,304,304,304,304,304,304,304,304,304,304,304,304,304,304, -533,304,304,304,304,304,304,310,310,310,304,305,306,305,304,304, +537,537,537,537,537,537,537,537,537,537,537,537,537,537,537,537, +537,537,537,537,537,537,537,537,537,537,537,537,537,537,537,537, +537,537,537,537,537,537,537,537,537,538,538,538,538,538,538,539, +539,538,538,539,539,538,538,115,115,115,115,115,115,115,115,115, +537,537,537,538,537,537,537,537,537,537,537,537,538,539,115,115, +540,540,540,540,540,540,540,540,540,540,115,115,541,541,541,541, +305,305,305,305,305,305,305,305,305,305,305,305,305,305,305,305, +536,305,305,305,305,305,305,311,311,311,305,306,307,306,305,305, /* block 111 */ -539,539,539,539,539,539,539,539,539,539,539,539,539,539,539,539, -539,539,539,539,539,539,539,539,539,539,539,539,539,539,539,539, -539,539,539,539,539,539,539,539,539,539,539,539,539,539,539,539, -540,539,540,540,540,539,539,540,540,539,539,539,539,539,540,540, -539,540,539,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,539,539,541,542,542, -543,543,543,543,543,543,543,543,543,543,543,544,545,545,544,544, -546,546,543,547,547,544,545,114,114,114,114,114,114,114,114,114, +542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542, +542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542, +542,542,542,542,542,542,542,542,542,542,542,542,542,542,542,542, +543,542,543,543,543,542,542,543,543,542,542,542,542,542,543,543, +542,543,542,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,542,542,544,545,545, +546,546,546,546,546,546,546,546,546,546,546,547,548,548,547,547, +549,549,546,550,550,547,548,115,115,115,115,115,115,115,115,115, /* block 112 */ -114,317,317,317,317,317,317,114,114,317,317,317,317,317,317,114, -114,317,317,317,317,317,317,114,114,114,114,114,114,114,114,114, 
-317,317,317,317,317,317,317,114,317,317,317,317,317,317,317,114, +115,318,318,318,318,318,318,115,115,318,318,318,318,318,318,115, +115,318,318,318,318,318,318,115,115,115,115,115,115,115,115,115, +318,318,318,318,318,318,318,115,318,318,318,318,318,318,318,115, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, - 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 14,106,106,106,106, -114,114,114,114, 33,122,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 33, 33, 33,551, 33, 33, 33, 33, 33, 33, 33, 14,107,107,107,107, + 33, 33, 33, 33, 33,123,115,115,115,115,115,115,115,115,115,115, +552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, /* block 113 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -543,543,543,543,543,543,543,543,543,543,543,543,543,543,543,543, -543,543,543,543,543,543,543,543,543,543,543,543,543,543,543,543, -543,543,543,544,544,545,544,544,545,544,544,546,544,545,114,114, -548,548,548,548,548,548,548,548,548,548,114,114,114,114,114,114, +552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, +552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, +552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, +552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, +546,546,546,546,546,546,546,546,546,546,546,546,546,546,546,546, +546,546,546,546,546,546,546,546,546,546,546,546,546,546,546,546, +546,546,546,547,547,548,547,547,548,547,547,549,547,548,115,115, +553,553,553,553,553,553,553,553,553,553,115,115,115,115,115,115, /* block 114 */ -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, 
-550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, /* block 115 */ -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, 
+555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, /* block 116 */ -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, /* block 117 */ -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, 
+554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, /* block 118 */ -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, /* block 119 */ -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, 
+555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, /* block 120 */ -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -549,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,549,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,549,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +554,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,554,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,554,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, /* block 121 */ -550,550,550,550,550,550,550,550,549,550,550,550,550,550,550,550, -550,550,550,550,550,550,550,550,550,550,550,550,550,550,550,550, -550,550,550,550,114,114,114,114,114,114,114,114,114,114,114,114, -315,315,315,315,315,315,315,315,315,315,315,315,315,315,315,315, -315,315,315,315,315,315,315,114,114,114,114,316,316,316,316,316, +555,555,555,555,555,555,555,555,554,555,555,555,555,555,555,555, 
+555,555,555,555,555,555,555,555,555,555,555,555,555,555,555,555, +555,555,555,555,115,115,115,115,115,115,115,115,115,115,115,115, 316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, -316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316, -316,316,316,316,316,316,316,316,316,316,316,316,114,114,114,114, +316,316,316,316,316,316,316,115,115,115,115,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,317,317,317,317, +317,317,317,317,317,317,317,317,317,317,317,317,115,115,115,115, /* block 122 */ -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, -551,551,551,551,551,551,551,551,551,551,551,551,551,551,551,551, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, +556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, /* block 123 */ -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, 
-552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, /* block 124 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,114,114, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,115,115, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, /* block 125 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, 
-484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 126 */ - 33, 33, 33, 33, 33, 33, 33,114,114,114,114,114,114,114,114,114, -114,114,114,185,185,185,185,185,114,114,114,114,114,192,189,192, -192,192,192,192,192,192,192,192,192,553,192,192,192,192,192,192, -192,192,192,192,192,192,192,114,192,192,192,192,192,114,192,114, -192,192,114,192,192,114,192,192,192,192,192,192,192,192,192,192, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, + 33, 33, 33, 33, 33, 33, 33,115,115,115,115,115,115,115,115,115, +115,115,115,186,186,186,186,186,115,115,115,115,115,193,190,193, +193,193,193,193,193,193,193,193,193,558,193,193,193,193,193,193, +193,193,193,193,193,193,193,115,193,193,193,193,193,115,193,115, +193,193,115,193,193,115,193,193,193,193,193,193,193,193,193,193, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, 
+200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 127 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,554,554,554,554,554,554,554,554,554,554,554,554,554,554, -554,554,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,559,559,559,559,559,559,559,559,559,559,559,559,559,559, +559,559,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 128 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, 
+200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 129 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199, 7, 6, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200, 7, 6, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, /* block 130 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -114,114,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -199,199,199,199,199,199,199,199,199,199,199,199,196,197,114,114, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, 
+115,115,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +200,200,200,200,200,200,200,200,200,200,200,200,197,198,115,115, /* block 131 */ -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, - 4, 4, 4, 4, 4, 4, 4, 6, 7, 4,114,114,114,114,114,114, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,114,114, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, + 4, 4, 4, 4, 4, 4, 4, 6, 7, 4,115,115,115,115,115,115, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,178,178, 4, 9, 9, 15, 15, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 4, 4, 6, 7, 4, 4, 4, 4, 15, 15, 15, - 4, 4, 4,114, 4, 4, 4, 4, 9, 6, 7, 6, 7, 6, 7, 4, - 4, 4, 8, 9, 8, 8, 8,114, 4, 5, 4, 4,114,114,114,114, -199,199,199,199,199,114,199,199,199,199,199,199,199,199,199,199, + 4, 4, 4,115, 4, 4, 4, 4, 9, 6, 7, 6, 7, 6, 7, 4, + 4, 4, 8, 9, 8, 8, 8,115, 4, 5, 4, 4,115,115,115,115, +200,200,200,200,200,115,200,200,200,200,200,200,200,200,200,200, /* block 132 */ -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,114,114, 22, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, 
+200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,115,115, 22, /* block 133 */ -114, 4, 4, 4, 5, 4, 4, 4, 6, 7, 4, 8, 4, 9, 4, 4, +115, 4, 4, 4, 5, 4, 4, 4, 6, 7, 4, 8, 4, 9, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 8, 8, 8, 4, 4, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 4, 7, 14, 15, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 8, 7, 8, 6, - 7, 4, 6, 7, 4, 4,478,478,478,478,478,478,478,478,478,478, -107,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, + 7, 4, 6, 7, 4, 4,479,479,479,479,479,479,479,479,479,479, +108,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, /* block 134 */ -478,478,478,478,478,478,478,478,478,478,478,478,478,478,478,478, -478,478,478,478,478,478,478,478,478,478,478,478,478,478,555,555, -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,481, -481,481,481,481,481,481,481,481,481,481,481,481,481,481,481,114, -114,114,481,481,481,481,481,481,114,114,481,481,481,481,481,481, -114,114,481,481,481,481,481,481,114,114,481,481,481,114,114,114, - 5, 5, 8, 14, 19, 5, 5,114, 19, 8, 8, 8, 8, 19, 19,114, -436,436,436,436,436,436,436,436,436, 22, 22, 22, 19, 19,114,114, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,479,479, +479,479,479,479,479,479,479,479,479,479,479,479,479,479,560,560, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,482, +482,482,482,482,482,482,482,482,482,482,482,482,482,482,482,115, +115,115,482,482,482,482,482,482,115,115,482,482,482,482,482,482, 
+115,115,482,482,482,482,482,482,115,115,482,482,482,115,115,115, + 5, 5, 8, 14, 19, 5, 5,115, 19, 8, 8, 8, 8, 19, 19,115, +437,437,437,437,437,437,437,437,437, 22, 22, 22, 19, 19,115,115, /* block 135 */ -556,556,556,556,556,556,556,556,556,556,556,556,114,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,114,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,114,556,556,114,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,114,114, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +561,561,561,561,561,561,561,561,561,561,561,561,115,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,115,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,115,561,561,115,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,115,115, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 136 */ -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,556,556,556,556,556, -556,556,556,556,556,556,556,556,556,556,556,114,114,114,114,114, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, 
+561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, +561,561,561,561,561,561,561,561,561,561,561,115,115,115,115,115, /* block 137 */ - 4, 4, 4,114,114,114,114, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 4, 4, 4,115,115,115,115, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 23, 23, 23,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, -557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, -557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, -557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, -557,557,557,557,557,558,558,558,558,559,559,559,559,559,559,559, + 23, 23, 23, 23,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, +562,562,562,562,562,562,562,562,562,562,562,562,562,562,562,562, +562,562,562,562,562,562,562,562,562,562,562,562,562,562,562,562, +562,562,562,562,562,562,562,562,562,562,562,562,562,562,562,562, +562,562,562,562,562,563,563,563,563,564,564,564,564,564,564,564, /* block 138 */ -559,559,559,559,559,559,559,559,559,559,558,558,559,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114, -559,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +564,564,564,564,564,564,564,564,564,564,563,563,564,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115, +564,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,109,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,110,115,115, /* block 139 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 140 */ -560,560,560,560,560,560,560,560,560,560,560,560,560,560,560,560, -560,560,560,560,560,560,560,560,560,560,560,560,560,114,114,114, -561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, -561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, -561,561,561,561,561,561,561,561,561,561,561,561,561,561,561,561, -561,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -109, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,114,114,114,114, 
+565,565,565,565,565,565,565,565,565,565,565,565,565,565,565,565, +565,565,565,565,565,565,565,565,565,565,565,565,565,115,115,115, +566,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566, +566,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566, +566,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566, +566,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +110, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,115,115,115,115, /* block 141 */ -562,562,562,562,562,562,562,562,562,562,562,562,562,562,562,562, -562,562,562,562,562,562,562,562,562,562,562,562,562,562,562,562, -563,563,563,563,114,114,114,114,114,114,114,114,114,114,114,114, -564,564,564,564,564,564,564,564,564,564,564,564,564,564,564,564, -564,565,564,564,564,564,564,564,564,564,565,114,114,114,114,114, -566,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566, -566,566,566,566,566,566,566,566,566,566,566,566,566,566,566,566, -566,566,566,566,566,566,567,567,567,567,567,114,114,114,114,114, +567,567,567,567,567,567,567,567,567,567,567,567,567,567,567,567, +567,567,567,567,567,567,567,567,567,567,567,567,567,567,567,567, +568,568,568,568,115,115,115,115,115,115,115,115,115,115,115,115, +569,569,569,569,569,569,569,569,569,569,569,569,569,569,569,569, +569,570,569,569,569,569,569,569,569,569,570,115,115,115,115,115, +571,571,571,571,571,571,571,571,571,571,571,571,571,571,571,571, +571,571,571,571,571,571,571,571,571,571,571,571,571,571,571,571, +571,571,571,571,571,571,572,572,572,572,572,115,115,115,115,115, /* block 142 */ -568,568,568,568,568,568,568,568,568,568,568,568,568,568,568,568, -568,568,568,568,568,568,568,568,568,568,568,568,568,568,114,569, -570,570,570,570,570,570,570,570,570,570,570,570,570,570,570,570, -570,570,570,570,570,570,570,570,570,570,570,570,570,570,570,570, -570,570,570,570,114,114,114,114,570,570,570,570,570,570,570,570, 
-571,572,572,572,572,572,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +573,573,573,573,573,573,573,573,573,573,573,573,573,573,573,573, +573,573,573,573,573,573,573,573,573,573,573,573,573,573,115,574, +575,575,575,575,575,575,575,575,575,575,575,575,575,575,575,575, +575,575,575,575,575,575,575,575,575,575,575,575,575,575,575,575, +575,575,575,575,115,115,115,115,575,575,575,575,575,575,575,575, +576,577,577,577,577,577,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 143 */ -573,573,573,573,573,573,573,573,573,573,573,573,573,573,573,573, -573,573,573,573,573,573,573,573,573,573,573,573,573,573,573,573, -573,573,573,573,573,573,573,573,574,574,574,574,574,574,574,574, -574,574,574,574,574,574,574,574,574,574,574,574,574,574,574,574, -574,574,574,574,574,574,574,574,574,574,574,574,574,574,574,574, -575,575,575,575,575,575,575,575,575,575,575,575,575,575,575,575, -575,575,575,575,575,575,575,575,575,575,575,575,575,575,575,575, -575,575,575,575,575,575,575,575,575,575,575,575,575,575,575,575, +578,578,578,578,578,578,578,578,578,578,578,578,578,578,578,578, +578,578,578,578,578,578,578,578,578,578,578,578,578,578,578,578, +578,578,578,578,578,578,578,578,579,579,579,579,579,579,579,579, +579,579,579,579,579,579,579,579,579,579,579,579,579,579,579,579, +579,579,579,579,579,579,579,579,579,579,579,579,579,579,579,579, +580,580,580,580,580,580,580,580,580,580,580,580,580,580,580,580, +580,580,580,580,580,580,580,580,580,580,580,580,580,580,580,580, +580,580,580,580,580,580,580,580,580,580,580,580,580,580,580,580, /* block 144 */ -576,576,576,576,576,576,576,576,576,576,576,576,576,576,576,576, -576,576,576,576,576,576,576,576,576,576,576,576,576,576,114,114, 
-577,577,577,577,577,577,577,577,577,577,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, +581,581,581,581,581,581,581,581,581,581,581,581,581,581,115,115, +582,582,582,582,582,582,582,582,582,582,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 145 */ -578,578,578,578,578,578,578,578,578,578,578,578,578,578,578,578, -578,578,578,578,578,578,578,578,578,578,578,578,578,578,578,578, -578,578,578,578,578,578,578,578,114,114,114,114,114,114,114,114, -579,579,579,579,579,579,579,579,579,579,579,579,579,579,579,579, -579,579,579,579,579,579,579,579,579,579,579,579,579,579,579,579, -579,579,579,579,579,579,579,579,579,579,579,579,579,579,579,579, -579,579,579,579,114,114,114,114,114,114,114,114,114,114,114,580, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +583,583,583,583,583,583,583,583,583,583,583,583,583,583,583,583, +583,583,583,583,583,583,583,583,583,583,583,583,583,583,583,583, +583,583,583,583,583,583,583,583,115,115,115,115,115,115,115,115, +584,584,584,584,584,584,584,584,584,584,584,584,584,584,584,584, +584,584,584,584,584,584,584,584,584,584,584,584,584,584,584,584, +584,584,584,584,584,584,584,584,584,584,584,584,584,584,584,584, +584,584,584,584,115,115,115,115,115,115,115,115,115,115,115,585, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* 
block 146 */ -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, /* block 147 */ -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,581,114,114,114,114,114,114,114,114,114, -581,581,581,581,581,581,581,581,581,581,581,581,581,581,581,581, -581,581,581,581,581,581,114,114,114,114,114,114,114,114,114,114, -581,581,581,581,581,581,581,581,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, +586,586,586,586,586,586,586,115,115,115,115,115,115,115,115,115, +586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, 
+586,586,586,586,586,586,115,115,115,115,115,115,115,115,115,115, +586,586,586,586,586,586,586,586,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 148 */ -582,582,582,582,582,582,114,114,582,114,582,582,582,582,582,582, -582,582,582,582,582,582,582,582,582,582,582,582,582,582,582,582, -582,582,582,582,582,582,582,582,582,582,582,582,582,582,582,582, -582,582,582,582,582,582,114,582,582,114,114,114,582,114,114,582, -583,583,583,583,583,583,583,583,583,583,583,583,583,583,583,583, -583,583,583,583,583,583,114,584,585,585,585,585,585,585,585,585, -586,586,586,586,586,586,586,586,586,586,586,586,586,586,586,586, -586,586,586,586,586,586,586,587,587,588,588,588,588,588,588,588, +587,587,587,587,587,587,115,115,587,115,587,587,587,587,587,587, +587,587,587,587,587,587,587,587,587,587,587,587,587,587,587,587, +587,587,587,587,587,587,587,587,587,587,587,587,587,587,587,587, +587,587,587,587,587,587,115,587,587,115,115,115,587,115,115,587, +588,588,588,588,588,588,588,588,588,588,588,588,588,588,588,588, +588,588,588,588,588,588,115,589,590,590,590,590,590,590,590,590, +591,591,591,591,591,591,591,591,591,591,591,591,591,591,591,591, +591,591,591,591,591,591,591,592,592,593,593,593,593,593,593,593, /* block 149 */ -589,589,589,589,589,589,589,589,589,589,589,589,589,589,589,589, -589,589,589,589,589,589,589,589,589,589,589,589,589,589,589,114, -114,114,114,114,114,114,114,590,590,590,590,590,590,590,590,590, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +594,594,594,594,594,594,594,594,594,594,594,594,594,594,594,594, +594,594,594,594,594,594,594,594,594,594,594,594,594,594,594,115, 
+115,115,115,115,115,115,115,595,595,595,595,595,595,595,595,595, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +596,596,596,596,596,596,596,596,596,596,596,596,596,596,596,596, +596,596,596,115,596,596,115,115,115,115,115,597,597,597,597,597, /* block 150 */ -591,591,591,591,591,591,591,591,591,591,591,591,591,591,591,591, -591,591,591,591,591,591,592,592,592,592,592,592,114,114,114,593, -594,594,594,594,594,594,594,594,594,594,594,594,594,594,594,594, -594,594,594,594,594,594,594,594,594,594,114,114,114,114,114,595, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598, +598,598,598,598,598,598,599,599,599,599,599,599,115,115,115,600, +601,601,601,601,601,601,601,601,601,601,601,601,601,601,601,601, +601,601,601,601,601,601,601,601,601,601,115,115,115,115,115,602, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 151 */ -596,596,596,596,596,596,596,596,596,596,596,596,596,596,596,596, -596,596,596,596,596,596,596,596,596,596,596,596,596,596,596,596, -597,597,597,597,597,597,597,597,597,597,597,597,597,597,597,597, -597,597,597,597,597,597,597,597,114,114,114,114,114,114,597,597, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +603,603,603,603,603,603,603,603,603,603,603,603,603,603,603,603, +603,603,603,603,603,603,603,603,603,603,603,603,603,603,603,603, +604,604,604,604,604,604,604,604,604,604,604,604,604,604,604,604, +604,604,604,604,604,604,604,604,115,115,115,115,605,605,604,604, +605,605,605,605,605,605,605,605,605,605,605,605,605,605,605,605, +115,115,605,605,605,605,605,605,605,605,605,605,605,605,605,605, +605,605,605,605,605,605,605,605,605,605,605,605,605,605,605,605, +605,605,605,605,605,605,605,605,605,605,605,605,605,605,605,605, /* block 152 */ -598,599,599,599,114,599,599,114,114,114,114,114,599,599,599,599, -598,598,598,598,114,598,598,598,114,598,598,598,598,598,598,598, -598,598,598,598,598,598,598,598,598,598,598,598,598,598,598,598, -598,598,598,598,114,114,114,114,599,599,599,114,114,114,114,599, -600,600,600,600,600,600,600,600,114,114,114,114,114,114,114,114, -601,601,601,601,601,601,601,601,601,114,114,114,114,114,114,114, -602,602,602,602,602,602,602,602,602,602,602,602,602,602,602,602, -602,602,602,602,602,602,602,602,602,602,602,602,602,603,603,604, +606,607,607,607,115,607,607,115,115,115,115,115,607,607,607,607, +606,606,606,606,115,606,606,606,115,606,606,606,606,606,606,606, +606,606,606,606,606,606,606,606,606,606,606,606,606,606,606,606, +606,606,606,606,115,115,115,115,607,607,607,115,115,115,115,607, +608,608,608,608,608,608,608,608,115,115,115,115,115,115,115,115, +609,609,609,609,609,609,609,609,609,115,115,115,115,115,115,115, +610,610,610,610,610,610,610,610,610,610,610,610,610,610,610,610, +610,610,610,610,610,610,610,610,610,610,610,610,610,611,611,612, /* block 153 */ -605,605,605,605,605,605,605,605,605,605,605,605,605,605,605,605, -605,605,605,605,605,605,605,605,605,605,605,605,605,606,606,606, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-607,607,607,607,607,607,607,607,608,607,607,607,607,607,607,607, -607,607,607,607,607,607,607,607,607,607,607,607,607,607,607,607, -607,607,607,607,607,609,609,114,114,114,114,610,610,610,610,610, -611,611,611,611,611,611,611,114,114,114,114,114,114,114,114,114, +613,613,613,613,613,613,613,613,613,613,613,613,613,613,613,613, +613,613,613,613,613,613,613,613,613,613,613,613,613,614,614,614, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +615,615,615,615,615,615,615,615,616,615,615,615,615,615,615,615, +615,615,615,615,615,615,615,615,615,615,615,615,615,615,615,615, +615,615,615,615,615,617,617,115,115,115,115,618,618,618,618,618, +619,619,619,619,619,619,619,115,115,115,115,115,115,115,115,115, /* block 154 */ -612,612,612,612,612,612,612,612,612,612,612,612,612,612,612,612, -612,612,612,612,612,612,612,612,612,612,612,612,612,612,612,612, -612,612,612,612,612,612,612,612,612,612,612,612,612,612,612,612, -612,612,612,612,612,612,114,114,114,613,613,613,613,613,613,613, -614,614,614,614,614,614,614,614,614,614,614,614,614,614,614,614, -614,614,614,614,614,614,114,114,615,615,615,615,615,615,615,615, -616,616,616,616,616,616,616,616,616,616,616,616,616,616,616,616, -616,616,616,114,114,114,114,114,617,617,617,617,617,617,617,617, +620,620,620,620,620,620,620,620,620,620,620,620,620,620,620,620, +620,620,620,620,620,620,620,620,620,620,620,620,620,620,620,620, +620,620,620,620,620,620,620,620,620,620,620,620,620,620,620,620, +620,620,620,620,620,620,115,115,115,621,621,621,621,621,621,621, +622,622,622,622,622,622,622,622,622,622,622,622,622,622,622,622, +622,622,622,622,622,622,115,115,623,623,623,623,623,623,623,623, +624,624,624,624,624,624,624,624,624,624,624,624,624,624,624,624, +624,624,624,115,115,115,115,115,625,625,625,625,625,625,625,625, /* block 155 */ -618,618,618,618,618,618,618,618,618,618,618,618,618,618,618,618, 
-618,618,114,114,114,114,114,114,114,619,619,619,619,114,114,114, -114,114,114,114,114,114,114,114,114,620,620,620,620,620,620,620, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +626,626,626,626,626,626,626,626,626,626,626,626,626,626,626,626, +626,626,115,115,115,115,115,115,115,627,627,627,627,115,115,115, +115,115,115,115,115,115,115,115,115,628,628,628,628,628,628,628, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 156 */ -621,621,621,621,621,621,621,621,621,621,621,621,621,621,621,621, -621,621,621,621,621,621,621,621,621,621,621,621,621,621,621,621, -621,621,621,621,621,621,621,621,621,621,621,621,621,621,621,621, -621,621,621,621,621,621,621,621,621,621,621,621,621,621,621,621, -621,621,621,621,621,621,621,621,621,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629, +629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629, +629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629, +629,629,629,629,629,629,629,629,629,629,629,629,629,629,629,629, +629,629,629,629,629,629,629,629,629,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 157 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -622,622,622,622,622,622,622,622,622,622,622,622,622,622,622,622, -622,622,622,622,622,622,622,622,622,622,622,622,622,622,622,114, +630,630,630,630,630,630,630,630,630,630,630,630,630,630,630,630, +630,630,630,630,630,630,630,630,630,630,630,630,630,630,630,630, +630,630,630,630,630,630,630,630,630,630,630,630,630,630,630,630, +630,630,630,115,115,115,115,115,115,115,115,115,115,115,115,115, +631,631,631,631,631,631,631,631,631,631,631,631,631,631,631,631, +631,631,631,631,631,631,631,631,631,631,631,631,631,631,631,631, +631,631,631,631,631,631,631,631,631,631,631,631,631,631,631,631, +631,631,631,115,115,115,115,115,115,115,632,632,632,632,632,632, /* block 158 */ -623,624,623,625,625,625,625,625,625,625,625,625,625,625,625,625, -625,625,625,625,625,625,625,625,625,625,625,625,625,625,625,625, -625,625,625,625,625,625,625,625,625,625,625,625,625,625,625,625, -625,625,625,625,625,625,625,625,624,624,624,624,624,624,624,624, -624,624,624,624,624,624,624,626,626,626,626,626,626,626,114,114, -114,114,627,627,627,627,627,627,627,627,627,627,627,627,627,627, -627,627,627,627,627,627,628,628,628,628,628,628,628,628,628,628, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,624, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +633,633,633,633,633,633,633,633,633,633,633,633,633,633,633,633, +633,633,633,633,633,633,633,633,633,633,633,633,633,633,633,115, /* block 159 */ -629,629,630,631,631,631,631,631,631,631,631,631,631,631,631,631, -631,631,631,631,631,631,631,631,631,631,631,631,631,631,631,631, -631,631,631,631,631,631,631,631,631,631,631,631,631,631,631,631, -630,630,630,629,629,629,629,630,630,629,629,632,632,633,632,632, -632,632,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -634,634,634,634,634,634,634,634,634,634,634,634,634,634,634,634, -634,634,634,634,634,634,634,634,634,114,114,114,114,114,114,114, -635,635,635,635,635,635,635,635,635,635,114,114,114,114,114,114, +634,635,634,636,636,636,636,636,636,636,636,636,636,636,636,636, +636,636,636,636,636,636,636,636,636,636,636,636,636,636,636,636, +636,636,636,636,636,636,636,636,636,636,636,636,636,636,636,636, +636,636,636,636,636,636,636,636,635,635,635,635,635,635,635,635, +635,635,635,635,635,635,635,637,637,637,637,637,637,637,115,115, +115,115,638,638,638,638,638,638,638,638,638,638,638,638,638,638, +638,638,638,638,638,638,639,639,639,639,639,639,639,639,639,639, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,635, /* block 160 */ -636,636,636,637,637,637,637,637,637,637,637,637,637,637,637,637, -637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637, -637,637,637,637,637,637,637,636,636,636,636,636,638,636,636,636, -636,636,636,636,636,114,639,639,639,639,639,639,639,639,639,639, -640,640,640,640,114,114,114,114,114,114,114,114,114,114,114,114, -641,641,641,641,641,641,641,641,641,641,641,641,641,641,641,641, -641,641,641,641,641,641,641,641,641,641,641,641,641,641,641,641, -641,641,641,642,643,643,641,114,114,114,114,114,114,114,114,114, +640,640,641,642,642,642,642,642,642,642,642,642,642,642,642,642, 
+642,642,642,642,642,642,642,642,642,642,642,642,642,642,642,642, +642,642,642,642,642,642,642,642,642,642,642,642,642,642,642,642, +641,641,641,640,640,640,640,641,641,640,640,643,643,644,643,643, +643,643,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +645,645,645,645,645,645,645,645,645,645,645,645,645,645,645,645, +645,645,645,645,645,645,645,645,645,115,115,115,115,115,115,115, +646,646,646,646,646,646,646,646,646,646,115,115,115,115,115,115, /* block 161 */ -644,644,645,646,646,646,646,646,646,646,646,646,646,646,646,646, -646,646,646,646,646,646,646,646,646,646,646,646,646,646,646,646, -646,646,646,646,646,646,646,646,646,646,646,646,646,646,646,646, -646,646,646,645,645,645,644,644,644,644,644,644,644,644,644,645, -645,646,646,646,646,647,647,647,647,114,114,114,114,647,114,114, -648,648,648,648,648,648,648,648,648,648,646,114,114,114,114,114, -114,649,649,649,649,649,649,649,649,649,649,649,649,649,649,649, -649,649,649,649,649,114,114,114,114,114,114,114,114,114,114,114, +647,647,647,648,648,648,648,648,648,648,648,648,648,648,648,648, +648,648,648,648,648,648,648,648,648,648,648,648,648,648,648,648, +648,648,648,648,648,648,648,647,647,647,647,647,649,647,647,647, +647,647,647,647,647,115,650,650,650,650,650,650,650,650,650,650, +651,651,651,651,115,115,115,115,115,115,115,115,115,115,115,115, +652,652,652,652,652,652,652,652,652,652,652,652,652,652,652,652, +652,652,652,652,652,652,652,652,652,652,652,652,652,652,652,652, +652,652,652,653,654,654,652,115,115,115,115,115,115,115,115,115, /* block 162 */ -650,650,650,650,650,650,650,650,650,650,650,650,650,650,650,650, -650,650,114,650,650,650,650,650,650,650,650,650,650,650,650,650, -650,650,650,650,650,650,650,650,650,650,650,650,651,651,651,652, -652,652,651,651,652,651,652,652,653,653,653,653,653,653,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +655,655,656,657,657,657,657,657,657,657,657,657,657,657,657,657, +657,657,657,657,657,657,657,657,657,657,657,657,657,657,657,657, +657,657,657,657,657,657,657,657,657,657,657,657,657,657,657,657, +657,657,657,656,656,656,655,655,655,655,655,655,655,655,655,656, +656,657,657,657,657,658,658,658,658,658,655,655,655,658,115,115, +659,659,659,659,659,659,659,659,659,659,657,658,657,658,658,658, +115,660,660,660,660,660,660,660,660,660,660,660,660,660,660,660, +660,660,660,660,660,115,115,115,115,115,115,115,115,115,115,115, /* block 163 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -654,654,654,654,654,654,654,654,654,654,654,654,654,654,654,654, -654,654,654,654,654,654,654,654,654,654,654,654,654,654,654,654, -654,654,654,654,654,654,654,654,654,654,654,654,654,654,654,655, -656,656,656,655,655,655,655,655,655,655,655,114,114,114,114,114, -657,657,657,657,657,657,657,657,657,657,114,114,114,114,114,114, +661,661,661,661,661,661,661,661,661,661,661,661,661,661,661,661, +661,661,115,661,661,661,661,661,661,661,661,661,661,661,661,661, +661,661,661,661,661,661,661,661,661,661,661,661,662,662,662,663, +663,663,662,662,663,662,663,663,664,664,664,664,664,664,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 164 */ -114,658,659,659,114,660,660,660,660,660,660,660,660,114,114,660, -660,114,114,660,660,660,660,660,660,660,660,660,660,660,660,660, -660,660,660,660,660,660,660,660,660,114,660,660,660,660,660,660, 
-660,114,660,660,114,660,660,660,660,660,114,114,658,660,661,659, -658,659,659,659,659,114,114,659,659,114,114,659,659,659,114,114, -114,114,114,114,114,114,114,661,114,114,114,114,114,660,660,660, -660,660,659,659,114,114,658,658,658,658,658,658,658,114,114,114, -658,658,658,658,658,114,114,114,114,114,114,114,114,114,114,114, +665,665,665,665,665,665,665,115,665,115,665,665,665,665,115,665, +665,665,665,665,665,665,665,665,665,665,665,665,665,665,115,665, +665,665,665,665,665,665,665,665,665,666,115,115,115,115,115,115, +667,667,667,667,667,667,667,667,667,667,667,667,667,667,667,667, +667,667,667,667,667,667,667,667,667,667,667,667,667,667,667,667, +667,667,667,667,667,667,667,667,667,667,667,667,667,667,667,668, +669,669,669,668,668,668,668,668,668,668,668,115,115,115,115,115, +670,670,670,670,670,670,670,670,670,670,115,115,115,115,115,115, /* block 165 */ -662,662,662,662,662,662,662,662,662,662,662,662,662,662,662,662, -662,662,662,662,662,662,662,662,662,662,662,662,662,662,662,662, -662,662,662,662,662,662,662,662,662,662,662,662,662,662,662,662, -663,664,664,665,665,665,665,665,665,664,665,664,664,663,664,665, -665,664,665,665,662,662,666,662,114,114,114,114,114,114,114,114, -667,667,667,667,667,667,667,667,667,667,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +671,671,672,672,115,673,673,673,673,673,673,673,673,115,115,673, +673,115,115,673,673,673,673,673,673,673,673,673,673,673,673,673, +673,673,673,673,673,673,673,673,673,115,673,673,673,673,673,673, +673,115,673,673,115,673,673,673,673,673,115,115,671,673,674,672, +671,672,672,672,672,115,115,672,672,115,115,672,672,672,115,115, +673,115,115,115,115,115,115,674,115,115,115,115,115,673,673,673, +673,673,672,672,115,115,671,671,671,671,671,671,671,115,115,115, +671,671,671,671,671,115,115,115,115,115,115,115,115,115,115,115, /* block 166 */ 
-668,668,668,668,668,668,668,668,668,668,668,668,668,668,668,668, -668,668,668,668,668,668,668,668,668,668,668,668,668,668,668,668, -668,668,668,668,668,668,668,668,668,668,668,668,668,668,668,669, -670,670,671,671,671,671,114,114,670,670,670,670,671,671,670,671, -671,672,672,672,672,672,672,672,672,672,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +675,675,675,675,675,675,675,675,675,675,675,675,675,675,675,675, +675,675,675,675,675,675,675,675,675,675,675,675,675,675,675,675, +675,675,675,675,675,675,675,675,675,675,675,675,675,675,675,675, +676,677,677,678,678,678,678,678,678,677,678,677,677,676,677,678, +678,677,678,678,675,675,679,675,115,115,115,115,115,115,115,115, +680,680,680,680,680,680,680,680,680,680,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 167 */ -673,673,673,673,673,673,673,673,673,673,673,673,673,673,673,673, -673,673,673,673,673,673,673,673,673,673,673,673,673,673,673,673, -673,673,673,673,673,673,673,673,673,673,673,673,673,673,673,673, -674,674,674,675,675,675,675,675,675,675,675,674,674,675,674,675, -675,676,676,676,673,114,114,114,114,114,114,114,114,114,114,114, -677,677,677,677,677,677,677,677,677,677,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +681,681,681,681,681,681,681,681,681,681,681,681,681,681,681,681, +681,681,681,681,681,681,681,681,681,681,681,681,681,681,681,681, +681,681,681,681,681,681,681,681,681,681,681,681,681,681,681,682, +683,683,684,684,684,684,115,115,683,683,683,683,684,684,683,684, +684,685,685,685,685,685,685,685,685,685,685,685,685,685,685,685, +685,685,685,685,685,685,685,685,681,681,681,681,684,684,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 168 */ -678,678,678,678,678,678,678,678,678,678,678,678,678,678,678,678, -678,678,678,678,678,678,678,678,678,678,678,678,678,678,678,678, -678,678,678,678,678,678,678,678,678,678,678,679,680,679,680,680, -679,679,679,679,679,679,680,679,114,114,114,114,114,114,114,114, -681,681,681,681,681,681,681,681,681,681,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +686,686,686,686,686,686,686,686,686,686,686,686,686,686,686,686, +686,686,686,686,686,686,686,686,686,686,686,686,686,686,686,686, +686,686,686,686,686,686,686,686,686,686,686,686,686,686,686,686, +687,687,687,688,688,688,688,688,688,688,688,687,687,688,687,688, +688,689,689,689,686,115,115,115,115,115,115,115,115,115,115,115, +690,690,690,690,690,690,690,690,690,690,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 169 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -682,682,682,682,682,682,682,682,682,682,682,682,682,682,682,682, -682,682,682,682,682,682,682,682,682,682,682,682,682,682,682,682, -683,683,683,683,683,683,683,683,683,683,683,683,683,683,683,683, -683,683,683,683,683,683,683,683,683,683,683,683,683,683,683,683, -684,684,684,684,684,684,684,684,684,684,685,685,685,685,685,685, -685,685,685,114,114,114,114,114,114,114,114,114,114,114,114,686, +691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, +691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, +691,691,691,691,691,691,691,691,691,691,691,692,693,692,693,693, 
+692,692,692,692,692,692,693,692,115,115,115,115,115,115,115,115, +694,694,694,694,694,694,694,694,694,694,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 170 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -687,687,687,687,687,687,687,687,687,687,687,687,687,687,687,687, -687,687,687,687,687,687,687,687,687,687,687,687,687,687,687,687, -687,687,687,687,687,687,687,687,687,687,687,687,687,687,687,687, -687,687,687,687,687,687,687,687,687,114,114,114,114,114,114,114, +695,695,695,695,695,695,695,695,695,695,695,695,695,695,695,695, +695,695,695,695,695,695,695,695,695,695,115,115,115,696,696,696, +697,697,696,696,696,696,697,696,696,696,696,696,115,115,115,115, +698,698,698,698,698,698,698,698,698,698,699,699,700,700,700,701, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 171 */ -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +702,702,702,702,702,702,702,702,702,702,702,702,702,702,702,702, +702,702,702,702,702,702,702,702,702,702,702,702,702,702,702,702, +703,703,703,703,703,703,703,703,703,703,703,703,703,703,703,703, +703,703,703,703,703,703,703,703,703,703,703,703,703,703,703,703, +704,704,704,704,704,704,704,704,704,704,705,705,705,705,705,705, +705,705,705,115,115,115,115,115,115,115,115,115,115,115,115,706, /* block 172 */ -688,688,688,688,688,688,688,688,688,688,688,688,688,688,688,688, -688,688,688,688,688,688,688,688,688,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +707,707,707,707,707,707,707,707,707,707,707,707,707,707,707,707, +707,707,707,707,707,707,707,707,707,707,707,707,707,707,707,707, +707,707,707,707,707,707,707,707,707,707,707,707,707,707,707,707, +707,707,707,707,707,707,707,707,707,115,115,115,115,115,115,115, /* block 173 */ -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, 
-689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,689, -689,689,689,689,689,689,689,689,689,689,689,689,689,689,689,114, -690,690,690,690,690,114,114,114,114,114,114,114,114,114,114,114, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, /* block 174 */ -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 175 */ -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, -691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,691, 
-691,691,691,691,691,691,691,691,691,691,691,691,691,691,691,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, +709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,115, +710,710,710,710,710,115,115,115,115,115,115,115,115,115,115,115, /* block 176 */ -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,708,708,708,708,708,708,708,708,708,708,708,708, +708,708,708,708,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* 
block 177 */ -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,497,497,497,497,497,497,497, -497,497,497,497,497,497,497,497,497,114,114,114,114,114,114,114, -692,692,692,692,692,692,692,692,692,692,692,692,692,692,692,692, -692,692,692,692,692,692,692,692,692,692,692,692,692,692,692,114, -693,693,693,693,693,693,693,693,693,693,114,114,114,114,694,694, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, /* block 178 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -695,695,695,695,695,695,695,695,695,695,695,695,695,695,695,695, -695,695,695,695,695,695,695,695,695,695,695,695,695,695,114,114, -696,696,696,696,696,697,114,114,114,114,114,114,114,114,114,114, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,711, +711,711,711,711,711,711,711,711,711,711,711,711,711,711,711,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 179 */ -698,698,698,698,698,698,698,698,698,698,698,698,698,698,698,698, -698,698,698,698,698,698,698,698,698,698,698,698,698,698,698,698, -698,698,698,698,698,698,698,698,698,698,698,698,698,698,698,698, -699,699,699,699,699,699,699,700,700,700,700,700,701,701,701,701, -702,702,702,702,700,701,114,114,114,114,114,114,114,114,114,114, -703,703,703,703,703,703,703,703,703,703,114,704,704,704,704,704, -704,704,114,698,698,698,698,698,698,698,698,698,698,698,698,698, -698,698,698,698,698,698,698,698,114,114,114,114,114,698,698,698, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, /* block 180 */ -698,698,698,698,698,698,698,698,698,698,698,698,698,698,698,698, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, 
+712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,712,712,712,712,712,712,712,712,712, +712,712,712,712,712,712,712,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 181 */ -705,705,705,705,705,705,705,705,705,705,705,705,705,705,705,705, -705,705,705,705,705,705,705,705,705,705,705,705,705,705,705,705, -705,705,705,705,705,705,705,705,705,705,705,705,705,705,705,705, -705,705,705,705,705,705,705,705,705,705,705,705,705,705,705,705, -705,705,705,705,705,114,114,114,114,114,114,114,114,114,114,114, -705,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706, -706,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706, -706,706,706,706,706,706,706,706,706,706,706,706,706,706,706,114, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, /* block 182 */ -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,707, -707,707,707,708,708,708,708,708,708,708,708,708,708,708,708,708, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,498,498,498,498,498,498,498, +498,498,498,498,498,498,498,498,498,115,115,115,115,115,115,115, +713,713,713,713,713,713,713,713,713,713,713,713,713,713,713,713, +713,713,713,713,713,713,713,713,713,713,713,713,713,713,713,115, +714,714,714,714,714,714,714,714,714,714,115,115,115,115,715,715, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 183 */ -478,476,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716, +716,716,716,716,716,716,716,716,716,716,716,716,716,716,115,115, +717,717,717,717,717,718,115,115,115,115,115,115,115,115,115,115, /* block 184 */ -709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, -709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, -709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, -709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, 
-709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, -709,709,709,709,709,709,709,709,709,709,709,709,709,709,709,709, -709,709,709,709,709,709,709,709,709,709,709,114,114,114,114,114, -709,709,709,709,709,709,709,709,709,709,709,709,709,114,114,114, +719,719,719,719,719,719,719,719,719,719,719,719,719,719,719,719, +719,719,719,719,719,719,719,719,719,719,719,719,719,719,719,719, +719,719,719,719,719,719,719,719,719,719,719,719,719,719,719,719, +720,720,720,720,720,720,720,721,721,721,721,721,722,722,722,722, +723,723,723,723,721,722,115,115,115,115,115,115,115,115,115,115, +724,724,724,724,724,724,724,724,724,724,115,725,725,725,725,725, +725,725,115,719,719,719,719,719,719,719,719,719,719,719,719,719, +719,719,719,719,719,719,719,719,115,115,115,115,115,719,719,719, /* block 185 */ -709,709,709,709,709,709,709,709,709,114,114,114,114,114,114,114, -709,709,709,709,709,709,709,709,709,709,114,114,710,711,711,712, - 22, 22, 22, 22,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +719,719,719,719,719,719,719,719,719,719,719,719,719,719,719,719, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 186 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114, +726,726,726,726,726,726,726,726,726,726,726,726,726,726,726,726, +726,726,726,726,726,726,726,726,726,726,726,726,726,726,726,726, +726,726,726,726,726,726,726,726,726,726,726,726,726,726,726,726, +726,726,726,726,726,726,726,726,726,726,726,726,726,726,726,726, +726,726,726,726,726,115,115,115,115,115,115,115,115,115,115,115, +726,727,727,727,727,727,727,727,727,727,727,727,727,727,727,727, +727,727,727,727,727,727,727,727,727,727,727,727,727,727,727,727, +727,727,727,727,727,727,727,727,727,727,727,727,727,727,727,115, /* block 187 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19,114,114, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19,713,405,109,109,109, 19, 19, 19,405,713,713, -713,713,713, 22, 22, 22, 22, 22, 22, 22, 22,109,109,109,109,109, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,728, +728,728,728,729,729,729,729,729,729,729,729,729,729,729,729,729, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 188 */ -109,109,109, 19, 19,109,109,109,109,109,109,109, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,109,109,109,109, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +479,477,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 189 */ -559,559,559,559,559,559,559,559,559,559,559,559,559,559,559,559, -559,559,559,559,559,559,559,559,559,559,559,559,559,559,559,559, -559,559,559,559,559,559,559,559,559,559,559,559,559,559,559,559, -559,559,559,559,559,559,559,559,559,559,559,559,559,559,559,559, -559,559,714,714,714,559,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, +730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, +730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, +730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, 
+730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, +730,730,730,730,730,730,730,730,730,730,730,730,730,730,730,730, +730,730,730,730,730,730,730,730,730,730,730,115,115,115,115,115, +730,730,730,730,730,730,730,730,730,730,730,730,730,115,115,115, /* block 190 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114,114, - 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 23,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +730,730,730,730,730,730,730,730,730,115,115,115,115,115,115,115, +730,730,730,730,730,730,730,730,730,730,115,115,731,732,732,733, + 22, 22, 22, 22,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 191 */ -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,438,438, -438,438,438,438,438,114,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115, /* block 192 */ -437,437,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,437,114,437,437, -114,114,437,114,114,437,437,114,114,437,437,437,437,114,437,437, -437,437,437,437,437,437,438,438,438,438,114,438,114,438,438,438, -438,438,438,438,114,438,438,438,438,438,438,438,438,438,438,438, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19,115,115, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19,734,406,110,110,110, 19, 19, 19,406,734,734, +734,734,734, 22, 22, 22, 22, 22, 22, 22, 22,110,110,110,110,110, /* block 193 */ -438,438,438,438,437,437,114,437,437,437,437,114,114,437,437,437, -437,437,437,437,437,114,437,437,437,437,437,437,437,114,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,437,437,114,437,437,437,437,114, -437,437,437,437,437,114,437,114,114,114,437,437,437,437,437,437, -437,114,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,437,437,437,437, 
-437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +110,110,110, 19, 19,110,110,110,110,110,110,110, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,110,110,110,110, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 194 */ -437,437,437,437,437,437,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +564,564,564,564,564,564,564,564,564,564,564,564,564,564,564,564, +564,564,564,564,564,564,564,564,564,564,564,564,564,564,564,564, +564,564,564,564,564,564,564,564,564,564,564,564,564,564,564,564, +564,564,564,564,564,564,564,564,564,564,564,564,564,564,564,564, +564,564,735,735,735,564,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 195 */ -438,438,438,438,438,438,438,438,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,437,437,437,437, 
-437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 196 */ -437,437,437,437,437,437,437,437,437,437,438,438,438,438,438,438, 438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,114,114,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437, 8,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438, 8,438,438,438,438, -438,438,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437, 8,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,439,439, +439,439,439,439,439,115,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, /* block 197 */ +438,438,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,438,115,438,438, 
+115,115,438,115,115,438,438,115,115,438,438,438,438,115,438,438, +438,438,438,438,438,438,439,439,439,439,115,439,115,439,439,439, +439,439,439,439,115,439,439,439,439,439,439,439,439,439,439,439, 438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438, 8,438,438,438,438,438,438,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437, 8,438,438,438,438,438,438,438,438,438,438, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, 8, -438,438,438,438,438,438,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, 8, -438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, /* block 198 */ -438,438,438,438,438,438,438,438,438, 8,438,438,438,438,438,438, -437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, -437,437,437,437,437,437,437,437,437, 8,438,438,438,438,438,438, +439,439,439,439,438,438,115,438,438,438,438,115,115,438,438,438, +438,438,438,438,438,115,438,438,438,438,438,438,438,115,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,438,438,115,438,438,438,438,115, +438,438,438,438,438,115,438,115,115,115,438,438,438,438,438,438, +438,115,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,438,438,438,438, 438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, -438,438,438, 8,438,438,438,438,438,438,437,438,114,114, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, /* block 199 */ -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, 
-715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, +438,438,438,438,438,438,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, /* block 200 */ -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,715,715,715,715,715,715,715,715,715,715,715, -715,715,715,715,715,114,114,716,716,716,716,716,716,716,716,716, -717,717,717,717,717,717,717,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +439,439,439,439,439,439,439,439,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, 
+438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, /* block 201 */ -199,199,199,199,114,199,199,199,199,199,199,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,199,199,199,199, -114,199,199,114,199,114,114,199,114,199,199,199,199,199,199,199, -199,199,199,114,199,199,199,199,114,199,114,199,114,114,114,114, -114,114,199,114,114,114,114,199,114,199,114,199,114,199,199,199, -114,199,199,114,199,114,114,199,114,199,114,199,114,199,114,199, -114,199,199,114,199,114,114,199,199,199,199,114,199,199,199,199, -199,199,199,114,199,199,199,199,114,199,199,199,199,114,199,114, +438,438,438,438,438,438,438,438,438,438,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,115,115,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438, 8,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439, 8,439,439,439,439, +439,439,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438, 8,439,439,439,439, /* block 202 */ -199,199,199,199,199,199,199,199,199,199,114,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,114,114,114,114, -114,199,199,199,114,199,199,199,199,199,114,199,199,199,199,199, -199,199,199,199,199,199,199,199,199,199,199,199,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -194,194,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439,439,439, 8,439,439,439,439,439,439,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438, 8,439,439,439,439,439,439,439,439,439,439, 
+439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, 8, +439,439,439,439,439,439,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, 8, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, /* block 203 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, +439,439,439,439,439,439,439,439,439, 8,439,439,439,439,439,439, +438,438,438,438,438,438,438,438,438,438,438,438,438,438,438,438, +438,438,438,438,438,438,438,438,438, 8,439,439,439,439,439,439, +439,439,439,439,439,439,439,439,439,439,439,439,439,439,439,439, +439,439,439, 8,439,439,439,439,439,439,438,439,115,115, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, /* block 204 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114, -114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, -114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, -114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, 
+736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, +736,736,736,736,736,736,736,736,736,736,736,736,736,736,736,736, /* block 205 */ - 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, +737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +737,737,737,737,737,737,737,736,736,736,736,737,737,737,737,737, +737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +737,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +737,737,737,737,737,737,737,737,737,737,737,737,737,736,736,736, +736,736,736,736,736,737,736,736,736,736,736,736,736,736,736,736, /* block 206 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, 
-114,114,114,114,114,114,718,718,718,718,718,718,718,718,718,718, -718,718,718,718,718,718,718,718,718,718,718,718,718,718,718,718, +736,736,736,736,737,736,736,738,738,738,738,738,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,737,737,737,737,737, +115,737,737,737,737,737,737,737,737,737,737,737,737,737,737,737, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 207 */ -719, 19, 19,114,114,114,114,114,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114, - 19, 19,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, /* block 208 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,739,739,739,739,739,739,739,739,739,739,739, +739,739,739,739,739,115,115,740,740,740,740,740,740,740,740,740, +741,741,741,741,741,741,741,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 209 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114, -114,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114, +200,200,200,200,115,200,200,200,200,200,200,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,200,200,200,200, +115,200,200,115,200,115,115,200,115,200,200,200,200,200,200,200, +200,200,200,115,200,200,200,200,115,200,115,200,115,115,115,115, +115,115,200,115,115,115,115,200,115,200,115,200,115,200,200,200, +115,200,200,115,200,115,115,200,115,200,115,200,115,200,115,200, +115,200,200,115,200,115,115,200,200,200,200,115,200,200,200,200, +200,200,200,115,200,200,200,200,115,200,200,200,200,115,200,115, /* block 210 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114, +200,200,200,200,200,200,200,200,200,200,115,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,115,115,115,115, +115,200,200,200,115,200,200,200,200,200,115,200,200,200,200,200, +200,200,200,200,200,200,200,200,200,200,200,200,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +195,195,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 211 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114, 19, 19, 19, 19, 19, /* block 212 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115, +115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, +115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, +115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115, /* block 213 */ + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* block 214 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114, - 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,742,742,742,742,742,742,742,742,742,742, +742,742,742,742,742,742,742,742,742,742,742,742,742,742,742,742, /* block 215 */ +743, 19, 19,115,115,115,115,115,115,115,115,115,115,115,115,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115, + 19, 19,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 216 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, @@ -3519,109 +3543,199 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 14, 14, 14, 14, 14, /* block 217 */ - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115, 19, 19, 19, 19, 19, /* block 218 */ - 19, 19, 19, 19, 19, 19, 19, 19,114,114,114,114,114,114,114,114, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, - 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19,115, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* block 219 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, 
-484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115, + 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115,115, /* block 220 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,114,114,114,114,114,114,114,114,114,114,114, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115,115, /* 
block 221 */ -484,484,484,484,484,484,484,484,484,484,484,484,484,484,484,484, -484,484,484,484,484,484,484,484,484,484,484,484,484,484,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, -114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,114, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 222 */ -436, 22,436,436,436,436,436,436,436,436,436,436,436,436,436,436, -436,436,436,436,436,436,436,436,436,436,436,436,436,436,436,436, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, /* blockblock 224 */ -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + 19, 19, 19, 19, 19, 19, 19, 19, 19,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 225 */ -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -109,109,109,109,109,109,109,109,109,109,109,109,109,109,109,109, -436,436,436,436,436,436,436,436,436,436,436,436,436,436,436,436, + 19, 19, 19, 19, 19,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, 
+115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + 19,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, /* block 226 */ -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,552,552, -552,552,552,552,552,552,552,552,552,552,552,552,552,552,114,114, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + +/* block 227 */ +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,115,115,115,115,115,115,115,115,115,115,115, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, + +/* block 228 */ 
+485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,115,115, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, + +/* block 229 */ +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + +/* block 230 */ +485,485,485,485,485,485,485,485,485,485,485,485,485,485,485,485, +485,485,485,485,485,485,485,485,485,485,485,485,485,485,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, +115,115,115,115,115,115,115,115,115,115,115,115,115,115,115,115, + +/* block 231 */ +437, 22,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + +/* block 232 */ +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, + +/* block 233 */ +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, + +/* block 234 */ +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110, +437,437,437,437,437,437,437,437,437,437,437,437,437,437,437,437, + +/* block 235 */ +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, 
+557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,557,557, +557,557,557,557,557,557,557,557,557,557,557,557,557,557,115,115, }; diff --git a/pcre2/src/pcre2_ucp.h b/pcre2/src/pcre2_ucp.h index e7db0c015..02e5012c2 100644 --- a/pcre2/src/pcre2_ucp.h +++ b/pcre2/src/pcre2_ucp.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -39,8 +39,8 @@ POSSIBILITY OF SUCH DAMAGE. */ -#ifndef _PCRE2_UCP_H -#define _PCRE2_UCP_H +#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD +#define PCRE2_UCP_H_IDEMPOTENT_GUARD /* This file contains definitions of the property values that are returned by the UCD access macros. New values that are added for new releases of Unicode @@ -253,9 +253,16 @@ enum { ucp_Pau_Cin_Hau, ucp_Siddham, ucp_Tirhuta, - ucp_Warang_Citi + ucp_Warang_Citi, + /* New for Unicode 8.0.0: */ + ucp_Ahom, + ucp_Anatolian_Hieroglyphs, + ucp_Hatran, + ucp_Multani, + ucp_Old_Hungarian, + ucp_SignWriting }; -#endif +#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */ /* End of pcre2_ucp.h */ diff --git a/pcre2/src/pcre2_valid_utf.c b/pcre2/src/pcre2_valid_utf.c index a97847ab9..3e18f1200 100644 --- a/pcre2/src/pcre2_valid_utf.c +++ b/pcre2/src/pcre2_valid_utf.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -93,8 +93,8 @@ Returns: == 0 if the string is a valid UTF string int PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) { -register PCRE2_SPTR p; -register uint32_t c; +PCRE2_SPTR p; +uint32_t c; /* ----------------- Check a UTF-8 string ----------------- */ @@ -131,11 +131,13 @@ PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; p++) { - register uint32_t ab, d; + uint32_t ab, d; c = *p; + length--; + if (c < 128) continue; /* ASCII character */ if (c < 0xc0) /* Isolated 10xx xxxx byte */ @@ -324,9 +326,10 @@ PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; p++) { c = *p; + length--; if ((c & 0xf800) != 0xd800) { @@ -368,7 +371,7 @@ PCRE2_ERROR_UTF32_ERR1 Surrogate character PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; length--, p++) { c = *p; if ((c & 0xfffff800u) != 0xd800u) diff --git a/pcre2/src/pcre2_xclass.c b/pcre2/src/pcre2_xclass.c index 2ea89c4b8..407d3f5b8 100644 --- a/pcre2/src/pcre2_xclass.c +++ b/pcre2/src/pcre2_xclass.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -247,7 +247,7 @@ while ((t = *data++) != XCL_END) case PT_PXPUNCT: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P || - (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) + (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop) return !negated; break; diff --git a/pcre2/src/pcre2demo.c b/pcre2/src/pcre2demo.c index ec51cf11c..8ae49f100 100644 --- a/pcre2/src/pcre2demo.c +++ b/pcre2/src/pcre2demo.c @@ -3,28 +3,31 @@ *************************************************/ /* This is a demonstration program to illustrate a straightforward way of -calling the PCRE2 regular expression library from a C program. See the +using the PCRE2 regular expression library from a C program. See the pcre2sample documentation for a short discussion ("man pcre2sample" if you have the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is incompatible with the original PCRE API. There are actually three libraries, each supporting a different code unit -width. This demonstration program uses the 8-bit library. +width. This demonstration program uses the 8-bit library. The default is to +process each code unit as a separate character, but if the pattern begins with +"(*UTF)", both it and the subject are treated as UTF-8 strings, where +characters may occupy multiple code units. 
In Unix-like environments, if PCRE2 is installed in your standard system libraries, you should be able to compile this program using this command: -gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo +cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo If PCRE2 is not installed in a standard place, it is likely to be installed with support for the pkg-config mechanism. If you have pkg-config, you can compile this program using this command: -gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo +cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo -If you do not have pkg-config, you may have to use this: +If you do not have pkg-config, you may have to use something like this: -gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ +cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ -R/usr/local/lib -lpcre2-8 -o pcre2demo Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and @@ -39,9 +42,14 @@ the following line. */ /* #define PCRE2_STATIC */ -/* This macro must be defined before including pcre2.h. For a program that uses -only one code unit width, it makes it possible to use generic function names -such as pcre2_compile(). */ +/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. +For a program that uses only one code unit width, setting it to 8, 16, or 32 +makes it possible to use generic function names such as pcre2_compile(). Note +that just changing 8 to 16 (for example) is not sufficient to convert this +program to process 16-bit characters. Even in a fully 16-bit environment, where +string-handling functions such as strcmp() and printf() work with 16-bit +characters, the code for handling the table of named substrings will still need +to be modified. 
*/ #define PCRE2_CODE_UNIT_WIDTH 8 @@ -62,19 +70,19 @@ int main(int argc, char **argv) { pcre2_code *re; PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ -PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ +PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ PCRE2_SPTR name_table; int crlf_is_newline; int errornumber; int find_all; int i; -int namecount; -int name_entry_size; int rc; int utf8; uint32_t option_bits; +uint32_t namecount; +uint32_t name_entry_size; uint32_t newline; PCRE2_SIZE erroroffset; @@ -89,15 +97,19 @@ pcre2_match_data *match_data; * First, sort out the command line. There is only one possible option at * * the moment, "-g" to request repeated matching to find all occurrences, * * like Perl's /g option. We set the variable find_all to a non-zero value * -* if the -g option is present. Apart from that, there must be exactly two * -* arguments. * +* if the -g option is present. * **************************************************************************/ find_all = 0; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-g") == 0) find_all = 1; - else break; + else if (argv[i][0] == '-') + { + printf("Unrecognised option %s\n", argv[i]); + return 1; + } + else break; } /* After the options, we require exactly two arguments, which are the pattern, @@ -105,7 +117,7 @@ and the subject string. */ if (argc - i != 2) { - printf("Two arguments required: a regex and a subject string\n"); + printf("Exactly two arguments required: a regex and a subject string\n"); return 1; } @@ -184,7 +196,7 @@ if (rc < 0) stored. */ ovector = pcre2_get_ovector_pointer(match_data); -printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); +printf("Match succeeded at offset %d\n", (int)ovector[0]); /************************************************************************* @@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. 
*/ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ &namecount); /* where to put the answer */ -if (namecount <= 0) printf("No named substrings\n"); else +if (namecount == 0) printf("No named substrings\n"); else { PCRE2_SPTR tabptr; printf("Named substrings\n"); @@ -313,8 +325,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY || for (;;) { - uint32_t options = 0; /* Normally no options */ - PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ + uint32_t options = 0; /* Normally no options */ + PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ /* If the previous match was for an empty string, we are finished if we are at the end of the subject. Otherwise, arrange to run another match at the @@ -354,7 +366,7 @@ for (;;) { if (options == 0) break; /* All matches found */ ovector[1] = start_offset + 1; /* Advance one code unit */ - if (crlf_is_newline && /* If CRLF is newline & */ + if (crlf_is_newline && /* If CRLF is a newline & */ start_offset < subject_length - 1 && /* we are at CRLF, */ subject[start_offset] == '\r' && subject[start_offset + 1] == '\n') @@ -400,7 +412,7 @@ for (;;) printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); } - if (namecount <= 0) printf("No named substrings\n"); else + if (namecount == 0) printf("No named substrings\n"); else { PCRE2_SPTR tabptr = name_table; printf("Named substrings\n"); diff --git a/pcre2/src/pcre2grep.c b/pcre2/src/pcre2grep.c index d5a5d6db9..e98d743de 100644 --- a/pcre2/src/pcre2grep.c +++ b/pcre2/src/pcre2grep.c @@ -13,7 +13,7 @@ distribution because other apparatus is needed to compile pcre2grep for z/OS. The header can be found in the special z/OS distribution, which is available from www.zaconsultants.net or from www.cbttape.org. 
- Copyright (c) 1997-2014 University of Cambridge + Copyright (c) 1997-2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -58,6 +58,23 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#if (defined _WIN32 || (defined HAVE_WINDOWS_H && HAVE_WINDOWS_H)) && !defined WIN32 +#define WIN32 +#endif + +#ifdef WIN32 +#include /* For _setmode() */ +#include /* For _O_BINARY */ +#endif + +#ifdef SUPPORT_PCRE2GREP_CALLOUT +#ifdef WIN32 +#include +#else +#include +#endif +#endif + #ifdef HAVE_UNISTD_H #include #endif @@ -121,6 +138,20 @@ apply to fprintf(). */ #define FWRITE(a,b,c,d) if (fwrite(a,b,c,d)) {} +/* Under Windows, we have to set stdout to be binary, so that it does not +convert \r\n at the ends of output lines to \r\r\n. However, that means that +any messages written to stdout must have \r\n as their line terminator. This is +handled by using STDOUT_NL as the newline string. We also use a normal double +quote for the example, as single quotes aren't usually available. 
*/ + +#ifdef WIN32 +#define STDOUT_NL "\r\n" +#define QUOT "\"" +#else +#define STDOUT_NL "\n" +#define QUOT "'" +#endif + /************************************************* @@ -138,25 +169,29 @@ static const char *jfriedl_prefix = ""; static const char *jfriedl_postfix = ""; #endif -static char *colour_string = (char *)"1;31"; -static char *colour_option = NULL; -static char *dee_option = NULL; -static char *DEE_option = NULL; -static char *locale = NULL; +static const char *colour_string = "1;31"; +static const char *colour_option = NULL; +static const char *dee_option = NULL; +static const char *DEE_option = NULL; +static const char *locale = NULL; +static const char *newline_arg = NULL; +static const char *om_separator = ""; +static const char *stdin_name = "(standard input)"; + static char *main_buffer = NULL; -static char *newline_arg = NULL; -static char *om_separator = (char *)""; -static char *stdin_name = (char *)"(standard input)"; static int after_context = 0; static int before_context = 0; static int binary_files = BIN_BINARY; static int both_context = 0; static int bufthird = PCRE2GREP_BUFSIZE; +static int max_bufthird = PCRE2GREP_MAX_BUFSIZE; static int bufsize = 3*PCRE2GREP_BUFSIZE; static int endlinetype; +static int total_count = 0; +static int counts_printed = 0; -#if defined HAVE_WINDOWS_H && HAVE_WINDOWS_H +#ifdef WIN32 static int dee_action = dee_SKIP; #else static int dee_action = dee_READ; @@ -185,6 +220,9 @@ static PCRE2_SIZE *offsets; static BOOL count_only = FALSE; static BOOL do_colour = FALSE; +#ifdef WIN32 +static BOOL do_ansi = FALSE; +#endif static BOOL file_offsets = FALSE; static BOOL hyphenpending = FALSE; static BOOL invert = FALSE; @@ -196,6 +234,7 @@ static BOOL omit_zero_count = FALSE; static BOOL resource_error = FALSE; static BOOL quiet = FALSE; static BOOL show_only_matching = FALSE; +static BOOL show_total_count = FALSE; static BOOL silent = FALSE; static BOOL utf = FALSE; @@ -324,6 +363,7 @@ used to identify them. 
*/ #define N_EXCLUDE_FROM (-19) #define N_INCLUDE_FROM (-20) #define N_OM_SEPARATOR (-21) +#define N_MAX_BUFSIZE (-22) static option_item optionlist[] = { { OP_NODATA, N_NULL, NULL, "", "terminate options" }, @@ -332,7 +372,8 @@ static option_item optionlist[] = { { OP_NODATA, 'a', NULL, "text", "treat binary files as text" }, { OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" }, { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" }, - { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" }, + { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" }, + { OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" }, { OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" }, { OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" }, { OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" }, @@ -348,11 +389,6 @@ static option_item optionlist[] = { { OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" }, { OP_NODATA, 'I', NULL, "", "treat binary files as not matching (ignore)" }, { OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" }, -#ifdef SUPPORT_PCRE2GREP_JIT - { OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" }, -#else - { OP_NODATA, N_NOJIT, NULL, "no-jit", "ignored: this pcre2grep does not support JIT" }, -#endif { OP_NODATA, 'l', NULL, "files-with-matches", "print only FILE names containing matches" }, { OP_NODATA, 'L', NULL, "files-without-match","print only FILE names not containing matches" }, { OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" }, @@ -364,6 +400,11 @@ static option_item optionlist[] = { { OP_NODATA, 'M', NULL, "multiline", "run in 
multiline mode" }, { OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF or ANY)" }, { OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" }, +#ifdef SUPPORT_PCRE2GREP_JIT + { OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" }, +#else + { OP_NODATA, N_NOJIT, NULL, "no-jit", "ignored: this pcre2grep does not support JIT" }, +#endif { OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" }, { OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" }, { OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" }, @@ -378,6 +419,7 @@ static option_item optionlist[] = { { OP_OP_NUMBER, 'S', &S_arg, "jeffS", "replace matched (sub)string with X" }, #endif { OP_NODATA, 's', NULL, "no-messages", "suppress error messages" }, + { OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" }, { OP_NODATA, 'u', NULL, "utf", "use UTF mode" }, { OP_NODATA, 'V', NULL, "version", "print version information and exit" }, { OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" }, @@ -435,6 +477,34 @@ return 0; } +/************************************************* +* Parse GREP_COLORS * +*************************************************/ + +/* Extract ms or mt from GREP_COLORS. 
+ +Argument: the string, possibly NULL +Returns: the value of ms or mt, or NULL if neither present +*/ + +static char * +parse_grep_colors(const char *gc) +{ +static char seq[16]; +char *col; +uint32_t len; +if (gc == NULL) return NULL; +col = strstr(gc, "ms="); +if (col == NULL) col = strstr(gc, "mt="); +if (col == NULL) return NULL; +len = 0; +col += 3; +while (*col != ':' && *col != 0 && len < sizeof(seq)-1) + seq[len++] = *col++; +seq[len] = 0; +return seq; +} + /************************************************* * Exit from the program * @@ -657,6 +727,18 @@ return isatty(fileno(f)); } #endif + +/************* Print optionally coloured match Unix-style and z/OS **********/ + +static void +print_match(const char* buf, int length) +{ +if (length == 0) return; +if (do_colour) fprintf(stdout, "%c[%sm", 0x1b, colour_string); +FWRITE(buf, 1, length, stdout); +if (do_colour) fprintf(stdout, "%c[0m", 0x1b); +} + /* End of Unix-style or native z/OS environment functions. */ @@ -665,11 +747,9 @@ return isatty(fileno(f)); /* I (Philip Hazel) have no means of testing this code. It was contributed by Lionel Fourquaux. David Burgess added a patch to define INVALID_FILE_ATTRIBUTES when it did not exist. David Byron added a patch that moved the #include of - to before the INVALID_FILE_ATTRIBUTES definition rather than after. -The double test below stops gcc 4.4.4 grumbling that HAVE_WINDOWS_H is -undefined when it is indeed undefined. */ + to before the INVALID_FILE_ATTRIBUTES definition rather than after. */ -#elif defined HAVE_WINDOWS_H && HAVE_WINDOWS_H +#elif defined WIN32 #ifndef STRICT # define STRICT @@ -684,6 +764,11 @@ undefined when it is indeed undefined. */ #define INVALID_FILE_ATTRIBUTES 0xFFFFFFFF #endif +/* Allow opendirectory to provide globbing, since Microsoft started doing it +wrong (expanding quoted arguments). 
*/ + +#define iswild(name) (strpbrk(name, "*?") != NULL) + typedef struct directory_type { HANDLE handle; @@ -718,7 +803,10 @@ if ((pattern == NULL) || (dir == NULL)) pcre2grep_exit(2); } memcpy(pattern, filename, len); -memcpy(&(pattern[len]), "\\*", 3); +if (iswild(filename)) + pattern[len] = 0; +else + memcpy(&(pattern[len]), "\\*", 3); dir->handle = FindFirstFile(pattern, &(dir->data)); if (dir->handle != INVALID_HANDLE_VALUE) { @@ -776,18 +864,92 @@ return !isdirectory(filename); /************* Test for a terminal in Windows **********/ -/* I don't know how to do this; assume never */ - static BOOL is_stdout_tty(void) { -return FALSE; +return _isatty(_fileno(stdout)); } static BOOL is_file_tty(FILE *f) { -return FALSE; +return _isatty(_fileno(f)); +} + + +/************* Print optionally coloured match in Windows **********/ + +static HANDLE hstdout; +static CONSOLE_SCREEN_BUFFER_INFO csbi; +static WORD match_colour; + +static void +print_match(const char* buf, int length) +{ +if (length == 0) return; +if (do_colour) + { + if (do_ansi) fprintf(stdout, "%c[%sm", 0x1b, colour_string); + else SetConsoleTextAttribute(hstdout, match_colour); + } +FWRITE(buf, 1, length, stdout); +if (do_colour) + { + if (do_ansi) fprintf(stdout, "%c[0m", 0x1b); + else SetConsoleTextAttribute(hstdout, csbi.wAttributes); + } +} + +/* Convert ANSI BGR format to RGB used by Windows */ +#define BGR_RGB(x) ((x & 1 ? 4 : 0) | (x & 2) | (x & 4 ? 
1 : 0)) + +static WORD +decode_ANSI_colour(const char *cs) +{ +WORD result = csbi.wAttributes; +while (*cs) + { + if (isdigit(*cs)) + { + int code = atoi(cs); + if (code == 1) result |= 0x08; + else if (code == 4) result |= 0x8000; + else if (code == 5) result |= 0x80; + else if (code >= 30 && code <= 37) result = (result & 0xF8) | BGR_RGB(code - 30); + else if (code == 39) result = (result & 0xF0) | (csbi.wAttributes & 0x0F); + else if (code >= 40 && code <= 47) result = (result & 0x8F) | (BGR_RGB(code - 40) << 4); + else if (code == 49) result = (result & 0x0F) | (csbi.wAttributes & 0xF0); + /* aixterm high intensity colour codes */ + else if (code >= 90 && code <= 97) result = (result & 0xF0) | BGR_RGB(code - 90) | 0x08; + else if (code >= 100 && code <= 107) result = (result & 0x0F) | (BGR_RGB(code - 100) << 4) | 0x80; + + while (isdigit(*cs)) cs++; + } + + if (*cs) cs++; + } + +return result; +} + +static void +init_colour_output() +{ +if (do_colour) + { + hstdout = GetStdHandle(STD_OUTPUT_HANDLE); + /* This fails when redirected to con; try again if so. 
*/ + if (!GetConsoleScreenBufferInfo(hstdout, &csbi) && !do_ansi) + { + HANDLE hcon = CreateFile("CONOUT$", GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL); + GetConsoleScreenBufferInfo(hcon, &csbi); + CloseHandle(hcon); + } + match_colour = decode_ANSI_colour(colour_string); + /* No valid colour found - turn off colouring */ + if (!match_colour) do_colour = FALSE; + } } /* End of Windows functions */ @@ -829,6 +991,16 @@ is_file_tty(FILE *f) return FALSE; } + +/************* Print optionally coloured match when we can't do it **********/ + +static void +print_match(const char* buf, int length) +{ +if (length == 0) return; +FWRITE(buf, 1, length, stdout); +} + #endif /* End of system-specific functions */ @@ -869,7 +1041,7 @@ for (op = optionlist; op->one_char != 0; op++) if (op->one_char > 0) fprintf(stderr, "%c", op->one_char); } fprintf(stderr, "] [long options] [pattern] [files]\n"); -fprintf(stderr, "Type `pcre2grep --help' for more information and the long " +fprintf(stderr, "Type \"pcre2grep --help\" for more information and the long " "options.\n"); return rc; } @@ -885,27 +1057,34 @@ help(void) { option_item *op; -printf("Usage: pcre2grep [OPTION]... [PATTERN] [FILE1 FILE2 ...]\n"); -printf("Search for PATTERN in each FILE or standard input.\n"); -printf("PATTERN must be present if neither -e nor -f is used.\n"); -printf("\"-\" can be used as a file name to mean STDIN.\n"); +printf("Usage: pcre2grep [OPTION]... [PATTERN] [FILE1 FILE2 ...]" STDOUT_NL); +printf("Search for PATTERN in each FILE or standard input." STDOUT_NL); +printf("PATTERN must be present if neither -e nor -f is used." STDOUT_NL); + +#ifdef SUPPORT_PCRE2GREP_CALLOUT +printf("Callout scripts in patterns are supported." STDOUT_NL); +#else +printf("Callout scripts are not supported in this pcre2grep." STDOUT_NL); +#endif + +printf("\"-\" can be used as a file name to mean STDIN." 
STDOUT_NL); #ifdef SUPPORT_LIBZ -printf("Files whose names end in .gz are read using zlib.\n"); +printf("Files whose names end in .gz are read using zlib." STDOUT_NL); #endif #ifdef SUPPORT_LIBBZ2 -printf("Files whose names end in .bz2 are read using bzlib2.\n"); +printf("Files whose names end in .bz2 are read using bzlib2." STDOUT_NL); #endif #if defined SUPPORT_LIBZ || defined SUPPORT_LIBBZ2 -printf("Other files and the standard input are read as plain files.\n\n"); +printf("Other files and the standard input are read as plain files." STDOUT_NL STDOUT_NL); #else -printf("All files are read as plain files, without any interpretation.\n\n"); +printf("All files are read as plain files, without any interpretation." STDOUT_NL STDOUT_NL); #endif -printf("Example: pcre2grep -i 'hello.*world' menu.h main.c\n\n"); -printf("Options:\n"); +printf("Example: pcre2grep -i " QUOT "hello.*world" QUOT " menu.h main.c" STDOUT_NL STDOUT_NL); +printf("Options:" STDOUT_NL); for (op = optionlist; op->one_char != 0; op++) { @@ -922,17 +1101,18 @@ for (op = optionlist; op->one_char != 0; op++) } if (n < 1) n = 1; - printf("%.*s%s\n", n, " ", op->help_text); + printf("%.*s%s" STDOUT_NL, n, " ", op->help_text); } -printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n"); -printf("The default value for --buffer-size is %d.\n", PCRE2GREP_BUFSIZE); -printf("When reading patterns or file names from a file, trailing white\n"); -printf("space is removed and blank lines are ignored.\n"); -printf("The maximum size of any pattern is %d bytes.\n", MAXPATLEN); +printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL); +printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE); +printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE); +printf("When reading patterns or file names from a file, trailing white" STDOUT_NL); +printf("space is removed and blank lines are ignored." 
STDOUT_NL); +printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN); -printf("\nWith no FILEs, read standard input. If fewer than two FILEs given, assume -h.\n"); -printf("Exit status is 0 if any matches, 1 if no matches, and 2 if trouble.\n"); +printf(STDOUT_NL "With no FILEs, read standard input. If fewer than two FILEs given, assume -h." STDOUT_NL); +printf("Exit status is 0 if any matches, 1 if no matches, and 2 if trouble." STDOUT_NL); } @@ -1073,12 +1253,12 @@ return om; * Read one line of input * *************************************************/ -/* Normally, input is read using fread() into a large buffer, so many lines may -be read at once. However, doing this for tty input means that no output appears -until a lot of input has been typed. Instead, tty input is handled line by -line. We cannot use fgets() for this, because it does not stop at a binary -zero, and therefore there is no way of telling how many characters it has read, -because there may be binary zeros embedded in the data. +/* Normally, input is read using fread() (or gzread, or BZ2_read) into a large +buffer, so many lines may be read at once. However, doing this for tty input +means that no output appears until a lot of input has been typed. Instead, tty +input is handled line by line. We cannot use fgets() for this, because it does +not stop at a binary zero, and therefore there is no way of telling how many +characters it has read, because there may be binary zeros embedded in the data. 
Arguments: buffer the buffer to read into @@ -1166,7 +1346,7 @@ switch(endlinetype) while (p < endptr) { int extra = 0; - register int c = *((unsigned char *)p); + int c = *((unsigned char *)p); if (utf && c >= 0xc0) { @@ -1210,7 +1390,7 @@ switch(endlinetype) while (p < endptr) { int extra = 0; - register int c = *((unsigned char *)p); + int c = *((unsigned char *)p); if (utf && c >= 0xc0) { @@ -1312,7 +1492,7 @@ switch(endlinetype) while (p > startptr) { - register unsigned int c; + unsigned int c; char *pp = p - 1; if (utf) @@ -1392,22 +1572,23 @@ Returns: nothing static void do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr, - char *printname) + const char *printname) { if (after_context > 0 && lastmatchnumber > 0) { int count = 0; - while (lastmatchrestart < endptr && count++ < after_context) + while (lastmatchrestart < endptr && count < after_context) { int ellength; - char *pp = lastmatchrestart; + char *pp = end_of_line(lastmatchrestart, endptr, &ellength); + if (ellength == 0 && pp == main_buffer + bufsize) break; if (printname != NULL) fprintf(stdout, "%s-", printname); if (number) fprintf(stdout, "%d-", lastmatchnumber++); - pp = end_of_line(pp, endptr, &ellength); FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); lastmatchrestart = pp; + count++; } - hyphenpending = TRUE; + if (count > 0) hyphenpending = TRUE; } } @@ -1473,6 +1654,309 @@ return FALSE; /* No match, no errors */ } +#ifdef SUPPORT_PCRE2GREP_CALLOUT + +/************************************************* +* Parse and execute callout scripts * +*************************************************/ + +/* This function parses a callout string block and executes the +program specified by the string. The string is a list of substrings +separated by pipe characters. The first substring represents the +executable name, and the following substrings specify the arguments: + + program_name|param1|param2|... 
+ +Any substring (including the program name) can contain escape sequences +started by the dollar character. The escape sequences are substituted as +follows: + + $<digits> or ${<digits>} is replaced by the captured substring of the given + decimal number, which must be greater than zero. If the number is greater + than the number of capturing substrings, or if the capture is unset, the + replacement is empty. + + Any other character is substituted by itself. E.g: $$ is replaced by a single + dollar or $| replaced by a pipe character. + +Example: + + echo -e "abcde\n12345" | pcre2grep \ + '(.)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' - + + Output: + + Arg1: [a] [bcd] [d] Arg2: |a| () + abcde + Arg1: [1] [234] [4] Arg2: |1| () + 12345 + +Arguments: + calloutptr the callout block + +Returns: 0 if the callout is ignored or the program's status is zero, else 1 +*/ + +static int +pcre2grep_callout(pcre2_callout_block *calloutptr, void *unused) +{ +PCRE2_SIZE length = calloutptr->callout_string_length; +PCRE2_SPTR string = calloutptr->callout_string; +PCRE2_SPTR subject = calloutptr->subject; +PCRE2_SIZE *ovector = calloutptr->offset_vector; +PCRE2_SIZE capture_top = calloutptr->capture_top; +PCRE2_SIZE argsvectorlen = 2; +PCRE2_SIZE argslen = 1; +char *args; +char *argsptr; +char **argsvector; +char **argsvectorptr; +#ifndef WIN32 +pid_t pid; +#endif +int result = 0; + +(void)unused; /* Avoid compiler warning */ + +/* Only callout with strings are supported. */ +if (string == NULL || length == 0) return 0; + +/* Checking syntax and compute the number of string fragments. Callout strings +are ignored in case of a syntax error. */ + +while (length > 0) + { + if (*string == '|') + { + argsvectorlen++; + + /* Maximum 10000 arguments allowed. */ + if (argsvectorlen > 10000) return 0; + } + else if (*string == '$') + { + PCRE2_SIZE capture_id = 0; + + string++; + length--; + + /* Syntax error: a character must be present after $. 
*/ + if (length == 0) return 0; + + if (*string >= '1' && *string <= '9') + { + do + { + /* Maximum capture id is 65535. */ + if (capture_id <= 65535) + capture_id = capture_id * 10 + (*string - '0'); + + string++; + length--; + } + while (length > 0 && *string >= '0' && *string <= '9'); + + /* To negate the effect of string++ below. */ + string--; + length++; + } + else if (*string == '{') + { + /* Must be a decimal number in braces, e.g: {5} or {38} */ + string++; + length--; + + /* Syntax error: a decimal number required. */ + if (length == 0) return 0; + if (*string < '1' || *string > '9') return 0; + + do + { + /* Maximum capture id is 65535. */ + if (capture_id <= 65535) + capture_id = capture_id * 10 + (*string - '0'); + + string++; + length--; + + /* Syntax error: no more characters */ + if (length == 0) return 0; + } + while (*string >= '0' && *string <= '9'); + + /* Syntax error: closing brace is missing. */ + if (*string != '}') return 0; + } + + if (capture_id > 0) + { + if (capture_id < capture_top) + { + capture_id *= 2; + argslen += ovector[capture_id + 1] - ovector[capture_id]; + } + + /* To negate the effect of argslen++ below. */ + argslen--; + } + } + + string++; + length--; + argslen++; + } + +args = (char*)malloc(argslen); +if (args == NULL) return 0; + +argsvector = (char**)malloc(argsvectorlen * sizeof(char*)); +if (argsvector == NULL) + { + free(args); + return 0; + } + +argsptr = args; +argsvectorptr = argsvector; + +*argsvectorptr++ = argsptr; + +length = calloutptr->callout_string_length; +string = calloutptr->callout_string; + +while (length > 0) + { + if (*string == '|') + { + *argsptr++ = '\0'; + *argsvectorptr++ = argsptr; + } + else if (*string == '$') + { + string++; + length--; + + if ((*string >= '1' && *string <= '9') || *string == '{') + { + PCRE2_SIZE capture_id = 0; + + if (*string != '{') + { + do + { + /* Maximum capture id is 65535. 
*/ + if (capture_id <= 65535) + capture_id = capture_id * 10 + (*string - '0'); + + string++; + length--; + } + while (length > 0 && *string >= '0' && *string <= '9'); + + /* To negate the effect of string++ below. */ + string--; + length++; + } + else + { + string++; + length--; + + do + { + /* Maximum capture id is 65535. */ + if (capture_id <= 65535) + capture_id = capture_id * 10 + (*string - '0'); + + string++; + length--; + } + while (*string != '}'); + } + + if (capture_id < capture_top) + { + PCRE2_SIZE capturesize; + capture_id *= 2; + + capturesize = ovector[capture_id + 1] - ovector[capture_id]; + memcpy(argsptr, subject + ovector[capture_id], capturesize); + argsptr += capturesize; + } + } + else + { + *argsptr++ = *string; + } + } + else + { + *argsptr++ = *string; + } + + string++; + length--; + } + +*argsptr++ = '\0'; +*argsvectorptr = NULL; + +#ifdef WIN32 +result = _spawnvp(_P_WAIT, argsvector[0], (const char * const *)argsvector); +#else +pid = fork(); + +if (pid == 0) + { + (void)execv(argsvector[0], argsvector); + /* Control gets here if there is an error, e.g. a non-existent program */ + exit(1); + } +else if (pid > 0) + (void)waitpid(pid, &result, 0); +#endif + +free(args); +free(argsvector); + +/* Currently negative return values are not supported, only zero (match +continues) or non-zero (match fails). */ + +return result != 0; +} + +#endif + + + +/************************************************* +* Read a portion of the file into buffer * +*************************************************/ + +static int +fill_buffer(void *handle, int frtype, char *buffer, int length, + BOOL input_line_buffered) +{ +(void)frtype; /* Avoid warning when not used */ + +#ifdef SUPPORT_LIBZ +if (frtype == FR_LIBZ) + return gzread((gzFile)handle, buffer, length); +else +#endif + +#ifdef SUPPORT_LIBBZ2 +if (frtype == FR_LIBBZ2) + return BZ2_bzread((BZFILE *)handle, buffer, length); +else +#endif + +return (input_line_buffered ? 
+ read_one_line(buffer, length, (FILE *)handle) : + fread(buffer, 1, length, (FILE *)handle)); +} + + /************************************************* * Grep an individual file * @@ -1502,7 +1986,7 @@ Returns: 0 if there was at least one match */ static int -pcre2grep(void *handle, int frtype, char *filename, char *printname) +pcre2grep(void *handle, int frtype, const char *filename, const char *printname) { int rc = 1; int linenumber = 1; @@ -1518,49 +2002,24 @@ BOOL endhyphenpending = FALSE; BOOL input_line_buffered = line_buffered; FILE *in = NULL; /* Ensure initialized */ -#ifdef SUPPORT_LIBZ -gzFile ingz = NULL; -#endif - -#ifdef SUPPORT_LIBBZ2 -BZFILE *inbz2 = NULL; -#endif - - /* Do the first read into the start of the buffer and set up the pointer to end of what we have. In the case of libz, a non-zipped .gz file will be read as a plain file. However, if a .bz2 file isn't actually bzipped, the first read will fail. */ -(void)frtype; - -#ifdef SUPPORT_LIBZ -if (frtype == FR_LIBZ) - { - ingz = (gzFile)handle; - bufflength = gzread (ingz, main_buffer, bufsize); - } -else -#endif - -#ifdef SUPPORT_LIBBZ2 -if (frtype == FR_LIBBZ2) - { - inbz2 = (BZFILE *)handle; - bufflength = BZ2_bzread(inbz2, main_buffer, bufsize); - if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */ - } /* without the cast it is unsigned. */ -else -#endif - +if (frtype != FR_LIBZ && frtype != FR_LIBBZ2) { in = (FILE *)handle; if (is_file_tty(in)) input_line_buffered = TRUE; - bufflength = input_line_buffered? - read_one_line(main_buffer, bufsize, in) : - fread(main_buffer, 1, bufsize, in); } +bufflength = fill_buffer(handle, frtype, main_buffer, bufsize, + input_line_buffered); + +#ifdef SUPPORT_LIBBZ2 +if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */ +#endif + endptr = main_buffer + bufflength; /* Unless binary-files=text, see if we have a binary file. 
This uses the same @@ -1591,7 +2050,7 @@ while (ptr < endptr) size_t startoffset = 0; /* At this point, ptr is at the start of a line. We need to find the length - of the subject string to pass to pcre_exec(). In multiline mode, it is the + of the subject string to pass to pcre2_match(). In multiline mode, it is the length remainder of the data in the buffer. Otherwise, it is the length of the next line, excluding the terminating newline. After matching, we always advance by the length of the next line. In multiline mode the PCRE2_FIRSTLINE @@ -1604,16 +2063,61 @@ while (ptr < endptr) /* Check to see if the line we are looking at extends right to the very end of the buffer without a line terminator. This means the line is too long to - handle. */ + handle at the current buffer size. Until the buffer reaches its maximum size, + try doubling it and reading more data. */ if (endlinelength == 0 && t == main_buffer + bufsize) { - fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n" - "pcre2grep: check the --buffer-size option\n", - linenumber, - (filename == NULL)? "" : " of file ", - (filename == NULL)? "" : filename); - return 2; + if (bufthird < max_bufthird) + { + char *new_buffer; + int new_bufthird = 2*bufthird; + + if (new_bufthird > max_bufthird) new_bufthird = max_bufthird; + new_buffer = (char *)malloc(3*new_bufthird); + + if (new_buffer == NULL) + { + fprintf(stderr, + "pcre2grep: line %d%s%s is too long for the internal buffer\n" + "pcre2grep: not enough memory to increase the buffer size to %d\n", + linenumber, + (filename == NULL)? "" : " of file ", + (filename == NULL)? "" : filename, + new_bufthird); + return 2; + } + + /* Copy the data and adjust pointers to the new buffer location. 
*/ + + memcpy(new_buffer, main_buffer, bufsize); + bufthird = new_bufthird; + bufsize = 3*bufthird; + ptr = new_buffer + (ptr - main_buffer); + lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer); + free(main_buffer); + main_buffer = new_buffer; + + /* Read more data into the buffer and then try to find the line ending + again. */ + + bufflength += fill_buffer(handle, frtype, main_buffer + bufflength, + bufsize - bufflength, input_line_buffered); + endptr = main_buffer + bufflength; + continue; + } + else + { + fprintf(stderr, + "pcre2grep: line %d%s%s is too long for the internal buffer\n" + "pcre2grep: the maximum buffer size is %d\n" + "pcre2grep: use the --max-buffer-size option to change it\n", + linenumber, + (filename == NULL)? "" : " of file ", + (filename == NULL)? "" : filename, + bufthird); + return 2; + } } /* Extra processing for Jeffrey Friedl's debugging. */ @@ -1691,9 +2195,13 @@ while (ptr < endptr) if (filenames == FN_NOMATCH_ONLY) return 1; + /* If all we want is a yes/no answer, we can return immediately. */ + + if (quiet) return 0; + /* Just count if just counting is wanted. */ - if (count_only) count++; + else if (count_only || show_total_count) count++; /* When handling a binary file and binary-files==binary, the "binary" variable will be set true (it's false in all other cases). In this @@ -1701,23 +2209,19 @@ while (ptr < endptr) else if (binary) { - fprintf(stdout, "Binary file %s matches\n", filename); + fprintf(stdout, "Binary file %s matches" STDOUT_NL, filename); return 0; } - /* If all we want is a file name, there is no need to scan any more lines - in the file. */ + /* Likewise, if all we want is a file name, there is no need to scan any + more lines in the file. */ else if (filenames == FN_MATCH_ONLY) { - fprintf(stdout, "%s\n", printname); + fprintf(stdout, "%s" STDOUT_NL, printname); return 0; } - /* Likewise, if all we want is a yes/no answer. 
*/ - - else if (quiet) return 0; - /* The --only-matching option prints just the substring that matched, and/or one or more captured portions of it, as long as these strings are not empty. The --file-offsets and --line-offsets options output offsets for @@ -1739,13 +2243,13 @@ while (ptr < endptr) /* Handle --line-offsets */ if (line_offsets) - fprintf(stdout, "%d,%d\n", (int)(matchptr + offsets[0] - ptr), + fprintf(stdout, "%d,%d" STDOUT_NL, (int)(matchptr + offsets[0] - ptr), (int)(offsets[1] - offsets[0])); /* Handle --file-offsets */ else if (file_offsets) - fprintf(stdout, "%d,%d\n", + fprintf(stdout, "%d,%d" STDOUT_NL, (int)(filepos + matchptr + offsets[0] - ptr), (int)(offsets[1] - offsets[0])); @@ -1765,34 +2269,51 @@ while (ptr < endptr) if (plen > 0) { if (printed) fprintf(stdout, "%s", om_separator); - if (do_colour) fprintf(stdout, "%c[%sm", 0x1b, colour_string); - FWRITE(matchptr + offsets[n*2], 1, plen, stdout); - if (do_colour) fprintf(stdout, "%c[00m", 0x1b); + print_match(matchptr + offsets[n*2], plen); printed = TRUE; } } } - if (printed || printname != NULL || number) fprintf(stdout, "\n"); + if (printed || printname != NULL || number) + fprintf(stdout, STDOUT_NL); } - /* Prepare to repeat to find the next match. If the pattern contained a - lookbehind that included \K, it is possible that the end of the match - might be at or before the actual starting offset we have just used. In - this case, start one character further on. */ + /* Prepare to repeat to find the next match in the line. */ match = FALSE; if (line_buffered) fflush(stdout); rc = 0; /* Had some success */ + + /* If the pattern contained a lookbehind that included \K, it is + possible that the end of the match might be at or before the actual + starting offset we have just used. In this case, start one character + further on. 
*/ + startoffset = offsets[1]; /* Restart after the match */ oldstartoffset = pcre2_get_startchar(match_data); if (startoffset <= oldstartoffset) { if (startoffset >= length) goto END_ONE_MATCH; /* Were at end */ startoffset = oldstartoffset + 1; - if (utf) - while ((matchptr[startoffset] & 0xc0) == 0x80) startoffset++; + if (utf) while ((matchptr[startoffset] & 0xc0) == 0x80) startoffset++; } + + /* If the current match ended past the end of the line (only possible + in multiline mode), we must move on to the line in which it did end + before searching for more matches. */ + + while (startoffset > linelength) + { + matchptr = ptr += linelength + endlinelength; + filepos += (int)(linelength + endlinelength); + linenumber++; + startoffset -= (int)(linelength + endlinelength); + t = end_of_line(ptr, endptr, &endlinelength); + linelength = t - ptr - endlinelength; + length = (size_t)(endptr - ptr); + } + goto ONLY_MATCHING_RESTART; } } @@ -1838,7 +2359,7 @@ while (ptr < endptr) if (hyphenpending) { - fprintf(stdout, "--\n"); + fprintf(stdout, "--" STDOUT_NL); hyphenpending = FALSE; hyphenprinted = TRUE; } @@ -1859,7 +2380,7 @@ while (ptr < endptr) } if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted) - fprintf(stdout, "--\n"); + fprintf(stdout, "--" STDOUT_NL); while (p < ptr) { @@ -1926,9 +2447,7 @@ while (ptr < endptr) { int plength; FWRITE(ptr, 1, offsets[0], stdout); - fprintf(stdout, "%c[%sm", 0x1b, colour_string); - FWRITE(ptr + offsets[0], 1, offsets[1] - offsets[0], stdout); - fprintf(stdout, "%c[00m", 0x1b); + print_match(ptr + offsets[0], offsets[1] - offsets[0]); for (;;) { startoffset = offsets[1]; @@ -1936,9 +2455,7 @@ while (ptr < endptr) !match_patterns(matchptr, length, options, startoffset, &mrc)) break; FWRITE(matchptr + startoffset, 1, offsets[0] - startoffset, stdout); - fprintf(stdout, "%c[%sm", 0x1b, colour_string); - FWRITE(matchptr + offsets[0], 1, offsets[1] - offsets[0], stdout); - fprintf(stdout, "%c[00m", 0x1b); + 
print_match(matchptr + offsets[0], offsets[1] - offsets[0]); } /* In multiline mode, we may have already printed the complete line @@ -2015,7 +2532,7 @@ while (ptr < endptr) lastmatchrestart < main_buffer + bufthird) { do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname); - lastmatchnumber = 0; + lastmatchnumber = 0; /* Indicates no after lines pending */ } /* Now do the shuffle */ @@ -2023,24 +2540,8 @@ while (ptr < endptr) memmove(main_buffer, main_buffer + bufthird, 2*bufthird); ptr -= bufthird; -#ifdef SUPPORT_LIBZ - if (frtype == FR_LIBZ) - bufflength = 2*bufthird + - gzread (ingz, main_buffer + 2*bufthird, bufthird); - else -#endif - -#ifdef SUPPORT_LIBBZ2 - if (frtype == FR_LIBBZ2) - bufflength = 2*bufthird + - BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird); - else -#endif - - bufflength = 2*bufthird + - (input_line_buffered? - read_one_line(main_buffer + 2*bufthird, bufthird, in) : - fread(main_buffer + 2*bufthird, 1, bufthird, in)); + bufflength = 2*bufthird + fill_buffer(handle, frtype, + main_buffer + 2*bufthird, bufthird, input_line_buffered); endptr = main_buffer + bufflength; /* Adjust any last match point */ @@ -2052,7 +2553,7 @@ while (ptr < endptr) /* End of file; print final "after" lines if wanted; do_after_lines sets hyphenpending if it prints something. */ -if (!show_only_matching && !count_only) +if (!show_only_matching && !(count_only|show_total_count)) { do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname); hyphenpending |= endhyphenpending; @@ -2063,22 +2564,24 @@ were none. If we found a match, we won't have got this far. 
*/ if (filenames == FN_NOMATCH_ONLY) { - fprintf(stdout, "%s\n", printname); + fprintf(stdout, "%s" STDOUT_NL, printname); return 0; } /* Print the match count if wanted */ -if (count_only) +if (count_only && !quiet) { if (count > 0 || !omit_zero_count) { if (printname != NULL && filenames != FN_NONE) fprintf(stdout, "%s:", printname); - fprintf(stdout, "%d\n", count); + fprintf(stdout, "%d" STDOUT_NL, count); + counts_printed++; } } +total_count += count; /* Can be set without count_only */ return rc; } @@ -2223,6 +2726,36 @@ if (isdirectory(pathname)) } } +#ifdef WIN32 +if (iswild(pathname)) + { + char buffer[1024]; + char *nextfile; + char *name; + directory_type *dir = opendirectory(pathname); + + if (dir == NULL) + return 0; + + for (nextfile = name = pathname; *nextfile != 0; nextfile++) + if (*nextfile == '/' || *nextfile == '\\') + name = nextfile + 1; + *name = 0; + + while ((nextfile = readdirectory(dir)) != NULL) + { + int frc; + sprintf(buffer, "%.512s%.128s", pathname, nextfile); + frc = grep_or_recurse(buffer, dir_recurse, FALSE); + if (frc > 1) rc = frc; + else if (frc == 0 && rc == 1) rc = 0; + } + + closedirectory(dir); + return rc; + } +#endif + #if defined NATIVE_ZOS } #endif @@ -2387,6 +2920,7 @@ switch(letter) case 'q': quiet = TRUE; break; case 'r': dee_action = dee_RECURSE; break; case 's': silent = TRUE; break; + case 't': show_total_count = TRUE; break; case 'u': options |= PCRE2_UTF; utf = TRUE; break; case 'v': invert = TRUE; break; case 'w': process_options |= PO_WORD_MATCH; break; @@ -2396,7 +2930,7 @@ switch(letter) { unsigned char buffer[128]; (void)pcre2_config(PCRE2_CONFIG_VERSION, buffer); - fprintf(stdout, "pcre2grep version %s\n", buffer); + fprintf(stdout, "pcre2grep version %s" STDOUT_NL, buffer); } pcre2grep_exit(0); break; @@ -2421,10 +2955,12 @@ return options; static char * ordin(int n) { -static char buffer[8]; +static char buffer[14]; char *p = buffer; sprintf(p, "%d", n); while (*p != 0) p++; +n %= 100; +if (n >= 11 && n 
<= 13) n = 0; switch (n%10) { case 1: strcpy(p, "st"); break; @@ -2488,9 +3024,20 @@ if ((popts & PO_FIXED_STRINGS) != 0) } sprintf((char *)buffer, "%s%.*s%s", prefix[popts], patlen, ps, suffix[popts]); -p->compiled = pcre2_compile(buffer, -1, options, &errcode, &erroffset, - compile_context); -if (p->compiled != NULL) return TRUE; +p->compiled = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, options, &errcode, + &erroffset, compile_context); + +/* Handle successful compile. Try JIT-compiling if supported and enabled. We +ignore any JIT compiler errors, relying falling back to interpreting if +anything goes wrong with JIT. */ + +if (p->compiled != NULL) + { +#ifdef SUPPORT_PCRE2GREP_JIT + if (use_jit) (void)pcre2_jit_compile(p->compiled, PCRE2_JIT_COMPLETE); +#endif + return TRUE; + } /* Handle compile errors */ @@ -2538,7 +3085,7 @@ read_pattern_file(char *name, patstr **patptr, patstr **patlastptr, int popts) { int linenumber = 0; FILE *f; -char *filename; +const char *filename; char buffer[PATBUFSIZE]; if (strcmp(name, "-") == 0) @@ -2623,6 +3170,16 @@ const char *locale_from = "--locale"; pcre2_jit_stack *jit_stack = NULL; #endif +/* In Windows, stdout is set up as a text stream, which means that \n is +converted to \r\n. This causes output lines that are copied from the input to +change from ....\r\n to ....\r\r\n, which is not right. We therefore ensure +that stdout is a binary stream. Note that this means all other output to stdout +must use STDOUT_NL to terminate lines. */ + +#ifdef WIN32 +_setmode(_fileno(stdout), _O_BINARY); +#endif + /* Set up a default compile and match contexts and a match data block. */ compile_context = pcre2_compile_context_create(NULL); @@ -2630,6 +3187,13 @@ match_context = pcre2_match_context_create(NULL); match_data = pcre2_match_data_create(OFFSET_SIZE, NULL); offsets = pcre2_get_ovector_pointer(match_data); +/* If string (script) callouts are supported, set up the callout processing +function. 
*/ + +#ifdef SUPPORT_PCRE2GREP_CALLOUT +pcre2_set_callout(match_context, pcre2grep_callout, NULL); +#endif + /* Process the options */ for (i = 1; i < argc; i++) @@ -2836,7 +3400,7 @@ for (i = 1; i < argc; i++) switch (op->one_char) { case N_COLOUR: - colour_option = (char *)"auto"; + colour_option = "auto"; break; case 'o': @@ -2977,7 +3541,7 @@ LC_ALL environment variable is set, and if so, use it. */ if (locale == NULL) { locale = getenv("LC_ALL"); - locale_from = "LCC_ALL"; + locale_from = "LC_ALL"; } if (locale == NULL) @@ -3005,7 +3569,11 @@ if (locale != NULL) if (colour_option != NULL && strcmp(colour_option, "never") != 0) { - if (strcmp(colour_option, "always") == 0) do_colour = TRUE; + if (strcmp(colour_option, "always") == 0) +#ifdef WIN32 + do_ansi = !is_stdout_tty(), +#endif + do_colour = TRUE; else if (strcmp(colour_option, "auto") == 0) do_colour = is_stdout_tty(); else { @@ -3017,7 +3585,17 @@ if (colour_option != NULL && strcmp(colour_option, "never") != 0) { char *cs = getenv("PCRE2GREP_COLOUR"); if (cs == NULL) cs = getenv("PCRE2GREP_COLOR"); - if (cs != NULL) colour_string = cs; + if (cs == NULL) cs = getenv("PCREGREP_COLOUR"); + if (cs == NULL) cs = getenv("PCREGREP_COLOR"); + if (cs == NULL) cs = parse_grep_colors(getenv("GREP_COLORS")); + if (cs == NULL) cs = getenv("GREP_COLOR"); + if (cs != NULL) + { + if (strspn(cs, ";0123456789") == strlen(cs)) colour_string = cs; + } +#ifdef WIN32 + init_colour_output(); +#endif } } @@ -3087,8 +3665,24 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0) } #endif +/* If use_jit is set, check whether JIT is available. If not, do not try +to use JIT. */ + +if (use_jit) + { + uint32_t answer; + (void)pcre2_config(PCRE2_CONFIG_JIT, &answer); + if (!answer) use_jit = FALSE; + } + /* Get memory for the main buffer. 
*/ +if (bufthird <= 0) + { + fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n"); + goto EXIT2; + } + bufsize = 3*bufthird; main_buffer = (char *)malloc(bufsize); @@ -3230,6 +3824,16 @@ for (; i < argc; i++) else if (frc == 0 && rc == 1) rc = 0; } +/* Show the total number of matches if requested, but not if only one file's +count was printed. */ + +if (show_total_count && counts_printed != 1 && filenames != FN_NOMATCH_ONLY) + { + if (counts_printed != 0 && filenames >= FN_DEFAULT) + fprintf(stdout, "TOTAL:"); + fprintf(stdout, "%d" STDOUT_NL, total_count); + } + EXIT: #ifdef SUPPORT_PCRE2GREP_JIT if (jit_stack != NULL) pcre2_jit_stack_free(jit_stack); diff --git a/pcre2/src/pcre2posix.c b/pcre2/src/pcre2posix.c index da212fc4d..4ecc701c2 100644 --- a/pcre2/src/pcre2posix.c +++ b/pcre2/src/pcre2posix.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -58,16 +58,49 @@ previously been set. */ # define PCRE2POSIX_EXP_DEFN __declspec(dllexport) #endif -/* We include pcre2.h before pcre2_internal.h so that the PCRE2 library -functions are declared as "import" for Windows by defining PCRE2_EXP_DECL as -"import". This is needed even though pcre2_internal.h itself includes pcre2.h, -because it does so after it has set PCRE2_EXP_DECL to "export" if it is not -already set. */ +/* Older versions of MSVC lack snprintf(). This define allows for +warning/error-free compilation and testing with MSVC compilers back to at least +MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). 
*/ + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + + +/* Compile-time error numbers start at this value. It should probably never be +changed. This #define is a copy of the one in pcre2_internal.h. */ + +#define COMPILE_ERROR_BASE 100 + + +/* Standard C headers */ + +#include +#include +#include +#include +#include +#include + +/* PCRE2 headers */ #include "pcre2.h" -#include "pcre2_internal.h" #include "pcre2posix.h" +/* When compiling with the MSVC compiler, it is sometimes necessary to include +a "calling convention" before exported function names. (This is secondhand +information; I know nothing about MSVC myself). For example, something like + + void __cdecl function(....) + +might be needed. In order to make this easy, all the exported functions have +PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not +set, we ensure here that it has no effect. */ + +#ifndef PCRE2_CALL_CONVENTION +#define PCRE2_CALL_CONVENTION +#endif + /* Table to translate PCRE2 compile time error codes into POSIX error codes. Only a few PCRE2 errors with a value greater than 23 turn into special POSIX codes: most go to REG_BADPAT. The second table lists, in pairs, those that @@ -106,7 +139,7 @@ static const int eint1[] = { static const int eint2[] = { 30, REG_ECTYPE, /* unknown POSIX class name */ - 32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */ + 32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */ 37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */ 56, REG_INVARG, /* internal error: unknown newline setting */ }; @@ -144,29 +177,23 @@ static const char *const pstring[] = { PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) { -const char *message, *addmessage; -size_t length, addlength; +int used; +const char *message; -message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))? 
+message = (errcode <= 0 || errcode >= (int)(sizeof(pstring)/sizeof(char *)))? "unknown error code" : pstring[errcode]; -length = strlen(message) + 1; -addmessage = " at offset "; -addlength = (preg != NULL && (int)preg->re_erroffset != -1)? - strlen(addmessage) + 6 : 0; - -if (errbuf_size > 0) +if (preg != NULL && (int)preg->re_erroffset != -1) { - if (addlength > 0 && errbuf_size >= length + addlength) - sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); - else - { - strncpy(errbuf, message, errbuf_size - 1); - errbuf[errbuf_size-1] = 0; - } + used = snprintf(errbuf, errbuf_size, "%s at offset %-6d", message, + (int)preg->re_erroffset); + } +else + { + used = snprintf(errbuf, errbuf_size, "%s", message); } -return length + addlength; +return used + 1; } @@ -211,11 +238,11 @@ int re_nsub = 0; if ((cflags & REG_ICASE) != 0) options |= PCRE2_CASELESS; if ((cflags & REG_NEWLINE) != 0) options |= PCRE2_MULTILINE; if ((cflags & REG_DOTALL) != 0) options |= PCRE2_DOTALL; -if ((cflags & REG_NOSUB) != 0) options |= PCRE2_NO_AUTO_CAPTURE; if ((cflags & REG_UTF) != 0) options |= PCRE2_UTF; if ((cflags & REG_UCP) != 0) options |= PCRE2_UCP; if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY; +preg->re_cflags = cflags; preg->re_pcre2_code = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, options, &errorcode, &erroffset, NULL); preg->re_erroffset = erroffset; @@ -223,8 +250,13 @@ preg->re_erroffset = erroffset; if (preg->re_pcre2_code == NULL) { unsigned int i; - if (errorcode < 0) return REG_BADPAT; /* UTF error */ + + /* A negative value is a UTF error; otherwise all error codes are greater + than COMPILE_ERROR_BASE, but check, just in case. 
*/ + + if (errorcode < COMPILE_ERROR_BASE) return REG_BADPAT; errorcode -= COMPILE_ERROR_BASE; + if (errorcode < (int)(sizeof(eint1)/sizeof(const int))) return eint1[errorcode]; for (i = 0; i < sizeof(eint2)/(2*sizeof(const int)); i += 2) @@ -235,8 +267,14 @@ if (preg->re_pcre2_code == NULL) (void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code, PCRE2_INFO_CAPTURECOUNT, &re_nsub); preg->re_nsub = (size_t)re_nsub; -if ((options & PCRE2_NO_AUTO_CAPTURE) != 0) re_nsub = -1; preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL); + +if (preg->re_match_data == NULL) + { + pcre2_code_free(preg->re_pcre2_code); + return REG_ESPACE; + } + return 0; } @@ -248,8 +286,7 @@ return 0; /* A suitable match_data block, large enough to hold all possible captures, was obtained when the pattern was compiled, to save having to allocate and free it -for each match. If REG_NOSUB was specified at compile time, the -PCRE_NO_AUTO_CAPTURE flag will be set. When this is the case, the nmatch and +for each match. If REG_NOSUB was specified at compile time, the nmatch and pmatch arguments are ignored, and the only result is yes/no/error. */ PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION @@ -266,11 +303,11 @@ if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE2_NOTEMPTY; ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ -/* When no string data is being returned, or no vector has been passed in which -to put it, ensure that nmatch is zero. */ +/* When REG_NOSUB was specified, or if no vector has been passed in which to +put captured strings, ensure that nmatch is zero. This will stop any attempt to +write to pmatch. */ -if ((((pcre2_real_code *)(preg->re_pcre2_code))->compile_options & - PCRE2_NO_AUTO_CAPTURE) != 0 || pmatch == NULL) nmatch = 0; +if ((preg->re_cflags & REG_NOSUB) != 0 || pmatch == NULL) nmatch = 0; /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. 
The man page from OS X says "REG_STARTEND affects only the location of the @@ -279,6 +316,7 @@ start location rather than being passed as a PCRE2 "starting offset". */ if ((eflags & REG_STARTEND) != 0) { + if (pmatch == NULL) return REG_INVARG; so = pmatch[0].rm_so; eo = pmatch[0].rm_eo; } @@ -296,11 +334,12 @@ rc = pcre2_match((const pcre2_code *)preg->re_pcre2_code, if (rc >= 0) { size_t i; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md); if ((size_t)rc > nmatch) rc = (int)nmatch; for (i = 0; i < (size_t)rc; i++) { - pmatch[i].rm_so = md->ovector[i*2]; - pmatch[i].rm_eo = md->ovector[i*2+1]; + pmatch[i].rm_so = ovector[i*2]; + pmatch[i].rm_eo = ovector[i*2+1]; } for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; return 0; diff --git a/pcre2/src/pcre2posix.h b/pcre2/src/pcre2posix.h index 6f19b51b2..6505976aa 100644 --- a/pcre2/src/pcre2posix.h +++ b/pcre2/src/pcre2posix.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2014 University of Cambridge + New API code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -56,7 +56,7 @@ extern "C" { #define REG_NOTBOL 0x0004 /* Maps to PCRE2_NOTBOL */ #define REG_NOTEOL 0x0008 /* Maps to PCRE2_NOTEOL */ #define REG_DOTALL 0x0010 /* NOT defined by POSIX; maps to PCRE2_DOTALL */ -#define REG_NOSUB 0x0020 /* Maps to PCRE2_NO_AUTO_CAPTURE */ +#define REG_NOSUB 0x0020 /* Do not report what was matched */ #define REG_UTF 0x0040 /* NOT defined by POSIX; maps to PCRE2_UTF */ #define REG_STARTEND 0x0080 /* BSD feature: pass subject string by so,eo */ #define REG_NOTEMPTY 0x0100 /* NOT defined by POSIX; maps to PCRE2_NOTEMPTY */ @@ -98,6 +98,7 @@ typedef struct { void *re_match_data; size_t re_nsub; size_t re_erroffset; 
+ int re_cflags; } regex_t; /* The structure in which a captured offset is returned. */ diff --git a/pcre2/src/pcre2test.c b/pcre2/src/pcre2test.c index 34cc3a5ed..241c22c46 100644 --- a/pcre2/src/pcre2test.c +++ b/pcre2/src/pcre2test.c @@ -11,7 +11,7 @@ hacked-up (non-) design had also run out of steam. Written by Philip Hazel Original code Copyright (c) 1997-2012 University of Cambridge - Rewritten code Copyright (c) 2015 University of Cambridge + Rewritten code Copyright (c) 2016 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -66,6 +66,14 @@ it references only the enabled library functions. */ #include #include +#if defined NATIVE_ZOS +#include "pcrzoscs.h" +/* That header is not included in the main PCRE2 distribution because other +apparatus is needed to compile pcre2test for z/OS. The header can be found in +the special z/OS distribution, which is available from www.zaconsultants.net or +from www.cbttape.org. */ +#endif + #ifdef HAVE_UNISTD_H #include #endif @@ -150,6 +158,13 @@ patterns. */ void vms_setsymbol( char *, char *, int ); #endif +/* VC doesn't support "%td". */ +#ifdef _MSC_VER +#define PTR_SPEC "%lu" +#else +#define PTR_SPEC "%td" +#endif + /* ------------------End of system-specific definitions -------------------- */ /* Glueing macros that are used in several places below. 
*/ @@ -167,20 +182,20 @@ void vms_setsymbol( char *, char *, int ); #endif #endif -#define CFAIL_UNSET UINT32_MAX /* Unset value for cfail fields */ +#define CFORE_UNSET UINT32_MAX /* Unset value for cfail/cerror fields */ #define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */ #define DEFAULT_OVECCOUNT 15 /* Default ovector count */ #define JUNK_OFFSET 0xdeadbeef /* For initializing ovector */ #define LOCALESIZE 32 /* Size of locale name */ #define LOOPREPEAT 500000 /* Default loop count for timing */ #define PATSTACKSIZE 20 /* Pattern stack for save/restore testing */ -#define REPLACE_MODSIZE 96 /* Field for reading 8-bit replacement */ +#define REPLACE_MODSIZE 100 /* Field for reading 8-bit replacement */ #define VERSION_SIZE 64 /* Size of buffer for the version strings */ /* Make sure the buffer into which replacement strings are copied is big enough to hold them as 32-bit code units. */ -#define REPLACE_BUFFSIZE (4*REPLACE_MODSIZE) +#define REPLACE_BUFFSIZE 1024 /* This is a byte value */ /* Execution modes */ @@ -203,7 +218,7 @@ systems that differ in their output from isprint() even in the "C" locale. */ #define PRINTABLE(c) ((c) >= 32 && (c) < 127) #endif -#define PRINTOK(c) ((locale_tables != NULL)? isprint(c) : PRINTABLE(c)) +#define PRINTOK(c) ((use_tables != NULL && c < 256)? isprint(c) : PRINTABLE(c)) /* We have to include some of the library source files because we need to use some of the macros, internal structure definitions, and other internal @@ -231,6 +246,22 @@ of PRIV avoids name clashes. */ #include "pcre2_tables.c" #include "pcre2_ucd.c" +/* 32-bit integer values in the input are read by strtoul() or strtol(). The +check needed for overflow depends on whether long ints are in fact longer than +ints. They are defined not to be shorter. 
*/ + +#if ULONG_MAX > UINT32_MAX +#define U32OVERFLOW(x) (x > UINT32_MAX) +#else +#define U32OVERFLOW(x) (x == UINT32_MAX) +#endif + +#if LONG_MAX > INT32_MAX +#define S32OVERFLOW(x) (x > INT32_MAX || x < INT32_MIN) +#else +#define S32OVERFLOW(x) (x == INT32_MAX || x == INT32_MIN) +#endif + /* When PCRE2_CODE_UNIT_WIDTH is zero, pcre2_internal.h does not include pcre2_intmodedep.h, which is where mode-dependent macros and structures are defined. We can now include it for each supported code unit width. Because @@ -328,17 +359,19 @@ typedef struct cmdstruct { int value; } cmdstruct; -enum { CMD_FORBID_UTF, CMD_LOAD, CMD_PATTERN, CMD_PERLTEST, CMD_POP, CMD_SAVE, - CMD_SUBJECT, CMD_UNKNOWN }; +enum { CMD_FORBID_UTF, CMD_LOAD, CMD_NEWLINE_DEFAULT, CMD_PATTERN, + CMD_PERLTEST, CMD_POP, CMD_POPCOPY, CMD_SAVE, CMD_SUBJECT, CMD_UNKNOWN }; static cmdstruct cmdlist[] = { - { "forbid_utf", CMD_FORBID_UTF }, - { "load", CMD_LOAD }, - { "pattern", CMD_PATTERN }, - { "perltest", CMD_PERLTEST }, - { "pop", CMD_POP }, - { "save", CMD_SAVE }, - { "subject", CMD_SUBJECT }}; + { "forbid_utf", CMD_FORBID_UTF }, + { "load", CMD_LOAD }, + { "newline_default", CMD_NEWLINE_DEFAULT }, + { "pattern", CMD_PATTERN }, + { "perltest", CMD_PERLTEST }, + { "pop", CMD_POP }, + { "popcopy", CMD_POPCOPY }, + { "save", CMD_SAVE }, + { "subject", CMD_SUBJECT }}; #define cmdlistcount sizeof(cmdlist)/sizeof(cmdstruct) @@ -370,38 +403,56 @@ enum { MOD_CTC, /* Applies to a compile context */ MOD_NL, /* Is a newline value */ MOD_NN, /* Is a number or a name; more than one may occur */ MOD_OPT, /* Is an option bit */ + MOD_SIZ, /* Is a PCRE2_SIZE value */ MOD_STR }; /* Is a string */ /* Control bits. Some apply to compiling, some to matching, but some can be set -either on a pattern or a data line, so they must all be distinct. */ +either on a pattern or a data line, so they must all be distinct. There are now +so many of them that they are split into two fields. 
*/ -#define CTL_AFTERTEXT 0x00000001u -#define CTL_ALLAFTERTEXT 0x00000002u -#define CTL_ALLCAPTURES 0x00000004u -#define CTL_ALLUSEDTEXT 0x00000008u -#define CTL_ALTGLOBAL 0x00000010u -#define CTL_BINCODE 0x00000020u -#define CTL_CALLOUT_CAPTURE 0x00000040u -#define CTL_CALLOUT_INFO 0x00000080u -#define CTL_CALLOUT_NONE 0x00000100u -#define CTL_DFA 0x00000200u -#define CTL_FINDLIMITS 0x00000400u -#define CTL_FULLBINCODE 0x00000800u -#define CTL_GETALL 0x00001000u -#define CTL_GLOBAL 0x00002000u -#define CTL_HEXPAT 0x00004000u -#define CTL_INFO 0x00008000u -#define CTL_JITFAST 0x00010000u -#define CTL_JITVERIFY 0x00020000u -#define CTL_MARK 0x00040000u -#define CTL_MEMORY 0x00080000u -#define CTL_POSIX 0x00100000u -#define CTL_PUSH 0x00200000u -#define CTL_STARTCHAR 0x00400000u -#define CTL_ZERO_TERMINATE 0x00800000u +#define CTL_AFTERTEXT 0x00000001u +#define CTL_ALLAFTERTEXT 0x00000002u +#define CTL_ALLCAPTURES 0x00000004u +#define CTL_ALLUSEDTEXT 0x00000008u +#define CTL_ALTGLOBAL 0x00000010u +#define CTL_BINCODE 0x00000020u +#define CTL_CALLOUT_CAPTURE 0x00000040u +#define CTL_CALLOUT_INFO 0x00000080u +#define CTL_CALLOUT_NONE 0x00000100u +#define CTL_DFA 0x00000200u +#define CTL_EXPAND 0x00000400u +#define CTL_FINDLIMITS 0x00000800u +#define CTL_FULLBINCODE 0x00001000u +#define CTL_GETALL 0x00002000u +#define CTL_GLOBAL 0x00004000u +#define CTL_HEXPAT 0x00008000u /* Same word as USE_LENGTH */ +#define CTL_INFO 0x00010000u +#define CTL_JITFAST 0x00020000u +#define CTL_JITVERIFY 0x00040000u +#define CTL_MARK 0x00080000u +#define CTL_MEMORY 0x00100000u +#define CTL_NULLCONTEXT 0x00200000u +#define CTL_POSIX 0x00400000u +#define CTL_POSIX_NOSUB 0x00800000u +#define CTL_PUSH 0x01000000u /* These three must be */ +#define CTL_PUSHCOPY 0x02000000u /* all in the same */ +#define CTL_PUSHTABLESCOPY 0x04000000u /* word. 
*/ +#define CTL_STARTCHAR 0x08000000u +#define CTL_USE_LENGTH 0x10000000u /* Same word as HEXPAT */ +#define CTL_UTF8_INPUT 0x20000000u +#define CTL_ZERO_TERMINATE 0x40000000u -#define CTL_BSR_SET 0x80000000u /* This is informational */ -#define CTL_NL_SET 0x40000000u /* This is informational */ +/* Second control word */ + +#define CTL2_SUBSTITUTE_EXTENDED 0x00000001u +#define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000002u +#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u +#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u + +#define CTL_NL_SET 0x40000000u /* Informational */ +#define CTL_BSR_SET 0x80000000u /* Informational */ + +/* Combinations */ #define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */ #define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE|CTL_CALLOUT_INFO) @@ -418,7 +469,13 @@ data line. */ CTL_GLOBAL|\ CTL_MARK|\ CTL_MEMORY|\ - CTL_STARTCHAR) + CTL_STARTCHAR|\ + CTL_UTF8_INPUT) + +#define CTL2_ALLPD (CTL2_SUBSTITUTE_EXTENDED|\ + CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\ + CTL2_SUBSTITUTE_UNKNOWN_UNSET|\ + CTL2_SUBSTITUTE_UNSET_EMPTY) /* Structures for holding modifier information for patterns and subject strings (data). Fields containing modifiers that can be set either for a pattern or a @@ -428,10 +485,12 @@ same offset in the big table below works for both. */ typedef struct patctl { /* Structure for pattern modifiers. */ uint32_t options; /* Must be in same position as datctl */ uint32_t control; /* Must be in same position as datctl */ + uint32_t control2; /* Must be in same position as datctl */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ uint32_t jit; uint32_t stackguard_test; uint32_t tables_id; + uint32_t regerror_buffsize; uint8_t locale[LOCALESIZE]; } patctl; @@ -441,7 +500,9 @@ typedef struct patctl { /* Structure for pattern modifiers. */ typedef struct datctl { /* Structure for data line modifiers. 
*/ uint32_t options; /* Must be in same position as patctl */ uint32_t control; /* Must be in same position as patctl */ + uint32_t control2; /* Must be in same position as patctl */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ + uint32_t cerror[2]; uint32_t cfail[2]; int32_t callout_data; int32_t copy_numbers[MAXCPYGET]; @@ -481,82 +542,101 @@ typedef struct modstruct { } modstruct; static modstruct modlist[] = { - { "aftertext", MOD_PNDP, MOD_CTL, CTL_AFTERTEXT, PO(control) }, - { "allaftertext", MOD_PNDP, MOD_CTL, CTL_ALLAFTERTEXT, PO(control) }, - { "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) }, - { "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) }, - { "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) }, - { "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) }, - { "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) }, - { "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) }, - { "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) }, - { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, - { "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) }, - { "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) }, - { "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) }, - { "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) }, - { "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) }, - { "callout_info", MOD_PAT, MOD_CTL, CTL_CALLOUT_INFO, PO(control) }, - { "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) }, - { "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) }, - { "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) }, - { "debug", MOD_PAT, MOD_CTL, CTL_DEBUG, PO(control) }, - { "dfa", MOD_DAT, MOD_CTL, CTL_DFA, DO(control) }, - { "dfa_restart", MOD_DAT, MOD_OPT, PCRE2_DFA_RESTART, DO(options) }, - { "dfa_shortest", MOD_DAT, MOD_OPT, PCRE2_DFA_SHORTEST, DO(options) }, - { "dollar_endonly", MOD_PAT, 
MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) }, - { "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) }, - { "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) }, - { "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) }, - { "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) }, - { "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) }, - { "fullbincode", MOD_PAT, MOD_CTL, CTL_FULLBINCODE, PO(control) }, - { "get", MOD_DAT, MOD_NN, DO(get_numbers), DO(get_names) }, - { "getall", MOD_DAT, MOD_CTL, CTL_GETALL, DO(control) }, - { "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) }, - { "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) }, - { "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) }, - { "jit", MOD_PAT, MOD_IND, 7, PO(jit) }, - { "jitfast", MOD_PAT, MOD_CTL, CTL_JITFAST, PO(control) }, - { "jitstack", MOD_DAT, MOD_INT, 0, DO(jitstack) }, - { "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) }, - { "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) }, - { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, - { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, - { "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) }, - { "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) }, - { "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) }, - { "never_backslash_c", MOD_PAT, MOD_OPT, PCRE2_NEVER_BACKSLASH_C, PO(options) }, - { "never_ucp", MOD_PAT, MOD_OPT, PCRE2_NEVER_UCP, PO(options) }, - { "never_utf", MOD_PAT, MOD_OPT, PCRE2_NEVER_UTF, PO(options) }, - { "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) }, - { "no_auto_capture", MOD_PAT, MOD_OPT, PCRE2_NO_AUTO_CAPTURE, PO(options) }, - { "no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) }, - { "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) }, - { "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) }, - { "no_utf_check", MOD_PD, MOD_OPT, 
PCRE2_NO_UTF_CHECK, PD(options) }, - { "notbol", MOD_DAT, MOD_OPT, PCRE2_NOTBOL, DO(options) }, - { "notempty", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY, DO(options) }, - { "notempty_atstart", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY_ATSTART, DO(options) }, - { "noteol", MOD_DAT, MOD_OPT, PCRE2_NOTEOL, DO(options) }, - { "offset", MOD_DAT, MOD_INT, 0, DO(offset) }, - { "ovector", MOD_DAT, MOD_INT, 0, DO(oveccount) }, - { "parens_nest_limit", MOD_CTC, MOD_INT, 0, CO(parens_nest_limit) }, - { "partial_hard", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, - { "partial_soft", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, - { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, - { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, - { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, - { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, - { "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) }, - { "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) }, - { "stackguard", MOD_PAT, MOD_INT, 0, PO(stackguard_test) }, - { "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) }, - { "tables", MOD_PAT, MOD_INT, 0, PO(tables_id) }, - { "ucp", MOD_PATP, MOD_OPT, PCRE2_UCP, PO(options) }, - { "ungreedy", MOD_PAT, MOD_OPT, PCRE2_UNGREEDY, PO(options) }, - { "utf", MOD_PATP, MOD_OPT, PCRE2_UTF, PO(options) }, - { "zero_terminate", MOD_DAT, MOD_CTL, CTL_ZERO_TERMINATE, DO(control) } + { "aftertext", MOD_PNDP, MOD_CTL, CTL_AFTERTEXT, PO(control) }, + { "allaftertext", MOD_PNDP, MOD_CTL, CTL_ALLAFTERTEXT, PO(control) }, + { "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) }, + { "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) }, + { "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) }, + { "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) }, + { "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) }, + { "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) 
}, + { "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) }, + { "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) }, + { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, + { "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) }, + { "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) }, + { "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) }, + { "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) }, + { "callout_error", MOD_DAT, MOD_IN2, 0, DO(cerror) }, + { "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) }, + { "callout_info", MOD_PAT, MOD_CTL, CTL_CALLOUT_INFO, PO(control) }, + { "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) }, + { "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) }, + { "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) }, + { "debug", MOD_PAT, MOD_CTL, CTL_DEBUG, PO(control) }, + { "dfa", MOD_DAT, MOD_CTL, CTL_DFA, DO(control) }, + { "dfa_restart", MOD_DAT, MOD_OPT, PCRE2_DFA_RESTART, DO(options) }, + { "dfa_shortest", MOD_DAT, MOD_OPT, PCRE2_DFA_SHORTEST, DO(options) }, + { "dollar_endonly", MOD_PAT, MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) }, + { "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) }, + { "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) }, + { "expand", MOD_PAT, MOD_CTL, CTL_EXPAND, PO(control) }, + { "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) }, + { "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) }, + { "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) }, + { "fullbincode", MOD_PAT, MOD_CTL, CTL_FULLBINCODE, PO(control) }, + { "get", MOD_DAT, MOD_NN, DO(get_numbers), DO(get_names) }, + { "getall", MOD_DAT, MOD_CTL, CTL_GETALL, DO(control) }, + { "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) }, + { "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) }, + { "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) }, + { "jit", MOD_PAT, MOD_IND, 7, PO(jit) }, + { "jitfast", MOD_PAT, MOD_CTL, 
CTL_JITFAST, PO(control) }, + { "jitstack", MOD_DAT, MOD_INT, 0, DO(jitstack) }, + { "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) }, + { "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) }, + { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, + { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, + { "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) }, + { "max_pattern_length", MOD_CTC, MOD_SIZ, 0, CO(max_pattern_length) }, + { "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) }, + { "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) }, + { "never_backslash_c", MOD_PAT, MOD_OPT, PCRE2_NEVER_BACKSLASH_C, PO(options) }, + { "never_ucp", MOD_PAT, MOD_OPT, PCRE2_NEVER_UCP, PO(options) }, + { "never_utf", MOD_PAT, MOD_OPT, PCRE2_NEVER_UTF, PO(options) }, + { "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) }, + { "no_auto_capture", MOD_PAT, MOD_OPT, PCRE2_NO_AUTO_CAPTURE, PO(options) }, + { "no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) }, + { "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) }, + { "no_jit", MOD_DAT, MOD_OPT, PCRE2_NO_JIT, DO(options) }, + { "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) }, + { "no_utf_check", MOD_PD, MOD_OPT, PCRE2_NO_UTF_CHECK, PD(options) }, + { "notbol", MOD_DAT, MOD_OPT, PCRE2_NOTBOL, DO(options) }, + { "notempty", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY, DO(options) }, + { "notempty_atstart", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY_ATSTART, DO(options) }, + { "noteol", MOD_DAT, MOD_OPT, PCRE2_NOTEOL, DO(options) }, + { "null_context", MOD_PD, MOD_CTL, CTL_NULLCONTEXT, PO(control) }, + { "offset", MOD_DAT, MOD_INT, 0, DO(offset) }, + { "offset_limit", MOD_CTM, MOD_SIZ, 0, MO(offset_limit)}, + { "ovector", MOD_DAT, MOD_INT, 0, DO(oveccount) }, + { "parens_nest_limit", MOD_CTC, MOD_INT, 0, CO(parens_nest_limit) }, + { "partial_hard", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, + { 
"partial_soft", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, + { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, + { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, + { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, + { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, + { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, + { "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) }, + { "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) }, + { "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) }, + { "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) }, + { "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) }, + { "stackguard", MOD_PAT, MOD_INT, 0, PO(stackguard_test) }, + { "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) }, + { "startoffset", MOD_DAT, MOD_INT, 0, DO(offset) }, + { "substitute_extended", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_EXTENDED, PO(control2) }, + { "substitute_overflow_length", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_OVERFLOW_LENGTH, PO(control2) }, + { "substitute_unknown_unset", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNKNOWN_UNSET, PO(control2) }, + { "substitute_unset_empty", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNSET_EMPTY, PO(control2) }, + { "tables", MOD_PAT, MOD_INT, 0, PO(tables_id) }, + { "ucp", MOD_PATP, MOD_OPT, PCRE2_UCP, PO(options) }, + { "ungreedy", MOD_PAT, MOD_OPT, PCRE2_UNGREEDY, PO(options) }, + { "use_length", MOD_PAT, MOD_CTL, CTL_USE_LENGTH, PO(control) }, + { "use_offset_limit", MOD_PAT, MOD_OPT, PCRE2_USE_OFFSET_LIMIT, PO(options) }, + { "utf", MOD_PATP, MOD_OPT, PCRE2_UTF, PO(options) }, + { "utf8_input", MOD_PAT, MOD_CTL, CTL_UTF8_INPUT, PO(control) }, + { "zero_terminate", MOD_DAT, MOD_CTL, CTL_ZERO_TERMINATE, DO(control) } }; #define MODLISTCOUNT sizeof(modlist)/sizeof(modstruct) @@ -564,34 +644,57 @@ static modstruct modlist[] = { /* Controls and options that are supported for use with the POSIX interface. 
*/ #define POSIX_SUPPORTED_COMPILE_OPTIONS ( \ - PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_UCP|PCRE2_UTF|PCRE2_UNGREEDY) + PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_UCP|PCRE2_UTF| \ + PCRE2_UNGREEDY) #define POSIX_SUPPORTED_COMPILE_CONTROLS ( \ - CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_POSIX) + CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_EXPAND|CTL_POSIX|CTL_POSIX_NOSUB) + +#define POSIX_SUPPORTED_COMPILE_CONTROLS2 (0) #define POSIX_SUPPORTED_MATCH_OPTIONS ( \ PCRE2_NOTBOL|PCRE2_NOTEMPTY|PCRE2_NOTEOL) -#define POSIX_SUPPORTED_MATCH_CONTROLS (CTL_AFTERTEXT|CTL_ALLAFTERTEXT) - -/* Controls that are mutually exclusive. */ - -#define EXCLUSIVE_DAT_CONTROLS (CTL_ALLUSEDTEXT|CTL_STARTCHAR) +#define POSIX_SUPPORTED_MATCH_CONTROLS (CTL_AFTERTEXT|CTL_ALLAFTERTEXT) +#define POSIX_SUPPORTED_MATCH_CONTROLS2 (0) /* Control bits that are not ignored with 'push'. */ #define PUSH_SUPPORTED_COMPILE_CONTROLS ( \ CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \ - CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_BSR_SET|CTL_NL_SET) + CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY| \ + CTL_USE_LENGTH) + +#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (CTL_BSR_SET|CTL_NL_SET) /* Controls that apply only at compile time with 'push'. */ -#define PUSH_COMPILE_ONLY_CONTROLS CTL_JITVERIFY +#define PUSH_COMPILE_ONLY_CONTROLS CTL_JITVERIFY +#define PUSH_COMPILE_ONLY_CONTROLS2 (0) -/* Controls that are forbidden with #pop. */ +/* Controls that are forbidden with #pop or #popcopy. */ -#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_PUSH) +#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_POSIX_NOSUB|CTL_PUSH| \ + CTL_PUSHCOPY|CTL_PUSHTABLESCOPY|CTL_USE_LENGTH) + +/* Pattern controls that are mutually exclusive. At present these are all in +the first control word. Note that CTL_POSIX_NOSUB is always accompanied by +CTL_POSIX, so it doesn't need its own entries. 
*/ + +static uint32_t exclusive_pat_controls[] = { + CTL_POSIX | CTL_HEXPAT, + CTL_POSIX | CTL_PUSH, + CTL_POSIX | CTL_PUSHCOPY, + CTL_POSIX | CTL_PUSHTABLESCOPY, + CTL_POSIX | CTL_USE_LENGTH, + CTL_EXPAND | CTL_HEXPAT }; + +/* Data controls that are mutually exclusive. At present these are all in the +first control word. */ + +static uint32_t exclusive_dat_controls[] = { + CTL_ALLUSEDTEXT | CTL_STARTCHAR, + CTL_FINDLIMITS | CTL_NULLCONTEXT }; /* Table of single-character abbreviated modifiers. The index field is initialized to -1, but the first time the modifier is encountered, it is filled @@ -648,6 +751,12 @@ table itself easier to read. */ #define EBCDIC_NL 0 #endif +#ifdef NEVER_BACKSLASH_C +#define BACKSLASH_C 0 +#else +#define BACKSLASH_C 1 +#endif + typedef struct coptstruct { const char *name; uint32_t type; @@ -662,16 +771,17 @@ enum { CONF_BSR, }; static coptstruct coptlist[] = { - { "bsr", CONF_BSR, PCRE2_CONFIG_BSR }, - { "ebcdic", CONF_FIX, SUPPORT_EBCDIC }, - { "ebcdic-nl", CONF_FIZ, EBCDIC_NL }, - { "jit", CONF_INT, PCRE2_CONFIG_JIT }, - { "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE }, - { "newline", CONF_NL, PCRE2_CONFIG_NEWLINE }, - { "pcre2-16", CONF_FIX, SUPPORT_16 }, - { "pcre2-32", CONF_FIX, SUPPORT_32 }, - { "pcre2-8", CONF_FIX, SUPPORT_8 }, - { "unicode", CONF_INT, PCRE2_CONFIG_UNICODE } + { "backslash-C", CONF_FIX, BACKSLASH_C }, + { "bsr", CONF_BSR, PCRE2_CONFIG_BSR }, + { "ebcdic", CONF_FIX, SUPPORT_EBCDIC }, + { "ebcdic-nl", CONF_FIZ, EBCDIC_NL }, + { "jit", CONF_INT, PCRE2_CONFIG_JIT }, + { "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE }, + { "newline", CONF_NL, PCRE2_CONFIG_NEWLINE }, + { "pcre2-16", CONF_FIX, SUPPORT_16 }, + { "pcre2-32", CONF_FIX, SUPPORT_32 }, + { "pcre2-8", CONF_FIX, SUPPORT_8 }, + { "unicode", CONF_INT, PCRE2_CONFIG_UNICODE } }; #define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct) @@ -697,6 +807,7 @@ static BOOL restrict_for_perl_test = FALSE; static BOOL show_memory = FALSE; static int code_unit_size; /* Bytes 
*/ +static int jitrc; /* Return from JIT compile */ static int test_mode = DEFAULT_TEST_MODE; static int timeit = 0; static int timeitm = 0; @@ -711,6 +822,8 @@ static uint32_t maxlookbehind; static uint32_t max_oveccount; static uint32_t callout_count; +static uint16_t local_newline_default = 0; + static VERSION_TYPE jittarget[VERSION_SIZE]; static VERSION_TYPE version[VERSION_SIZE]; static VERSION_TYPE uversion[VERSION_SIZE]; @@ -724,11 +837,12 @@ static void *patstack[PATSTACKSIZE]; static int patstacknext = 0; #ifdef SUPPORT_PCRE2_8 -static regex_t preg = { NULL, NULL, 0, 0 }; +static regex_t preg = { NULL, NULL, 0, 0, 0 }; #endif static int *dfa_workspace = NULL; static const uint8_t *locale_tables = NULL; +static const uint8_t *use_tables = NULL; static uint8_t locale_name[32]; /* We need buffers for building 16/32-bit strings; 8-bit strings don't need @@ -737,7 +851,7 @@ buffer is where all input lines are read. Its size is the same as pbuffer8. Pattern lines are always copied to pbuffer8 for use in callouts, even if they are actually compiled from pbuffer16 or pbuffer32. */ -static int pbuffer8_size = 50000; /* Initial size, bytes */ +static size_t pbuffer8_size = 50000; /* Initial size, bytes */ static uint8_t *pbuffer8 = NULL; static uint8_t *buffer = NULL; @@ -856,21 +970,45 @@ are supported. 
*/ a = pcre2_callout_enumerate_32(compiled_code32, \ (int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c) +#define PCRE2_CODE_COPY_FROM_VOID(a,b) \ + if (test_mode == PCRE8_MODE) \ + G(a,8) = pcre2_code_copy_8(b); \ + else if (test_mode == PCRE16_MODE) \ + G(a,16) = pcre2_code_copy_16(b); \ + else \ + G(a,32) = pcre2_code_copy_32(b) + +#define PCRE2_CODE_COPY_TO_VOID(a,b) \ + if (test_mode == PCRE8_MODE) \ + a = (void *)pcre2_code_copy_8(G(b,8)); \ + else if (test_mode == PCRE16_MODE) \ + a = (void *)pcre2_code_copy_16(G(b,16)); \ + else \ + a = (void *)pcre2_code_copy_32(G(b,32)) + +#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \ + if (test_mode == PCRE8_MODE) \ + a = (void *)pcre2_code_copy_with_tables_8(G(b,8)); \ + else if (test_mode == PCRE16_MODE) \ + a = (void *)pcre2_code_copy_with_tables_16(G(b,16)); \ + else \ + a = (void *)pcre2_code_copy_with_tables_32(G(b,32)) + #define PCRE2_COMPILE(a,b,c,d,e,f,g) \ if (test_mode == PCRE8_MODE) \ - G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8)); \ + G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g); \ else if (test_mode == PCRE16_MODE) \ - G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16)); \ + G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,g); \ else \ - G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32)) + G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,g) #define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \ if (test_mode == PCRE8_MODE) \ - a = pcre2_dfa_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8),i,j); \ + a = pcre2_dfa_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h,i,j); \ else if (test_mode == PCRE16_MODE) \ - a = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16),i,j); \ + a = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h,i,j); \ else \ - a = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32),i,j) + a = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h,i,j) #define PCRE2_GET_ERROR_MESSAGE(r,a,b) \ if (test_mode == PCRE8_MODE) \ @@ -896,10 +1034,10 @@ are 
supported. */ else \ a = pcre2_get_startchar_32(G(b,32)) -#define PCRE2_JIT_COMPILE(a,b) \ - if (test_mode == PCRE8_MODE) pcre2_jit_compile_8(G(a,8),b); \ - else if (test_mode == PCRE16_MODE) pcre2_jit_compile_16(G(a,16),b); \ - else pcre2_jit_compile_32(G(a,32),b) +#define PCRE2_JIT_COMPILE(r,a,b) \ + if (test_mode == PCRE8_MODE) r = pcre2_jit_compile_8(G(a,8),b); \ + else if (test_mode == PCRE16_MODE) r = pcre2_jit_compile_16(G(a,16),b); \ + else r = pcre2_jit_compile_32(G(a,32),b) #define PCRE2_JIT_FREE_UNUSED_MEMORY(a) \ if (test_mode == PCRE8_MODE) pcre2_jit_free_unused_memory_8(G(a,8)); \ @@ -908,11 +1046,11 @@ are supported. */ #define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h) \ if (test_mode == PCRE8_MODE) \ - a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8)); \ + a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h); \ else if (test_mode == PCRE16_MODE) \ - a = pcre2_jit_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16)); \ + a = pcre2_jit_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h); \ else \ - a = pcre2_jit_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32)) + a = pcre2_jit_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h) #define PCRE2_JIT_STACK_CREATE(a,b,c,d) \ if (test_mode == PCRE8_MODE) \ @@ -945,11 +1083,11 @@ are supported. */ #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \ if (test_mode == PCRE8_MODE) \ - a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8)); \ + a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h); \ else if (test_mode == PCRE16_MODE) \ - a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16)); \ + a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h); \ else \ - a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32)) + a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h) #define PCRE2_MATCH_DATA_CREATE(a,b,c) \ if (test_mode == PCRE8_MODE) \ @@ -1055,6 +1193,22 @@ are supported. 
*/ else \ pcre2_set_match_limit_32(G(a,32),b) +#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) \ + if (test_mode == PCRE8_MODE) \ + pcre2_set_max_pattern_length_8(G(a,8),b); \ + else if (test_mode == PCRE16_MODE) \ + pcre2_set_max_pattern_length_16(G(a,16),b); \ + else \ + pcre2_set_max_pattern_length_32(G(a,32),b) + +#define PCRE2_SET_OFFSET_LIMIT(a,b) \ + if (test_mode == PCRE8_MODE) \ + pcre2_set_offset_limit_8(G(a,8),b); \ + else if (test_mode == PCRE16_MODE) \ + pcre2_set_offset_limit_16(G(a,16),b); \ + else \ + pcre2_set_offset_limit_32(G(a,32),b) + #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) \ if (test_mode == PCRE8_MODE) \ pcre2_set_parens_nest_limit_8(G(a,8),b); \ @@ -1291,19 +1445,37 @@ the three different cases. */ a = G(pcre2_callout_enumerate,BITTWO)(G(compiled_code,BITTWO), \ (int (*)(struct G(pcre2_callout_enumerate_block_,BITTWO) *, void *))b,c) +#define PCRE2_CODE_COPY_FROM_VOID(a,b) \ + if (test_mode == G(G(PCRE,BITONE),_MODE)) \ + G(a,BITONE) = G(pcre2_code_copy_,BITONE)(b); \ + else \ + G(a,BITTWO) = G(pcre2_code_copy_,BITTWO)(b) + +#define PCRE2_CODE_COPY_TO_VOID(a,b) \ + if (test_mode == G(G(PCRE,BITONE),_MODE)) \ + a = (void *)G(pcre2_code_copy_,BITONE)(G(b,BITONE)); \ + else \ + a = (void *)G(pcre2_code_copy_,BITTWO)(G(b,BITTWO)) + +#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \ + if (test_mode == G(G(PCRE,BITONE),_MODE)) \ + a = (void *)G(pcre2_code_copy_with_tables_,BITONE)(G(b,BITONE)); \ + else \ + a = (void *)G(pcre2_code_copy_with_tables_,BITTWO)(G(b,BITTWO)) + #define PCRE2_COMPILE(a,b,c,d,e,f,g) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ - G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,G(g,BITONE)); \ + G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,g); \ else \ - G(a,BITTWO) = G(pcre2_compile_,BITTWO)(G(b,BITTWO),c,d,e,f,G(g,BITTWO)) + G(a,BITTWO) = G(pcre2_compile_,BITTWO)(G(b,BITTWO),c,d,e,f,g) #define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ a = 
G(pcre2_dfa_match_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \ - G(g,BITONE),G(h,BITONE),i,j); \ + G(g,BITONE),h,i,j); \ else \ a = G(pcre2_dfa_match_,BITTWO)(G(b,BITTWO),(G(PCRE2_SPTR,BITTWO))c,d,e,f, \ - G(g,BITTWO),G(h,BITTWO),i,j) + G(g,BITTWO),h,i,j) #define PCRE2_GET_ERROR_MESSAGE(r,a,b) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ @@ -1323,11 +1495,11 @@ the three different cases. */ else \ a = G(pcre2_get_startchar_,BITTWO)(G(b,BITTWO)) -#define PCRE2_JIT_COMPILE(a,b) \ +#define PCRE2_JIT_COMPILE(r,a,b) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ - G(pcre2_jit_compile_,BITONE)(G(a,BITONE),b); \ + r = G(pcre2_jit_compile_,BITONE)(G(a,BITONE),b); \ else \ - G(pcre2_jit_compile_,BITTWO)(G(a,BITTWO),b) + r = G(pcre2_jit_compile_,BITTWO)(G(a,BITTWO),b) #define PCRE2_JIT_FREE_UNUSED_MEMORY(a) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ @@ -1338,10 +1510,10 @@ the three different cases. */ #define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ a = G(pcre2_jit_match_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \ - G(g,BITONE),G(h,BITONE)); \ + G(g,BITONE),h); \ else \ a = G(pcre2_jit_match_,BITTWO)(G(b,BITTWO),(G(PCRE2_SPTR,BITTWO))c,d,e,f, \ - G(g,BITTWO),G(h,BITTWO)) + G(g,BITTWO),h) #define PCRE2_JIT_STACK_CREATE(a,b,c,d) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ @@ -1370,10 +1542,10 @@ the three different cases. */ #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ a = G(pcre2_match_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \ - G(g,BITONE),G(h,BITONE)); \ + G(g,BITONE),h); \ else \ a = G(pcre2_match_,BITTWO)(G(b,BITTWO),(G(PCRE2_SPTR,BITTWO))c,d,e,f, \ - G(g,BITTWO),G(h,BITTWO)) + G(g,BITTWO),h) #define PCRE2_MATCH_DATA_CREATE(a,b,c) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ @@ -1455,6 +1627,18 @@ the three different cases. 
*/ else \ G(pcre2_set_match_limit_,BITTWO)(G(a,BITTWO),b) +#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) \ + if (test_mode == G(G(PCRE,BITONE),_MODE)) \ + G(pcre2_set_max_pattern_length_,BITONE)(G(a,BITONE),b); \ + else \ + G(pcre2_set_max_pattern_length_,BITTWO)(G(a,BITTWO),b) + +#define PCRE2_SET_OFFSET_LIMIT(a,b) \ + if (test_mode == G(G(PCRE,BITONE),_MODE)) \ + G(pcre2_set_offset_limit_,BITONE)(G(a,BITONE),b); \ + else \ + G(pcre2_set_offset_limit_,BITTWO)(G(a,BITTWO),b) + #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ G(pcre2_set_parens_nest_limit_,BITONE)(G(a,BITONE),b); \ @@ -1614,18 +1798,21 @@ the three different cases. */ #define PCRE2_CALLOUT_ENUMERATE(a,b,c) \ a = pcre2_callout_enumerate_8(compiled_code8, \ (int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c) +#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,8) = pcre2_code_copy_8(b) +#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_8(G(b,8)) +#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_8(G(b,8)) #define PCRE2_COMPILE(a,b,c,d,e,f,g) \ - G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8)) + G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g) #define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \ - a = pcre2_dfa_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8),i,j) + a = pcre2_dfa_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h,i,j) #define PCRE2_GET_ERROR_MESSAGE(r,a,b) \ r = pcre2_get_error_message_8(a,G(b,8),G(G(b,8),_size)) #define PCRE2_GET_OVECTOR_COUNT(a,b) a = pcre2_get_ovector_count_8(G(b,8)) #define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_8(G(b,8)) -#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b) +#define PCRE2_JIT_COMPILE(r,a,b) r = pcre2_jit_compile_8(G(a,8),b) #define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_8(G(a,8)) #define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8)) + a = 
pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h) #define PCRE2_JIT_STACK_CREATE(a,b,c,d) \ a = (PCRE2_JIT_STACK *)pcre2_jit_stack_create_8(b,c,d); #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \ @@ -1633,7 +1820,7 @@ the three different cases. */ #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_8((pcre2_jit_stack_8 *)a); #define PCRE2_MAKETABLES(a) a = pcre2_maketables_8(NULL) #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8)) + a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),h) #define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,8) = pcre2_match_data_create_8(b,c) #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \ G(a,8) = pcre2_match_data_create_from_pattern_8(G(b,8),c) @@ -1653,6 +1840,8 @@ the three different cases. */ #define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ pcre2_set_compile_recursion_guard_8(G(a,8),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b) +#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_8(G(a,8),b) +#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_8(G(a,8),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_8(G(a,8),b) #define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \ @@ -1705,18 +1894,21 @@ the three different cases. 
*/ #define PCRE2_CALLOUT_ENUMERATE(a,b,c) \ a = pcre2_callout_enumerate_16(compiled_code16, \ (int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c) +#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,16) = pcre2_code_copy_16(b) +#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_16(G(b,16)) +#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_16(G(b,16)) #define PCRE2_COMPILE(a,b,c,d,e,f,g) \ - G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16)) + G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,g) #define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \ - a = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16),i,j) + a = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h,i,j) #define PCRE2_GET_ERROR_MESSAGE(r,a,b) \ r = pcre2_get_error_message_16(a,G(b,16),G(G(b,16),_size)) #define PCRE2_GET_OVECTOR_COUNT(a,b) a = pcre2_get_ovector_count_16(G(b,16)) #define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_16(G(b,16)) -#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_16(G(a,16),b) +#define PCRE2_JIT_COMPILE(r,a,b) r = pcre2_jit_compile_16(G(a,16),b) #define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_16(G(a,16)) #define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_jit_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16)) + a = pcre2_jit_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h) #define PCRE2_JIT_STACK_CREATE(a,b,c,d) \ a = (PCRE2_JIT_STACK *)pcre2_jit_stack_create_16(b,c,d); #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \ @@ -1724,7 +1916,7 @@ the three different cases. 
*/ #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_16((pcre2_jit_stack_16 *)a); #define PCRE2_MAKETABLES(a) a = pcre2_maketables_16(NULL) #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16)) + a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),h) #define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,16) = pcre2_match_data_create_16(b,c) #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \ G(a,16) = pcre2_match_data_create_from_pattern_16(G(b,16),c) @@ -1744,6 +1936,8 @@ the three different cases. */ #define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ pcre2_set_compile_recursion_guard_16(G(a,16),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b) +#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_16(G(a,16),b) +#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_16(G(a,16),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_16(G(a,16),b) #define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \ @@ -1796,18 +1990,21 @@ the three different cases. 
*/ #define PCRE2_CALLOUT_ENUMERATE(a,b,c) \ a = pcre2_callout_enumerate_32(compiled_code32, \ (int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c) +#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,32) = pcre2_code_copy_32(b) +#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_32(G(b,32)) +#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_32(G(b,32)) #define PCRE2_COMPILE(a,b,c,d,e,f,g) \ - G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32)) + G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,g) #define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \ - a = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32),i,j) + a = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h,i,j) #define PCRE2_GET_ERROR_MESSAGE(r,a,b) \ r = pcre2_get_error_message_32(a,G(b,32),G(G(b,32),_size)) #define PCRE2_GET_OVECTOR_COUNT(a,b) a = pcre2_get_ovector_count_32(G(b,32)) #define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_32(G(b,32)) -#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_32(G(a,32),b) +#define PCRE2_JIT_COMPILE(r,a,b) r = pcre2_jit_compile_32(G(a,32),b) #define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_32(G(a,32)) #define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_jit_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32)) + a = pcre2_jit_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h) #define PCRE2_JIT_STACK_CREATE(a,b,c,d) \ a = (PCRE2_JIT_STACK *)pcre2_jit_stack_create_32(b,c,d); #define PCRE2_JIT_STACK_ASSIGN(a,b,c) \ @@ -1815,7 +2012,7 @@ the three different cases. 
*/ #define PCRE2_JIT_STACK_FREE(a) pcre2_jit_stack_free_32((pcre2_jit_stack_32 *)a); #define PCRE2_MAKETABLES(a) a = pcre2_maketables_32(NULL) #define PCRE2_MATCH(a,b,c,d,e,f,g,h) \ - a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32)) + a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),h) #define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,32) = pcre2_match_data_create_32(b,c) #define PCRE2_MATCH_DATA_CREATE_FROM_PATTERN(a,b,c) \ G(a,32) = pcre2_match_data_create_from_pattern_32(G(b,32),c) @@ -1835,6 +2032,8 @@ the three different cases. */ #define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ pcre2_set_compile_recursion_guard_32(G(a,32),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b) +#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_32(G(a,32),b) +#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_32(G(a,32),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_32(G(a,32),b) #define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \ @@ -2203,6 +2402,27 @@ static const uint8_t tables2[] = { }; +#ifndef HAVE_STRERROR +/************************************************* +* Provide strerror() for non-ANSI libraries * +*************************************************/ + +/* Some old-fashioned systems (e.g. SunOS4) didn't have strerror() in their +libraries. They may no longer be around, but just in case, we can try to +provide the same facility by this simple alternative function. 
*/ + +extern int sys_nerr; +extern char *sys_errlist[]; + +char * +strerror(int n) +{ +if (n < 0 || n >= sys_nerr) return "unknown error number"; +return sys_errlist[n]; +} +#endif /* HAVE_STRERROR */ + + /************************************************* * Local memory functions * @@ -2363,6 +2583,8 @@ static int pchar(uint32_t c, BOOL utf, FILE *f) { int n = 0; +char tempbuffer[16]; + if (PRINTOK(c)) { if (f != NULL) fprintf(f, "%c", c); @@ -2384,6 +2606,8 @@ if (c < 0x100) } if (f != NULL) n = fprintf(f, "\\x{%02x}", c); + else n = sprintf(tempbuffer, "\\x{%02x}", c); + return n >= 0 ? n : 0; } @@ -2424,13 +2648,15 @@ return (int)(pp - p); *************************************************/ /* Must handle UTF-8 strings in utf8 mode. Yields number of characters printed. -If handed a NULL file, just counts chars without printing. */ +For printing *MARK strings, a negative length is given. If handed a NULL file, +just counts chars without printing (because pchar() does that). */ static int pchars8(PCRE2_SPTR8 p, int length, BOOL utf, FILE *f) { uint32_t c = 0; int yield = 0; -if (length < 0) length = strlen((char *)p); + +if (length < 0) length = p[-1]; while (length-- > 0) { if (utf) @@ -2447,6 +2673,7 @@ while (length-- > 0) c = *p++; yield += pchar(c, utf, f); } + return yield; } #endif @@ -2458,12 +2685,13 @@ return yield; *************************************************/ /* Must handle UTF-16 strings in utf mode. Yields number of characters printed. -If handed a NULL file, just counts chars without printing. */ +For printing *MARK strings, a negative length is given. If handed a NULL file, +just counts chars without printing. 
*/ static int pchars16(PCRE2_SPTR16 p, int length, BOOL utf, FILE *f) { int yield = 0; -if (length < 0) length = strlen16(p); +if (length < 0) length = p[-1]; while (length-- > 0) { uint32_t c = *p++ & 0xffff; @@ -2491,13 +2719,14 @@ return yield; *************************************************/ /* Must handle UTF-32 strings in utf mode. Yields number of characters printed. -If handed a NULL file, just counts chars without printing. */ +For printing *MARK strings, a negative length is given.If handed a NULL file, +just counts chars without printing. */ static int pchars32(PCRE2_SPTR32 p, int length, BOOL utf, FILE *f) { int yield = 0; (void)(utf); /* Avoid compiler warning */ -if (length < 0) length = strlen32(p); +if (length < 0) length = p[-1]; while (length-- > 0) { uint32_t c = *p++; @@ -2528,7 +2757,7 @@ Returns: number of characters placed in the buffer static int ord2utf8(uint32_t cvalue, uint8_t *utf8bytes) { -register int i, j; +int i, j; if (cvalue > 0x7fffffffu) return -1; for (i = 0; i < utf8_table1_size; i++) @@ -2548,16 +2777,22 @@ return i + 1; #ifdef SUPPORT_PCRE2_16 /************************************************* -* Convert pattern to 16-bit * +* Convert string to 16-bit * *************************************************/ -/* In UTF mode the input is always interpreted as a string of UTF-8 bytes. If -all the input bytes are ASCII, the space needed for a 16-bit string is exactly -double the 8-bit size. Otherwise, the size needed for a 16-bit string is no -more than double, because up to 0xffff uses no more than 3 bytes in UTF-8 but -possibly 4 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in -UTF-16. The result is always left in pbuffer16. Impose a minimum size to save -repeated re-sizing. +/* In UTF mode the input is always interpreted as a string of UTF-8 bytes using +the original UTF-8 definition of RFC 2279, which allows for up to 6 bytes, and +code values from 0 to 0x7fffffff. 
However, values greater than the later UTF +limit of 0x10ffff cause an error. In non-UTF mode the input is interpreted as +UTF-8 if the utf8_input modifier is set, but an error is generated for values +greater than 0xffff. + +If all the input bytes are ASCII, the space needed for a 16-bit string is +exactly double the 8-bit size. Otherwise, the size needed for a 16-bit string +is no more than double, because up to 0xffff uses no more than 3 bytes in UTF-8 +but possibly 4 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes +in UTF-16. The result is always left in pbuffer16. Impose a minimum size to +save repeated re-sizing. Note that this function does not object to surrogate values. This is deliberate; it makes it possible to construct UTF-16 strings that are invalid, @@ -2565,7 +2800,7 @@ for the purpose of testing that they are correctly faulted. Arguments: p points to a byte string - utf non-zero if converting to UTF-16 + utf true in UTF mode lenptr points to number of bytes in the string (excluding trailing zero) Returns: 0 on success, with the length updated to the number of 16-bit @@ -2596,21 +2831,21 @@ if (pbuffer16_size < 2*len + 2) } pp = pbuffer16; -if (!utf) +if (!utf && (pat_patctl.control & CTL_UTF8_INPUT) == 0) { - while (len-- > 0) *pp++ = *p++; + for (; len > 0; len--) *pp++ = *p++; } else while (len > 0) { uint32_t c; int chlen = utf82ord(p, &c); if (chlen <= 0) return -1; + if (!utf && c > 0xffff) return -3; if (c > 0x10ffff) return -2; p += chlen; len -= chlen; if (c < 0x10000) *pp++ = c; else { - if (!utf) return -3; c -= 0x10000; *pp++ = 0xD800 | (c >> 10); *pp++ = 0xDC00 | (c & 0x3ff); @@ -2627,15 +2862,25 @@ return 0; #ifdef SUPPORT_PCRE2_32 /************************************************* -* Convert pattern to 32-bit * +* Convert string to 32-bit * *************************************************/ -/* In UTF mode the input is always interpreted as a string of UTF-8 bytes. 
If -all the input bytes are ASCII, the space needed for a 32-bit string is exactly -four times the 8-bit size. Otherwise, the size needed for a 32-bit string is no -more than four times, because the number of characters must be less than the -number of bytes. The result is always left in pbuffer32. Impose a minimum size -to save repeated re-sizing. +/* In UTF mode the input is always interpreted as a string of UTF-8 bytes using +the original UTF-8 definition of RFC 2279, which allows for up to 6 bytes, and +code values from 0 to 0x7fffffff. However, values greater than the later UTF +limit of 0x10ffff cause an error. + +In non-UTF mode the input is interpreted as UTF-8 if the utf8_input modifier +is set, and no limit is imposed. There is special interpretation of the 0xff +byte (which is illegal in UTF-8) in this case: it causes the top bit of the +next character to be set. This provides a way of generating 32-bit characters +greater than 0x7fffffff. + +If all the input bytes are ASCII, the space needed for a 32-bit string is +exactly four times the 8-bit size. Otherwise, the size needed for a 32-bit +string is no more than four times, because the number of characters must be +less than the number of bytes. The result is always left in pbuffer32. Impose a +minimum size to save repeated re-sizing. Note that this function does not object to surrogate values. This is deliberate; it makes it possible to construct UTF-32 strings that are invalid, @@ -2643,7 +2888,7 @@ for the purpose of testing that they are correctly faulted. 
Arguments: p points to a byte string - utf true if UTF-8 (to be converted to UTF-32) + utf true in UTF mode lenptr points to number of bytes in the string (excluding trailing zero) Returns: 0 on success, with the length updated to the number of 32-bit @@ -2673,19 +2918,29 @@ if (pbuffer32_size < 4*len + 4) } pp = pbuffer32; -if (!utf) + +if (!utf && (pat_patctl.control & CTL_UTF8_INPUT) == 0) { - while (len-- > 0) *pp++ = *p++; + for (; len > 0; len--) *pp++ = *p++; } + else while (len > 0) { + int chlen; uint32_t c; - int chlen = utf82ord(p, &c); + uint32_t topbit = 0; + if (!utf && *p == 0xff && len > 1) + { + topbit = 0x80000000u; + p++; + len--; + } + chlen = utf82ord(p, &c); if (chlen <= 0) return -1; if (utf && c > 0x10ffff) return -2; p += chlen; len -= chlen; - *pp++ = c; + *pp++ = c | topbit; } *pp = 0; @@ -2715,9 +2970,8 @@ Returns: a possibly changed offset static PCRE2_SIZE backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf) { -long int yield; - -if (!utf || test_mode == PCRE32_MODE) yield = offset - count; +if (!utf || test_mode == PCRE32_MODE) + return (count >= offset)? 0 : (offset - count); else if (test_mode == PCRE8_MODE) { @@ -2727,7 +2981,7 @@ else if (test_mode == PCRE8_MODE) pp--; while ((*pp & 0xc0) == 0x80) pp--; } - yield = pp - (PCRE2_SPTR8)subject; + return pp - (PCRE2_SPTR8)subject; } else /* 16-bit mode */ @@ -2738,13 +2992,51 @@ else /* 16-bit mode */ pp--; if ((*pp & 0xfc00) == 0xdc00) pp--; } - yield = pp - (PCRE2_SPTR16)subject; + return pp - (PCRE2_SPTR16)subject; + } +} + + + +/************************************************* +* Expand input buffers * +*************************************************/ + +/* This function doubles the size of the input buffer and the buffer for +keeping an 8-bit copy of patterns (pbuffer8), and copies the current buffers to +the new ones. 
+ +Arguments: none +Returns: nothing (aborts if malloc() fails) +*/ + +static void +expand_input_buffers(void) +{ +int new_pbuffer8_size = 2*pbuffer8_size; +uint8_t *new_buffer = (uint8_t *)malloc(new_pbuffer8_size); +uint8_t *new_pbuffer8 = (uint8_t *)malloc(new_pbuffer8_size); + +if (new_buffer == NULL || new_pbuffer8 == NULL) + { + fprintf(stderr, "pcre2test: malloc(%d) failed\n", new_pbuffer8_size); + exit(1); } -return (yield >= 0)? yield : 0; +memcpy(new_buffer, buffer, pbuffer8_size); +memcpy(new_pbuffer8, pbuffer8, pbuffer8_size); + +pbuffer8_size = new_pbuffer8_size; + +free(buffer); +free(pbuffer8); + +buffer = new_buffer; +pbuffer8 = new_pbuffer8; } + /************************************************* * Read or extend an input line * *************************************************/ @@ -2752,10 +3044,11 @@ return (yield >= 0)? yield : 0; /* Input lines are read into buffer, but both patterns and data lines can be continued over multiple input lines. In addition, if the buffer fills up, we want to automatically expand it so as to be able to handle extremely large -lines that are needed for certain stress tests. When the input buffer is -expanded, the other two buffers must also be expanded likewise, and the -contents of pbuffer, which are a copy of the input for callouts, must be -preserved (for when expansion happens for a data line). This is not the most +lines that are needed for certain stress tests, although this is less likely +now that there are repetition features for both patterns and data. When the +input buffer is expanded, the other two buffers must also be expanded likewise, +and the contents of pbuffer, which are a copy of the input for callouts, must +be preserved (for when expansion happens for a data line). This is not the most optimal way of handling this, but hey, this is just a test program! 
Arguments: @@ -2779,7 +3072,7 @@ for (;;) if (rlen > 1000) { - int dlen; + size_t dlen; /* If libreadline or libedit support is required, use readline() to read a line if the input is a terminal. Note that readline() removes the trailing @@ -2810,36 +3103,36 @@ for (;;) return (here == start)? NULL : start; } - dlen = (int)strlen((char *)here); - if (dlen > 0 && here[dlen - 1] == '\n') return start; + dlen = strlen((char *)here); here += dlen; + + /* Check for end of line reached. Take care not to read data from before + start (dlen will be zero for a file starting with a binary zero). */ + + if (here > start && here[-1] == '\n') return start; + + /* If we have not read a newline when reading a file, we have either filled + the buffer or reached the end of the file. We can detect the former by + checking that the string fills the buffer, and the latter by feof(). If + neither of these is true, it means we read a binary zero which has caused + strlen() to give a short length. This is a hard error because pcre2test + expects to work with C strings. 
*/ + + if (!INTERACTIVE(f) && dlen < rlen - 1 && !feof(f)) + { + fprintf(outfile, "** Binary zero encountered in input\n"); + fprintf(outfile, "** pcre2test run abandoned\n"); + exit(1); + } } else { - int new_pbuffer8_size = 2*pbuffer8_size; - uint8_t *new_buffer = (uint8_t *)malloc(new_pbuffer8_size); - uint8_t *new_pbuffer8 = (uint8_t *)malloc(new_pbuffer8_size); - - if (new_buffer == NULL || new_pbuffer8 == NULL) - { - fprintf(stderr, "pcre2test: malloc(%d) failed\n", new_pbuffer8_size); - exit(1); - } - - memcpy(new_buffer, buffer, pbuffer8_size); - memcpy(new_pbuffer8, pbuffer8, pbuffer8_size); - - pbuffer8_size = new_pbuffer8_size; - - start = new_buffer + (start - buffer); - here = new_buffer + (here - buffer); - - free(buffer); - free(pbuffer8); - - buffer = new_buffer; - pbuffer8 = new_pbuffer8; + size_t start_offset = start - buffer; + size_t here_offset = here - buffer; + expand_input_buffers(); + start = buffer + start_offset; + here = buffer + here_offset; } } @@ -2874,33 +3167,6 @@ return 0; -/************************************************* -* Read number from string * -*************************************************/ - -/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess -around with conditional compilation, just do the job by hand. It is only used -for unpicking arguments, so just keep it simple. 
- -Arguments: - str string to be converted - endptr where to put the end pointer - -Returns: the unsigned long -*/ - -static int -get_value(const char *str, const char **endptr) -{ -int result = 0; -while(*str != 0 && isspace(*str)) str++; -while (isdigit(*str)) result = result * 10 + (int)(*str++ - '0'); -*endptr = str; -return(result); -} - - - /************************************************* * Scan the main modifier list * *************************************************/ @@ -2928,7 +3194,7 @@ while (top > bot) if (c == 0) { if (len == mlen) return mid; - c = len - mlen; + c = (int)len - (int)mlen; } if (c > 0) bot = mid + 1; else top = mid; } @@ -3050,6 +3316,8 @@ static BOOL decode_modifiers(uint8_t *p, int ctx, patctl *pctl, datctl *dctl) { uint8_t *ep, *pp; +long li; +unsigned long uli; BOOL first = TRUE; for (;;) @@ -3066,9 +3334,14 @@ for (;;) while (isspace(*p) || *p == ',') p++; if (*p == 0) break; - /* Find the end of the item. */ + /* Find the end of the item; lose trailing whitespace at end of line. */ - for (ep = p; *ep != 0 && *ep != ',' && !isspace(*ep); ep++); + for (ep = p; *ep != 0 && *ep != ','; ep++); + if (*ep == 0) + { + while (ep > p && isspace(ep[-1])) ep--; + *ep = 0; + } /* Remember if the first character is '-'. 
*/ @@ -3192,8 +3465,8 @@ for (;;) #else *((uint16_t *)field) = PCRE2_BSR_UNICODE; #endif - if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_BSR_SET; - else dctl->control &= ~CTL_BSR_SET; + if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_BSR_SET; + else dctl->control2 &= ~CTL_BSR_SET; } else { @@ -3202,21 +3475,42 @@ for (;;) else if (len == 7 && strncmpic(pp, (const uint8_t *)"unicode", 7) == 0) *((uint16_t *)field) = PCRE2_BSR_UNICODE; else goto INVALID_VALUE; - if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_BSR_SET; - else dctl->control |= CTL_BSR_SET; + if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_BSR_SET; + else dctl->control2 |= CTL_BSR_SET; } pp = ep; break; case MOD_IN2: /* One or two unsigned integers */ if (!isdigit(*pp)) goto INVALID_VALUE; - ((uint32_t *)field)[0] = (uint32_t)strtoul((const char *)pp, &endptr, 10); + uli = strtoul((const char *)pp, &endptr, 10); + if (U32OVERFLOW(uli)) goto INVALID_VALUE; + ((uint32_t *)field)[0] = (uint32_t)uli; if (*endptr == ':') - ((uint32_t *)field)[1] = (uint32_t)strtoul((const char *)endptr+1, &endptr, 10); + { + uli = strtoul((const char *)endptr+1, &endptr, 10); + if (U32OVERFLOW(uli)) goto INVALID_VALUE; + ((uint32_t *)field)[1] = (uint32_t)uli; + } else ((uint32_t *)field)[1] = 0; pp = (uint8_t *)endptr; break; + /* PCRE2_SIZE_MAX is usually SIZE_MAX, which may be greater, equal to, or + less than ULONG_MAX. So first test for overflowing the long int, and then + test for overflowing PCRE2_SIZE_MAX if it is smaller than ULONG_MAX. 
*/ + + case MOD_SIZ: /* PCRE2_SIZE value */ + if (!isdigit(*pp)) goto INVALID_VALUE; + uli = strtoul((const char *)pp, &endptr, 10); + if (uli == ULONG_MAX) goto INVALID_VALUE; +#if ULONG_MAX > PCRE2_SIZE_MAX + if (uli > PCRE2_SIZE_MAX) goto INVALID_VALUE; +#endif + *((PCRE2_SIZE *)field) = (PCRE2_SIZE)uli; + pp = (uint8_t *)endptr; + break; + case MOD_IND: /* Unsigned integer with default */ if (len == 0) { @@ -3227,13 +3521,17 @@ for (;;) case MOD_INT: /* Unsigned integer */ if (!isdigit(*pp)) goto INVALID_VALUE; - *((uint32_t *)field) = (uint32_t)strtoul((const char *)pp, &endptr, 10); + uli = strtoul((const char *)pp, &endptr, 10); + if (U32OVERFLOW(uli)) goto INVALID_VALUE; + *((uint32_t *)field) = (uint32_t)uli; pp = (uint8_t *)endptr; break; case MOD_INS: /* Signed integer */ if (!isdigit(*pp) && *pp != '-') goto INVALID_VALUE; - *((int32_t *)field) = (int32_t)strtol((const char *)pp, &endptr, 10); + li = strtol((const char *)pp, &endptr, 10); + if (S32OVERFLOW(li)) goto INVALID_VALUE; + *((int32_t *)field) = (int32_t)li; pp = (uint8_t *)endptr; break; @@ -3245,14 +3543,14 @@ for (;;) if (i == 0) { *((uint16_t *)field) = NEWLINE_DEFAULT; - if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_NL_SET; - else dctl->control &= ~CTL_NL_SET; + if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_NL_SET; + else dctl->control2 &= ~CTL_NL_SET; } else { *((uint16_t *)field) = i; - if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_NL_SET; - else dctl->control |= CTL_NL_SET; + if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_NL_SET; + else dctl->control2 |= CTL_NL_SET; } pp = ep; break; @@ -3261,7 +3559,10 @@ for (;;) if (isdigit(*pp) || *pp == '-') { int ct = MAXCPYGET - 1; - int32_t value = (int32_t)strtol((const char *)pp, &endptr, 10); + int32_t value; + li = strtol((const char *)pp, &endptr, 10); + if (S32OVERFLOW(li)) goto INVALID_VALUE; + value = (int32_t)li; field = (char *)field - m->offset + m->value; /* Adjust 
field ptr */ if (value >= 0) /* Add new number */ { @@ -3285,10 +3586,16 @@ for (;;) char *nn = (char *)field; if (len > 0) /* Add new name */ { - while (*nn != 0) nn += strlen(nn) + 1; - if (nn + len + 1 - (char *)field > LENCPYGET) + if (len > MAX_NAME_SIZE) { - fprintf(outfile, "** Too many named '%s' modifiers\n", m->name); + fprintf(outfile, "** Group name in '%s' is too long\n", m->name); + return FALSE; + } + while (*nn != 0) nn += strlen(nn) + 1; + if (nn + len + 2 - (char *)field > LENCPYGET) + { + fprintf(outfile, "** Too many characters in named '%s' modifiers\n", + m->name); return FALSE; } memcpy(nn, pp, len); @@ -3405,15 +3712,16 @@ words. Arguments: controls control bits + controls2 more control bits before text to print before Returns: nothing */ static void -show_controls(uint32_t controls, const char *before) +show_controls(uint32_t controls, uint32_t controls2, const char *before) { -fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "", ((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "", @@ -3421,10 +3729,12 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "", ((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "", ((controls & CTL_BINCODE) != 0)? " bincode" : "", + ((controls2 & CTL_BSR_SET) != 0)? " bsr" : "", ((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "", ((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "", ((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "", ((controls & CTL_DFA) != 0)? " dfa" : "", + ((controls & CTL_EXPAND) != 0)? " expand" : "", ((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "", ((controls & CTL_FULLBINCODE) != 0)? " fullbincode" : "", ((controls & CTL_GETALL) != 0)? 
" getall" : "", @@ -3435,9 +3745,20 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((controls & CTL_JITVERIFY) != 0)? " jitverify" : "", ((controls & CTL_MARK) != 0)? " mark" : "", ((controls & CTL_MEMORY) != 0)? " memory" : "", + ((controls2 & CTL_NL_SET) != 0)? " newline" : "", + ((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "", ((controls & CTL_POSIX) != 0)? " posix" : "", + ((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "", ((controls & CTL_PUSH) != 0)? " push" : "", + ((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "", + ((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "", ((controls & CTL_STARTCHAR) != 0)? " startchar" : "", + ((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "", + ((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "", + ((controls2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) != 0)? " substitute_unknown_unset" : "", + ((controls2 & CTL2_SUBSTITUTE_UNSET_EMPTY) != 0)? " substitute_unset_empty" : "", + ((controls & CTL_USE_LENGTH) != 0)? " use_length" : "", + ((controls & CTL_UTF8_INPUT) != 0)? " utf8_input" : "", ((controls & CTL_ZERO_TERMINATE) != 0)? " zero_terminate" : ""); } @@ -3461,10 +3782,11 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", + ((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "", ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", ((options & PCRE2_AUTO_CALLOUT) != 0)? 
" auto_callout" : "", @@ -3486,6 +3808,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((options & PCRE2_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "", ((options & PCRE2_UCP) != 0)? " ucp" : "", ((options & PCRE2_UNGREEDY) != 0)? " ungreedy" : "", + ((options & PCRE2_USE_OFFSET_LIMIT) != 0)? " use_offset_limit" : "", ((options & PCRE2_UTF) != 0)? " utf" : "", after); } @@ -3528,14 +3851,18 @@ show_memory_info(void) uint32_t name_count, name_entry_size; size_t size, cblock_size; +/* One of the test_mode values will always be true, but to stop a compiler +warning we must initialize cblock_size. */ + +cblock_size = 0; #ifdef SUPPORT_PCRE2_8 -if (test_mode == 8) cblock_size = sizeof(pcre2_real_code_8); +if (test_mode == PCRE8_MODE) cblock_size = sizeof(pcre2_real_code_8); #endif #ifdef SUPPORT_PCRE2_16 -if (test_mode == 16) cblock_size = sizeof(pcre2_real_code_16); +if (test_mode == PCRE16_MODE) cblock_size = sizeof(pcre2_real_code_16); #endif #ifdef SUPPORT_PCRE2_32 -if (test_mode == 32) cblock_size = sizeof(pcre2_real_code_32); +if (test_mode == PCRE32_MODE) cblock_size = sizeof(pcre2_real_code_32); #endif (void)pattern_info(PCRE2_INFO_SIZE, &size, FALSE); @@ -3629,12 +3956,13 @@ if ((pat_patctl.control & (CTL_BINCODE|CTL_FULLBINCODE)) != 0) if ((pat_patctl.control & CTL_INFO) != 0) { - const void *nametable; - const uint8_t *start_bits; + void *nametable; + uint8_t *start_bits; BOOL match_limit_set, recursion_limit_set; uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit, - hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit, - minlength, nameentrysize, namecount, newline_convention, recursion_limit; + hasbackslashc, hascrorlf, jchanged, last_ctype, last_cunit, match_empty, + match_limit, minlength, nameentrysize, namecount, newline_convention, + recursion_limit; /* These info requests may return PCRE2_ERROR_UNSET. 
*/ @@ -3674,6 +4002,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) pattern_info(PCRE2_INFO_FIRSTBITMAP, &start_bits, FALSE) + pattern_info(PCRE2_INFO_FIRSTCODEUNIT, &first_cunit, FALSE) + pattern_info(PCRE2_INFO_FIRSTCODETYPE, &first_ctype, FALSE) + + pattern_info(PCRE2_INFO_HASBACKSLASHC, &hasbackslashc, FALSE) + pattern_info(PCRE2_INFO_HASCRORLF, &hascrorlf, FALSE) + pattern_info(PCRE2_INFO_JCHANGED, &jchanged, FALSE) + pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) + @@ -3704,7 +4033,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (namecount > 0) { fprintf(outfile, "Named capturing subpatterns:\n"); - while (namecount-- > 0) + for (; namecount > 0; namecount--) { int imm2_size = test_mode == PCRE8_MODE ? 2 : 1; uint32_t length = (uint32_t)STRLEN(nametable + imm2_size); @@ -3728,8 +4057,9 @@ if ((pat_patctl.control & CTL_INFO) != 0) } } - if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n"); - if (match_empty) fprintf(outfile, "May match empty string\n"); + if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n"); + if (hasbackslashc) fprintf(outfile, "Contains \\C\n"); + if (match_empty) fprintf(outfile, "May match empty string\n"); pattern_info(PCRE2_INFO_ARGOPTIONS, &compile_options, FALSE); pattern_info(PCRE2_INFO_ALLOPTIONS, &overall_options, FALSE); @@ -3762,13 +4092,12 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (jchanged) fprintf(outfile, "Duplicate name status changes\n"); - if ((pat_patctl.control & CTL_BSR_SET) != 0 || + if ((pat_patctl.control2 & CTL_BSR_SET) != 0 || (FLD(compiled_code, flags) & PCRE2_BSR_SET) != 0) fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)? 
"any Unicode newline" : "CR, LF, or CRLF"); - if ((pat_patctl.control & CTL_NL_SET) != 0 || - (FLD(compiled_code, flags) & PCRE2_NL_SET) != 0) + if ((FLD(compiled_code, flags) & PCRE2_NL_SET) != 0) { switch (newline_convention) { @@ -3866,11 +4195,22 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (FLD(compiled_code, executable_jit) != NULL) fprintf(outfile, "JIT compilation was successful\n"); else + { #ifdef SUPPORT_JIT - fprintf(outfile, "JIT compilation was not successful\n"); + int len; + fprintf(outfile, "JIT compilation was not successful"); + if (jitrc != 0) + { + fprintf(outfile, " ("); + PCRE2_GET_ERROR_MESSAGE(len, jitrc, pbuffer); + PCHARSV(CASTVAR(void *, pbuffer), 0, len, FALSE, outfile); + fprintf(outfile, ")"); + } + fprintf(outfile, "\n"); #else fprintf(outfile, "JIT support is not available in this version of PCRE2\n"); #endif + } } } @@ -3954,7 +4294,7 @@ if (endf == filename) *fptr = fopen((const char *)filename, mode); if (*fptr == NULL) { - fprintf(outfile, "** Failed to open '%s'\n", filename); + fprintf(outfile, "** Failed to open '%s': %s\n", filename, strerror(errno)); return PR_ABEND; } @@ -3985,6 +4325,7 @@ FILE *f; PCRE2_SIZE serial_size; size_t i; int rc, cmd, cmdlen; +uint16_t first_listed_newline; const char *cmdname; uint8_t *argptr, *serial; @@ -4039,11 +4380,37 @@ switch(cmd) (void)decode_modifiers(argptr, CTX_DEFDAT, NULL, &def_datctl); break; - /* Pop a compiled pattern off the stack. Modifiers that do not affect the - compiled pattern (e.g. to give information) are permitted. The default + /* Check the default newline, and if not one of those listed, set up the + first one to be forced. An empty list unsets. 
*/ + + case CMD_NEWLINE_DEFAULT: + local_newline_default = 0; /* Unset */ + first_listed_newline = 0; + for (;;) + { + while (isspace(*argptr)) argptr++; + if (*argptr == 0) break; + for (i = 1; i < sizeof(newlines)/sizeof(char *); i++) + { + size_t nlen = strlen(newlines[i]); + if (strncmpic(argptr, (const uint8_t *)newlines[i], nlen) == 0 && + isspace(argptr[nlen])) + { + if (i == NEWLINE_DEFAULT) return PR_OK; /* Default is valid */ + if (first_listed_newline == 0) first_listed_newline = i; + } + } + while (*argptr != 0 && !isspace(*argptr)) argptr++; + } + local_newline_default = first_listed_newline; + break; + + /* Pop or copy a compiled pattern off the stack. Modifiers that do not affect + the compiled pattern (e.g. to give information) are permitted. The default pattern modifiers are ignored. */ case CMD_POP: + case CMD_POPCOPY: if (patstacknext <= 0) { fprintf(outfile, "** Can't pop off an empty stack\n"); @@ -4052,10 +4419,19 @@ switch(cmd) memset(&pat_patctl, 0, sizeof(patctl)); /* Completely unset */ if (!decode_modifiers(argptr, CTX_POPPAT, &pat_patctl, NULL)) return PR_SKIP; - SET(compiled_code, patstack[--patstacknext]); + + if (cmd == CMD_POP) + { + SET(compiled_code, patstack[--patstacknext]); + } + else + { + PCRE2_CODE_COPY_FROM_VOID(compiled_code, patstack[patstacknext - 1]); + } + if (pat_patctl.jit != 0) { - PCRE2_JIT_COMPILE(compiled_code, pat_patctl.jit); + PCRE2_JIT_COMPILE(jitrc, compiled_code, pat_patctl.jit); } if ((pat_patctl.control & CTL_MEMORY) != 0) show_memory_info(); if ((pat_patctl.control & CTL_ANYINFO) != 0) @@ -4128,6 +4504,7 @@ switch(cmd) if (fread(serial, 1, serial_size, f) != serial_size) { fprintf(outfile, "** Wrong return from fread()\n"); + free(serial); return PR_ABEND; } fclose(f); @@ -4178,11 +4555,13 @@ static int process_pattern(void) { BOOL utf; +uint32_t k; uint8_t *p = buffer; -const uint8_t *use_tables; unsigned int delimiter = *p++; int errorcode; +void *use_pat_context; PCRE2_SIZE patlen; +PCRE2_SIZE 
valgrind_access_length; PCRE2_SIZE erroroffset; /* Initialize the context and pattern/data controls for this test from the @@ -4226,6 +4605,37 @@ patlen = p - buffer - 2; if (!decode_modifiers(p, CTX_PAT, &pat_patctl, NULL)) return PR_SKIP; utf = (pat_patctl.options & PCRE2_UTF) != 0; +/* The utf8_input modifier is not allowed in 8-bit mode, and is mutually +exclusive with the utf modifier. */ + +if ((pat_patctl.control & CTL_UTF8_INPUT) != 0) + { + if (test_mode == PCRE8_MODE) + { + fprintf(outfile, "** The utf8_input modifier is not allowed in 8-bit mode\n"); + return PR_SKIP; + } + if (utf) + { + fprintf(outfile, "** The utf and utf8_input modifiers are mutually exclusive\n"); + return PR_SKIP; + } + } + +/* Check for mutually exclusive modifiers. At present, these are all in the +first control word. */ + +for (k = 0; k < sizeof(exclusive_pat_controls)/sizeof(uint32_t); k++) + { + uint32_t c = pat_patctl.control & exclusive_pat_controls[k]; + if (c != 0 && c != (c & (~c+1))) + { + show_controls(c, 0, "** Not allowed together:"); + fprintf(outfile, "\n"); + return PR_SKIP; + } + } + /* Assume full JIT compile for jitverify and/or jitfast if nothing else was specified. */ @@ -4233,50 +4643,159 @@ if (pat_patctl.jit == 0 && (pat_patctl.control & (CTL_JITVERIFY|CTL_JITFAST)) != 0) pat_patctl.jit = 7; -/* POSIX and 'push' do not play together. */ - -if ((pat_patctl.control & (CTL_POSIX|CTL_PUSH)) == (CTL_POSIX|CTL_PUSH)) - { - fprintf(outfile, "** The POSIX interface is incompatible with 'push'\n"); - return PR_ABEND; - } - /* Now copy the pattern to pbuffer8 for use in 8-bit testing and for reflecting -in callouts. Convert to binary if required. */ +in callouts. Convert from hex if requested (literal strings in quotes may be +present within the hexadecimal pairs). The result must necessarily be fewer +characters so will always fit in pbuffer8. 
*/ if ((pat_patctl.control & CTL_HEXPAT) != 0) { uint8_t *pp, *pt; uint32_t c, d; - if ((pat_patctl.control & CTL_POSIX) != 0) - { - fprintf(outfile, "** Hex patterns are not supported for the POSIX API\n"); - return PR_SKIP; - } - pt = pbuffer8; for (pp = buffer + 1; *pp != 0; pp++) { if (isspace(*pp)) continue; - c = toupper(*pp++); - if (*pp == 0) + c = *pp++; + + /* Handle a literal substring */ + + if (c == '\'' || c == '"') { - fprintf(outfile, "** Odd number of digits in hex pattern.\n"); - return PR_SKIP; + uint8_t *pq = pp; + for (;; pp++) + { + d = *pp; + if (d == 0) + { + fprintf(outfile, "** Missing closing quote in hex pattern: " + "opening quote is at offset " PTR_SPEC ".\n", pq - buffer - 2); + return PR_SKIP; + } + if (d == c) break; + *pt++ = d; + } } - d = toupper(*pp); - if (!isxdigit(c) || !isxdigit(d)) + + /* Expect a hex pair */ + + else { - fprintf(outfile, "** Non-hex-digit in hex pattern.\n"); - return PR_SKIP; + if (!isxdigit(c)) + { + fprintf(outfile, "** Unexpected non-hex-digit '%c' at offset " + PTR_SPEC " in hex pattern: quote missing?\n", c, pp - buffer - 2); + return PR_SKIP; + } + if (*pp == 0) + { + fprintf(outfile, "** Odd number of digits in hex pattern\n"); + return PR_SKIP; + } + d = *pp; + if (!isxdigit(d)) + { + fprintf(outfile, "** Unexpected non-hex-digit '%c' at offset " + PTR_SPEC " in hex pattern: quote missing?\n", d, pp - buffer - 1); + return PR_SKIP; + } + c = toupper(c); + d = toupper(d); + *pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) + + (isdigit(d)? (d - '0') : (d - 'A' + 10)); } - *pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) + - (isdigit(d)? (d - '0') : (d - 'A' + 10)); } *pt = 0; patlen = pt - pbuffer8; } + +/* If not a hex string, process for repetition expansion if requested. 
*/ + +else if ((pat_patctl.control & CTL_EXPAND) != 0) + { + uint8_t *pp, *pt; + + pt = pbuffer8; + for (pp = buffer + 1; *pp != 0; pp++) + { + uint8_t *pc = pp; + uint32_t count = 1; + size_t length = 1; + + /* Check for replication syntax; if not found, the defaults just set will + prevail and one character will be copied. */ + + if (pp[0] == '\\' && pp[1] == '[') + { + uint8_t *pe; + for (pe = pp + 2; *pe != 0; pe++) + { + if (pe[0] == ']' && pe[1] == '{') + { + uint32_t clen = pe - pc - 2; + uint32_t i = 0; + unsigned long uli; + char *endptr; + + pe += 2; + uli = strtoul((const char *)pe, &endptr, 10); + if (U32OVERFLOW(uli)) + { + fprintf(outfile, "** Pattern repeat count too large\n"); + return PR_SKIP; + } + + i = (uint32_t)uli; + pe = (uint8_t *)endptr; + if (*pe == '}') + { + if (i == 0) + { + fprintf(outfile, "** Zero repeat not allowed\n"); + return PR_SKIP; + } + pc += 2; + count = i; + length = clen; + pp = pe; + break; + } + } + } + } + + /* Add to output. If the buffer is too small expand it. The function for + expanding buffers always keeps buffer and pbuffer8 in step as far as their + size goes. */ + + while (pt + count * length > pbuffer8 + pbuffer8_size) + { + size_t pc_offset = pc - buffer; + size_t pp_offset = pp - buffer; + size_t pt_offset = pt - pbuffer8; + expand_input_buffers(); + pc = buffer + pc_offset; + pp = buffer + pp_offset; + pt = pbuffer8 + pt_offset; + } + + for (; count > 0; count--) + { + memcpy(pt, pc, length); + pt += length; + } + } + + *pt = 0; + patlen = pt - pbuffer8; + + if ((pat_patctl.control & CTL_INFO) != 0) + fprintf(outfile, "Expanded: %s\n", pbuffer8); + } + +/* Neither hex nor expanded, just copy the input verbatim. 
*/ + else { strncpy((char *)pbuffer8, (char *)(buffer+1), patlen + 1); @@ -4288,7 +4807,7 @@ if (pat_patctl.locale[0] != 0) { if (pat_patctl.tables_id != 0) { - fprintf(outfile, "** 'Locale' and 'tables' must not both be set.\n"); + fprintf(outfile, "** 'Locale' and 'tables' must not both be set\n"); return PR_SKIP; } if (setlocale(LC_CTYPE, (const char *)pat_patctl.locale) == NULL) @@ -4336,7 +4855,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) const char *msg = "** Ignored with POSIX interface:"; #endif - if (test_mode != 8) + if (test_mode != PCRE8_MODE) { fprintf(outfile, "** The POSIX interface is available only in 8-bit mode\n"); return PR_SKIP; @@ -4358,32 +4877,83 @@ if ((pat_patctl.control & CTL_POSIX) != 0) pat_patctl.options & ~POSIX_SUPPORTED_COMPILE_OPTIONS, msg, ""); msg = ""; } - if ((pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS) != 0) + if ((pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS) != 0 || + (pat_patctl.control2 & ~POSIX_SUPPORTED_COMPILE_CONTROLS2) != 0) { - show_controls(pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS, msg); + show_controls(pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS, + pat_patctl.control2 & ~POSIX_SUPPORTED_COMPILE_CONTROLS2, msg); msg = ""; } + if (local_newline_default != 0) prmsg(&msg, "#newline_default"); + if (msg[0] == 0) fprintf(outfile, "\n"); - /* Translate PCRE2 options to POSIX options and then compile. On success, set - up a match_data block to be used for all matches. */ + /* Translate PCRE2 options to POSIX options and then compile. 
*/ if (utf) cflags |= REG_UTF; + if ((pat_patctl.control & CTL_POSIX_NOSUB) != 0) cflags |= REG_NOSUB; if ((pat_patctl.options & PCRE2_UCP) != 0) cflags |= REG_UCP; if ((pat_patctl.options & PCRE2_CASELESS) != 0) cflags |= REG_ICASE; if ((pat_patctl.options & PCRE2_MULTILINE) != 0) cflags |= REG_NEWLINE; if ((pat_patctl.options & PCRE2_DOTALL) != 0) cflags |= REG_DOTALL; - if ((pat_patctl.options & PCRE2_NO_AUTO_CAPTURE) != 0) cflags |= REG_NOSUB; if ((pat_patctl.options & PCRE2_UNGREEDY) != 0) cflags |= REG_UNGREEDY; rc = regcomp(&preg, (char *)pbuffer8, cflags); - if (rc != 0) /* Failure */ + + /* Compiling failed */ + + if (rc != 0) { - (void)regerror(rc, &preg, (char *)pbuffer8, pbuffer8_size); - fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, pbuffer8); + size_t bsize, usize; + int psize; + + preg.re_pcre2_code = NULL; /* In case something was left in there */ + preg.re_match_data = NULL; + + bsize = (pat_patctl.regerror_buffsize != 0)? + pat_patctl.regerror_buffsize : pbuffer8_size; + if (bsize + 8 < pbuffer8_size) + memcpy(pbuffer8 + bsize, "DEADBEEF", 8); + usize = regerror(rc, &preg, (char *)pbuffer8, bsize); + + /* Inside regerror(), snprintf() is used. If the buffer is too small, some + versions of snprintf() put a zero byte at the end, but others do not. + Therefore, we print a maximum of one less than the size of the buffer. */ + + psize = (int)bsize - 1; + fprintf(outfile, "Failed: POSIX code %d: %.*s\n", rc, psize, pbuffer8); + if (usize > bsize) + { + fprintf(outfile, "** regerror() message truncated\n"); + if (memcmp(pbuffer8 + bsize, "DEADBEEF", 8) != 0) + fprintf(outfile, "** regerror() buffer overflow\n"); + } return PR_SKIP; } + + /* Compiling succeeded. Check that the values in the preg block are sensible. + It can happen that pcre2test is accidentally linked with a different POSIX + library which succeeds, but of course puts different things into preg. 
In + this situation, calling regfree() may cause a segfault (or invalid free() in + valgrind), so ensure that preg.re_pcre2_code is NULL, which suppresses the + calling of regfree() on exit. */ + + if (preg.re_pcre2_code == NULL || + ((pcre2_real_code_8 *)preg.re_pcre2_code)->magic_number != MAGIC_NUMBER || + ((pcre2_real_code_8 *)preg.re_pcre2_code)->top_bracket != preg.re_nsub || + preg.re_match_data == NULL || + preg.re_cflags != cflags) + { + fprintf(outfile, + "** The regcomp() function returned zero (success), but the values set\n" + "** in the preg block are not valid for PCRE2. Check that pcre2test is\n" + "** linked with PCRE2's pcre2posix module (-lpcre2-posix) and not with\n" + "** some other POSIX regex library.\n**\n"); + preg.re_pcre2_code = NULL; + return PR_ABEND; + } + return PR_OK; #endif /* SUPPORT_PCRE2_8 */ } @@ -4391,22 +4961,26 @@ if ((pat_patctl.control & CTL_POSIX) != 0) /* Handle compiling via the native interface. Controls that act later are ignored with "push". Replacements are locked out. 
*/ -if ((pat_patctl.control & CTL_PUSH) != 0) +if ((pat_patctl.control & (CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0) { if (pat_patctl.replacement[0] != 0) { fprintf(outfile, "** Replacement text is not supported with 'push'.\n"); return PR_OK; } - if ((pat_patctl.control & ~PUSH_SUPPORTED_COMPILE_CONTROLS) != 0) + if ((pat_patctl.control & ~PUSH_SUPPORTED_COMPILE_CONTROLS) != 0 || + (pat_patctl.control2 & ~PUSH_SUPPORTED_COMPILE_CONTROLS2) != 0) { show_controls(pat_patctl.control & ~PUSH_SUPPORTED_COMPILE_CONTROLS, + pat_patctl.control2 & ~PUSH_SUPPORTED_COMPILE_CONTROLS2, "** Ignored when compiled pattern is stacked with 'push':"); fprintf(outfile, "\n"); } - if ((pat_patctl.control & PUSH_COMPILE_ONLY_CONTROLS) != 0) + if ((pat_patctl.control & PUSH_COMPILE_ONLY_CONTROLS) != 0 || + (pat_patctl.control2 & PUSH_COMPILE_ONLY_CONTROLS2) != 0) { show_controls(pat_patctl.control & PUSH_COMPILE_ONLY_CONTROLS, + pat_patctl.control2 & PUSH_COMPILE_ONLY_CONTROLS2, "** Applies only to compile when pattern is stacked with 'push':"); fprintf(outfile, "\n"); } @@ -4414,9 +4988,7 @@ if ((pat_patctl.control & CTL_PUSH) != 0) /* Convert the input in non-8-bit modes. */ -#ifdef SUPPORT_PCRE2_8 -if (test_mode == PCRE8_MODE) errorcode = 0; -#endif +errorcode = 0; #ifdef SUPPORT_PCRE2_16 if (test_mode == PCRE16_MODE) errorcode = to16(pbuffer8, utf, &patlen); @@ -4447,23 +5019,71 @@ switch(errorcode) break; } -/* The pattern is now in pbuffer[8|16|32], with the length in patlen. By -default, however, we pass a zero-terminated pattern. The length is passed only -if we had a hex pattern. */ +/* The pattern is now in pbuffer[8|16|32], with the length in code units in +patlen. By default we pass a zero-terminated pattern, but a length is passed if +"use_length" was specified or this is a hex pattern (which might contain binary +zeros). When valgrind is supported, arrange for the unused part of the buffer +to be marked as no access. 
*/ -if ((pat_patctl.control & CTL_HEXPAT) == 0) patlen = PCRE2_ZERO_TERMINATED; +valgrind_access_length = patlen; +if ((pat_patctl.control & (CTL_HEXPAT|CTL_USE_LENGTH)) == 0) + { + patlen = PCRE2_ZERO_TERMINATED; + valgrind_access_length += 1; /* For the terminating zero */ + } + +#ifdef SUPPORT_VALGRIND +#ifdef SUPPORT_PCRE2_8 +if (test_mode == PCRE8_MODE && pbuffer8 != NULL) + { + VALGRIND_MAKE_MEM_NOACCESS(pbuffer8 + valgrind_access_length, + pbuffer8_size - valgrind_access_length); + } +#endif +#ifdef SUPPORT_PCRE2_16 +if (test_mode == PCRE16_MODE && pbuffer16 != NULL) + { + VALGRIND_MAKE_MEM_NOACCESS(pbuffer16 + valgrind_access_length, + pbuffer16_size - valgrind_access_length*sizeof(uint16_t)); + } +#endif +#ifdef SUPPORT_PCRE2_32 +if (test_mode == PCRE32_MODE && pbuffer32 != NULL) + { + VALGRIND_MAKE_MEM_NOACCESS(pbuffer32 + valgrind_access_length, + pbuffer32_size - valgrind_access_length*sizeof(uint32_t)); + } +#endif +#else /* Valgrind not supported */ +(void)valgrind_access_length; /* Avoid compiler warning */ +#endif + +/* If #newline_default has been used and the library was not compiled with an +appropriate default newline setting, local_newline_default will be non-zero. We +use this if there is no explicit newline modifier. */ + +if ((pat_patctl.control2 & CTL_NL_SET) == 0 && local_newline_default != 0) + { + SETFLD(pat_context, newline_convention, local_newline_default); + } + +/* The null_context modifier is used to test calling pcre2_compile() with a +NULL context. */ + +use_pat_context = ((pat_patctl.control & CTL_NULLCONTEXT) != 0)? + NULL : PTR(pat_context); /* Compile many times when timing. 
*/ if (timeit > 0) { - register int i; + int i; clock_t time_taken = 0; for (i = 0; i < timeit; i++) { clock_t start_time = clock(); PCRE2_COMPILE(compiled_code, pbuffer, patlen, - pat_patctl.options|forbid_utf, &errorcode, &erroroffset, pat_context); + pat_patctl.options|forbid_utf, &errorcode, &erroroffset, use_pat_context); time_taken += clock() - start_time; if (TEST(compiled_code, !=, NULL)) { SUB1(pcre2_code_free, compiled_code); } @@ -4477,7 +5097,66 @@ if (timeit > 0) /* A final compile that is used "for real". */ PCRE2_COMPILE(compiled_code, pbuffer, patlen, pat_patctl.options|forbid_utf, - &errorcode, &erroroffset, pat_context); + &errorcode, &erroroffset, use_pat_context); + +/* Call the JIT compiler if requested. When timing, we must free and recompile +the pattern each time because that is the only way to free the JIT compiled +code. We know that compilation will always succeed. */ + +if (TEST(compiled_code, !=, NULL) && pat_patctl.jit != 0) + { + if (timeit > 0) + { + int i; + clock_t time_taken = 0; + for (i = 0; i < timeit; i++) + { + clock_t start_time; + SUB1(pcre2_code_free, compiled_code); + PCRE2_COMPILE(compiled_code, pbuffer, patlen, + pat_patctl.options|forbid_utf, &errorcode, &erroroffset, + use_pat_context); + start_time = clock(); + PCRE2_JIT_COMPILE(jitrc,compiled_code, pat_patctl.jit); + time_taken += clock() - start_time; + } + total_jit_compile_time += time_taken; + fprintf(outfile, "JIT compile %.4f milliseconds\n", + (((double)time_taken * 1000.0) / (double)timeit) / + (double)CLOCKS_PER_SEC); + } + else + { + PCRE2_JIT_COMPILE(jitrc, compiled_code, pat_patctl.jit); + } + } + +/* If valgrind is supported, mark the pbuffer as accessible again. The 16-bit +and 32-bit buffers can be marked completely undefined, but we must leave the +pattern in the 8-bit buffer defined because it may be read from a callout +during matching. 
*/ + +#ifdef SUPPORT_VALGRIND +#ifdef SUPPORT_PCRE2_8 +if (test_mode == PCRE8_MODE) + { + VALGRIND_MAKE_MEM_UNDEFINED(pbuffer8 + valgrind_access_length, + pbuffer8_size - valgrind_access_length); + } +#endif +#ifdef SUPPORT_PCRE2_16 +if (test_mode == PCRE16_MODE) + { + VALGRIND_MAKE_MEM_UNDEFINED(pbuffer16, pbuffer16_size); + } +#endif +#ifdef SUPPORT_PCRE2_32 +if (test_mode == PCRE32_MODE) + { + VALGRIND_MAKE_MEM_UNDEFINED(pbuffer32, pbuffer32_size); + } +#endif +#endif /* Compilation failed; go back for another re, skipping to blank line if non-interactive. */ @@ -4512,35 +5191,12 @@ if (forbid_utf != 0) if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0) return PR_ABEND; -/* Call the JIT compiler if requested. When timing, we must free and recompile -the pattern each time because that is the only way to free the JIT compiled -code. We know that compilation will always succeed. */ +/* If an explicit newline modifier was given, set the information flag in the +pattern so that it is preserved over push/pop. */ -if (pat_patctl.jit != 0) +if ((pat_patctl.control2 & CTL_NL_SET) != 0) { - if (timeit > 0) - { - register int i; - clock_t time_taken = 0; - for (i = 0; i < timeit; i++) - { - clock_t start_time; - SUB1(pcre2_code_free, compiled_code); - PCRE2_COMPILE(compiled_code, pbuffer, patlen, - pat_patctl.options|forbid_utf, &errorcode, &erroroffset, pat_context); - start_time = clock(); - PCRE2_JIT_COMPILE(compiled_code, pat_patctl.jit); - time_taken += clock() - start_time; - } - total_jit_compile_time += time_taken; - fprintf(outfile, "JIT compile %.4f milliseconds\n", - (((double)time_taken * 1000.0) / (double)timeit) / - (double)CLOCKS_PER_SEC); - } - else - { - PCRE2_JIT_COMPILE(compiled_code, pat_patctl.jit); - } + SETFLD(compiled_code, flags, FLD(compiled_code, flags) | PCRE2_NL_SET); } /* Output code size and other information if requested. 
*/ @@ -4566,6 +5222,27 @@ if ((pat_patctl.control & CTL_PUSH) != 0) SET(compiled_code, NULL); } +/* The "pushcopy" and "pushtablescopy" controls are similar, but push a +copy of the pattern, the latter with a copy of its character tables. This tests +the pcre2_code_copy() and pcre2_code_copy_with_tables() functions. */ + +if ((pat_patctl.control & (CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0) + { + if (patstacknext >= PATSTACKSIZE) + { + fprintf(outfile, "** Too many pushed patterns (max %d)\n", PATSTACKSIZE); + return PR_ABEND; + } + if ((pat_patctl.control & CTL_PUSHCOPY) != 0) + { + PCRE2_CODE_COPY_TO_VOID(patstack[patstacknext++], compiled_code); + } + else + { + PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(patstack[patstacknext++], + compiled_code); } + } + return PR_OK; } @@ -4599,10 +5276,10 @@ for (;;) if ((pat_patctl.control & CTL_JITFAST) != 0) PCRE2_JIT_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, - dat_datctl.options, match_data, dat_context); + dat_datctl.options, match_data, PTR(dat_context)); else PCRE2_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, - dat_datctl.options, match_data, dat_context); + dat_datctl.options, match_data, PTR(dat_context)); if (capcount == errnumber) { @@ -4648,6 +5325,7 @@ static int callout_function(pcre2_callout_block_8 *cb, void *callout_data_ptr) { uint32_t i, pre_start, post_start, subject_length; +PCRE2_SIZE current_position; BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0; BOOL callout_capture = (dat_datctl.control & CTL_CALLOUT_CAPTURE) != 0; @@ -4698,22 +5376,37 @@ if (callout_capture) } } -/* Re-print the subject in canonical form, the first time or if giving full -datails. On subsequent calls in the same match, we use pchars just to find the -printed lengths of the substrings. */ +/* Re-print the subject in canonical form (with escapes for non-printing +characters), the first time, or if giving full details. 
On subsequent calls in +the same match, we use PCHARS() just to find the printed lengths of the +substrings. */ if (f != NULL) fprintf(f, "--->"); +/* The subject before the match start. */ + PCHARS(pre_start, cb->subject, 0, cb->start_match, utf, f); +/* If a lookbehind is involved, the current position may be earlier than the +match start. If so, use the match start instead. */ + +current_position = (cb->current_position >= cb->start_match)? + cb->current_position : cb->start_match; + +/* The subject between the match start and the current position. */ + PCHARS(post_start, cb->subject, cb->start_match, - cb->current_position - cb->start_match, utf, f); + current_position - cb->start_match, utf, f); + +/* Print from the current position to the end. */ + +PCHARSV(cb->subject, current_position, cb->subject_length - current_position, + utf, f); + +/* Calculate the total subject printed length (no print). */ PCHARS(subject_length, cb->subject, 0, cb->subject_length, utf, NULL); -PCHARSV(cb->subject, cb->current_position, - cb->subject_length - cb->current_position, utf, f); - if (f != NULL) fprintf(f, "\n"); /* For automatic callouts, show the pattern offset. Otherwise, for a numerical @@ -4745,9 +5438,9 @@ if (post_start > 0) for (i = 0; i < subject_length - pre_start - post_start + 4; i++) fprintf(outfile, " "); -fprintf(outfile, "%.*s", - (int)((cb->next_item_length == 0)? 1 : cb->next_item_length), - pbuffer8 + cb->pattern_position); +if (cb->next_item_length != 0) + fprintf(outfile, "%.*s", (int)(cb->next_item_length), + pbuffer8 + cb->pattern_position); fprintf(outfile, "\n"); first_callout = FALSE; @@ -4775,8 +5468,17 @@ if (callout_data_ptr != NULL) } } -return (cb->callout_number != dat_datctl.cfail[0])? 0 : - (++callout_count >= dat_datctl.cfail[1])? 
1 : 0; +callout_count++; + +if (cb->callout_number == dat_datctl.cerror[0] && + callout_count >= dat_datctl.cerror[1]) + return PCRE2_ERROR_CALLOUT; + +if (cb->callout_number == dat_datctl.cfail[0] && + callout_count >= dat_datctl.cfail[1]) + return 1; + +return 0; } @@ -5025,10 +5727,11 @@ process_data(void) { PCRE2_SIZE len, ulen; uint32_t gmatched; -uint32_t c; +uint32_t c, k; uint32_t g_notempty = 0; uint8_t *p, *pp, *start_rep; size_t needlen; +void *use_dat_context; BOOL utf; #ifdef SUPPORT_PCRE2_8 @@ -5050,6 +5753,7 @@ matching. */ DATCTXCPY(dat_context, default_dat_context); memcpy(&dat_datctl, &def_datctl, sizeof(datctl)); dat_datctl.control |= (pat_patctl.control & CTL_ALLPD); +dat_datctl.control2 |= (pat_patctl.control2 & CTL2_ALLPD); strcpy((char *)dat_datctl.replacement, (char *)pat_patctl.replacement); /* Initialize for scanning the data line. */ @@ -5098,7 +5802,7 @@ if (dbuffer != NULL) the number of code units that will be needed (though the buffer may have to be extended if replication is involved). */ -needlen = (size_t)(len * code_unit_size); +needlen = (size_t)((len+1) * code_unit_size); if (dbuffer == NULL || needlen >= dbuffer_size) { while (needlen >= dbuffer_size) dbuffer_size *= 2; @@ -5112,17 +5816,21 @@ if (dbuffer == NULL || needlen >= dbuffer_size) SETCASTPTR(q, dbuffer); /* Sets q8, q16, or q32, as appropriate. */ /* Scan the data line, interpreting data escapes, and put the result into a -buffer of the appropriate width. In UTF mode, input can be UTF-8. */ +buffer of the appropriate width. In UTF mode, input is always UTF-8; otherwise, +in 16- and 32-bit modes, it can be forced to UTF-8 by the utf8_input modifier. 
+*/ while ((c = *p++) != 0) { - int i = 0; + int32_t i = 0; size_t replen; /* ] may mark the end of a replicated sequence */ if (c == ']' && start_rep != NULL) { + long li; + char *endptr; size_t qoffset = CAST8VAR(q) - dbuffer; size_t rep_offset = start_rep - dbuffer; @@ -5131,12 +5839,22 @@ while ((c = *p++) != 0) fprintf(outfile, "** Expected '{' after \\[....]\n"); return PR_OK; } - while (isdigit(*p)) i = i * 10 + *p++ - '0'; + + li = strtol((const char *)p, &endptr, 10); + if (S32OVERFLOW(li)) + { + fprintf(outfile, "** Repeat count too large\n"); + return PR_OK; + } + + p = (uint8_t *)endptr; if (*p++ != '}') { fprintf(outfile, "** Expected '}' after \\[...]{...\n"); return PR_OK; } + + i = (int32_t)li; if (i-- == 0) { fprintf(outfile, "** Zero repeat not allowed\n"); @@ -5169,11 +5887,20 @@ while ((c = *p++) != 0) continue; } - /* Handle a non-escaped character */ + /* Handle a non-escaped character. In non-UTF 32-bit mode with utf8_input + set, do the fudge for setting the top bit. */ if (c != '\\') { - if (utf && HASUTF8EXTRALEN(c)) { GETUTF8INC(c, p); } + uint32_t topbit = 0; + if (test_mode == PCRE32_MODE && c == 0xff && *p != 0) + { + topbit = 0x80000000; + c = *p++; + } + if ((utf || (pat_patctl.control & CTL_UTF8_INPUT) != 0) && + HASUTF8EXTRALEN(c)) { GETUTF8INC(c, p); } + c |= topbit; } /* Handle backslash escapes */ @@ -5367,38 +6094,44 @@ ulen = len/code_unit_size; /* Length in code units */ if (p[-1] != 0 && !decode_modifiers(p, CTX_DAT, NULL, &dat_datctl)) return PR_OK; -/* Check for mutually exclusive modifiers. */ +/* Check for mutually exclusive modifiers. At present, these are all in the +first control word. 
*/ -c = dat_datctl.control & EXCLUSIVE_DAT_CONTROLS; -if (c - (c & -c) != 0) +for (k = 0; k < sizeof(exclusive_dat_controls)/sizeof(uint32_t); k++) { - show_controls(c, "** Not allowed together:"); - fprintf(outfile, "\n"); + c = dat_datctl.control & exclusive_dat_controls[k]; + if (c != 0 && c != (c & (~c+1))) + { + show_controls(c, 0, "** Not allowed together:"); + fprintf(outfile, "\n"); + return PR_OK; + } + } + +if (pat_patctl.replacement[0] != 0 && + (dat_datctl.control & CTL_NULLCONTEXT) != 0) + { + fprintf(outfile, "** Replacement text is not supported with null_context.\n"); return PR_OK; } -/* If we have explicit valgrind support, mark the data from after its end to -the end of the buffer as unaddressable, so that a read over the end of the -buffer will be seen by valgrind, even if it doesn't cause a crash. If we're not -building with valgrind support, at least move the data to the end of the buffer -so that it might at least cause a crash. If we are using the POSIX interface, -or testing zero-termination, we must include the terminating zero. */ +/* We now have the subject in dbuffer, with len containing the byte length, and +ulen containing the code unit length. Move the data to the end of the buffer so +that a read over the end can be caught by valgrind or other means. If we have +explicit valgrind support, mark the unused start of the buffer unaddressable. +If we are using the POSIX interface, or testing zero-termination, we must +include the terminating zero in the usable data. */ -pp = dbuffer; c = code_unit_size * (((pat_patctl.control & CTL_POSIX) + (dat_datctl.control & CTL_ZERO_TERMINATE) != 0)? 
1:0); - +pp = memmove(dbuffer + dbuffer_size - len - c, dbuffer, len + c); #ifdef SUPPORT_VALGRIND - VALGRIND_MAKE_MEM_NOACCESS(dbuffer + len + c, dbuffer_size - (len + c)); -#else - pp = memmove(pp + dbuffer_size - len - c, pp, len + c); + VALGRIND_MAKE_MEM_NOACCESS(dbuffer, dbuffer_size - (len + c)); #endif -/* We now have len containing the byte length, ulen containing the code unit -length, and pp pointing to the subject string. POSIX matching is only possible -in 8-bit mode, and it does not support timing or other fancy features. Some -were checked at compile time, but we need to check the match-time settings -here. */ +/* Now pp points to the subject string. POSIX matching is only possible in +8-bit mode, and it does not support timing or other fancy features. Some were +checked at compile time, but we need to check the match-time settings here. */ #ifdef SUPPORT_PCRE2_8 if ((pat_patctl.control & CTL_POSIX) != 0) @@ -5408,13 +6141,16 @@ if ((pat_patctl.control & CTL_POSIX) != 0) regmatch_t *pmatch = NULL; const char *msg = "** Ignored with POSIX interface:"; - if (dat_datctl.cfail[0] != CFAIL_UNSET || dat_datctl.cfail[1] != CFAIL_UNSET) + if (dat_datctl.cerror[0] != CFORE_UNSET || dat_datctl.cerror[1] != CFORE_UNSET) + prmsg(&msg, "callout_error"); + if (dat_datctl.cfail[0] != CFORE_UNSET || dat_datctl.cfail[1] != CFORE_UNSET) prmsg(&msg, "callout_fail"); if (dat_datctl.copy_numbers[0] >= 0 || dat_datctl.copy_names[0] != 0) prmsg(&msg, "copy"); if (dat_datctl.get_numbers[0] >= 0 || dat_datctl.get_names[0] != 0) prmsg(&msg, "get"); if (dat_datctl.jitstack != 0) prmsg(&msg, "jitstack"); + if (dat_datctl.offset != 0) prmsg(&msg, "offset"); if ((dat_datctl.options & ~POSIX_SUPPORTED_MATCH_OPTIONS) != 0) { @@ -5422,9 +6158,11 @@ if ((pat_patctl.control & CTL_POSIX) != 0) show_match_options(dat_datctl.options & ~POSIX_SUPPORTED_MATCH_OPTIONS); msg = ""; } - if ((dat_datctl.control & ~POSIX_SUPPORTED_MATCH_CONTROLS) != 0) + if ((dat_datctl.control & 
~POSIX_SUPPORTED_MATCH_CONTROLS) != 0 || + (dat_datctl.control2 & ~POSIX_SUPPORTED_MATCH_CONTROLS2) != 0) { - show_controls(dat_datctl.control & ~POSIX_SUPPORTED_MATCH_CONTROLS, msg); + show_controls(dat_datctl.control & ~POSIX_SUPPORTED_MATCH_CONTROLS, + dat_datctl.control2 & ~POSIX_SUPPORTED_MATCH_CONTROLS2, msg); msg = ""; } @@ -5436,14 +6174,13 @@ if ((pat_patctl.control & CTL_POSIX) != 0) if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL; if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY; - rc = regexec(&preg, (const char *)pp + dat_datctl.offset, - dat_datctl.oveccount, pmatch, eflags); + rc = regexec(&preg, (const char *)pp, dat_datctl.oveccount, pmatch, eflags); if (rc != 0) { (void)regerror(rc, &preg, (char *)pbuffer8, pbuffer8_size); fprintf(outfile, "No match: POSIX code %d: %s\n", rc, pbuffer8); } - else if ((pat_patctl.options & PCRE2_NO_AUTO_CAPTURE) != 0) + else if ((pat_patctl.control & CTL_POSIX_NOSUB) != 0) fprintf(outfile, "Matched with REG_NOSUB\n"); else if (dat_datctl.oveccount == 0) fprintf(outfile, "Matched without capture\n"); @@ -5454,18 +6191,27 @@ if ((pat_patctl.control & CTL_POSIX) != 0) { if (pmatch[i].rm_so >= 0) { + PCRE2_SIZE start = pmatch[i].rm_so; + PCRE2_SIZE end = pmatch[i].rm_eo; + if (start > end) + { + start = pmatch[i].rm_eo; + end = pmatch[i].rm_so; + fprintf(outfile, "Start of matched string is beyond its end - " + "displaying from end to start.\n"); + } fprintf(outfile, "%2d: ", (int)i); - PCHARSV(dbuffer, pmatch[i].rm_so, - pmatch[i].rm_eo - pmatch[i].rm_so, utf, outfile); + PCHARSV(pp, start, end - start, utf, outfile); fprintf(outfile, "\n"); + if ((i == 0 && (dat_datctl.control & CTL_AFTERTEXT) != 0) || (dat_datctl.control & CTL_ALLAFTERTEXT) != 0) { fprintf(outfile, "%2d+ ", (int)i); - PCHARSV(dbuffer, pmatch[i].rm_eo, len - pmatch[i].rm_eo, - utf, outfile); - fprintf(outfile, "\n"); - } + /* Note: don't use the start/end variables here because we want to + show the text from 
what is reported as the end. */ + PCHARSV(pp, pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf, outfile); + fprintf(outfile, "\n"); } } } } @@ -5498,6 +6244,12 @@ if ((dat_datctl.control & (CTL_ALLUSEDTEXT|CTL_DFA)) == CTL_ALLUSEDTEXT && if ((dat_datctl.control & CTL_ZERO_TERMINATE) != 0) ulen = PCRE2_ZERO_TERMINATED; +/* The nullcontext modifier is used to test calling pcre2_[jit_]match() with a +NULL context. */ + +use_dat_context = ((dat_datctl.control & CTL_NULLCONTEXT) != 0)? + NULL : PTR(dat_context); + /* Enable display of malloc/free if wanted. */ show_memory = (dat_datctl.control & CTL_MEMORY) != 0; @@ -5571,7 +6323,7 @@ if (dat_datctl.replacement[0] != 0) uint8_t *pr; uint8_t rbuffer[REPLACE_BUFFSIZE]; uint8_t nbuffer[REPLACE_BUFFSIZE]; - uint32_t goption; + uint32_t xoptions; PCRE2_SIZE rlen, nsize, erroroffset; BOOL badutf = FALSE; @@ -5588,8 +6340,17 @@ if (dat_datctl.replacement[0] != 0) if (timeitm) fprintf(outfile, "** Timing is not supported with replace: ignored\n"); - goption = ((dat_datctl.control & CTL_GLOBAL) == 0)? 0 : - PCRE2_SUBSTITUTE_GLOBAL; + xoptions = (((dat_datctl.control & CTL_GLOBAL) == 0)? 0 : + PCRE2_SUBSTITUTE_GLOBAL) | + (((dat_datctl.control2 & CTL2_SUBSTITUTE_EXTENDED) == 0)? 0 : + PCRE2_SUBSTITUTE_EXTENDED) | + (((dat_datctl.control2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) == 0)? 0 : + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) | + (((dat_datctl.control2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) == 0)? 0 : + PCRE2_SUBSTITUTE_UNKNOWN_UNSET) | + (((dat_datctl.control2 & CTL2_SUBSTITUTE_UNSET_EMPTY) == 0)? 0 : + PCRE2_SUBSTITUTE_UNSET_EMPTY); + SETCASTPTR(r, rbuffer); /* Sets r8, r16, or r32, as appropriate. 
*/ pr = dat_datctl.replacement; @@ -5676,14 +6437,21 @@ if (dat_datctl.replacement[0] != 0) else rlen = (CASTVAR(uint8_t *, r) - rbuffer)/code_unit_size; PCRE2_SUBSTITUTE(rc, compiled_code, pp, ulen, dat_datctl.offset, - dat_datctl.options|goption, match_data, dat_context, + dat_datctl.options|xoptions, match_data, dat_context, rbuffer, rlen, nbuffer, &nsize); if (rc < 0) { - fprintf(outfile, "Failed: error %d: ", rc); - PCRE2_GET_ERROR_MESSAGE(nsize, rc, pbuffer); - PCHARSV(CASTVAR(void *, pbuffer), 0, nsize, FALSE, outfile); + PCRE2_SIZE msize; + fprintf(outfile, "Failed: error %d", rc); + if (rc != PCRE2_ERROR_NOMEMORY && nsize != PCRE2_UNSET) + fprintf(outfile, " at offset %ld in replacement", (long int)nsize); + fprintf(outfile, ": "); + PCRE2_GET_ERROR_MESSAGE(msize, rc, pbuffer); + PCHARSV(CASTVAR(void *, pbuffer), 0, msize, FALSE, outfile); + if (rc == PCRE2_ERROR_NOMEMORY && + (xoptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) != 0) + fprintf(outfile, ": %ld code units are needed", (long int)nsize); } else { @@ -5735,7 +6503,7 @@ else for (gmatched = 0;; gmatched++) if (timeitm > 0) { - register int i; + int i; clock_t start_time, time_taken; if ((dat_datctl.control & CTL_DFA) != 0) @@ -5752,7 +6520,7 @@ else for (gmatched = 0;; gmatched++) { PCRE2_DFA_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, dat_datctl.options | g_notempty, match_data, - dat_context, dfa_workspace, DFA_WS_DIMENSION); + use_dat_context, dfa_workspace, DFA_WS_DIMENSION); } } @@ -5763,7 +6531,7 @@ else for (gmatched = 0;; gmatched++) { PCRE2_JIT_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, dat_datctl.options | g_notempty, match_data, - dat_context); + use_dat_context); } } @@ -5774,7 +6542,7 @@ else for (gmatched = 0;; gmatched++) { PCRE2_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, dat_datctl.options | g_notempty, match_data, - dat_context); + use_dat_context); } } total_match_time += (time_taken = clock() - start_time); @@ -5795,7 +6563,7 @@ else 
for (gmatched = 0;; gmatched++) } /* Otherwise just run a single match, setting up a callout if required (the - default). */ + default). There is a copy of the pattern in pbuffer8 for use by callouts. */ else { @@ -5822,7 +6590,7 @@ else for (gmatched = 0;; gmatched++) dfa_workspace[0] = -1; /* To catch bad restart */ PCRE2_DFA_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, dat_datctl.options | g_notempty, match_data, - dat_context, dfa_workspace, DFA_WS_DIMENSION); + use_dat_context, dfa_workspace, DFA_WS_DIMENSION); if (capcount == 0) { fprintf(outfile, "Matched, but offsets vector is too small to show all matches\n"); @@ -5833,10 +6601,10 @@ else for (gmatched = 0;; gmatched++) { if ((pat_patctl.control & CTL_JITFAST) != 0) PCRE2_JIT_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, - dat_datctl.options | g_notempty, match_data, dat_context); + dat_datctl.options | g_notempty, match_data, use_dat_context); else PCRE2_MATCH(capcount, compiled_code, pp, ulen, dat_datctl.offset, - dat_datctl.options | g_notempty, match_data, dat_context); + dat_datctl.options | g_notempty, match_data, use_dat_context); if (capcount == 0) { fprintf(outfile, "Matched, but too many substrings\n"); @@ -5884,15 +6652,23 @@ else for (gmatched = 0;; gmatched++) /* "allcaptures" requests showing of all captures in the pattern, to check unset ones at the end. It may be set on the pattern or the data. Implement - by setting capcount to the maximum. */ + by setting capcount to the maximum. This is not relevant for DFA matching, + so ignore it. 
*/ if ((dat_datctl.control & CTL_ALLCAPTURES) != 0) { uint32_t maxcapcount; - if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0) - return PR_SKIP; - capcount = maxcapcount + 1; /* Allow for full match */ - if (capcount > (int)oveccount) capcount = oveccount; + if ((dat_datctl.control & CTL_DFA) != 0) + { + fprintf(outfile, "** Ignored after DFA matching: allcaptures\n"); + } + else + { + if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0) + return PR_SKIP; + capcount = maxcapcount + 1; /* Allow for full match */ + if (capcount > (int)oveccount) capcount = oveccount; + } } /* Output the captured substrings. Note that, for the matched string, @@ -6046,7 +6822,8 @@ else for (gmatched = 0;; gmatched++) TESTFLD(match_data, mark, !=, NULL)) { fprintf(outfile, ", mark="); - PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf, outfile); + PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf, + outfile); rubriclength += 7; } fprintf(outfile, ": "); @@ -6340,10 +7117,12 @@ printf(" -16 use the 16-bit library\n"); #ifdef SUPPORT_PCRE2_32 printf(" -32 use the 32-bit library\n"); #endif -printf(" -b set default pattern control 'fullbincode'\n"); +printf(" -ac set default pattern option PCRE2_AUTO_CALLOUT\n"); +printf(" -b set default pattern modifier 'fullbincode'\n"); printf(" -C show PCRE2 compile-time options and exit\n"); printf(" -C arg show a specific compile-time option and exit with its\n"); printf(" value if numeric (else 0). 
The arg can be:\n"); +printf(" backslash-C use of \\C is enabled [0, 1]\n"); printf(" bsr \\R type [ANYCRLF, ANY]\n"); printf(" ebcdic compiled for EBCDIC character code [0,1]\n"); printf(" ebcdic-nl NL code if compiled for EBCDIC\n"); @@ -6354,14 +7133,15 @@ printf(" pcre2-8 8 bit library support enabled [0, 1]\n"); printf(" pcre2-16 16 bit library support enabled [0, 1]\n"); printf(" pcre2-32 32 bit library support enabled [0, 1]\n"); printf(" unicode Unicode and UTF support enabled [0, 1]\n"); -printf(" -d set default pattern control 'debug'\n"); -printf(" -dfa set default subject control 'dfa'\n"); +printf(" -d set default pattern modifier 'debug'\n"); +printf(" -dfa set default subject modifier 'dfa'\n"); +printf(" -error show messages for error numbers, then exit\n"); printf(" -help show usage information\n"); -printf(" -i set default pattern control 'info'\n"); -printf(" -jit set default pattern control 'jit'\n"); +printf(" -i set default pattern modifier 'info'\n"); +printf(" -jit set default pattern modifier 'jit'\n"); printf(" -q quiet: do not output PCRE2 version number at start\n"); -printf(" -pattern set default pattern control fields\n"); -printf(" -subject