From c105117b40d1a7b2b9ddf1672cd08b11bd565bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Svensson?= Date: Mon, 23 Oct 2023 12:38:24 +0200 Subject: [PATCH] Use PCRE2 instead of PCRE (#153) PCRE is now at end of life and is no longer actively maintained. Lift the dependency to the next major version, i.e. PCRE2. Implementation notes: - Removed the pcre study option since: "The new API ... was simplified by abolishing the separate "study" optimizing function; in PCRE2, patterns are automatically optimized where possible." - If asprintf() fails, the content of the 'strp' variable is undefined. Let's check the return value and return NULL upon error. - Pattern and subject can straightforwardly be cast to PCRE2_SPTR since we only work with 8-bit code units. --- .travis-ci/install.sh | 2 +- CMakeLists.txt | 2 +- README.md | 10 ++--- cmake/Modules/FindPCRE.cmake | 37 ---------------- cmake/Modules/FindPCRE2.cmake | 37 ++++++++++++++++ configure.ac | 2 +- dist-debian/control | 2 +- include/r3.h | 8 ++-- r3.pc.in | 2 +- src/CMakeLists.txt | 2 +- src/edge.c | 2 - src/match_entry.c | 1 - src/node.c | 80 ++++++++++++++++------------------- 13 files changed, 88 insertions(+), 99 deletions(-) delete mode 100644 cmake/Modules/FindPCRE.cmake create mode 100644 cmake/Modules/FindPCRE2.cmake diff --git a/.travis-ci/install.sh b/.travis-ci/install.sh index 577d1a5..5a7dbb2 100755 --- a/.travis-ci/install.sh +++ b/.travis-ci/install.sh @@ -10,7 +10,7 @@ apt-get install -qq \ cmake \ graphviz-dev \ libjemalloc-dev \ - libpcre3-dev \ + libpcre2-dev \ libtool \ ninja-build \ pkg-config diff --git a/CMakeLists.txt b/CMakeLists.txt index 5223166..0e31a0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) set(CMAKE_C_STANDARD 99) find_package(Check) -find_package(PCRE REQUIRED) +find_package(PCRE2 REQUIRED) include(CheckSymbolExists) include(CheckIncludeFile) diff --git a/README.md b/README.md index 
f3a8423..9106816 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Requirement ### Runtime Requirement -* pcre +* pcre2 * (optional) graphviz version 2.38.0 (20140413.2041) * (optional) libjson-c-dev @@ -187,13 +187,13 @@ Optimization Simple regular expressions are optimized through a regexp pattern to opcode translator, which translates simple patterns into small & fast scanners. -By using this method, r3 reduces the matching overhead of pcre library. +By using this method, r3 reduces the matching overhead of pcre2 library. Optimized patterns are: `[a-z]+`, `[0-9]+`, `\d+`, `\w+`, `[^/]+`, `[^-]+` or `.*`. Slugs without specified regular expression will be compiled into the `[^/]+` pattern. therefore, it's optimized too. -Complex regular expressions will still use libpcre to match URL (partially). +Complex regular expressions will still use libpcre2 to match URL (partially). Performance @@ -356,7 +356,7 @@ if ( $error ) { Install ---------------------- - sudo apt-get install check libpcre3 libpcre3-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config + sudo apt-get install check libpcre2 libpcre2-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config sudo apt-get install graphviz-dev graphviz # if you want graphviz ./autogen.sh ./configure && make @@ -364,7 +364,7 @@ Install And we support debian-based distro now! - sudo apt-get install build-essential autoconf automake libpcre3-dev pkg-config debhelper libtool check + sudo apt-get install build-essential autoconf automake libpcre2-dev pkg-config debhelper libtool check mv dist-debian debian dpkg-buildpackage -b -us -uc sudo gdebi ../libr3*.deb diff --git a/cmake/Modules/FindPCRE.cmake b/cmake/Modules/FindPCRE.cmake deleted file mode 100644 index dbbd60a..0000000 --- a/cmake/Modules/FindPCRE.cmake +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (C) 2007-2009 LuaDist. 
-# Created by Peter Kapec -# Redistribution and use of this file is allowed according to the terms of the MIT license. -# For details see the COPYRIGHT file distributed with LuaDist. -# Note: -# Searching headers and libraries is very simple and is NOT as powerful as scripts -# distributed with CMake, because LuaDist defines directories to search for. -# Everyone is encouraged to contact the author with improvements. Maybe this file -# becomes part of CMake distribution sometimes. - -# - Find pcre -# Find the native PCRE headers and libraries. -# -# PCRE_INCLUDE_DIRS - where to find pcre.h, etc. -# PCRE_LIBRARIES - List of libraries when using pcre. -# PCRE_FOUND - True if pcre found. - -# Look for the header file. -FIND_PATH(PCRE_INCLUDE_DIR NAMES pcre.h) - -# Look for the library. -FIND_LIBRARY(PCRE_LIBRARY NAMES pcre) - -# Handle the QUIETLY and REQUIRED arguments and set PCRE_FOUND to TRUE if all listed variables are TRUE. -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(PCRE DEFAULT_MSG PCRE_LIBRARY PCRE_INCLUDE_DIR) - -# Copy the results to the output variables. -IF(PCRE_FOUND) - SET(PCRE_LIBRARIES ${PCRE_LIBRARY}) - SET(PCRE_INCLUDE_DIRS ${PCRE_INCLUDE_DIR}) -ELSE(PCRE_FOUND) - SET(PCRE_LIBRARIES) - SET(PCRE_INCLUDE_DIRS) -ENDIF(PCRE_FOUND) - -MARK_AS_ADVANCED(PCRE_INCLUDE_DIRS PCRE_LIBRARIES) diff --git a/cmake/Modules/FindPCRE2.cmake b/cmake/Modules/FindPCRE2.cmake new file mode 100644 index 0000000..1e6b6df --- /dev/null +++ b/cmake/Modules/FindPCRE2.cmake @@ -0,0 +1,37 @@ +# Copyright (C) 2007-2009 LuaDist. +# Created by Peter Kapec +# Redistribution and use of this file is allowed according to the terms of the MIT license. +# For details see the COPYRIGHT file distributed with LuaDist. +# Note: +# Searching headers and libraries is very simple and is NOT as powerful as scripts +# distributed with CMake, because LuaDist defines directories to search for. +# Everyone is encouraged to contact the author with improvements. 
Maybe this file +# becomes part of CMake distribution sometimes. + +# - Find pcre2 +# Find the native PCRE2 headers and libraries. +# +# PCRE2_INCLUDE_DIRS - where to find pcre2.h, etc. +# PCRE2_LIBRARIES - List of libraries when using pcre2. +# PCRE2_FOUND - True if pcre2 found. + +# Look for the header file. +FIND_PATH(PCRE2_INCLUDE_DIR NAMES pcre2.h) + +# Look for the library. +FIND_LIBRARY(PCRE2_LIBRARY NAMES pcre2-8) + +# Handle the QUIETLY and REQUIRED arguments and set PCRE2_FOUND to TRUE if all listed variables are TRUE. +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(PCRE2 DEFAULT_MSG PCRE2_LIBRARY PCRE2_INCLUDE_DIR) + +# Copy the results to the output variables. +IF(PCRE2_FOUND) + SET(PCRE2_LIBRARIES ${PCRE2_LIBRARY}) + SET(PCRE2_INCLUDE_DIRS ${PCRE2_INCLUDE_DIR}) +ELSE(PCRE2_FOUND) + SET(PCRE2_LIBRARIES) + SET(PCRE2_INCLUDE_DIRS) +ENDIF(PCRE2_FOUND) + +MARK_AS_ADVANCED(PCRE2_INCLUDE_DIRS PCRE2_LIBRARIES) diff --git a/configure.ac b/configure.ac index 3d7d7e7..8dd7d85 100644 --- a/configure.ac +++ b/configure.ac @@ -73,7 +73,7 @@ AM_CONDITIONAL(USE_JEMALLOC, test "x$have_jemalloc" = "xyes") # AC_DEFINE(USE_JEMALLOC, test "x$found_jemalloc" = "xyes" , "use jemalloc") -PKG_CHECK_MODULES(DEPS, [libpcre]) +PKG_CHECK_MODULES(DEPS, [libpcre2-8]) AC_SUBST(DEPS_CFLAGS) AC_SUBST(DEPS_LIBS) diff --git a/dist-debian/control b/dist-debian/control index c3edea4..1f14a29 100644 --- a/dist-debian/control +++ b/dist-debian/control @@ -2,7 +2,7 @@ Source: libr3 Priority: optional Maintainer: Ronmi Ren Build-Depends: debhelper (>= 8.0.0), automake, autotools-dev, autoconf, - libtool, libpcre3-dev, pkg-config, check + libtool, libpcre2-dev, pkg-config, check Standards-Version: 3.9.4 Section: libs Homepage: https://github.com/c9s/r3 diff --git a/include/r3.h b/include/r3.h index 02f6e9f..5f119d3 100644 --- a/include/r3.h +++ b/include/r3.h @@ -10,7 +10,8 @@ #include #include #include -#include +#define PCRE2_CODE_UNIT_WIDTH 8 +#include #if 
__STDC_VERSION__ <= 201710L #ifdef HAVE_STDBOOL_H @@ -43,13 +44,12 @@ struct _node { R3_VECTOR(R3Edge) edges; R3_VECTOR(R3Route) routes; char * combined_pattern; - pcre * pcre_pattern; - pcre_extra * pcre_extra; + pcre2_code * pcre_pattern; + pcre2_match_data * match_data; // edges are mostly less than 255 unsigned int compare_type; // compare_type: pcre, opcode, string unsigned int endpoint; // endpoint, should be zero for non-endpoint nodes - unsigned int ov_cnt; // capture vector array size for pcre // the pointer of R3Route data void * data; diff --git a/r3.pc.in b/r3.pc.in index 6f2ffb9..d82ddf8 100644 --- a/r3.pc.in +++ b/r3.pc.in @@ -6,6 +6,6 @@ libdir=@libdir@ Name: r3 Description: High-performance URL router library Version: @PACKAGE_VERSION@ -Requires: libpcre +Requires: libpcre2-8 Libs: -L${libdir} -lr3 CFlags: -I${includedir} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a307b3a..9f49f03 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,7 +19,7 @@ target_include_directories(r3 target_link_libraries(r3 PUBLIC - ${PCRE_LIBRARIES}) + ${PCRE2_LIBRARIES}) install( TARGETS r3 diff --git a/src/edge.c b/src/edge.c index 1e647b5..bbfd183 100644 --- a/src/edge.c +++ b/src/edge.c @@ -13,8 +13,6 @@ // Jemalloc memory management // #include -// PCRE -#include #include "r3.h" #include "r3_slug.h" #include "slug.h" diff --git a/src/match_entry.c b/src/match_entry.c index f0e4f8d..c907a1b 100644 --- a/src/match_entry.c +++ b/src/match_entry.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "r3.h" diff --git a/src/node.c b/src/node.c index c5a3bfd..086f21b 100644 --- a/src/node.c +++ b/src/node.c @@ -7,9 +7,6 @@ #include #include -// PCRE -#include - #include "r3.h" #include "r3_slug.h" #include "slug.h" @@ -75,13 +72,11 @@ void r3_tree_free(R3Node * tree) { } free(tree->routes.entries); if (tree->pcre_pattern) { - pcre_free(tree->pcre_pattern); + pcre2_code_free(tree->pcre_pattern); } -#ifdef PCRE_STUDY_JIT_COMPILE - if 
(tree->pcre_extra) { - pcre_free_study(tree->pcre_extra); + if (tree->match_data) { + pcre2_match_data_free(tree->match_data); } -#endif free(tree->combined_pattern); free(tree); tree = NULL; @@ -223,41 +218,44 @@ int r3_tree_compile_patterns(R3Node * n, char **errstr) { free(n->combined_pattern); n->combined_pattern = cpat; - const char *pcre_error = NULL; - int pcre_erroffset = 0; + int pcre_errorcode = 0; + PCRE2_SIZE pcre_erroffset = 0; unsigned int option_bits = 0; - n->ov_cnt = (1 + n->edges.size) * 3; - if (n->pcre_pattern) { - pcre_free(n->pcre_pattern); + pcre2_code_free(n->pcre_pattern); } - n->pcre_pattern = pcre_compile( - n->combined_pattern, /* the pattern */ + n->pcre_pattern = pcre2_compile( + (PCRE2_SPTR)n->combined_pattern, /* the pattern, 8-bit code units */ + PCRE2_ZERO_TERMINATED, option_bits, /* default options */ - &pcre_error, /* for error message */ + &pcre_errorcode, /* for error code */ &pcre_erroffset, /* for error offset */ - NULL); /* use default character tables */ + NULL); /* compile context */ if (n->pcre_pattern == NULL) { if (errstr) { - int r = asprintf(errstr, "PCRE compilation failed at offset %d: %s, pattern: %s", pcre_erroffset, pcre_error, n->combined_pattern); - if (r) {}; + PCRE2_UCHAR buf[128]; + pcre2_get_error_message(pcre_errorcode, buf, sizeof(buf)); + int r = asprintf(errstr, "PCRE compilation failed at offset %ld: %s, pattern: %s", pcre_erroffset, buf, n->combined_pattern); + if (r < 0) { + *errstr = NULL; /* the content of errstr is undefined when asprintf() fails */ + } } return -1; } -#ifdef PCRE_STUDY_JIT_COMPILE - if (n->pcre_extra) { - pcre_free_study(n->pcre_extra); + if (n->match_data) { + pcre2_match_data_free(n->match_data); } - n->pcre_extra = pcre_study(n->pcre_pattern, 0, &pcre_error); - if (!n->pcre_extra && pcre_error) { + n->match_data = pcre2_match_data_create_from_pattern(n->pcre_pattern, NULL); + if (n->match_data == NULL) { if (errstr) { - int r = asprintf(errstr, "PCRE study failed at offset %s, 
pattern: %s", pcre_error, n->combined_pattern); - if (r) {}; + int r = asprintf(errstr, "Failed to allocate match data block"); + if (r < 0) { + *errstr = NULL; /* the content of errstr is undefined when asprintf() fails */ + } } return -1; } -#endif return 0; } @@ -339,20 +337,18 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path, info("COMPARE PCRE_PATTERN\n"); const char *substring_start = 0; int substring_length = 0; - int ov[ n->ov_cnt ]; int rc; info("pcre matching %s on [%s]\n", n->combined_pattern, path); - rc = pcre_exec( + rc = pcre2_match( n->pcre_pattern, /* the compiled pattern */ - n->pcre_extra, - path, /* the subject string */ + (PCRE2_SPTR)path,/* the subject string, 8-bit code units */ path_len, /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ - ov, /* output vector for substring information */ - n->ov_cnt); /* number of elements in the output vector */ + n->match_data,/* match data results */ + NULL); /* match context */ // does not match all edges, return NULL; if (rc < 0) { @@ -360,7 +356,7 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path, printf("pcre rc: %d\n", rc ); switch(rc) { - case PCRE_ERROR_NOMATCH: + case PCRE2_ERROR_NOMATCH: printf("pcre: no match '%s' on pattern '%s'\n", path, n->combined_pattern); break; @@ -373,23 +369,22 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path, return NULL; } - + PCRE2_SIZE *ov = pcre2_get_ovector_pointer(n->match_data); restlen = path_len - ov[1]; // if it's fully matched to the end (rest string length) - int *inv = ov + 2; + if (!restlen) { // Check the substring to decide we should go deeper on which edge for (i = 1; i < rc; i++) { - substring_length = *(inv+1) - *inv; + substring_length = ov[2*i+1] - ov[2*i]; // if it's not matched for this edge, just skip them quickly if (!is_end && !substring_length) { - inv += 2; continue; } - substring_start = path + *inv; + 
substring_start = path + ov[2*i]; e = n->edges.entries + i - 1; if (entry && e->has_slug) { @@ -404,18 +399,16 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path, // Check the substring to decide we should go deeper on which edge - inv = ov + 2; for (i = 1; i < rc; i++) { - substring_length = *(inv+1) - *inv; + substring_length = ov[2*i+1] - ov[2*i]; // if it's not matched for this edge, just skip them quickly if (!is_end && !substring_length) { - inv += 2; continue; } - substring_start = path + *inv; + substring_start = path + ov[2*i]; e = n->edges.entries + i - 1; if (entry && e->has_slug) { @@ -520,7 +513,6 @@ inline R3Edge * r3_node_find_edge_str(const R3Node * n, const char * str, int st // n->endpoint = 0; // n->combined_pattern = NULL; // n->pcre_pattern = NULL; -// n->pcre_extra = NULL; // n->data = NULL; // return n; // }