Use PCRE2 instead of PCRE (#153)

PCRE is now at end of life and is no longer actively maintained.
Lift the dependency to the next major version, i.e. PCRE2.

Implementation notes:
- Removed the pcre study option since:
  "The new API ... was simplified by abolishing the separate "study" optimizing
  function; in PCRE2, patterns are automatically optimized where possible."
- If asprintf() fails, the content of the 'strp' variable is undefined.
  Let's check the return value and set the error string to NULL upon error.
- Pattern and subject can straightforwardly be cast to PCRE2_SPTR since we
  only work with 8-bit code units.
This commit is contained in:
Björn Svensson 2023-10-23 12:38:24 +02:00 committed by GitHub
parent 9168f7e4d4
commit c105117b40
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 88 additions and 99 deletions

View file

@ -10,7 +10,7 @@ apt-get install -qq \
cmake \ cmake \
graphviz-dev \ graphviz-dev \
libjemalloc-dev \ libjemalloc-dev \
libpcre3-dev \ libpcre2-dev \
libtool \ libtool \
ninja-build \ ninja-build \
pkg-config pkg-config

View file

@ -5,7 +5,7 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD 99)
find_package(Check) find_package(Check)
find_package(PCRE REQUIRED) find_package(PCRE2 REQUIRED)
include(CheckSymbolExists) include(CheckSymbolExists)
include(CheckIncludeFile) include(CheckIncludeFile)

View file

@ -25,7 +25,7 @@ Requirement
### Runtime Requirement ### Runtime Requirement
* pcre * pcre2
* (optional) graphviz version 2.38.0 (20140413.2041) * (optional) graphviz version 2.38.0 (20140413.2041)
* (optional) libjson-c-dev * (optional) libjson-c-dev
@ -187,13 +187,13 @@ Optimization
Simple regular expressions are optimized through a regexp pattern to opcode Simple regular expressions are optimized through a regexp pattern to opcode
translator, which translates simple patterns into small & fast scanners. translator, which translates simple patterns into small & fast scanners.
By using this method, r3 reduces the matching overhead of pcre library. By using this method, r3 reduces the matching overhead of pcre2 library.
Optimized patterns are: `[a-z]+`, `[0-9]+`, `\d+`, `\w+`, `[^/]+`, `[^-]+` or `.*`. Optimized patterns are: `[a-z]+`, `[0-9]+`, `\d+`, `\w+`, `[^/]+`, `[^-]+` or `.*`.
Slugs without specified regular expression will be compiled into the `[^/]+` pattern. therefore, it's optimized too. Slugs without specified regular expression will be compiled into the `[^/]+` pattern. therefore, it's optimized too.
Complex regular expressions will still use libpcre to match URL (partially). Complex regular expressions will still use libpcre2 to match URL (partially).
Performance Performance
@ -356,7 +356,7 @@ if ( $error ) {
Install Install
---------------------- ----------------------
sudo apt-get install check libpcre3 libpcre3-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config sudo apt-get install check libpcre2-8-0 libpcre2-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config
sudo apt-get install graphviz-dev graphviz # if you want graphviz sudo apt-get install graphviz-dev graphviz # if you want graphviz
./autogen.sh ./autogen.sh
./configure && make ./configure && make
@ -364,7 +364,7 @@ Install
And we support debian-based distro now! And we support debian-based distro now!
sudo apt-get install build-essential autoconf automake libpcre3-dev pkg-config debhelper libtool check sudo apt-get install build-essential autoconf automake libpcre2-dev pkg-config debhelper libtool check
mv dist-debian debian mv dist-debian debian
dpkg-buildpackage -b -us -uc dpkg-buildpackage -b -us -uc
sudo gdebi ../libr3*.deb sudo gdebi ../libr3*.deb

View file

@ -1,37 +0,0 @@
# Copyright (C) 2007-2009 LuaDist.
# Created by Peter Kapec <kapecp@gmail.com>
# Redistribution and use of this file is allowed according to the terms of the MIT license.
# For details see the COPYRIGHT file distributed with LuaDist.
# Note:
# Searching headers and libraries is very simple and is NOT as powerful as scripts
# distributed with CMake, because LuaDist defines directories to search for.
# Everyone is encouraged to contact the author with improvements. Maybe this file
# becomes part of CMake distribution sometimes.
# - Find pcre
# Find the native PCRE headers and libraries.
#
# Result variables:
#  PCRE_INCLUDE_DIRS - where to find pcre.h, etc.
#  PCRE_LIBRARIES    - List of libraries when using pcre.
#  PCRE_FOUND        - True if pcre found.
#
# Cache variables (may be preset by the user to point at a specific install):
#  PCRE_INCLUDE_DIR  - directory containing pcre.h
#  PCRE_LIBRARY      - path to the pcre library

# Look for the header file.
find_path(PCRE_INCLUDE_DIR NAMES pcre.h)

# Look for the library.
find_library(PCRE_LIBRARY NAMES pcre)

# Handle the QUIETLY and REQUIRED arguments and set PCRE_FOUND to TRUE if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PCRE DEFAULT_MSG PCRE_LIBRARY PCRE_INCLUDE_DIR)

# Copy the results to the output variables.
if(PCRE_FOUND)
  set(PCRE_LIBRARIES "${PCRE_LIBRARY}")
  set(PCRE_INCLUDE_DIRS "${PCRE_INCLUDE_DIR}")
else()
  # Leave the output variables unset when the package was not found.
  set(PCRE_LIBRARIES)
  set(PCRE_INCLUDE_DIRS)
endif()

# mark_as_advanced() operates on cache entries, so it has to name the
# variables created by find_path()/find_library() above, not the plain
# PCRE_INCLUDE_DIRS/PCRE_LIBRARIES output variables.
mark_as_advanced(PCRE_INCLUDE_DIR PCRE_LIBRARY)

View file

@ -0,0 +1,37 @@
# Copyright (C) 2007-2009 LuaDist.
# Created by Peter Kapec <kapecp@gmail.com>
# Redistribution and use of this file is allowed according to the terms of the MIT license.
# For details see the COPYRIGHT file distributed with LuaDist.
# Note:
# Searching headers and libraries is very simple and is NOT as powerful as scripts
# distributed with CMake, because LuaDist defines directories to search for.
# Everyone is encouraged to contact the author with improvements. Maybe this file
# becomes part of CMake distribution sometimes.
# - Find pcre2
# Find the native PCRE2 headers and the 8-bit code unit library (pcre2-8).
#
# Result variables:
#  PCRE2_INCLUDE_DIRS - where to find pcre2.h, etc.
#  PCRE2_LIBRARIES    - List of libraries when using pcre2.
#  PCRE2_FOUND        - True if pcre2 found.
#
# Cache variables (may be preset by the user to point at a specific install):
#  PCRE2_INCLUDE_DIR  - directory containing pcre2.h
#  PCRE2_LIBRARY      - path to the pcre2-8 library

# Look for the header file.
find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h)

# Look for the library.
find_library(PCRE2_LIBRARY NAMES pcre2-8)

# Handle the QUIETLY and REQUIRED arguments and set PCRE2_FOUND to TRUE if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PCRE2 DEFAULT_MSG PCRE2_LIBRARY PCRE2_INCLUDE_DIR)

# Copy the results to the output variables.
if(PCRE2_FOUND)
  set(PCRE2_LIBRARIES "${PCRE2_LIBRARY}")
  set(PCRE2_INCLUDE_DIRS "${PCRE2_INCLUDE_DIR}")
else()
  # Leave the output variables unset when the package was not found.
  set(PCRE2_LIBRARIES)
  set(PCRE2_INCLUDE_DIRS)
endif()

# mark_as_advanced() operates on cache entries, so it has to name the
# variables created by find_path()/find_library() above, not the plain
# PCRE2_INCLUDE_DIRS/PCRE2_LIBRARIES output variables.
mark_as_advanced(PCRE2_INCLUDE_DIR PCRE2_LIBRARY)

View file

@ -73,7 +73,7 @@ AM_CONDITIONAL(USE_JEMALLOC, test "x$have_jemalloc" = "xyes")
# AC_DEFINE(USE_JEMALLOC, test "x$found_jemalloc" = "xyes" , "use jemalloc") # AC_DEFINE(USE_JEMALLOC, test "x$found_jemalloc" = "xyes" , "use jemalloc")
PKG_CHECK_MODULES(DEPS, [libpcre]) PKG_CHECK_MODULES(DEPS, [libpcre2-8])
AC_SUBST(DEPS_CFLAGS) AC_SUBST(DEPS_CFLAGS)
AC_SUBST(DEPS_LIBS) AC_SUBST(DEPS_LIBS)

View file

@ -2,7 +2,7 @@ Source: libr3
Priority: optional Priority: optional
Maintainer: Ronmi Ren <ronmi.ren@gmail.com> Maintainer: Ronmi Ren <ronmi.ren@gmail.com>
Build-Depends: debhelper (>= 8.0.0), automake, autotools-dev, autoconf, Build-Depends: debhelper (>= 8.0.0), automake, autotools-dev, autoconf,
libtool, libpcre3-dev, pkg-config, check libtool, libpcre2-dev, pkg-config, check
Standards-Version: 3.9.4 Standards-Version: 3.9.4
Section: libs Section: libs
Homepage: https://github.com/c9s/r3 Homepage: https://github.com/c9s/r3

View file

@ -10,7 +10,8 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <pcre.h> #define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#if __STDC_VERSION__ <= 201710L #if __STDC_VERSION__ <= 201710L
#ifdef HAVE_STDBOOL_H #ifdef HAVE_STDBOOL_H
@ -43,13 +44,12 @@ struct _node {
R3_VECTOR(R3Edge) edges; R3_VECTOR(R3Edge) edges;
R3_VECTOR(R3Route) routes; R3_VECTOR(R3Route) routes;
char * combined_pattern; char * combined_pattern;
pcre * pcre_pattern; pcre2_code * pcre_pattern;
pcre_extra * pcre_extra; pcre2_match_data * match_data;
// edges are mostly less than 255 // edges are mostly less than 255
unsigned int compare_type; // compare_type: pcre, opcode, string unsigned int compare_type; // compare_type: pcre, opcode, string
unsigned int endpoint; // endpoint, should be zero for non-endpoint nodes unsigned int endpoint; // endpoint, should be zero for non-endpoint nodes
unsigned int ov_cnt; // capture vector array size for pcre
// the pointer of R3Route data // the pointer of R3Route data
void * data; void * data;

View file

@ -6,6 +6,6 @@ libdir=@libdir@
Name: r3 Name: r3
Description: High-performance URL router library Description: High-performance URL router library
Version: @PACKAGE_VERSION@ Version: @PACKAGE_VERSION@
Requires: libpcre Requires: libpcre2-8
Libs: -L${libdir} -lr3 Libs: -L${libdir} -lr3
CFlags: -I${includedir} CFlags: -I${includedir}

View file

@ -19,7 +19,7 @@ target_include_directories(r3
target_link_libraries(r3 target_link_libraries(r3
PUBLIC PUBLIC
${PCRE_LIBRARIES}) ${PCRE2_LIBRARIES})
install( install(
TARGETS r3 TARGETS r3

View file

@ -13,8 +13,6 @@
// Jemalloc memory management // Jemalloc memory management
// #include <jemalloc/jemalloc.h> // #include <jemalloc/jemalloc.h>
// PCRE
#include <pcre.h>
#include "r3.h" #include "r3.h"
#include "r3_slug.h" #include "r3_slug.h"
#include "slug.h" #include "slug.h"

View file

@ -8,7 +8,6 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <pcre.h>
#include <assert.h> #include <assert.h>
#include "r3.h" #include "r3.h"

View file

@ -7,9 +7,6 @@
#include <netinet/in.h> #include <netinet/in.h>
#include <arpa/inet.h> #include <arpa/inet.h>
// PCRE
#include <pcre.h>
#include "r3.h" #include "r3.h"
#include "r3_slug.h" #include "r3_slug.h"
#include "slug.h" #include "slug.h"
@ -75,13 +72,11 @@ void r3_tree_free(R3Node * tree) {
} }
free(tree->routes.entries); free(tree->routes.entries);
if (tree->pcre_pattern) { if (tree->pcre_pattern) {
pcre_free(tree->pcre_pattern); pcre2_code_free(tree->pcre_pattern);
} }
#ifdef PCRE_STUDY_JIT_COMPILE if (tree->match_data) {
if (tree->pcre_extra) { pcre2_match_data_free(tree->match_data);
pcre_free_study(tree->pcre_extra);
} }
#endif
free(tree->combined_pattern); free(tree->combined_pattern);
free(tree); free(tree);
tree = NULL; tree = NULL;
@ -223,41 +218,44 @@ int r3_tree_compile_patterns(R3Node * n, char **errstr) {
free(n->combined_pattern); free(n->combined_pattern);
n->combined_pattern = cpat; n->combined_pattern = cpat;
const char *pcre_error = NULL; int pcre_errorcode = 0;
int pcre_erroffset = 0; PCRE2_SIZE pcre_erroffset = 0;
unsigned int option_bits = 0; unsigned int option_bits = 0;
n->ov_cnt = (1 + n->edges.size) * 3;
if (n->pcre_pattern) { if (n->pcre_pattern) {
pcre_free(n->pcre_pattern); pcre2_code_free(n->pcre_pattern);
} }
n->pcre_pattern = pcre_compile( n->pcre_pattern = pcre2_compile(
n->combined_pattern, /* the pattern */ (PCRE2_SPTR)n->combined_pattern, /* the pattern, 8-bit code units */
PCRE2_ZERO_TERMINATED,
option_bits, /* default options */ option_bits, /* default options */
&pcre_error, /* for error message */ &pcre_errorcode, /* for error code */
&pcre_erroffset, /* for error offset */ &pcre_erroffset, /* for error offset */
NULL); /* use default character tables */ NULL); /* compile context */
if (n->pcre_pattern == NULL) { if (n->pcre_pattern == NULL) {
if (errstr) { if (errstr) {
int r = asprintf(errstr, "PCRE compilation failed at offset %d: %s, pattern: %s", pcre_erroffset, pcre_error, n->combined_pattern); PCRE2_UCHAR buf[128];
if (r) {}; pcre2_get_error_message(pcre_errorcode, buf, sizeof(buf));
int r = asprintf(errstr, "PCRE compilation failed at offset %ld: %s, pattern: %s", pcre_erroffset, buf, n->combined_pattern);
if (r < 0) {
*errstr = NULL; /* the content of errstr is undefined when asprintf() fails */
}
} }
return -1; return -1;
} }
#ifdef PCRE_STUDY_JIT_COMPILE if (n->match_data) {
if (n->pcre_extra) { pcre2_match_data_free(n->match_data);
pcre_free_study(n->pcre_extra);
} }
n->pcre_extra = pcre_study(n->pcre_pattern, 0, &pcre_error); n->match_data = pcre2_match_data_create_from_pattern(n->pcre_pattern, NULL);
if (!n->pcre_extra && pcre_error) { if (n->match_data == NULL) {
if (errstr) { if (errstr) {
int r = asprintf(errstr, "PCRE study failed at offset %s, pattern: %s", pcre_error, n->combined_pattern); int r = asprintf(errstr, "Failed to allocate match data block");
if (r) {}; if (r < 0) {
*errstr = NULL; /* the content of errstr is undefined when asprintf() fails */
}
} }
return -1; return -1;
} }
#endif
return 0; return 0;
} }
@ -339,20 +337,18 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
info("COMPARE PCRE_PATTERN\n"); info("COMPARE PCRE_PATTERN\n");
const char *substring_start = 0; const char *substring_start = 0;
int substring_length = 0; int substring_length = 0;
int ov[ n->ov_cnt ];
int rc; int rc;
info("pcre matching %s on [%s]\n", n->combined_pattern, path); info("pcre matching %s on [%s]\n", n->combined_pattern, path);
rc = pcre_exec( rc = pcre2_match(
n->pcre_pattern, /* the compiled pattern */ n->pcre_pattern, /* the compiled pattern */
n->pcre_extra, (PCRE2_SPTR)path,/* the subject string, 8-bit code units */
path, /* the subject string */
path_len, /* the length of the subject */ path_len, /* the length of the subject */
0, /* start at offset 0 in the subject */ 0, /* start at offset 0 in the subject */
0, /* default options */ 0, /* default options */
ov, /* output vector for substring information */ n->match_data,/* match data results */
n->ov_cnt); /* number of elements in the output vector */ NULL); /* match context */
// does not match all edges, return NULL; // does not match all edges, return NULL;
if (rc < 0) { if (rc < 0) {
@ -360,7 +356,7 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
printf("pcre rc: %d\n", rc ); printf("pcre rc: %d\n", rc );
switch(rc) switch(rc)
{ {
case PCRE_ERROR_NOMATCH: case PCRE2_ERROR_NOMATCH:
printf("pcre: no match '%s' on pattern '%s'\n", path, n->combined_pattern); printf("pcre: no match '%s' on pattern '%s'\n", path, n->combined_pattern);
break; break;
@ -373,23 +369,22 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
return NULL; return NULL;
} }
PCRE2_SIZE *ov = pcre2_get_ovector_pointer(n->match_data);
restlen = path_len - ov[1]; // if it's fully matched to the end (rest string length) restlen = path_len - ov[1]; // if it's fully matched to the end (rest string length)
int *inv = ov + 2;
if (!restlen) { if (!restlen) {
// Check the substring to decide we should go deeper on which edge // Check the substring to decide we should go deeper on which edge
for (i = 1; i < rc; i++) for (i = 1; i < rc; i++)
{ {
substring_length = *(inv+1) - *inv; substring_length = ov[2*i+1] - ov[2*i];
// if it's not matched for this edge, just skip them quickly // if it's not matched for this edge, just skip them quickly
if (!is_end && !substring_length) { if (!is_end && !substring_length) {
inv += 2;
continue; continue;
} }
substring_start = path + *inv; substring_start = path + ov[2*i];
e = n->edges.entries + i - 1; e = n->edges.entries + i - 1;
if (entry && e->has_slug) { if (entry && e->has_slug) {
@ -404,18 +399,16 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
// Check the substring to decide we should go deeper on which edge // Check the substring to decide we should go deeper on which edge
inv = ov + 2;
for (i = 1; i < rc; i++) for (i = 1; i < rc; i++)
{ {
substring_length = *(inv+1) - *inv; substring_length = ov[2*i+1] - ov[2*i];
// if it's not matched for this edge, just skip them quickly // if it's not matched for this edge, just skip them quickly
if (!is_end && !substring_length) { if (!is_end && !substring_length) {
inv += 2;
continue; continue;
} }
substring_start = path + *inv; substring_start = path + ov[2*i];
e = n->edges.entries + i - 1; e = n->edges.entries + i - 1;
if (entry && e->has_slug) { if (entry && e->has_slug) {
@ -520,7 +513,6 @@ inline R3Edge * r3_node_find_edge_str(const R3Node * n, const char * str, int st
// n->endpoint = 0; // n->endpoint = 0;
// n->combined_pattern = NULL; // n->combined_pattern = NULL;
// n->pcre_pattern = NULL; // n->pcre_pattern = NULL;
// n->pcre_extra = NULL;
// n->data = NULL; // n->data = NULL;
// return n; // return n;
// } // }