Use PCRE2 instead of PCRE (#153)

PCRE is now at end of life and is no longer actively maintained.
Lift the dependency to the next major version, i.e. PCRE2.

Implementation notes:
- Removed the pcre study option since:
  "The new API ... was simplified by abolishing the separate "study" optimizing
  function; in PCRE2, patterns are automatically optimized where possible."
- If asprintf() fails, the content of the 'strp' variable is undefined.
  Let's check the return value and return NULL upon error.
- Pattern and subject can straightforwardly be cast to PCRE2_SPTR since we
  only work with 8-bit code units.
This commit is contained in:
Björn Svensson 2023-10-23 12:38:24 +02:00 committed by GitHub
parent 9168f7e4d4
commit c105117b40
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 88 additions and 99 deletions

View file

@ -10,7 +10,7 @@ apt-get install -qq \
cmake \
graphviz-dev \
libjemalloc-dev \
libpcre3-dev \
libpcre2-dev \
libtool \
ninja-build \
pkg-config

View file

@ -5,7 +5,7 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
set(CMAKE_C_STANDARD 99)
find_package(Check)
find_package(PCRE REQUIRED)
find_package(PCRE2 REQUIRED)
include(CheckSymbolExists)
include(CheckIncludeFile)

View file

@ -25,7 +25,7 @@ Requirement
### Runtime Requirement
* pcre
* pcre2
* (optional) graphviz version 2.38.0 (20140413.2041)
* (optional) libjson-c-dev
@ -187,13 +187,13 @@ Optimization
Simple regular expressions are optimized through a regexp pattern to opcode
translator, which translates simple patterns into small & fast scanners.
By using this method, r3 reduces the matching overhead of pcre library.
By using this method, r3 reduces the matching overhead of pcre2 library.
Optimized patterns are: `[a-z]+`, `[0-9]+`, `\d+`, `\w+`, `[^/]+`, `[^-]+` or `.*`.
Slugs without a specified regular expression will be compiled into the `[^/]+` pattern; therefore, they are optimized too.
Complex regular expressions will still use libpcre to match URL (partially).
Complex regular expressions will still use libpcre2 to match URL (partially).
Performance
@ -356,7 +356,7 @@ if ( $error ) {
Install
----------------------
sudo apt-get install check libpcre3 libpcre3-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config
sudo apt-get install check libpcre2-8-0 libpcre2-dev libjemalloc-dev libjemalloc1 build-essential libtool automake autoconf pkg-config
sudo apt-get install graphviz-dev graphviz # if you want graphviz
./autogen.sh
./configure && make
@ -364,7 +364,7 @@ Install
And we support debian-based distro now!
sudo apt-get install build-essential autoconf automake libpcre3-dev pkg-config debhelper libtool check
sudo apt-get install build-essential autoconf automake libpcre2-dev pkg-config debhelper libtool check
mv dist-debian debian
dpkg-buildpackage -b -us -uc
sudo gdebi ../libr3*.deb

View file

@ -1,37 +0,0 @@
# Copyright (C) 2007-2009 LuaDist.
# Created by Peter Kapec <kapecp@gmail.com>
# Redistribution and use of this file is allowed according to the terms of the MIT license.
# For details see the COPYRIGHT file distributed with LuaDist.
# Note:
# Searching headers and libraries is very simple and is NOT as powerful as scripts
# distributed with CMake, because LuaDist defines directories to search for.
# Everyone is encouraged to contact the author with improvements. Maybe this file
# becomes part of CMake distribution sometimes.
#
# - Find pcre
# Find the native PCRE headers and libraries.
#
# Result variables:
#  PCRE_INCLUDE_DIRS - where to find pcre.h, etc.
#  PCRE_LIBRARIES    - List of libraries when using pcre.
#  PCRE_FOUND        - True if pcre found.
#
# Cache variables (can be preset to override the search):
#  PCRE_INCLUDE_DIR  - directory containing pcre.h
#  PCRE_LIBRARY      - path to the pcre library

# Look for the header file.
find_path(PCRE_INCLUDE_DIR NAMES pcre.h)

# Look for the library.
find_library(PCRE_LIBRARY NAMES pcre)

# Handle the QUIETLY and REQUIRED arguments and set PCRE_FOUND to TRUE if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PCRE DEFAULT_MSG PCRE_LIBRARY PCRE_INCLUDE_DIR)

# Copy the results to the output variables; clear them when the search failed.
if(PCRE_FOUND)
  set(PCRE_LIBRARIES ${PCRE_LIBRARY})
  set(PCRE_INCLUDE_DIRS ${PCRE_INCLUDE_DIR})
else()
  set(PCRE_LIBRARIES)
  set(PCRE_INCLUDE_DIRS)
endif()

# Hide the cache entries created by find_path()/find_library() from the
# default cmake-gui/ccmake view. The *_DIRS/*_LIBRARIES result variables are
# plain variables, not cache entries, so they are not the ones to mark.
mark_as_advanced(PCRE_INCLUDE_DIR PCRE_LIBRARY)

View file

@ -0,0 +1,37 @@
# Copyright (C) 2007-2009 LuaDist.
# Created by Peter Kapec <kapecp@gmail.com>
# Redistribution and use of this file is allowed according to the terms of the MIT license.
# For details see the COPYRIGHT file distributed with LuaDist.
# Note:
# Searching headers and libraries is very simple and is NOT as powerful as scripts
# distributed with CMake, because LuaDist defines directories to search for.
# Everyone is encouraged to contact the author with improvements. Maybe this file
# becomes part of CMake distribution sometimes.
#
# - Find pcre2
# Find the native PCRE2 headers and the pcre2-8 library.
#
# Result variables:
#  PCRE2_INCLUDE_DIRS - where to find pcre2.h, etc.
#  PCRE2_LIBRARIES    - List of libraries when using pcre2.
#  PCRE2_FOUND        - True if pcre2 found.
#
# Cache variables (can be preset to override the search):
#  PCRE2_INCLUDE_DIR  - directory containing pcre2.h
#  PCRE2_LIBRARY      - path to the pcre2-8 library

# Look for the header file.
find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h)

# Look for the library.
find_library(PCRE2_LIBRARY NAMES pcre2-8)

# Handle the QUIETLY and REQUIRED arguments and set PCRE2_FOUND to TRUE if all listed variables are TRUE.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PCRE2 DEFAULT_MSG PCRE2_LIBRARY PCRE2_INCLUDE_DIR)

# Copy the results to the output variables; clear them when the search failed.
if(PCRE2_FOUND)
  set(PCRE2_LIBRARIES ${PCRE2_LIBRARY})
  set(PCRE2_INCLUDE_DIRS ${PCRE2_INCLUDE_DIR})
else()
  set(PCRE2_LIBRARIES)
  set(PCRE2_INCLUDE_DIRS)
endif()

# Hide the cache entries created by find_path()/find_library() from the
# default cmake-gui/ccmake view. The *_DIRS/*_LIBRARIES result variables are
# plain variables, not cache entries, so they are not the ones to mark.
mark_as_advanced(PCRE2_INCLUDE_DIR PCRE2_LIBRARY)

View file

@ -73,7 +73,7 @@ AM_CONDITIONAL(USE_JEMALLOC, test "x$have_jemalloc" = "xyes")
# AC_DEFINE(USE_JEMALLOC, test "x$found_jemalloc" = "xyes" , "use jemalloc")
PKG_CHECK_MODULES(DEPS, [libpcre])
PKG_CHECK_MODULES(DEPS, [libpcre2-8])
AC_SUBST(DEPS_CFLAGS)
AC_SUBST(DEPS_LIBS)

View file

@ -2,7 +2,7 @@ Source: libr3
Priority: optional
Maintainer: Ronmi Ren <ronmi.ren@gmail.com>
Build-Depends: debhelper (>= 8.0.0), automake, autotools-dev, autoconf,
libtool, libpcre3-dev, pkg-config, check
libtool, libpcre2-dev, pkg-config, check
Standards-Version: 3.9.4
Section: libs
Homepage: https://github.com/c9s/r3

View file

@ -10,7 +10,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#if __STDC_VERSION__ <= 201710L
#ifdef HAVE_STDBOOL_H
@ -43,13 +44,12 @@ struct _node {
R3_VECTOR(R3Edge) edges;
R3_VECTOR(R3Route) routes;
char * combined_pattern;
pcre * pcre_pattern;
pcre_extra * pcre_extra;
pcre2_code * pcre_pattern;
pcre2_match_data * match_data;
// edges are mostly less than 255
unsigned int compare_type; // compare_type: pcre, opcode, string
unsigned int endpoint; // endpoint, should be zero for non-endpoint nodes
unsigned int ov_cnt; // capture vector array size for pcre
// the pointer of R3Route data
void * data;

View file

@ -6,6 +6,6 @@ libdir=@libdir@
Name: r3
Description: High-performance URL router library
Version: @PACKAGE_VERSION@
Requires: libpcre
Requires: libpcre2-8
Libs: -L${libdir} -lr3
CFlags: -I${includedir}

View file

@ -19,7 +19,7 @@ target_include_directories(r3
target_link_libraries(r3
PUBLIC
${PCRE_LIBRARIES})
${PCRE2_LIBRARIES})
install(
TARGETS r3

View file

@ -13,8 +13,6 @@
// Jemalloc memory management
// #include <jemalloc/jemalloc.h>
// PCRE
#include <pcre.h>
#include "r3.h"
#include "r3_slug.h"
#include "slug.h"

View file

@ -8,7 +8,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>
#include <assert.h>
#include "r3.h"

View file

@ -7,9 +7,6 @@
#include <netinet/in.h>
#include <arpa/inet.h>
// PCRE
#include <pcre.h>
#include "r3.h"
#include "r3_slug.h"
#include "slug.h"
@ -75,13 +72,11 @@ void r3_tree_free(R3Node * tree) {
}
free(tree->routes.entries);
if (tree->pcre_pattern) {
pcre_free(tree->pcre_pattern);
pcre2_code_free(tree->pcre_pattern);
}
#ifdef PCRE_STUDY_JIT_COMPILE
if (tree->pcre_extra) {
pcre_free_study(tree->pcre_extra);
if (tree->match_data) {
pcre2_match_data_free(tree->match_data);
}
#endif
free(tree->combined_pattern);
free(tree);
tree = NULL;
@ -223,41 +218,44 @@ int r3_tree_compile_patterns(R3Node * n, char **errstr) {
free(n->combined_pattern);
n->combined_pattern = cpat;
const char *pcre_error = NULL;
int pcre_erroffset = 0;
int pcre_errorcode = 0;
PCRE2_SIZE pcre_erroffset = 0;
unsigned int option_bits = 0;
n->ov_cnt = (1 + n->edges.size) * 3;
if (n->pcre_pattern) {
pcre_free(n->pcre_pattern);
pcre2_code_free(n->pcre_pattern);
}
n->pcre_pattern = pcre_compile(
n->combined_pattern, /* the pattern */
n->pcre_pattern = pcre2_compile(
(PCRE2_SPTR)n->combined_pattern, /* the pattern, 8-bit code units */
PCRE2_ZERO_TERMINATED,
option_bits, /* default options */
&pcre_error, /* for error message */
&pcre_errorcode, /* for error code */
&pcre_erroffset, /* for error offset */
NULL); /* use default character tables */
NULL); /* compile context */
if (n->pcre_pattern == NULL) {
if (errstr) {
int r = asprintf(errstr, "PCRE compilation failed at offset %d: %s, pattern: %s", pcre_erroffset, pcre_error, n->combined_pattern);
if (r) {};
PCRE2_UCHAR buf[128];
pcre2_get_error_message(pcre_errorcode, buf, sizeof(buf));
int r = asprintf(errstr, "PCRE compilation failed at offset %ld: %s, pattern: %s", pcre_erroffset, buf, n->combined_pattern);
if (r < 0) {
*errstr = NULL; /* the content of errstr is undefined when asprintf() fails */
}
}
return -1;
}
#ifdef PCRE_STUDY_JIT_COMPILE
if (n->pcre_extra) {
pcre_free_study(n->pcre_extra);
if (n->match_data) {
pcre2_match_data_free(n->match_data);
}
n->pcre_extra = pcre_study(n->pcre_pattern, 0, &pcre_error);
if (!n->pcre_extra && pcre_error) {
n->match_data = pcre2_match_data_create_from_pattern(n->pcre_pattern, NULL);
if (n->match_data == NULL) {
if (errstr) {
int r = asprintf(errstr, "PCRE study failed at offset %s, pattern: %s", pcre_error, n->combined_pattern);
if (r) {};
int r = asprintf(errstr, "Failed to allocate match data block");
if (r < 0) {
*errstr = NULL; /* the content of errstr is undefined when asprintf() fails */
}
}
return -1;
}
#endif
return 0;
}
@ -339,20 +337,18 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
info("COMPARE PCRE_PATTERN\n");
const char *substring_start = 0;
int substring_length = 0;
int ov[ n->ov_cnt ];
int rc;
info("pcre matching %s on [%s]\n", n->combined_pattern, path);
rc = pcre_exec(
rc = pcre2_match(
n->pcre_pattern, /* the compiled pattern */
n->pcre_extra,
path, /* the subject string */
(PCRE2_SPTR)path,/* the subject string, 8-bit code units */
path_len, /* the length of the subject */
0, /* start at offset 0 in the subject */
0, /* default options */
ov, /* output vector for substring information */
n->ov_cnt); /* number of elements in the output vector */
n->match_data,/* match data results */
NULL); /* match context */
// does not match all edges, return NULL;
if (rc < 0) {
@ -360,7 +356,7 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
printf("pcre rc: %d\n", rc );
switch(rc)
{
case PCRE_ERROR_NOMATCH:
case PCRE2_ERROR_NOMATCH:
printf("pcre: no match '%s' on pattern '%s'\n", path, n->combined_pattern);
break;
@ -373,23 +369,22 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
return NULL;
}
PCRE2_SIZE *ov = pcre2_get_ovector_pointer(n->match_data);
restlen = path_len - ov[1]; // if it's fully matched to the end (rest string length)
int *inv = ov + 2;
if (!restlen) {
// Check the substring to decide we should go deeper on which edge
for (i = 1; i < rc; i++)
{
substring_length = *(inv+1) - *inv;
substring_length = ov[2*i+1] - ov[2*i];
// if it's not matched for this edge, just skip them quickly
if (!is_end && !substring_length) {
inv += 2;
continue;
}
substring_start = path + *inv;
substring_start = path + ov[2*i];
e = n->edges.entries + i - 1;
if (entry && e->has_slug) {
@ -404,18 +399,16 @@ static R3Node * r3_tree_matchl_base(const R3Node * n, const char * path,
// Check the substring to decide we should go deeper on which edge
inv = ov + 2;
for (i = 1; i < rc; i++)
{
substring_length = *(inv+1) - *inv;
substring_length = ov[2*i+1] - ov[2*i];
// if it's not matched for this edge, just skip them quickly
if (!is_end && !substring_length) {
inv += 2;
continue;
}
substring_start = path + *inv;
substring_start = path + ov[2*i];
e = n->edges.entries + i - 1;
if (entry && e->has_slug) {
@ -520,7 +513,6 @@ inline R3Edge * r3_node_find_edge_str(const R3Node * n, const char * str, int st
// n->endpoint = 0;
// n->combined_pattern = NULL;
// n->pcre_pattern = NULL;
// n->pcre_extra = NULL;
// n->data = NULL;
// return n;
// }