[gtksourceview/wip/chergert/pcre2] pcre2: port GtkSourceRegex to PCRE2
- From: Christian Hergert <chergert src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gtksourceview/wip/chergert/pcre2] pcre2: port GtkSourceRegex to PCRE2
- Date: Wed, 30 Sep 2020 15:39:10 +0000 (UTC)
commit 07ee3db2f58c77efc7d1ce46219ea843e6a33bc7
Author: Christian Hergert <chergert redhat com>
Date: Fri Sep 25 10:23:18 2020 -0700
pcre2: port GtkSourceRegex to PCRE2
The goal here is to move to a JIT backed PCRE2 implementation, however
this just gets an initial port in place to use PCRE2 for this code. We can
eventually move other parts of GtkSourceView over to this implementation
as well depending on how complete we need to be.
TODO:
* Implement more profiling with Sysprof to compare performance
gtksourceview/gtksourceregex.c | 4 +-
gtksourceview/implregex-private.h | 8 +-
gtksourceview/implregex.c | 510 +++++++++++++++++++++++++++++++-------
3 files changed, 423 insertions(+), 99 deletions(-)
---
diff --git a/gtksourceview/gtksourceregex.c b/gtksourceview/gtksourceregex.c
index 80d334a2..dc36c1ac 100644
--- a/gtksourceview/gtksourceregex.c
+++ b/gtksourceview/gtksourceregex.c
@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex,
gint *start_pos_p, /* byte offsets */
gint *end_pos_p) /* byte offsets */
{
- gint start_pos;
- gint end_pos;
+ gint start_pos = -1;
+ gint end_pos = -1;
g_assert (regex->resolved);
diff --git a/gtksourceview/implregex-private.h b/gtksourceview/implregex-private.h
index da52474e..940ecb01 100644
--- a/gtksourceview/implregex-private.h
+++ b/gtksourceview/implregex-private.h
@@ -41,6 +41,7 @@ gboolean impl_regex_match (const ImplRegex *regex,
const char *string,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info);
+ImplRegex *impl_regex_ref (ImplRegex *regex);
void impl_regex_unref (ImplRegex *regex);
void impl_match_info_free (ImplMatchInfo *match_info);
char *impl_match_info_fetch (const ImplMatchInfo *match_info,
@@ -58,18 +59,21 @@ char *impl_regex_replace_eval (const ImplRegex *regex,
gboolean impl_regex_match_full (const ImplRegex *regex,
const char *string,
gssize string_len,
- int start_position,
+ gsize start_position,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info,
GError **error);
gboolean impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
- int match_num,
+ guint match_num,
int *start_pos,
int *end_pos);
gboolean impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
const char *name,
int *start_pos,
int *end_pos);
+gboolean impl_match_info_matches (const ImplMatchInfo *match_info);
+gboolean impl_match_info_next (ImplMatchInfo *match_info,
+ GError **error);
const char *impl_regex_get_pattern (const ImplRegex *regex);
G_END_DECLS
diff --git a/gtksourceview/implregex.c b/gtksourceview/implregex.c
index 56a12799..32803486 100644
--- a/gtksourceview/implregex.c
+++ b/gtksourceview/implregex.c
@@ -21,46 +21,99 @@
#include "config.h"
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <pcre2.h>
+#include <string.h>
+
#include "implregex-private.h"
+#include "gtksourcetrace.h"
struct _ImplRegex
{
- int ref_count;
- char *pattern;
- GRegex *re;
+ int ref_count;
+ char *pattern;
+ gsize compile_flags;
+ gsize match_flags;
+ pcre2_compile_context *context;
+ pcre2_code *code;
+ PCRE2_SPTR name_table;
+ int name_count;
+ int name_entry_size;
};
struct _ImplMatchInfo
{
- GMatchInfo *match_info;
+ gsize compile_flags;
+ gsize match_flags;
+ ImplRegex *regex;
+ const char *string;
+ gsize string_len;
+ pcre2_match_data *match_data;
+ PCRE2_SIZE *offsets;
+ int n_groups;
+ gsize start_pos;
};
-#if 0
-static void
-set_regex_error (GError **error,
- int errnum)
+static gsize
+translate_compile_flags (GRegexCompileFlags flags)
{
- guchar errstr[128];
+ gsize ret = 0;
+
+ if (flags & G_REGEX_RAW)
+ ret |= PCRE2_NO_UTF_CHECK;
+ else
+ ret |= PCRE2_UTF;
+
+ if (flags & G_REGEX_ANCHORED)
+ ret |= PCRE2_ANCHORED;
+
+ if (flags & G_REGEX_CASELESS)
+ ret |= PCRE2_CASELESS;
- pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
- errstr[sizeof errstr - 1] = 0;
+ if (flags & G_REGEX_NEWLINE_LF)
+ ret |= PCRE2_NEWLINE_LF;
- g_set_error_literal (error,
- G_REGEX_ERROR,
- G_REGEX_ERROR_COMPILE,
- (const gchar *)errstr);
+ if (flags & G_REGEX_NEWLINE_CR)
+ ret |= PCRE2_NEWLINE_CR;
+
+ return ret;
}
-#endif
-static ImplMatchInfo *
-impl_match_info_new (const ImplRegex *regex)
+static gsize
+translate_match_flags (GRegexMatchFlags flags)
{
- ImplMatchInfo *match_info;
+ gsize ret = 0;
- match_info = g_slice_new0 (ImplMatchInfo);
- match_info->match_info = NULL;
+ if (flags & G_REGEX_MATCH_ANCHORED)
+ ret |= PCRE2_ANCHORED;
- return match_info;
+ return ret;
+}
+
+static gboolean
+set_regex_error (GError **error,
+ int errnum)
+{
+ if (errnum > 0)
+ {
+ return FALSE;
+ }
+
+ if (error != NULL)
+ {
+ guchar errstr[128];
+
+ pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
+ errstr[sizeof errstr - 1] = 0;
+
+ g_set_error_literal (error,
+ G_REGEX_ERROR,
+ G_REGEX_ERROR_COMPILE,
+ (const gchar *)errstr);
+ }
+
+ return TRUE;
}
ImplRegex *
@@ -69,22 +122,67 @@ impl_regex_new (const char *pattern,
GRegexMatchFlags match_options,
GError **error)
{
- GRegex *re;
+ pcre2_compile_context *context;
+ pcre2_code *code;
ImplRegex *regex;
+ PCRE2_SIZE erroffset;
+ int errnumber = 0;
+#ifdef GTK_SOURCE_PROFILER_ENABLED
+ char *message;
+#endif
+
+ GTK_SOURCE_PROFILER_BEGIN_MARK;
g_return_val_if_fail (pattern != NULL, NULL);
+ g_return_val_if_fail (strstr (pattern, "\\K") == NULL, NULL);
- re = g_regex_new (pattern, compile_options, match_options, error);
+ context = pcre2_compile_context_create (NULL);
- if (re == NULL)
+ regex = g_slice_new0 (ImplRegex);
+ regex->ref_count = 1;
+ regex->context = context;
+ regex->pattern = g_strdup (pattern);
+ regex->compile_flags = translate_compile_flags (compile_options);
+ regex->match_flags = translate_match_flags (match_options);
+
+ if (regex->compile_flags & PCRE2_NEWLINE_LF)
+ pcre2_set_newline (context, PCRE2_NEWLINE_LF);
+ else if (regex->compile_flags & PCRE2_NEWLINE_CR)
+ pcre2_set_newline (context, PCRE2_NEWLINE_CR);
+
+ regex->code = pcre2_compile ((PCRE2_SPTR)pattern,
+ PCRE2_ZERO_TERMINATED,
+ regex->compile_flags,
+ &errnumber,
+ &erroffset,
+ context);
+
+ if (set_regex_error (error, errnumber))
{
+ impl_regex_unref (regex);
return NULL;
}
- regex = g_slice_new0 (ImplRegex);
- regex->ref_count = 1;
- regex->pattern = g_strdup (pattern);
- regex->re = re;
+ pcre2_pattern_info (code, PCRE2_INFO_NAMECOUNT, &regex->name_count);
+
+ if (regex->name_count > 0)
+ {
+ (void)pcre2_pattern_info (code,
+ PCRE2_INFO_NAMEENTRYSIZE,
+ &regex->name_entry_size);
+ (void)pcre2_pattern_info (code,
+ PCRE2_INFO_NAMETABLE,
+ &regex->name_table);
+ }
+
+#ifdef GTK_SOURCE_PROFILER_ENABLED
+ message = g_strdup_printf ("compile=%lx match=%lx pattern=%s",
+ regex->compile_flags,
+ regex->match_flags,
+ regex->pattern);
+ GTK_SOURCE_PROFILER_END_MARK (G_STRFUNC, message);
+ g_free (message);
+#endif
return regex;
}
@@ -97,6 +195,17 @@ impl_regex_get_pattern (const ImplRegex *regex)
return regex->pattern;
}
+ImplRegex *
+impl_regex_ref (ImplRegex *regex)
+{
+ g_return_val_if_fail (regex != NULL, NULL);
+ g_return_val_if_fail (regex->ref_count > 0, NULL);
+
+ regex->ref_count++;
+
+ return regex;
+}
+
void
impl_regex_unref (ImplRegex *regex)
{
@@ -108,16 +217,64 @@ impl_regex_unref (ImplRegex *regex)
if (regex->ref_count == 0)
{
g_clear_pointer (&regex->pattern, g_free);
- g_clear_pointer (&regex->re, g_regex_unref);
+ g_clear_pointer (&regex->code, pcre2_code_free);
+ g_clear_pointer (&regex->context, pcre2_compile_context_free);
g_slice_free (ImplRegex, regex);
}
}
+static ImplMatchInfo *
+impl_match_info_new (ImplRegex *regex,
+ GRegexMatchFlags match_options,
+ const char *string,
+ gssize string_len)
+{
+ ImplMatchInfo *match_info;
+
+ g_assert (regex != NULL);
+ g_assert (string != NULL);
+ g_assert (string_len <= strlen (string));
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ match_info = g_slice_new0 (ImplMatchInfo);
+ match_info->regex = impl_regex_ref (regex);
+ match_info->match_flags = regex->match_flags | translate_match_flags (match_options);
+ match_info->start_pos = -1;
+ match_info->n_groups = -1;
+ match_info->string = string;
+ match_info->string_len = string_len;
+ match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL);
+
+ if (match_info->match_data == NULL)
+ {
+ g_error ("Failed to allocate match data");
+ }
+
+ match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data);
+
+ return match_info;
+}
+
void
impl_match_info_free (ImplMatchInfo *match_info)
{
- g_clear_pointer (&match_info->match_info, g_match_info_free);
- g_slice_free (ImplMatchInfo, match_info);
+ if (match_info != NULL)
+ {
+ g_clear_pointer (&match_info->match_data, pcre2_match_data_free);
+ g_clear_pointer (&match_info->regex, impl_regex_unref);
+ match_info->string = NULL;
+ match_info->string_len = 0;
+ match_info->compile_flags = 0;
+ match_info->match_flags = 0;
+ match_info->n_groups = 0;
+ match_info->start_pos = 0;
+ match_info->offsets = NULL;
+ g_slice_free (ImplMatchInfo, match_info);
+ }
}
gboolean
@@ -127,51 +284,58 @@ impl_regex_match (const ImplRegex *regex,
ImplMatchInfo **match_info)
{
g_return_val_if_fail (regex != NULL, FALSE);
- g_return_val_if_fail (regex->re != NULL, FALSE);
+ g_return_val_if_fail (regex->code != NULL, FALSE);
+ g_return_val_if_fail (string != NULL, FALSE);
- if (match_info != NULL)
- {
- *match_info = impl_match_info_new (regex);
- }
-
- return g_regex_match (regex->re,
- string,
- match_options,
- match_info ? &(*match_info)->match_info : NULL);
+ return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL);
}
char *
impl_match_info_fetch (const ImplMatchInfo *match_info,
int match_num)
{
+ int begin = -1;
+ int end = -1;
+
g_return_val_if_fail (match_info != NULL, NULL);
+ g_return_val_if_fail (match_info->string != NULL, NULL);
+ g_return_val_if_fail (match_info->offsets != NULL, NULL);
+
+ if (match_info->start_pos < match_info->string_len)
+ {
+ if (impl_match_info_fetch_pos (match_info, match_num, &begin, &end))
+ {
+ if (begin >= 0 && end >= 0)
+ {
+ return g_strndup (match_info->string + begin, end - begin);
+ }
+ }
+ }
- return g_match_info_fetch (match_info->match_info, match_num);
+ return NULL;
}
char *
impl_match_info_fetch_named (const ImplMatchInfo *match_info,
const char *name)
{
+ int begin = -1;
+ int end = -1;
+
g_return_val_if_fail (match_info != NULL, NULL);
- return g_match_info_fetch_named (match_info->match_info, name);
-}
+ if (match_info->start_pos < match_info->string_len)
+ {
+ if (impl_match_info_fetch_named_pos (match_info, name, &begin, &end))
+ {
+ if (begin >= 0 && end >= 0)
+ {
+ return g_strndup (match_info->string + begin, end - begin);
+ }
+ }
+ }
-static gboolean
-wrapper_eval (const GMatchInfo *match_info,
- GString *result,
- gpointer user_data)
-{
- struct {
- ImplRegexEvalCallback callback;
- gpointer user_data;
- } *wrapper = user_data;
- ImplMatchInfo wrapped = {
- .match_info = (GMatchInfo *)match_info,
- };
-
- return wrapper->callback (&wrapped, result, wrapper->user_data);
+ return NULL;
}
char *
@@ -184,58 +348,98 @@ impl_regex_replace_eval (const ImplRegex *regex,
gpointer user_data,
GError **error)
{
- struct {
- ImplRegexEvalCallback callback;
- gpointer user_data;
- } wrapper;
+ ImplMatchInfo *match_info;
+ GString *out_string;
+ gboolean done;
+ gsize prev_begin;
+ gsize str_pos;
g_return_val_if_fail (regex != NULL, NULL);
- g_return_val_if_fail (regex->re != NULL, NULL);
-
- wrapper.callback = eval;
- wrapper.user_data = user_data;
-
- return g_regex_replace_eval (regex->re,
- string,
- string_len,
- start_position,
- match_options,
- wrapper_eval,
- &wrapper,
- error);
+ g_return_val_if_fail (regex->code != NULL, NULL);
+ g_return_val_if_fail (start_position >= 0, NULL);
+
+ g_error ("++++++ Replace eval\n");
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ match_info = NULL;
+
+ if (!impl_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, error))
+ {
+ impl_match_info_free (match_info);
+ return g_strndup (string, string_len);
+ }
+
+ g_assert (match_info != NULL);
+ g_assert (match_info->n_groups > 0);
+
+ str_pos = 0;
+ out_string = g_string_sized_new (string_len);
+ done = FALSE;
+
+ while (!done && impl_match_info_matches (match_info))
+ {
+ prev_begin = match_info->offsets[0];
+ g_string_append_len (out_string, string + str_pos, prev_begin - str_pos);
+ str_pos = match_info->offsets[1];
+
+ done = eval (match_info, out_string, user_data);
+
+ if (!impl_match_info_next (match_info, NULL))
+ {
+ break;
+ }
+ }
+
+ g_string_append_len (out_string,
+ string + str_pos,
+ string_len - str_pos);
+
+ impl_match_info_free (match_info);
+
+ return g_string_free (out_string, FALSE);
}
gboolean
impl_regex_match_full (const ImplRegex *regex,
const char *string,
gssize string_len,
- int start_position,
+ gsize start_position,
GRegexMatchFlags match_options,
ImplMatchInfo **match_info,
GError **error)
{
- GMatchInfo *wrapped = NULL;
- gboolean ret;
+ ImplMatchInfo *local_match_info = NULL;
+ gboolean ret = FALSE;
g_return_val_if_fail (regex != NULL, FALSE);
- g_return_val_if_fail (regex->re != NULL, FALSE);
+ g_return_val_if_fail (regex->code != NULL, FALSE);
+ g_return_val_if_fail (match_options == 0, FALSE);
+ g_return_val_if_fail (string != NULL, FALSE);
+
+ if (string_len < 0)
+ {
+ string_len = strlen (string);
+ }
+
+ local_match_info = impl_match_info_new ((ImplRegex *)regex, match_options, string, string_len);
- ret = g_regex_match_full (regex->re,
- string,
- string_len,
- start_position,
- match_options,
- &wrapped,
- error);
+ local_match_info->start_pos = start_position;
+ local_match_info->offsets[0] = start_position;
+ local_match_info->offsets[1] = start_position;
+
+ ret = impl_match_info_next (local_match_info, error);
if (match_info != NULL)
{
- *match_info = g_slice_new0 (ImplMatchInfo);
- (*match_info)->match_info = wrapped;
+ *match_info = g_steal_pointer (&local_match_info);
}
else
{
- g_match_info_free (wrapped);
+ impl_match_info_free (local_match_info);
}
return ret;
@@ -243,14 +447,26 @@ impl_regex_match_full (const ImplRegex *regex,
gboolean
impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
- int match_num,
+ guint match_num,
int *start_pos,
int *end_pos)
{
g_return_val_if_fail (match_info != NULL, FALSE);
- g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+ g_return_val_if_fail (match_info->offsets != NULL, FALSE);
+
+ if (match_info->n_groups > 0 && match_num < match_info->n_groups)
+ {
+ if (start_pos)
+ *start_pos = match_info->offsets[2*match_num];
- return g_match_info_fetch_pos (match_info->match_info, match_num, start_pos, end_pos);
+ if (end_pos)
+ *end_pos = match_info->offsets[2*match_num+1];
+
+ return TRUE;
+ }
+
+ return FALSE;
}
gboolean
@@ -259,8 +475,112 @@ impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
int *start_pos,
int *end_pos)
{
+ PCRE2_SPTR tabptr;
+
g_return_val_if_fail (match_info != NULL, FALSE);
- g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+ g_return_val_if_fail (match_info->regex != NULL, FALSE);
+ g_return_val_if_fail (start_pos != NULL, FALSE);
+ g_return_val_if_fail (end_pos != NULL, FALSE);
+
+ tabptr = match_info->regex->name_table;
+
+ for (gsize i = 0; i < match_info->regex->name_count; i++)
+ {
+ PCRE2_SIZE n = (tabptr[0] << 8) | tabptr[1];
+
+ if (g_strcmp0 (name, (const char *)(tabptr+2)) == 0)
+ {
+ return impl_match_info_fetch_pos (match_info, n, start_pos, end_pos);
+ }
+
+ tabptr += match_info->regex->name_entry_size;
+ }
+
+ return FALSE;
+}
+
+gboolean
+impl_match_info_matches (const ImplMatchInfo *match_info)
+{
+ g_return_val_if_fail (match_info != NULL, FALSE);
+ g_return_val_if_fail (match_info->n_groups != 0, FALSE);
+
+ return match_info->n_groups > 0;
+}
+
+gboolean
+impl_match_info_next (ImplMatchInfo *match_info,
+ GError **error)
+{
+ gssize prev_end;
+ gssize prev_begin;
+ int rc;
+
+ g_assert (match_info != NULL);
+ g_assert (match_info->regex != NULL);
+ g_assert (match_info->regex->code != NULL);
+ g_assert (match_info->offsets == pcre2_get_ovector_pointer (match_info->match_data));
+
+again:
+ match_info->n_groups = -1;
+
+ if (match_info->start_pos >= match_info->string_len)
+ {
+ g_set_error_literal (error,
+ G_REGEX_ERROR,
+ G_REGEX_ERROR_MATCH,
+ "No matches");
+ return FALSE;
+ }
+
+ prev_begin = match_info->offsets[0];
+ prev_end = match_info->offsets[1];
+
+ rc = pcre2_match (match_info->regex->code,
+ (PCRE2_SPTR)match_info->string,
+ match_info->string_len,
+ match_info->start_pos,
+ match_info->match_flags,
+ match_info->match_data,
+ NULL);
+
+ if (set_regex_error (error, rc))
+ {
+ match_info->n_groups = -1;
+ match_info->start_pos = match_info->string_len + 1;
+ return FALSE;
+ }
+
+ if (prev_end == match_info->offsets[1])
+ {
+ const char *next = g_utf8_next_char (match_info->string + prev_end);
+
+ if (match_info->start_pos > match_info->string_len)
+ {
+ match_info->start_pos = match_info->string_len + 1;
+ match_info->n_groups = -1;
+ return FALSE;
+ }
+
+ match_info->start_pos = next - match_info->string;
+ }
+ else
+ {
+ match_info->start_pos = match_info->offsets[1];
+ }
+
+ if (match_info->n_groups >= 0 &&
+ prev_begin == match_info->offsets[0] &&
+ prev_end == match_info->offsets[1])
+ {
+ goto again;
+ }
+
+ match_info->n_groups = rc;
+
+ g_assert (match_info->offsets == pcre2_get_ovector_pointer (match_info->match_data));
+ g_assert (impl_match_info_matches (match_info));
- return g_match_info_fetch_named_pos (match_info->match_info, name, start_pos, end_pos);
+ return impl_match_info_matches (match_info);
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]