[gtksourceview/wip/chergert/pcre2] pcre2: port GtkSourceRegex to PCRE2




commit 07ee3db2f58c77efc7d1ce46219ea843e6a33bc7
Author: Christian Hergert <chergert redhat com>
Date:   Fri Sep 25 10:23:18 2020 -0700

    pcre2: port GtkSourceRegex to PCRE2
    
    The goal here is to move to a JIT-backed PCRE2 implementation; however,
    this just gets an initial port in place to use PCRE2 for this code. We can
    eventually move other parts of GtkSourceView over to this implementation
    as well, depending on how complete we need to be.
    
    TODO:
    
      * Implement more profiling with Sysprof to compare performance

 gtksourceview/gtksourceregex.c    |   4 +-
 gtksourceview/implregex-private.h |   8 +-
 gtksourceview/implregex.c         | 510 +++++++++++++++++++++++++++++++-------
 3 files changed, 423 insertions(+), 99 deletions(-)
---
diff --git a/gtksourceview/gtksourceregex.c b/gtksourceview/gtksourceregex.c
index 80d334a2..dc36c1ac 100644
--- a/gtksourceview/gtksourceregex.c
+++ b/gtksourceview/gtksourceregex.c
@@ -354,8 +354,8 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex,
                                   gint           *start_pos_p, /* byte offsets */
                                   gint           *end_pos_p)   /* byte offsets */
 {
-       gint start_pos;
-       gint end_pos;
+       gint start_pos = -1;
+       gint end_pos = -1;
 
        g_assert (regex->resolved);
 
diff --git a/gtksourceview/implregex-private.h b/gtksourceview/implregex-private.h
index da52474e..940ecb01 100644
--- a/gtksourceview/implregex-private.h
+++ b/gtksourceview/implregex-private.h
@@ -41,6 +41,7 @@ gboolean    impl_regex_match                (const ImplRegex        *regex,
                                              const char             *string,
                                              GRegexMatchFlags        match_options,
                                              ImplMatchInfo         **match_info);
+ImplRegex  *impl_regex_ref                  (ImplRegex              *regex);
 void        impl_regex_unref                (ImplRegex              *regex);
 void        impl_match_info_free            (ImplMatchInfo          *match_info);
 char       *impl_match_info_fetch           (const ImplMatchInfo    *match_info,
@@ -58,18 +59,21 @@ char       *impl_regex_replace_eval         (const ImplRegex        *regex,
 gboolean    impl_regex_match_full           (const ImplRegex        *regex,
                                              const char             *string,
                                              gssize                  string_len,
-                                             int                     start_position,
+                                             gsize                   start_position,
                                              GRegexMatchFlags        match_options,
                                              ImplMatchInfo         **match_info,
                                              GError                **error);
 gboolean    impl_match_info_fetch_pos       (const ImplMatchInfo    *match_info,
-                                             int                     match_num,
+                                             guint                   match_num,
                                              int                    *start_pos,
                                              int                    *end_pos);
 gboolean    impl_match_info_fetch_named_pos (const ImplMatchInfo    *match_info,
                                              const char             *name,
                                              int                    *start_pos,
                                              int                    *end_pos);
+gboolean    impl_match_info_matches         (const ImplMatchInfo    *match_info);
+gboolean    impl_match_info_next            (ImplMatchInfo          *match_info,
+                                             GError                **error);
 const char *impl_regex_get_pattern          (const ImplRegex        *regex);
 
 G_END_DECLS
diff --git a/gtksourceview/implregex.c b/gtksourceview/implregex.c
index 56a12799..32803486 100644
--- a/gtksourceview/implregex.c
+++ b/gtksourceview/implregex.c
@@ -21,46 +21,99 @@
 
 #include "config.h"
 
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <pcre2.h>
+#include <string.h>
+
 #include "implregex-private.h"
+#include "gtksourcetrace.h"
 
 struct _ImplRegex
 {
-       int         ref_count;
-       char       *pattern;
-       GRegex     *re;
+       int                    ref_count; /* set to 1 on creation; the object is freed when it reaches 0 */
+       char                  *pattern; /* g_strdup() copy of the pattern text */
+       gsize                  compile_flags; /* PCRE2 options derived from the GRegex compile flags */
+       gsize                  match_flags; /* default match options, OR'ed into each match's options */
+       pcre2_compile_context *context; /* compile context, released in impl_regex_unref() */
+       pcre2_code            *code; /* compiled pattern returned by pcre2_compile() */
+       PCRE2_SPTR             name_table; /* PCRE2_INFO_NAMETABLE; only queried when name_count > 0 */
+       int                    name_count; /* PCRE2_INFO_NAMECOUNT */
+       int                    name_entry_size; /* PCRE2_INFO_NAMEENTRYSIZE; stride used to walk name_table */
 };
 
 struct _ImplMatchInfo
 {
-       GMatchInfo *match_info;
+       gsize             compile_flags; /* not assigned by impl_match_info_new(); only cleared on free */
+       gsize             match_flags; /* regex defaults OR'ed with the per-match options */
+       ImplRegex        *regex; /* owned reference, dropped in impl_match_info_free() */
+       const char       *string; /* subject text; the pointer is stored, the text is not copied */
+       gsize             string_len; /* length of the subject passed to pcre2_match() */
+       pcre2_match_data *match_data; /* created with pcre2_match_data_create_from_pattern() */
+       PCRE2_SIZE       *offsets; /* ovector of match_data: start/end pair per group */
+       int               n_groups; /* pcre2_match() result; positive only while a match is held */
+       gsize             start_pos; /* offset at which the next pcre2_match() call starts */
 };
 
-#if 0
-static void
-set_regex_error (GError **error,
-                 int      errnum)
+static gsize
+translate_compile_flags (GRegexCompileFlags flags) /* GRegex compile flags -> PCRE2 bit mask */
 {
-       guchar errstr[128];
+       gsize ret = 0;
+
+       if (flags & G_REGEX_RAW)
+               ret |= PCRE2_NO_UTF_CHECK;
+       else
+               ret |= PCRE2_UTF; /* non-raw patterns are compiled in UTF mode */
+
+       if (flags & G_REGEX_ANCHORED)
+               ret |= PCRE2_ANCHORED;
+
+       if (flags & G_REGEX_CASELESS)
+               ret |= PCRE2_CASELESS;
 
-       pcre2_get_error_message (errnum, errstr, sizeof errstr - 1);
-       errstr[sizeof errstr - 1] = 0;
+       if (flags & G_REGEX_NEWLINE_LF)
+               ret |= PCRE2_NEWLINE_LF; /* NOTE(review): PCRE2_NEWLINE_* are pcre2_set_newline() values, not pcre2_compile() option bits; confirm they do not alias real options */
 
-       g_set_error_literal (error,
-                            G_REGEX_ERROR,
-                            G_REGEX_ERROR_COMPILE,
-                            (const gchar *)errstr);
+       if (flags & G_REGEX_NEWLINE_CR)
+               ret |= PCRE2_NEWLINE_CR; /* NOTE(review): same concern as for PCRE2_NEWLINE_LF */
+
+       return ret; /* flags not tested above are dropped */
 }
-#endif
 
-static ImplMatchInfo *
-impl_match_info_new (const ImplRegex *regex)
+static gsize
+translate_match_flags (GRegexMatchFlags flags) /* GRegex match flags -> PCRE2 match options */
 {
-       ImplMatchInfo *match_info;
+       gsize ret = 0;
 
-       match_info = g_slice_new0 (ImplMatchInfo);
-       match_info->match_info = NULL;
+       if (flags & G_REGEX_MATCH_ANCHORED) /* the only match flag that is translated */
+               ret |= PCRE2_ANCHORED;
 
-       return match_info;
+       return ret;
+}
+
+/* Reports a PCRE2 status code: returns FALSE for positive codes, otherwise
+ * TRUE after storing the PCRE2 message in @error (when @error is non-NULL). */
+static gboolean
+set_regex_error (GError **error,
+                 int      errnum)
+{
+       guchar message[128];
+
+       if (errnum > 0)
+               return FALSE;
+
+       /* Still a failure even when the caller does not want the details. */
+       if (error == NULL)
+               return TRUE;
+
+       pcre2_get_error_message (errnum, message, sizeof message - 1);
+       message[sizeof message - 1] = 0;
+
+       g_set_error_literal (error,
+                            G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
+                            (const gchar *)message);
+
+       return TRUE;
 }
 
 ImplRegex *
@@ -69,22 +122,67 @@ impl_regex_new (const char          *pattern,
                 GRegexMatchFlags     match_options,
                 GError             **error)
 {
-       GRegex *re;
+       pcre2_compile_context *context;
+       guchar errstr[128]; /* receives the PCRE2 message when compilation fails */
        ImplRegex *regex;
+       PCRE2_SIZE erroffset;
+       int errnumber = 0;
+#ifdef GTK_SOURCE_PROFILER_ENABLED
+       char *message;
+#endif
+
+       GTK_SOURCE_PROFILER_BEGIN_MARK;
 
        g_return_val_if_fail (pattern != NULL, NULL);
+       g_return_val_if_fail (strstr (pattern, "\\K") == NULL, NULL);
 
-       re = g_regex_new (pattern, compile_options, match_options, error);
+       context = pcre2_compile_context_create (NULL);
 
-       if (re == NULL)
+       regex = g_slice_new0 (ImplRegex);
+       regex->ref_count = 1;
+       regex->context = context;
+       regex->pattern = g_strdup (pattern);
+       regex->match_flags = translate_match_flags (match_options);
+       /* PCRE2_NEWLINE_* are pcre2_set_newline() values, not option bits: strip them. */
+       regex->compile_flags = translate_compile_flags (compile_options) & ~(gsize)(PCRE2_NEWLINE_LF | PCRE2_NEWLINE_CR);
+       if (compile_options & G_REGEX_NEWLINE_LF)
+               pcre2_set_newline (context, PCRE2_NEWLINE_LF);
+       else if (compile_options & G_REGEX_NEWLINE_CR)
+               pcre2_set_newline (context, PCRE2_NEWLINE_CR);
+
+       regex->code = pcre2_compile ((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, regex->compile_flags,
+                                    &errnumber, &erroffset, context);
+       if (regex->code == NULL) /* compile errors are positive codes, which set_regex_error() ignores */
+       {
+               pcre2_get_error_message (errnumber, errstr, sizeof errstr);
+               g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, (const gchar *)errstr);
+       }
+       if (regex->code == NULL)
        {
+               impl_regex_unref (regex);
                return NULL;
        }
 
-       regex = g_slice_new0 (ImplRegex);
-       regex->ref_count = 1;
-       regex->pattern = g_strdup (pattern);
-       regex->re = re;
+       pcre2_pattern_info (regex->code, PCRE2_INFO_NAMECOUNT, &regex->name_count); /* query the compiled code, not an unset local */
+
+       if (regex->name_count > 0)
+       {
+               (void)pcre2_pattern_info (regex->code,
+                                         PCRE2_INFO_NAMEENTRYSIZE,
+                                         &regex->name_entry_size);
+               (void)pcre2_pattern_info (regex->code,
+                                         PCRE2_INFO_NAMETABLE,
+                                         &regex->name_table);
+       }
+
+#ifdef GTK_SOURCE_PROFILER_ENABLED
+       message = g_strdup_printf ("compile=%lx match=%lx pattern=%s",
+                                  (gulong)regex->compile_flags,
+                                  (gulong)regex->match_flags,
+                                  regex->pattern);
+       GTK_SOURCE_PROFILER_END_MARK (G_STRFUNC, message);
+       g_free (message);
+#endif
 
        return regex;
 }
@@ -97,6 +195,17 @@ impl_regex_get_pattern (const ImplRegex *regex)
        return regex->pattern;
 }
 
+/* Adds one reference to @regex and returns it; NULL if @regex is invalid. */
+ImplRegex *
+impl_regex_ref (ImplRegex *regex)
+{
+       g_return_val_if_fail (regex != NULL, NULL);
+       g_return_val_if_fail (regex->ref_count > 0, NULL);
+
+       regex->ref_count += 1;
+       return regex;
+}
+
 void
 impl_regex_unref (ImplRegex *regex)
 {
@@ -108,16 +217,64 @@ impl_regex_unref (ImplRegex *regex)
        if (regex->ref_count == 0)
        {
                g_clear_pointer (&regex->pattern, g_free);
-               g_clear_pointer (&regex->re, g_regex_unref);
+               g_clear_pointer (&regex->code, pcre2_code_free);
+               g_clear_pointer (&regex->context, pcre2_compile_context_free);
                g_slice_free (ImplRegex, regex);
        }
 }
 
+/* Allocates match state for @regex over @string; a negative @string_len means NUL-terminated. */
+static ImplMatchInfo *
+impl_match_info_new (ImplRegex        *regex,
+                     GRegexMatchFlags  match_options,
+                     const char       *string,
+                     gssize            string_len)
+{
+       ImplMatchInfo *match_info;
+
+       g_assert (regex != NULL);
+       g_assert (string != NULL);
+
+       /* Resolve a negative length before validating it: compared against
+        * strlen() it would be converted to a huge unsigned value and abort. */
+       if (string_len < 0)
+               string_len = strlen (string);
+
+       g_assert ((gsize)string_len <= strlen (string));
+
+       match_info = g_slice_new0 (ImplMatchInfo);
+       match_info->regex = impl_regex_ref (regex);
+       match_info->match_flags = regex->match_flags | translate_match_flags (match_options);
+       match_info->start_pos = -1;
+       match_info->n_groups = -1;
+       match_info->string = string;
+       match_info->string_len = string_len;
+       match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL);
+
+       if (match_info->match_data == NULL)
+               g_error ("Failed to allocate match data");
+
+       match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data);
+
+       return match_info;
+}
+
 void
 impl_match_info_free (ImplMatchInfo *match_info)
 {
-       g_clear_pointer (&match_info->match_info, g_match_info_free);
-       g_slice_free (ImplMatchInfo, match_info);
+       if (match_info == NULL)
+               return;
+
+       g_clear_pointer (&match_info->match_data, pcre2_match_data_free);
+       g_clear_pointer (&match_info->regex, impl_regex_unref);
+
+       /* Reset the plain fields before handing the slice back. */
+       match_info->string = NULL;
+       match_info->offsets = NULL;
+       match_info->string_len = match_info->start_pos = 0;
+       match_info->compile_flags = match_info->match_flags = 0;
+       match_info->n_groups = 0;
+       g_slice_free (ImplMatchInfo, match_info);
 }
 
 gboolean
@@ -127,51 +284,58 @@ impl_regex_match (const ImplRegex   *regex,
                   ImplMatchInfo    **match_info)
 {
        g_return_val_if_fail (regex != NULL, FALSE);
-       g_return_val_if_fail (regex->re != NULL, FALSE);
+       g_return_val_if_fail (regex->code != NULL, FALSE);
+       g_return_val_if_fail (string != NULL, FALSE);
 
-       if (match_info != NULL)
-       {
-               *match_info = impl_match_info_new (regex);
-       }
-
-       return g_regex_match (regex->re,
-                             string,
-                             match_options,
-                             match_info ? &(*match_info)->match_info : NULL);
+       return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL);
 }
 
 char *
 impl_match_info_fetch (const ImplMatchInfo *match_info,
                        int                  match_num)
 {
+       int begin = -1;
+       int end = -1;
+
        g_return_val_if_fail (match_info != NULL, NULL);
+       g_return_val_if_fail (match_info->string != NULL, NULL);
+       g_return_val_if_fail (match_info->offsets != NULL, NULL);
+
+       /* start_pos must not gate this lookup: after a match that reaches the end
+        * of the subject it equals string_len although the match is still valid.
+        * impl_match_info_fetch_pos() already fails when no match is held. */
+       if (!impl_match_info_fetch_pos (match_info, match_num, &begin, &end))
+               return NULL;
+
+       /* As with g_match_info_fetch(), a group that took no part in the match
+        * yields an empty string rather than NULL. */
+       if (begin < 0 || end < 0)
+               return g_strdup ("");
+
-       return g_match_info_fetch (match_info->match_info, match_num);
+       return g_strndup (match_info->string + begin, end - begin);
 }
 
 char *
 impl_match_info_fetch_named (const ImplMatchInfo *match_info,
                              const char          *name)
 {
+       int begin = -1;
+       int end = -1;
+
        g_return_val_if_fail (match_info != NULL, NULL);
 
-       return g_match_info_fetch_named (match_info->match_info, name);
-}
+       /* Do not gate on start_pos: after a match that reaches the end of the
+        * subject it equals string_len although the match is still valid. The
+        * position lookup below already fails when no match is held. */
+       if (!impl_match_info_fetch_named_pos (match_info, name, &begin, &end))
+               return NULL;
+
+       /* As with g_match_info_fetch_named(), a group that took no part in the
+        * match yields an empty string rather than NULL. */
+       if (begin < 0 || end < 0)
+               return g_strdup ("");
 
-static gboolean
-wrapper_eval (const GMatchInfo *match_info,
-              GString          *result,
-              gpointer          user_data)
-{
-       struct {
-               ImplRegexEvalCallback callback;
-               gpointer user_data;
-       } *wrapper = user_data;
-       ImplMatchInfo wrapped = {
-               .match_info = (GMatchInfo *)match_info,
-       };
-
-       return wrapper->callback (&wrapped, result, wrapper->user_data);
+       return g_strndup (match_info->string + begin, end - begin);
 }
 
 char *
@@ -184,58 +348,98 @@ impl_regex_replace_eval (const ImplRegex        *regex,
                          gpointer                user_data,
                          GError                **error)
 {
-       struct {
-               ImplRegexEvalCallback callback;
-               gpointer user_data;
-       } wrapper;
+       ImplMatchInfo *match_info;
+       GString *out_string;
+       gboolean done;
+       gsize prev_begin;
+       gsize str_pos;
 
        g_return_val_if_fail (regex != NULL, NULL);
-       g_return_val_if_fail (regex->re != NULL, NULL);
-
-       wrapper.callback = eval;
-       wrapper.user_data = user_data;
-
-       return g_regex_replace_eval (regex->re,
-                                    string,
-                                    string_len,
-                                    start_position,
-                                    match_options,
-                                    wrapper_eval,
-                                    &wrapper,
-                                    error);
+       g_return_val_if_fail (regex->code != NULL, NULL);
+       g_return_val_if_fail (start_position >= 0, NULL);
+       g_return_val_if_fail (string != NULL, NULL);
+
+       /* A negative length means @string is NUL-terminated. */
+       if (string_len < 0)
+       {
+               string_len = strlen (string);
+       }
+
+       match_info = NULL;
+
+       if (!impl_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, error))
+       {
+               impl_match_info_free (match_info); /* tolerates NULL */
+               return g_strndup (string, string_len); /* no match (or a match failure): unmodified copy */
+       }
+
+       g_assert (match_info != NULL);
+       g_assert (match_info->n_groups > 0);
+
+       str_pos = 0;
+       out_string = g_string_sized_new (string_len);
+       done = FALSE;
+
+       while (!done && impl_match_info_matches (match_info))
+       {
+               prev_begin = match_info->offsets[0];
+               g_string_append_len (out_string, string + str_pos, prev_begin - str_pos); /* text between matches */
+               str_pos = match_info->offsets[1];
+
+               done = eval (match_info, out_string, user_data); /* callback appends the replacement; TRUE stops the loop */
+
+               if (!impl_match_info_next (match_info, NULL))
+               {
+                       break;
+               }
+       }
+
+       g_string_append_len (out_string, /* tail after the last match */
+                            string + str_pos,
+                            string_len - str_pos);
+
+       impl_match_info_free (match_info);
+
+       return g_string_free (out_string, FALSE);
 }
 
 gboolean
 impl_regex_match_full (const ImplRegex   *regex,
                        const char        *string,
                        gssize             string_len,
-                       int                start_position,
+                       gsize              start_position,
                        GRegexMatchFlags   match_options,
                        ImplMatchInfo    **match_info,
                        GError           **error)
 {
-       GMatchInfo *wrapped = NULL;
-       gboolean ret;
+       ImplMatchInfo *local_match_info = NULL;
+       gboolean ret = FALSE;
 
        g_return_val_if_fail (regex != NULL, FALSE);
-       g_return_val_if_fail (regex->re != NULL, FALSE);
+       g_return_val_if_fail (regex->code != NULL, FALSE);
+       g_return_val_if_fail (match_options == 0, FALSE);
+       g_return_val_if_fail (string != NULL, FALSE);
+
+       if (string_len < 0)
+       {
+               string_len = strlen (string);
+       }
+
+       local_match_info = impl_match_info_new ((ImplRegex *)regex, match_options, string, string_len);
 
-       ret = g_regex_match_full (regex->re,
-                                 string,
-                                 string_len,
-                                 start_position,
-                                 match_options,
-                                 &wrapped,
-                                 error);
+       local_match_info->start_pos = start_position;
+       local_match_info->offsets[0] = start_position;
+       local_match_info->offsets[1] = start_position;
+
+       ret = impl_match_info_next (local_match_info, error);
 
        if (match_info != NULL)
        {
-               *match_info = g_slice_new0 (ImplMatchInfo);
-               (*match_info)->match_info = wrapped;
+               *match_info = g_steal_pointer (&local_match_info);
        }
        else
        {
-               g_match_info_free (wrapped);
+               impl_match_info_free (local_match_info);
        }
 
        return ret;
@@ -243,14 +447,26 @@ impl_regex_match_full (const ImplRegex   *regex,
 
 gboolean
 impl_match_info_fetch_pos (const ImplMatchInfo *match_info,
-                           int                  match_num,
+                           guint                match_num,
                            int                 *start_pos,
                            int                 *end_pos)
 {
        g_return_val_if_fail (match_info != NULL, FALSE);
-       g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+       g_return_val_if_fail (match_info->offsets != NULL, FALSE);
+
+       /* Without a held match (n_groups <= 0), or for a group index past the
+        * matched groups, there is nothing to report. */
+       if (match_info->n_groups <= 0 || match_num >= match_info->n_groups)
+               return FALSE;
 
-       return g_match_info_fetch_pos (match_info->match_info, match_num, start_pos, end_pos);
+       if (start_pos != NULL)
+               *start_pos = match_info->offsets[2 * match_num];
+
+       if (end_pos != NULL)
+               *end_pos = match_info->offsets[2 * match_num + 1];
+
+       return TRUE;
 }
 
 gboolean
@@ -259,8 +475,112 @@ impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info,
                                  int                 *start_pos,
                                  int                 *end_pos)
 {
+       PCRE2_SPTR entry;
+
        g_return_val_if_fail (match_info != NULL, FALSE);
-       g_return_val_if_fail (match_info->match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->match_data != NULL, FALSE);
+       g_return_val_if_fail (match_info->regex != NULL, FALSE);
+       g_return_val_if_fail (start_pos != NULL, FALSE);
+       g_return_val_if_fail (end_pos != NULL, FALSE);
+
+       /* Each entry holds a big-endian 16-bit group number in its first two
+        * bytes, followed by the NUL-terminated group name. */
+       entry = match_info->regex->name_table;
+       for (gsize i = 0; i < match_info->regex->name_count; i++)
+       {
+               if (g_strcmp0 (name, (const char *)(entry + 2)) == 0)
+               {
+                       PCRE2_SIZE group = (entry[0] << 8) | entry[1];
+                       return impl_match_info_fetch_pos (match_info, group, start_pos, end_pos);
+               }
+
+               entry += match_info->regex->name_entry_size;
+       }
+
+       return FALSE;
+}
+
+/* Reports whether @match_info currently holds a successful match. */
+gboolean
+impl_match_info_matches (const ImplMatchInfo *match_info)
+{
+       g_return_val_if_fail (match_info != NULL, FALSE);
+       g_return_val_if_fail (match_info->n_groups != 0, FALSE);
+       return (match_info->n_groups > 0) ? TRUE : FALSE;
+}
+
+gboolean
+impl_match_info_next (ImplMatchInfo  *match_info,
+                      GError        **error)
+{
+       gssize prev_end;
+       gssize prev_begin;
+       int rc;
+       /* Runs pcre2_match() from start_pos; returns TRUE only when it matched. */
+       g_assert (match_info != NULL);
+       g_assert (match_info->regex != NULL);
+       g_assert (match_info->regex->code != NULL);
+       g_assert (match_info->offsets == pcre2_get_ovector_pointer (match_info->match_data));
+
+again:
+       match_info->n_groups = -1; /* "no match held" until pcre2_match() succeeds */
+
+       if (match_info->start_pos >= match_info->string_len) /* nothing left to scan */
+       {
+               g_set_error_literal (error,
+                                    G_REGEX_ERROR,
+                                    G_REGEX_ERROR_MATCH,
+                                    "No matches");
+               return FALSE;
+       }
+
+       prev_begin = match_info->offsets[0]; /* previous span, read before pcre2_match() overwrites the ovector */
+       prev_end = match_info->offsets[1];
+
+       rc = pcre2_match (match_info->regex->code,
+                         (PCRE2_SPTR)match_info->string,
+                         match_info->string_len,
+                         match_info->start_pos,
+                         match_info->match_flags,
+                         match_info->match_data,
+                         NULL);
+
+       if (set_regex_error (error, rc)) /* TRUE for rc <= 0; per PCRE2 that includes PCRE2_ERROR_NOMATCH */
+       {
+               match_info->n_groups = -1;
+               match_info->start_pos = match_info->string_len + 1; /* makes later calls take the early return above */
+               return FALSE;
+       }
+
+       if (prev_end == match_info->offsets[1]) /* match ends where the previous span ended: step one UTF-8 character */
+       {
+               const char *next = g_utf8_next_char (match_info->string + prev_end);
+
+               if (match_info->start_pos > match_info->string_len) /* NOTE(review): start_pos < string_len was checked above and is unchanged since; looks unreachable */
+               {
+                       match_info->start_pos = match_info->string_len + 1;
+                       match_info->n_groups = -1;
+                       return FALSE;
+               }
+
+               match_info->start_pos = next - match_info->string;
+       }
+       else
+       {
+               match_info->start_pos = match_info->offsets[1]; /* resume right after this match */
+       }
+
+       if (match_info->n_groups >= 0 && /* NOTE(review): n_groups is still -1 here, so this retry looks unreachable */
+           prev_begin == match_info->offsets[0] &&
+           prev_end == match_info->offsets[1])
+       {
+               goto again;
+       }
+
+       match_info->n_groups = rc; /* positive here: set_regex_error() returned FALSE */
+
+       g_assert (match_info->offsets == pcre2_get_ovector_pointer (match_info->match_data));
+       g_assert (impl_match_info_matches (match_info));
 
-       return g_match_info_fetch_named_pos (match_info->match_info, name, start_pos, end_pos);
+       return impl_match_info_matches (match_info);
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]