[vte/wip/regex-builtins: 94/94] lib: Add builtin regexes
- From: Christian Persch <chpe src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [vte/wip/regex-builtins: 94/94] lib: Add builtin regexes
- Date: Thu, 21 Nov 2019 21:06:32 +0000 (UTC)
commit 74b11d087969f9fb4ca9a64a37fe4c7b6247e57f
Author: Christian Persch <chpe src gnome org>
Date: Thu Nov 21 22:05:25 2019 +0100
lib: Add builtin regexes
Add builtin regexes to recognise URLs, copied from gnome-terminal.
https://gitlab.gnome.org/GNOME/vte/issues/114
doc/reference/vte-sections.txt | 5 +
meson.build | 2 +-
src/app/app.cc | 41 +++-
src/fwd.hh | 1 +
src/meson.build | 24 +-
src/regex-builtins-patterns.hh | 151 ++++++++++++
src/regex-builtins.cc | 104 +++++++++
src/regex-builtins.hh | 79 +++++++
src/regex-test.cc | 507 +++++++++++++++++++++++++++++++++++++++++
src/vte.cc | 39 +++-
src/vte/vteenums.h | 14 ++
src/vte/vteterminal.h | 4 +
src/vtedefines.hh | 1 +
src/vtegtk.cc | 78 ++++++-
src/vteinternal.hh | 6 +
src/vteregex.cc | 6 +-
16 files changed, 1036 insertions(+), 26 deletions(-)
---
diff --git a/doc/reference/vte-sections.txt b/doc/reference/vte-sections.txt
index 5ce2093e..ffef439f 100644
--- a/doc/reference/vte-sections.txt
+++ b/doc/reference/vte-sections.txt
@@ -6,6 +6,7 @@ VteCursorBlinkMode
VteCursorShape
VteEraseBinding
VteTextBlinkMode
+VteBuiltinMatchTags
VteFormat
VteWriteFlags
VteSelectionFunc
@@ -71,8 +72,10 @@ vte_terminal_get_text_range
vte_terminal_get_cursor_position
vte_terminal_hyperlink_check_event
vte_terminal_match_add_regex
+vte_terminal_match_add_builtins
vte_terminal_match_remove
vte_terminal_match_remove_all
+vte_terminal_match_remove_builtins
vte_terminal_match_check
vte_terminal_match_check_event
vte_terminal_match_set_cursor_name
@@ -118,6 +121,8 @@ VTE_TYPE_ERASE_BINDING
vte_erase_binding_get_type
VTE_TYPE_TEXT_BLINK_MODE
vte_text_blink_mode_get_type
+VTE_TYPE_BUILTIN_MATCH_TAGS
+vte_builtin_match_tags_get_type
VTE_TYPE_FORMAT
vte_format_get_type
VTE_TYPE_WRITE_FLAGS
diff --git a/meson.build b/meson.build
index c3b08772..4c70a2bc 100644
--- a/meson.build
+++ b/meson.build
@@ -33,7 +33,7 @@ project(
gtk3_req_version = '3.20.0'
gtk3_min_req_version = '3.18'
-gtk3_max_allowed_version = '3.20'
+gtk3_max_allowed_version = '3.24'
gtk4_req_version = '4.0.0'
fribidi_req_version = '1.0.0'
diff --git a/src/app/app.cc b/src/app/app.cc
index e554418e..a929eaba 100644
--- a/src/app/app.cc
+++ b/src/app/app.cc
@@ -995,8 +995,7 @@ struct _VteappWindowClass {
static GType vteapp_window_get_type(void);
static char const* const builtin_dingus[] = {
-
"(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?",
-
"(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\)
,\\\"]",
+ "(foo|bar|baz)+",
nullptr,
};
@@ -1397,6 +1396,29 @@ window_action_copy_match_cb(GSimpleAction* action,
gtk_clipboard_set_text(window->clipboard, str, len);
}
+/* Handler for the "open-uri" window action: asks GTK to open the URI that was
+ * passed as the action's string parameter, and reports a failure through
+ * verbose_printerr().
+ */
+static void
+window_action_open_uri_cb(GSimpleAction* action,
+                          GVariant* parameter,
+                          void* data)
+{
+        VteappWindow* const window = VTEAPP_WINDOW(data);
+        auto uri_len = size_t{};
+        auto const uri = g_variant_get_string(parameter, &uri_len);
+        auto const timestamp = gtk_get_current_event_time();
+        GError* error{nullptr};
+
+        /* gtk_show_uri() is only used when building against GTK older than 3.22 */
+#if GTK_CHECK_VERSION(3, 22, 0)
+        auto const opened = gtk_show_uri_on_window(GTK_WINDOW(window),
+                                                   uri,
+                                                   timestamp,
+                                                   &error);
+#else
+        auto const opened = gtk_show_uri(gtk_widget_get_screen(GTK_WIDGET(window)),
+                                         uri,
+                                         timestamp,
+                                         &error);
+#endif
+        if (opened)
+                return;
+
+        verbose_printerr("Opening URI \"%s\" failed: %s\n", uri, error->message);
+        g_error_free(error);
+}
+
static void
window_action_paste_cb(GSimpleAction* action,
GVariant* parameter,
@@ -1477,7 +1499,8 @@ vteapp_window_show_context_menu(VteappWindow* window,
g_object_unref(item);
}
- auto match = vte_terminal_match_check_event(window->terminal, event, nullptr);
+ auto tag = int{-1};
+ auto match = vte_terminal_match_check_event(window->terminal, event, &tag);
if (match != nullptr) {
verbose_print("Match: %s\n", match);
GVariant* target = g_variant_new_string(match);
@@ -1486,6 +1509,13 @@ vteapp_window_show_context_menu(VteappWindow* window,
g_menu_append_item(menu, item);
g_object_unref(item);
}
+ if (match != nullptr && tag == VTE_BUILTIN_MATCH_TAG_URI) {
+ GVariant* target = g_variant_new_string(match);
+ auto item = g_menu_item_new("_Open URI", nullptr);
+ g_menu_item_set_action_and_target_value(item, "win.open-uri", target);
+ g_menu_append_item(menu, item);
+ g_object_unref(item);
+ }
/* Test extra match API */
static const char extra_pattern[] = "(\\d+)\\s*(\\w+)";
@@ -1839,6 +1869,7 @@ vteapp_window_constructed(GObject *object)
GActionEntry const entries[] = {
{ "copy", window_action_copy_cb, "s", nullptr, nullptr },
{ "copy-match", window_action_copy_match_cb, "s", nullptr, nullptr },
+ { "open-uri", window_action_open_uri_cb, "s", nullptr, nullptr },
{ "paste", window_action_paste_cb, nullptr, nullptr, nullptr },
{ "reset", window_action_reset_cb, "b", nullptr, nullptr },
{ "find", window_action_find_cb, nullptr, nullptr, nullptr },
@@ -1960,8 +1991,10 @@ vteapp_window_constructed(GObject *object)
gtk_widget_set_opacity (GTK_WIDGET (window), options.get_alpha());
/* Dingus */
- if (!options.no_builtin_dingus)
+ if (!options.no_builtin_dingus) {
vteapp_window_add_dingus(window, builtin_dingus);
+ vte_terminal_match_add_builtins(window->terminal);
+ }
if (options.dingus != nullptr)
vteapp_window_add_dingus(window, options.dingus);
diff --git a/src/fwd.hh b/src/fwd.hh
index 11525a98..6d52f8a7 100644
--- a/src/fwd.hh
+++ b/src/fwd.hh
@@ -22,6 +22,7 @@ namespace vte {
namespace base {
class Pty;
+class RegexBuiltins;
} // namespace base
diff --git a/src/meson.build b/src/meson.build
index 78ee6341..775367f6 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -72,7 +72,12 @@ pty_sources = files(
regex_sources = files(
'regex.cc',
- 'regex.hh'
+ 'regex.hh',
+ 'regex-builtins.cc',
+ 'regex-builtins.hh',
+ 'regex-builtins-patterns.hh',
+ 'vteregex.cc',
+ 'vteregexinternal.hh',
)
utf8_sources = files(
@@ -90,6 +95,7 @@ libvte_common_sources = debug_sources + modes_sources + parser_sources + pty_sou
'chunk.cc',
'chunk.hh',
'color-triple.hh',
+ 'fwd.hh',
'keymap.cc',
'keymap.h',
'reaper.cc',
@@ -109,8 +115,6 @@ libvte_common_sources = debug_sources + modes_sources + parser_sources + pty_sou
'vtegtk.hh',
'vteinternal.hh',
'vtepcre2.h',
- 'vteregex.cc',
- 'vteregexinternal.hh',
'vterowdata.cc',
'vterowdata.hh',
'vteseq.cc',
@@ -405,6 +409,19 @@ test_refptr = executable(
install: false,
)
+test_regex_sources = regex_sources + files(
+ 'regex-test.cc',
+)
+
+test_regex = executable(
+ 'test-regex',
+ sources: test_regex_sources,
+ dependencies: [glib_dep, gobject_dep, pcre2_dep,],
+ cpp_args: ['-DVTE_COMPILATION',],
+ include_directories: top_inc,
+ install: false,
+)
+
test_tabstops_sources = files(
'tabstops-test.cc',
'tabstops.hh'
@@ -472,6 +489,7 @@ test_units = [
['parser', test_parser],
['reaper', test_reaper],
['refptr', test_refptr],
+ ['regex', test_regex],
['stream', test_stream],
['tabstops', test_tabstops],
['utf8', test_utf8],
diff --git a/src/regex-builtins-patterns.hh b/src/regex-builtins-patterns.hh
new file mode 100644
index 00000000..cc531b50
--- /dev/null
+++ b/src/regex-builtins-patterns.hh
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object is
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#pragma once
+
+/* Lookbehind to see if there's a preceding apostrophe */
+#define APOS_START_DEF "(?<APOS_START>(?<='))?"
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":",
"" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the
string "256".
+ The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9]
) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match
"192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in
"dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer
unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT "(?x: :: (?&S6C){0,5} )" /* includes "::<ipv4>" */
+#define IPV6V4_MID "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | "
IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " )
(?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9]
) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com",
"[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid
tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+ and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} |
655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+/* Omit the parentheses, see below */
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
+/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
+#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
+#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
+
+/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug
763980. */
+#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) |
\\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
+/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[
(?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS
") )? )))"
+
+#define URLPATH "(?x: /(?&PATH) )?"
+#define VOIP_PATH "(?x: [;?](?&PATH) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
+
+#define REGEX_URL_AS_IS DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* TODO: also support file:/etc/passwd */
+#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience
(so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT
URLPATH
+#define REGEX_URL_VOIP DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
diff --git a/src/regex-builtins.cc b/src/regex-builtins.cc
new file mode 100644
index 00000000..2819c518
--- /dev/null
+++ b/src/regex-builtins.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+
+#include "regex.hh"
+#include "regex-builtins.hh"
+#include "regex-builtins-patterns.hh"
+
+#include "vtepcre2.h"
+
+namespace vte::base {
+
+/* Compiles every builtin pattern. Patterns that compile successfully are
+ * stored, together with their internal tag, in m_builtins by compile_builtin().
+ */
+RegexBuiltins::RegexBuiltins()
+{
+        struct BuiltinPattern {
+                char const* pattern;
+                InternalBuiltinsTags tag;
+        };
+
+        static constexpr BuiltinPattern const builtin_patterns[] = {
+                { REGEX_URL_AS_IS, InternalBuiltinsTags::eURL      },
+                { REGEX_URL_HTTP,  InternalBuiltinsTags::eHTTP     },
+                { REGEX_URL_FILE,  InternalBuiltinsTags::eFILE     },
+                { REGEX_URL_VOIP,  InternalBuiltinsTags::eVOIP     },
+                { REGEX_EMAIL,     InternalBuiltinsTags::eEMAIL    },
+                { REGEX_NEWS_MAN,  InternalBuiltinsTags::eNEWS_MAN },
+        };
+
+        m_builtins.reserve(8);
+
+        for (auto const& entry : builtin_patterns)
+                compile_builtin(entry.pattern, entry.tag);
+}
+
+/* Compiles @pattern for matching and, on success, appends the compiled regex
+ * to m_builtins together with @tag (stored as int).
+ *
+ * A compile failure is reported on stderr and the pattern is skipped. A JIT
+ * failure is also reported on stderr, but is not fatal: the regex is still
+ * added.
+ *
+ * The exception specification matches the in-class declaration, which
+ * declares this function noexcept.
+ */
+void
+RegexBuiltins::compile_builtin(char const* pattern,
+                               InternalBuiltinsTags tag) noexcept
+{
+        GError* error{nullptr};
+        auto regex = Regex::compile(Regex::Purpose::eMatch,
+                                    pattern,
+                                    PCRE2_ZERO_TERMINATED,
+                                    PCRE2_UTF | PCRE2_UCP | PCRE2_NO_UTF_CHECK | PCRE2_MULTILINE,
+                                    &error);
+        if (error) {
+                g_printerr("Failed to compile builtin regex %d: %s\n", int(tag), error->message);
+                g_error_free(error);
+                return;
+        }
+
+        /* JIT failures are only reported; g_clear_error() resets @error so
+         * that it can be reused for the next call.
+         */
+        regex->jit(PCRE2_JIT_COMPLETE, &error);
+        if (error) {
+                g_printerr("Failed to complete JIT compile builtin regex %d: %s\n", int(tag), error->message);
+                g_clear_error(&error);
+        }
+
+        regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error);
+        if (error) {
+                g_printerr("Failed to partial-soft JIT compile builtin regex %d: %s\n", int(tag), error->message);
+                g_clear_error(&error);
+        }
+
+        m_builtins.emplace_back(take_ref(regex), int(tag));
+}
+
+/* Maps an internal builtin tag to the public tag. Where the matched text is
+ * not yet a complete URI, @match is replaced by a newly allocated string with
+ * the missing scheme prepended, and the previous string is freed.
+ *
+ * Returns: int(BuiltinsTags::eURI) for every internal builtin tag, or -1 if
+ *   @tag is not one of the internal builtin tags
+ */
+int
+RegexBuiltins::transform_match(char*& match,
+                               int tag) const noexcept
+{
+        auto const internal_tag = InternalBuiltinsTags(tag);
+        constexpr auto uri_tag = int(BuiltinsTags::eURI);
+
+        if (internal_tag == InternalBuiltinsTags::eHTTP) {
+                /* Always prefix the match with the http scheme */
+                auto const with_scheme = g_strdup_printf("http://%s", match);
+                g_free(match);
+                match = with_scheme;
+                return uri_tag;
+        }
+
+        if (internal_tag == InternalBuiltinsTags::eEMAIL) {
+                /* Only add the scheme when the match does not already start with it */
+                auto const has_scheme = g_ascii_strncasecmp ("mailto:", match, 7) == 0;
+                if (!has_scheme) {
+                        auto const with_scheme = g_strdup_printf ("mailto:%s", match);
+                        g_free(match);
+                        match = with_scheme;
+                }
+                return uri_tag;
+        }
+
+        switch (internal_tag) {
+        case InternalBuiltinsTags::eURL:
+        case InternalBuiltinsTags::eFILE:
+        case InternalBuiltinsTags::eNEWS_MAN:
+        case InternalBuiltinsTags::eVOIP:
+                /* No transformation */
+                return uri_tag;
+
+        default:
+                return -1;
+        }
+}
+
+} // namespace vte::base
diff --git a/src/regex-builtins.hh b/src/regex-builtins.hh
new file mode 100644
index 00000000..83fed8e8
--- /dev/null
+++ b/src/regex-builtins.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "regex.hh"
+#include "refptr.hh"
+
+namespace vte {
+
+namespace base {
+
+class RegexBuiltins { /* Holds the compiled builtin regexes; instances are obtained through get(). */
+private:
+ static inline std::weak_ptr<RegexBuiltins> s_weak_ptr{}; /* non-owning handle to the instance last created by get() */
+
+ std::vector<std::pair<RefPtr<Regex>, int>> m_builtins{}; /* (compiled regex, internal tag stored as int) pairs */
+
+ enum class InternalBuiltinsTags : int { /* one tag per builtin pattern; transform_match() maps them to BuiltinsTags */
+ eURL = -2,
+ eHTTP = -3,
+ eFILE = -4,
+ eVOIP = -5,
+ eEMAIL = -6,
+ eNEWS_MAN = -7
+ };
+
+ void compile_builtin(char const* pattern,
+ InternalBuiltinsTags tag) noexcept; /* compiles @pattern and stores it with @tag in m_builtins */
+
+public:
+ // these must have the same values as the public VteBuiltinMatchTags
+ enum class BuiltinsTags : int {
+ eURI = -2
+ };
+
+ RegexBuiltins(); /* compiles all builtin patterns */
+ ~RegexBuiltins() { }
+ RegexBuiltins(RegexBuiltins const&) = delete; /* neither copyable nor movable */
+ RegexBuiltins(RegexBuiltins&&) = delete;
+
+ RegexBuiltins& operator= (RegexBuiltins const&) = delete;
+ RegexBuiltins& operator= (RegexBuiltins&&) = delete;
+
+ inline constexpr auto const& builtins() const noexcept { return m_builtins; } /* read-only access to the compiled builtins */
+
+ int transform_match(char*& match,
+ int tag) const noexcept; /* may replace @match with a newly allocated string; returns a BuiltinsTags value as int, or -1 */
+
+ static std::shared_ptr<RegexBuiltins> get() /* returns the instance that is still alive, or creates a new one */
+ {
+ auto inst = s_weak_ptr.lock();
+ if (!inst) /* no live instance: create one and remember it weakly. NOTE(review): no locking here - confirm callers are single-threaded */
+ s_weak_ptr = inst = std::make_shared<RegexBuiltins>();
+ return inst;
+ }
+};
+
+} // namespace base
+
+} // namespace vte
diff --git a/src/regex-test.cc b/src/regex-test.cc
new file mode 100644
index 00000000..fb759e1d
--- /dev/null
+++ b/src/regex-test.cc
@@ -0,0 +1,507 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <locale.h>
+#include <glib.h>
+
+#include <cstdint>
+
+#include "regex.hh"
+#include "regex-builtins-patterns.hh"
+
+/* Shorthand for expecting the pattern to match the entire input string */
+#define ENTIRE ((char *) 1)
+
+/* Creates a PCRE2 match context with a match limit and a recursion limit set.
+ * The returned context is released by the caller with
+ * pcre2_match_context_free_8().
+ */
+static pcre2_match_context_8*
+create_match_context()
+{
+        auto const context = pcre2_match_context_create_8(nullptr /* general context */);
+
+        /* Both limits should be plenty */
+        pcre2_set_match_limit_8(context, 65536);
+        pcre2_set_recursion_limit_8(context, 64);
+
+        return context;
+}
+
+/* Runs @match_fn with @regex on @subject and returns the text of the overall
+ * match as a newly allocated string (free with g_free()).
+ *
+ * Returns nullptr when there is no match, when the overall match is unset, or
+ * when matching failed; in the latter case the PCRE2 error is printed to
+ * stderr.
+ */
+static char*
+get_match(decltype(&pcre2_match_8) match_fn,
+          vte::base::Regex const* regex,
+          uint32_t match_flags,
+          char const* subject)
+{
+        auto context = create_match_context();
+        auto data = pcre2_match_data_create_8(256 /* should be plenty */,
+                                              nullptr /* general context */);
+
+        auto const rv = match_fn(regex->code(),
+                                 (PCRE2_SPTR8)subject,
+                                 strlen(subject),
+                                 0, /* start offset */
+                                 match_flags | PCRE2_NO_UTF_CHECK,
+                                 data,
+                                 context);
+
+        char* result = nullptr;
+        if (rv >= 0) {
+                /* Has match: the first ovector pair delimits the whole match */
+                auto const* ovector = pcre2_get_ovector_pointer_8(data);
+                auto const start = ovector[0];
+                auto const end = ovector[1];
+                if (start != PCRE2_UNSET && end != PCRE2_UNSET)
+                        result = g_strndup(subject + start, end - start);
+        } else if (rv != PCRE2_ERROR_NOMATCH) {
+                /* A real error, as opposed to simply not matching */
+                PCRE2_UCHAR8 buf[256];
+                int n = pcre2_get_error_message_8(rv, buf, sizeof(buf));
+                g_assert_true(n >= 0);
+                g_printerr("PCRE2 error %d: %s\n", rv, buf);
+        }
+
+        pcre2_match_data_free_8(data);
+        pcre2_match_context_free_8(context);
+
+        return result;
+}
+
+struct TestData { /* Parameters of one match test, as registered by assert_match() */
+ char const* pattern; /* regex pattern to compile */
+ char const* string; /* subject string the pattern is matched against */
+ char const* expected; /* expected matched text, or nullptr when no match is expected */
+ uint32_t match_flags; /* extra flags passed to the PCRE2 match call */
+};
+
+static void
+assert_match_test(void const* ptr)
+{ /* Test body: compiles data->pattern, matches it against data->string and asserts that the matched text equals data->expected. */
+ TestData const* data = (TestData*)ptr; /* @ptr is the TestData registered by assert_match() */
+
+ GError *error{nullptr};
+ auto regex = vte::base::Regex::compile(vte::base::Regex::Purpose::eMatch,
+ data->pattern,
+ PCRE2_ZERO_TERMINATED,
+ PCRE2_UTF | PCRE2_NO_UTF_CHECK |
+ PCRE2_UCP |
+ PCRE2_MULTILINE,
+ &error);
+ g_assert_no_error(error);
+ g_assert_nonnull(regex);
+
+ auto match = get_match(&pcre2_match_8, regex, data->match_flags, data->string); /* non-JIT match */
+
+ g_assert_cmpstr(match, ==, data->expected); /* both being nullptr counts as equal, i.e. "no match expected" */
+ g_free(match);
+
+ if (vte::base::Regex::check_pcre_config_jit()) { /* additionally require that JIT compilation succeeds; comparing the JIT match result is disabled below */
+ regex->jit(PCRE2_JIT_COMPLETE, &error);
+ g_assert_no_error(error);
+ regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error);
+ g_assert_no_error(error);
+
+#if 0
+ // FIXME: some JIT matches are wrong, why?
+ match = get_match(&pcre2_jit_match_8, regex, data->match_flags, data->string);
+ #if 1
+ if (match != data->expected &&
+ g_strcmp0(match, data->expected) != 0)
+ g_printerr("JIT match: pattern: \"%s\"\n"
+ " flags: %08x\n"
+ " subject: \"%s\"\n"
+ " match: \"%s\"\n"
+ " expected: \"%s\"\n\n",
+ data->pattern,
+ data->match_flags,
+ data->string,
+ match ? match : "(nil)",
+ data->expected ? data->expected : "(nil)");
+ #else
+ g_assert_cmpstr(match, ==, data->expected);
+ #endif
+ g_free(match);
+#endif
+ }
+
+ regex->unref(); /* release the regex compiled above */
+}
+
+/* Registers a test which checks that matching @pattern against @string yields
+ * @expected. Pass ENTIRE as @expected to require that the whole of @string is
+ * matched, or nullptr to require that there is no match.
+ *
+ * The test path is derived from @line, which defaults to the line of the
+ * caller.
+ */
+static void
+assert_match(char const* pattern,
+             char const* string,
+             char const* expected,
+             uint32_t match_flags = 0u,
+             int line = __builtin_LINE())
+{
+        auto const expected_text = (expected == ENTIRE) ? string : expected;
+
+        /* Ownership passes to the test framework, which frees it with g_free() */
+        auto const test_data = g_new(TestData, 1);
+        *test_data = TestData{pattern, string, expected_text, match_flags};
+
+        auto const test_path = g_strdup_printf("/vte/regex/builtins/%d", line);
+        g_test_add_data_func_full(test_path, test_data, assert_match_test, (GDestroyNotify)g_free);
+        g_free(test_path);
+}
+
+/* Same as assert_match(), but the match is performed with PCRE2_ANCHORED. */
+static void
+assert_match_anchored(char const* pattern,
+                      char const* string,
+                      char const* expected,
+                      int line = __builtin_LINE())
+{
+        uint32_t const anchored_flags = PCRE2_ANCHORED;
+
+        assert_match(pattern, string, expected, anchored_flags, line);
+}
+
+static void
+setup_regex_builtins_tests(void)
+{
+ /* SCHEME is case insensitive */
+ assert_match_anchored (SCHEME, "http", ENTIRE);
+ assert_match_anchored (SCHEME, "HTTPS", ENTIRE);
+
+ /* USER is nonempty, alphanumeric, dot, plus and dash */
+ assert_match_anchored (USER, "", NULL);
+ assert_match_anchored (USER, "dr.john-smith", ENTIRE);
+ assert_match_anchored (USER, "abc+def@ghi", "abc+def");
+
+ /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */
+ assert_match_anchored (PASS, "", ENTIRE);
+ assert_match_anchored (PASS, "nocolon", "");
+ assert_match_anchored (PASS, ":s3cr3T", ENTIRE);
+ assert_match_anchored (PASS, ":$?#@host", ":$?#");
+
+ /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME1, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a-b.c-d", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a_b", "a"); /* TODO: can/should we totally abort here?
*/
+ assert_match_anchored (HOSTNAME1, "déjà-vu.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "➡.ws", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "12", NULL);
+ assert_match_anchored (HOSTNAME1, "12.34", NULL);
+ assert_match_anchored (HOSTNAME1, "12.ab", ENTIRE);
+// assert_match_anchored (HOSTNAME1, "ab.12", NULL); /* errr... could we fail here?? */
+
+ /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME2, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "example", NULL);
+ assert_match_anchored (HOSTNAME2, "12", NULL);
+ assert_match_anchored (HOSTNAME2, "12.34", NULL);
+ assert_match_anchored (HOSTNAME2, "12.ab", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "ab.12", NULL);
+// assert_match_anchored (HOSTNAME2, "ab.cd.12", NULL); /* errr... could we fail here?? */
+
+ /* IPv4 segment (number between 0 and 255) */
+ assert_match_anchored (DEFS "(?&S4)", "0", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "1", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "9", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "10", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "99", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "100", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "200", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "250", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "255", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "256", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "260", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "300", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "1000", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "", NULL);
+ assert_match_anchored (DEFS "(?&S4)", "a1b", NULL);
+
+ /* IPv4 addresses */
+ assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", NULL);
+ assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5", "1.2.3.4"); /* we could also bail out and not
match at all */
+
+ /* IPv6 addresses */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:::22", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead::beef", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "faded::bee", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "live::pork", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", NULL);
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77", NULL); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77", ENTIRE); /* :: in the middle
*/
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::", ENTIRE); /* :: at the end */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::", ENTIRE); /* :: only */
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1", NULL); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1", ENTIRE); /* :: in the imddle
*/
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1", ENTIRE); /* :: at the
end(ish) */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1", NULL);
+ assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1", ENTIRE); /* :: only(ish) */
+
+ /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */
+ assert_match_anchored (DEFS URL_HOST, "example", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", NULL);
+ assert_match_anchored (DEFS URL_HOST, "dead::be:ef", "dead"); /* TODO: can/should we totally abort
here? */
+ assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */
+ assert_match_anchored (DEFS EMAIL_HOST, "example", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef", NULL);
+ assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* Number between 1 and 65535 (helper for port) */
+ assert_match_anchored (N_1_65535, "0", NULL);
+ assert_match_anchored (N_1_65535, "1", ENTIRE);
+ assert_match_anchored (N_1_65535, "10", ENTIRE);
+ assert_match_anchored (N_1_65535, "100", ENTIRE);
+ assert_match_anchored (N_1_65535, "1000", ENTIRE);
+ assert_match_anchored (N_1_65535, "10000", ENTIRE);
+ assert_match_anchored (N_1_65535, "60000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65500", ENTIRE);
+ assert_match_anchored (N_1_65535, "65530", ENTIRE);
+ assert_match_anchored (N_1_65535, "65535", ENTIRE);
+ assert_match_anchored (N_1_65535, "65536", NULL);
+ assert_match_anchored (N_1_65535, "65540", NULL);
+ assert_match_anchored (N_1_65535, "65600", NULL);
+ assert_match_anchored (N_1_65535, "66000", NULL);
+ assert_match_anchored (N_1_65535, "70000", NULL);
+ assert_match_anchored (N_1_65535, "100000", NULL);
+ assert_match_anchored (N_1_65535, "", NULL);
+ assert_match_anchored (N_1_65535, "a1b", NULL);
+
+ /* PORT is an optional colon-prefixed value */
+ assert_match_anchored (PORT, "", ENTIRE);
+ assert_match_anchored (PORT, ":1", ENTIRE);
+ assert_match_anchored (PORT, ":65535", ENTIRE);
+ assert_match_anchored (PORT, ":65536", ""); /* TODO: can/should we totally abort here? */
+
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
+ /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
+ assert_match_anchored (DEFS URLPATH, "/ab/cd", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html");
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring");
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))");
+ assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)");
+ assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)");
+ assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a");
+ assert_match_anchored (DEFS URLPATH, "/.", "/");
+ assert_match_anchored (DEFS URLPATH, "/(.", "/");
+ assert_match_anchored (DEFS URLPATH, "/).", "/");
+ assert_match_anchored (DEFS URLPATH, "/().", "/()");
+ assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/php?param[]=value1¶m[]=value2", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/foo?param1[index1]=value1¶m2[index2]=value2", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[[[]][]]", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[([])]([()])", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/([()])[([])]", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[(])", "/");
+ assert_match_anchored (DEFS URLPATH, "/([)]", "/");
+
+
+ /* Put the components together and test the big picture */
+
+ assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo", NULL);
+ assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details", "http://example.com");
+ assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "Trailing comma http://foo/bar,baz,", "http://foo/bar,baz");
+ assert_match (REGEX_URL_AS_IS, "Trailing semicolon http://foo/bar;baz;", "http://foo/bar;baz");
+ assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>",
"http://foo.bar/asdf.qwer.html");
+ assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.", "http://192.168.1.1");
+ assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.",
"http://www.gnu.org/licenses/");
+ assert_match (REGEX_URL_AS_IS, "<a href=\"http://foo/bar\">foo</a>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<a href='http://foo/bar'>foo</a>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<url>http://foo/bar</url>", "http://foo/bar");
+
+ assert_match (REGEX_URL_AS_IS, "http://", NULL);
+ assert_match (REGEX_URL_AS_IS, "http://a", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.", "http://aa");
+ assert_match (REGEX_URL_AS_IS, "http://aa.b", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/c", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:", "http://1.2.3.4"); /* TODO:
can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6", "https://dead"); /* ditto */
+ assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/", NULL);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000", "http://1.2.3.4"); /* TODO:
can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", NULL);
+
+ /* Username, password */
+ assert_match (REGEX_URL_AS_IS, "http://joe example com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://user.name:sec ret host name", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host", "http://invalidusername");
+
+ assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http:///foo", NULL);
+
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)",
"https://en.wikipedia.org/wiki/The_Offspring");
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))",
"https://en.wikipedia.org/wiki/The_Offspring_(album)");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a");
+
+ /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see bug
448044. */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">",
"https://en.wikipedia.org/wiki/Moore's_law");
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">",
"https://en.wikipedia.org/wiki/Cryin'");
+ assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>",
"https://en.wikipedia.org/wiki/Aerosmith");
+
+ /* No scheme */
+ assert_match (REGEX_URL_HTTP, "www.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ", ENTIRE); /* FIXME if no scheme is given and url
starts with ftp, can we make the protocol ftp instead of http? */
+ assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz", ENTIRE);
+// assert_match (REGEX_URL_HTTP, "ftp.123/baz", NULL); /* errr... could we fail here?? */
+ assert_match (REGEX_URL_HTTP, "foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz", NULL);
+ assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz", NULL);
+
+ /* file:/ or file://(hostname)?/ */
+ assert_match (REGEX_URL_FILE, "file:", NULL);
+ assert_match (REGEX_URL_FILE, "file:/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://", NULL);
+ assert_match (REGEX_URL_FILE, "file:///", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////", NULL);
+ assert_match (REGEX_URL_FILE, "file:etc/passwd", NULL);
+ assert_match (REGEX_URL_FILE, "File:/etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "FILE:///etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////etc/passwd", NULL);
+ assert_match (REGEX_URL_FILE, "file://host.name", NULL);
+ assert_match (REGEX_URL_FILE, "file://host.name/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE);
+
+ assert_match (REGEX_URL_FILE, "See file:/.", "file:/");
+ assert_match (REGEX_URL_FILE, "See file:///.", "file:///");
+ assert_match (REGEX_URL_FILE, "See file:/lost+found.", "file:/lost+found");
+ assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found");
+
+ /* Email */
+ assert_match (REGEX_EMAIL, "Write to foo bar com.", "foo bar com");
+ assert_match (REGEX_EMAIL, "Write to <foo bar com>", "foo bar com");
+ assert_match (REGEX_EMAIL, "Write to mailto:foo bar com.", "mailto:foo bar com");
+ assert_match (REGEX_EMAIL, "Write to MAILTO:FOO BAR COM.", "MAILTO:FOO BAR COM");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]", "foo@[1.2.3.4]");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]", NULL);
+ assert_match (REGEX_EMAIL, "Write to foo@[1::2345]", "foo@[1::2345]");
+ assert_match (REGEX_EMAIL, "Write to foo@[dead::beef]", "foo@[dead::beef]");
+ assert_match (REGEX_EMAIL, "Write to foo@1.2.3.4", NULL);
+ assert_match (REGEX_EMAIL, "Write to foo@1.2.3.456", NULL);
+ assert_match (REGEX_EMAIL, "Write to foo@1::2345", NULL);
+ assert_match (REGEX_EMAIL, "Write to foo@dead::beef", NULL);
+ assert_match (REGEX_EMAIL, "<baz email=\"foo bar com\"/>", "foo bar com");
+ assert_match (REGEX_EMAIL, "<baz email='foo bar com'/>", "foo bar com");
+ assert_match (REGEX_EMAIL, "<email>foo bar com</email>", "foo bar com");
+
+ /* Sip, examples from rfc 3261 */
+ assert_match (REGEX_URL_VOIP, "sip:alice atlanta com;maddr=239.255.255.1;ttl=15", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice atlanta com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice:secretword atlanta com;transport=tcp", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:alice atlanta com?subject=project%20x&priority=urgent", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234 gateway com;user=phone", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:1212 gateway com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday atlanta com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.",
"sip:alice@192.0.2.4");
+
+ /* Extremely long match, bug 770147 */
+ assert_match (REGEX_URL_AS_IS, "http://www.example.com/ThisPathConsistsOfMoreThan1024Characters"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+
"1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890",
ENTIRE);
+}
+
+/* Test case registered as "/vte/regex/unicode": asserts that
+ * vte::base::Regex::check_pcre_config_unicode() returns true and that it
+ * leaves the GError unset.
+ * NOTE(review): presumably this verifies that the PCRE2 library was built
+ * with Unicode support -- confirm against the Regex implementation.
+ */
+static void
+test_regex_unicode(void)
+{
+ GError* error{nullptr};
+ g_assert_true(vte::base::Regex::check_pcre_config_unicode(&error));
+ g_assert_no_error(error);
+}
+
+/* Entry point of the regex test program: initialises the GLib test
+ * framework, registers the test cases and runs them.
+ * Returns the value of g_test_run().
+ */
+int
+main(int argc,
+ char* argv[])
+{
+ /* Use the locale configured in the environment. */
+ setlocale(LC_ALL, "");
+
+ g_test_init(&argc, &argv, nullptr);
+
+ g_test_add_func("/vte/regex/unicode", test_regex_unicode);
+
+ /* Registers the builtin-regex test cases (defined elsewhere in this file). */
+ setup_regex_builtins_tests();
+
+ return g_test_run();
+}
diff --git a/src/vte.cc b/src/vte.cc
index 48ac2205..37c41d43 100644
--- a/src/vte.cc
+++ b/src/vte.cc
@@ -65,10 +65,13 @@
#include "vtepty.h"
#include "vtegtk.hh"
+#include "regex-builtins.hh"
+
#ifdef WITH_A11Y
#include "vteaccess.h"
#endif
+#include <algorithm>
#include <new> /* placement new */
using namespace std::literals;
@@ -1021,6 +1024,28 @@ Terminal::regex_match_remove(int tag) noexcept
match_regexes_writable().erase(i);
}
+/*
+ * Terminal::regex_match_add_builtins:
+ *
+ * Appends one entry per builtin regex to the list of match regexes. Each
+ * entry is created with match flags 0, the VTE_MATCH_BUILTINS_CURSOR cursor
+ * and the tag that the builtins object supplies for that regex.
+ */
+void
+Terminal::regex_match_add_builtins() noexcept
+{
+        auto& regex_list = match_regexes_writable();
+
+        /* Acquire the builtins object on first use; later calls reuse it. */
+        if (!m_match_regex_builtins) {
+                m_match_regex_builtins = vte::base::RegexBuiltins::get();
+        }
+
+        for (auto const& [builtin_regex, builtin_tag] : m_match_regex_builtins->builtins()) {
+                regex_list.emplace_back(make_ref(builtin_regex.get()),
+                                        0 /* match flags */,
+                                        VTE_MATCH_BUILTINS_CURSOR,
+                                        builtin_tag);
+        }
+}
+
+/*
+ * Terminal::regex_match_remove_builtins:
+ *
+ * Removes every builtin match regex from the list of match regexes.
+ * Builtin entries are recognised by their negative tag; regexes added
+ * through the public API carry nonnegative tags and are left in place.
+ */
+void
+Terminal::regex_match_remove_builtins() noexcept
+{
+        auto& match_regexes = match_regexes_writable();
+
+        /* std::remove_if() only shifts the surviving elements to the front and
+         * returns the new logical end; the container has to be shrunk with
+         * erase() afterwards, otherwise no entry is actually removed.
+         */
+        match_regexes.erase(std::remove_if(std::begin(match_regexes), std::end(match_regexes),
+                                           [](MatchRegex const& rem) { return rem.tag() < 0; }),
+                            std::end(match_regexes));
+}
+
/*
* match_rowcol_to_offset:
* @terminal:
@@ -1400,7 +1425,7 @@ Terminal::match_check_internal(vte::grid::column_t column,
char*
Terminal::regex_match_check(vte::grid::column_t column,
vte::grid::row_t row,
- int* tag)
+ int* tag_ptr)
{
long delta = m_screen->scroll_delta;
_vte_debug_print(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX,
@@ -1423,8 +1448,16 @@ Terminal::regex_match_check(vte::grid::column_t column,
_VTE_DEBUG_IF(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX) {
if (ret != NULL) g_printerr("Matched `%s'.\n", ret);
}
- if (tag != nullptr)
- *tag = (match != nullptr) ? match->tag() : -1;
+
+ int tag = -1;
+ if (match != nullptr) {
+ tag = match->tag();
+ if (tag < -1 && m_match_regex_builtins)
+ tag = m_match_regex_builtins->transform_match(ret, tag);
+ }
+
+ if (tag_ptr != nullptr)
+ *tag_ptr = tag;
return ret;
}
diff --git a/src/vte/vteenums.h b/src/vte/vteenums.h
index 84d07a0d..eddf8709 100644
--- a/src/vte/vteenums.h
+++ b/src/vte/vteenums.h
@@ -177,6 +177,20 @@ typedef enum {
VTE_FORMAT_HTML = 2
} VteFormat;
+/**
+ * VteBuiltinMatchTags:
+ * @VTE_BUILTIN_MATCH_TAG_URI: the match is an URI as recognised by
+ * the expressions added with vte_terminal_match_add_builtins()
+ *
+ * An enumeration of the tags that will be returned from
+ * vte_terminal_match_check_event() if a builtin expression matched.
+ *
+ * Since: 0.60
+ */
+typedef enum {
+ VTE_BUILTIN_MATCH_TAG_URI = -2
+} VteBuiltinMatchTags;
+
G_END_DECLS
#endif /* __VTE_VTE_ENUMS_H__ */
diff --git a/src/vte/vteterminal.h b/src/vte/vteterminal.h
index 31012518..52a40630 100644
--- a/src/vte/vteterminal.h
+++ b/src/vte/vteterminal.h
@@ -382,6 +382,8 @@ _VTE_PUBLIC
int vte_terminal_match_add_regex(VteTerminal *terminal,
VteRegex *regex,
guint32 flags) _VTE_GNUC_NONNULL(1) _VTE_GNUC_NONNULL(2);
+_VTE_PUBLIC
+void vte_terminal_match_add_builtins(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
/* Set the cursor to be used when the pointer is over a given match. */
_VTE_PUBLIC
void vte_terminal_match_set_cursor_name(VteTerminal *terminal,
@@ -391,6 +393,8 @@ _VTE_PUBLIC
void vte_terminal_match_remove(VteTerminal *terminal,
int tag) _VTE_GNUC_NONNULL(1);
_VTE_PUBLIC
+void vte_terminal_match_remove_builtins(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
+_VTE_PUBLIC
void vte_terminal_match_remove_all(VteTerminal *terminal) _VTE_GNUC_NONNULL(1);
/* Check if a given cell on the screen contains part of a matched string. If
diff --git a/src/vtedefines.hh b/src/vtedefines.hh
index df45ecf6..09ff9657 100644
--- a/src/vtedefines.hh
+++ b/src/vtedefines.hh
@@ -79,6 +79,7 @@
#define VTE_MOUSING_CURSOR GDK_LEFT_PTR
#define VTE_HYPERLINK_CURSOR GDK_HAND2
#define VTE_HYPERLINK_CURSOR_DEBUG GDK_SPIDER
+#define VTE_MATCH_BUILTINS_CURSOR GDK_HAND2
#define VTE_CHILD_INPUT_PRIORITY G_PRIORITY_DEFAULT_IDLE
#define VTE_CHILD_OUTPUT_PRIORITY G_PRIORITY_HIGH
#define VTE_MAX_INPUT_READ 0x1000
diff --git a/src/vtegtk.cc b/src/vtegtk.cc
index a20a0e75..1198b584 100644
--- a/src/vtegtk.cc
+++ b/src/vtegtk.cc
@@ -2111,6 +2111,9 @@ vte_terminal_paste_primary(VteTerminal *terminal)
* user moves the mouse cursor over a section of displayed text which matches
* this expression, the text will be highlighted.
*
+ * When vte_terminal_match_check_event() returns a match for this regex, the
+ * returned tag will be the return value of this function.
+ *
* Returns: an integer associated with this expression, or -1 if @gregex could not be
* transformed into a #VteRegex or @gflags were incompatible
*
@@ -2137,13 +2140,20 @@ vte_terminal_match_add_gregex(VteTerminal *terminal,
* vte_terminal_match_add_regex:
* @terminal: a #VteTerminal
* @regex: (transfer none): a #VteRegex
- * @flags: PCRE2 match flags, or 0
+ * @flags: PCRE2 match flags, or 0 to use the default flags
*
* Adds the regular expression @regex to the list of matching expressions. When the
* user moves the mouse cursor over a section of displayed text which matches
* this expression, the text will be highlighted.
*
- * Returns: an integer associated with this expression
+ * When vte_terminal_match_check_event() returns a match for this regex, the
+ * returned tag will be the return value of this function.
+ *
+ * Note that the default flags only contain PCRE2_UTF (and some flags for internal use);
+ * if you want to match unicode properties, you need to pass PCRE2_UCP in @flags.
+ * See man:pcre2_compile(3) for more information on available flags.
+ *
+ * Returns: a nonnegative integer associated with this expression
*
* Since: 0.46
*/
@@ -2164,6 +2174,30 @@ vte_terminal_match_add_regex(VteTerminal *terminal,
impl->regex_match_next_tag()).tag();
}
+/**
+ * vte_terminal_match_add_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Adds regular expressions to recognise URIs to the list of matching expressions.
+ * When the user moves the mouse cursor over a section of displayed text which matches
+ * one of these expressions, the text will be highlighted.
+ *
+ * When vte_terminal_match_check_event() returns a match for one of these
+ * expressions, the returned tag will be a value from #VteBuiltinMatchTags.
+ *
+ * Use vte_terminal_match_remove_builtins() or vte_terminal_match_remove_all() to remove
+ * the matching expressions added by this function.
+ *
+ * Since: 0.60
+ */
+void
+vte_terminal_match_add_builtins(VteTerminal *terminal)
+{
+ g_return_if_fail(VTE_IS_TERMINAL(terminal));
+
+ IMPL(terminal)->regex_match_add_builtins();
+}
+
/**
* vte_terminal_match_check:
* @terminal: a #VteTerminal
@@ -2204,20 +2238,24 @@ vte_terminal_match_check(VteTerminal *terminal,
*
* Checks if the text in and around the position of the event matches any of the
* regular expressions previously set using vte_terminal_match_add(). If a
- * match exists, the text string is returned and if @tag is not %NULL, the number
- * associated with the matched regular expression will be stored in @tag.
+ * match exists, the text string is returned.
*
- * If more than one regular expression has been set with
- * vte_terminal_match_add(), then expressions are checked in the order in
- * which they were added.
+ * If @tag is not %NULL, it will store the nonnegative integer associated with the
+ * matched regular expression if it was added with vte_terminal_match_add_regex(),
+ * or a negative number from #VteBuiltinMatchTags if the matched regular expression
+ * is one added with vte_terminal_match_add_builtins(), or -1 if there is
+ * no match.
+ *
+ * Expressions are checked in the order in which they were added, returning the
+ * first match.
*
* Returns: (transfer full) (nullable): a newly allocated string which matches one of the previously
* set regular expressions, or %NULL if there is no match
*/
char *
-vte_terminal_match_check_event(VteTerminal *terminal,
- GdkEvent *event,
- int *tag)
+vte_terminal_match_check_event(VteTerminal* terminal,
+ GdkEvent* event,
+ int* tag)
{
g_return_val_if_fail(VTE_IS_TERMINAL(terminal), FALSE);
return IMPL(terminal)->regex_match_check(event, tag);
@@ -2390,19 +2428,35 @@ vte_terminal_match_set_cursor_name(VteTerminal *terminal,
/**
* vte_terminal_match_remove:
* @terminal: a #VteTerminal
- * @tag: the tag of the regex to remove
+ * @tag: the nonnegative tag of the regex to remove
*
* Removes the regular expression which is associated with the given @tag from
* the list of expressions which the terminal will highlight when the user
* moves the mouse cursor over matching text.
*/
void
-vte_terminal_match_remove(VteTerminal *terminal, int tag)
+vte_terminal_match_remove(VteTerminal *terminal,
+ int tag)
{
g_return_if_fail(VTE_IS_TERMINAL(terminal));
IMPL(terminal)->regex_match_remove(tag);
}
+/**
+ * vte_terminal_match_remove_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Removes the regular expressions added with vte_terminal_match_add_builtins().
+ *
+ * Since: 0.60
+ */
+void
+vte_terminal_match_remove_builtins(VteTerminal *terminal)
+{
+ g_return_if_fail(VTE_IS_TERMINAL(terminal));
+ IMPL(terminal)->regex_match_remove_builtins();
+}
+
/**
* vte_terminal_match_remove_all:
* @terminal: a #VteTerminal
diff --git a/src/vteinternal.hh b/src/vteinternal.hh
index 2d580f12..bfb1010b 100644
--- a/src/vteinternal.hh
+++ b/src/vteinternal.hh
@@ -56,8 +56,10 @@
#include "chunk.hh"
#include "pty.hh"
#include "utf8.hh"
+#include "fwd.hh"
#include <list>
+#include <memory>
#include <queue>
#include <optional>
#include <string>
@@ -603,6 +605,10 @@ public:
return match_regexes_writable().emplace_back(std::forward<Args>(args)...);
}
+ /* Shared builtins object; empty until regex_match_add_builtins() fetches
+  * it via vte::base::RegexBuiltins::get(). regex_match_check() uses it to
+  * transform the tag of a builtin match. */
+ std::shared_ptr<vte::base::RegexBuiltins> m_match_regex_builtins{};
+ /* Appends the builtin regexes to the list of match regexes. */
+ void regex_match_add_builtins() noexcept;
+ /* Counterpart of regex_match_add_builtins(); meant to take the builtin
+  * (negative-tag) entries out of the list of match regexes again. */
+ void regex_match_remove_builtins() noexcept;
+
char* m_match_contents;
GArray* m_match_attributes;
char* m_match;
diff --git a/src/vteregex.cc b/src/vteregex.cc
index d95c77aa..39fc588b 100644
--- a/src/vteregex.cc
+++ b/src/vteregex.cc
@@ -24,9 +24,9 @@
#include "config.h"
-#include "vtemacros.h"
-#include "vteenums.h"
-#include "vteregex.h"
+#include "vte/vtemacros.h"
+#include "vte/vteenums.h"
+#include "vte/vteregex.h"
#include "vtepcre2.h"
#include "regex.hh"
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]