[glibmm] Make splitting tokens more robust in GtkDefs.pm and Enum.pm.
- From: Krzesimir Nowak <krnowak src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glibmm] Make splitting tokens more robust in GtkDefs.pm and Enum.pm.
- Date: Tue, 9 Mar 2010 18:23:01 +0000 (UTC)
commit ea4170fa33a528717492b3b399308150a6d81683
Author: Krzesimir Nowak <qdlacz gmail com>
Date: Tue Jan 5 18:25:24 2010 +0100
Make splitting tokens more robust in GtkDefs.pm and Enum.pm.
* tools/pm/Enum.pm: Written splitter for values in
(flags|enum)-extended defs.
* tools/pm/GtkDefs.pm: Written splitter for general defs.
tools/pm/Enum.pm | 111 ++++++++++++++++++++++++++++++++++++++--
tools/pm/GtkDefs.pm | 141 ++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 227 insertions(+), 25 deletions(-)
---
diff --git a/tools/pm/Enum.pm b/tools/pm/Enum.pm
index 47485e1..6096645 100644
--- a/tools/pm/Enum.pm
+++ b/tools/pm/Enum.pm
@@ -31,6 +31,106 @@ our @EXPORT_OK;
# bool mark;
# }
+#
+# private functions:
+#
+
+sub split_enum_tokens($)
+{
+ my ($token_string) = @_;
+ my @tokens = ();
+ # index of first opening double quotes between parens - beginning of a new
+ # token.
+ my $begin_token = 0;
+ # index of last closing double quotes between parens - end of a token.
+ my $end_token = 0;
+ # whether we are inside double quotes.
+ my $inside_dquotes = 0;
+ # whether we are inside double and then single quotes (for situations like
+ # "'"'").
+ my $inside_squotes = 0;
+ my $len = length($token_string);
+ # whether we found opening paren and we are expecting an opening double
+ # quotes.
+ my $near_begin = 0;
+ # count of double-quoted strings opened since the last closing paren.
+ my $dq_count = 0;
+ # whether previous char was a backslash - important only when being between
+ # double quotes.
+ my $backslash = 0;
+ for (my $index = 0; $index < $len; $index++)
+ {
+ my $char = substr($token_string, $index, 1);
+ if ($inside_dquotes)
+ {
+ # if previous char was backslash, then current char is not important -
+ # we are still inside double or double/single quotes anyway.
+ if ($backslash)
+ {
+ $backslash = 0;
+ }
+ # if current char is backslash.
+ elsif ($char eq '\\')
+ {
+ $backslash = 1;
+ }
+ # if current char is unescaped double quotes and we are not inside single
+ # ones - this means we are leaving the string. We mark this place as an end
+ # of the token in case we find a closing paren after this.
+ elsif ($char eq '"' and not $inside_squotes)
+ {
+ $inside_dquotes = 0;
+ $end_token = $index;
+ }
+ # if current char is a single quote then toggle the inside-single-quotes
+ # state.
+ elsif ($char eq '\'')
+ {
+ $inside_squotes = not $inside_squotes;
+ }
+ }
+ # current char is opening paren - this means we are near the beginning of
+ # a token (first double quotes after this paren).
+ elsif ($char eq '(')
+ {
+ $near_begin = 1;
+ }
+ # current char is closing paren - this means we reached end of a token at
+ # last closing double quotes.
+ elsif ($char eq ')')
+ {
+ my $token_len = $end_token + 1 - $begin_token;
+ my $token = substr($token_string, $begin_token, $token_len);
+ # there should be three pairs of double quotes.
+ if ($dq_count == 3)
+ {
+ push(@tokens, $token);
+ }
+ else
+ {
+ print STDERR "Wrong value statement while parsing ($token)\n";
+ }
+ $dq_count = 0;
+ }
+ # current char is opening double quotes - this can be a beginning of
+ # a token.
+ elsif ($char eq '"')
+ {
+ if ($near_begin)
+ {
+ $begin_token = $index;
+ $near_begin = 0;
+ }
+ $inside_dquotes = 1;
+ $dq_count++;
+ }
+ }
+ return @tokens;
+}
+
+#
+# end of private functions.
+#
sub new
{
@@ -82,13 +182,11 @@ sub parse_values($$)
my $elem_names = [];
my $elem_values = [];
my $common_prefix = undef;
-
- # break up the value statements
- foreach(split(/\s*'*[()]\s*/, $value))
+ # break up the value statements - it works with parens inside double quotes
+ # and handles triples like '("dq-token", "MY_SCANNER_DQ_TOKEN", "'"'").
+ foreach (split_enum_tokens($value))
{
- next if($_ eq "");
-
- if(/^"\S+" "(\S+)" "([^"]+)"$/)
+ if (/^"\S+" "(\S+)" "(.+)"$/)
{
my ($name, $value) = ($1, $2);
@@ -143,6 +241,7 @@ sub beautify_values($)
# Continuous? (Aliases to prior enum values are allowed.)
foreach my $value (@$elem_values)
{
+ return if ($value =~ /[G-WY-Zg-wy-z_]/);
return if(($value < $first) || ($value > $prev + 1));
$prev = $value;
}
diff --git a/tools/pm/GtkDefs.pm b/tools/pm/GtkDefs.pm
index 7c791b8..b0002e2 100644
--- a/tools/pm/GtkDefs.pm
+++ b/tools/pm/GtkDefs.pm
@@ -95,25 +95,31 @@ sub read_defs($$;$)
# break the tokens into lisp phrases up to three levels deep.
# WARNING: reading the following perl statement may induce seizures,
# please flush eyes with water immediately, and consult a mortician.
- my @tokens = split(
- m/(
- \(
- (?:
- [^()]*
- \(
- (?:
- [^()]*
- \(
- [^()]*
- \)
- )*
- [^()]*
- \)
- )*
- [^()]*
- \)
- )/x,
- read_file($path, $filename));
+ #
+ # this regexp is weak - it does not work on multiple and/or unpaired parens
+ # inside double quotes - those should never be considered. I replaced this
+ # splitting with my own function, which does the job very well - krnowak.
+# my @tokens = split(
+# m/(
+# \(
+# (?:
+# [^()]*
+# \(
+# (?:
+# [^()]*
+# \(
+# [^()]*
+# \)
+# )*
+# [^()]*
+# \)
+# )*
+# [^()]*
+# \)
+# )/x,
+# read_file($path, $filename));
+
+ my @tokens = split_tokens(read_file($path, $filename));
# scan through top level tokens
while ($#tokens > -1)
@@ -161,6 +167,103 @@ sub read_defs($$;$)
}
}
+sub split_tokens($)
+{
+ my ($token_string) = @_;
+ my @tokens = ();
+ # whether we are inside double quotes.
+ my $inside_dquotes = 0;
+ # whether we are inside double and then single quotes (for situations like
+ # "'"'").
+ my $inside_squotes = 0;
+ # number of yet unpaired opening parens.
+ my $parens = 0;
+ my $len = length($token_string);
+ # whether previous char was a backslash - important only when being between
+ # double quotes.
+ my $backslash = 0;
+ # index of first opening paren - beginning of a new token.
+ my $begin_token = 0;
+
+ for (my $index = 0; $index < $len; $index++)
+ {
+ my $char = substr($token_string, $index, 1);
+ # if we are inside double quotes.
+ if ($inside_dquotes)
+ {
+ # if previous char was backslash, then current char is not important -
+ # we are still inside double or double/single quotes anyway.
+ if ($backslash)
+ {
+ $backslash = 0;
+ }
+ # if current char is backslash.
+ elsif ($char eq '\\')
+ {
+ $backslash = 1;
+ }
+ # if current char is unescaped double quotes and we are not inside single
+ # ones - this means we are leaving the string.
+ elsif ($char eq '"' and not $inside_squotes)
+ {
+ $inside_dquotes = 0;
+ }
+ # if current char is unescaped single quote, then we have two cases:
+ # 1. it is just a plain apostrophe.
+ # 2. it is a piece of a C code:
+ # a) opening quotes,
+ # b) closing quotes.
+ # if a second quote follows closely (2 or 3 indexes away), then it is 2a,
+ # if 2a occurred earlier, then it is 2b.
+ # otherwise it is 1.
+ elsif ($char eq '\'')
+ {
+ # if we are already inside single quotes, it is 2b.
+ if ($inside_squotes)
+ {
+ $inside_squotes = 0;
+ }
+ else
+ {
+ # if there is a closing quote nearby, it is 2a.
+ if (substr($token_string, $index, 4) =~ /^'\\?.'/)
+ {
+ $inside_squotes = 1;
+ }
+ # else it is just 1.
+ }
+ }
+ }
+ # double quotes - beginning of a string.
+ elsif ($char eq '"')
+ {
+ $inside_dquotes = 1;
+ }
+ # opening paren - if paren count is 0 then this is a beginning of a token.
+ elsif ($char eq '(')
+ {
+ unless ($parens)
+ {
+ $begin_token = $index;
+ }
+ $parens++;
+ }
+ # closing paren - if paren count is 1 then this is an end of a token, so we
+ # extract it from token string and push into token list.
+ elsif ($char eq ')')
+ {
+ $parens--;
+ unless ($parens)
+ {
+ my $token_len = $index + 1 - $begin_token;
+ my $token = substr($token_string, $begin_token, $token_len);
+ push(@tokens, $token);
+ }
+ }
+ # do nothing on other chars.
+ }
+ return @tokens;
+}
sub read_file($$)
{
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]