[librsvg: 2/5] (#219) Make path parser have a tokenizer/parser

From: Federico Mena Quintero <federico src gnome org>
To: commits-list gnome org
Cc:
Subject: [librsvg: 2/5] (#219) Make path parser have a tokenizer/parser
Date: Thu, 11 Jun 2020 22:07:22 +0000 (UTC)
commit 111058f9e495a82155a7ac8b0e54646e35530182
Author: Emile Snyder <emile snyder gmail com>
Date:   Sun Jun 7 17:03:34 2020 -0700

    (#219) Make path parser have a tokenizer/parser
    
    I did try just using chars and str everywhere, and doing parse::<f64>
    on slices for the Number tokens; it made the
    path_parser::Parser::parse calls take about 2x as long as the original
    code.  The [u8] version is still slower, but not that much slower.
    
    https://gitlab.gnome.org/GNOME/librsvg/-/issues/219

 rsvg_internals/benches/path_parser.rs |  57 ++-
 rsvg_internals/src/path_parser.rs     | 920 ++++++++++++++++------------------
 2 files changed, 475 insertions(+), 502 deletions(-)
---
diff --git a/rsvg_internals/benches/path_parser.rs b/rsvg_internals/benches/path_parser.rs
index d8c892e4..41842eca 100644
--- a/rsvg_internals/benches/path_parser.rs
+++ b/rsvg_internals/benches/path_parser.rs
@@ -3,9 +3,38 @@ extern crate criterion;
 use criterion::{black_box, Criterion};
 
 use rsvg_internals::path_builder::PathBuilder;
-use rsvg_internals::path_parser::parse_path_into_builder;
+use rsvg_internals::path_parser::{parse_path_into_builder, Lexer};
 
 static INPUT: &'static str = "M10 20 C 30,40 50 60-70,80,90 100,110 120,130,140";
+//                                  0    .    1    .    2    .    3    .    4    .
+
+static BYTES: &'static [u8; 49] = b"M10 20 C 30,40 50 60-70,80,90 100,110 120,130,140";
+//                                  0    .    1    .    2    .    3    .    4    .
+
+static SLICE_EDGES: [(usize, usize); 14] = [
+    (1, 3),
+    (4, 6),
+    (9, 11),
+    (12, 14),
+    (15, 17),
+    (18, 20),
+    (20, 23),
+    (24, 26),
+    (27, 29),
+    (30, 33),
+    (34, 37),
+    (38, 41),
+    (42, 45),
+    (46, 49),
+];
+
+fn lex_path(input: &str) {
+    let lexer = Lexer::new(black_box(INPUT));
+
+    for (_pos, _token) in lexer {
+        //
+    }
+}
 
 fn path_parser(c: &mut Criterion) {
     c.bench_function("parse path into builder", |b| {
@@ -16,6 +45,32 @@ fn path_parser(c: &mut Criterion) {
             let _ = parse_path_into_builder(&input, &mut builder);
         });
     });
+
+    c.bench_function("lex str", |b| {
+        let input = black_box(INPUT);
+
+        b.iter(|| {
+            lex_path(input);
+        });
+    });
+
+    // look at how much time *just* the parse::<i32> part of the lexer should be taking...
+    c.bench_function("std i32 parse (bytes)", |b| {
+        let input = black_box(BYTES);
+        let slice_boundaries = black_box(SLICE_EDGES);
+
+        b.iter(|| {
+            for (a, b) in slice_boundaries.iter() {
+                let a: usize = *a;
+                let b: usize = *b;
+                unsafe {
+                    let _ = std::str::from_utf8_unchecked(&input[a..b])
+                        .parse::<i32>()
+                        .unwrap();
+                }
+            }
+        });
+    });
 }
 
 criterion_group!(benches, path_parser);
diff --git a/rsvg_internals/src/path_parser.rs b/rsvg_internals/src/path_parser.rs
index 0fe3b5e3..b9384799 100644
--- a/rsvg_internals/src/path_parser.rs
+++ b/rsvg_internals/src/path_parser.rs
@@ -4,14 +4,179 @@ use std::error::Error;
 use std::fmt;
 use std::iter::Enumerate;
 use std::str;
-use std::str::Chars;
+use std::str::Bytes;
 
 use crate::path_builder::*;
 
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub enum Token {
+    // pub to allow benchmarking
+    Number(f64),
+    Command(u8),
+    Comma,
+}
+
+use crate::path_parser::Token::{Comma, Command, Number};
+
+#[derive(Debug)]
+pub struct Lexer<'a> {
+    // pub to allow benchmarking
+    input: &'a [u8],
+    ci: Enumerate<Bytes<'a>>,
+    current: Option<(usize, u8)>,
+}
+
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub enum LexError {
+    // pub to allow benchmarking
+    ParseFloatError,
+    ParseIntError,
+    UnexpectedByte(u8),
+    UnexpectedEof,
+}
+
+impl<'a> Lexer<'_> {
+    pub fn new(input: &'a str) -> Lexer {
+        let mut ci = input.bytes().enumerate();
+        let current = ci.next();
+        Lexer {
+            input: input.as_bytes(),
+            ci: ci,
+            current: current,
+        }
+    }
+
+    fn current_pos(&mut self) -> usize {
+        match self.current {
+            None => self.input.len(),
+            Some((pos, _)) => pos,
+        }
+    }
+
+    fn advance(&mut self) {
+        self.current = self.ci.next();
+    }
+
+    fn advance_over_whitespace(&mut self) -> bool {
+        let mut found_some = false;
+
+        while self.current.is_some() && self.current.unwrap().1.is_ascii_whitespace() {
+            found_some = true;
+            self.current = self.ci.next();
+        }
+
+        return found_some;
+    }
+
+    fn advance_over_optional(&mut self, needle: u8) -> bool {
+        match self.current {
+            Some((_, c)) if c == needle => {
+                self.advance();
+                return true;
+            }
+            _ => return false,
+        }
+    }
+
+    fn advance_over_digits(&mut self) -> bool {
+        let mut found_some = false;
+        while self.current.is_some() && self.current.unwrap().1.is_ascii_digit() {
+            found_some = true;
+            self.current = self.ci.next();
+        }
+
+        return found_some;
+    }
+
+    fn advance_over_simple_number(&mut self, is_float: &mut bool) -> bool {
+        let _ = self.advance_over_optional(b'-') || self.advance_over_optional(b'+');
+        let found_digit = self.advance_over_digits();
+        if self.advance_over_optional(b'.') {
+            *is_float = true;
+        }
+        self.advance_over_digits() || found_digit
+    }
+
+    fn match_number(&mut self) -> Result<Token, LexError> {
+        // remember the beginning
+        let (start_pos, _) = self.current.unwrap();
+        let mut is_float = false;
+        if !self.advance_over_simple_number(&mut is_float) && start_pos != self.current_pos() {
+            match self.current {
+                None => return Err(LexError::UnexpectedEof),
+                Some((_pos, c)) => return Err(LexError::UnexpectedByte(c)),
+            }
+        }
+        if self.advance_over_optional(b'e') || self.advance_over_optional(b'E') {
+            is_float = true;
+            self.advance_over_simple_number(&mut is_float);
+        }
+        let end_pos = match self.current {
+            None => self.input.len(),
+            Some((i, _)) => i,
+        };
+        unsafe {
+            // This is OK because we have personally checked every character of
+            // the slice we're about to use as a utf8 string so we know it's all legal.
+            let num_str = std::str::from_utf8_unchecked(&self.input[start_pos..end_pos]);
+
+            // using std::str::parse::<f64> causes quite a hit relative to
+            // the old mixed-lexing-parsing code which did it's own number parsing.
+            // Using parse::<i32> is, annoyingly, about 3-4x faster than the parse::<i64>
+            // call. So do that if we don't have a '.' or a 'e' in the number.
+            if is_float {
+                match num_str.parse::<f64>() {
+                    Ok(n) => Ok(Number(n)),
+                    Err(_e) => Err(LexError::ParseFloatError),
+                }
+            } else {
+                match num_str.parse::<i32>() {
+                    Ok(n) => Ok(Number(n as f64)),
+                    Err(_e) => Err(LexError::ParseIntError),
+                }
+            }
+        }
+    }
+}
+
+impl Iterator for Lexer<'_> {
+    type Item = (usize, Result<Token, LexError>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // eat whitespace
+        self.advance_over_whitespace();
+
+        match self.current {
+            // commas are separators
+            Some((pos, c)) if c == b',' => {
+                self.advance();
+                Some((pos, Ok(Comma)))
+            }
+
+            // alphabetic chars are commands
+            Some((pos, c)) if c.is_ascii_alphabetic() => {
+                let token = Command(c);
+                self.advance();
+                Some((pos, Ok(token)))
+            }
+
+            Some((pos, c)) if c.is_ascii_digit() || c == b'-' || c == b'+' || c == b'.' => {
+                Some((pos, self.match_number()))
+            }
+
+            Some((pos, c)) => {
+                self.advance();
+                Some((pos, Err(LexError::UnexpectedByte(c))))
+            }
+
+            None => None,
+        }
+    }
+}
+
 struct PathParser<'b> {
-    chars_enumerator: Enumerate<Chars<'b>>,
-    lookahead: Option<char>,    // None if we are in EOF
-    current_pos: Option<usize>, // None if the string hasn't been scanned
+    tokens: Lexer<'b>,
+    current_pos_and_token: Option<(usize, Result<Token, LexError>)>,
 
     builder: &'b mut PathBuilder,
 
@@ -60,10 +225,11 @@ struct PathParser<'b> {
 //     M 0.1 -2 300 -4
 impl<'b> PathParser<'b> {
     fn new(builder: &'b mut PathBuilder, path_str: &'b str) -> PathParser<'b> {
+        let mut lexer = Lexer::new(path_str);
+        let pt = lexer.next();
         PathParser {
-            chars_enumerator: path_str.chars().enumerate(),
-            lookahead: None,
-            current_pos: None,
+            tokens: lexer,
+            current_pos_and_token: pt,
 
             builder,
 
@@ -81,216 +247,114 @@ impl<'b> PathParser<'b> {
         }
     }
 
-    fn parse(&mut self) -> Result<(), ParseError> {
-        self.getchar();
+    // Our match_* methods all either consume the token we requested
+    // and return the unwrapped value, or return an error without
+    // advancing the token stream.
+    //
+    // You can safely use them to probe for a particular kind of token,
+    // fail to match it, and try some other type.
 
-        self.optional_whitespace()?;
-        self.moveto_drawto_command_groups()
-    }
-
-    fn getchar(&mut self) {
-        if let Some((pos, c)) = self.chars_enumerator.next() {
-            self.lookahead = Some(c);
-            self.current_pos = Some(pos);
-        } else {
-            // We got to EOF; make current_pos point to the position after the last char in the
-            // string
-            self.lookahead = None;
-            if self.current_pos.is_none() {
-                self.current_pos = Some(0);
-            } else {
-                self.current_pos = Some(self.current_pos.unwrap() + 1);
-            }
+    fn match_command(&mut self) -> Result<u8, ParseError> {
+        let result = match &self.current_pos_and_token {
+            Some((_, Ok(Command(c)))) => Ok(*c),
+            Some((pos, Ok(t))) => Err(ParseError::new(*pos, UnexpectedToken(*t))),
+            Some((pos, Err(e))) => Err(ParseError::new(*pos, LexError(*e))),
+            None => Err(ParseError::new(self.tokens.input.len(), UnexpectedEof)),
+        };
+        if result.is_ok() {
+            self.current_pos_and_token = self.tokens.next();
         }
+        result
     }
 
-    fn error(&self, kind: ErrorKind) -> ParseError {
-        ParseError {
-            position: self.current_pos.unwrap(),
-            kind,
+    fn match_number(&mut self) -> Result<f64, ParseError> {
+        let result = match &self.current_pos_and_token {
+            Some((_, Ok(Number(n)))) => Ok(*n),
+            Some((pos, Ok(t))) => Err(ParseError::new(*pos, UnexpectedToken(*t))),
+            Some((pos, Err(e))) => Err(ParseError::new(*pos, LexError(*e))),
+            None => Err(ParseError::new(self.tokens.input.len(), UnexpectedEof)),
+        };
+        if result.is_ok() {
+            self.current_pos_and_token = self.tokens.next();
         }
+        result
     }
 
-    fn match_char(&mut self, c: char) -> bool {
-        if let Some(x) = self.lookahead {
-            if c == x {
-                self.getchar();
-                return true;
-            }
+    fn match_comma(&mut self) -> Result<(), ParseError> {
+        let result = match &self.current_pos_and_token {
+            Some((_, Ok(Comma))) => Ok(()),
+            Some((pos, Ok(t))) => Err(ParseError::new(*pos, UnexpectedToken(*t))),
+            Some((pos, Err(e))) => Err(ParseError::new(*pos, LexError(*e))),
+            None => Err(ParseError::new(self.tokens.input.len(), UnexpectedEof)),
+        };
+        if result.is_ok() {
+            self.current_pos_and_token = self.tokens.next();
         }
-
-        false
+        result
     }
 
-    fn whitespace(&mut self) -> Result<(), ParseError> {
-        if let Some(c) = self.lookahead {
-            if c.is_whitespace() {
-                assert!(self.match_char(c));
-
-                while let Some(c) = self.lookahead {
-                    if c.is_whitespace() {
-                        assert!(self.match_char(c));
-                        continue;
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-
-        Ok(())
+    fn eat_optional_comma(&mut self) {
+        let _ = self.match_comma();
     }
 
-    fn optional_whitespace(&mut self) -> Result<(), ParseError> {
-        let _ = self.whitespace();
-        Ok(())
+    // Convenience function; like match_number, but eats a leading comma if present.
+    fn match_comma_number(&mut self) -> Result<f64, ParseError> {
+        self.eat_optional_comma();
+        self.match_number()
     }
 
-    fn optional_comma_whitespace(&mut self) -> Result<(), ParseError> {
-        self.optional_whitespace()?;
-        if self.lookahead_is(',') {
-            self.match_char(',');
-            self.optional_whitespace()?;
+    fn match_flag(&mut self) -> Result<bool, ParseError> {
+        let pos = match self.current_pos_and_token {
+            Some((pos, _)) => pos,
+            _ => self.tokens.input.len(),
+        };
+        match self.match_number()? {
+            n if n == 1.0 => Ok(true),
+            n if n == 0.0 => Ok(false),
+            n => Err(ParseError::new(pos, ErrorKind::UnexpectedToken(Number(n)))),
         }
-        Ok(())
     }
 
-    fn lookahead_is(&self, c: char) -> bool {
-        if let Some(x) = self.lookahead {
-            if x == c {
-                return true;
-            }
-        }
+    // peek_* methods are the twins of match_*, but don't consume the token, and so
+    // can't return ParseError
 
-        false
-    }
-
-    fn lookahead_is_digit(&self, d: &mut char) -> bool {
-        if let Some(c) = self.lookahead {
-            if c.is_digit(10) {
-                *d = c;
-                return true;
-            }
+    fn peek_command(&mut self) -> Option<u8> {
+        match &self.current_pos_and_token {
+            Some((_, Ok(Command(c)))) => Some(*c),
+            _ => None,
         }
-
-        false
     }
 
-    fn lookahead_is_start_of_number(&mut self) -> bool {
-        let mut c = ' ';
-        self.lookahead_is_digit(&mut c)
-            || self.lookahead_is('.')
-            || self.lookahead_is('+')
-            || self.lookahead_is('-')
+    fn peek_number(&mut self) -> Option<f64> {
+        match &self.current_pos_and_token {
+            Some((_, Ok(Number(n)))) => Some(*n),
+            _ => None,
+        }
     }
 
-    fn number(&mut self) -> Result<f64, ParseError> {
-        let mut sign: f64;
-
-        sign = 1.0;
-
-        if self.match_char('+') {
-            sign = 1.0;
-        } else if self.match_char('-') {
-            sign = -1.0;
+    // This is the entry point for parsing a given blob of path data.
+    // All the parsing just uses various match_* methods to consume tokens
+    // and retrieve the values.
+    fn parse(&mut self) -> Result<(), ParseError> {
+        if self.current_pos_and_token.is_none() {
+            return Ok(());
         }
 
-        let mut has_integer_part = false;
-        let mut value: f64;
-        let mut exponent_sign: f64;
-        let mut exponent: Option<f64>;
-
-        value = 0.0;
-        exponent_sign = 1.0;
-        exponent = None;
-
-        let mut c: char = ' ';
-
-        if self.lookahead_is_digit(&mut c) || self.lookahead_is('.') {
-            // Integer part
-            while self.lookahead_is_digit(&mut c) {
-                has_integer_part = true;
-                value = value * 10.0 + f64::from(char_to_digit(c));
-
-                assert!(self.match_char(c));
-            }
-
-            // Fractional part
-            if self.match_char('.') {
-                let mut fraction: f64 = 1.0;
-
-                let mut c: char = ' ';
-
-                if !has_integer_part && !self.lookahead_is_digit(&mut c) {
-                    return Err(self.error(ErrorKind::UnexpectedToken));
-                }
-
-                while self.lookahead_is_digit(&mut c) {
-                    fraction /= 10.0;
-                    value += fraction * f64::from(char_to_digit(c));
-
-                    assert!(self.match_char(c));
-                }
-            }
-
-            if self.match_char('E') || self.match_char('e') {
-                // exponent sign
-                if self.match_char('+') {
-                    exponent_sign = 1.0;
-                } else if self.match_char('-') {
-                    exponent_sign = -1.0;
-                }
-
-                // exponent
-                let mut c: char = ' ';
-
-                if self.lookahead_is_digit(&mut c) {
-                    let mut exp = 0.0;
-
-                    while self.lookahead_is_digit(&mut c) {
-                        exp = exp * 10.0 + f64::from(char_to_digit(c));
-
-                        assert!(self.match_char(c));
-                    }
-
-                    exponent = Some(exp);
-                } else if self.lookahead.is_some() {
-                    return Err(self.error(ErrorKind::UnexpectedToken));
-                } else {
-                    return Err(self.error(ErrorKind::UnexpectedEof));
-                }
-            }
-
-            if let Some(exp) = exponent {
-                Ok(sign * value * 10.0f64.powf(exp * exponent_sign))
-            } else {
-                Ok(sign * value)
-            }
-        } else if self.lookahead.is_some() {
-            Err(self.error(ErrorKind::UnexpectedToken))
-        } else {
-            Err(self.error(ErrorKind::UnexpectedEof))
-        }
+        self.moveto_drawto_command_groups()
     }
 
-    fn flag(&mut self) -> Result<bool, ParseError> {
-        if self.match_char('0') {
-            Ok(false)
-        } else if self.match_char('1') {
-            Ok(true)
-        } else if self.lookahead.is_some() {
-            Err(self.error(ErrorKind::UnexpectedToken))
-        } else {
-            Err(self.error(ErrorKind::UnexpectedEof))
+    fn error(&self, kind: ErrorKind) -> ParseError {
+        match self.current_pos_and_token {
+            Some((pos, _)) => ParseError {
+                position: pos,
+                kind,
+            },
+            None => ParseError { position: 0, kind }, // FIXME: ???
         }
     }
 
     fn coordinate_pair(&mut self) -> Result<(f64, f64), ParseError> {
-        let a = self.number()?;
-        self.optional_comma_whitespace()?;
-        let b = self.number()?;
-
-        Ok((a, b))
+        Ok((self.match_number()?, self.match_comma_number()?))
     }
 
     fn set_current_point(&mut self, x: f64, y: f64) {
@@ -388,37 +452,6 @@ impl<'b> PathParser<'b> {
         );
     }
 
-    fn emit_close_path(&mut self) {
-        let (x, y) = (self.subpath_start_x, self.subpath_start_y);
-        self.set_current_point(x, y);
-
-        self.builder.close_path();
-    }
-
-    fn lineto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
-        loop {
-            let (mut x, mut y) = self.coordinate_pair()?;
-
-            if !absolute {
-                x += self.current_x;
-                y += self.current_y;
-            }
-
-            self.emit_line_to(x, y);
-
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
-                break;
-            }
-        }
-
-        Ok(())
-    }
-
     fn moveto_argument_sequence(
         &mut self,
         absolute: bool,
@@ -437,13 +470,7 @@ impl<'b> PathParser<'b> {
             self.emit_move_to(x, y);
         }
 
-        self.whitespace()?;
-
-        if self.lookahead_is(',') {
-            assert!(self.match_char(','));
-            self.optional_whitespace()?;
-            self.lineto_argument_sequence(absolute)
-        } else if self.lookahead_is_start_of_number() {
+        if self.match_comma().is_ok() || self.peek_number().is_some() {
             self.lineto_argument_sequence(absolute)
         } else {
             Ok(())
@@ -451,27 +478,15 @@ impl<'b> PathParser<'b> {
     }
 
     fn moveto(&mut self, is_initial_moveto: bool) -> Result<(), ParseError> {
-        if self.lookahead_is('M') || self.lookahead_is('m') {
-            let absolute = if self.match_char('M') {
-                true
-            } else {
-                assert!(self.match_char('m'));
-                false
-            };
-
-            self.optional_whitespace()?;
-            self.moveto_argument_sequence(absolute, is_initial_moveto)
-        } else if self.lookahead.is_some() {
-            Err(self.error(ErrorKind::UnexpectedToken))
-        } else {
-            Err(self.error(ErrorKind::UnexpectedEof))
+        match self.match_command()? {
+            b'M' => self.moveto_argument_sequence(true, is_initial_moveto),
+            b'm' => self.moveto_argument_sequence(false, is_initial_moveto),
+            c => Err(self.error(ErrorKind::UnexpectedCommand(c))),
         }
     }
 
     fn moveto_drawto_command_group(&mut self, is_initial_moveto: bool) -> Result<(), ParseError> {
         self.moveto(is_initial_moveto)?;
-        self.optional_whitespace()?;
-
         self.optional_drawto_commands().map(|_| ())
     }
 
@@ -482,8 +497,7 @@ impl<'b> PathParser<'b> {
             self.moveto_drawto_command_group(initial)?;
             initial = false;
 
-            self.optional_whitespace()?;
-            if self.lookahead.is_none() {
+            if self.current_pos_and_token.is_none() {
                 break;
             }
         }
@@ -493,69 +507,110 @@ impl<'b> PathParser<'b> {
 
     fn optional_drawto_commands(&mut self) -> Result<bool, ParseError> {
         while self.drawto_command()? {
-            self.optional_whitespace()?;
+            // everything happens in the drawto_command() calls.
         }
 
         Ok(false)
     }
 
+    // FIXME: This should not just fail to match 'M' and 'm', but make sure the
+    // command is in the set of drawto command characters.
+    fn match_if_drawto_command_with_absolute(&mut self) -> Option<(u8, bool)> {
+        let cmd = self.peek_command();
+        let result = match cmd {
+            Some(b'M') => None,
+            Some(b'm') => None,
+            Some(c) => {
+                let c_up = c.to_ascii_uppercase();
+                if c == c_up {
+                    Some((c_up, true))
+                } else {
+                    Some((c_up, false))
+                }
+            }
+            _ => None,
+        };
+        if result.is_some() {
+            let _ = self.match_command();
+        }
+        result
+    }
+
     fn drawto_command(&mut self) -> Result<bool, ParseError> {
-        Ok(self.close_path()?
-            || self.line_to()?
-            || self.horizontal_line_to()?
-            || self.vertical_line_to()?
-            || self.curve_to()?
-            || self.smooth_curve_to()?
-            || self.quadratic_bezier_curve_to()?
-            || self.smooth_quadratic_bezier_curve_to()?
-            || self.elliptical_arc()?)
-    }
-
-    fn close_path(&mut self) -> Result<bool, ParseError> {
-        if self.match_char('Z') || self.match_char('z') {
-            self.emit_close_path();
-            Ok(true)
-        } else {
-            Ok(false)
+        match self.match_if_drawto_command_with_absolute() {
+            Some((b'Z', _)) => {
+                self.emit_close_path();
+                Ok(true)
+            }
+            Some((b'L', abs)) => {
+                self.lineto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'H', abs)) => {
+                self.horizontal_lineto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'V', abs)) => {
+                self.vertical_lineto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'C', abs)) => {
+                self.curveto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'S', abs)) => {
+                self.smooth_curveto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'Q', abs)) => {
+                self.quadratic_curveto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'T', abs)) => {
+                self.smooth_quadratic_curveto_argument_sequence(abs)?;
+                Ok(true)
+            }
+            Some((b'A', abs)) => {
+                self.elliptical_arc_argument_sequence(abs)?;
+                Ok(true)
+            }
+            _ => Ok(false),
         }
     }
 
-    fn line_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('L') || self.lookahead_is('l') {
-            let absolute = if self.match_char('L') {
-                true
-            } else {
-                assert!(self.match_char('l'));
-                false
-            };
+    fn emit_close_path(&mut self) {
+        let (x, y) = (self.subpath_start_x, self.subpath_start_y);
+        self.set_current_point(x, y);
 
-            self.optional_whitespace()?;
+        self.builder.close_path();
+    }
 
-            self.lineto_argument_sequence(absolute)?;
-            Ok(true)
+    fn should_break_arg_sequence(&mut self) -> bool {
+        if self.match_comma().is_ok() {
+            // if there is a comma (indicating we should continue to loop), eat the comma
+            // so we're ready at the next start of the loop to process the next token.
+            false
+        } else if !self.peek_number().is_some() {
+            // if the next token is not a number or a comma, we do want to break the loop
+            true
         } else {
-            Ok(false)
+            // otherwise, continue to loop to process args in the sequence.
+            false
         }
     }
 
-    fn horizontal_lineto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
+    fn lineto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
-            let mut x = self.number()?;
+            let (mut x, mut y) = self.coordinate_pair()?;
 
             if !absolute {
                 x += self.current_x;
+                y += self.current_y;
             }
 
-            let y = self.current_y;
-
             self.emit_line_to(x, y);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -563,27 +618,29 @@ impl<'b> PathParser<'b> {
         Ok(())
     }
 
-    fn horizontal_line_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('H') || self.lookahead_is('h') {
-            let absolute = if self.match_char('H') {
-                true
-            } else {
-                assert!(self.match_char('h'));
-                false
-            };
+    fn horizontal_lineto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
+        loop {
+            let mut x = self.match_number()?;
 
-            self.optional_whitespace()?;
+            if !absolute {
+                x += self.current_x;
+            }
 
-            self.horizontal_lineto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
+            let y = self.current_y;
+
+            self.emit_line_to(x, y);
+
+            if self.should_break_arg_sequence() {
+                break;
+            }
         }
+
+        Ok(())
     }
 
     fn vertical_lineto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
-            let mut y = self.number()?;
+            let mut y = self.match_number()?;
 
             if !absolute {
                 y += self.current_y;
@@ -593,12 +650,7 @@ impl<'b> PathParser<'b> {
 
             self.emit_line_to(x, y);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -606,34 +658,14 @@ impl<'b> PathParser<'b> {
         Ok(())
     }
 
-    fn vertical_line_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('V') || self.lookahead_is('v') {
-            let absolute = if self.match_char('V') {
-                true
-            } else {
-                assert!(self.match_char('v'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.vertical_lineto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-
     fn curveto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
             let (mut x2, mut y2) = self.coordinate_pair()?;
 
-            self.optional_comma_whitespace()?;
-
+            self.eat_optional_comma();
             let (mut x3, mut y3) = self.coordinate_pair()?;
 
-            self.optional_comma_whitespace()?;
-
+            self.eat_optional_comma();
             let (mut x4, mut y4) = self.coordinate_pair()?;
 
             if !absolute {
@@ -647,12 +679,7 @@ impl<'b> PathParser<'b> {
 
             self.emit_curve_to(x2, y2, x3, y3, x4, y4);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -663,9 +690,7 @@ impl<'b> PathParser<'b> {
     fn smooth_curveto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
             let (mut x3, mut y3) = self.coordinate_pair()?;
-
-            self.optional_comma_whitespace()?;
-
+            self.eat_optional_comma();
             let (mut x4, mut y4) = self.coordinate_pair()?;
 
             if !absolute {
@@ -682,12 +707,7 @@ impl<'b> PathParser<'b> {
 
             self.emit_curve_to(x2, y2, x3, y3, x4, y4);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -695,48 +715,10 @@ impl<'b> PathParser<'b> {
         Ok(())
     }
 
-    fn curve_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('C') || self.lookahead_is('c') {
-            let absolute = if self.match_char('C') {
-                true
-            } else {
-                assert!(self.match_char('c'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.curveto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-
-    fn smooth_curve_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('S') || self.lookahead_is('s') {
-            let absolute = if self.match_char('S') {
-                true
-            } else {
-                assert!(self.match_char('s'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.smooth_curveto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-
     fn quadratic_curveto_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
             let (mut a, mut b) = self.coordinate_pair()?;
-
-            self.optional_comma_whitespace()?;
-
+            self.eat_optional_comma();
             let (mut c, mut d) = self.coordinate_pair()?;
 
             if !absolute {
@@ -748,12 +730,7 @@ impl<'b> PathParser<'b> {
 
             self.emit_quadratic_curve_to(a, b, c, d);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -761,24 +738,6 @@ impl<'b> PathParser<'b> {
         Ok(())
     }
 
-    fn quadratic_bezier_curve_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('Q') || self.lookahead_is('q') {
-            let absolute = if self.match_char('Q') {
-                true
-            } else {
-                assert!(self.match_char('q'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.quadratic_curveto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-
     fn smooth_quadratic_curveto_argument_sequence(
         &mut self,
         absolute: bool,
@@ -798,12 +757,7 @@ impl<'b> PathParser<'b> {
 
             self.emit_quadratic_curve_to(a, b, c, d);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
@@ -811,49 +765,25 @@ impl<'b> PathParser<'b> {
         Ok(())
     }
 
-    fn smooth_quadratic_bezier_curve_to(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('T') || self.lookahead_is('t') {
-            let absolute = if self.match_char('T') {
-                true
-            } else {
-                assert!(self.match_char('t'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.smooth_quadratic_curveto_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-
     fn elliptical_arc_argument_sequence(&mut self, absolute: bool) -> Result<(), ParseError> {
         loop {
-            let rx = self.number()?.abs();
+            let rx = self.match_number()?.abs();
+            let ry = self.match_comma_number()?.abs();
+            let x_axis_rotation = self.match_comma_number()?;
 
-            self.optional_comma_whitespace()?;
+            self.eat_optional_comma();
 
-            let ry = self.number()?.abs();
+            let large_arc = LargeArc(self.match_flag()?);
 
-            self.optional_comma_whitespace()?;
+            self.eat_optional_comma();
 
-            let x_axis_rotation = self.number()?;
-
-            self.optional_comma_whitespace()?;
-
-            let large_arc = LargeArc(self.flag()?);
-
-            self.optional_comma_whitespace()?;
-
-            let sweep = if self.flag()? {
+            let sweep = if self.match_flag()? {
                 Sweep::Positive
             } else {
                 Sweep::Negative
             };
 
-            self.optional_comma_whitespace()?;
+            self.eat_optional_comma();
 
             let (mut x, mut y) = self.coordinate_pair()?;
 
@@ -864,46 +794,21 @@ impl<'b> PathParser<'b> {
 
             self.emit_arc(rx, ry, x_axis_rotation, large_arc, sweep, x, y);
 
-            self.whitespace()?;
-
-            if self.lookahead_is(',') {
-                assert!(self.match_char(','));
-                self.optional_whitespace()?;
-            } else if !self.lookahead_is_start_of_number() {
+            if self.should_break_arg_sequence() {
                 break;
             }
         }
 
         Ok(())
     }
-
-    fn elliptical_arc(&mut self) -> Result<bool, ParseError> {
-        if self.lookahead_is('A') || self.lookahead_is('a') {
-            let absolute = if self.match_char('A') {
-                true
-            } else {
-                assert!(self.match_char('a'));
-                false
-            };
-
-            self.optional_whitespace()?;
-
-            self.elliptical_arc_argument_sequence(absolute)?;
-            Ok(true)
-        } else {
-            Ok(false)
-        }
-    }
-}
-
-fn char_to_digit(c: char) -> i32 {
-    c as i32 - '0' as i32
 }
 
 #[derive(Debug, PartialEq)]
 pub enum ErrorKind {
-    UnexpectedToken,
+    UnexpectedToken(Token),
+    UnexpectedCommand(u8),
     UnexpectedEof,
+    LexError(LexError),
 }
 
 #[derive(Debug, PartialEq)]
@@ -914,11 +819,24 @@ pub struct ParseError {
 
 impl Error for ParseError {}
 
+impl ParseError {
+    fn new(pos: usize, k: ErrorKind) -> ParseError {
+        ParseError {
+            position: pos,
+            kind: k,
+        }
+    }
+}
+
+use crate::path_parser::ErrorKind::*;
+
 impl fmt::Display for ParseError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let description = match self.kind {
-            ErrorKind::UnexpectedToken => "unexpected token",
-            ErrorKind::UnexpectedEof => "unexpected end of data",
+            UnexpectedToken(_t) => "unexpected token",
+            UnexpectedCommand(_c) => "unexpected command",
+            UnexpectedEof => "unexpected end of data",
+            LexError(_le) => "error processing token",
         };
         write!(f, "error at position {}: {}", self.position, description)
     }
@@ -999,9 +917,9 @@ mod tests {
     fn handles_empty_data() {
         test_parser(
             "",
-            "^",
+            "",
             &Vec::<PathCommand>::new(),
-            Some(ErrorKind::UnexpectedEof),
+            None,
         );
     }
 
@@ -1089,51 +1007,51 @@ mod tests {
     fn detects_bogus_numbers() {
         test_parser(
             "M+",
-            "  ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedEof),
+            Some(ErrorKind::LexError(LexError::UnexpectedEof)),
         );
 
         test_parser(
             "M-",
-            "  ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedEof),
+            Some(ErrorKind::LexError(LexError::UnexpectedEof)),
         );
 
         test_parser(
             "M+x",
-            "  ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::LexError(LexError::UnexpectedByte(b'x'))),
         );
 
         test_parser(
             "M10e",
-            "    ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedEof),
+            Some(ErrorKind::LexError(LexError::ParseFloatError)),
         );
 
         test_parser(
             "M10ex",
-            "    ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::LexError(LexError::ParseFloatError)),
         );
 
         test_parser(
             "M10e-",
-            "     ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedEof),
+            Some(ErrorKind::LexError(LexError::ParseFloatError)),
         );
 
         test_parser(
             "M10e+x",
-            "     ^",
+            " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::LexError(LexError::ParseFloatError)),
         );
     }
 
@@ -1676,9 +1594,9 @@ mod tests {
     fn first_command_must_be_moveto() {
         test_parser(
             "  L10 20",
-            "  ^",
+            "   ^", // FIXME: why is this not at position 2?
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedCommand(b'L')),
         );
     }
 
@@ -1695,7 +1613,7 @@ mod tests {
             "M,",
             " ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Comma)),
         );
 
         test_parser(
@@ -1716,14 +1634,14 @@ mod tests {
             "M10x",
             "   ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Command(b'x'))),
         );
 
         test_parser(
             "M10,x",
             "    ^",
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Command(b'x'))),
         );
     }
 
@@ -1747,7 +1665,7 @@ mod tests {
             "M10-20-30 x",
             "          ^",
             &vec![moveto(10.0, -20.0)],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Command(b'x'))),
         );
     }
 
@@ -1757,14 +1675,14 @@ mod tests {
             "M10-20z10",
             "       ^",
             &vec![moveto(10.0, -20.0), closepath()],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Number(10.0))),
         );
 
         test_parser(
             "M10-20z,",
             "       ^",
             &vec![moveto(10.0, -20.0), closepath()],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Comma)),
         );
     }
 
@@ -1805,7 +1723,7 @@ mod tests {
             "M10-20H,",
             "       ^",
             &vec![moveto(10.0, -20.0)],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Comma)),
         );
 
         test_parser(
@@ -1829,7 +1747,7 @@ mod tests {
             "M10-20v,",
             "       ^",
             &vec![moveto(10.0, -20.0)],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Comma)),
         );
 
         test_parser(
@@ -2127,7 +2045,7 @@ mod tests {
             "M10-20A1 2 3 4",
             "             ^",
             &vec![moveto(10.0, -20.0)],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Number(4.0))),
         );
 
         test_parser(
@@ -2147,7 +2065,7 @@ mod tests {
             "M10-20A1 2 3 1 5",
             "               ^",
             &vec![moveto(10.0, -20.0)],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::UnexpectedToken(Number(5.0))),
         );
 
         test_parser(
@@ -2190,9 +2108,9 @@ mod tests {
         // https://gitlab.gnome.org/GNOME/librsvg/issues/345
         test_parser(
             "M.. 1,0 0,100000",
-            "  ^",
+            " ^", // FIXME: we have to report position of error in lexer errors to make this right
             &vec![],
-            Some(ErrorKind::UnexpectedToken),
+            Some(ErrorKind::LexError(LexError::UnexpectedByte(b'.'))),
         );
     }
 }
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]