[pango/bidi-stack-size: 8/11] Split pango_default_break into functions
- From: Matthias Clasen <matthiasc src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [pango/bidi-stack-size: 8/11] Split pango_default_break into functions
- Date: Fri, 30 Jul 2021 13:06:30 +0000 (UTC)
commit 73cd563d7bd19b31b57e94e3bb0969970990e589
Author: Matthias Clasen <mclasen redhat com>
Date: Wed Jul 28 14:09:23 2021 -0400
Split pango_default_break into functions
This makes it easier to see what part is taking time.
pango/break.c | 814 ++++++++++++++++++++++++++++++++++------------------------
1 file changed, 472 insertions(+), 342 deletions(-)
---
diff --git a/pango/break.c b/pango/break.c
index b9cf3cae..00290a59 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -138,225 +138,13 @@ typedef enum
WordNumbers
} WordType;
-
-/**
- * pango_default_break:
- * @text: text to break. Must be valid UTF-8
- * @length: length of text in bytes (may be -1 if @text is nul-terminated)
- * @analysis: (nullable): a `PangoAnalysis` structure for the @text
- * @attrs: logical attributes to fill in
- * @attrs_len: size of the array passed as @attrs
- *
- * This is the default break algorithm.
- *
- * It applies Unicode rules without language-specific
- * tailoring, therefore the @analyis argument is unused
- * and can be %NULL.
- *
- * See [func@Pango.tailor_break] for language-specific breaks.
- */
-void
-pango_default_break (const gchar *text,
- gint length,
- PangoAnalysis *analysis G_GNUC_UNUSED,
- PangoLogAttr *attrs,
- int attrs_len G_GNUC_UNUSED)
+ __attribute__((noinline))
+static void
+determine_whitespace (gunichar wc,
+ GUnicodeType type,
+ PangoLogAttr *attrs,
+ int i)
{
- /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
- * the line breaking stuff is also in TR14 on unicode.org
- */
-
- /* This is a default break implementation that should work for nearly all
- * languages. Language engines can override it optionally.
- */
-
- /* FIXME one cheesy optimization here would be to memset attrs to 0
- * before we start, and then never assign %FALSE to anything
- */
-
- const gchar *next;
- gint i;
-
- gunichar prev_wc;
- gunichar next_wc;
-
- JamoType prev_jamo;
-
- GUnicodeBreakType next_break_type;
- GUnicodeBreakType prev_break_type;
- GUnicodeBreakType prev_prev_break_type;
-
- /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
- typedef enum
- {
- GB_Other,
- GB_ControlCRLF,
- GB_Extend,
- GB_ZWJ,
- GB_Prepend,
- GB_SpacingMark,
- GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
- /* Use state machine to handle emoji sequence */
- /* Rule GB12 and GB13 */
- GB_RI_Odd, /* Meets odd number of RI */
- GB_RI_Even, /* Meets even number of RI */
- } GraphemeBreakType;
- GraphemeBreakType prev_GB_type = GB_Other;
- gboolean met_Extended_Pictographic = FALSE;
-
- /* See Word_Break Property Values table of UAX#29 */
- typedef enum
- {
- WB_Other,
- WB_NewlineCRLF,
- WB_ExtendFormat,
- WB_Katakana,
- WB_Hebrew_Letter,
- WB_ALetter,
- WB_MidNumLet,
- WB_MidLetter,
- WB_MidNum,
- WB_Numeric,
- WB_ExtendNumLet,
- WB_RI_Odd,
- WB_RI_Even,
- WB_WSegSpace,
- } WordBreakType;
- WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
- gint prev_WB_i = -1;
-
- /* See Sentence_Break Property Values table of UAX#29 */
- typedef enum
- {
- SB_Other,
- SB_ExtendFormat,
- SB_ParaSep,
- SB_Sp,
- SB_Lower,
- SB_Upper,
- SB_OLetter,
- SB_Numeric,
- SB_ATerm,
- SB_SContinue,
- SB_STerm,
- SB_Close,
- /* Rules SB8 and SB8a */
- SB_ATerm_Close_Sp,
- SB_STerm_Close_Sp,
- } SentenceBreakType;
- SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
- gint prev_SB_i = -1;
-
- /* Rule LB25 with Example 7 of Customization */
- typedef enum
- {
- LB_Other,
- LB_Numeric,
- LB_Numeric_Close,
- LB_RI_Odd,
- LB_RI_Even,
- } LineBreakType;
- LineBreakType prev_LB_type = LB_Other;
-
- WordType current_word_type = WordNone;
- gunichar last_word_letter = 0;
- gunichar base_character = 0;
-
- gint last_sentence_start = -1;
- gint last_non_space = -1;
-
- gboolean almost_done = FALSE;
- gboolean done = FALSE;
-
- g_return_if_fail (length == 0 || text != NULL);
- g_return_if_fail (attrs != NULL);
-
- next = text;
-
- prev_break_type = G_UNICODE_BREAK_UNKNOWN;
- prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
- prev_wc = 0;
- prev_jamo = NO_JAMO;
-
- if (length == 0 || *text == '\0')
- {
- next_wc = PARAGRAPH_SEPARATOR;
- almost_done = TRUE;
- }
- else
- next_wc = g_utf8_get_char (next);
-
- next_break_type = g_unichar_break_type (next_wc);
- next_break_type = BREAK_TYPE_SAFE (next_break_type);
-
- for (i = 0; !done ; i++)
- {
- GUnicodeType type;
- gunichar wc;
- GUnicodeBreakType break_type;
- GUnicodeBreakType row_break_type;
- BreakOpportunity break_op;
- JamoType jamo;
- gboolean makes_hangul_syllable;
-
- /* UAX#29 boundaries */
- gboolean is_grapheme_boundary;
- gboolean is_word_boundary;
- gboolean is_sentence_boundary;
-
- /* Emoji extended pictographics */
- gboolean is_Extended_Pictographic;
-
- gboolean can_break;
-
- wc = next_wc;
- break_type = next_break_type;
-
- if (almost_done)
- {
- /*
- * If we have already reached the end of @text g_utf8_next_char()
- * may not increment next
- */
- next_wc = 0;
- next_break_type = G_UNICODE_BREAK_UNKNOWN;
- done = TRUE;
- }
- else
- {
- next = g_utf8_next_char (next);
-
- if ((length >= 0 && next >= text + length) || *next == '\0')
- {
- /* This is how we fill in the last element (end position) of the
- * attr array - assume there's a paragraph separators off the end
- * of @text.
- */
- next_wc = PARAGRAPH_SEPARATOR;
- almost_done = TRUE;
- }
- else
- next_wc = g_utf8_get_char (next);
-
- next_break_type = g_unichar_break_type (next_wc);
- next_break_type = BREAK_TYPE_SAFE (next_break_type);
- }
-
- type = g_unichar_type (wc);
- jamo = JAMO_TYPE (break_type);
-
- /* Determine wheter this forms a Hangul syllable with prev. */
- if (jamo == NO_JAMO)
- makes_hangul_syllable = FALSE;
- else
- {
- JamoType prev_end = HangulJamoProps[prev_jamo].end ;
- JamoType this_start = HangulJamoProps[ jamo].start;
-
- /* See comments before IS_JAMO */
- makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
- }
-
switch (type)
{
case G_UNICODE_SPACE_SEPARATOR:
@@ -379,14 +167,39 @@ pango_default_break (const gchar *text,
attrs[i].is_expandable_space = FALSE;
break;
}
+}
- is_Extended_Pictographic =
- _pango_Is_Emoji_Extended_Pictographic (wc);
-
+ /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
+ typedef enum
+ {
+ GB_Other,
+ GB_ControlCRLF,
+ GB_Extend,
+ GB_ZWJ,
+ GB_Prepend,
+ GB_SpacingMark,
+ GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
+ /* Use state machine to handle emoji sequence */
+ /* Rule GB12 and GB13 */
+ GB_RI_Odd, /* Meets odd number of RI */
+ GB_RI_Even, /* Meets even number of RI */
+ } GraphemeBreakType;
/* ---- UAX#29 Grapheme Boundaries ---- */
+ __attribute__((noinline))
+static void
+determine_grapheme_boundaries (gunichar wc,
+ GUnicodeType type,
+ gunichar prev_wc,
+ gboolean makes_hangul_syllable,
+ gboolean is_Extended_Pictographic,
+ gboolean *met_Extended_Pictographic,
+ GraphemeBreakType *prev_GB_type,
+ PangoLogAttr *attrs,
+ int i)
{
GraphemeBreakType GB_type;
+ gboolean is_grapheme_boundary;
/* Find the GraphemeBreakType of wc */
GB_type = GB_Other;
@@ -472,9 +285,9 @@ pango_default_break (const gchar *text,
case G_UNICODE_OTHER_SYMBOL:
if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
{
- if (prev_GB_type == GB_RI_Odd)
+ if (*prev_GB_type == GB_RI_Odd)
GB_type = GB_RI_Even;
- else if (prev_GB_type == GB_RI_Even)
+ else if (*prev_GB_type == GB_RI_Even)
GB_type = GB_RI_Odd;
else
GB_type = GB_RI_Odd;
@@ -489,20 +302,20 @@ pango_default_break (const gchar *text,
break;
}
- /* Rule GB11 */
- if (met_Extended_Pictographic)
+ /* Rule GB11 */
+ if (*met_Extended_Pictographic)
{
if (GB_type == GB_Extend)
- met_Extended_Pictographic = TRUE;
+ *met_Extended_Pictographic = TRUE;
else if (_pango_Is_Emoji_Extended_Pictographic (prev_wc) &&
GB_type == GB_ZWJ)
- met_Extended_Pictographic = TRUE;
- else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
- met_Extended_Pictographic = TRUE;
- else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
- met_Extended_Pictographic = TRUE;
+ *met_Extended_Pictographic = TRUE;
+ else if (*prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
+ *met_Extended_Pictographic = TRUE;
+ else if (*prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
+ *met_Extended_Pictographic = TRUE;
else
- met_Extended_Pictographic = FALSE;
+ *met_Extended_Pictographic = FALSE;
}
/* Grapheme Cluster Boundary Rules */
@@ -511,7 +324,7 @@ pango_default_break (const gchar *text,
/* We apply Rules GB1 and GB2 at the end of the function */
if (wc == '\n' && prev_wc == '\r')
is_grapheme_boundary = FALSE; /* Rule GB3 */
- else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
+ else if (*prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
else if (GB_type == GB_InHangulSyllable)
is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
@@ -521,23 +334,34 @@ pango_default_break (const gchar *text,
is_grapheme_boundary = FALSE; /* Rule GB9 */
else if (GB_type == GB_SpacingMark)
is_grapheme_boundary = FALSE; /* Rule GB9a */
- else if (prev_GB_type == GB_Prepend)
+ else if (*prev_GB_type == GB_Prepend)
is_grapheme_boundary = FALSE; /* Rule GB9b */
else if (is_Extended_Pictographic)
{ /* Rule GB11 */
- if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
+ if (*prev_GB_type == GB_ZWJ && *met_Extended_Pictographic)
is_grapheme_boundary = FALSE;
}
- else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
+ else if (*prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
if (is_Extended_Pictographic)
- met_Extended_Pictographic = TRUE;
+ *met_Extended_Pictographic = TRUE;
attrs[i].is_cursor_position = is_grapheme_boundary;
+
+ *prev_GB_type = GB_type;
+ }
+
+ __attribute__((noinline))
+static void
+determine_backspace (gunichar prev_wc,
+ gunichar base_character,
+ PangoLogAttr *attrs,
+ int i)
+{
/* If this is a grapheme boundary, we have to decide if backspace
* deletes a character or the whole grapheme cluster */
- if (is_grapheme_boundary)
+ if (attrs[i].is_cursor_position)
{
attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
@@ -548,14 +372,46 @@ pango_default_break (const gchar *text,
}
else
attrs[i].backspace_deletes_character = FALSE;
+}
- prev_GB_type = GB_type;
- }
+ /* See Word_Break Property Values table of UAX#29 */
+ typedef enum
+ {
+ WB_Other,
+ WB_NewlineCRLF,
+ WB_ExtendFormat,
+ WB_Katakana,
+ WB_Hebrew_Letter,
+ WB_ALetter,
+ WB_MidNumLet,
+ WB_MidLetter,
+ WB_MidNum,
+ WB_Numeric,
+ WB_ExtendNumLet,
+ WB_RI_Odd,
+ WB_RI_Even,
+ WB_WSegSpace,
+ } WordBreakType;
/* ---- UAX#29 Word Boundaries ---- */
+ __attribute__((noinline))
+static void
+determine_word_boundaries (gunichar wc,
+ GUnicodeType type,
+ GUnicodeBreakType break_type,
+ gunichar prev_wc,
+ gboolean is_Extended_Pictographic,
+ WordBreakType *prev_WB_type,
+ int *prev_WB_i,
+ WordBreakType *prev_prev_WB_type,
+ PangoLogAttr *attrs,
+ int i)
{
+ gboolean is_word_boundary;
+
is_word_boundary = FALSE;
- if (is_grapheme_boundary ||
+
+ if (attrs[i].is_cursor_position ||
G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
{
PangoScript script;
@@ -655,7 +511,7 @@ pango_default_break (const gchar *text,
if (G_UNLIKELY(wc >= 0x1F1E6 && wc <= 0x1F1FF))
{
- if (prev_WB_type == WB_RI_Odd)
+ if (*prev_WB_type == WB_RI_Odd)
WB_type = WB_RI_Even;
else
WB_type = WB_RI_Odd;
@@ -700,8 +556,8 @@ pango_default_break (const gchar *text,
/* We apply Rules WB1 and WB2 at the end of the function */
if (prev_wc == 0x3031 && wc == 0x41)
- g_debug ("Y %d %d", prev_WB_type, WB_type);
- if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
+ g_debug ("Y %d %d", *prev_WB_type, WB_type);
+ if (*prev_WB_type == WB_NewlineCRLF && *prev_WB_i + 1 == i)
{
/* The extra check for prev_WB_i is to correctly handle sequences like
* Newline ÷ Extend × Extend
@@ -713,80 +569,116 @@ pango_default_break (const gchar *text,
is_word_boundary = TRUE; /* Rule WB3b */
else if (prev_wc == 0x200D && is_Extended_Pictographic)
is_word_boundary = FALSE; /* Rule WB3c */
- else if (prev_WB_type == WB_WSegSpace &&
- WB_type == WB_WSegSpace && prev_WB_i + 1 == i)
+ else if (*prev_WB_type == WB_WSegSpace &&
+ WB_type == WB_WSegSpace && *prev_WB_i + 1 == i)
is_word_boundary = FALSE; /* Rule WB3d */
else if (WB_type == WB_ExtendFormat)
is_word_boundary = FALSE; /* Rules WB4? */
- else if ((prev_WB_type == WB_ALetter ||
- prev_WB_type == WB_Hebrew_Letter ||
- prev_WB_type == WB_Numeric) &&
+ else if ((*prev_WB_type == WB_ALetter ||
+ *prev_WB_type == WB_Hebrew_Letter ||
+ *prev_WB_type == WB_Numeric) &&
(WB_type == WB_ALetter ||
WB_type == WB_Hebrew_Letter ||
WB_type == WB_Numeric))
is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
- else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
+ else if (*prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
is_word_boundary = FALSE; /* Rule WB13 */
- else if ((prev_WB_type == WB_ALetter ||
- prev_WB_type == WB_Hebrew_Letter ||
- prev_WB_type == WB_Numeric ||
- prev_WB_type == WB_Katakana ||
- prev_WB_type == WB_ExtendNumLet) &&
+ else if ((*prev_WB_type == WB_ALetter ||
+ *prev_WB_type == WB_Hebrew_Letter ||
+ *prev_WB_type == WB_Numeric ||
+ *prev_WB_type == WB_Katakana ||
+ *prev_WB_type == WB_ExtendNumLet) &&
WB_type == WB_ExtendNumLet)
is_word_boundary = FALSE; /* Rule WB13a */
- else if (prev_WB_type == WB_ExtendNumLet &&
+ else if (*prev_WB_type == WB_ExtendNumLet &&
(WB_type == WB_ALetter ||
WB_type == WB_Hebrew_Letter ||
WB_type == WB_Numeric ||
WB_type == WB_Katakana))
is_word_boundary = FALSE; /* Rule WB13b */
- else if (((prev_prev_WB_type == WB_ALetter ||
- prev_prev_WB_type == WB_Hebrew_Letter) &&
+ else if (((*prev_prev_WB_type == WB_ALetter ||
+ *prev_prev_WB_type == WB_Hebrew_Letter) &&
(WB_type == WB_ALetter ||
WB_type == WB_Hebrew_Letter)) &&
- (prev_WB_type == WB_MidLetter ||
- prev_WB_type == WB_MidNumLet ||
+ (*prev_WB_type == WB_MidLetter ||
+ *prev_WB_type == WB_MidNumLet ||
prev_wc == 0x0027))
{
- attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
+ attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
is_word_boundary = FALSE; /* Rule WB7 */
}
- else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
+ else if (*prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
is_word_boundary = FALSE; /* Rule WB7a */
- else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
+ else if (*prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
WB_type == WB_Hebrew_Letter)
{
- attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
+ attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
is_word_boundary = FALSE; /* Rule WB7c */
}
- else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
- (prev_WB_type == WB_MidNum ||
- prev_WB_type == WB_MidNumLet ||
+ else if ((*prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
+ (*prev_WB_type == WB_MidNum ||
+ *prev_WB_type == WB_MidNumLet ||
prev_wc == 0x0027))
{
is_word_boundary = FALSE; /* Rule WB11 */
- attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
+ attrs[*prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
}
- else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
+ else if (*prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
is_word_boundary = FALSE; /* Rule WB15 and WB16 */
else
is_word_boundary = TRUE; /* Rule WB999 */
if (WB_type != WB_ExtendFormat)
{
- prev_prev_WB_type = prev_WB_type;
- prev_WB_type = WB_type;
- prev_WB_i = i;
+ *prev_prev_WB_type = *prev_WB_type;
+ *prev_WB_type = WB_type;
+ *prev_WB_i = i;
}
}
attrs[i].is_word_boundary = is_word_boundary;
}
+
+ /* See Sentence_Break Property Values table of UAX#29 */
+ typedef enum
+ {
+ SB_Other,
+ SB_ExtendFormat,
+ SB_ParaSep,
+ SB_Sp,
+ SB_Lower,
+ SB_Upper,
+ SB_OLetter,
+ SB_Numeric,
+ SB_ATerm,
+ SB_SContinue,
+ SB_STerm,
+ SB_Close,
+ /* Rules SB8 and SB8a */
+ SB_ATerm_Close_Sp,
+ SB_STerm_Close_Sp,
+ } SentenceBreakType;
/* ---- UAX#29 Sentence Boundaries ---- */
+
+ __attribute__((noinline))
+static void
+determine_sentence_boundaries (gunichar wc,
+ GUnicodeType type,
+ GUnicodeBreakType break_type,
+ gunichar prev_wc,
+ SentenceBreakType *prev_SB_type,
+ int *prev_SB_i,
+ SentenceBreakType *prev_prev_SB_type,
+ int *last_sentence_start,
+ PangoLogAttr *attrs,
+ int i,
+ gboolean done)
{
+ gboolean is_sentence_boundary;
+
is_sentence_boundary = FALSE;
- if (is_word_boundary ||
+ if (attrs[i].is_word_boundary ||
wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
{
SentenceBreakType SB_type;
@@ -910,7 +802,7 @@ pango_default_break (const gchar *text,
if (wc == '\n' && prev_wc == '\r')
is_sentence_boundary = FALSE; /* Rule SB3 */
- else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
+ else if (*prev_SB_type == SB_ParaSep && *prev_SB_i + 1 == i)
{
/* The extra check for prev_SB_i is to correctly handle sequences like
* ParaSep ÷ Extend × Extend
@@ -921,80 +813,80 @@ pango_default_break (const gchar *text,
}
else if (SB_type == SB_ExtendFormat)
is_sentence_boundary = FALSE; /* Rule SB5? */
- else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
+ else if (*prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
is_sentence_boundary = FALSE; /* Rule SB6 */
- else if ((prev_prev_SB_type == SB_Upper ||
- prev_prev_SB_type == SB_Lower) &&
- prev_SB_type == SB_ATerm &&
+ else if ((*prev_prev_SB_type == SB_Upper ||
+ *prev_prev_SB_type == SB_Lower) &&
+ *prev_SB_type == SB_ATerm &&
SB_type == SB_Upper)
is_sentence_boundary = FALSE; /* Rule SB7 */
- else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
+ else if (*prev_SB_type == SB_ATerm && SB_type == SB_Close)
SB_type = SB_ATerm;
- else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
+ else if (*prev_SB_type == SB_STerm && SB_type == SB_Close)
SB_type = SB_STerm;
- else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
+ else if (*prev_SB_type == SB_ATerm && SB_type == SB_Sp)
SB_type = SB_ATerm_Close_Sp;
- else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
+ else if (*prev_SB_type == SB_STerm && SB_type == SB_Sp)
SB_type = SB_STerm_Close_Sp;
/* Rule SB8 */
- else if ((prev_SB_type == SB_ATerm ||
- prev_SB_type == SB_ATerm_Close_Sp) &&
+ else if ((*prev_SB_type == SB_ATerm ||
+ *prev_SB_type == SB_ATerm_Close_Sp) &&
SB_type == SB_Lower)
is_sentence_boundary = FALSE;
- else if ((prev_prev_SB_type == SB_ATerm ||
- prev_prev_SB_type == SB_ATerm_Close_Sp) &&
- IS_OTHER_TERM(prev_SB_type) &&
+ else if ((*prev_prev_SB_type == SB_ATerm ||
+ *prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+ IS_OTHER_TERM(*prev_SB_type) &&
SB_type == SB_Lower)
{
- attrs[prev_SB_i].is_sentence_boundary = FALSE;
- attrs[prev_SB_i].is_sentence_start = FALSE;
- attrs[prev_SB_i].is_sentence_end = FALSE;
- last_sentence_start = -1;
- for (int j = prev_SB_i - 1; j >= 0; j--)
+ attrs[*prev_SB_i].is_sentence_boundary = FALSE;
+ attrs[*prev_SB_i].is_sentence_start = FALSE;
+ attrs[*prev_SB_i].is_sentence_end = FALSE;
+ *last_sentence_start = -1;
+ for (int j = *prev_SB_i - 1; j >= 0; j--)
{
if (attrs[j].is_sentence_boundary)
{
- last_sentence_start = j;
+ *last_sentence_start = j;
break;
}
}
}
- else if ((prev_SB_type == SB_ATerm ||
- prev_SB_type == SB_ATerm_Close_Sp ||
- prev_SB_type == SB_STerm ||
- prev_SB_type == SB_STerm_Close_Sp) &&
+ else if ((*prev_SB_type == SB_ATerm ||
+ *prev_SB_type == SB_ATerm_Close_Sp ||
+ *prev_SB_type == SB_STerm ||
+ *prev_SB_type == SB_STerm_Close_Sp) &&
(SB_type == SB_SContinue ||
SB_type == SB_ATerm || SB_type == SB_STerm))
is_sentence_boundary = FALSE; /* Rule SB8a */
- else if ((prev_SB_type == SB_ATerm ||
- prev_SB_type == SB_STerm) &&
+ else if ((*prev_SB_type == SB_ATerm ||
+ *prev_SB_type == SB_STerm) &&
(SB_type == SB_Close || SB_type == SB_Sp ||
SB_type == SB_ParaSep))
is_sentence_boundary = FALSE; /* Rule SB9 */
- else if ((prev_SB_type == SB_ATerm ||
- prev_SB_type == SB_ATerm_Close_Sp ||
- prev_SB_type == SB_STerm ||
- prev_SB_type == SB_STerm_Close_Sp) &&
+ else if ((*prev_SB_type == SB_ATerm ||
+ *prev_SB_type == SB_ATerm_Close_Sp ||
+ *prev_SB_type == SB_STerm ||
+ *prev_SB_type == SB_STerm_Close_Sp) &&
(SB_type == SB_Sp || SB_type == SB_ParaSep))
is_sentence_boundary = FALSE; /* Rule SB10 */
- else if ((prev_SB_type == SB_ATerm ||
- prev_SB_type == SB_ATerm_Close_Sp ||
- prev_SB_type == SB_STerm ||
- prev_SB_type == SB_STerm_Close_Sp) &&
+ else if ((*prev_SB_type == SB_ATerm ||
+ *prev_SB_type == SB_ATerm_Close_Sp ||
+ *prev_SB_type == SB_STerm ||
+ *prev_SB_type == SB_STerm_Close_Sp) &&
SB_type != SB_ParaSep)
is_sentence_boundary = TRUE; /* Rule SB11 */
else
is_sentence_boundary = FALSE; /* Rule SB998 */
if (SB_type != SB_ExtendFormat &&
- !((prev_prev_SB_type == SB_ATerm ||
- prev_prev_SB_type == SB_ATerm_Close_Sp) &&
- IS_OTHER_TERM(prev_SB_type) &&
+ !((*prev_prev_SB_type == SB_ATerm ||
+ *prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+ IS_OTHER_TERM(*prev_SB_type) &&
IS_OTHER_TERM(SB_type)))
{
- prev_prev_SB_type = prev_SB_type;
- prev_SB_type = SB_type;
- prev_SB_i = i;
+ *prev_prev_SB_type = *prev_SB_type;
+ *prev_SB_type = SB_type;
+ *prev_SB_i = i;
}
#undef IS_OTHER_TERM
@@ -1007,7 +899,38 @@ pango_default_break (const gchar *text,
attrs[i].is_sentence_boundary = is_sentence_boundary;
}
+
/* ---- Line breaking ---- */
+ typedef enum
+ {
+ LB_Other,
+ LB_Numeric,
+ LB_Numeric_Close,
+ LB_RI_Odd,
+ LB_RI_Even,
+ } LineBreakType;
+
+ __attribute__((noinline))
+static void
+determine_line_breaks (gunichar wc,
+ GUnicodeType type,
+ GUnicodeBreakType break_type,
+ gunichar prev_wc,
+ GUnicodeBreakType *pprev_break_type,
+ GUnicodeBreakType *pprev_prev_break_type,
+ GUnicodeBreakType next_break_type,
+ LineBreakType *pprev_LB_type,
+ JamoType jamo,
+ JamoType *pprev_jamo,
+ PangoLogAttr *attrs,
+ int i)
+{
+ gboolean can_break;
+ BreakOpportunity break_op;
+ GUnicodeBreakType row_break_type;
+ GUnicodeBreakType prev_break_type = *pprev_break_type;
+ GUnicodeBreakType prev_prev_break_type = *pprev_prev_break_type;
+ LineBreakType prev_LB_type = *pprev_LB_type;
break_op = BREAK_ALREADY_HANDLED;
@@ -1373,12 +1296,12 @@ pango_default_break (const gchar *text,
break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
{
if (prev_LB_type != LB_Numeric)
- prev_LB_type = LB_type;
+ *pprev_LB_type = LB_type;
/* else don't change the prev_LB_type */
}
else
{
- prev_LB_type = LB_type;
+ *pprev_LB_type = LB_type;
}
}
/* else don't change the prev_LB_type for Rule LB9 */
@@ -1388,8 +1311,8 @@ pango_default_break (const gchar *text,
{
if (prev_break_type != G_UNICODE_BREAK_SPACE)
{
- prev_prev_break_type = prev_break_type;
- prev_break_type = break_type;
+ *pprev_prev_break_type = prev_break_type;
+ *pprev_break_type = break_type;
}
/* else don't change the prev_break_type */
}
@@ -1403,20 +1326,33 @@ pango_default_break (const gchar *text,
prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
prev_break_type == G_UNICODE_BREAK_SPACE ||
prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
- prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
+ *pprev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
/* else don't change the prev_break_type for Rule LB9 */
- prev_jamo = jamo;
+ *pprev_jamo = jamo;
}
else
{
- prev_prev_break_type = prev_break_type;
- prev_break_type = break_type;
- prev_jamo = jamo;
+ *pprev_prev_break_type = prev_break_type;
+ *pprev_break_type = break_type;
+ *pprev_jamo = jamo;
}
+}
/* ---- Word breaks ---- */
+ __attribute__((noinline))
+static void
+determine_word_breaks (gunichar wc,
+ GUnicodeType type,
+ WordType *pcurrent_word_type,
+ gunichar *plast_word_letter,
+ PangoLogAttr *attrs,
+ int i)
+{
+ WordType current_word_type = *pcurrent_word_type;
+ gunichar last_word_letter = *plast_word_letter;
+
/* default to not a word start/end */
attrs[i].is_word_start = FALSE;
attrs[i].is_word_end = FALSE;
@@ -1457,19 +1393,19 @@ pango_default_break (const gchar *text,
attrs[i].is_word_end = TRUE;
}
}
- last_word_letter = wc;
+ *plast_word_letter = wc;
break;
case G_UNICODE_DECIMAL_NUMBER:
case G_UNICODE_LETTER_NUMBER:
case G_UNICODE_OTHER_NUMBER:
- last_word_letter = wc;
+ *plast_word_letter = wc;
break;
default:
/* Punctuation, control/format chars, etc. all end a word. */
attrs[i].is_word_end = TRUE;
- current_word_type = WordNone;
+ *pcurrent_word_type = WordNone;
break;
}
}
@@ -1483,16 +1419,16 @@ pango_default_break (const gchar *text,
case G_UNICODE_OTHER_LETTER:
case G_UNICODE_TITLECASE_LETTER:
case G_UNICODE_UPPERCASE_LETTER:
- current_word_type = WordLetters;
- last_word_letter = wc;
+ *pcurrent_word_type = WordLetters;
+ *plast_word_letter = wc;
attrs[i].is_word_start = TRUE;
break;
case G_UNICODE_DECIMAL_NUMBER:
case G_UNICODE_LETTER_NUMBER:
case G_UNICODE_OTHER_NUMBER:
- current_word_type = WordNumbers;
- last_word_letter = wc;
+ *pcurrent_word_type = WordNumbers;
+ *plast_word_letter = wc;
attrs[i].is_word_start = TRUE;
break;
@@ -1501,42 +1437,236 @@ pango_default_break (const gchar *text,
break;
}
}
+}
/* ---- Sentence breaks ---- */
+static void
+determine_sentence_breaks (gunichar wc,
+ GUnicodeType type,
+ int *last_sentence_start,
+ int *last_non_space,
+ PangoLogAttr *attrs,
+ int i)
{
-
/* default to not a sentence start/end */
attrs[i].is_sentence_start = FALSE;
attrs[i].is_sentence_end = FALSE;
/* maybe start sentence */
- if (last_sentence_start == -1 && !is_sentence_boundary)
- last_sentence_start = i - 1;
+ if (*last_sentence_start == -1 && !attrs[i].is_sentence_boundary)
+ *last_sentence_start = i - 1;
/* remember last non space character position */
if (i > 0 && !attrs[i - 1].is_white)
- last_non_space = i;
+ *last_non_space = i;
/* meets sentence end, mark both sentence start and end */
- if (last_sentence_start != -1 && is_sentence_boundary) {
- if (last_non_space != -1) {
- attrs[last_sentence_start].is_sentence_start = TRUE;
- attrs[last_non_space].is_sentence_end = TRUE;
+
+ if (*last_sentence_start != -1 && attrs[i].is_sentence_boundary) {
+ if (*last_non_space != -1) {
+ attrs[*last_sentence_start].is_sentence_start = TRUE;
+ attrs[*last_non_space].is_sentence_end = TRUE;
}
- last_sentence_start = -1;
- last_non_space = -1;
+ *last_sentence_start = -1;
+ *last_non_space = -1;
}
/* meets space character, move sentence start */
- if (last_sentence_start != -1 &&
- last_sentence_start == i - 1 &&
+ if (*last_sentence_start != -1 &&
+ *last_sentence_start == i - 1 &&
attrs[i - 1].is_white) {
- last_sentence_start++;
+ (*last_sentence_start)++;
}
}
+/**
+ * pango_default_break:
+ * @text: text to break. Must be valid UTF-8
+ * @length: length of text in bytes (may be -1 if @text is nul-terminated)
+ * @analysis: (nullable): a `PangoAnalysis` structure for the @text
+ * @attrs: logical attributes to fill in
+ * @attrs_len: size of the array passed as @attrs
+ *
+ * This is the default break algorithm.
+ *
+ * It applies Unicode rules without language-specific
+ * tailoring, therefore the @analysis argument is unused
+ * and can be %NULL.
+ *
+ * See [func@Pango.tailor_break] for language-specific breaks.
+ */
+void
+pango_default_break (const gchar *text,
+ gint length,
+ PangoAnalysis *analysis G_GNUC_UNUSED,
+ PangoLogAttr *attrs,
+ int attrs_len G_GNUC_UNUSED)
+{
+ /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
+ * the line breaking stuff is also in TR14 on unicode.org
+ */
+
+ /* This is a default break implementation that should work for nearly all
+ * languages. Language engines can override it optionally.
+ */
+
+ /* FIXME one cheesy optimization here would be to memset attrs to 0
+ * before we start, and then never assign %FALSE to anything
+ */
+
+ const gchar *next;
+ gint i;
+
+ gunichar prev_wc;
+ gunichar next_wc;
+
+ JamoType prev_jamo;
+
+ GUnicodeBreakType next_break_type;
+ GUnicodeBreakType prev_break_type;
+ GUnicodeBreakType prev_prev_break_type;
+
+ GraphemeBreakType prev_GB_type = GB_Other;
+ gboolean met_Extended_Pictographic = FALSE;
+
+ WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
+ gint prev_WB_i = -1;
+
+ SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
+ gint prev_SB_i = -1;
+
+ LineBreakType prev_LB_type = LB_Other;
+
+ WordType current_word_type = WordNone;
+ gunichar last_word_letter = 0;
+ gunichar base_character = 0;
+
+ gint last_sentence_start = -1;
+ gint last_non_space = -1;
+
+ gboolean almost_done = FALSE;
+ gboolean done = FALSE;
+
+ g_return_if_fail (length == 0 || text != NULL);
+ g_return_if_fail (attrs != NULL);
+
+ next = text;
+
+ prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+ prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
+ prev_wc = 0;
+ prev_jamo = NO_JAMO;
+
+ if (length == 0 || *text == '\0')
+ {
+ next_wc = PARAGRAPH_SEPARATOR;
+ almost_done = TRUE;
+ }
+ else
+ next_wc = g_utf8_get_char (next);
+
+ next_break_type = g_unichar_break_type (next_wc);
+ next_break_type = BREAK_TYPE_SAFE (next_break_type);
+
+ for (i = 0; !done ; i++)
+ {
+ GUnicodeType type;
+ gunichar wc;
+ GUnicodeBreakType break_type;
+ JamoType jamo;
+ gboolean makes_hangul_syllable;
+
+ /* Emoji extended pictographics */
+ gboolean is_Extended_Pictographic;
+
+ wc = next_wc;
+ break_type = next_break_type;
+
+ if (almost_done)
+ {
+ /*
+ * If we have already reached the end of @text g_utf8_next_char()
+ * may not increment next
+ */
+ next_wc = 0;
+ next_break_type = G_UNICODE_BREAK_UNKNOWN;
+ done = TRUE;
+ }
+ else
+ {
+ next = g_utf8_next_char (next);
+
+ if ((length >= 0 && next >= text + length) || *next == '\0')
+ {
+ /* This is how we fill in the last element (end position) of the
+ * attr array - assume there's a paragraph separators off the end
+ * of @text.
+ */
+ next_wc = PARAGRAPH_SEPARATOR;
+ almost_done = TRUE;
+ }
+ else
+ next_wc = g_utf8_get_char (next);
+
+ next_break_type = g_unichar_break_type (next_wc);
+ next_break_type = BREAK_TYPE_SAFE (next_break_type);
+ }
+
+ type = g_unichar_type (wc);
+ jamo = JAMO_TYPE (break_type);
+
+ /* Determine whether this forms a Hangul syllable with prev. */
+ if (jamo == NO_JAMO)
+ makes_hangul_syllable = FALSE;
+ else
+ {
+ JamoType prev_end = HangulJamoProps[prev_jamo].end ;
+ JamoType this_start = HangulJamoProps[ jamo].start;
+
+ /* See comments before IS_JAMO */
+ makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
+ }
+
+ is_Extended_Pictographic =
+ _pango_Is_Emoji_Extended_Pictographic (wc);
+
+ determine_whitespace (wc, type, attrs, i);
+
+ determine_grapheme_boundaries (wc, type, prev_wc,
+ makes_hangul_syllable,
+ is_Extended_Pictographic,
+ &met_Extended_Pictographic,
+ &prev_GB_type,
+ attrs, i);
+
+ determine_backspace (prev_wc, base_character, attrs, i);
+
+ determine_word_boundaries (wc, type, break_type, prev_wc,
+ is_Extended_Pictographic,
+ &prev_WB_type, &prev_WB_i,
+ &prev_prev_WB_type,
+ attrs, i);
+
+ determine_sentence_boundaries (wc, type, break_type, prev_wc,
+ &prev_SB_type, &prev_SB_i,
+ &prev_prev_SB_type,
+ &last_sentence_start,
+ attrs, i, done);
+
+ determine_line_breaks (wc, type, break_type, prev_wc,
+ &prev_break_type, &prev_prev_break_type,
+ next_break_type,
+ &prev_LB_type, jamo, &prev_jamo,
+ attrs, i);
+
+ determine_word_breaks (wc, type, &current_word_type, &last_word_letter,
+ attrs, i);
+
+ determine_sentence_breaks (wc, type, &last_sentence_start, &last_non_space,
+ attrs, i);
+
prev_wc = wc;
/* wc might not be a valid Unicode base character, but really all we
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]