[pango/break-tailoring: 19/19] Add segmentation attributes




commit 84cd9a305af6e6d91e83320f12c48e0f281b7cd4
Author: Matthias Clasen <mclasen redhat com>
Date:   Sat Aug 21 21:01:29 2021 -0400

    Add segmentation attributes
    
    Add attributes that let us override word and
    sentence boundaries.

 docs/pango_markup.md     |   6 ++
 pango/break.c            | 175 +++++++++++++++++++++++++++++++++++++++++++++++
 pango/pango-attributes.c |  56 +++++++++++++++
 pango/pango-attributes.h |   9 +++
 pango/pango-layout.c     |   2 +
 pango/pango-markup.c     |  21 ++++++
 tests/test-common.c      |   2 +
 tests/testattributes.c   |   6 +-
 8 files changed, 276 insertions(+), 1 deletion(-)
---
diff --git a/docs/pango_markup.md b/docs/pango_markup.md
index 6c421795..9187c1a9 100644
--- a/docs/pango_markup.md
+++ b/docs/pango_markup.md
@@ -209,6 +209,12 @@ text_transform
   'none', 'lowercase', 'uppercase' or 'capitalize'. Support for text transformation
   was added in Pango 1.50.
 
+segment
+: Overrides word or sentence boundaries. The value can be 'word' or 'sentence',
+  to indicate that the span should be treated as a single word or sentence.
+  Overlapping segments will be split to allow this.
+  Available since Pango 1.50.
+
 ## Convenience Tags
 
 `<b>`
diff --git a/pango/break.c b/pango/break.c
index 053fb329..8ecb34ef 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -1634,11 +1634,15 @@ break_attrs (const char   *text,
 {
   PangoAttrList allow_breaks;
   PangoAttrList line_breaks;
+  PangoAttrList words;
+  PangoAttrList sentences;
   GSList *l;
   gboolean tailored = FALSE;
 
   _pango_attr_list_init (&allow_breaks);
   _pango_attr_list_init (&line_breaks);
+  _pango_attr_list_init (&words);
+  _pango_attr_list_init (&sentences);
 
   for (l = attributes; l; l = l->next)
     {
@@ -1659,6 +1663,10 @@ break_attrs (const char   *text,
         pango_attr_list_insert (&allow_breaks, pango_attribute_copy (attr));
       else if (attr->klass->type == PANGO_ATTR_LINE_BREAK)
         pango_attr_list_insert (&line_breaks, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_WORD)
+        pango_attr_list_insert (&words, pango_attribute_copy (attr));
+      else if (attr->klass->type == PANGO_ATTR_SENTENCE)
+        pango_attr_list_insert (&sentences, pango_attribute_copy (attr));
     }
 
   if (_pango_attr_list_has_attributes (&allow_breaks))
@@ -1756,8 +1764,175 @@ break_attrs (const char   *text,
       _pango_attr_iterator_destroy (&iter);
     }
 
+  if (_pango_attr_list_has_attributes (&words))
+    {
+      PangoAttrIterator iter;
+
+      _pango_attr_list_get_iterator (&words, &iter);
+      do
+        {
+          const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_WORD);
+          int start, end;
+          int start_pos, end_pos;
+          int pos;
+
+          if (!attr)
+            continue;
+
+          start = attr->start_index;
+          end = attr->end_index;
+          if (start < offset)
+            start_pos = 0;
+          else
+            start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+          if (end >= offset + length)
+            end_pos = log_attrs_len;
+          else
+            end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+          if ((start >= offset && !log_attrs[start_pos].is_cursor_position) ||
+              (end < offset + length && !log_attrs[end_pos].is_cursor_position))
+            {
+              g_warning ("Can't place word boundary on non-grapheme break");
+              continue;
+            }
+
+          for (pos = start_pos + 1; pos < end_pos; pos++)
+            {
+              log_attrs[pos].is_word_start = FALSE;
+              log_attrs[pos].is_word_end = FALSE;
+              log_attrs[pos].is_word_boundary = FALSE;
+              log_attrs[pos].is_sentence_start = FALSE;
+              log_attrs[pos].is_sentence_end = FALSE;
+              log_attrs[pos].is_sentence_boundary = FALSE;
+
+              tailored = TRUE;
+            }
+          if (start >= offset)
+            {
+              gboolean in_word = FALSE;
+              for (pos = start - 1; pos >= offset; pos--)
+                {
+                  if (log_attrs[pos].is_word_end)
+                    break;
+                  if (log_attrs[pos].is_word_start)
+                    {
+                      in_word = TRUE;
+                      break;
+                    }
+                }
+              log_attrs[start_pos].is_word_start = TRUE;
+              log_attrs[start_pos].is_word_end = in_word;
+              log_attrs[start_pos].is_word_boundary = TRUE;
+            }
+          if (end < offset + length)
+            {
+              gboolean in_word = FALSE;
+              for (pos = end + 1; pos < offset + length; pos++)
+                {
+                  if (log_attrs[pos].is_word_start)
+                    break;
+                  if (log_attrs[pos].is_word_end)
+                    {
+                      in_word = TRUE;
+                      break;
+                    }
+                }
+              log_attrs[end_pos].is_word_start = in_word;
+              log_attrs[end_pos].is_word_end = TRUE;
+              log_attrs[end_pos].is_word_boundary = TRUE;
+            }
+        }
+      while (pango_attr_iterator_next (&iter));
+
+      _pango_attr_iterator_destroy (&iter);
+    }
+
+  if (_pango_attr_list_has_attributes (&sentences))
+    {
+      PangoAttrIterator iter;
+
+      _pango_attr_list_get_iterator (&sentences, &iter);
+      do
+        {
+          const PangoAttribute *attr = pango_attr_iterator_get (&iter, PANGO_ATTR_SENTENCE);
+          int start, end;
+          int start_pos, end_pos;
+          int pos;
+
+          if (!attr)
+            continue;
+
+          start = attr->start_index;
+          end = attr->end_index;
+          if (start < offset)
+            start_pos = 0;
+          else
+            start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
+          if (end >= offset + length)
+            end_pos = log_attrs_len;
+          else
+            end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
+
+          if ((start >= offset && !log_attrs[start_pos].is_word_boundary) ||
+              (end < offset + length && !log_attrs[end_pos].is_word_boundary))
+            {
+              g_warning ("Can't place sentence boundary on non-word boundary");
+              continue;
+            }
+
+          for (pos = start_pos + 1; pos < end_pos; pos++)
+            {
+              log_attrs[pos].is_sentence_start = FALSE;
+              log_attrs[pos].is_sentence_end = FALSE;
+              log_attrs[pos].is_sentence_boundary = FALSE;
+
+              tailored = TRUE;
+            }
+          if (start >= offset)
+            {
+              gboolean in_sentence = FALSE;
+              for (pos = start - 1; pos >= offset; pos--)
+                {
+                  if (log_attrs[pos].is_sentence_end)
+                    break;
+                  if (log_attrs[pos].is_sentence_start)
+                    {
+                      in_sentence = TRUE;
+                      break;
+                    }
+                }
+              log_attrs[start_pos].is_sentence_start = TRUE;
+              log_attrs[start_pos].is_sentence_end = in_sentence;
+              log_attrs[start_pos].is_sentence_boundary = TRUE;
+            }
+          if (end < offset + length)
+            {
+              gboolean in_sentence = FALSE;
+              for (pos = end + 1; pos < offset + length; pos++)
+                {
+                  if (log_attrs[pos].is_sentence_start)
+                    break;
+                  if (log_attrs[pos].is_sentence_end)
+                    {
+                      in_sentence = TRUE;
+                      break;
+                    }
+                }
+              log_attrs[end_pos].is_sentence_start = in_sentence;
+              log_attrs[end_pos].is_sentence_end = TRUE;
+              log_attrs[end_pos].is_sentence_boundary = TRUE;
+            }
+        }
+      while (pango_attr_iterator_next (&iter));
+
+      _pango_attr_iterator_destroy (&iter);
+    }
+
   _pango_attr_list_destroy (&allow_breaks);
   _pango_attr_list_destroy (&line_breaks);
+  _pango_attr_list_destroy (&words);
+  _pango_attr_list_destroy (&sentences);
 
   return tailored;
 }
diff --git a/pango/pango-attributes.c b/pango/pango-attributes.c
index b2dce858..2831c4fd 100644
--- a/pango/pango-attributes.c
+++ b/pango/pango-attributes.c
@@ -1391,6 +1391,60 @@ pango_attr_line_break_new (PangoLineBreak before,
   return pango_attr_int_new (&klass, before | (after << 16));
 }
 
+/**
+ * pango_attr_word_new:
+ *
+ * Marks the range of the attribute as a single word.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_word_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_WORD,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
+/**
+ * pango_attr_sentence_new:
+ *
+ * Marks the range of the attribute as a single sentence.
+ *
+ * Note that this may require adjustments to word and
+ * sentence classification around the range.
+ *
+ * Return value: (transfer full): the newly allocated
+ *   `PangoAttribute`, which should be freed with
+ *   [method@Pango.Attribute.destroy]
+ *
+ * Since: 1.50
+ */
+PangoAttribute *
+pango_attr_sentence_new (void)
+{
+  static const PangoAttrClass klass = {
+    PANGO_ATTR_SENTENCE,
+    pango_attr_int_copy,
+    pango_attr_int_destroy,
+    pango_attr_int_equal,
+  };
+
+  return pango_attr_int_new (&klass, 0);
+}
+
 /**
  * pango_attr_overline_new:
  * @overline: the overline style
@@ -1568,6 +1622,8 @@ pango_attribute_as_int (PangoAttribute *attr)
     case PANGO_ATTR_OVERLINE:
     case PANGO_ATTR_ABSOLUTE_LINE_HEIGHT:
     case PANGO_ATTR_TEXT_TRANSFORM:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
       return (PangoAttrInt *)attr;
 
     default:
diff --git a/pango/pango-attributes.h b/pango/pango-attributes.h
index 9180e960..8e99e5f0 100644
--- a/pango/pango-attributes.h
+++ b/pango/pango-attributes.h
@@ -79,6 +79,8 @@ typedef struct _PangoAttrFontFeatures PangoAttrFontFeatures;
  * @PANGO_ATTR_ABSOLUTE_LINE_HEIGHT: line height ([struct@Pango.AttrInt]). Since: 1.50
  * @PANGO_ATTR_LINE_BREAK: override line breaks at the ends of the range ([struct@Pango.AttrInt]). Since 1.50
  * @PANGO_ATTR_ALLOW_LINE_BREAKS: what algorithmically determined line breaks to allow ([struct@Pango.AttrInt]). Since 1.50
+ * @PANGO_ATTR_WORD: override segmentation to classify the range of the attribute as a single word ([struct@Pango.AttrInt]). Since 1.50
+ * @PANGO_ATTR_SENTENCE: override segmentation to classify the range of the attribute as a single sentence ([struct@Pango.AttrInt]). Since 1.50
  *
  * The `PangoAttrType` distinguishes between different types of attributes.
  *
@@ -125,6 +127,8 @@ typedef enum
   PANGO_ATTR_TEXT_TRANSFORM,    /* PangoAttrInt */
   PANGO_ATTR_LINE_BREAK,        /* PangoAttrInt */
   PANGO_ATTR_ALLOW_LINE_BREAKS,  /* PangoAttrInt */
+  PANGO_ATTR_WORD,              /* PangoAttrInt */
+  PANGO_ATTR_SENTENCE,          /* PangoAttrInt */
 } PangoAttrType;
 
 /**
@@ -570,6 +574,11 @@ PANGO_AVAILABLE_IN_1_50
 PangoAttribute *        pango_attr_line_break_new               (PangoLineBreak               before,
                                                                  PangoLineBreak               after);
 
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_word_new                     (void);
+PANGO_AVAILABLE_IN_1_50
+PangoAttribute *        pango_attr_sentence_new                 (void);
+
 PANGO_AVAILABLE_IN_1_44
 PangoAttribute *        pango_attr_insert_hyphens_new           (gboolean                     insert_hyphens);
 PANGO_AVAILABLE_IN_1_46
diff --git a/pango/pango-layout.c b/pango/pango-layout.c
index f9005b72..14a6066d 100644
--- a/pango/pango-layout.c
+++ b/pango/pango-layout.c
@@ -4326,6 +4326,8 @@ affects_break_or_shape (PangoAttribute *attr,
     case PANGO_ATTR_ALLOW_BREAKS:
     case PANGO_ATTR_ALLOW_LINE_BREAKS:
     case PANGO_ATTR_LINE_BREAK:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
     /* Affects shaping */
     case PANGO_ATTR_INSERT_HYPHENS:
     case PANGO_ATTR_FONT_FEATURES:
diff --git a/pango/pango-markup.c b/pango/pango-markup.c
index ab0a9487..d62d95ab 100644
--- a/pango/pango-markup.c
+++ b/pango/pango-markup.c
@@ -1232,6 +1232,7 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
   const char *text_transform = NULL;
   const char *break_before = NULL;
   const char *break_after = NULL;
+  const char *segment = NULL;
 
   g_markup_parse_context_get_position (context,
                                       &line_number, &char_number);
@@ -1301,6 +1302,7 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
        CHECK_ATTRIBUTE (strikethrough);
        CHECK_ATTRIBUTE (strikethrough_color);
        CHECK_ATTRIBUTE (style);
+       CHECK_ATTRIBUTE (segment);
        break;
       case 't':
         CHECK_ATTRIBUTE (text_transform);
@@ -1760,6 +1762,25 @@ span_parse_func     (MarkupData            *md G_GNUC_UNUSED,
       add_attribute (tag, pango_attr_insert_hyphens_new (b));
     }
 
+  if (G_UNLIKELY (segment))
+    {
+      if (strcmp (segment, "word") == 0)
+        add_attribute (tag, pango_attr_word_new ());
+      else if (strcmp (segment, "sentence") == 0)
+        add_attribute (tag, pango_attr_sentence_new ());
+      else
+        {
+          g_set_error (error,
+                       G_MARKUP_ERROR,
+                       G_MARKUP_ERROR_INVALID_CONTENT,
+                       _("Value of 'segment' attribute on <span> tag on line %d "
+                         "could not be parsed; should be one of 'word' or "
+                         "'sentence', not '%s'"),
+                       line_number, segment);
+          goto error;
+        }
+    }
+
   return TRUE;
 
  error:
diff --git a/tests/test-common.c b/tests/test-common.c
index c93197eb..1a9c1154 100644
--- a/tests/test-common.c
+++ b/tests/test-common.c
@@ -145,6 +145,8 @@ print_attribute (PangoAttribute *attr, GString *string)
     case PANGO_ATTR_SHOW:
     case PANGO_ATTR_TEXT_TRANSFORM:
     case PANGO_ATTR_ABSOLUTE_LINE_HEIGHT:
+    case PANGO_ATTR_WORD:
+    case PANGO_ATTR_SENTENCE:
       g_string_append_printf (string, "%d", ((PangoAttrInt *)attr)->value);
       break;
     case PANGO_ATTR_LINE_BREAK:
diff --git a/tests/testattributes.c b/tests/testattributes.c
index d491a380..fd05aea0 100644
--- a/tests/testattributes.c
+++ b/tests/testattributes.c
@@ -77,6 +77,8 @@ test_attributes_basic (void)
   test_copy (pango_attr_line_height_new_absolute (3000));
   test_copy (pango_attr_line_break_new (PANGO_LINE_BREAK_CHAR, PANGO_LINE_BREAK_MANDATORY));
   test_copy (pango_attr_allow_line_breaks_new (PANGO_LINE_BREAK_LINE));
+  test_copy (pango_attr_word_new ());
+  test_copy (pango_attr_sentence_new ());
 }
 
 static void
@@ -127,7 +129,7 @@ test_binding (PangoAttribute *attr)
     INVALID, LANGUAGE, STRING, INT, INT, INT, INT, SIZE, FONT_DESC, COLOR,
     COLOR, INT, INT, INT, SHAPE, FLOAT, INT, INT, COLOR, COLOR, SIZE,
     INT, INT, FONT_FEATURES, INT, INT, INT, INT, INT, INT, COLOR, FLOAT,
-    INT, INT, INT, INT
+    INT, INT, INT, INT, INT, INT
   };
 
   switch (attr_base[attr->klass->type])
@@ -211,6 +213,8 @@ test_binding_helpers (void)
   test_binding (pango_attr_line_height_new_absolute (3000));
   test_binding (pango_attr_line_break_new (PANGO_LINE_BREAK_CHAR, PANGO_LINE_BREAK_MANDATORY));
   test_binding (pango_attr_allow_line_breaks_new (PANGO_LINE_BREAK_LINE));
+  test_binding (pango_attr_word_new ());
+  test_binding (pango_attr_sentence_new ());
 }
 
 static void


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]