pango  1.44.7
About: Pango is a library for laying out and rendering text, with an emphasis on internationalization (an offshoot of the GTK+ and GNOME projects).
  Fossies Dox: pango-1.44.7.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

break.c
Go to the documentation of this file.
1 /* Pango
2  * break.c:
3  *
4  * Copyright (C) 1999 Red Hat Software
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public
17  * License along with this library; if not, write to the
18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19  * Boston, MA 02111-1307, USA.
20  */
21 
22 #include "config.h"
23 
24 #include "pango-break.h"
25 #include "pango-script-private.h"
26 #include "pango-emoji-private.h"
27 #include "pango-break-table.h"
28 #include "pango-impl-utils.h"
29 #include <string.h>
30 
31 #define PARAGRAPH_SEPARATOR 0x2029 /* code point U+2029, used as the virtual character past the end of the text */
32 #define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9" /* the same code point, U+2029, encoded as UTF-8 */
33 
34 /* See http://www.unicode.org/unicode/reports/tr14/ if you hope
35  * to understand the line breaking code.
36  */
37 
38 typedef enum
39 {
40  BREAK_ALREADY_HANDLED, /* didn't use the table */
41  BREAK_PROHIBITED, /* no break, even if spaces intervene */
42  BREAK_IF_SPACES, /* "indirect break" (only if there are spaces) */
43  BREAK_ALLOWED /* "direct break" (can always break here) */
44  /* TR 14 has two more break-opportunity classes,
45  * "indirect break opportunity for combining marks following a space"
46  * and "prohibited break for combining marks"
47  * but we handle that inline in the code.
48  */
50 
51 /* Pass through break types up to G_UNICODE_BREAK_ZERO_WIDTH_JOINER and map anything larger to G_UNICODE_BREAK_UNKNOWN; the upper bound needs to stay in sync with glib/gunicode.h. Note that btype is evaluated twice. */
52 #define BREAK_TYPE_SAFE(btype) \
53  ((btype) <= G_UNICODE_BREAK_ZERO_WIDTH_JOINER ? (btype) : G_UNICODE_BREAK_UNKNOWN)
54 
55 
56 /*
57  * Hangul Conjoining Jamo handling.
58  *
59  * The way we implement it is just a bit different from TR14,
60  * but produces the same results.
61  * The same algorithm is also used in TR29 for cluster boundaries.
62  *
63  */
64 
65 
66 /* States of the Hangul syllable system: one per Hangul line-break class, plus NO_JAMO for every other character.
67  * The enumerator order is significant: JAMO_TYPE() computes a value by subtraction and HangulJamoProps[] is indexed by it. */
68 typedef enum
69 {
70  JAMO_L, /* G_UNICODE_BREAK_HANGUL_L_JAMO */
71  JAMO_V, /* G_UNICODE_BREAK_HANGUL_V_JAMO */
72  JAMO_T, /* G_UNICODE_BREAK_HANGUL_T_JAMO */
73  JAMO_LV, /* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
74  JAMO_LVT, /* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
75  NO_JAMO /* Other */
76 } JamoType;
77 
78 /* There are Hangul syllables encoded as characters, that act like a
79  * sequence of Jamos. For each character we define a JamoType
80  * that the character starts with, and one that it ends with. This
81  * decomposes JAMO_LV and JAMO_LVT to simple other JAMOs. So for
82  * example, a character with LineBreak type
83  * G_UNICODE_BREAK_HANGUL_LV_SYLLABLE has start=JAMO_L and end=JAMO_V.
84  */
85 typedef struct _CharJamoProps
86 {
89 
90 /* Table indexed by JamoType. Each entry holds only simple JamoTypes (no LV or LVT)
91  * or NO_JAMO; presumably the pair is { start, end } -- confirm against the CharJamoProps member order.
92  */
93 static const CharJamoProps HangulJamoProps[] = {
94  {JAMO_L, JAMO_L}, /* JAMO_L */
95  {JAMO_V, JAMO_V}, /* JAMO_V */
96  {JAMO_T, JAMO_T}, /* JAMO_T */
97  {JAMO_L, JAMO_V}, /* JAMO_LV */
98  {JAMO_L, JAMO_T}, /* JAMO_LVT */
99  {NO_JAMO, NO_JAMO} /* NO_JAMO */
100 };
101 
102 /* A character forms a syllable with the previous character if and only if
103  * JamoType(this) is not NO_JAMO and HangulJamoProps[JamoType(prev)].end and
104  * HangulJamoProps[JamoType(this)].start are equal, or the former is one
105  * less than the latter.
106  *
107  * IS_JAMO(btype) is true when btype lies in the range HANGUL_L_JAMO..HANGUL_LVT_SYLLABLE.
108  * JAMO_TYPE(btype) maps such a break type to a JamoType by subtracting G_UNICODE_BREAK_HANGUL_L_JAMO (this assumes the glib values are
109  * consecutive and in JamoType order) and maps anything else to NO_JAMO. btype is parenthesized at every use but is still evaluated more than once. */
110 #define IS_JAMO(btype) \
111  (((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
112  ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
113 #define JAMO_TYPE(btype) \
114  (IS_JAMO(btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)
115 
116 /* Types of Japanese characters, tested by code point range */
117 #define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
118 #define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF) /* NOTE(review): this range looks like the Kangxi Radicals block rather than the main CJK ideographs -- confirm intent */
119 #define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
120 #define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)
121 /* Tests combined by BACKSPACE_DELETES_CHARACTER below; the range macros evaluate wc more than once. */
122 #define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
123 #define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
124 #define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
125 #define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
126 #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
127 #define EMOJI(wc) (_pango_Is_Emoji_Base_Character (wc))
128 #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA (wc) && !HANGUL (wc) && !EMOJI (wc)) /* true only when wc matches none of LATIN, CYRILLIC, GREEK, KANA, HANGUL and EMOJI */
129 
130 /* Previously "123foo" was two words. But in UAX 29 of Unicode,
131  * we now don't break words between consecutive letters and numbers.
132  */
133 typedef enum
134 {
138 } WordType;
139 
140 
155 void
156 pango_default_break (const gchar *text,
157  gint length,
158  PangoAnalysis *analysis G_GNUC_UNUSED,
159  PangoLogAttr *attrs,
160  int attrs_len G_GNUC_UNUSED)
161 {
162  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
163  * the line breaking stuff is also in TR14 on unicode.org
164  */
165 
166  /* This is a default break implementation that should work for nearly all
167  * languages. Language engines can override it optionally.
168  */
169 
170  /* FIXME one cheesy optimization here would be to memset attrs to 0
171  * before we start, and then never assign %FALSE to anything
172  */
173 
174  const gchar *next;
175  gint i;
176 
177  gunichar prev_wc;
178  gunichar next_wc;
179 
180  JamoType prev_jamo;
181 
182  GUnicodeBreakType next_break_type;
183  GUnicodeBreakType prev_break_type;
184  GUnicodeBreakType prev_prev_break_type;
185 
186  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
187  typedef enum
188  {
189  GB_Other,
190  GB_ControlCRLF,
191  GB_Extend,
192  GB_ZWJ,
193  GB_Prepend,
194  GB_SpacingMark,
195  GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
196  /* Use state machine to handle emoji sequence */
197  /* Rule GB12 and GB13 */
198  GB_RI_Odd, /* Meets odd number of RI */
199  GB_RI_Even, /* Meets even number of RI */
200  } GraphemeBreakType;
201  GraphemeBreakType prev_GB_type = GB_Other;
202  gboolean met_Extended_Pictographic = FALSE;
203 
204  /* See Word_Break Property Values table of UAX#29 */
205  typedef enum
206  {
207  WB_Other,
208  WB_NewlineCRLF,
209  WB_ExtendFormat,
210  WB_Katakana,
211  WB_Hebrew_Letter,
212  WB_ALetter,
213  WB_MidNumLet,
214  WB_MidLetter,
215  WB_MidNum,
216  WB_Numeric,
217  WB_ExtendNumLet,
218  WB_RI_Odd,
219  WB_RI_Even,
220  WB_WSegSpace,
221  } WordBreakType;
222  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
223  gint prev_WB_i = -1;
224 
225  /* See Sentence_Break Property Values table of UAX#29 */
226  typedef enum
227  {
228  SB_Other,
229  SB_ExtendFormat,
230  SB_ParaSep,
231  SB_Sp,
232  SB_Lower,
233  SB_Upper,
234  SB_OLetter,
235  SB_Numeric,
236  SB_ATerm,
237  SB_SContinue,
238  SB_STerm,
239  SB_Close,
240  /* Rules SB8 and SB8a */
241  SB_ATerm_Close_Sp,
242  SB_STerm_Close_Sp,
243  } SentenceBreakType;
244  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
245  gint prev_SB_i = -1;
246 
247  /* Rule LB25 with Example 7 of Customization */
248  typedef enum
249  {
250  LB_Other,
251  LB_Numeric,
252  LB_Numeric_Close,
253  LB_RI_Odd,
254  LB_RI_Even,
255  } LineBreakType;
256  LineBreakType prev_LB_type = LB_Other;
257 
258  WordType current_word_type = WordNone;
259  gunichar last_word_letter = 0;
260  gunichar base_character = 0;
261 
262  gint last_sentence_start = -1;
263  gint last_non_space = -1;
264 
265  gboolean almost_done = FALSE;
266  gboolean done = FALSE;
267 
268  g_return_if_fail (length == 0 || text != NULL);
269  g_return_if_fail (attrs != NULL);
270 
271  next = text;
272 
273  prev_break_type = G_UNICODE_BREAK_UNKNOWN;
274  prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
275  prev_wc = 0;
276  prev_jamo = NO_JAMO;
277 
278  if (length == 0 || *text == '\0')
279  {
280  next_wc = PARAGRAPH_SEPARATOR;
281  almost_done = TRUE;
282  }
283  else
284  next_wc = g_utf8_get_char (next);
285 
286  next_break_type = g_unichar_break_type (next_wc);
287  next_break_type = BREAK_TYPE_SAFE (next_break_type);
288 
289  for (i = 0; !done ; i++)
290  {
291  GUnicodeType type;
292  gunichar wc;
293  GUnicodeBreakType break_type;
294  GUnicodeBreakType row_break_type;
295  BreakOpportunity break_op;
296  JamoType jamo;
297  gboolean makes_hangul_syllable;
298 
299  /* UAX#29 boundaries */
300  gboolean is_grapheme_boundary;
301  gboolean is_word_boundary;
302  gboolean is_sentence_boundary;
303 
304  /* Emoji extended pictographics */
305  gboolean is_Extended_Pictographic;
306 
307 
308  wc = next_wc;
309  break_type = next_break_type;
310 
311  if (almost_done)
312  {
313  /*
314  * If we have already reached the end of @text g_utf8_next_char()
315  * may not increment next
316  */
317  next_wc = 0;
318  next_break_type = G_UNICODE_BREAK_UNKNOWN;
319  done = TRUE;
320  }
321  else
322  {
323  next = g_utf8_next_char (next);
324 
325  if ((length >= 0 && next >= text + length) || *next == '\0')
326  {
327  /* This is how we fill in the last element (end position) of the
328  * attr array - assume there's a paragraph separator off the end
329  * of @text.
330  */
331  next_wc = PARAGRAPH_SEPARATOR;
332  almost_done = TRUE;
333  }
334  else
335  next_wc = g_utf8_get_char (next);
336 
337  next_break_type = g_unichar_break_type (next_wc);
338  next_break_type = BREAK_TYPE_SAFE (next_break_type);
339  }
340 
341  type = g_unichar_type (wc);
342  jamo = JAMO_TYPE (break_type);
343 
344  /* Determine whether this forms a Hangul syllable with prev. */
345  if (jamo == NO_JAMO)
346  makes_hangul_syllable = FALSE;
347  else
348  {
349  JamoType prev_end = HangulJamoProps[prev_jamo].end ;
350  JamoType this_start = HangulJamoProps[ jamo].start;
351 
352  /* See comments before IS_JAMO */
353  makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
354  }
355 
356  switch (type)
357  {
358  case G_UNICODE_SPACE_SEPARATOR:
359  case G_UNICODE_LINE_SEPARATOR:
360  case G_UNICODE_PARAGRAPH_SEPARATOR:
361  attrs[i].is_white = TRUE;
362  break;
363  default:
364  if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
365  attrs[i].is_white = TRUE;
366  else
367  attrs[i].is_white = FALSE;
368  break;
369  }
370 
371  /* Just few spaces have variable width. So explicitly mark them.
372  */
373  attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);
374  is_Extended_Pictographic =
376 
377 
378  /* ---- UAX#29 Grapheme Boundaries ---- */
379  {
380  GraphemeBreakType GB_type;
381 
382  /* Find the GraphemeBreakType of wc */
383  GB_type = GB_Other;
384  switch ((int) type)
385  {
386  case G_UNICODE_FORMAT:
387  if (G_UNLIKELY (wc == 0x200C))
388  {
389  GB_type = GB_Extend;
390  break;
391  }
392  if (G_UNLIKELY (wc == 0x200D))
393  {
394  GB_type = GB_ZWJ;
395  break;
396  }
397  if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) ||
398  wc == 0x6DD ||
399  wc == 0x70F ||
400  wc == 0x8E2 ||
401  wc == 0xD4E ||
402  wc == 0x110BD ||
403  (wc >= 0x111C2 && wc <= 0x111C3)))
404  {
405  GB_type = GB_Prepend;
406  break;
407  }
408  /* fall through */
409  case G_UNICODE_CONTROL:
410  case G_UNICODE_LINE_SEPARATOR:
411  case G_UNICODE_PARAGRAPH_SEPARATOR:
412  case G_UNICODE_SURROGATE:
413  GB_type = GB_ControlCRLF;
414  break;
415 
416  case G_UNICODE_UNASSIGNED:
417  /* Unassigned default ignorables */
418  if ((wc >= 0xFFF0 && wc <= 0xFFF8) ||
419  (wc >= 0xE0000 && wc <= 0xE0FFF))
420  {
421  GB_type = GB_ControlCRLF;
422  break;
423  }
424  /* fall through */
425 
426  case G_UNICODE_OTHER_LETTER:
427  if (makes_hangul_syllable)
428  GB_type = GB_InHangulSyllable;
429  break;
430 
431  case G_UNICODE_MODIFIER_LETTER:
432  if (wc >= 0xFF9E && wc <= 0xFF9F)
433  GB_type = GB_Extend; /* Other_Grapheme_Extend */
434  break;
435 
436  case G_UNICODE_SPACING_MARK:
437  GB_type = GB_SpacingMark; /* SpacingMark */
438  if (wc >= 0x0900)
439  {
440  if (wc == 0x09BE || wc == 0x09D7 ||
441  wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
442  wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
443  wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
444  wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
445  GB_type = GB_Extend; /* Other_Grapheme_Extend */
446  }
447  break;
448 
449  case G_UNICODE_ENCLOSING_MARK:
450  case G_UNICODE_NON_SPACING_MARK:
451  GB_type = GB_Extend; /* Grapheme_Extend */
452  break;
453 
454  case G_UNICODE_OTHER_SYMBOL:
455  if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
456  {
457  if (prev_GB_type == GB_RI_Odd)
458  GB_type = GB_RI_Even;
459  else if (prev_GB_type == GB_RI_Even)
460  GB_type = GB_RI_Odd;
461  else
462  GB_type = GB_RI_Odd;
463  break;
464  }
465  break;
466 
467  case G_UNICODE_MODIFIER_SYMBOL:
468  if (wc >= 0x1F3FB && wc <= 0x1F3FF)
469  GB_type = GB_Extend;
470  break;
471  }
472 
473  /* Rule GB11 */
474  if (met_Extended_Pictographic)
475  {
476  if (GB_type == GB_Extend)
477  met_Extended_Pictographic = TRUE;
478  else if (_pango_Is_Emoji_Extended_Pictographic (prev_wc) &&
479  GB_type == GB_ZWJ)
480  met_Extended_Pictographic = TRUE;
481  else if (prev_GB_type == GB_Extend && GB_type == GB_ZWJ)
482  met_Extended_Pictographic = TRUE;
483  else if (prev_GB_type == GB_ZWJ && is_Extended_Pictographic)
484  met_Extended_Pictographic = TRUE;
485  else
486  met_Extended_Pictographic = FALSE;
487  }
488 
489  /* Grapheme Cluster Boundary Rules */
490  is_grapheme_boundary = TRUE; /* Rule GB999 */
491 
492  /* We apply Rules GB1 and GB2 at the end of the function */
493  if (wc == '\n' && prev_wc == '\r')
494  is_grapheme_boundary = FALSE; /* Rule GB3 */
495  else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
496  is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
497  else if (GB_type == GB_InHangulSyllable)
498  is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
499  else if (GB_type == GB_Extend)
500  {
501  is_grapheme_boundary = FALSE; /* Rule GB9 */
502  }
503  else if (GB_type == GB_ZWJ)
504  is_grapheme_boundary = FALSE; /* Rule GB9 */
505  else if (GB_type == GB_SpacingMark)
506  is_grapheme_boundary = FALSE; /* Rule GB9a */
507  else if (prev_GB_type == GB_Prepend)
508  is_grapheme_boundary = FALSE; /* Rule GB9b */
509  else if (is_Extended_Pictographic)
510  { /* Rule GB11 */
511  if (prev_GB_type == GB_ZWJ && met_Extended_Pictographic)
512  is_grapheme_boundary = FALSE;
513  }
514  else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
515  is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
516 
517  if (is_Extended_Pictographic)
518  met_Extended_Pictographic = TRUE;
519 
520  attrs[i].is_cursor_position = is_grapheme_boundary;
521  /* If this is a grapheme boundary, we have to decide if backspace
522  * deletes a character or the whole grapheme cluster */
523  if (is_grapheme_boundary)
524  {
525  attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
526 
527  /* Dependent Vowels for Indic language */
528  if (_pango_is_Virama (prev_wc) ||
529  _pango_is_Vowel_Dependent (prev_wc))
530  attrs[i].backspace_deletes_character = TRUE;
531  }
532  else
533  attrs[i].backspace_deletes_character = FALSE;
534 
535  prev_GB_type = GB_type;
536  }
537 
538  /* ---- UAX#29 Word Boundaries ---- */
539  {
540  is_word_boundary = FALSE;
541  if (is_grapheme_boundary ||
542  G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
543  {
544  PangoScript script;
545  WordBreakType WB_type;
546 
547  script = (PangoScript)g_unichar_get_script (wc);
548 
549  /* Find the WordBreakType of wc */
550  WB_type = WB_Other;
551 
552  if (script == PANGO_SCRIPT_KATAKANA)
553  WB_type = WB_Katakana;
554 
555  if (script == PANGO_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER)
556  WB_type = WB_Hebrew_Letter;
557 
558  if (WB_type == WB_Other)
559  switch (wc >> 8)
560  {
561  case 0x30:
562  if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
563  wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
564  WB_type = WB_Katakana; /* Katakana exceptions */
565  break;
566  case 0xFF:
567  if (wc == 0xFF70)
568  WB_type = WB_Katakana; /* Katakana exceptions */
569  else if (wc >= 0xFF9E && wc <= 0xFF9F)
570  WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
571  break;
572  case 0x05:
573  if (wc == 0x05F3)
574  WB_type = WB_ALetter; /* ALetter exceptions */
575  break;
576  }
577 
578  if (WB_type == WB_Other)
579  switch ((int) break_type)
580  {
581  case G_UNICODE_BREAK_NUMERIC:
582  if (wc != 0x066C)
583  WB_type = WB_Numeric; /* Numeric */
584  break;
585  case G_UNICODE_BREAK_INFIX_SEPARATOR:
586  if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
587  WB_type = WB_MidNum; /* MidNum */
588  break;
589  }
590 
591  if (WB_type == WB_Other)
592  switch ((int) type)
593  {
594  case G_UNICODE_CONTROL:
595  if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
596  break;
597  /* fall through */
598  case G_UNICODE_LINE_SEPARATOR:
599  case G_UNICODE_PARAGRAPH_SEPARATOR:
600  WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
601  break;
602 
603  case G_UNICODE_FORMAT:
604  case G_UNICODE_SPACING_MARK:
605  case G_UNICODE_ENCLOSING_MARK:
606  case G_UNICODE_NON_SPACING_MARK:
607  WB_type = WB_ExtendFormat; /* Extend, Format */
608  break;
609 
610  case G_UNICODE_CONNECT_PUNCTUATION:
611  WB_type = WB_ExtendNumLet; /* ExtendNumLet */
612  break;
613 
614  case G_UNICODE_INITIAL_PUNCTUATION:
615  case G_UNICODE_FINAL_PUNCTUATION:
616  if (wc == 0x2018 || wc == 0x2019)
617  WB_type = WB_MidNumLet; /* MidNumLet */
618  break;
619  case G_UNICODE_OTHER_PUNCTUATION:
620  if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
621  wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
622  WB_type = WB_MidNumLet; /* MidNumLet */
623  else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
624  wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
625  WB_type = WB_MidLetter; /* WB_MidLetter */
626  else if (wc == 0x066c ||
627  wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
628  WB_type = WB_MidNum; /* MidNum */
629  break;
630 
631  case G_UNICODE_OTHER_SYMBOL:
632  if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
633  goto Alphabetic;
634 
635  if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
636  {
637  if (prev_WB_type == WB_RI_Odd)
638  WB_type = WB_RI_Even;
639  else if (prev_WB_type == WB_RI_Even)
640  WB_type = WB_RI_Odd;
641  else
642  WB_type = WB_RI_Odd;
643  }
644 
645  break;
646 
647  case G_UNICODE_OTHER_LETTER:
648  case G_UNICODE_LETTER_NUMBER:
649  if (wc == 0x3006 || wc == 0x3007 ||
650  (wc >= 0x3021 && wc <= 0x3029) ||
651  (wc >= 0x3038 && wc <= 0x303A) ||
652  (wc >= 0x3400 && wc <= 0x4DB5) ||
653  (wc >= 0x4E00 && wc <= 0x9FC3) ||
654  (wc >= 0xF900 && wc <= 0xFA2D) ||
655  (wc >= 0xFA30 && wc <= 0xFA6A) ||
656  (wc >= 0xFA70 && wc <= 0xFAD9) ||
657  (wc >= 0x20000 && wc <= 0x2A6D6) ||
658  (wc >= 0x2F800 && wc <= 0x2FA1D))
659  break; /* ALetter exceptions: Ideographic */
660  goto Alphabetic;
661 
662  case G_UNICODE_LOWERCASE_LETTER:
663  case G_UNICODE_MODIFIER_LETTER:
664  case G_UNICODE_TITLECASE_LETTER:
665  case G_UNICODE_UPPERCASE_LETTER:
666  Alphabetic:
667  if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
668  WB_type = WB_ALetter; /* ALetter */
669  break;
670  }
671 
672  if (WB_type == WB_Other)
673  {
674  if (type == G_UNICODE_SPACE_SEPARATOR &&
675  break_type != G_UNICODE_BREAK_NON_BREAKING_GLUE)
676  WB_type = WB_WSegSpace;
677  }
678 
679  /* Word Cluster Boundary Rules */
680 
681  /* We apply Rules WB1 and WB2 at the end of the function */
682 
683  if (prev_wc == 0x3031 && wc == 0x41)
684  g_debug ("Y %d %d", prev_WB_type, WB_type);
685  if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
686  {
687  /* The extra check for prev_WB_i is to correctly handle sequences like
688  * Newline ÷ Extend × Extend
689  * since we have not skipped ExtendFormat yet.
690  */
691  is_word_boundary = TRUE; /* Rule WB3a */
692  }
693  else if (WB_type == WB_NewlineCRLF)
694  is_word_boundary = TRUE; /* Rule WB3b */
695  else if (prev_wc == 0x200D && is_Extended_Pictographic)
696  is_word_boundary = FALSE; /* Rule WB3c */
697  else if (prev_WB_type == WB_WSegSpace &&
698  WB_type == WB_WSegSpace && prev_WB_i + 1 == i)
699  is_word_boundary = FALSE; /* Rule WB3d */
700  else if (WB_type == WB_ExtendFormat)
701  is_word_boundary = FALSE; /* Rules WB4? */
702  else if ((prev_WB_type == WB_ALetter ||
703  prev_WB_type == WB_Hebrew_Letter ||
704  prev_WB_type == WB_Numeric) &&
705  (WB_type == WB_ALetter ||
706  WB_type == WB_Hebrew_Letter ||
707  WB_type == WB_Numeric))
708  is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
709  else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
710  is_word_boundary = FALSE; /* Rule WB13 */
711  else if ((prev_WB_type == WB_ALetter ||
712  prev_WB_type == WB_Hebrew_Letter ||
713  prev_WB_type == WB_Numeric ||
714  prev_WB_type == WB_Katakana ||
715  prev_WB_type == WB_ExtendNumLet) &&
716  WB_type == WB_ExtendNumLet)
717  is_word_boundary = FALSE; /* Rule WB13a */
718  else if (prev_WB_type == WB_ExtendNumLet &&
719  (WB_type == WB_ALetter ||
720  WB_type == WB_Hebrew_Letter ||
721  WB_type == WB_Numeric ||
722  WB_type == WB_Katakana))
723  is_word_boundary = FALSE; /* Rule WB13b */
724  else if (((prev_prev_WB_type == WB_ALetter ||
725  prev_prev_WB_type == WB_Hebrew_Letter) &&
726  (WB_type == WB_ALetter ||
727  WB_type == WB_Hebrew_Letter)) &&
728  (prev_WB_type == WB_MidLetter ||
729  prev_WB_type == WB_MidNumLet ||
730  prev_wc == 0x0027))
731  {
732  attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
733  is_word_boundary = FALSE; /* Rule WB7 */
734  }
735  else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
736  is_word_boundary = FALSE; /* Rule WB7a */
737  else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
738  WB_type == WB_Hebrew_Letter) {
739  attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
740  is_word_boundary = FALSE; /* Rule WB7c */
741  }
742  else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
743  (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet ||
744  prev_wc == 0x0027))
745  {
746  is_word_boundary = FALSE; /* Rule WB11 */
747  attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
748  }
749  else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
750  is_word_boundary = FALSE; /* Rule WB15 and WB16 */
751  else
752  is_word_boundary = TRUE; /* Rule WB999 */
753 
754  if (WB_type != WB_ExtendFormat)
755  {
756  prev_prev_WB_type = prev_WB_type;
757  prev_WB_type = WB_type;
758  prev_WB_i = i;
759  }
760  }
761 
762  attrs[i].is_word_boundary = is_word_boundary;
763  }
764 
765  /* ---- UAX#29 Sentence Boundaries ---- */
766  {
767  is_sentence_boundary = FALSE;
768  if (is_word_boundary ||
769  wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
770  {
771  SentenceBreakType SB_type;
772 
773  /* Find the SentenceBreakType of wc */
774  SB_type = SB_Other;
775 
776  if (break_type == G_UNICODE_BREAK_NUMERIC)
777  SB_type = SB_Numeric; /* Numeric */
778 
779  if (SB_type == SB_Other)
780  switch ((int) type)
781  {
782  case G_UNICODE_CONTROL:
783  if (wc == '\r' || wc == '\n')
784  SB_type = SB_ParaSep;
785  else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
786  SB_type = SB_Sp;
787  else if (wc == 0x0085)
788  SB_type = SB_ParaSep;
789  break;
790 
791  case G_UNICODE_SPACE_SEPARATOR:
792  if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
793  (wc >= 0x2000 && wc <= 0x200A) ||
794  wc == 0x202F || wc == 0x205F || wc == 0x3000)
795  SB_type = SB_Sp;
796  break;
797 
798  case G_UNICODE_LINE_SEPARATOR:
799  case G_UNICODE_PARAGRAPH_SEPARATOR:
800  SB_type = SB_ParaSep;
801  break;
802 
803  case G_UNICODE_FORMAT:
804  case G_UNICODE_SPACING_MARK:
805  case G_UNICODE_ENCLOSING_MARK:
806  case G_UNICODE_NON_SPACING_MARK:
807  SB_type = SB_ExtendFormat; /* Extend, Format */
808  break;
809 
810  case G_UNICODE_MODIFIER_LETTER:
811  if (wc >= 0xFF9E && wc <= 0xFF9F)
812  SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
813  break;
814 
815  case G_UNICODE_TITLECASE_LETTER:
816  SB_type = SB_Upper;
817  break;
818 
819  case G_UNICODE_DASH_PUNCTUATION:
820  if (wc == 0x002D ||
821  (wc >= 0x2013 && wc <= 0x2014) ||
822  (wc >= 0xFE31 && wc <= 0xFE32) ||
823  wc == 0xFE58 ||
824  wc == 0xFE63 ||
825  wc == 0xFF0D)
826  SB_type = SB_SContinue;
827  break;
828 
829  case G_UNICODE_OTHER_PUNCTUATION:
830  if (wc == 0x05F3)
831  SB_type = SB_OLetter;
832  else if (wc == 0x002E || wc == 0x2024 ||
833  wc == 0xFE52 || wc == 0xFF0E)
834  SB_type = SB_ATerm;
835 
836  if (wc == 0x002C ||
837  wc == 0x003A ||
838  wc == 0x055D ||
839  (wc >= 0x060C && wc <= 0x060D) ||
840  wc == 0x07F8 ||
841  wc == 0x1802 ||
842  wc == 0x1808 ||
843  wc == 0x3001 ||
844  (wc >= 0xFE10 && wc <= 0xFE11) ||
845  wc == 0xFE13 ||
846  (wc >= 0xFE50 && wc <= 0xFE51) ||
847  wc == 0xFE55 ||
848  wc == 0xFF0C ||
849  wc == 0xFF1A ||
850  wc == 0xFF64)
851  SB_type = SB_SContinue;
852 
853  if (_pango_is_STerm(wc))
854  SB_type = SB_STerm;
855 
856  break;
857  }
858 
859  if (SB_type == SB_Other)
860  {
861  if (type == G_UNICODE_LOWERCASE_LETTER)
862  SB_type = SB_Lower;
863  else if (type == G_UNICODE_UPPERCASE_LETTER)
864  SB_type = SB_Upper;
865  else if (type == G_UNICODE_TITLECASE_LETTER ||
866  type == G_UNICODE_MODIFIER_LETTER ||
867  type == G_UNICODE_OTHER_LETTER)
868  SB_type = SB_OLetter;
869 
870  if (type == G_UNICODE_OPEN_PUNCTUATION ||
871  type == G_UNICODE_CLOSE_PUNCTUATION ||
872  break_type == G_UNICODE_BREAK_QUOTATION)
873  SB_type = SB_Close;
874  }
875 
876  /* Sentence Boundary Rules */
877 
878  /* We apply Rules SB1 and SB2 at the end of the function */
879 
880 #define IS_OTHER_TERM(SB_type) \
881  /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */ \
882  !(SB_type == SB_OLetter || \
883  SB_type == SB_Upper || SB_type == SB_Lower || \
884  SB_type == SB_ParaSep || \
885  SB_type == SB_ATerm || SB_type == SB_STerm || \
886  SB_type == SB_ATerm_Close_Sp || \
887  SB_type == SB_STerm_Close_Sp)
888 
889 
890  if (wc == '\n' && prev_wc == '\r')
891  is_sentence_boundary = FALSE; /* Rule SB3 */
892  else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
893  {
894  /* The extra check for prev_SB_i is to correctly handle sequences like
895  * ParaSep ÷ Extend × Extend
896  * since we have not skipped ExtendFormat yet.
897  */
898 
899  is_sentence_boundary = TRUE; /* Rule SB4 */
900  }
901  else if (SB_type == SB_ExtendFormat)
902  is_sentence_boundary = FALSE; /* Rule SB5? */
903  else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
904  is_sentence_boundary = FALSE; /* Rule SB6 */
905  else if ((prev_prev_SB_type == SB_Upper ||
906  prev_prev_SB_type == SB_Lower) &&
907  prev_SB_type == SB_ATerm &&
908  SB_type == SB_Upper)
909  is_sentence_boundary = FALSE; /* Rule SB7 */
910  else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
911  SB_type = SB_ATerm;
912  else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
913  SB_type = SB_STerm;
914  else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
915  SB_type = SB_ATerm_Close_Sp;
916  else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
917  SB_type = SB_STerm_Close_Sp;
918  /* Rule SB8 */
919  else if ((prev_SB_type == SB_ATerm ||
920  prev_SB_type == SB_ATerm_Close_Sp) &&
921  SB_type == SB_Lower)
922  is_sentence_boundary = FALSE;
923  else if ((prev_prev_SB_type == SB_ATerm ||
924  prev_prev_SB_type == SB_ATerm_Close_Sp) &&
925  IS_OTHER_TERM(prev_SB_type) &&
926  SB_type == SB_Lower)
927  attrs[prev_SB_i].is_sentence_boundary = FALSE;
928  else if ((prev_SB_type == SB_ATerm ||
929  prev_SB_type == SB_ATerm_Close_Sp ||
930  prev_SB_type == SB_STerm ||
931  prev_SB_type == SB_STerm_Close_Sp) &&
932  (SB_type == SB_SContinue ||
933  SB_type == SB_ATerm || SB_type == SB_STerm))
934  is_sentence_boundary = FALSE; /* Rule SB8a */
935  else if ((prev_SB_type == SB_ATerm ||
936  prev_SB_type == SB_STerm) &&
937  (SB_type == SB_Close || SB_type == SB_Sp ||
938  SB_type == SB_ParaSep))
939  is_sentence_boundary = FALSE; /* Rule SB9 */
940  else if ((prev_SB_type == SB_ATerm ||
941  prev_SB_type == SB_ATerm_Close_Sp ||
942  prev_SB_type == SB_STerm ||
943  prev_SB_type == SB_STerm_Close_Sp) &&
944  (SB_type == SB_Sp || SB_type == SB_ParaSep))
945  is_sentence_boundary = FALSE; /* Rule SB10 */
946  else if ((prev_SB_type == SB_ATerm ||
947  prev_SB_type == SB_ATerm_Close_Sp ||
948  prev_SB_type == SB_STerm ||
949  prev_SB_type == SB_STerm_Close_Sp) &&
950  SB_type != SB_ParaSep)
951  is_sentence_boundary = TRUE; /* Rule SB11 */
952  else
953  is_sentence_boundary = FALSE; /* Rule SB998 */
954 
955  if (SB_type != SB_ExtendFormat &&
956  !((prev_prev_SB_type == SB_ATerm ||
957  prev_prev_SB_type == SB_ATerm_Close_Sp) &&
958  IS_OTHER_TERM(prev_SB_type) &&
959  IS_OTHER_TERM(SB_type)))
960  {
961  prev_prev_SB_type = prev_SB_type;
962  prev_SB_type = SB_type;
963  prev_SB_i = i;
964  }
965 
966 #undef IS_OTHER_TERM
967 
968  }
969 
970  if (i == 0 || done)
971  is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */
972 
973  attrs[i].is_sentence_boundary = is_sentence_boundary;
974  }
975 
976  /* ---- Line breaking ---- */
977 
978  break_op = BREAK_ALREADY_HANDLED;
979 
980  row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ?
981  prev_prev_break_type : prev_break_type;
982  g_assert (row_break_type != G_UNICODE_BREAK_SPACE);
983 
984  attrs[i].is_char_break = FALSE;
985  attrs[i].is_line_break = FALSE;
986  attrs[i].is_mandatory_break = FALSE;
987 
988  /* Rule LB1:
989  assign a line breaking class to each code point of the input. */
990  switch (break_type)
991  {
992  case G_UNICODE_BREAK_AMBIGUOUS:
993  case G_UNICODE_BREAK_SURROGATE:
994  case G_UNICODE_BREAK_UNKNOWN:
995  break_type = G_UNICODE_BREAK_ALPHABETIC;
996  break;
997 
998  case G_UNICODE_BREAK_COMPLEX_CONTEXT:
999  if (type == G_UNICODE_NON_SPACING_MARK ||
1000  type == G_UNICODE_SPACING_MARK)
1001  break_type = G_UNICODE_BREAK_COMBINING_MARK;
1002  else
1003  break_type = G_UNICODE_BREAK_ALPHABETIC;
1004  break;
1005 
1006  case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER:
1007  break_type = G_UNICODE_BREAK_NON_STARTER;
1008  break;
1009 
1010  default:
1011  ;
1012  }
1013 
1014  /* If it's not a grapheme boundary, it's not a line break either */
1015  if (attrs[i].is_cursor_position ||
1016  break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1017  break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER ||
1018  break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1019  break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1020  break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1021  break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1022  break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE ||
1023  break_type == G_UNICODE_BREAK_EMOJI_MODIFIER ||
1024  break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1025  {
1026  LineBreakType LB_type;
1027 
1028  /* Find the LineBreakType of wc */
1029  LB_type = LB_Other;
1030 
1031  if (break_type == G_UNICODE_BREAK_NUMERIC)
1032  LB_type = LB_Numeric;
1033 
1034  if (break_type == G_UNICODE_BREAK_SYMBOL ||
1035  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1036  {
1037  if (!(prev_LB_type == LB_Numeric))
1038  LB_type = LB_Other;
1039  }
1040 
1041  if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1042  break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
1043  {
1044  if (prev_LB_type == LB_Numeric)
1045  LB_type = LB_Numeric_Close;
1046  else
1047  LB_type = LB_Other;
1048  }
1049 
1050  if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
1051  {
1052  if (prev_LB_type == LB_RI_Odd)
1053  LB_type = LB_RI_Even;
1054  else if (prev_LB_type == LB_RI_Even)
1055  LB_type = LB_RI_Odd;
1056  else
1057  LB_type = LB_RI_Odd;
1058  }
1059 
1060  attrs[i].is_line_break = TRUE; /* Rule LB31 */
1061  /* Unicode doesn't specify char wrap;
1062  we wrap around all chars currently. */
1063  if (attrs[i].is_cursor_position)
1064  attrs[i].is_char_break = TRUE;
1065 
1066  /* Make any necessary replacements first */
1067  if (row_break_type == G_UNICODE_BREAK_UNKNOWN)
1068  row_break_type = G_UNICODE_BREAK_ALPHABETIC;
1069 
1070  /* add the line break rules in reverse order to override
1071  the lower priority rules. */
1072 
1073  /* Rule LB30 */
1074  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1075  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
1076  prev_break_type == G_UNICODE_BREAK_NUMERIC) &&
1077  break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1078  break_op = BREAK_PROHIBITED;
1079 
1080  if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS &&
1081  (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1082  break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
1083  break_type == G_UNICODE_BREAK_NUMERIC))
1084  break_op = BREAK_PROHIBITED;
1085 
1086  /* Rule LB30a */
1087  if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even)
1088  break_op = BREAK_PROHIBITED;
1089 
1090  /* Rule LB30b */
1091  if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE &&
1092  break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1093  break_op = BREAK_PROHIBITED;
1094 
1095  /* Rule LB29 */
1096  if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR &&
1097  (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1098  break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1099  break_op = BREAK_PROHIBITED;
1100 
1101  /* Rule LB28 */
1102  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1103  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1104  (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1105  break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1106  break_op = BREAK_PROHIBITED;
1107 
1108  /* Rule LB27 */
1109  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1110  prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1111  prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1112  prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1113  prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1114  (break_type == G_UNICODE_BREAK_INSEPARABLE ||
1115  break_type == G_UNICODE_BREAK_POSTFIX))
1116  break_op = BREAK_PROHIBITED;
1117 
1118  if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1119  (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1120  break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1121  break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1122  break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1123  break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1124  break_op = BREAK_PROHIBITED;
1125 
1126  /* Rule LB26 */
1127  if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO &&
1128  (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
1129  break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1130  break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
1131  break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
1132  break_op = BREAK_PROHIBITED;
1133 
1134  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1135  prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) &&
1136  (break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
1137  break_type == G_UNICODE_BREAK_HANGUL_T_JAMO))
1138  break_op = BREAK_PROHIBITED;
1139 
1140  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
1141  prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
1142  break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)
1143  break_op = BREAK_PROHIBITED;
1144 
1145  /* Rule LB25 with Example 7 of Customization */
1146  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1147  prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1148  break_type == G_UNICODE_BREAK_NUMERIC)
1149  break_op = BREAK_PROHIBITED;
1150 
1151  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1152  prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1153  (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
1154  break_type == G_UNICODE_BREAK_HYPHEN) &&
1155  next_break_type == G_UNICODE_BREAK_NUMERIC)
1156  break_op = BREAK_PROHIBITED;
1157 
1158  if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
1159  prev_break_type == G_UNICODE_BREAK_HYPHEN) &&
1160  break_type == G_UNICODE_BREAK_NUMERIC)
1161  break_op = BREAK_PROHIBITED;
1162 
1163  if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1164  (break_type == G_UNICODE_BREAK_NUMERIC ||
1165  break_type == G_UNICODE_BREAK_SYMBOL ||
1166  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR))
1167  break_op = BREAK_PROHIBITED;
1168 
1169  if (prev_LB_type == LB_Numeric &&
1170  (break_type == G_UNICODE_BREAK_NUMERIC ||
1171  break_type == G_UNICODE_BREAK_SYMBOL ||
1172  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
1173  break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1174  break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS))
1175  break_op = BREAK_PROHIBITED;
1176 
1177  if ((prev_LB_type == LB_Numeric ||
1178  prev_LB_type == LB_Numeric_Close) &&
1179  (break_type == G_UNICODE_BREAK_POSTFIX ||
1180  break_type == G_UNICODE_BREAK_PREFIX))
1181  break_op = BREAK_PROHIBITED;
1182 
1183  /* Rule LB24 */
1184  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
1185  prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
1186  (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1187  break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1188  break_op = BREAK_PROHIBITED;
1189 
1190  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1191  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1192  (break_type == G_UNICODE_BREAK_PREFIX ||
1193  break_type == G_UNICODE_BREAK_POSTFIX))
1194  break_op = BREAK_PROHIBITED;
1195 
1196  /* Rule LB23 */
1197  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1198  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
1199  break_type == G_UNICODE_BREAK_NUMERIC)
1200  break_op = BREAK_PROHIBITED;
1201 
1202  if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
1203  (break_type == G_UNICODE_BREAK_ALPHABETIC ||
1204  break_type == G_UNICODE_BREAK_HEBREW_LETTER))
1205  break_op = BREAK_PROHIBITED;
1206 
1207  /* Rule LB23a */
1208  if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
1209  (break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
1210  break_type == G_UNICODE_BREAK_EMOJI_BASE ||
1211  break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
1212  break_op = BREAK_PROHIBITED;
1213 
1214  if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
1215  prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
1216  prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) &&
1217  break_type == G_UNICODE_BREAK_POSTFIX)
1218  break_op = BREAK_PROHIBITED;
1219 
1220  /* Rule LB22 */
1221  if (break_type == G_UNICODE_BREAK_INSEPARABLE)
1222  {
1223  if (prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
1224  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER)
1225  break_op = BREAK_PROHIBITED;
1226 
1227  if (prev_break_type == G_UNICODE_BREAK_EXCLAMATION)
1228  break_op = BREAK_PROHIBITED;
1229 
1230  if (prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
1231  prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
1232  prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
1233  break_op = BREAK_PROHIBITED;
1234 
1235  if (prev_break_type == G_UNICODE_BREAK_INSEPARABLE)
1236  break_op = BREAK_PROHIBITED;
1237 
1238  if (prev_break_type == G_UNICODE_BREAK_NUMERIC)
1239  break_op = BREAK_PROHIBITED;
1240  }
1241 
1242  if (break_type == G_UNICODE_BREAK_AFTER ||
1243  break_type == G_UNICODE_BREAK_HYPHEN ||
1244  break_type == G_UNICODE_BREAK_NON_STARTER ||
1245  prev_break_type == G_UNICODE_BREAK_BEFORE)
1246  break_op = BREAK_PROHIBITED; /* Rule LB21 */
1247 
1248  if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER &&
1249  (prev_break_type == G_UNICODE_BREAK_HYPHEN ||
1250  prev_break_type == G_UNICODE_BREAK_AFTER))
1251  break_op = BREAK_PROHIBITED; /* Rule LB21a */
1252 
1253  if (prev_break_type == G_UNICODE_BREAK_SYMBOL &&
1254  break_type == G_UNICODE_BREAK_HEBREW_LETTER)
1255  break_op = BREAK_PROHIBITED; /* Rule LB21b */
1256 
1257  if (prev_break_type == G_UNICODE_BREAK_CONTINGENT ||
1258  break_type == G_UNICODE_BREAK_CONTINGENT)
1259  break_op = BREAK_ALLOWED; /* Rule LB20 */
1260 
1261  if (prev_break_type == G_UNICODE_BREAK_QUOTATION ||
1262  break_type == G_UNICODE_BREAK_QUOTATION)
1263  break_op = BREAK_PROHIBITED; /* Rule LB19 */
1264 
1265  /* handle related rules for Space as state machine here,
1266  and override the pair table result. */
1267  if (prev_break_type == G_UNICODE_BREAK_SPACE) /* Rule LB18 */
1268  break_op = BREAK_ALLOWED;
1269 
1270  if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER &&
1271  break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER)
1272  break_op = BREAK_PROHIBITED; /* Rule LB17 */
1273 
1274  if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1275  row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) &&
1276  break_type == G_UNICODE_BREAK_NON_STARTER)
1277  break_op = BREAK_PROHIBITED; /* Rule LB16 */
1278 
1279  if (row_break_type == G_UNICODE_BREAK_QUOTATION &&
1280  break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1281  break_op = BREAK_PROHIBITED; /* Rule LB15 */
1282 
1283  if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
1284  break_op = BREAK_PROHIBITED; /* Rule LB14 */
1285 
1286  /* Rule LB13 with Example 7 of Customization */
1287  if (break_type == G_UNICODE_BREAK_EXCLAMATION)
1288  break_op = BREAK_PROHIBITED;
1289 
1290  if (prev_break_type != G_UNICODE_BREAK_NUMERIC &&
1291  (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
1292  break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS ||
1293  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
1294  break_type == G_UNICODE_BREAK_SYMBOL))
1295  break_op = BREAK_PROHIBITED;
1296 
1297  if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE)
1298  break_op = BREAK_PROHIBITED; /* Rule LB12 */
1299 
1300  if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE &&
1301  (prev_break_type != G_UNICODE_BREAK_SPACE &&
1302  prev_break_type != G_UNICODE_BREAK_AFTER &&
1303  prev_break_type != G_UNICODE_BREAK_HYPHEN))
1304  break_op = BREAK_PROHIBITED; /* Rule LB12a */
1305 
1306  if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER ||
1307  break_type == G_UNICODE_BREAK_WORD_JOINER)
1308  break_op = BREAK_PROHIBITED; /* Rule LB11 */
1309 
1310 
1311  /* Rule LB9 */
1312  if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1313  break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1314  {
1315  if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1316  prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1317  prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1318  prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
1319  prev_break_type == G_UNICODE_BREAK_SPACE ||
1320  prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE))
1321  break_op = BREAK_PROHIBITED;
1322  }
1323 
1324  if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1325  break_op = BREAK_ALLOWED; /* Rule LB8 */
1326 
1327  if (prev_wc == 0x200D)
1328  break_op = BREAK_PROHIBITED; /* Rule LB8a */
1329 
1330  if (break_type == G_UNICODE_BREAK_SPACE ||
1331  break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1332  break_op = BREAK_PROHIBITED; /* Rule LB7 */
1333 
1334  /* Rule LB6 */
1335  if (break_type == G_UNICODE_BREAK_MANDATORY ||
1336  break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1337  break_type == G_UNICODE_BREAK_LINE_FEED ||
1338  break_type == G_UNICODE_BREAK_NEXT_LINE)
1339  break_op = BREAK_PROHIBITED;
1340 
1341  /* Rules LB4 and LB5 */
1342  if (prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1343  (prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN &&
1344  wc != '\n') ||
1345  prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1346  prev_break_type == G_UNICODE_BREAK_NEXT_LINE)
1347  {
1348  attrs[i].is_mandatory_break = TRUE;
1349  break_op = BREAK_ALLOWED;
1350  }
1351 
1352  switch (break_op)
1353  {
1354  case BREAK_PROHIBITED:
1355  /* can't break here */
1356  attrs[i].is_line_break = FALSE;
1357  break;
1358 
1359  case BREAK_IF_SPACES:
1360  /* break if prev char was space */
1361  if (prev_break_type != G_UNICODE_BREAK_SPACE)
1362  attrs[i].is_line_break = FALSE;
1363  break;
1364 
1365  case BREAK_ALLOWED:
1366  attrs[i].is_line_break = TRUE;
1367  break;
1368 
1369  case BREAK_ALREADY_HANDLED:
1370  break;
1371 
1372  default:
1373  g_assert_not_reached ();
1374  break;
1375  }
1376 
1377  /* Rule LB9 */
1378  if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1379  break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER))
1380  {
1381  /* Rule LB25 with Example 7 of Customization */
1382  if (break_type == G_UNICODE_BREAK_NUMERIC ||
1383  break_type == G_UNICODE_BREAK_SYMBOL ||
1384  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
1385  {
1386  if (prev_LB_type != LB_Numeric)
1387  prev_LB_type = LB_type;
1388  /* else don't change the prev_LB_type */
1389  }
1390  else
1391  {
1392  prev_LB_type = LB_type;
1393  }
1394  }
1395  /* else don't change the prev_LB_type for Rule LB9 */
1396  }
1397 
1398  if (break_type != G_UNICODE_BREAK_SPACE)
1399  {
1400  /* Rule LB9 */
1401  if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
1402  break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
1403  {
1404  if (i == 0 /* start of text */ ||
1405  prev_break_type == G_UNICODE_BREAK_MANDATORY ||
1406  prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
1407  prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
1408  prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
1409  prev_break_type == G_UNICODE_BREAK_SPACE ||
1410  prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
1411  prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
1412  /* else don't change the prev_break_type for Rule LB9 */
1413  }
1414  else
1415  {
1416  prev_prev_break_type = prev_break_type;
1417  prev_break_type = break_type;
1418  }
1419 
1420  prev_jamo = jamo;
1421  }
1422  else
1423  {
1424  if (prev_break_type != G_UNICODE_BREAK_SPACE)
1425  {
1426  prev_prev_break_type = prev_break_type;
1427  prev_break_type = break_type;
1428  }
1429  /* else don't change the prev_break_type */
1430  }
1431 
1432  /* ---- Word breaks ---- */
1433 
1434  /* default to not a word start/end */
1435  attrs[i].is_word_start = FALSE;
1436  attrs[i].is_word_end = FALSE;
1437 
1438  if (current_word_type != WordNone)
1439  {
1440  /* Check for a word end */
1441  switch ((int) type)
1442  {
1443  case G_UNICODE_SPACING_MARK:
1444  case G_UNICODE_ENCLOSING_MARK:
1445  case G_UNICODE_NON_SPACING_MARK:
1446  case G_UNICODE_FORMAT:
1447  /* nothing, we just eat these up as part of the word */
1448  break;
1449 
1450  case G_UNICODE_LOWERCASE_LETTER:
1451  case G_UNICODE_MODIFIER_LETTER:
1452  case G_UNICODE_OTHER_LETTER:
1453  case G_UNICODE_TITLECASE_LETTER:
1454  case G_UNICODE_UPPERCASE_LETTER:
1455  if (current_word_type == WordLetters)
1456  {
1457  /* Japanese special cases for ending the word */
1458  if (JAPANESE (last_word_letter) ||
1459  JAPANESE (wc))
1460  {
1461  if ((HIRAGANA (last_word_letter) &&
1462  !HIRAGANA (wc)) ||
1463  (KATAKANA (last_word_letter) &&
1464  !(KATAKANA (wc) || HIRAGANA (wc))) ||
1465  (KANJI (last_word_letter) &&
1466  !(HIRAGANA (wc) || KANJI (wc))) ||
1467  (JAPANESE (last_word_letter) &&
1468  !JAPANESE (wc)) ||
1469  (!JAPANESE (last_word_letter) &&
1470  JAPANESE (wc)))
1471  attrs[i].is_word_end = TRUE;
1472  }
1473  }
1474  last_word_letter = wc;
1475  break;
1476 
1477  case G_UNICODE_DECIMAL_NUMBER:
1478  case G_UNICODE_LETTER_NUMBER:
1479  case G_UNICODE_OTHER_NUMBER:
1480  last_word_letter = wc;
1481  break;
1482 
1483  default:
1484  /* Punctuation, control/format chars, etc. all end a word. */
1485  attrs[i].is_word_end = TRUE;
1486  current_word_type = WordNone;
1487  break;
1488  }
1489  }
1490  else
1491  {
1492  /* Check for a word start */
1493  switch ((int) type)
1494  {
1495  case G_UNICODE_LOWERCASE_LETTER:
1496  case G_UNICODE_MODIFIER_LETTER:
1497  case G_UNICODE_OTHER_LETTER:
1498  case G_UNICODE_TITLECASE_LETTER:
1499  case G_UNICODE_UPPERCASE_LETTER:
1500  current_word_type = WordLetters;
1501  last_word_letter = wc;
1502  attrs[i].is_word_start = TRUE;
1503  break;
1504 
1505  case G_UNICODE_DECIMAL_NUMBER:
1506  case G_UNICODE_LETTER_NUMBER:
1507  case G_UNICODE_OTHER_NUMBER:
1508  current_word_type = WordNumbers;
1509  last_word_letter = wc;
1510  attrs[i].is_word_start = TRUE;
1511  break;
1512 
1513  default:
1514  /* No word here */
1515  break;
1516  }
1517  }
1518 
1519  /* ---- Sentence breaks ---- */
1520  {
1521 
1522  /* default to not a sentence start/end */
1523  attrs[i].is_sentence_start = FALSE;
1524  attrs[i].is_sentence_end = FALSE;
1525 
1526  /* maybe start sentence */
1527  if (last_sentence_start == -1 && !is_sentence_boundary)
1528  last_sentence_start = i - 1;
1529 
1530  /* remember last non space character position */
1531  if (i > 0 && !attrs[i - 1].is_white)
1532  last_non_space = i;
1533 
1534  /* meets sentence end, mark both sentence start and end */
1535  if (last_sentence_start != -1 && is_sentence_boundary) {
1536  if (last_non_space != -1) {
1537  attrs[last_sentence_start].is_sentence_start = TRUE;
1538  attrs[last_non_space].is_sentence_end = TRUE;
1539  }
1540 
1541  last_sentence_start = -1;
1542  last_non_space = -1;
1543  }
1544 
1545  /* meets space character, move sentence start */
1546  if (last_sentence_start != -1 &&
1547  last_sentence_start == i - 1 &&
1548  attrs[i - 1].is_white)
1549  last_sentence_start++;
1550 
1551  }
1552 
1553  prev_wc = wc;
1554 
1555  /* wc might not be a valid Unicode base character, but really all we
1556  * need to know is the last non-combining character */
1557  if (type != G_UNICODE_SPACING_MARK &&
1558  type != G_UNICODE_ENCLOSING_MARK &&
1559  type != G_UNICODE_NON_SPACING_MARK)
1560  base_character = wc;
1561  }
1562 
1563  i--;
1564 
1565  attrs[i].is_cursor_position = TRUE; /* Rule GB2 */
1566  attrs[0].is_cursor_position = TRUE; /* Rule GB1 */
1567 
1568  attrs[i].is_word_boundary = TRUE; /* Rule WB2 */
1569  attrs[0].is_word_boundary = TRUE; /* Rule WB1 */
1570 
1571  attrs[i].is_line_break = TRUE; /* Rule LB3 */
1572  attrs[0].is_line_break = FALSE; /* Rule LB2 */
1573 
1574 }
1575 
1576 static gboolean
1577 break_script (const char *item_text,
1578  unsigned int item_length,
1579  const PangoAnalysis *analysis,
1580  PangoLogAttr *attrs,
1581  int attrs_len);
1582 
1583 static gboolean
1584 break_attrs (const char *text,
1585  int length,
1586  GSList *attributes,
1587  int item_offset,
1588  PangoLogAttr *attrs,
1589  int attrs_len);
1590 
1591 static gboolean
1592 tailor_break (const char *text,
1593  int length,
1594  PangoAnalysis *analysis,
1595  int item_offset,
1596  PangoLogAttr *attrs,
1597  int attrs_len)
1598 {
1599  gboolean res;
1600 
1601  if (length < 0)
1602  length = strlen (text);
1603  else if (text == NULL)
1604  text = "";
1605 
1606  res = break_script (text, length, analysis, attrs, attrs_len);
1607 
1608  if (item_offset >= 0)
1609  res |= break_attrs (text, length, analysis->extra_attrs, item_offset, attrs, attrs_len);
1610 
1611  return res;
1612 }
1613 
1629 void
1630 pango_break (const gchar *text,
1631  gint length,
1632  PangoAnalysis *analysis,
1633  PangoLogAttr *attrs,
1634  int attrs_len)
1635 {
1636  g_return_if_fail (analysis != NULL);
1637  g_return_if_fail (attrs != NULL);
1638 
1639  pango_default_break (text, length, analysis, attrs, attrs_len);
1640  tailor_break (text, length, analysis, -1, attrs, attrs_len);
1641 }
1642 
1663 void
1665  gint length,
1666  gint *paragraph_delimiter_index,
1667  gint *next_paragraph_start)
1668 {
1669  const gchar *p = text;
1670  const gchar *end;
1671  const gchar *start = NULL;
1672  const gchar *delimiter = NULL;
1673 
1674  /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
1675  * Unicode 5.0; update the following code if that changes.
1676  */
1677 
1678  /* prev_sep is the first byte of the previous separator. Since
1679  * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
1680  * first byte is enough to identify it.
1681  */
1682  gchar prev_sep;
1683 
1684 
1685  if (length < 0)
1686  length = strlen (text);
1687 
1688  end = text + length;
1689 
1690  if (paragraph_delimiter_index)
1691  *paragraph_delimiter_index = length;
1692 
1693  if (next_paragraph_start)
1694  *next_paragraph_start = length;
1695 
1696  if (length == 0)
1697  return;
1698 
1699  prev_sep = 0;
1700 
1701  while (p < end)
1702  {
1703  if (prev_sep == '\n' ||
1704  prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
1705  {
1706  g_assert (delimiter);
1707  start = p;
1708  break;
1709  }
1710  else if (prev_sep == '\r')
1711  {
1712  /* don't break between \r and \n */
1713  if (*p != '\n')
1714  {
1715  g_assert (delimiter);
1716  start = p;
1717  break;
1718  }
1719  }
1720 
1721  if (*p == '\n' ||
1722  *p == '\r' ||
1723  !strncmp(p, PARAGRAPH_SEPARATOR_STRING,
1724  strlen(PARAGRAPH_SEPARATOR_STRING)))
1725  {
1726  if (delimiter == NULL)
1727  delimiter = p;
1728  prev_sep = *p;
1729  }
1730  else
1731  prev_sep = 0;
1732 
1733  p = g_utf8_next_char (p);
1734  }
1735 
1736  if (delimiter && paragraph_delimiter_index)
1737  *paragraph_delimiter_index = delimiter - text;
1738 
1739  if (start && next_paragraph_start)
1740  *next_paragraph_start = start - text;
1741 }
1742 
1763 void
1765  int length,
1766  PangoAnalysis *analysis,
1767  int offset,
1768  PangoLogAttr *log_attrs,
1769  int log_attrs_len)
1770 {
1771  PangoLogAttr *start = log_attrs;
1772  PangoLogAttr attr_before = *start;
1773 
1774  if (tailor_break (text, length, analysis, offset, log_attrs, log_attrs_len))
1775  {
1776  /* if tailored, we enforce some of the attrs from before
1777  * tailoring at the boundary
1778  */
1779 
1780  start->backspace_deletes_character = attr_before.backspace_deletes_character;
1781 
1782  start->is_line_break |= attr_before.is_line_break;
1783  start->is_mandatory_break |= attr_before.is_mandatory_break;
1784  start->is_cursor_position |= attr_before.is_cursor_position;
1785  }
1786 }
1787 
1788 static int
1789 tailor_segment (const char *range_start,
1790  const char *range_end,
1791  int chars_broken,
1792  PangoAnalysis *analysis,
1793  PangoLogAttr *log_attrs)
1794 {
1795  int chars_in_range;
1796  PangoLogAttr *start = log_attrs + chars_broken;
1797 
1798  chars_in_range = pango_utf8_strlen (range_start, range_end - range_start);
1799 
1800  pango_tailor_break (range_start,
1801  range_end - range_start,
1802  analysis,
1803  -1,
1804  start,
1805  chars_in_range + 1);
1806 
1807  return chars_in_range;
1808 }
1809 
1828 void
1830  int length,
1831  int level,
1832  PangoLanguage *language,
1833  PangoLogAttr *log_attrs,
1834  int attrs_len)
1835 {
1836  int chars_broken;
1837  PangoAnalysis analysis = { NULL };
1838  PangoScriptIter iter;
1839 
1840  g_return_if_fail (length == 0 || text != NULL);
1841  g_return_if_fail (log_attrs != NULL);
1842 
1843  analysis.level = level;
1844 
1845  pango_default_break (text, length, &analysis, log_attrs, attrs_len);
1846 
1847  chars_broken = 0;
1848 
1849  _pango_script_iter_init (&iter, text, length);
1850  do
1851  {
1852  const char *run_start, *run_end;
1853  PangoScript script;
1854 
1855  pango_script_iter_get_range (&iter, &run_start, &run_end, &script);
1856  analysis.script = script;
1857 
1858  chars_broken += tailor_segment (run_start, run_end, chars_broken, &analysis, log_attrs);
1859  }
1860  while (pango_script_iter_next (&iter));
1861  _pango_script_iter_fini (&iter);
1862 
1863  if (chars_broken + 1 > attrs_len)
1864  g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d. Expect corrupted memory.",
1865  chars_broken + 1,
1866  attrs_len);
1867 }
1868 
1869 #include "break-arabic.c"
1870 #include "break-indic.c"
1871 #include "break-thai.c"
1872 
1873 static gboolean
1874 break_script (const char *item_text,
1875  unsigned int item_length,
1876  const PangoAnalysis *analysis,
1877  PangoLogAttr *attrs,
1878  int attrs_len)
1879 {
1880  switch (analysis->script)
1881  {
1882  case PANGO_SCRIPT_ARABIC:
1883  break_arabic (item_text, item_length, analysis, attrs, attrs_len);
1884  break;
1885 
1887  case PANGO_SCRIPT_BENGALI:
1888  case PANGO_SCRIPT_GURMUKHI:
1889  case PANGO_SCRIPT_GUJARATI:
1890  case PANGO_SCRIPT_ORIYA:
1891  case PANGO_SCRIPT_TAMIL:
1892  case PANGO_SCRIPT_TELUGU:
1893  case PANGO_SCRIPT_KANNADA:
1895  case PANGO_SCRIPT_SINHALA:
1896  break_indic (item_text, item_length, analysis, attrs, attrs_len);
1897  break;
1898 
1899  case PANGO_SCRIPT_THAI:
1900  break_thai (item_text, item_length, analysis, attrs, attrs_len);
1901  break;
1902  default:
1903  return FALSE;
1904  }
1905 
1906  return TRUE;
1907 }
1908 
1909 static gboolean
1910 break_attrs (const char *text,
1911  int length,
1912  GSList *attributes,
1913  int offset,
1914  PangoLogAttr *log_attrs,
1915  int log_attrs_len)
1916 {
1917  PangoAttrList *list;
1918  PangoAttrIterator *iter;
1919  GSList *l;
1920 
1921  list = pango_attr_list_new ();
1922  for (l = attributes; l; l = l->next)
1923  {
1924  PangoAttribute *attr = l->data;
1925 
1926  if (attr->klass->type == PANGO_ATTR_ALLOW_BREAKS)
1928  }
1929 
1930  iter = pango_attr_list_get_iterator (list);
1931  do {
1932  PangoAttribute *attr;
1933 
1935  if (attr && ((PangoAttrInt*)attr)->value == 0)
1936  {
1937  int start, end;
1938  int start_pos, end_pos;
1939  int pos;
1940 
1941  pango_attr_iterator_range (iter, &start, &end);
1942  if (start < offset)
1943  start_pos = 0;
1944  else
1945  start_pos = g_utf8_pointer_to_offset (text, text + start - offset);
1946  if (end >= offset + length)
1947  end_pos = log_attrs_len;
1948  else
1949  end_pos = g_utf8_pointer_to_offset (text, text + end - offset);
1950 
1951  for (pos = start_pos + 1; pos < end_pos; pos++)
1952  {
1953  log_attrs[pos].is_mandatory_break = FALSE;
1954  log_attrs[pos].is_line_break = FALSE;
1955  log_attrs[pos].is_char_break = FALSE;
1956  }
1957  }
1958  } while (pango_attr_iterator_next (iter));
1959 
1961  pango_attr_list_unref (list);
1962 
1963  return TRUE;
1964 }
break_arabic
static void break_arabic(const char *text, int length, const PangoAnalysis *analysis G_GNUC_UNUSED, PangoLogAttr *attrs, int attrs_len G_GNUC_UNUSED)
Definition: break-arabic.c:51
PARAGRAPH_SEPARATOR_STRING
#define PARAGRAPH_SEPARATOR_STRING
Definition: break.c:32
break_attrs
static gboolean break_attrs(const char *text, int length, GSList *attributes, int item_offset, PangoLogAttr *attrs, int attrs_len)
Definition: break.c:1910
BREAK_IF_SPACES
Definition: break.c:42
PANGO_SCRIPT_ARABIC
Definition: pango-script.h:174
PANGO_SCRIPT_KATAKANA
Definition: pango-script.h:194
JAMO_LV
Definition: break.c:73
PANGO_SCRIPT_DEVANAGARI
Definition: pango-script.h:182
JAMO_LVT
Definition: break.c:74
pango_find_paragraph_boundary
void pango_find_paragraph_boundary(const gchar *text, gint length, gint *paragraph_delimiter_index, gint *next_paragraph_start)
Definition: break.c:1664
JAPANESE
#define JAPANESE(wc)
Definition: break.c:117
BreakOpportunity
BreakOpportunity
Definition: break.c:38
pango_tailor_break
void pango_tailor_break(const char *text, int length, PangoAnalysis *analysis, int offset, PangoLogAttr *log_attrs, int log_attrs_len)
Definition: break.c:1764
CharJamoProps
struct _CharJamoProps CharJamoProps
_PangoAttribute::klass
const PangoAttrClass * klass
Definition: pango-attributes.h:258
JAMO_TYPE
#define JAMO_TYPE(btype)
Definition: break.c:113
NO_JAMO
Definition: break.c:75
PANGO_SCRIPT_MALAYALAM
Definition: pango-script.h:198
pango_attr_list_unref
void pango_attr_list_unref(PangoAttrList *list)
Definition: pango-attributes.c:1319
BACKSPACE_DELETES_CHARACTER
#define BACKSPACE_DELETES_CHARACTER(wc)
Definition: break.c:128
pango-break.h
gen-emoji-table.start
start
Definition: gen-emoji-table.py:24
pango_attr_list_get_iterator
PANGO_AVAILABLE_IN_ALL PangoAttrIterator * pango_attr_list_get_iterator(PangoAttrList *list)
break-arabic.c
pango_attr_iterator_get
PangoAttribute * pango_attr_iterator_get(PangoAttrIterator *iterator, PangoAttrType type)
Definition: pango-attributes.c:2038
tailor_break
static gboolean tailor_break(const char *text, int length, PangoAnalysis *analysis, int item_offset, PangoLogAttr *attrs, int attrs_len)
Definition: break.c:1592
pango-impl-utils.h
WordNumbers
Definition: break.c:137
PANGO_SCRIPT_GUJARATI
Definition: pango-script.h:187
pango_attr_list_insert
void pango_attr_list_insert(PangoAttrList *list, PangoAttribute *attr)
Definition: pango-attributes.c:1445
PangoLanguage
typedefG_BEGIN_DECLS struct _PangoLanguage PangoLanguage
Definition: pango-language.h:32
break_thai
static void break_thai(const char *text, int len, const PangoAnalysis *analysis G_GNUC_UNUSED, PangoLogAttr *attrs, int attrs_len G_GNUC_UNUSED)
Definition: break-thai.c:55
pango_get_log_attrs
void pango_get_log_attrs(const char *text, int length, int level, PangoLanguage *language, PangoLogAttr *log_attrs, int attrs_len)
Definition: break.c:1829
PANGO_SCRIPT_HEBREW
Definition: pango-script.h:191
_CharJamoProps
Definition: break.c:85
break_indic
static void break_indic(const char *text, int length, const PangoAnalysis *analysis, PangoLogAttr *attrs, int attrs_len G_GNUC_UNUSED)
Definition: break-indic.c:104
PANGO_SCRIPT_TAMIL
Definition: pango-script.h:207
break_script
static gboolean break_script(const char *item_text, unsigned int item_length, const PangoAnalysis *analysis, PangoLogAttr *attrs, int attrs_len)
Definition: break.c:1874
break-indic.c
pango_break
void pango_break(const gchar *text, gint length, PangoAnalysis *analysis, PangoLogAttr *attrs, int attrs_len)
Definition: break.c:1630
_pango_is_Virama
static gboolean _pango_is_Virama(gunichar wc)
Definition: pango-break-table.h:213
JAMO_V
Definition: break.c:71
_pango_script_iter_init
PangoScriptIter * _pango_script_iter_init(PangoScriptIter *iter, const char *text, int length)
HIRAGANA
#define HIRAGANA(wc)
Definition: break.c:119
PANGO_ATTR_ALLOW_BREAKS
Definition: pango-attributes.h:189
PangoAnalysis
typedefG_BEGIN_DECLS struct _PangoAnalysis PangoAnalysis
Definition: pango-item.h:30
PANGO_SCRIPT_TELUGU
Definition: pango-script.h:208
PANGO_SCRIPT_SINHALA
Definition: pango-script.h:205
PANGO_SCRIPT_THAI
Definition: pango-script.h:210
break-thai.c
PangoScriptIter
typedefG_BEGIN_DECLS struct _PangoScriptIter PangoScriptIter
Definition: pango-script.h:35
_PangoAttribute
Definition: pango-attributes.h:256
pango_attr_iterator_destroy
void pango_attr_iterator_destroy(PangoAttrIterator *iterator)
Definition: pango-attributes.c:2015
WordType
WordType
Definition: break.c:133
WordNone
Definition: break.c:135
pango_attr_list_new
PangoAttrList * pango_attr_list_new(void)
Definition: pango-attributes.c:1278
pango-emoji-private.h
pango_attr_iterator_next
gboolean pango_attr_iterator_next(PangoAttrIterator *iterator)
Definition: pango-attributes.c:1934
BREAK_ALREADY_HANDLED
Definition: break.c:40
PANGO_SCRIPT_HIRAGANA
Definition: pango-script.h:192
pango_attr_iterator_range
void pango_attr_iterator_range(PangoAttrIterator *iterator, gint *start, gint *end)
Definition: pango-attributes.c:1913
JamoType
JamoType
Definition: break.c:68
IS_OTHER_TERM
#define IS_OTHER_TERM(SB_type)
_PangoAttrList
Definition: pango-attributes.c:39
PANGO_SCRIPT_ORIYA
Definition: pango-script.h:203
PANGO_SCRIPT_KANNADA
Definition: pango-script.h:193
JAMO_T
Definition: break.c:72
KATAKANA
#define KATAKANA(wc)
Definition: break.c:120
pango-break-table.h
gen-emoji-table.end
end
Definition: gen-emoji-table.py:24
PangoLogAttr
typedef G_BEGIN_DECLS struct _PangoLogAttr PangoLogAttr
Definition: pango-types.h:32
pango_utf8_strlen
static G_GNUC_UNUSED glong pango_utf8_strlen(const gchar *p, gssize max)
Definition: pango-impl-utils.h:82
pango-script-private.h
_pango_is_STerm
static gboolean _pango_is_STerm(gunichar wc)
Definition: pango-break-table.h:79
_pango_is_Vowel_Dependent
static gboolean _pango_is_Vowel_Dependent(gunichar wc)
Definition: pango-break-table.h:267
HangulJamoProps
static const CharJamoProps HangulJamoProps[]
Definition: break.c:93
_CharJamoProps::end
JamoType end
Definition: break.c:87
PANGO_SCRIPT_BENGALI
Definition: pango-script.h:176
pango_default_break
void pango_default_break(const gchar *text, gint length, PangoAnalysis *analysis G_GNUC_UNUSED, PangoLogAttr *attrs, int attrs_len G_GNUC_UNUSED)
Definition: break.c:156
pango_script_iter_get_range
void pango_script_iter_get_range(PangoScriptIter *iter, const char **start, const char **end, PangoScript *script)
Definition: pango-script.c:197
PangoScript
PangoScript
Definition: pango-script.h:170
_pango_script_iter_fini
void _pango_script_iter_fini(PangoScriptIter *iter)
Definition: pango-script.c:159
_PangoAttrInt
Definition: pango-attributes.h:341
_CharJamoProps::start
JamoType start
Definition: break.c:87
pango_script_iter_next
gboolean pango_script_iter_next(PangoScriptIter *iter)
Definition: pango-script.c:314
_pango_Is_Emoji_Extended_Pictographic
gboolean _pango_Is_Emoji_Extended_Pictographic(gunichar ch)
Definition: pango-emoji.c:104
text
static char * text
Definition: viewer-render.c:80
KANJI
#define KANJI(wc)
Definition: break.c:118
BREAK_ALLOWED
Definition: break.c:43
WordLetters
Definition: break.c:136
BREAK_TYPE_SAFE
#define BREAK_TYPE_SAFE(btype)
Definition: break.c:52
pango_attribute_copy
PangoAttribute * pango_attribute_copy(const PangoAttribute *attr)
Definition: pango-attributes.c:168
_PangoAttrIterator
Definition: pango-attributes.c:46
PARAGRAPH_SEPARATOR
#define PARAGRAPH_SEPARATOR
Definition: break.c:31
PANGO_SCRIPT_GURMUKHI
Definition: pango-script.h:188
_PangoAttrClass::type
PangoAttrType type
Definition: pango-attributes.h:301
tailor_segment
static int tailor_segment(const char *range_start, const char *range_end, int chars_broken, PangoAnalysis *analysis, PangoLogAttr *log_attrs)
Definition: break.c:1789
JAMO_L
Definition: break.c:70
BREAK_PROHIBITED
Definition: break.c:41