KHtml

break_lines.cpp
1 #include <break_lines.h>
2 #include <QLibrary>
3 #include <QTextCodec>
4 #include <stdio.h>
5 #include <stdlib.h>
6 
7 /* If HAVE_LIBTHAI is defined, libkhtml will link against
8  * libthai since compile time. Otherwise it will try to
9  * dlopen at run-time
10  *
11  * Ott Pattara Nov 14, 2004
12  */
13 
14 #ifndef HAVE_LIBTHAI
// Signature of libthai's th_brk() as it is resolved at run time:
// (input bytes, output array of break positions, capacity of that array).
using th_brk_def = int (*)(const unsigned char *, int[], int);
// Resolved entry point; stays null until libthai has been loaded successfully.
static th_brk_def th_brk = nullptr;
17 #else
18 #include <thai/thailib.h>
19 #include <thai/thbrk.h>
20 #endif
21 
22 namespace khtml
23 {
24 struct ThaiCache {
25  ThaiCache()
26  {
27  string = nullptr;
28  allocated = 0x400;
29  wbrpos = (int *) malloc(allocated * sizeof(int));
30  numwbrpos = 0;
31  numisbreakable = 0x400;
32  isbreakable = (int *) malloc(numisbreakable * sizeof(int));
33  library = nullptr;
34  }
35  ~ThaiCache()
36  {
37  free(wbrpos);
38  free(isbreakable);
39  if (library) {
40  library->unload();
41  }
42  delete library;
43  }
44  const QChar *string;
45  int *wbrpos;
46  int *isbreakable;
47  int allocated;
48  int numwbrpos, numisbreakable;
49  QLibrary *library;
50 };
51 static ThaiCache *cache = nullptr;
52 
53 void cleanup_thaibreaks()
54 {
55  delete cache;
56  cache = nullptr;
57 #ifndef HAVE_LIBTHAI
58  th_brk = nullptr;
59 #endif
60 }
61 
62 bool isBreakableThai(const QChar *string, const int pos, const int len)
63 {
64  static QTextCodec *thaiCodec = QTextCodec::codecForMib(2259);
65  //printf("Entering isBreakableThai with pos = %d\n", pos);
66 
67 #ifndef HAVE_LIBTHAI
68 
69  QLibrary *lib = new QLibrary(QLatin1String("libthai"));
70 
71  /* load libthai dynamically */
72  if ((!th_brk) && thaiCodec) {
73  printf("Try to load libthai dynamically...\n");
74  if (lib->load()) {
75  th_brk = (th_brk_def) lib->resolve("th_brk");
76  }
77  if (!th_brk) {
78  // indication that loading failed and we shouldn't try to load again
79  printf("Error, can't load libthai...\n");
80  thaiCodec = nullptr;
81  if (lib->isLoaded()) {
82  lib->unload();
83  }
84  }
85  }
86 
87  if (!th_brk) {
88  return true;
89  }
90 #endif
91 
92  if (!cache) {
93  cache = new ThaiCache;
94 #ifndef HAVE_LIBTHAI
95  cache->library = lib;
96 #endif
97  }
98 
99  // build up string of thai chars
100  if (string != cache->string) {
101  //fprintf(stderr,"new string found (not in cache), calling libthai\n");
102  QByteArray cstr = thaiCodec->fromUnicode(QString::fromRawData(string, len));
103  //printf("About to call libthai::th_brk with str: %s",cstr.data());
104 
105  cache->numwbrpos = th_brk((const unsigned char *) cstr.data(), cache->wbrpos, cache->allocated);
106  //fprintf(stderr,"libthai returns with value %d\n",cache->numwbrpos);
107  if (cache->numwbrpos > cache->allocated) {
108  cache->allocated = cache->numwbrpos;
109  cache->wbrpos = (int *)realloc(cache->wbrpos, cache->allocated * sizeof(int));
110  cache->numwbrpos = th_brk((const unsigned char *) cstr.data(), cache->wbrpos, cache->allocated);
111  }
112  if (len > cache->numisbreakable) {
113  cache->numisbreakable = len;
114  cache->isbreakable = (int *)realloc(cache->isbreakable, cache->numisbreakable * sizeof(int));
115  }
116  for (int i = 0; i < len; ++i) {
117  cache->isbreakable[i] = 0;
118  }
119  if (cache->numwbrpos > 0) {
120  for (int i = cache->numwbrpos - 1; i >= 0; --i) {
121  cache->isbreakable[cache->wbrpos[i]] = 1;
122  }
123  }
124  cache->string = string;
125  }
126  //printf("Returning %d\n", cache->isbreakable[pos]);
127  return cache->isbreakable[pos];
128 }
129 
/*
  Tables of UTF-16 code units at which a line break must not occur.
  They are looked up with a binary search and therefore MUST stay sorted
  in ascending order. The entries are currently Japanese ones; adding
  Korean or Chinese characters should work the same way.
*/
/*
  dontbreakbefore[]: characters that must not start a line and that are not
  already covered by QChar::Punctuation_Close.

  The characters below were noted as being covered by
  QChar::Punctuation_Close (see UAX #14); they are therefore kept in the
  table only as commented-out entries:
   - 3001 ideographic comma
   - 3002 ideographic full stop
   - FE50 small comma
   - FE52 small full stop
   - FF0C fullwidth comma
   - FF0E fullwidth full stop
   - FF61 halfwidth ideographic full stop
   - FF64 halfwidth ideographic comma
*/
static const ushort dontbreakbefore[] = {
    // --- CJK symbols and punctuation ---
    // 0x3001,  ideographic comma       (commented out, see above)
    // 0x3002,  ideographic full stop   (commented out, see above)
    0x3005, // ideographic iteration mark
    0x3009, // right angle bracket
    0x300B, // right double angle bracket
    0x300D, // right corner bracket
    0x300F, // right white corner bracket
    0x3011, // right black lenticular bracket
    0x3015, // right tortoise shell bracket

    // --- hiragana ---
    0x3041, // small a
    0x3043, // small i
    0x3045, // small u
    0x3047, // small e
    0x3049, // small o
    0x3063, // small tsu
    0x3083, // small ya
    0x3085, // small yu
    0x3087, // small yo
    0x308E, // small wa
    0x309B, // voiced sound mark
    0x309C, // semi-voiced sound mark
    0x309D, // hiragana iteration mark
    0x309E, // hiragana voiced iteration mark

    // --- katakana ---
    0x30A1, // small a
    0x30A3, // small i
    0x30A5, // small u
    0x30A7, // small e
    0x30A9, // small o
    0x30C3, // small tsu
    0x30E3, // small ya
    0x30E5, // small yu
    0x30E7, // small yo
    0x30EE, // small wa
    0x30F5, // small ka
    0x30F6, // small ke
    0x30FC, // prolonged sound mark
    0x30FD, // katakana iteration mark
    0x30FE, // katakana voiced iteration mark

    // --- small form variants ---
    // 0xFE50,  small comma             (commented out, see above)
    // 0xFE52,  small full stop         (commented out, see above)

    // --- fullwidth forms ---
    0xFF01, // fullwidth exclamation mark
    0xFF09, // fullwidth right parenthesis
    // 0xFF0C,  fullwidth comma         (commented out, see above)
    0xFF0D, // fullwidth hyphen-minus
    // 0xFF0E,  fullwidth full stop     (commented out, see above)
    0xFF1F, // fullwidth question mark
    0xFF3D, // fullwidth right square bracket
    0xFF5D, // fullwidth right curly bracket

    // --- halfwidth forms ---
    // 0xFF61,  halfwidth ideographic full stop (commented out, see above)
    0xFF63, // halfwidth right corner bracket
    // 0xFF64,  halfwidth ideographic comma     (commented out, see above)
    0xFF67, // halfwidth katakana letter small a
    0xFF68, // halfwidth katakana letter small i
    0xFF69, // halfwidth katakana letter small u
    0xFF6A, // halfwidth katakana letter small e
    0xFF6B, // halfwidth katakana letter small o
    0xFF6C, // halfwidth katakana letter small ya
    0xFF6D, // halfwidth katakana letter small yu
    0xFF6E, // halfwidth katakana letter small yo
    0xFF6F, // halfwidth katakana letter small tu
    0xFF70  // halfwidth katakana-hiragana prolonged sound mark
};
212 
/*
  dontbreakafter[]: characters that must not end a line and that are not
  already covered by QChar::Punctuation_Open.
  Sorted in ascending order; searched with a binary search.
*/
static const ushort dontbreakafter[] = {
    0x3012, // postal mark
    0xFF03, // fullwidth pound (number) sign
    0xFF04, // fullwidth dollar sign
    0xFF20, // fullwidth @
    0xFFE1, // fullwidth British pound sign
    0xFFE5  // fullwidth yen sign
};
222 
/*
  Binary search in a table of code units sorted in ascending order.

   arr   - the sorted table
   count - number of entries in arr (may be 0)
   val   - the code unit to look for

  Note the polarity: returns true when val is NOT in the table (i.e. the
  table does not forbid a break) and false when val was found.
  An empty table contains nothing, so the result is true; arr is not
  touched in that case.
*/
static bool break_bsearch(const ushort *arr, const unsigned int count, const ushort val)
{
    // Half-open search interval [lo, hi); unlike "count - 1" this cannot
    // wrap around for count == 0.
    unsigned int lo = 0;
    unsigned int hi = count;

    while (lo < hi) {
        const unsigned int mid = lo + (hi - lo) / 2;
        if (arr[mid] == val) {
            return false;
        }
        if (val < arr[mid]) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }

    return true;
}
242 
243 bool isBreakable(const QChar *str, const int pos, int len)
244 {
245  const QChar *c = str + pos;
246  unsigned short ch = c->unicode();
247  if (ch > 0xff) {
248  // not latin1, need to do more sophisticated checks for asian fonts
249  unsigned char row = c->row();
250  if (row == 0x0e) {
251  // 0e00 - 0e7f == Thai
252  if (c->cell() < 0x80) {
253  // consult libthai
254  return isBreakableThai(str, pos, len);
255  } else {
256  return false;
257  }
258  }
259  if ((row > 0x2d && row < 0xfb) || row == 0x11) {
260  /* asian line breaking. */
261  if (pos == 0) {
262  return false; // never break before first character
263  }
264 
265  // check for simple punctuation cases
266  QChar::Category cat = c->category();
267  if (cat == QChar::Punctuation_Close ||
268  cat == QChar::Punctuation_Other ||
269  (str + (pos - 1))->category() == QChar::Punctuation_Open) {
270  return false;
271  }
272 
273  // do binary search in dontbreak[]
274  return break_bsearch(dontbreakbefore, (sizeof(dontbreakbefore) / sizeof(*dontbreakbefore)), c->unicode()) &&
275  break_bsearch(dontbreakafter, (sizeof(dontbreakafter) / sizeof(*dontbreakafter)), (str + (pos - 1))->unicode());
276  } else { // no asian font
277  return c->isSpace();
278  }
279  } else {
280  if (ch == ' ' || ch == '\n') {
281  return true;
282  }
283  }
284  return false;
285 }
286 
287 }
QByteArray fromUnicode(const QString &str) const const
This file is part of the HTML rendering engine for KDE.
QTextStream & right(QTextStream &stream)
uchar cell() const const
bool unload()
QChar::Category category() const const
QTextStream & left(QTextStream &stream)
QString fromRawData(const QChar *unicode, int size)
bool isSpace() const const
bool isLoaded() const const
ushort unicode() const const
bool load()
Category category(StandardShortcut id)
uchar row() const const
QFunctionPointer resolve(const char *symbol)
QTextCodec * codecForMib(int mib)
char * data()
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Sat Oct 16 2021 22:47:51 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.