Okular

textpage.cpp
1 /*
2  SPDX-FileCopyrightText: 2005 Piotr Szymanski <[email protected]>
3 
4  SPDX-License-Identifier: GPL-2.0-or-later
5 */
6 
7 #include "textpage.h"
8 #include "textpage_p.h"
9 
10 #include <QDebug>
11 
12 #include "area.h"
13 #include "debug_p.h"
14 #include "misc.h"
15 #include "page.h"
16 #include "page_p.h"
17 
18 #include <cstring>
19 
20 #include <QVarLengthArray>
21 #include <QtAlgorithms>
22 
23 using namespace Okular;
24 
25 class SearchPoint
26 {
27 public:
28  SearchPoint()
29  : offset_begin(-1)
30  , offset_end(-1)
31  {
32  }
33 
34  /** The TinyTextEntity containing the first character of the match. */
35  TextList::ConstIterator it_begin;
36 
37  /** The TinyTextEntity containing the last character of the match. */
38  TextList::ConstIterator it_end;
39 
40  /** The index of the first character of the match in (*it_begin)->text().
41  * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
42  */
43  int offset_begin;
44 
45  /** One plus the index of the last character of the match in (*it_end)->text().
46  * Satisfies 0 < offset_end <= (*it_end)->text().length().
47  */
48  int offset_end;
49 };
50 
51 /* text comparison functions */
52 
53 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
54 {
55 #ifdef DEBUG_TEXTPAGE
56  qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
57 #endif
58  return from.compare(to, Qt::CaseInsensitive) == 0;
59 }
60 
61 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
62 {
63 #ifdef DEBUG_TEXTPAGE
64  qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
65 #endif
66  return from.compare(to, Qt::CaseSensitive) == 0;
67 }
68 
69 /**
70  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
71  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
72  * intersection of the segments to the length of the shortest of the two input segments
73  * is not smaller than the threshold.
74  */
75 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
76 {
77  // check if one consumes another fully (speed optimization)
78 
79  if (left1 <= left2 && right1 >= right2)
80  return true;
81 
82  if (left1 >= left2 && right1 <= right2)
83  return true;
84 
85  // check if there is overlap above threshold
86  if (right2 >= left1 && right1 >= left2) {
87  double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
88 
89  double length1 = right1 - left1, length2 = right2 - left2;
90 
91  return overlap * 100 >= threshold * qMin(length1, length2);
92  }
93 
94  return false;
95 }
96 
97 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
98 {
99  return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
100 }
101 
102 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
103 {
104  return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
105 }
106 
107 /*
108  Rationale behind TinyTextEntity:
109 
110  instead of storing directly a QString for the text of an entity,
111  we store the UTF-16 data and their length. This way, we save about
112  4 int's wrt a QString, and we can create a new string from that
113  raw data (that's the only penalty of that).
114  Even better, if the string we need to store has at most
115  MaxStaticChars characters, then we store those in place of the QChar*
116  that would be used (with new[] + free[]) for the data.
117  */
118 class TinyTextEntity
119 {
120  static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
121 
122 public:
123  TinyTextEntity(const QString &text, const NormalizedRect &rect)
124  : area(rect)
125  {
126  Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
127  Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
128  length = text.length();
129  switch (length) {
130 #if QT_POINTER_SIZE >= 8
131  case 4:
132  d.qc[3] = text.at(3).unicode();
133  // fall through
134  case 3:
135  d.qc[2] = text.at(2).unicode();
136 #endif
137  // fall through
138  case 2:
139  d.qc[1] = text.at(1).unicode();
140  // fall through
141  case 1:
142  d.qc[0] = text.at(0).unicode();
143  break;
144  default:
145  d.data = new QChar[length];
146  std::memcpy(d.data, text.constData(), length * sizeof(QChar));
147  }
148  }
149 
150  ~TinyTextEntity()
151  {
152  if (length > MaxStaticChars) {
153  delete[] d.data;
154  }
155  }
156 
157  inline QString text() const
158  {
159  return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
160  }
161 
162  inline NormalizedRect transformedArea(const QTransform &matrix) const
163  {
164  NormalizedRect transformed_area = area;
165  transformed_area.transform(matrix);
166  return transformed_area;
167  }
168 
169  NormalizedRect area;
170 
171 private:
172  Q_DISABLE_COPY(TinyTextEntity)
173 
174  union {
175  QChar *data;
176  ushort qc[MaxStaticChars];
177  } d;
178  int length;
179 };
180 
182  : m_text(text)
183  , m_area(area)
184  , d(nullptr)
185 {
186 }
187 
189 {
190  delete m_area;
191 }
192 
194 {
195  return m_text;
196 }
197 
199 {
200  return m_area;
201 }
202 
204 {
205  NormalizedRect transformed_area = *m_area;
206  transformed_area.transform(matrix);
207  return transformed_area;
208 }
209 
210 TextPagePrivate::TextPagePrivate()
211  : m_page(nullptr)
212 {
213 }
214 
215 TextPagePrivate::~TextPagePrivate()
216 {
217  qDeleteAll(m_searchPoints);
218  qDeleteAll(m_words);
219 }
220 
222  : d(new TextPagePrivate())
223 {
224 }
225 
227  : d(new TextPagePrivate())
228 {
229  TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd();
230  for (; it != itEnd; ++it) {
231  TextEntity *e = *it;
232  if (!e->text().isEmpty())
233  d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
234  delete e;
235  }
236 }
237 
239 {
240  delete d;
241 }
242 
243 void TextPage::append(const QString &text, NormalizedRect *area)
244 {
245  if (!text.isEmpty()) {
246  if (!d->m_words.isEmpty()) {
247  TinyTextEntity *lastEntity = d->m_words.last();
248  const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
249  if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
250  // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
251  NormalizedRect newArea = *area | lastEntity->area;
252  delete area;
253  delete lastEntity;
254  d->m_words.removeLast();
255  d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
256  return;
257  }
258  }
259 
260  d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
261  }
262  delete area;
263 }
264 
265 struct WordWithCharacters {
266  WordWithCharacters(TinyTextEntity *w, const TextList &c)
267  : word(w)
268  , characters(c)
269  {
270  }
271 
272  inline QString text() const
273  {
274  return word->text();
275  }
276 
277  inline const NormalizedRect &area() const
278  {
279  return word->area;
280  }
281 
282  TinyTextEntity *word;
283  TextList characters;
284 };
286 
287 /**
288  * We will divide the whole page in some regions depending on the horizontal and
289  * vertical spacing among different regions. Each region will have an area and an
290  * associated WordsWithCharacters in sorted order.
291  */
292 class RegionText
293 {
294 public:
295  RegionText() {};
296 
297  RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
298  : m_region_wordWithCharacters(wordsWithCharacters)
299  , m_area(area)
300  {
301  }
302 
303  inline QString string() const
304  {
305  QString res;
306  for (const WordWithCharacters &word : m_region_wordWithCharacters) {
307  res += word.text();
308  }
309  return res;
310  }
311 
312  inline WordsWithCharacters text() const
313  {
314  return m_region_wordWithCharacters;
315  }
316 
317  inline QRect area() const
318  {
319  return m_area;
320  }
321 
322  inline void setArea(const QRect area)
323  {
324  m_area = area;
325  }
326 
327  inline void setText(const WordsWithCharacters &wordsWithCharacters)
328  {
329  m_region_wordWithCharacters = wordsWithCharacters;
330  }
331 
332 private:
333  WordsWithCharacters m_region_wordWithCharacters;
334  QRect m_area;
335 };
336 
338 {
339  if (d->m_words.isEmpty())
340  return new RegularAreaRect();
341 
342  /**
343  It works like this:
344  There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
345  rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
346  points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
347  for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
348  of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
349  1. (rx,ry)x(1,ty)
350  2. (0,ty)x(1,uy)
351  3. (0,uy)x(vx,vy)
352 
353  To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
354  or that has a left border >= cx and bottom border >= cy.
355  */
356  RegularAreaRect *ret = new RegularAreaRect;
357 
358  PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
359  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
360  const double scaleX = d->m_page->width();
361  const double scaleY = d->m_page->height();
362 
363  NormalizedPoint startC = sel->start();
364  NormalizedPoint endC = sel->end();
365  NormalizedPoint temp;
366 
367  // if startPoint is right to endPoint swap them
368  if (startC.x > endC.x) {
369  temp = startC;
370  startC = endC;
371  endC = temp;
372  }
373 
374  // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
375  const NormalizedRect boundingRect = d->m_page->boundingBox();
376  const QRect content = boundingRect.geometry(scaleX, scaleY);
377  const double minX = content.left();
378  const double maxX = content.right();
379  const double minY = content.top();
380  const double maxY = content.bottom();
381 
382  /**
383  * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
384  * the endRectangle. We have four cases:
385  *
386  * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
387  * and endPoint are outof the bounding rect (do not intersect)
388  *
389  * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
390  *
391  * Case 2(a): find the rectangle which contains start and endpoint and having some
392  * TextEntity
393  *
394  * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
395  * TextEntity within the rect made by startPoint and endPoint
396  *
397  * Case 3: Now, we may have two type of selection.
398  * 1. startpoint is left-top of start_end and endpoint is right-bottom
399  * 2. startpoint is left-bottom of start_end and endpoint is top-right
400  *
401  * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
402  * text within them. so, we need to search for the best suitable textposition for start and end.
403  *
404  * Case 3(a): We search the nearest rectangle consisting of some
405  * TinyTextEntity right to or bottom of the startPoint for selection 01.
406  * And, for selection 02, we have to search for right and top
407  *
408  * Case 3(b): For endpoint, we have to find the point top of or left to
409  * endpoint if we have selection 01.
410  * Otherwise, the search will be left and bottom
411  */
412 
413  // we know that startC.x > endC.x, we need to decide which is top and which is bottom
414  const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
415 
416  // Case 1(a)
417  if (!boundingRect.intersects(start_end))
418  return ret;
419 
420  // case 1(b)
421  /**
422  note that, after swapping of start and end, we know that,
423  start is always left to end. but, we cannot say start is
424  positioned upper than end.
425  **/
426  else {
427  // if start is left to content rect take it to content rect boundary
428  if (startC.x * scaleX < minX)
429  startC.x = minX / scaleX;
430  if (endC.x * scaleX > maxX)
431  endC.x = maxX / scaleX;
432 
433  // if start is top to end (selection type 01)
434  if (startC.y * scaleY < minY)
435  startC.y = minY / scaleY;
436  if (endC.y * scaleY > maxY)
437  endC.y = maxY / scaleY;
438 
439  // if start is bottom to end (selection type 02)
440  if (startC.y * scaleY > maxY)
441  startC.y = maxY / scaleY;
442  if (endC.y * scaleY < minY)
443  endC.y = minY / scaleY;
444  }
445 
446  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
447  TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
448  const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
449 
450  NormalizedRect tmp;
451  // case 2(a)
452  for (; it != itEnd; ++it) {
453  tmp = (*it)->area;
454  if (tmp.contains(startC.x, startC.y)) {
455  start = it;
456  }
457  if (tmp.contains(endC.x, endC.y)) {
458  end = it;
459  }
460  }
461 
462  // case 2(b)
463  it = tmpIt;
464  if (start == it && end == itEnd) {
465  for (; it != itEnd; ++it) {
466  // is there any text rectangle within the start_end rect
467  tmp = (*it)->area;
468  if (start_end.intersects(tmp))
469  break;
470  }
471 
472  // we have searched every text entities, but none is within the rectangle created by start and end
473  // so, no selection should be done
474  if (it == itEnd) {
475  return ret;
476  }
477  }
478  it = tmpIt;
479  bool selection_two_start = false;
480 
481  // case 3.a
482  if (start == it) {
483  bool flagV = false;
484  NormalizedRect rect;
485 
486  // selection type 01
487  if (startC.y <= endC.y) {
488  for (; it != itEnd; ++it) {
489  rect = (*it)->area;
490  rect.isBottom(startC) ? flagV = false : flagV = true;
491 
492  if (flagV && rect.isRight(startC)) {
493  start = it;
494  break;
495  }
496  }
497  }
498 
499  // selection type 02
500  else {
501  selection_two_start = true;
502  int distance = scaleX + scaleY + 100;
503  int count = 0;
504 
505  for (; it != itEnd; ++it) {
506  rect = (*it)->area;
507 
508  if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
509  count++;
510  QRect entRect = rect.geometry(scaleX, scaleY);
511  int xdist, ydist;
512  xdist = entRect.center().x() - startC.x * scaleX;
513  ydist = entRect.center().y() - startC.y * scaleY;
514 
515  // make them positive
516  if (xdist < 0)
517  xdist = -xdist;
518  if (ydist < 0)
519  ydist = -ydist;
520 
521  if ((xdist + ydist) < distance) {
522  distance = xdist + ydist;
523  start = it;
524  }
525  }
526  }
527  }
528  }
529 
530  // case 3.b
531  if (end == itEnd) {
532  it = tmpIt;
533  itEnd = itEnd - 1;
534 
535  bool flagV = false;
536  NormalizedRect rect;
537 
538  if (startC.y <= endC.y) {
539  for (; itEnd >= it; itEnd--) {
540  rect = (*itEnd)->area;
541  rect.isTop(endC) ? flagV = false : flagV = true;
542 
543  if (flagV && rect.isLeft(endC)) {
544  end = itEnd;
545  break;
546  }
547  }
548  }
549 
550  else {
551  int distance = scaleX + scaleY + 100;
552  for (; itEnd >= it; itEnd--) {
553  rect = (*itEnd)->area;
554 
555  if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
556  QRect entRect = rect.geometry(scaleX, scaleY);
557  int xdist, ydist;
558  xdist = entRect.center().x() - endC.x * scaleX;
559  ydist = entRect.center().y() - endC.y * scaleY;
560 
561  // make them positive
562  if (xdist < 0)
563  xdist = -xdist;
564  if (ydist < 0)
565  ydist = -ydist;
566 
567  if ((xdist + ydist) < distance) {
568  distance = xdist + ydist;
569  end = itEnd;
570  }
571  }
572  }
573  }
574  }
575 
576  /* if start and end in selection 02 are in the same column, and we
577  start at an empty space we have to remove the selection of last
578  character
579  */
580  if (selection_two_start) {
581  if (start > end) {
582  start = start - 1;
583  }
584  }
585 
586  // if start is less than end swap them
587  if (start > end) {
588  it = start;
589  start = end;
590  end = it;
591  }
592 
593  // removes the possibility of crash, in case none of 1 to 3 is true
594  if (end == d->m_words.constEnd())
595  end--;
596 
597  for (; start <= end; start++) {
598  ret->appendShape((*start)->transformedArea(matrix), side);
599  }
600 
601  return ret;
602 }
603 
604 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
605 {
606  SearchDirection dir = direct;
607  // invalid search request
608  if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull()))
609  return nullptr;
610  TextList::ConstIterator start;
611  int start_offset = 0;
612  TextList::ConstIterator end;
613  const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
614  if (sIt == d->m_searchPoints.constEnd()) {
615  // if no previous run of this search is found, then set it to start
616  // from the beginning (respecting the search direction)
617  if (dir == NextResult)
618  dir = FromTop;
619  else if (dir == PreviousResult)
620  dir = FromBottom;
621  }
622  bool forward = true;
623  switch (dir) {
624  case FromTop:
625  start = d->m_words.constBegin();
626  start_offset = 0;
627  end = d->m_words.constEnd();
628  break;
629  case FromBottom:
630  start = d->m_words.constEnd();
631  start_offset = 0;
632  end = d->m_words.constBegin();
633  forward = false;
634  break;
635  case NextResult:
636  start = (*sIt)->it_end;
637  start_offset = (*sIt)->offset_end;
638  end = d->m_words.constEnd();
639  break;
640  case PreviousResult:
641  start = (*sIt)->it_begin;
642  start_offset = (*sIt)->offset_begin;
643  end = d->m_words.constBegin();
644  forward = false;
645  break;
646  };
647  RegularAreaRect *ret = nullptr;
648  const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
649  if (forward) {
650  ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
651  } else {
652  ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
653  }
654  return ret;
655 }
656 
657 // hyphenated '-' must be at the end of a word, so hyphenation means
658 // we have a '-' just followed by a '\n' character
659 // check if the string contains a '-' character
660 // if the '-' is the last entry
661 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
662 {
663  const int len = str.length();
664 
665  // hyphenated '-' must be at the end of a word, so hyphenation means
666  // we have a '-' just followed by a '\n' character
667  // check if the string contains a '-' character
668  // if the '-' is the last entry
669  if (str.endsWith(QLatin1Char('-'))) {
670  // validity chek of it + 1
671  if ((it + 1) != textListEnd) {
672  // 1. if the next character is '\n'
673  const QString &lookahedStr = (*(it + 1))->text();
674  if (lookahedStr.startsWith(QLatin1Char('\n'))) {
675  return len - 1;
676  }
677 
678  // 2. if the next word is in a different line or not
679  const NormalizedRect &hyphenArea = (*it)->area;
680  const NormalizedRect &lookaheadArea = (*(it + 1))->area;
681 
682  // lookahead to check whether both the '-' rect and next character rect overlap
683  if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
684  return len - 1;
685  }
686  }
687  }
688  // else if it is the second last entry - for example in pdf format
689  else if (str.endsWith(QLatin1String("-\n"))) {
690  return len - 2;
691  }
692 
693  return len;
694 }
695 
696 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
697 {
698  PagePrivate *pagePrivate = PagePrivate::get(m_page);
699  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
700  RegularAreaRect *ret = new RegularAreaRect;
701 
702  for (TextList::ConstIterator it = sp->it_begin;; it++) {
703  const TinyTextEntity *curEntity = *it;
704  ret->append(curEntity->transformedArea(matrix));
705 
706  if (it == sp->it_end) {
707  break;
708  }
709  }
710 
711  ret->simplify();
712  return ret;
713 }
714 
715 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
716 {
717  // normalize query search all unicode (including glyphs)
718  const QString query = _query.normalized(QString::NormalizationForm_KC);
719 
720  // j is the current position in our query
721  // queryLeft is the length of the query we have left to match
722  int j = 0, queryLeft = query.length();
723 
724  TextList::ConstIterator it = start;
725  int offset = start_offset;
726 
727  TextList::ConstIterator it_begin = TextList::ConstIterator();
728  int offset_begin = 0; // dummy initial value to suppress compiler warnings
729 
730  while (it != end) {
731  const TinyTextEntity *curEntity = *it;
732  const QString &str = curEntity->text();
733  const int strLen = str.length();
734  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
735  // adjustedLen <= strLen
736 
737  if (offset >= strLen) {
738  it++;
739  offset = 0;
740  continue;
741  }
742 
743  if (it_begin == TextList::ConstIterator()) {
744  it_begin = it;
745  offset_begin = offset;
746  }
747 
748  // Let the user write the hyphen or not when searching for text
749  int matchedLen = -1;
750  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
751  // we have equal (or less than) area of the query left as the length of the current
752  // entity
753  const int min = qMin(queryLeft, matchingLen - offset);
754  if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
755  matchedLen = min;
756  break;
757  }
758  }
759 
760  if (matchedLen == -1) {
761  // we have not matched
762  // this means we do not have a complete match
763  // we need to get back to query start
764  // and continue the search from this place
765 #ifdef DEBUG_TEXTPAGE
766  qCDebug(OkularCoreDebug) << "\tnot matched";
767 #endif
768  j = 0;
769  queryLeft = query.length();
770  it = it_begin;
771  offset = offset_begin + 1;
772  it_begin = TextList::ConstIterator();
773  } else {
774  // we have a match
775  // move the current position in the query
776  // to the position after the length of this string
777  // we matched
778  // subtract the length of the current entity from
779  // the left length of the query
780 #ifdef DEBUG_TEXTPAGE
781  qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
782 #endif
783  j += matchedLen;
784  queryLeft -= matchedLen;
785 
786  if (queryLeft == 0) {
787  // save or update the search point for the current searchID
788  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
789  if (sIt == m_searchPoints.end()) {
790  sIt = m_searchPoints.insert(searchID, new SearchPoint);
791  }
792  SearchPoint *sp = *sIt;
793  sp->it_begin = it_begin;
794  sp->it_end = it;
795  sp->offset_begin = offset_begin;
796  sp->offset_end = offset + matchedLen;
797  return searchPointToArea(sp);
798  }
799 
800  it++;
801  offset = 0;
802  }
803  }
804  // end of loop - it means that we've ended the textentities
805 
806  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
807  if (sIt != m_searchPoints.end()) {
808  SearchPoint *sp = *sIt;
809  m_searchPoints.erase(sIt);
810  delete sp;
811  }
812  return nullptr;
813 }
814 
815 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
816 {
817  // normalize query to search all unicode (including glyphs)
818  const QString query = _query.normalized(QString::NormalizationForm_KC);
819 
820  // j is the current position in our query
821  // len is the length of the string in TextEntity
822  // queryLeft is the length of the query we have left
823  int j = query.length(), queryLeft = query.length();
824 
825  TextList::ConstIterator it = start;
826  int offset = start_offset;
827 
828  TextList::ConstIterator it_begin = TextList::ConstIterator();
829  int offset_begin = 0; // dummy initial value to suppress compiler warnings
830 
831  while (true) {
832  if (offset <= 0) {
833  if (it == end) {
834  break;
835  }
836  it--;
837  }
838 
839  const TinyTextEntity *curEntity = *it;
840  const QString &str = curEntity->text();
841  const int strLen = str.length();
842  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
843  // adjustedLen <= strLen
844 
845  if (offset <= 0) {
846  offset = strLen;
847  }
848 
849  if (it_begin == TextList::ConstIterator()) {
850  it_begin = it;
851  offset_begin = offset;
852  }
853 
854  // Let the user write the hyphen or not when searching for text
855  int matchedLen = -1;
856  // we have equal (or less than) area of the query left as the length of the current
857  // entity
858  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
859  const int hyphenOffset = (strLen - matchingLen);
860  const int min = qMin(queryLeft + hyphenOffset, offset);
861  if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
862  matchedLen = min - hyphenOffset;
863  break;
864  }
865  }
866 
867  if (matchedLen == -1) {
868  // we have not matched
869  // this means we do not have a complete match
870  // we need to get back to query start
871  // and continue the search from this place
872 #ifdef DEBUG_TEXTPAGE
873  qCDebug(OkularCoreDebug) << "\tnot matched";
874 #endif
875 
876  j = query.length();
877  queryLeft = query.length();
878  it = it_begin;
879  offset = offset_begin - 1;
880  it_begin = TextList::ConstIterator();
881  } else {
882  // we have a match
883  // move the current position in the query
884  // to the position after the length of this string
885  // we matched
886  // subtract the length of the current entity from
887  // the left length of the query
888 #ifdef DEBUG_TEXTPAGE
889  qCDebug(OkularCoreDebug) << "\tmatched";
890 #endif
891  j -= matchedLen;
892  queryLeft -= matchedLen;
893 
894  if (queryLeft == 0) {
895  // save or update the search point for the current searchID
896  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
897  if (sIt == m_searchPoints.end()) {
898  sIt = m_searchPoints.insert(searchID, new SearchPoint);
899  }
900  SearchPoint *sp = *sIt;
901  sp->it_begin = it;
902  sp->it_end = it_begin;
903  sp->offset_begin = offset - matchedLen;
904  sp->offset_end = offset_begin;
905  return searchPointToArea(sp);
906  }
907 
908  offset = 0;
909  }
910  }
911  // end of loop - it means that we've ended the textentities
912 
913  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
914  if (sIt != m_searchPoints.end()) {
915  SearchPoint *sp = *sIt;
916  m_searchPoints.erase(sIt);
917  delete sp;
918  }
919  return nullptr;
920 }
921 
923 {
925 }
926 
928 {
929  if (area && area->isNull())
930  return QString();
931 
932  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
933  QString ret;
934  if (area) {
935  for (; it != itEnd; ++it) {
937  if (area->intersects((*it)->area)) {
938  ret += (*it)->text();
939  }
940  } else {
941  NormalizedPoint center = (*it)->area.center();
942  if (area->contains(center.x, center.y)) {
943  ret += (*it)->text();
944  }
945  }
946  }
947  } else {
948  for (; it != itEnd; ++it)
949  ret += (*it)->text();
950  }
951  return ret;
952 }
953 
954 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
955 {
956  QRect firstArea = first.area().roundedGeometry(1000, 1000);
957  QRect secondArea = second.area().roundedGeometry(1000, 1000);
958 
959  return firstArea.left() < secondArea.left();
960 }
961 
962 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
963 {
964  const QRect firstArea = first.area().roundedGeometry(1000, 1000);
965  const QRect secondArea = second.area().roundedGeometry(1000, 1000);
966 
967  return firstArea.top() < secondArea.top();
968 }
969 
970 /**
971  * Sets a new world list. Deleting the contents of the old one
972  */
973 void TextPagePrivate::setWordList(const TextList &list)
974 {
975  qDeleteAll(m_words);
976  m_words = list;
977 }
978 
979 /**
980  * Remove all the spaces in between texts. It will make all the generators
981  * same, whether they save spaces(like pdf) or not(like djvu).
982  */
983 static void removeSpace(TextList *words)
984 {
985  TextList::Iterator it = words->begin();
986  const QString str(QLatin1Char(' '));
987 
988  while (it != words->end()) {
989  if ((*it)->text() == str) {
990  it = words->erase(it);
991  } else {
992  ++it;
993  }
994  }
995 }
996 
997 /**
998  * We will read the TinyTextEntity from characters and try to create words from there.
999  * Note: characters might be already characters for some generators, but we will keep
1000  * the nomenclature characters for the generator produced data. The resulting
1001  * WordsWithCharacters memory has to be managed by the caller, both the
1002  * WordWithCharacters::word and WordWithCharacters::characters contents
1003  */
1004 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1005 {
1006  /**
1007  * We will traverse characters and try to create words from the TinyTextEntities in it.
1008  * We will search TinyTextEntity blocks and merge them until we get a
1009  * space between two consecutive TinyTextEntities. When we get a space
1010  * we can take it as a end of word. Then we store the word as a TinyTextEntity
1011  * and keep it in newList.
1012 
1013  * We create a RegionText named regionWord that contains the word and the characters associated with it and
1014  * a rectangle area of the element in newList.
1015 
1016  */
1017  WordsWithCharacters wordsWithCharacters;
1018 
1019  TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1020  int newLeft, newRight, newTop, newBottom;
1021  int index = 0;
1022 
1023  for (; it != itEnd; it++) {
1024  QString textString = (*it)->text();
1025  QString newString;
1026  QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1027  TextList wordCharacters;
1028  tmpIt = it;
1029  int space = 0;
1030 
1031  while (!space) {
1032  if (!textString.isEmpty()) {
1033  newString.append(textString);
1034 
1035  // when textString is the start of the word
1036  if (tmpIt == it) {
1037  NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1038  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1039  } else {
1040  NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1041  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1042  }
1043  }
1044 
1045  ++it;
1046 
1047  /*
1048  we must have to put this line before the if condition of it==itEnd
1049  otherwise the last character can be missed
1050  */
1051  if (it == itEnd)
1052  break;
1053  elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1054  if (!doesConsumeY(elementArea, lineArea, 60)) {
1055  --it;
1056  break;
1057  }
1058 
1059  const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1060  const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1061 
1062  space = elementArea.left() - lineArea.right();
1063 
1064  if (space != 0) {
1065  it--;
1066  break;
1067  }
1068 
1069  newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1070  newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1071  newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1072  newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1073 
1074  lineArea.setLeft(newLeft);
1075  lineArea.setTop(newTop);
1076  lineArea.setWidth(newRight - newLeft);
1077  lineArea.setHeight(newBottom - newTop);
1078 
1079  textString = (*it)->text();
1080  }
1081 
1082  // if newString is not empty, save it
1083  if (!newString.isEmpty()) {
1084  const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1085  TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1086  wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1087 
1088  index++;
1089  }
1090 
1091  if (it == itEnd)
1092  break;
1093  }
1094 
1095  return wordsWithCharacters;
1096 }
1097 
1098 /**
1099  * Create Lines from the words and sort them
1100  */
1101 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1102 {
1103  /**
1104  * We cannot assume that the generator will give us texts in the right order.
1105  * We can only assume that we will get texts in the page and their bounding
1106  * rectangle. The texts can be character, word, half-word anything.
1107  * So, we need to:
1108  **
1109  * 1. Sort rectangles/boxes containing texts by y0(top)
1110  * 2. Create textline where there is y overlap between TinyTextEntity 's
1111  * 3. Within each line sort the TinyTextEntity 's by x0(left)
1112  */
1113 
1115 
1116  /*
1117  Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1118  not contain same pointers for all the TinyTextEntity.
1119  */
1120  QList<WordWithCharacters> words = wordsTmp;
1121 
1122  // Step 1
1123  std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1124 
1125  // Step 2
1126  QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1127 
1128  // for every non-space texts(characters/words) in the textList
1129  for (; it != itEnd; it++) {
1130  const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1131  bool found = false;
1132 
1133  for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1134  /* the line area which will be expanded
1135  line_rects is only necessary to preserve the topmin and bottommax of all
1136  the texts in the line, left and right is not necessary at all
1137  */
1138  QRect &lineArea = linesI.second;
1139  const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1140  const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1141 
1142  /*
1143  if the new text and the line has y overlapping parts of more than 70%,
1144  the text will be added to this line
1145  */
1146  if (doesConsumeY(elementArea, lineArea, 70)) {
1147  WordsWithCharacters &line = linesI.first;
1148  line.append(*it);
1149 
1150  const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1151  const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1152  const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1153  const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1154 
1155  lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1156  found = true;
1157  }
1158 
1159  if (found)
1160  break;
1161  }
1162 
1163  /* when we have found a new line create a new TextList containing
1164  only one element and append it to the lines
1165  */
1166  if (!found) {
1167  WordsWithCharacters tmp;
1168  tmp.append((*it));
1169  lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1170  }
1171  }
1172 
1173  // Step 3
1174  for (QPair<WordsWithCharacters, QRect> &line : lines) {
1175  WordsWithCharacters &list = line.first;
1176  std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1177  }
1178 
1179  return lines;
1180 }
1181 
1182 /**
1183  * Calculate Statistical information from the lines we made previously
1184  */
1185 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1186 {
1187  /**
1188  * For the region, defined by line_rects and lines
1189  * 1. Make line statistical analysis to find the line spacing
1190  * 2. Make character statistical analysis to differentiate between
1191  * word spacing and column spacing.
1192  */
1193 
1194  /**
1195  * Step 0
1196  */
1197  const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1198 
1199  /**
1200  * Step 1
1201  */
1202  QMap<int, int> line_space_stat;
1203  for (int i = 0; i < sortedLines.length(); i++) {
1204  const QRect rectUpper = sortedLines.at(i).second;
1205 
1206  if (i + 1 == sortedLines.length())
1207  break;
1208  const QRect rectLower = sortedLines.at(i + 1).second;
1209 
1210  int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1211  if (linespace < 0)
1212  linespace = -linespace;
1213 
1214  if (line_space_stat.contains(linespace))
1215  line_space_stat[linespace]++;
1216  else
1217  line_space_stat[linespace] = 1;
1218  }
1219 
1220  *line_spacing = 0;
1221  int weighted_count = 0;
1222  QMapIterator<int, int> iterate_linespace(line_space_stat);
1223 
1224  while (iterate_linespace.hasNext()) {
1225  iterate_linespace.next();
1226  *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1227  weighted_count += iterate_linespace.value();
1228  }
1229  if (*line_spacing != 0)
1230  *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1231 
1232  /**
1233  * Step 2
1234  */
1235  // We would like to use QMap instead of QHash as it will keep the keys sorted
1236  QMap<int, int> hor_space_stat;
1237  QMap<int, int> col_space_stat;
1238  QList<QList<QRect>> space_rects;
1239  QVector<QRect> max_hor_space_rects;
1240 
1241  // Space in every line
1242  for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1243  const WordsWithCharacters list = sortedLine.first;
1244  QList<QRect> line_space_rects;
1245  int maxSpace = 0, minSpace = pageWidth;
1246 
1247  // for every TinyTextEntity element in the line
1248  WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1249  QRect max_area1, max_area2;
1250  QString before_max, after_max;
1251 
1252  // for every line
1253  for (; it != itEnd; it++) {
1254  const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1255  if (it + 1 == itEnd)
1256  break;
1257 
1258  const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1259  int space = area2.left() - area1.right();
1260 
1261  if (space > maxSpace) {
1262  max_area1 = area1;
1263  max_area2 = area2;
1264  maxSpace = space;
1265  before_max = (*it).text();
1266  after_max = (*(it + 1)).text();
1267  }
1268 
1269  if (space < minSpace && space != 0)
1270  minSpace = space;
1271 
1272  // if we found a real space, whose length is not zero and also less than the pageWidth
1273  if (space != 0 && space != pageWidth) {
1274  // increase the count of the space amount
1275  if (hor_space_stat.contains(space))
1276  hor_space_stat[space]++;
1277  else
1278  hor_space_stat[space] = 1;
1279 
1280  int left, right, top, bottom;
1281 
1282  left = area1.right();
1283  right = area2.left();
1284 
1285  top = area2.top() < area1.top() ? area2.top() : area1.top();
1286  bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1287 
1288  QRect rect(left, top, right - left, bottom - top);
1289  line_space_rects.append(rect);
1290  }
1291  }
1292 
1293  space_rects.append(line_space_rects);
1294 
1295  if (hor_space_stat.contains(maxSpace)) {
1296  if (hor_space_stat[maxSpace] != 1)
1297  hor_space_stat[maxSpace]--;
1298  else
1299  hor_space_stat.remove(maxSpace);
1300  }
1301 
1302  if (maxSpace != 0) {
1303  if (col_space_stat.contains(maxSpace))
1304  col_space_stat[maxSpace]++;
1305  else
1306  col_space_stat[maxSpace] = 1;
1307 
1308  // store the max rect of each line
1309  const int left = max_area1.right();
1310  const int right = max_area2.left();
1311  const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1312  const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1313 
1314  const QRect rect(left, top, right - left, bottom - top);
1315  max_hor_space_rects.append(rect);
1316  } else
1317  max_hor_space_rects.append(QRect(0, 0, 0, 0));
1318  }
1319 
1320  // All the between word space counts are in hor_space_stat
1321  *word_spacing = 0;
1322  weighted_count = 0;
1323  QMapIterator<int, int> iterate(hor_space_stat);
1324 
1325  while (iterate.hasNext()) {
1326  iterate.next();
1327 
1328  if (iterate.key() > 0) {
1329  *word_spacing += iterate.value() * iterate.key();
1330  weighted_count += iterate.value();
1331  }
1332  }
1333  if (weighted_count)
1334  *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1335 
1336  *col_spacing = 0;
1337  QMapIterator<int, int> iterate_col(col_space_stat);
1338 
1339  while (iterate_col.hasNext()) {
1340  iterate_col.next();
1341  if (iterate_col.value() > *col_spacing)
1342  *col_spacing = iterate_col.value();
1343  }
1344  *col_spacing = col_space_stat.key(*col_spacing);
1345 
1346  // if there is just one line in a region, there is no point in dividing it
1347  if (sortedLines.length() == 1)
1348  *word_spacing = *col_spacing;
1349 }
1350 
1351 /**
1352  * Implements the XY Cut algorithm for textpage segmentation
1353  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1354  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1355  */
1356 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1357 {
1358  RegionTextList tree;
1359  QRect contentRect(0, 0, pageWidth, pageHeight);
1360  const RegionText root(wordsWithCharacters, contentRect);
1361 
1362  // start the tree with the root, it is our only region at the start
1363  tree.push_back(root);
1364 
1365  int i = 0;
1366 
1367  // while traversing the tree has not been ended
1368  while (i < tree.length()) {
1369  const RegionText node = tree.at(i);
1370  QRect regionRect = node.area();
1371 
1372  /**
1373  * 1. calculation of projection profiles
1374  */
1375  // allocate the size of proj profiles and initialize with 0
1376  int size_proj_y = node.area().height();
1377  int size_proj_x = node.area().width();
1378  // dynamic memory allocation
1379  QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1380  QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1381 
1382  for (int j = 0; j < size_proj_y; ++j)
1383  proj_on_yaxis[j] = 0;
1384  for (int j = 0; j < size_proj_x; ++j)
1385  proj_on_xaxis[j] = 0;
1386 
1387  const QList<WordWithCharacters> list = node.text();
1388 
1389  // Calculate tcx and tcy locally for each new region
1390  int word_spacing, line_spacing, column_spacing;
1391  calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1392 
1393  const int tcx = word_spacing * 2;
1394  const int tcy = line_spacing * 2;
1395 
1396  int maxX = 0, maxY = 0;
1397  int avgX = 0;
1398  int count;
1399 
1400  // for every text in the region
1401  for (const WordWithCharacters &wwc : list) {
1402  TinyTextEntity *ent = wwc.word;
1403  const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1404 
1405  // calculate vertical projection profile proj_on_xaxis1
1406  for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1407  if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0)
1408  proj_on_xaxis[k - regionRect.left()] += entRect.height();
1409  }
1410 
1411  // calculate horizontal projection profile in the same way
1412  for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1413  if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0)
1414  proj_on_yaxis[k - regionRect.top()] += entRect.width();
1415  }
1416  }
1417 
1418  for (int j = 0; j < size_proj_y; ++j) {
1419  if (proj_on_yaxis[j] > maxY)
1420  maxY = proj_on_yaxis[j];
1421  }
1422 
1423  avgX = count = 0;
1424  for (int j = 0; j < size_proj_x; ++j) {
1425  if (proj_on_xaxis[j] > maxX)
1426  maxX = proj_on_xaxis[j];
1427  if (proj_on_xaxis[j]) {
1428  count++;
1429  avgX += proj_on_xaxis[j];
1430  }
1431  }
1432  if (count)
1433  avgX /= count;
1434 
1435  /**
1436  * 2. Cleanup Boundary White Spaces and removal of noise
1437  */
1438  int xbegin = 0, xend = size_proj_x - 1;
1439  int ybegin = 0, yend = size_proj_y - 1;
1440  while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0)
1441  xbegin++;
1442  while (xend >= 0 && proj_on_xaxis[xend] <= 0)
1443  xend--;
1444  while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0)
1445  ybegin++;
1446  while (yend >= 0 && proj_on_yaxis[yend] <= 0)
1447  yend--;
1448 
1449  // update the regionRect
1450  int old_left = regionRect.left(), old_top = regionRect.top();
1451  regionRect.setLeft(old_left + xbegin);
1452  regionRect.setRight(old_left + xend);
1453  regionRect.setTop(old_top + ybegin);
1454  regionRect.setBottom(old_top + yend);
1455 
1456  int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1457  for (int j = 0; j < size_proj_x; ++j)
1458  proj_on_xaxis[j] -= tnx;
1459  for (int j = 0; j < size_proj_y; ++j)
1460  proj_on_yaxis[j] -= tny;
1461 
1462  /**
1463  * 3. Find the Widest gap
1464  */
1465  int gap_hor = -1, pos_hor = -1;
1466  int begin = -1, end = -1;
1467 
1468  // find all hor_gaps and find the maximum between them
1469  for (int j = 1; j < size_proj_y; ++j) {
1470  // transition from white to black
1471  if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0)
1472  end = j;
1473 
1474  // transition from black to white
1475  if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0)
1476  begin = j;
1477 
1478  if (begin > 0 && end > 0 && end - begin > gap_hor) {
1479  gap_hor = end - begin;
1480  pos_hor = (end + begin) / 2;
1481  begin = -1;
1482  end = -1;
1483  }
1484  }
1485 
1486  begin = -1, end = -1;
1487  int gap_ver = -1, pos_ver = -1;
1488 
1489  // find all the ver_gaps and find the maximum between them
1490  for (int j = 1; j < size_proj_x; ++j) {
1491  // transition from white to black
1492  if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1493  end = j;
1494  }
1495 
1496  // transition from black to white
1497  if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0)
1498  begin = j;
1499 
1500  if (begin > 0 && end > 0 && end - begin > gap_ver) {
1501  gap_ver = end - begin;
1502  pos_ver = (end + begin) / 2;
1503  begin = -1;
1504  end = -1;
1505  }
1506  }
1507 
1508  int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1509  int gap_x = gap_ver, gap_y = gap_hor;
1510 
1511  /**
1512  * 4. Cut the region and make nodes (left,right) or (up,down)
1513  */
1514  bool cut_hor = false, cut_ver = false;
1515 
1516  // For horizontal cut
1517  const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1518  const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1519  const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1520 
1521  // For vertical Cut
1522  const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1523  const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1524  const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1525 
1526  if (gap_y >= gap_x && gap_y >= tcy)
1527  cut_hor = true;
1528  else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx)
1529  cut_ver = true;
1530  else if (gap_x >= gap_y && gap_x >= tcx)
1531  cut_ver = true;
1532  else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy)
1533  cut_hor = true;
1534  // no cut possible
1535  else {
1536  // we can now update the node rectangle with the shrinked rectangle
1537  RegionText tmpNode = tree.at(i);
1538  tmpNode.setArea(regionRect);
1539  tree.replace(i, tmpNode);
1540  i++;
1541  continue;
1542  }
1543 
1544  WordsWithCharacters list1, list2;
1545 
1546  // horizontal cut, topRect and bottomRect
1547  if (cut_hor) {
1548  for (const WordWithCharacters &word : list) {
1549  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1550 
1551  if (topRect.intersects(wordRect))
1552  list1.append(word);
1553  else
1554  list2.append(word);
1555  }
1556 
1557  RegionText node1(list1, topRect);
1558  RegionText node2(list2, bottomRect);
1559 
1560  tree.replace(i, node1);
1561  tree.insert(i + 1, node2);
1562  }
1563 
1564  // vertical cut, leftRect and rightRect
1565  else if (cut_ver) {
1566  for (const WordWithCharacters &word : list) {
1567  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1568 
1569  if (leftRect.intersects(wordRect))
1570  list1.append(word);
1571  else
1572  list2.append(word);
1573  }
1574 
1575  RegionText node1(list1, leftRect);
1576  RegionText node2(list2, rightRect);
1577 
1578  tree.replace(i, node1);
1579  tree.insert(i + 1, node2);
1580  }
1581  }
1582 
1583  return tree;
1584 }
1585 
1586 /**
1587  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1588  */
1589 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1590 {
1591  /**
1592  * 1. Call makeAndSortLines before adding spaces in between words in a line
1593  * 2. Now add spaces between every two words in a line
1594  * 3. Finally, extract all the space separated texts from each region and return it
1595  */
1596 
1597  // Only change the texts under RegionTexts, not the area
1598  for (RegionText &tmpRegion : tree) {
1599  // Step 01
1600  QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1601 
1602  // Step 02
1603  for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1604  WordsWithCharacters &list = sortedLine.first;
1605  for (int k = 0; k < list.length(); k++) {
1606  const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1607  if (k + 1 >= list.length())
1608  break;
1609 
1610  const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1611  const int space = area2.left() - area1.right();
1612 
1613  if (space != 0) {
1614  // Make a TinyTextEntity of string space and push it between it and it+1
1615  const int left = area1.right();
1616  const int right = area2.left();
1617  const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1618  const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1619 
1620  const QString spaceStr(QStringLiteral(" "));
1621  const QRect rect(QPoint(left, top), QPoint(right, bottom));
1622  const NormalizedRect entRect(rect, pageWidth, pageHeight);
1623  TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1624  TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1625  WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1626 
1627  list.insert(k + 1, word);
1628 
1629  // Skip the space
1630  k++;
1631  }
1632  }
1633  }
1634 
1635  WordsWithCharacters tmpList;
1636  for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1637  tmpList += sortedLine.first;
1638  }
1639  tmpRegion.setText(tmpList);
1640  }
1641 
1642  // Step 03
1643  WordsWithCharacters tmp;
1644  for (const RegionText &tmpRegion : qAsConst(tree)) {
1645  tmp += tmpRegion.text();
1646  }
1647  return tmp;
1648 }
1649 
1650 /**
1651  * Correct the textOrder, all layout recognition works here
1652  */
1653 void TextPagePrivate::correctTextOrder()
1654 {
1655  // m_page->width() and m_page->height() are in pixels at
1656  // 100% zoom level, and thus depend on display DPI.
1657  // To avoid Okular failing on lowDPI displays,
1658  // we scale pageWidth and pageHeight so their sum equals 2000.
1659  const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1660  const int pageWidth = (int)(scalingFactor * m_page->width());
1661  const int pageHeight = (int)(scalingFactor * m_page->height());
1662 
1663  TextList characters = m_words;
1664 
1665  /**
1666  * Remove spaces from the text
1667  */
1668  removeSpace(&characters);
1669 
1670  /**
1671  * Construct words from characters
1672  */
1673  const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1674 
1675  /**
1676  * Make a XY Cut tree for segmentation of the texts
1677  */
1678  const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1679 
1680  /**
1681  * Add spaces to the word
1682  */
1683  const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1684 
1685  /**
1686  * Break the words into characters
1687  */
1688  TextList listOfCharacters;
1689  for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1690  delete word.word;
1691  listOfCharacters.append(word.characters);
1692  }
1693  setWordList(listOfCharacters);
1694 }
1695 
1697 {
1698  if (area && area->isNull())
1699  return TextEntity::List();
1700 
1701  TextEntity::List ret;
1702  if (area) {
1703  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1705  if (area->intersects(te->area)) {
1706  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1707  }
1708  } else {
1709  const NormalizedPoint center = te->area.center();
1710  if (area->contains(center.x, center.y)) {
1711  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1712  }
1713  }
1714  }
1715  } else {
1716  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1717  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1718  }
1719  }
1720  return ret;
1721 }
1722 
1724 {
1725  TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1726  TextList::ConstIterator it = itBegin;
1727  TextList::ConstIterator posIt = itEnd;
1728  for (; it != itEnd; ++it) {
1729  if ((*it)->area.contains(p.x, p.y)) {
1730  posIt = it;
1731  break;
1732  }
1733  }
1734  QString text;
1735  if (posIt != itEnd) {
1736  if ((*posIt)->text().simplified().isEmpty()) {
1737  return nullptr;
1738  }
1739  // Find the first TinyTextEntity of the word
1740  while (posIt != itBegin) {
1741  --posIt;
1742  const QString itText = (*posIt)->text();
1743  if (itText.right(1).at(0).isSpace()) {
1744  if (itText.endsWith(QLatin1String("-\n"))) {
1745  // Is an hyphenated word
1746  // continue searching the start of the word back
1747  continue;
1748  }
1749 
1750  if (itText == QLatin1String("\n") && posIt != itBegin) {
1751  --posIt;
1752  if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1753  // Is an hyphenated word
1754  // continue searching the start of the word back
1755  continue;
1756  }
1757  ++posIt;
1758  }
1759 
1760  ++posIt;
1761  break;
1762  }
1763  }
1764  RegularAreaRect *ret = new RegularAreaRect();
1765  for (; posIt != itEnd; ++posIt) {
1766  const QString itText = (*posIt)->text();
1767  if (itText.simplified().isEmpty()) {
1768  break;
1769  }
1770 
1771  ret->appendShape((*posIt)->area);
1772  text += (*posIt)->text();
1773  if (itText.right(1).at(0).isSpace()) {
1774  if (!text.endsWith(QLatin1String("-\n"))) {
1775  break;
1776  }
1777  }
1778  }
1779 
1780  if (word) {
1781  *word = text;
1782  }
1783  return ret;
1784  } else {
1785  return nullptr;
1786  }
1787 }
SearchDirection
Describes the direction of searching.
Definition: global.h:35
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition: area.h:116
QString::const_iterator constBegin() const const
QTextStream & center(QTextStream &s)
void setBottom(int y)
NormalizedRect * area() const
Returns the bounding area of the text entity.
Definition: textpage.cpp:198
QString & append(QChar ch)
const QChar * constData() const const
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition: global.h:38
bool contains(const Key &key) const const
void append(const T &value)
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition: area.h:373
int right() const const
MergeSide
The side(s) to be considered when merging areas.
Definition: global.h:63
TextPage()
Creates a new text page.
Definition: textpage.cpp:221
int length() const const
QString text() const
Returns the text of the text entity.
Definition: textpage.cpp:193
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition: area.cpp:248
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition: area.h:851
const T & at(int i) const const
QString simplified() const const
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition: area.h:189
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition: area.h:796
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition: textpage.cpp:922
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition: area.h:910
Merge only if the right side of the first area intersect.
Definition: global.h:64
int height() const const
double y
The normalized y coordinate.
Definition: area.h:172
int x() const const
int y() const const
T & first()
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition: area.h:772
global.h
Definition: action.h:16
NormalizedPoint start() const
Returns the start point of the selection.
Definition: misc.cpp:68
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition: area.cpp:166
int x() const const
int y() const const
Searching from top of the page, next result is to be found, there was no earlier search result...
Definition: global.h:36
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const const
Searching from bottom of the page, next result is to be found, there was no earlier search result...
Definition: global.h:37
Represents a piece of text on a TextPage, containing its textual representation and its bounding box...
Definition: textpage.h:51
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition: textpage.cpp:203
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition: area.cpp:161
bool isNull() const
Returns whether the regular area is a null area.
Definition: area.h:746
QString::const_iterator constEnd() const const
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition: global.h:39
QString fromRawData(const QChar *unicode, int size)
void append(const T &value)
bool isSpace() const const
int top() const const
QMapIterator::Item next()
void setTop(int y)
int left() const const
CaseInsensitive
void setWidth(int width)
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition: area.h:355
bool isEmpty() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
void end(const NormalizedPoint &point)
Changes the end point of the selection to the given point.
Definition: misc.cpp:42
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QPoint center() const const
T & first()
~TextEntity()
Destroys the text entity.
Definition: textpage.cpp:188
const Key & key() const const
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition: area.h:328
const T & value() const const
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition: area.cpp:234
int compare(const QString &other, Qt::CaseSensitivity cs) const const
QString right(int n) const const
ushort unicode() const const
QList::iterator end()
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition: area.h:346
QStringRef midRef(int position, int n) const const
double top
The normalized top coordinate.
Definition: area.h:420
const Key key(const T &value, const Key &defaultKey) const const
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
Definition: textpage.cpp:1696
void setRight(int x)
RegularAreaRect * textArea(TextSelection *selection) const
Returns the rectangular area of the given selection.
Definition: textpage.cpp:337
double x
The normalized x coordinate.
Definition: area.h:167
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition: area.h:337
int width() const const
A character is included into text() result if any pixel of his bounding box is in the given area...
Definition: textpage.h:118
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition: textpage.h:117
void insert(int i, const T &value)
void setHeight(int height)
const QChar at(int position) const const
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition: area.h:725
typedef ConstIterator
int bottom() const const
int length() const const
NormalizationForm_KC
double bottom
The normalized bottom coordinate.
Definition: area.h:430
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition: area.h:364
QList::const_iterator constEnd() const const
RegularAreaRect * wordAt(const NormalizedPoint &p, QString *word=nullptr) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
Definition: textpage.cpp:1723
QList::const_iterator constBegin() const const
TextEntity(const QString &text, NormalizedRect *area)
Creates a new text entity with the given text and the given area.
Definition: textpage.cpp:181
~TextPage()
Destroys the text page.
Definition: textpage.cpp:238
void setLeft(int x)
Wrapper around the information needed to generate the selection area There are two assumptions inside...
Definition: misc.h:33
QList::iterator begin()
void append(const QString &text, NormalizedRect *area)
Appends the given text with the given area as new TextEntity to the page.
Definition: textpage.cpp:243
bool hasNext() const const
int remove(const Key &key)
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition: textpage.cpp:604
This file is part of the KDE documentation.
Documentation copyright © 1996-2021 The KDE developers.
Generated on Tue Dec 7 2021 22:32:39 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.