Okular

textpage.cpp
1 /*
2  SPDX-FileCopyrightText: 2005 Piotr Szymanski <[email protected]>
3 
4  SPDX-License-Identifier: GPL-2.0-or-later
5 */
6 
7 #include "textpage.h"
8 #include "textpage_p.h"
9 
10 #include <QDebug>
11 
12 #include "area.h"
13 #include "debug_p.h"
14 #include "misc.h"
15 #include "page.h"
16 #include "page_p.h"
17 
18 #include <cstring>
19 
20 #include <QVarLengthArray>
21 #include <QtAlgorithms>
22 
23 using namespace Okular;
24 
25 class SearchPoint
26 {
27 public:
28  SearchPoint()
29  : offset_begin(-1)
30  , offset_end(-1)
31  {
32  }
33 
34  /** The TinyTextEntity containing the first character of the match. */
35  TextList::ConstIterator it_begin;
36 
37  /** The TinyTextEntity containing the last character of the match. */
38  TextList::ConstIterator it_end;
39 
40  /** The index of the first character of the match in (*it_begin)->text().
41  * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
42  */
43  int offset_begin;
44 
45  /** One plus the index of the last character of the match in (*it_end)->text().
46  * Satisfies 0 < offset_end <= (*it_end)->text().length().
47  */
48  int offset_end;
49 };
50 
51 /* text comparison functions */
52 
53 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
54 {
55 #ifdef DEBUG_TEXTPAGE
56  qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
57 #endif
58  return from.compare(to, Qt::CaseInsensitive) == 0;
59 }
60 
61 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
62 {
63 #ifdef DEBUG_TEXTPAGE
64  qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
65 #endif
66  return from.compare(to, Qt::CaseSensitive) == 0;
67 }
68 
69 /**
70  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
71  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
72  * intersection of the segments to the length of the shortest of the two input segments
73  * is not smaller than the threshold.
74  */
75 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
76 {
77  // check if one consumes another fully (speed optimization)
78 
79  if (left1 <= left2 && right1 >= right2) {
80  return true;
81  }
82 
83  if (left1 >= left2 && right1 <= right2) {
84  return true;
85  }
86 
87  // check if there is overlap above threshold
88  if (right2 >= left1 && right1 >= left2) {
89  double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
90 
91  double length1 = right1 - left1, length2 = right2 - left2;
92 
93  return overlap * 100 >= threshold * qMin(length1, length2);
94  }
95 
96  return false;
97 }
98 
99 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
100 {
101  return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
102 }
103 
104 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
105 {
106  return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
107 }
108 
109 /*
110  Rationale behind TinyTextEntity:
111 
112  instead of storing directly a QString for the text of an entity,
113  we store the UTF-16 data and their length. This way, we save about
114  4 int's wrt a QString, and we can create a new string from that
115  raw data (that's the only penalty of that).
116  Even better, if the string we need to store has at most
117  MaxStaticChars characters, then we store those in place of the QChar*
118  that would be used (with new[] + free[]) for the data.
119  */
120 class TinyTextEntity
121 {
122  static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
123 
124 public:
125  TinyTextEntity(const QString &text, const NormalizedRect &rect)
126  : area(rect)
127  {
128  Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
129  Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
130  length = text.length();
131  switch (length) {
132 #if QT_POINTER_SIZE >= 8
133  case 4:
134  d.qc[3] = text.at(3).unicode();
135  // fall through
136  case 3:
137  d.qc[2] = text.at(2).unicode();
138 #endif
139  // fall through
140  case 2:
141  d.qc[1] = text.at(1).unicode();
142  // fall through
143  case 1:
144  d.qc[0] = text.at(0).unicode();
145  break;
146  default:
147  d.data = new QChar[length];
148  std::memcpy(d.data, text.constData(), length * sizeof(QChar));
149  }
150  }
151 
152  ~TinyTextEntity()
153  {
154  if (length > MaxStaticChars) {
155  delete[] d.data;
156  }
157  }
158 
159  inline QString text() const
160  {
161  return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
162  }
163 
164  inline NormalizedRect transformedArea(const QTransform &matrix) const
165  {
166  NormalizedRect transformed_area = area;
167  transformed_area.transform(matrix);
168  return transformed_area;
169  }
170 
171  NormalizedRect area;
172 
173 private:
174  Q_DISABLE_COPY(TinyTextEntity)
175 
176  union {
177  QChar *data;
178  ushort qc[MaxStaticChars];
179  } d;
180  int length;
181 };
182 
184  : m_text(text)
185  , m_area(area)
186  , d(nullptr)
187 {
188 }
189 
191 {
192  delete m_area;
193 }
194 
196 {
197  return m_text;
198 }
199 
201 {
202  return m_area;
203 }
204 
206 {
207  NormalizedRect transformed_area = *m_area;
208  transformed_area.transform(matrix);
209  return transformed_area;
210 }
211 
212 TextPagePrivate::TextPagePrivate()
213  : m_page(nullptr)
214 {
215 }
216 
217 TextPagePrivate::~TextPagePrivate()
218 {
219  qDeleteAll(m_searchPoints);
220  qDeleteAll(m_words);
221 }
222 
224  : d(new TextPagePrivate())
225 {
226 }
227 
229  : d(new TextPagePrivate())
230 {
232  for (; it != itEnd; ++it) {
233  TextEntity *e = *it;
234  if (!e->text().isEmpty()) {
235  d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
236  }
237  delete e;
238  }
239 }
240 
242 {
243  delete d;
244 }
245 
246 void TextPage::append(const QString &text, NormalizedRect *area)
247 {
248  if (!text.isEmpty()) {
249  if (!d->m_words.isEmpty()) {
250  TinyTextEntity *lastEntity = d->m_words.last();
251  const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
252  if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
253  // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
254  NormalizedRect newArea = *area | lastEntity->area;
255  delete area;
256  delete lastEntity;
257  d->m_words.removeLast();
258  d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
259  return;
260  }
261  }
262 
263  d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
264  }
265  delete area;
266 }
267 
268 struct WordWithCharacters {
269  WordWithCharacters(TinyTextEntity *w, const TextList &c)
270  : word(w)
271  , characters(c)
272  {
273  }
274 
275  inline QString text() const
276  {
277  return word->text();
278  }
279 
280  inline const NormalizedRect &area() const
281  {
282  return word->area;
283  }
284 
285  TinyTextEntity *word;
286  TextList characters;
287 };
289 
290 /**
291  * We will divide the whole page in some regions depending on the horizontal and
292  * vertical spacing among different regions. Each region will have an area and an
293  * associated WordsWithCharacters in sorted order.
294  */
295 class RegionText
296 {
297 public:
298  RegionText() {};
299 
300  RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
301  : m_region_wordWithCharacters(wordsWithCharacters)
302  , m_area(area)
303  {
304  }
305 
306  inline QString string() const
307  {
308  QString res;
309  for (const WordWithCharacters &word : m_region_wordWithCharacters) {
310  res += word.text();
311  }
312  return res;
313  }
314 
315  inline WordsWithCharacters text() const
316  {
317  return m_region_wordWithCharacters;
318  }
319 
320  inline QRect area() const
321  {
322  return m_area;
323  }
324 
325  inline void setArea(const QRect area)
326  {
327  m_area = area;
328  }
329 
330  inline void setText(const WordsWithCharacters &wordsWithCharacters)
331  {
332  m_region_wordWithCharacters = wordsWithCharacters;
333  }
334 
335 private:
336  WordsWithCharacters m_region_wordWithCharacters;
337  QRect m_area;
338 };
339 
341 {
342  if (d->m_words.isEmpty()) {
343  return new RegularAreaRect();
344  }
345 
346  /**
347  It works like this:
348  There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
349  rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
350  points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
351  for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
352  of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
353  1. (rx,ry)x(1,ty)
354  2. (0,ty)x(1,uy)
355  3. (0,uy)x(vx,vy)
356 
357  To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
358  or that has a left border >= cx and bottom border >= cy.
359  */
360  RegularAreaRect *ret = new RegularAreaRect;
361 
362  PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
363  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
364  const double scaleX = d->m_page->width();
365  const double scaleY = d->m_page->height();
366 
367  NormalizedPoint startC = sel->start();
368  NormalizedPoint endC = sel->end();
369  NormalizedPoint temp;
370 
371  // if startPoint is right to endPoint swap them
372  if (startC.x > endC.x) {
373  temp = startC;
374  startC = endC;
375  endC = temp;
376  }
377 
378  // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
379  const NormalizedRect boundingRect = d->m_page->boundingBox();
380  const QRect content = boundingRect.geometry(scaleX, scaleY);
381  const double minX = content.left();
382  const double maxX = content.right();
383  const double minY = content.top();
384  const double maxY = content.bottom();
385 
386  /**
387  * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
388  * the endRectangle. We have four cases:
389  *
390  * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
391  * and endPoint are outof the bounding rect (do not intersect)
392  *
393  * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
394  *
395  * Case 2(a): find the rectangle which contains start and endpoint and having some
396  * TextEntity
397  *
398  * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
399  * TextEntity within the rect made by startPoint and endPoint
400  *
401  * Case 3: Now, we may have two type of selection.
402  * 1. startpoint is left-top of start_end and endpoint is right-bottom
403  * 2. startpoint is left-bottom of start_end and endpoint is top-right
404  *
405  * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
406  * text within them. so, we need to search for the best suitable textposition for start and end.
407  *
408  * Case 3(a): We search the nearest rectangle consisting of some
409  * TinyTextEntity right to or bottom of the startPoint for selection 01.
410  * And, for selection 02, we have to search for right and top
411  *
412  * Case 3(b): For endpoint, we have to find the point top of or left to
413  * endpoint if we have selection 01.
414  * Otherwise, the search will be left and bottom
415  */
416 
417  // we know that startC.x > endC.x, we need to decide which is top and which is bottom
418  const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
419 
420  // Case 1(a)
421  if (!boundingRect.intersects(start_end)) {
422  return ret;
423  } else {
424  // case 1(b)
425  /**
426  note that, after swapping of start and end, we know that,
427  start is always left to end. but, we cannot say start is
428  positioned upper than end.
429  **/
430  // if start is left to content rect take it to content rect boundary
431  if (startC.x * scaleX < minX) {
432  startC.x = minX / scaleX;
433  }
434  if (endC.x * scaleX > maxX) {
435  endC.x = maxX / scaleX;
436  }
437 
438  // if start is top to end (selection type 01)
439  if (startC.y * scaleY < minY) {
440  startC.y = minY / scaleY;
441  }
442  if (endC.y * scaleY > maxY) {
443  endC.y = maxY / scaleY;
444  }
445 
446  // if start is bottom to end (selection type 02)
447  if (startC.y * scaleY > maxY) {
448  startC.y = maxY / scaleY;
449  }
450  if (endC.y * scaleY < minY) {
451  endC.y = minY / scaleY;
452  }
453  }
454 
455  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
456  TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
457  const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
458 
459  NormalizedRect tmp;
460  // case 2(a)
461  for (; it != itEnd; ++it) {
462  tmp = (*it)->area;
463  if (tmp.contains(startC.x, startC.y)) {
464  start = it;
465  }
466  if (tmp.contains(endC.x, endC.y)) {
467  end = it;
468  }
469  }
470 
471  // case 2(b)
472  it = tmpIt;
473  if (start == it && end == itEnd) {
474  for (; it != itEnd; ++it) {
475  // is there any text rectangle within the start_end rect
476  tmp = (*it)->area;
477  if (start_end.intersects(tmp)) {
478  break;
479  }
480  }
481 
482  // we have searched every text entities, but none is within the rectangle created by start and end
483  // so, no selection should be done
484  if (it == itEnd) {
485  return ret;
486  }
487  }
488  it = tmpIt;
489  bool selection_two_start = false;
490 
491  // case 3.a
492  if (start == it) {
493  bool flagV = false;
494  NormalizedRect rect;
495 
496  // selection type 01
497  if (startC.y <= endC.y) {
498  for (; it != itEnd; ++it) {
499  rect = (*it)->area;
500  rect.isBottom(startC) ? flagV = false : flagV = true;
501 
502  if (flagV && rect.isRight(startC)) {
503  start = it;
504  break;
505  }
506  }
507  }
508 
509  // selection type 02
510  else {
511  selection_two_start = true;
512  int distance = scaleX + scaleY + 100;
513  int count = 0;
514 
515  for (; it != itEnd; ++it) {
516  rect = (*it)->area;
517 
518  if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
519  count++;
520  QRect entRect = rect.geometry(scaleX, scaleY);
521  int xdist, ydist;
522  xdist = entRect.center().x() - startC.x * scaleX;
523  ydist = entRect.center().y() - startC.y * scaleY;
524 
525  // make them positive
526  if (xdist < 0) {
527  xdist = -xdist;
528  }
529  if (ydist < 0) {
530  ydist = -ydist;
531  }
532 
533  if ((xdist + ydist) < distance) {
534  distance = xdist + ydist;
535  start = it;
536  }
537  }
538  }
539  }
540  }
541 
542  // case 3.b
543  if (end == itEnd) {
544  it = tmpIt;
545  itEnd = itEnd - 1;
546 
547  bool flagV = false;
548  NormalizedRect rect;
549 
550  if (startC.y <= endC.y) {
551  for (; itEnd >= it; itEnd--) {
552  rect = (*itEnd)->area;
553  rect.isTop(endC) ? flagV = false : flagV = true;
554 
555  if (flagV && rect.isLeft(endC)) {
556  end = itEnd;
557  break;
558  }
559  }
560  }
561 
562  else {
563  int distance = scaleX + scaleY + 100;
564  for (; itEnd >= it; itEnd--) {
565  rect = (*itEnd)->area;
566 
567  if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
568  QRect entRect = rect.geometry(scaleX, scaleY);
569  int xdist, ydist;
570  xdist = entRect.center().x() - endC.x * scaleX;
571  ydist = entRect.center().y() - endC.y * scaleY;
572 
573  // make them positive
574  if (xdist < 0) {
575  xdist = -xdist;
576  }
577  if (ydist < 0) {
578  ydist = -ydist;
579  }
580 
581  if ((xdist + ydist) < distance) {
582  distance = xdist + ydist;
583  end = itEnd;
584  }
585  }
586  }
587  }
588  }
589 
590  /* if start and end in selection 02 are in the same column, and we
591  start at an empty space we have to remove the selection of last
592  character
593  */
594  if (selection_two_start) {
595  if (start > end) {
596  start = start - 1;
597  }
598  }
599 
600  // if start is less than end swap them
601  if (start > end) {
602  it = start;
603  start = end;
604  end = it;
605  }
606 
607  // removes the possibility of crash, in case none of 1 to 3 is true
608  if (end == d->m_words.constEnd()) {
609  end--;
610  }
611 
612  for (; start <= end; start++) {
613  ret->appendShape((*start)->transformedArea(matrix), side);
614  }
615 
616  return ret;
617 }
618 
619 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
620 {
621  SearchDirection dir = direct;
622  // invalid search request
623  if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
624  return nullptr;
625  }
626  TextList::ConstIterator start;
627  int start_offset = 0;
628  TextList::ConstIterator end;
629  const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
630  if (sIt == d->m_searchPoints.constEnd()) {
631  // if no previous run of this search is found, then set it to start
632  // from the beginning (respecting the search direction)
633  if (dir == NextResult) {
634  dir = FromTop;
635  } else if (dir == PreviousResult) {
636  dir = FromBottom;
637  }
638  }
639  bool forward = true;
640  switch (dir) {
641  case FromTop:
642  start = d->m_words.constBegin();
643  start_offset = 0;
644  end = d->m_words.constEnd();
645  break;
646  case FromBottom:
647  start = d->m_words.constEnd();
648  start_offset = 0;
649  end = d->m_words.constBegin();
650  forward = false;
651  break;
652  case NextResult:
653  start = (*sIt)->it_end;
654  start_offset = (*sIt)->offset_end;
655  end = d->m_words.constEnd();
656  break;
657  case PreviousResult:
658  start = (*sIt)->it_begin;
659  start_offset = (*sIt)->offset_begin;
660  end = d->m_words.constBegin();
661  forward = false;
662  break;
663  };
664  RegularAreaRect *ret = nullptr;
665  const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
666  if (forward) {
667  ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
668  } else {
669  ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
670  }
671  return ret;
672 }
673 
674 // hyphenated '-' must be at the end of a word, so hyphenation means
675 // we have a '-' just followed by a '\n' character
676 // check if the string contains a '-' character
677 // if the '-' is the last entry
678 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
679 {
680  const int len = str.length();
681 
682  // hyphenated '-' must be at the end of a word, so hyphenation means
683  // we have a '-' just followed by a '\n' character
684  // check if the string contains a '-' character
685  // if the '-' is the last entry
686  if (str.endsWith(QLatin1Char('-'))) {
687  // validity chek of it + 1
688  if ((it + 1) != textListEnd) {
689  // 1. if the next character is '\n'
690  const QString &lookahedStr = (*(it + 1))->text();
691  if (lookahedStr.startsWith(QLatin1Char('\n'))) {
692  return len - 1;
693  }
694 
695  // 2. if the next word is in a different line or not
696  const NormalizedRect &hyphenArea = (*it)->area;
697  const NormalizedRect &lookaheadArea = (*(it + 1))->area;
698 
699  // lookahead to check whether both the '-' rect and next character rect overlap
700  if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
701  return len - 1;
702  }
703  }
704  }
705  // else if it is the second last entry - for example in pdf format
706  else if (str.endsWith(QLatin1String("-\n"))) {
707  return len - 2;
708  }
709 
710  return len;
711 }
712 
713 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
714 {
715  PagePrivate *pagePrivate = PagePrivate::get(m_page);
716  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
717  RegularAreaRect *ret = new RegularAreaRect;
718 
719  for (TextList::ConstIterator it = sp->it_begin;; it++) {
720  const TinyTextEntity *curEntity = *it;
721  ret->append(curEntity->transformedArea(matrix));
722 
723  if (it == sp->it_end) {
724  break;
725  }
726  }
727 
728  ret->simplify();
729  return ret;
730 }
731 
732 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
733 {
734  // normalize query search all unicode (including glyphs)
736 
737  // j is the current position in our query
738  // queryLeft is the length of the query we have left to match
739  int j = 0, queryLeft = query.length();
740 
741  TextList::ConstIterator it = start;
742  int offset = start_offset;
743 
744  TextList::ConstIterator it_begin = TextList::ConstIterator();
745  int offset_begin = 0; // dummy initial value to suppress compiler warnings
746 
747  while (it != end) {
748  const TinyTextEntity *curEntity = *it;
749  const QString &str = curEntity->text();
750  const int strLen = str.length();
751  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
752  // adjustedLen <= strLen
753 
754  if (offset >= strLen) {
755  it++;
756  offset = 0;
757  continue;
758  }
759 
760  if (it_begin == TextList::ConstIterator()) {
761  it_begin = it;
762  offset_begin = offset;
763  }
764 
765  // Let the user write the hyphen or not when searching for text
766  int matchedLen = -1;
767  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
768  // we have equal (or less than) area of the query left as the length of the current
769  // entity
770  const int min = qMin(queryLeft, matchingLen - offset);
771  if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
772  matchedLen = min;
773  break;
774  }
775  }
776 
777  if (matchedLen == -1) {
778  // we have not matched
779  // this means we do not have a complete match
780  // we need to get back to query start
781  // and continue the search from this place
782 #ifdef DEBUG_TEXTPAGE
783  qCDebug(OkularCoreDebug) << "\tnot matched";
784 #endif
785  j = 0;
786  queryLeft = query.length();
787  it = it_begin;
788  offset = offset_begin + 1;
789  it_begin = TextList::ConstIterator();
790  } else {
791  // we have a match
792  // move the current position in the query
793  // to the position after the length of this string
794  // we matched
795  // subtract the length of the current entity from
796  // the left length of the query
797 #ifdef DEBUG_TEXTPAGE
798  qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
799 #endif
800  j += matchedLen;
801  queryLeft -= matchedLen;
802 
803  if (queryLeft == 0) {
804  // save or update the search point for the current searchID
805  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
806  if (sIt == m_searchPoints.end()) {
807  sIt = m_searchPoints.insert(searchID, new SearchPoint);
808  }
809  SearchPoint *sp = *sIt;
810  sp->it_begin = it_begin;
811  sp->it_end = it;
812  sp->offset_begin = offset_begin;
813  sp->offset_end = offset + matchedLen;
814  return searchPointToArea(sp);
815  }
816 
817  it++;
818  offset = 0;
819  }
820  }
821  // end of loop - it means that we've ended the textentities
822 
823  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
824  if (sIt != m_searchPoints.end()) {
825  SearchPoint *sp = *sIt;
826  m_searchPoints.erase(sIt);
827  delete sp;
828  }
829  return nullptr;
830 }
831 
832 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
833 {
834  // normalize query to search all unicode (including glyphs)
836 
837  // j is the current position in our query
838  // len is the length of the string in TextEntity
839  // queryLeft is the length of the query we have left
840  int j = query.length(), queryLeft = query.length();
841 
842  TextList::ConstIterator it = start;
843  int offset = start_offset;
844 
845  TextList::ConstIterator it_begin = TextList::ConstIterator();
846  int offset_begin = 0; // dummy initial value to suppress compiler warnings
847 
848  while (true) {
849  if (offset <= 0) {
850  if (it == end) {
851  break;
852  }
853  it--;
854  }
855 
856  const TinyTextEntity *curEntity = *it;
857  const QString &str = curEntity->text();
858  const int strLen = str.length();
859  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
860  // adjustedLen <= strLen
861 
862  if (offset <= 0) {
863  offset = strLen;
864  }
865 
866  if (it_begin == TextList::ConstIterator()) {
867  it_begin = it;
868  offset_begin = offset;
869  }
870 
871  // Let the user write the hyphen or not when searching for text
872  int matchedLen = -1;
873  // we have equal (or less than) area of the query left as the length of the current
874  // entity
875  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
876  const int hyphenOffset = (strLen - matchingLen);
877  const int min = qMin(queryLeft + hyphenOffset, offset);
878  if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
879  matchedLen = min - hyphenOffset;
880  break;
881  }
882  }
883 
884  if (matchedLen == -1) {
885  // we have not matched
886  // this means we do not have a complete match
887  // we need to get back to query start
888  // and continue the search from this place
889 #ifdef DEBUG_TEXTPAGE
890  qCDebug(OkularCoreDebug) << "\tnot matched";
891 #endif
892 
893  j = query.length();
894  queryLeft = query.length();
895  it = it_begin;
896  offset = offset_begin - 1;
897  it_begin = TextList::ConstIterator();
898  } else {
899  // we have a match
900  // move the current position in the query
901  // to the position after the length of this string
902  // we matched
903  // subtract the length of the current entity from
904  // the left length of the query
905 #ifdef DEBUG_TEXTPAGE
906  qCDebug(OkularCoreDebug) << "\tmatched";
907 #endif
908  j -= matchedLen;
909  queryLeft -= matchedLen;
910 
911  if (queryLeft == 0) {
912  // save or update the search point for the current searchID
913  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
914  if (sIt == m_searchPoints.end()) {
915  sIt = m_searchPoints.insert(searchID, new SearchPoint);
916  }
917  SearchPoint *sp = *sIt;
918  sp->it_begin = it;
919  sp->it_end = it_begin;
920  sp->offset_begin = offset - matchedLen;
921  sp->offset_end = offset_begin;
922  return searchPointToArea(sp);
923  }
924 
925  offset = 0;
926  }
927  }
928  // end of loop - it means that we've ended the textentities
929 
930  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
931  if (sIt != m_searchPoints.end()) {
932  SearchPoint *sp = *sIt;
933  m_searchPoints.erase(sIt);
934  delete sp;
935  }
936  return nullptr;
937 }
938 
940 {
942 }
943 
945 {
946  if (area && area->isNull()) {
947  return QString();
948  }
949 
950  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
951  QString ret;
952  if (area) {
953  for (; it != itEnd; ++it) {
955  if (area->intersects((*it)->area)) {
956  ret += (*it)->text();
957  }
958  } else {
959  NormalizedPoint center = (*it)->area.center();
960  if (area->contains(center.x, center.y)) {
961  ret += (*it)->text();
962  }
963  }
964  }
965  } else {
966  for (; it != itEnd; ++it) {
967  ret += (*it)->text();
968  }
969  }
970  return ret;
971 }
972 
973 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
974 {
975  QRect firstArea = first.area().roundedGeometry(1000, 1000);
976  QRect secondArea = second.area().roundedGeometry(1000, 1000);
977 
978  return firstArea.left() < secondArea.left();
979 }
980 
981 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
982 {
983  const QRect firstArea = first.area().roundedGeometry(1000, 1000);
984  const QRect secondArea = second.area().roundedGeometry(1000, 1000);
985 
986  return firstArea.top() < secondArea.top();
987 }
988 
989 /**
990  * Sets a new world list. Deleting the contents of the old one
991  */
992 void TextPagePrivate::setWordList(const TextList &list)
993 {
994  qDeleteAll(m_words);
995  m_words = list;
996 }
997 
998 /**
999  * Remove all the spaces in between texts. It will make all the generators
1000  * same, whether they save spaces(like pdf) or not(like djvu).
1001  */
1002 static void removeSpace(TextList *words)
1003 {
1004  TextList::Iterator it = words->begin();
1005  const QString str(QLatin1Char(' '));
1006 
1007  while (it != words->end()) {
1008  if ((*it)->text() == str) {
1009  it = words->erase(it);
1010  } else {
1011  ++it;
1012  }
1013  }
1014 }
1015 
1016 /**
1017  * We will read the TinyTextEntity from characters and try to create words from there.
1018  * Note: characters might be already characters for some generators, but we will keep
1019  * the nomenclature characters for the generator produced data. The resulting
1020  * WordsWithCharacters memory has to be managed by the caller, both the
1021  * WordWithCharacters::word and WordWithCharacters::characters contents
1022  */
1023 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1024 {
1025  /**
1026  * We will traverse characters and try to create words from the TinyTextEntities in it.
1027  * We will search TinyTextEntity blocks and merge them until we get a
1028  * space between two consecutive TinyTextEntities. When we get a space
1029  * we can take it as a end of word. Then we store the word as a TinyTextEntity
1030  * and keep it in newList.
1031 
1032  * We create a RegionText named regionWord that contains the word and the characters associated with it and
1033  * a rectangle area of the element in newList.
1034 
1035  */
1036  WordsWithCharacters wordsWithCharacters;
1037 
1038  TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1039  int newLeft, newRight, newTop, newBottom;
1040  int index = 0;
1041 
1042  for (; it != itEnd; it++) {
1043  QString textString = (*it)->text();
1044  QString newString;
1045  QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1046  TextList wordCharacters;
1047  tmpIt = it;
1048  int space = 0;
1049 
1050  while (!space) {
1051  if (!textString.isEmpty()) {
1052  newString.append(textString);
1053 
1054  // when textString is the start of the word
1055  if (tmpIt == it) {
1056  NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1057  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1058  } else {
1059  NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1060  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1061  }
1062  }
1063 
1064  ++it;
1065 
1066  /*
1067  we must have to put this line before the if condition of it==itEnd
1068  otherwise the last character can be missed
1069  */
1070  if (it == itEnd) {
1071  break;
1072  }
1073  elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1074  if (!doesConsumeY(elementArea, lineArea, 60)) {
1075  --it;
1076  break;
1077  }
1078 
1079  const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1080  const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1081 
1082  space = elementArea.left() - lineArea.right();
1083 
1084  if (space != 0) {
1085  it--;
1086  break;
1087  }
1088 
1089  newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1090  newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1091  newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1092  newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1093 
1094  lineArea.setLeft(newLeft);
1095  lineArea.setTop(newTop);
1096  lineArea.setWidth(newRight - newLeft);
1097  lineArea.setHeight(newBottom - newTop);
1098 
1099  textString = (*it)->text();
1100  }
1101 
1102  // if newString is not empty, save it
1103  if (!newString.isEmpty()) {
1104  const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1105  TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1106  wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1107 
1108  index++;
1109  }
1110 
1111  if (it == itEnd) {
1112  break;
1113  }
1114  }
1115 
1116  return wordsWithCharacters;
1117 }
1118 
1119 /**
1120  * Create Lines from the words and sort them
1121  */
1122 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1123 {
1124  /**
1125  * We cannot assume that the generator will give us texts in the right order.
1126  * We can only assume that we will get texts in the page and their bounding
1127  * rectangle. The texts can be character, word, half-word anything.
1128  * So, we need to:
1129  **
1130  * 1. Sort rectangles/boxes containing texts by y0(top)
1131  * 2. Create textline where there is y overlap between TinyTextEntity 's
1132  * 3. Within each line sort the TinyTextEntity 's by x0(left)
1133  */
1134 
1136 
1137  /*
1138  Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1139  not contain same pointers for all the TinyTextEntity.
1140  */
1141  QList<WordWithCharacters> words = wordsTmp;
1142 
1143  // Step 1
1144  std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1145 
1146  // Step 2
1147  QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1148 
1149  // for every non-space texts(characters/words) in the textList
1150  for (; it != itEnd; it++) {
1151  const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1152  bool found = false;
1153 
1154  for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1155  /* the line area which will be expanded
1156  line_rects is only necessary to preserve the topmin and bottommax of all
1157  the texts in the line, left and right is not necessary at all
1158  */
1159  QRect &lineArea = linesI.second;
1160  const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1161  const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1162 
1163  /*
1164  if the new text and the line has y overlapping parts of more than 70%,
1165  the text will be added to this line
1166  */
1167  if (doesConsumeY(elementArea, lineArea, 70)) {
1168  WordsWithCharacters &line = linesI.first;
1169  line.append(*it);
1170 
1171  const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1172  const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1173  const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1174  const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1175 
1176  lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1177  found = true;
1178  }
1179 
1180  if (found) {
1181  break;
1182  }
1183  }
1184 
1185  /* when we have found a new line create a new TextList containing
1186  only one element and append it to the lines
1187  */
1188  if (!found) {
1189  WordsWithCharacters tmp;
1190  tmp.append((*it));
1191  lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1192  }
1193  }
1194 
1195  // Step 3
1196  for (QPair<WordsWithCharacters, QRect> &line : lines) {
1197  WordsWithCharacters &list = line.first;
1198  std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1199  }
1200 
1201  return lines;
1202 }
1203 
1204 /**
1205  * Calculate Statistical information from the lines we made previously
1206  */
1207 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1208 {
1209  /**
1210  * For the region, defined by line_rects and lines
1211  * 1. Make line statistical analysis to find the line spacing
1212  * 2. Make character statistical analysis to differentiate between
1213  * word spacing and column spacing.
1214  */
1215 
1216  /**
1217  * Step 0
1218  */
1219  const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1220 
1221  /**
1222  * Step 1
1223  */
1224  QMap<int, int> line_space_stat;
1225  for (int i = 0; i < sortedLines.length(); i++) {
1226  const QRect rectUpper = sortedLines.at(i).second;
1227 
1228  if (i + 1 == sortedLines.length()) {
1229  break;
1230  }
1231  const QRect rectLower = sortedLines.at(i + 1).second;
1232 
1233  int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1234  if (linespace < 0) {
1235  linespace = -linespace;
1236  }
1237 
1238  if (line_space_stat.contains(linespace)) {
1239  line_space_stat[linespace]++;
1240  } else {
1241  line_space_stat[linespace] = 1;
1242  }
1243  }
1244 
1245  *line_spacing = 0;
1246  int weighted_count = 0;
1247  QMapIterator<int, int> iterate_linespace(line_space_stat);
1248 
1249  while (iterate_linespace.hasNext()) {
1250  iterate_linespace.next();
1251  *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1252  weighted_count += iterate_linespace.value();
1253  }
1254  if (*line_spacing != 0) {
1255  *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1256  }
1257 
1258  /**
1259  * Step 2
1260  */
1261  // We would like to use QMap instead of QHash as it will keep the keys sorted
1262  QMap<int, int> hor_space_stat;
1263  QMap<int, int> col_space_stat;
1264  QList<QList<QRect>> space_rects;
1265  QVector<QRect> max_hor_space_rects;
1266 
1267  // Space in every line
1268  for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1269  const WordsWithCharacters list = sortedLine.first;
1270  QList<QRect> line_space_rects;
1271  int maxSpace = 0, minSpace = pageWidth;
1272 
1273  // for every TinyTextEntity element in the line
1275  QRect max_area1, max_area2;
1276  QString before_max, after_max;
1277 
1278  // for every line
1279  for (; it != itEnd; it++) {
1280  const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1281  if (it + 1 == itEnd) {
1282  break;
1283  }
1284 
1285  const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1286  int space = area2.left() - area1.right();
1287 
1288  if (space > maxSpace) {
1289  max_area1 = area1;
1290  max_area2 = area2;
1291  maxSpace = space;
1292  before_max = (*it).text();
1293  after_max = (*(it + 1)).text();
1294  }
1295 
1296  if (space < minSpace && space != 0) {
1297  minSpace = space;
1298  }
1299 
1300  // if we found a real space, whose length is not zero and also less than the pageWidth
1301  if (space != 0 && space != pageWidth) {
1302  // increase the count of the space amount
1303  if (hor_space_stat.contains(space)) {
1304  hor_space_stat[space]++;
1305  } else {
1306  hor_space_stat[space] = 1;
1307  }
1308 
1309  int left, right, top, bottom;
1310 
1311  left = area1.right();
1312  right = area2.left();
1313 
1314  top = area2.top() < area1.top() ? area2.top() : area1.top();
1315  bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1316 
1317  QRect rect(left, top, right - left, bottom - top);
1318  line_space_rects.append(rect);
1319  }
1320  }
1321 
1322  space_rects.append(line_space_rects);
1323 
1324  if (hor_space_stat.contains(maxSpace)) {
1325  if (hor_space_stat[maxSpace] != 1) {
1326  hor_space_stat[maxSpace]--;
1327  } else {
1328  hor_space_stat.remove(maxSpace);
1329  }
1330  }
1331 
1332  if (maxSpace != 0) {
1333  if (col_space_stat.contains(maxSpace)) {
1334  col_space_stat[maxSpace]++;
1335  } else {
1336  col_space_stat[maxSpace] = 1;
1337  }
1338 
1339  // store the max rect of each line
1340  const int left = max_area1.right();
1341  const int right = max_area2.left();
1342  const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1343  const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1344 
1345  const QRect rect(left, top, right - left, bottom - top);
1346  max_hor_space_rects.append(rect);
1347  } else {
1348  max_hor_space_rects.append(QRect(0, 0, 0, 0));
1349  }
1350  }
1351 
1352  // All the between word space counts are in hor_space_stat
1353  *word_spacing = 0;
1354  weighted_count = 0;
1355  QMapIterator<int, int> iterate(hor_space_stat);
1356 
1357  while (iterate.hasNext()) {
1358  iterate.next();
1359 
1360  if (iterate.key() > 0) {
1361  *word_spacing += iterate.value() * iterate.key();
1362  weighted_count += iterate.value();
1363  }
1364  }
1365  if (weighted_count) {
1366  *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1367  }
1368 
1369  *col_spacing = 0;
1370  QMapIterator<int, int> iterate_col(col_space_stat);
1371 
1372  while (iterate_col.hasNext()) {
1373  iterate_col.next();
1374  if (iterate_col.value() > *col_spacing) {
1375  *col_spacing = iterate_col.value();
1376  }
1377  }
1378  *col_spacing = col_space_stat.key(*col_spacing);
1379 
1380  // if there is just one line in a region, there is no point in dividing it
1381  if (sortedLines.length() == 1) {
1382  *word_spacing = *col_spacing;
1383  }
1384 }
1385 
1386 /**
1387  * Implements the XY Cut algorithm for textpage segmentation
1388  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1389  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1390  */
1391 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1392 {
1393  RegionTextList tree;
1394  QRect contentRect(0, 0, pageWidth, pageHeight);
1395  const RegionText root(wordsWithCharacters, contentRect);
1396 
1397  // start the tree with the root, it is our only region at the start
1398  tree.push_back(root);
1399 
1400  int i = 0;
1401 
1402  // while traversing the tree has not been ended
1403  while (i < tree.length()) {
1404  const RegionText node = tree.at(i);
1405  QRect regionRect = node.area();
1406 
1407  /**
1408  * 1. calculation of projection profiles
1409  */
1410  // allocate the size of proj profiles and initialize with 0
1411  int size_proj_y = node.area().height();
1412  int size_proj_x = node.area().width();
1413  // dynamic memory allocation
1414  QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1415  QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1416 
1417  for (int j = 0; j < size_proj_y; ++j) {
1418  proj_on_yaxis[j] = 0;
1419  }
1420  for (int j = 0; j < size_proj_x; ++j) {
1421  proj_on_xaxis[j] = 0;
1422  }
1423 
1424  const QList<WordWithCharacters> list = node.text();
1425 
1426  // Calculate tcx and tcy locally for each new region
1427  int word_spacing, line_spacing, column_spacing;
1428  calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1429 
1430  const int tcx = word_spacing * 2;
1431  const int tcy = line_spacing * 2;
1432 
1433  int maxX = 0, maxY = 0;
1434  int avgX = 0;
1435  int count;
1436 
1437  // for every text in the region
1438  for (const WordWithCharacters &wwc : list) {
1439  TinyTextEntity *ent = wwc.word;
1440  const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1441 
1442  // calculate vertical projection profile proj_on_xaxis1
1443  for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1444  if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1445  proj_on_xaxis[k - regionRect.left()] += entRect.height();
1446  }
1447  }
1448 
1449  // calculate horizontal projection profile in the same way
1450  for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1451  if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1452  proj_on_yaxis[k - regionRect.top()] += entRect.width();
1453  }
1454  }
1455  }
1456 
1457  for (int j = 0; j < size_proj_y; ++j) {
1458  if (proj_on_yaxis[j] > maxY) {
1459  maxY = proj_on_yaxis[j];
1460  }
1461  }
1462 
1463  avgX = count = 0;
1464  for (int j = 0; j < size_proj_x; ++j) {
1465  if (proj_on_xaxis[j] > maxX) {
1466  maxX = proj_on_xaxis[j];
1467  }
1468  if (proj_on_xaxis[j]) {
1469  count++;
1470  avgX += proj_on_xaxis[j];
1471  }
1472  }
1473  if (count) {
1474  avgX /= count;
1475  }
1476 
1477  /**
1478  * 2. Cleanup Boundary White Spaces and removal of noise
1479  */
1480  int xbegin = 0, xend = size_proj_x - 1;
1481  int ybegin = 0, yend = size_proj_y - 1;
1482  while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1483  xbegin++;
1484  }
1485  while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1486  xend--;
1487  }
1488  while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1489  ybegin++;
1490  }
1491  while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1492  yend--;
1493  }
1494 
1495  // update the regionRect
1496  int old_left = regionRect.left(), old_top = regionRect.top();
1497  regionRect.setLeft(old_left + xbegin);
1498  regionRect.setRight(old_left + xend);
1499  regionRect.setTop(old_top + ybegin);
1500  regionRect.setBottom(old_top + yend);
1501 
1502  int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1503  for (int j = 0; j < size_proj_x; ++j) {
1504  proj_on_xaxis[j] -= tnx;
1505  }
1506  for (int j = 0; j < size_proj_y; ++j) {
1507  proj_on_yaxis[j] -= tny;
1508  }
1509 
1510  /**
1511  * 3. Find the Widest gap
1512  */
1513  int gap_hor = -1, pos_hor = -1;
1514  int begin = -1, end = -1;
1515 
1516  // find all hor_gaps and find the maximum between them
1517  for (int j = 1; j < size_proj_y; ++j) {
1518  // transition from white to black
1519  if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1520  end = j;
1521  }
1522 
1523  // transition from black to white
1524  if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1525  begin = j;
1526  }
1527 
1528  if (begin > 0 && end > 0 && end - begin > gap_hor) {
1529  gap_hor = end - begin;
1530  pos_hor = (end + begin) / 2;
1531  begin = -1;
1532  end = -1;
1533  }
1534  }
1535 
1536  begin = -1, end = -1;
1537  int gap_ver = -1, pos_ver = -1;
1538 
1539  // find all the ver_gaps and find the maximum between them
1540  for (int j = 1; j < size_proj_x; ++j) {
1541  // transition from white to black
1542  if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1543  end = j;
1544  }
1545 
1546  // transition from black to white
1547  if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1548  begin = j;
1549  }
1550 
1551  if (begin > 0 && end > 0 && end - begin > gap_ver) {
1552  gap_ver = end - begin;
1553  pos_ver = (end + begin) / 2;
1554  begin = -1;
1555  end = -1;
1556  }
1557  }
1558 
1559  int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1560  int gap_x = gap_ver, gap_y = gap_hor;
1561 
1562  /**
1563  * 4. Cut the region and make nodes (left,right) or (up,down)
1564  */
1565  bool cut_hor = false, cut_ver = false;
1566 
1567  // For horizontal cut
1568  const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1569  const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1570  const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1571 
1572  // For vertical Cut
1573  const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1574  const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1575  const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1576 
1577  if (gap_y >= gap_x && gap_y >= tcy) {
1578  cut_hor = true;
1579  } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1580  cut_ver = true;
1581  } else if (gap_x >= gap_y && gap_x >= tcx) {
1582  cut_ver = true;
1583  } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1584  cut_hor = true;
1585  } else {
1586  // no cut possible
1587  // we can now update the node rectangle with the shrinked rectangle
1588  RegionText tmpNode = tree.at(i);
1589  tmpNode.setArea(regionRect);
1590  tree.replace(i, tmpNode);
1591  i++;
1592  continue;
1593  }
1594 
1595  WordsWithCharacters list1, list2;
1596 
1597  // horizontal cut, topRect and bottomRect
1598  if (cut_hor) {
1599  for (const WordWithCharacters &word : list) {
1600  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1601 
1602  if (topRect.intersects(wordRect)) {
1603  list1.append(word);
1604  } else {
1605  list2.append(word);
1606  }
1607  }
1608 
1609  RegionText node1(list1, topRect);
1610  RegionText node2(list2, bottomRect);
1611 
1612  tree.replace(i, node1);
1613  tree.insert(i + 1, node2);
1614  }
1615 
1616  // vertical cut, leftRect and rightRect
1617  else if (cut_ver) {
1618  for (const WordWithCharacters &word : list) {
1619  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1620 
1621  if (leftRect.intersects(wordRect)) {
1622  list1.append(word);
1623  } else {
1624  list2.append(word);
1625  }
1626  }
1627 
1628  RegionText node1(list1, leftRect);
1629  RegionText node2(list2, rightRect);
1630 
1631  tree.replace(i, node1);
1632  tree.insert(i + 1, node2);
1633  }
1634  }
1635 
1636  return tree;
1637 }
1638 
1639 /**
1640  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1641  */
1642 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1643 {
1644  /**
1645  * 1. Call makeAndSortLines before adding spaces in between words in a line
1646  * 2. Now add spaces between every two words in a line
1647  * 3. Finally, extract all the space separated texts from each region and return it
1648  */
1649 
1650  // Only change the texts under RegionTexts, not the area
1651  for (RegionText &tmpRegion : tree) {
1652  // Step 01
1653  QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1654 
1655  // Step 02
1656  for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1657  WordsWithCharacters &list = sortedLine.first;
1658  for (int k = 0; k < list.length(); k++) {
1659  const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1660  if (k + 1 >= list.length()) {
1661  break;
1662  }
1663 
1664  const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1665  const int space = area2.left() - area1.right();
1666 
1667  if (space != 0) {
1668  // Make a TinyTextEntity of string space and push it between it and it+1
1669  const int left = area1.right();
1670  const int right = area2.left();
1671  const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1672  const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1673 
1674  const QString spaceStr(QStringLiteral(" "));
1675  const QRect rect(QPoint(left, top), QPoint(right, bottom));
1676  const NormalizedRect entRect(rect, pageWidth, pageHeight);
1677  TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1678  TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1679  WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1680 
1681  list.insert(k + 1, word);
1682 
1683  // Skip the space
1684  k++;
1685  }
1686  }
1687  }
1688 
1689  WordsWithCharacters tmpList;
1690  for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1691  tmpList += sortedLine.first;
1692  }
1693  tmpRegion.setText(tmpList);
1694  }
1695 
1696  // Step 03
1697  WordsWithCharacters tmp;
1698  for (const RegionText &tmpRegion : qAsConst(tree)) {
1699  tmp += tmpRegion.text();
1700  }
1701  return tmp;
1702 }
1703 
1704 /**
1705  * Correct the textOrder, all layout recognition works here
1706  */
1707 void TextPagePrivate::correctTextOrder()
1708 {
1709  // m_page->width() and m_page->height() are in pixels at
1710  // 100% zoom level, and thus depend on display DPI.
1711  // To avoid Okular failing on lowDPI displays,
1712  // we scale pageWidth and pageHeight so their sum equals 2000.
1713  const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1714  const int pageWidth = (int)(scalingFactor * m_page->width());
1715  const int pageHeight = (int)(scalingFactor * m_page->height());
1716 
1717  TextList characters = m_words;
1718 
1719  /**
1720  * Remove spaces from the text
1721  */
1722  removeSpace(&characters);
1723 
1724  /**
1725  * Construct words from characters
1726  */
1727  const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1728 
1729  /**
1730  * Make a XY Cut tree for segmentation of the texts
1731  */
1732  const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1733 
1734  /**
1735  * Add spaces to the word
1736  */
1737  const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1738 
1739  /**
1740  * Break the words into characters
1741  */
1742  TextList listOfCharacters;
1743  for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1744  delete word.word;
1745  listOfCharacters.append(word.characters);
1746  }
1747  setWordList(listOfCharacters);
1748 }
1749 
1751 {
1752  if (area && area->isNull()) {
1753  return TextEntity::List();
1754  }
1755 
1756  TextEntity::List ret;
1757  if (area) {
1758  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1760  if (area->intersects(te->area)) {
1761  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1762  }
1763  } else {
1764  const NormalizedPoint center = te->area.center();
1765  if (area->contains(center.x, center.y)) {
1766  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1767  }
1768  }
1769  }
1770  } else {
1771  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1772  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1773  }
1774  }
1775  return ret;
1776 }
1777 
1779 {
1780  TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1781  TextList::ConstIterator it = itBegin;
1782  TextList::ConstIterator posIt = itEnd;
1783  for (; it != itEnd; ++it) {
1784  if ((*it)->area.contains(p.x, p.y)) {
1785  posIt = it;
1786  break;
1787  }
1788  }
1789  QString text;
1790  if (posIt != itEnd) {
1791  if ((*posIt)->text().simplified().isEmpty()) {
1792  return nullptr;
1793  }
1794  // Find the first TinyTextEntity of the word
1795  while (posIt != itBegin) {
1796  --posIt;
1797  const QString itText = (*posIt)->text();
1798  if (itText.right(1).at(0).isSpace()) {
1799  if (itText.endsWith(QLatin1String("-\n"))) {
1800  // Is an hyphenated word
1801  // continue searching the start of the word back
1802  continue;
1803  }
1804 
1805  if (itText == QLatin1String("\n") && posIt != itBegin) {
1806  --posIt;
1807  if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1808  // Is an hyphenated word
1809  // continue searching the start of the word back
1810  continue;
1811  }
1812  ++posIt;
1813  }
1814 
1815  ++posIt;
1816  break;
1817  }
1818  }
1819  RegularAreaRect *ret = new RegularAreaRect();
1820  for (; posIt != itEnd; ++posIt) {
1821  const QString itText = (*posIt)->text();
1822  if (itText.simplified().isEmpty()) {
1823  break;
1824  }
1825 
1826  ret->appendShape((*posIt)->area);
1827  text += (*posIt)->text();
1828  if (itText.right(1).at(0).isSpace()) {
1829  if (!text.endsWith(QLatin1String("-\n"))) {
1830  break;
1831  }
1832  }
1833  }
1834 
1835  if (word) {
1836  *word = text;
1837  }
1838  return ret;
1839  } else {
1840  return nullptr;
1841  }
1842 }
void append(const T &value)
T & first()
QTextStream & center(QTextStream &s)
NormalizationForm_KC
QTextStream & right(QTextStream &stream)
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition: area.h:866
std::optional< QSqlQuery > query(const QString &queryStatement)
void setWidth(int width)
bool contains(const Key &key) const const
NormalizedPoint start() const
Returns the start point of the selection.
Definition: misc.cpp:70
double y
The normalized y coordinate.
Definition: area.h:171
const QChar * constData() const const
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition: textpage.cpp:205
CaseInsensitive
The documentation to the global Okular namespace.
Definition: action.h:16
int right() const const
QStringRef midRef(int position, int n) const const
QTextStream & left(QTextStream &stream)
SearchDirection
Describes the direction of searching.
Definition: global.h:35
int length() const const
void append(const T &value)
Q_SCRIPTABLE Q_NOREPLY void start()
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition: area.h:116
int width() const const
int x() const const
int y() const const
QList::const_iterator constBegin() const const
int x() const const
int y() const const
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition: area.h:376
KIOFILEWIDGETS_EXPORT QStringList list(const QString &fileClass)
int compare(const QString &other, Qt::CaseSensitivity cs) const const
const QList< QKeySequence > & begin()
~TextEntity()
Destroys the text entity.
Definition: textpage.cpp:190
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition: area.h:331
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition: area.h:783
QString simplified() const const
int left() const const
TextPage()
Creates a new text page.
Definition: textpage.cpp:223
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const const
void setRight(int x)
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition: area.cpp:232
int remove(const Key &key)
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition: textpage.h:117
int bottom() const const
int top() const const
RegularAreaRect * textArea(TextSelection *selection) const
Returns the rectangular area of the given selection.
Definition: textpage.cpp:340
Wrapper around the information needed to generate the selection area There are two assumptions inside...
Definition: misc.h:33
bool isSpace() const const
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition: area.h:932
void setHeight(int height)
QString text() const
Returns the text of the text entity.
Definition: textpage.cpp:195
void setBottom(int y)
MergeSide
The side(s) to be considered when merging areas.
Definition: global.h:63
bool isEmpty() const const
int length() const const
const T & at(int i) const const
@ FromBottom
Searching from bottom of the page, next result is to be found, there was no earlier search result.
Definition: global.h:37
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition: area.h:349
QPoint center() const const
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition: area.h:188
@ PreviousResult
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition: global.h:39
void append(const QString &text, NormalizedRect *area)
Appends the given text with the given area as new TextEntity to the page.
Definition: textpage.cpp:246
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition: area.h:358
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition: textpage.cpp:619
@ MergeRight
Merge only if the right side of the first area intersect.
Definition: global.h:64
const Key key(const T &value, const Key &defaultKey) const const
void insert(int i, const T &value)
@ NextResult
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition: global.h:38
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
typedef ConstIterator
void setTop(int y)
double x
The normalized x coordinate.
Definition: area.h:166
int height() const const
QList::const_iterator constEnd() const const
NormalizedRect * area() const
Returns the bounding area of the text entity.
Definition: textpage.cpp:200
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
Definition: textpage.cpp:1750
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition: area.cpp:253
QString right(int n) const const
@ FromTop
Searching from top of the page, next result is to be found, there was no earlier search result.
Definition: global.h:36
QString fromRawData(const QChar *unicode, int size)
const QChar at(int position) const const
void end(const NormalizedPoint &point)
Changes the end point of the selection to the given point.
Definition: misc.cpp:43
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition: area.h:340
QList::iterator begin()
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition: area.cpp:165
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition: area.cpp:160
bool isNull() const
Returns whether the regular area is a null area.
Definition: area.h:751
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition: area.h:730
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition: area.h:810
Represents a piece of text on a TextPage, containing its textual representation and its bounding box.
Definition: textpage.h:51
RegularAreaRect * wordAt(const NormalizedPoint &p, QString *word=nullptr) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
Definition: textpage.cpp:1778
QList::iterator end()
TextEntity(const QString &text, NormalizedRect *area)
Creates a new text entity with the given text and the given area.
Definition: textpage.cpp:183
@ AnyPixelTextAreaInclusionBehaviour
A character is included into text() result if any pixel of his bounding box is in the given area.
Definition: textpage.h:118
void setLeft(int x)
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition: area.h:367
const QList< QKeySequence > & end()
QString & append(QChar ch)
double top
The normalized top coordinate.
Definition: area.h:425
ushort unicode() const const
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition: textpage.cpp:939
double bottom
The normalized bottom coordinate.
Definition: area.h:435
virtual QVariant get(ScriptableExtension *callerPrincipal, quint64 objId, const QString &propName)
~TextPage()
Destroys the text page.
Definition: textpage.cpp:241
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Thu Mar 23 2023 04:04:24 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.